// SplitWords splits s into words and returns them in order of appearance.
//
// A word begins at a rune accepted by isLetter (a Unicode letter that is not
// in one of the CJK ranges listed there) and continues through further
// letters, apostrophes (') and hyphens (-). Any other rune ends the word.
// Apostrophes and hyphens that do not follow a started word are skipped,
// while trailing ones stay attached, so "a -b-c' 'd'e" yields
// ["a", "b-c'", "d'e"].
//
// The result is nil when s contains no words.
// Play: todo
func SplitWords(s string) []string {
	var words []string

	// start is the byte offset of the word currently being scanned,
	// or -1 while the scanner is between words.
	start := -1

	// Ranging over a string yields one rune per step together with its byte
	// offset, so words can be sliced straight out of s without copying.
	for i, r := range s {
		switch {
		case isLetter(r):
			if start < 0 {
				start = i
			}

		case start >= 0 && (r == '\'' || r == '-'):
			// Apostrophes and hyphens extend a word that has already begun.

		default:
			if start >= 0 {
				words = append(words, s[start:i])
				start = -1
			}
		}
	}

	// Flush a word that runs up to the end of the input.
	if start >= 0 {
		words = append(words, s[start:])
	}

	return words
}

// isLetter reports whether r is a Unicode letter that lies outside the CJK
// ranges below. Runes inside those ranges are rejected even though
// unicode.IsLetter accepts them, so SplitWords never treats them as part of
// a word.
//
// All ranges are inclusive at both ends.
func isLetter(r rune) bool {
	if !unicode.IsLetter(r) {
		return false
	}

	switch {
	// cjk char: /[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f]/

	// Hiragana and katakana (Japanese only). The lower bound is U+3034, a few
	// code points below the U+3040 shown in the pattern above; it is kept so
	// that no rune rejected before this change becomes a letter.
	case r >= '\u3034' && r <= '\u30ff':
		return false

	// CJK unified ideographs extension A (Chinese, Japanese, and Korean).
	case r >= '\u3400' && r <= '\u4dbf':
		return false

	// CJK unified ideographs (Chinese, Japanese, and Korean).
	case r >= '\u4e00' && r <= '\u9fff':
		return false

	// CJK compatibility ideographs (Chinese, Japanese, and Korean).
	case r >= '\uf900' && r <= '\ufaff':
		return false

	// Half-width katakana (Japanese only).
	case r >= '\uff66' && r <= '\uff9f':
		return false
	}

	return true
}