1
0
mirror of https://github.com/duke-git/lancet.git synced 2026-02-10 07:42:27 +08:00

doc: add doc for SplitWords and WordCount

This commit is contained in:
dudaodong
2023-02-23 10:23:38 +08:00
parent c02654559a
commit 15c1537bf0
5 changed files with 647 additions and 333 deletions

View File

@@ -253,3 +253,74 @@ func SplitEx(s, sep string, removeEmptyString bool) []string {
return ret
}
// SplitWords splits a string into words, word only contains alphabetic characters.
func SplitWords(s string) []string {
var word string
var words []string
var r rune
var size, pos int
isWord := false
for len(s) > 0 {
r, size = utf8.DecodeRuneInString(s)
switch {
case isLetter(r):
if !isWord {
isWord = true
word = s
pos = 0
}
case isWord && (r == '\'' || r == '-'):
// is word
default:
if isWord {
isWord = false
words = append(words, word[:pos])
}
}
pos += size
s = s[size:]
}
if isWord {
words = append(words, word[:pos])
}
return words
}
// WordCount return the number of meaningful word, word only contains alphabetic characters.
func WordCount(s string) int {
var r rune
var size, count int
isWord := false
for len(s) > 0 {
r, size = utf8.DecodeRuneInString(s)
switch {
case isLetter(r):
if !isWord {
isWord = true
count++
}
case isWord && (r == '\'' || r == '-'):
// is word
default:
isWord = false
}
s = s[size:]
}
return count
}

View File

@@ -132,3 +132,36 @@ func padAtPosition(str string, length int, padStr string, position int) string {
return leftPad + str + rightPad
}
// isLetter checks r is a letter but not CJK character.
func isLetter(r rune) bool {
if !unicode.IsLetter(r) {
return false
}
switch {
// cjk char: /[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f]/
// hiragana and katakana (Japanese only)
case r >= '\u3034' && r < '\u30ff':
return false
// CJK unified ideographs extension A (Chinese, Japanese, and Korean)
case r >= '\u3400' && r < '\u4dbf':
return false
// CJK unified ideographs (Chinese, Japanese, and Korean)
case r >= '\u4e00' && r < '\u9fff':
return false
// CJK compatibility ideographs (Chinese, Japanese, and Korean)
case r >= '\uf900' && r < '\ufaff':
return false
// half-width katakana (Japanese only)
case r >= '\uff66' && r < '\uff9f':
return false
}
return true
}

View File

@@ -295,3 +295,37 @@ func TestSplitEx(t *testing.T) {
assert.Equal([]string{" a", "b", "c", ""}, SplitEx(" a = b = c = ", " = ", false))
assert.Equal([]string{" a", "b", "c"}, SplitEx(" a = b = c = ", " = ", true))
}
func TestSplitWords(t *testing.T) {
assert := internal.NewAssert(t, "TestSplitWords")
cases := map[string][]string{
"a word": {"a", "word"},
"I'am a programmer": {"I'am", "a", "programmer"},
"Bonjour, je suis programmeur": {"Bonjour", "je", "suis", "programmeur"},
"a -b-c' 'd'e": {"a", "b-c'", "d'e"},
"你好,我是一名码农": nil,
"こんにちは,私はプログラマーです": nil,
}
for k, v := range cases {
assert.Equal(v, SplitWords(k))
}
}
func TestWordCount(t *testing.T) {
assert := internal.NewAssert(t, "TestSplitWords")
cases := map[string]int{
"a word": 2, // {"a", "word"},
"I'am a programmer": 3, // {"I'am", "a", "programmer"},
"Bonjour, je suis programmeur": 4, // {"Bonjour", "je", "suis", "programmeur"},
"a -b-c' 'd'e": 3, // {"a", "b-c'", "d'e"},
"你好,我是一名码农": 0, // nil,
"こんにちは,私はプログラマーです": 0, // nil,
}
for k, v := range cases {
assert.Equal(v, WordCount(k))
}
}