feat: add address.Smart and Decompose for parsing CN addresses (#346)

* feat: add address.Smart and Decompose for parsing CN addresses * feat: add Xinjiang directly-administered county-level cities support - Add '自治区直辖县级市' as a city-level unit (ID: 4043) in A2Data for Xinjiang - Add 12 directly-administered county-level cities in A3Data (IDs: 4044-4055): * 石河子市 (Shihezi, 1976, 8th Division) - ID: 4044 * 阿拉尔市 (Aral, 2002, 1st Division) - ID: 4045 * 图木舒克市 (Tumxuk, 2002, 3rd Division) - ID: 4046 * 五家渠市 (Wujiaqu, 2002, 6th Division) - ID: 4047 * 北屯市 (Beitun, 2011, 10th Division) - ID: 4048 * 铁门关市 (Tiemenguan, 2012, 2nd Division) - ID: 4049 * 双河市 (Shuanghe, 2014, 5th Division) - ID: 4050 * 可克达拉市 (Kokdala, 2015, 4th Division) - ID: 4051 * 昆玉市 (Kunyu, 2016, 14th Division) - ID: 4052 * 胡杨河市 (Huyanghe, 2019, 7th Division) - ID: 4053 * 新星市 (Xinxing, 2021, 13th Division) - ID: 4054 * 白杨市 (Baiyang, 2023, 9th Division) - ID: 4055 - All county-level cities are under PID 4043 (自治区直辖县级市) - Add test case for Xinjiang Shihezi city address parsing - Now supports parsing addresses like: 新疆石河子市北三路25小区 * docs: formatted address data * fix: error when parsing repeated addresses * feat: update README file --------- Co-authored-by: Jiawen <im@linjiawen.com>
2026-02-19 04:02:27 +08:00 · 2026-01-13 14:00:44 +08:00
parent a1cebec9f2
commit b3fd282b50
8 changed files with 5679 additions and 0 deletions
--- a/formatter/address_helper.go
+++ b/formatter/address_helper.go
@@ -0,0 +1,75 @@
+package formatter
+
+import (
+	"strings"
+	"unicode/utf8"
+)
+
+// mbStrpos reports where needle first occurs in haystack, measured in UTF-8
+// characters (runes) rather than bytes.
+//
+// It returns 0 for an empty needle and -1 when needle is not found.
+func mbStrpos(haystack, needle string) int {
+	switch bytePos := strings.Index(haystack, needle); {
+	case needle == "":
+		return 0
+	case bytePos < 0:
+		return -1
+	default:
+		// Convert the byte offset into a rune offset by counting the runes before it.
+		return utf8.RuneCountInString(haystack[:bytePos])
+	}
+}
+
+// mbStrripos reports where needle last occurs in haystack, measured in UTF-8
+// characters (runes) rather than bytes.
+//
+// It returns the rune length of haystack for an empty needle and -1 when
+// needle is not found. Matching is case-sensitive.
+//
+// NOTE(review): the name mirrors PHP's mb_strripos, which is case-insensitive;
+// confirm that no caller relies on case folding.
+func mbStrripos(haystack, needle string) int {
+	if needle == "" {
+		return utf8.RuneCountInString(haystack)
+	}
+	if bytePos := strings.LastIndex(haystack, needle); bytePos >= 0 {
+		// Convert the byte offset into a rune offset by counting the runes before it.
+		return utf8.RuneCountInString(haystack[:bytePos])
+	}
+	return -1
+}
+
+// mbStrstr reports whether needle occurs anywhere in haystack.
+// An empty needle is always considered present.
+func mbStrstr(haystack, needle string) bool {
+	return strings.Index(haystack, needle) >= 0
+}
+
+// mbSubstr returns a portion of str, with positions and lengths counted in
+// UTF-8 characters (runes) rather than bytes.
+//
+// start is the zero-based index of the first character to return. A negative
+// start counts back from the end of str and is clamped to the beginning when
+// it reaches past it.
+//
+// length is the maximum number of characters to return. A length that runs
+// past the end of str is truncated to the end; a zero or negative length
+// yields "".
+//
+// The result is "" when start is at or beyond the end of str.
+func mbSubstr(str string, start, length int) string {
+	runes := []rune(str)
+	strLen := len(runes)
+
+	// A negative start is relative to the end of the string.
+	if start < 0 {
+		start += strLen
+		if start < 0 {
+			start = 0
+		}
+	}
+
+	// Nothing to return when starting past the end or asking for no characters.
+	if start >= strLen || length <= 0 {
+		return ""
+	}
+
+	// Clamp against the number of remaining runes rather than computing
+	// start+length first: that sum can overflow when a caller passes a very
+	// large length (for example the maximum int, meaning "to the end").
+	if remaining := strLen - start; length > remaining {
+		length = remaining
+	}
+
+	return string(runes[start : start+length])
+}
+
+// mbSubstrCount returns how many non-overlapping times needle occurs in haystack.
+// An empty needle is counted as zero occurrences.
+func mbSubstrCount(haystack, needle string) int {
+	total := 0
+	if needle != "" {
+		total = strings.Count(haystack, needle)
+	}
+	return total
+}