1
0
mirror of https://github.com/duke-git/lancet.git synced 2026-02-04 12:52:28 +08:00
Files
lancet/formatter/address.go
Javen b3fd282b50 feat: add address.Smart and Decompose for parse CN address (#346)
* feat: add address.Smart and Decompose for parse CN address

* feat: add Xinjiang directly-administered county-level cities support

- Add '自治区直辖县级市' as a city-level unit (ID: 4043) in A2Data for Xinjiang
- Add 12 directly-administered county-level cities in A3Data (IDs: 4044-4055):
  * 石河子市 (Shihezi, 1976, 8th Division) - ID: 4044
  * 阿拉尔市 (Aral, 2002, 1st Division) - ID: 4045
  * 图木舒克市 (Tumxuk, 2002, 3rd Division) - ID: 4046
  * 五家渠市 (Wujiaqu, 2002, 6th Division) - ID: 4047
  * 北屯市 (Beitun, 2011, 10th Division) - ID: 4048
  * 铁门关市 (Tiemenguan, 2012, 2nd Division) - ID: 4049
  * 双河市 (Shuanghe, 2014, 5th Division) - ID: 4050
  * 可克达拉市 (Kokdala, 2015, 4th Division) - ID: 4051
  * 昆玉市 (Kunyu, 2016, 14th Division) - ID: 4052
  * 胡杨河市 (Huyanghe, 2019, 7th Division) - ID: 4053
  * 新星市 (Xinxing, 2021, 13th Division) - ID: 4054
  * 白杨市 (Baiyang, 2023, 9th Division) - ID: 4055
- All county-level cities are under PID 4043 (自治区直辖县级市)
- Add test case for Xinjiang Shihezi city address parsing
- Now supports parsing addresses like: 新疆石河子市北三路25小区

* docs: formatted address data

* fix: parse repeat address error

* feat: update readme file

---------

Co-authored-by: Jiawen <im@linjiawen.com>
2026-01-13 14:00:44 +08:00

495 lines
18 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Package formatter implements functions for formatting strings and structs.
package formatter
import (
"regexp"
"strings"
"unicode/utf8"
)
// AddressInfo represents the parsed address information, including the
// recipient's details and the location.
type AddressInfo struct {
	Name string `json:"name"` // Name of the recipient.
	Mobile string `json:"mobile"` // Mobile phone number or landline.
	IDN string `json:"idn"` // ID card number.
	Postcode string `json:"postcode"` // Postal code.
	Province string `json:"province"` // Province.
	City string `json:"city"` // City.
	Region string `json:"region"` // District or county.
	Street string `json:"street"` // Street-level detail of the address.
	Addr string `json:"addr"` // Original address string.
}
// fuzzyResult represents the result of fuzzy address parsing: the raw
// fragments cut out of the address text before they are resolved against the
// region tables.
type fuzzyResult struct {
	A1 string // Province-level fragment.
	A2 string // City-level fragment.
	A3 string // District/county-level fragment.
	Street string // Street-level remainder.
}
// ParseCNAddress parses a Chinese address string and returns its structured
// components.
//
// When withUser is true, str is first handed to ParsePersonInfo so that the
// name, phone number, ID-card number and postcode are split off; the Addr it
// returns is what gets parsed as the address. When withUser is false, str is
// used as the address unchanged.
//
// Supported layouts include:
//   - Standard: "Province City District Street"
//   - Compact: "Name Phone Province City District Street"
//   - Labelled: "Name: xxx Phone: xxx Address: xxx"
//   - County-level cities: "Province City CountyCity District"
//     (e.g. "河北省石家庄市新乐市")
//
// Province, City and Region come from parse applied to the fragments produced
// by fuzz. Street is the text that follows the resolved region name inside
// the address; if the region name is empty or not found there, the street
// guessed by fuzz is used instead.
func ParseCNAddress(str string, withUser bool) *AddressInfo {
	info := &AddressInfo{Addr: str}
	if withUser {
		info = ParsePersonInfo(str)
	}

	guess := fuzz(info.Addr)
	resolved := parse(guess.A1, guess.A2, guess.A3)
	info.Province, info.City, info.Region = resolved.Province, resolved.City, resolved.Region

	// Rune offset just past the region name inside the address, or -1 when the
	// region is unknown or does not occur in the address. A real offset is
	// always >= 1 because the region name is non-empty in that case.
	tail := -1
	if info.Region != "" && info.Addr != "" {
		if at := mbStrpos(info.Addr, info.Region); at != -1 {
			tail = at + len([]rune(info.Region))
		}
	}

	if tail < 0 {
		// Region could not be located: fall back to the fuzzy street, if any.
		if guess.Street != "" {
			info.Street = guess.Street
		}
	} else if runes := []rune(info.Addr); tail < len(runes) {
		// Work on runes so multi-byte characters are never split.
		info.Street = string(runes[tail:])
	}

	// Drop repeated region/city/province names from the street, then the
	// leftovers of the "自治区直辖县级市" placeholder.
	for _, junk := range []string{
		info.Region,
		info.City,
		info.Province,
		"自治区直辖县级市",
		"直辖县级市",
	} {
		info.Street = strings.ReplaceAll(info.Street, junk, "")
	}
	info.Street = strings.TrimSpace(info.Street)

	return info
}
// Patterns used by ParsePersonInfo, compiled once at package initialisation
// instead of on every call. Full-width punctuation is spelled with escapes
// (\x{ff1a} is the full-width colon, \x{ff0c} comma, \x{ff1b} semicolon,
// \x{3002} ideographic full stop) so it cannot be silently stripped or
// normalised by tooling that treats those characters as "ambiguous".
var (
	// Label + ASCII or full-width colon + name. The name stops at whitespace,
	// a digit, or punctuation so "姓名:张三,电话:..." yields just "张三".
	personNameRe = regexp.MustCompile(`(?:姓名|收货人|收件人)[:\x{ff1a}]\s*([^\s\d,;:\x{ff0c}\x{ff1b}\x{ff1a}\x{3002}]+)`)
	// Label + colon + digits/dashes (mobile or landline).
	personPhoneRe = regexp.MustCompile(`(?:电话|手机号码|手机|联系电话)[:\x{ff1a}]\s*([\d\-]+)`)
	// Label + colon + the rest of the line.
	personAddrRe = regexp.MustCompile(`(?:详细地址|收货地址|地址)[:\x{ff1a}]\s*([^\n]+)`)
	// 2-4 Chinese characters, then 7-12 digits, then the remainder.
	personCompactRe = regexp.MustCompile(`^([\x{4e00}-\x{9fa5}]{2,4})(\d{7,12})(.*)$`)
	personSpaceRe   = regexp.MustCompile(`\s+`)
	// Landline written as area-code "-" number, e.g. 800-8585222.
	personTelRe = regexp.MustCompile(`(\d{3,4})-(\d{6,8})`)
	// 18 digits, or 17 digits followed by X/x.
	personIDNRe      = regexp.MustCompile(`\d{18}|\d{17}[Xx]`)
	personMobileRe   = regexp.MustCompile(`\d{7,12}`)
	personPostcodeRe = regexp.MustCompile(`\d{6}`)

	// personKeywordReplacer blanks out label keywords and separators in a
	// single pass. strings.Replacer compares candidates in argument order, so
	// every keyword is listed before any shorter keyword it starts with
	// ("手机号码" before "手机", "收货地址"/"收货人" before "收货", ...). This
	// makes the result deterministic and never leaves a stray suffix such as
	// "号码" behind.
	personKeywordReplacer = strings.NewReplacer(
		"收货地址", " ", "详细地址", " ", "所在地区", " ",
		"手机号码", " ", "联系电话", " ",
		"身份证号码", " ", "身份证号", " ", "身份证", " ",
		"收货人", " ", "收件人", " ", "收货", " ",
		"地址", " ", "邮编", " ", "电话", " ", "手机", " ", "姓名", " ",
		":", " ", "\uff1a", " ",
		";", " ", "\uff1b", " ",
		",", " ", "\uff0c", " ",
		"。", " ", "\n", " ", "\r", " ",
	)
)

// ParsePersonInfo extracts user information (name, phone, ID card number,
// postal code) from an address string and returns it together with the
// remaining address text in the Addr field.
//
// Supported layouts:
//   - Labelled: "姓名:xxx 电话:xxx 地址:xxx" (ASCII or full-width colon)
//   - Compact: name immediately followed by the phone number and the address,
//     e.g. "张三13800138000北京市朝阳区"
//   - Free form, with colons, commas, semicolons or newlines as separators
//
// Extraction order: labelled name and phone, compact name+phone, ID card
// number, phone (first run of 7-12 digits), postcode (first run of 6 digits),
// and finally the name. When no name was labelled, the first space-separated
// word is taken and replaced by any later word that is strictly shorter and
// at least two characters long. Every extracted value is removed from the
// text before Addr is set. Province, City, Region and Street are left empty.
func ParsePersonInfo(str string) *AddressInfo {
	compose := &AddressInfo{}

	// Labelled name (姓名 / 收货人 / 收件人).
	if match := personNameRe.FindStringSubmatch(str); len(match) > 1 {
		compose.Name = strings.TrimSpace(match[1])
		str = personNameRe.ReplaceAllString(str, " ")
	}

	// Labelled mobile or landline number (电话 / 手机 / 手机号码 / 联系电话).
	if match := personPhoneRe.FindStringSubmatch(str); len(match) > 1 {
		compose.Mobile = strings.TrimSpace(match[1])
		str = personPhoneRe.ReplaceAllString(str, " ")
	}

	// Labelled address: drop the label, keep the text. ${1} substitutes each
	// match's own capture, so a "$" inside the address is never interpreted
	// as a template reference and multiple address lines keep their own text.
	str = personAddrRe.ReplaceAllString(str, " ${1}")

	// Nothing labelled: try the compact "name + digits + address" layout.
	if compose.Name == "" && compose.Mobile == "" {
		if match := personCompactRe.FindStringSubmatch(str); len(match) > 3 {
			compose.Name = match[1]
			compose.Mobile = match[2]
			str = match[3] // Keep only the address part.
		}
	}

	// Blank out remaining keywords and separators, then collapse whitespace.
	str = personKeywordReplacer.Replace(str)
	str = personSpaceRe.ReplaceAllString(str, " ")

	// Join landline numbers written with a dash so they read as one number.
	str = personTelRe.ReplaceAllString(str, "$1$2")

	// ID card number; a trailing x is normalised to upper case.
	if match := personIDNRe.FindString(str); match != "" {
		compose.IDN = strings.ToUpper(match)
		str = strings.ReplaceAll(str, match, "")
	}

	// Phone number, unless one was already found above.
	if compose.Mobile == "" {
		if match := personMobileRe.FindString(str); match != "" {
			compose.Mobile = match
			str = strings.ReplaceAll(str, match, "")
		}
	} else {
		str = strings.ReplaceAll(str, compose.Mobile, "")
	}

	// Postcode: looked up only after the phone number has been removed.
	if match := personPostcodeRe.FindString(str); match != "" {
		compose.Postcode = match
		str = strings.ReplaceAll(str, match, "")
	}

	str = strings.TrimSpace(personSpaceRe.ReplaceAllString(str, " "))

	// Name heuristic for unlabelled input: names are assumed to be shorter
	// than address fragments, so a later word wins only if it is strictly
	// shorter and at least two characters long.
	if compose.Name == "" {
		for _, word := range strings.Split(str, " ") {
			word = strings.TrimSpace(word)
			if word == "" {
				continue
			}
			n := utf8.RuneCountInString(word)
			if compose.Name == "" || (n >= 2 && n < utf8.RuneCountInString(compose.Name)) {
				compose.Name = word
			}
		}
	}
	if compose.Name != "" {
		str = strings.TrimSpace(strings.ReplaceAll(str, compose.Name, ""))
	}

	compose.Addr = str
	return compose
}
// fuzz derives the city-level (A2) and district-level (A3) fragments, plus a
// street remainder, from an address string using positional heuristics on the
// keywords 市, 区, 县, 旗, 盟 and 州.
//
// The mb* helpers are defined elsewhere in the package. From their use here,
// mbStrpos/mbStrripos presumably return the rune index of the first/last
// occurrence or -1, mbStrstr reports containment, mbSubstrCount counts
// occurrences, and mbSubstr(s, start, length) slices by runes; how mbSubstr
// treats a negative start is not visible here. TODO confirm.
//
// NOTE(review): a1 is never assigned, so the returned A1 is always empty.
// NOTE(review): street is cut from addrOrigin using offsets computed on the
// normalised addr; the two differ whenever normalisation removed or shortened
// text before the keyword.
func fuzz(addr string) *fuzzyResult {
	addrOrigin := addr
	addr = strings.ReplaceAll(addr, " ", "")
	addr = strings.ReplaceAll(addr, ",", "")
	// Replace "自治区直辖县级市" with "市" first, so the "自治区" replacement
	// below does not cut it apart.
	addr = strings.ReplaceAll(addr, "自治区直辖县级市", "市")
	addr = strings.ReplaceAll(addr, "自治区", "省")
	addr = strings.ReplaceAll(addr, "自治州", "州")
	addr = strings.ReplaceAll(addr, "小区", "")
	addr = strings.ReplaceAll(addr, "校区", "")
	// Drop "市辖区": it is an administrative placeholder, not a real district
	// name.
	addr = strings.ReplaceAll(addr, "市辖区", "")
	a1 := ""
	a2 := ""
	a3 := ""
	street := ""
	deep3KeywordPos := -1
	// Check whether the address contains 县 (county), 区 (district) or
	// 旗 (banner).
	countyPos := mbStrpos(addr, "县")
	districtPos := mbStrpos(addr, "区")
	bannerPos := mbStrpos(addr, "旗")
	// Any occurrence counts; the position is no longer restricted (the
	// "Early" in these names is historical).
	hasEarlyCounty := countyPos != -1
	hasEarlyDistrict := districtPos != -1
	hasEarlyBanner := bannerPos != -1
	if hasEarlyCounty || hasEarlyDistrict || hasEarlyBanner {
		// First look for a county-level city (e.g. "新乐市"): when both
		// "XX市" and "XX区"/"XX县" are present, the "市" takes precedence.
		hasCountyLevelCity := false
		if mbStrstr(addr, "市") {
			// Count the occurrences of "市".
			cityCount := mbSubstrCount(addr, "市")
			if cityCount >= 2 {
				// Locate the second "市", which may mark a county-level city.
				firstCityPos := mbStrpos(addr, "市")
				// Continue searching after the first "市".
				addrAfterFirstCity := mbSubstr(addr, firstCityPos+1, utf8.RuneCountInString(addr)-firstCityPos-1)
				secondCityPos := mbStrpos(addrAfterFirstCity, "市")
				if secondCityPos != -1 {
					secondCityAbsPos := firstCityPos + 1 + secondCityPos
					// Is there a "区" or "县" after the second "市"?
					addrAfterSecondCity := mbSubstr(addr, secondCityAbsPos+1, utf8.RuneCountInString(addr)-secondCityAbsPos-1)
					if mbStrstr(addrAfterSecondCity, "区") || mbStrstr(addrAfterSecondCity, "县") {
						// Text after the first "市" up to and including the
						// second "市".
						betweenCities := mbSubstr(addr, firstCityPos+1, secondCityAbsPos-firstCityPos)
						// Rule out repeated place names (e.g. "北京市北京市"
						// or "杭州市西湖区杭州市"): a "区" or "县" between the
						// two "市" means a repetition, not a county-level
						// city.
						if !mbStrstr(betweenCities, "区") && !mbStrstr(betweenCities, "县") {
							// Everything up to and including the first "市".
							firstCityFull := mbSubstr(addr, 0, firstCityPos+1)
							if betweenCities != firstCityFull {
								// Not a repetition: treat it as a county-level
								// city.
								a3 = betweenCities
								deep3KeywordPos = secondCityAbsPos
								hasCountyLevelCity = true
							}
						}
					}
				}
			}
		}
		if !hasCountyLevelCity {
			// The three keyword checks below run in order and each one
			// overwrites a3 and deep3KeywordPos, so 县 wins over 区, which
			// wins over 旗.

			// Banner (旗): the keyword plus the character before it.
			if mbStrstr(addr, "旗") {
				deep3KeywordPos = mbStrpos(addr, "旗")
				a3 = mbSubstr(addr, deep3KeywordPos-1, 2)
			}
			// District (区).
			if mbStrstr(addr, "区") {
				// Use the first "区" so repeated names such as
				// "西湖区杭州市西湖区" do not interfere.
				deep3KeywordPos = mbStrpos(addr, "区")
				if mbStrstr(addr, "市") {
					// Take the last "市" that precedes the "区"; this keeps a
					// "市" inside the street text (e.g. "农贸市场") from
					// interfering.
					zonePos := deep3KeywordPos
					// Search for the last "市" in the text before the "区".
					addrBeforeZone := mbSubstr(addr, 0, zonePos)
					cityPos := mbStrripos(addrBeforeZone, "市")
					if cityPos != -1 {
						a3 = mbSubstr(addr, cityPos+1, zonePos-cityPos)
					} else {
						// No "市" before the "区": default to the keyword plus
						// the two characters before it.
						a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
					}
				} else {
					a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
				}
			}
			// County (县).
			if mbStrstr(addr, "县") {
				// Use the first "县" so repeated names do not interfere.
				deep3KeywordPos = mbStrpos(addr, "县")
				if mbStrstr(addr, "市") {
					// Search for the last "市" in the text before the "县".
					addrBeforeCounty := mbSubstr(addr, 0, deep3KeywordPos)
					cityPos := mbStrripos(addrBeforeCounty, "市")
					if cityPos != -1 {
						a3 = mbSubstr(addr, cityPos+1, deep3KeywordPos-cityPos)
					} else {
						if mbStrstr(addr, "自治县") {
							// Autonomous county: take seven characters ending
							// at "县", then drop a leading 省/市/州 that
							// belongs to the previous level.
							a3 = mbSubstr(addr, deep3KeywordPos-6, 7)
							firstChar := mbSubstr(a3, 0, 1)
							if firstChar == "省" || firstChar == "市" || firstChar == "州" {
								a3 = mbSubstr(a3, 1, utf8.RuneCountInString(a3)-1)
							}
						} else {
							a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
						}
					}
				} else {
					// Same autonomous-county handling as above, for addresses
					// without any "市".
					if mbStrstr(addr, "自治县") {
						a3 = mbSubstr(addr, deep3KeywordPos-6, 7)
						firstChar := mbSubstr(a3, 0, 1)
						if firstChar == "省" || firstChar == "市" || firstChar == "州" {
							a3 = mbSubstr(a3, 1, utf8.RuneCountInString(a3)-1)
						}
					} else {
						a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
					}
				}
			}
		}
		if deep3KeywordPos != -1 {
			street = mbSubstr(addrOrigin, deep3KeywordPos+1, utf8.RuneCountInString(addrOrigin)-deep3KeywordPos-1)
		}
	} else {
		// No 县/区/旗: fall back to the last "市".
		// NOTE(review): the two cityCount branches below are identical.
		if mbStrripos(addr, "市") != -1 {
			cityCount := mbSubstrCount(addr, "市")
			if cityCount == 1 {
				deep3KeywordPos = mbStrripos(addr, "市")
				a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
				street = mbSubstr(addrOrigin, deep3KeywordPos+1, utf8.RuneCountInString(addrOrigin)-deep3KeywordPos-1)
			} else if cityCount >= 2 {
				deep3KeywordPos = mbStrripos(addr, "市")
				a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
				street = mbSubstr(addrOrigin, deep3KeywordPos+1, utf8.RuneCountInString(addrOrigin)-deep3KeywordPos-1)
			}
		} else {
			a3 = ""
			street = addr
		}
	}
	// Extract the city-level fragment.
	if mbStrpos(addr, "市") != -1 || mbStrstr(addr, "盟") || mbStrstr(addr, "州") {
		tmpPos := -1
		if tmpPos = mbStrpos(addr, "市"); tmpPos != -1 {
			// Use the first "市" so repeated names such as
			// "杭州市西湖区杭州市" do not interfere. Start right after the
			// last "省" before it, or at the beginning when there is none.
			addrBeforeCity := mbSubstr(addr, 0, tmpPos)
			provincePos := mbStrripos(addrBeforeCity, "省")
			startPos := 0
			if provincePos != -1 {
				startPos = provincePos + 1
			}
			a2 = mbSubstr(addr, startPos, tmpPos-startPos+1)
		} else if tmpPos = mbStrpos(addr, "盟"); tmpPos != -1 {
			// League (盟): the keyword plus the two characters before it.
			a2 = mbSubstr(addr, tmpPos-2, 3)
		} else if mbStrpos(addr, "州") != -1 {
			// NOTE(review): every "自治州" was rewritten to "州" at the top
			// of this function, so the "自治州" branch looks unreachable.
			if tmpPos = mbStrpos(addr, "自治州"); tmpPos != -1 {
				a2 = mbSubstr(addr, tmpPos-4, 5)
			} else {
				tmpPos = mbStrpos(addr, "州")
				a2 = mbSubstr(addr, tmpPos-2, 3)
			}
		}
	}
	return &fuzzyResult{
		A1: a1,
		A2: a2,
		A3: a3,
		Street: street,
	}
}
// parse resolves the fuzzy city (a2) and district (a3) fragments against the
// region tables A1Data, A2Data and A3Data and returns the canonical province,
// city and region names. a1 is accepted for symmetry with fuzzyResult but is
// not consulted.
//
// A table entry matches a fragment when mbStrpos reports the fragment inside
// the entry's Name. The outcome depends on how many A3Data entries match a3:
//   - exactly one: that entry, with its parent city and province;
//   - several and a2 is empty: only Region is set, to a3 itself;
//   - several and a2 is set: the match whose parent city also matches a2;
//     nothing is set when no such pair exists;
//   - none, and a2 == a3 (e.g. a municipality named once): the city matching
//     a2, with Region left empty.
//
// Whenever more than one candidate qualifies, the one with the lowest ID is
// chosen. Go's map iteration order is unspecified, so taking the first or
// last candidate seen would make the result vary from run to run.
//
// An empty a3 yields an empty AddressInfo.
func parse(a1, a2, a3 string) *AddressInfo {
	r := &AddressInfo{}
	if a3 == "" {
		return r
	}

	// Collect every district-level entry whose name contains a3.
	area3Matches := make(map[int]*Region)
	for id, v := range A3Data {
		if mbStrpos(v.Name, a3) != -1 {
			area3Matches[id] = v
		}
	}

	switch {
	case len(area3Matches) > 1:
		// Ambiguous district: narrow it down through the city fragment.
		if a2 == "" {
			r.Region = a3
			return r
		}
		area2Matches := make(map[int]*Region)
		for id, v := range A2Data {
			if mbStrpos(v.Name, a2) != -1 {
				area2Matches[id] = v
			}
		}
		bestID, found := 0, false
		for id, v := range area3Matches {
			if _, ok := area2Matches[v.PID]; !ok {
				continue
			}
			if !found || id < bestID {
				bestID, found = id, true
			}
		}
		if found {
			region := area3Matches[bestID]
			city := area2Matches[region.PID]
			r.City = city.Name
			r.Region = region.Name
			if province, ok := A1Data[city.PID]; ok {
				r.Province = province.Name
			}
		}

	case len(area3Matches) == 1:
		// Unique district: walk up to its city and province.
		for _, v := range area3Matches {
			r.Region = v.Name
			if city, ok := A2Data[v.PID]; ok {
				r.City = city.Name
				if province, ok := A1Data[city.PID]; ok {
					r.Province = province.Name
				}
			}
		}

	case a2 == a3:
		// No district matched, but the city and district fragments are the
		// same text: resolve it as a city.
		bestID, found := 0, false
		for id, v := range A2Data {
			if mbStrpos(v.Name, a2) != -1 && (!found || id < bestID) {
				bestID, found = id, true
			}
		}
		// Province ID 0 is looked up when no city matched, as before.
		provinceID := 0
		if found {
			city := A2Data[bestID]
			r.City = city.Name
			provinceID = city.PID
		}
		if province, ok := A1Data[provinceID]; ok {
			r.Province = province.Name
		}
	}

	return r
}