mirror of
https://github.com/duke-git/lancet.git
synced 2026-02-09 23:22:28 +08:00
feat: add address.Smart and Decompose for parse CN address (#346)
* feat: add address.Smart and Decompose for parse CN address * feat: add Xinjiang directly-administered county-level cities support - Add '自治区直辖县级市' as a city-level unit (ID: 4043) in A2Data for Xinjiang - Add 12 directly-administered county-level cities in A3Data (IDs: 4044-4055): * 石河子市 (Shihezi, 1976, 8th Division) - ID: 4044 * 阿拉尔市 (Aral, 2002, 1st Division) - ID: 4045 * 图木舒克市 (Tumxuk, 2002, 3rd Division) - ID: 4046 * 五家渠市 (Wujiaqu, 2002, 6th Division) - ID: 4047 * 北屯市 (Beitun, 2011, 10th Division) - ID: 4048 * 铁门关市 (Tiemenguan, 2012, 2nd Division) - ID: 4049 * 双河市 (Shuanghe, 2014, 5th Division) - ID: 4050 * 可克达拉市 (Kokdala, 2015, 4th Division) - ID: 4051 * 昆玉市 (Kunyu, 2016, 14th Division) - ID: 4052 * 胡杨河市 (Huyanghe, 2019, 7th Division) - ID: 4053 * 新星市 (Xinxing, 2021, 13th Division) - ID: 4054 * 白杨市 (Baiyang, 2023, 9th Division) - ID: 4055 - All county-level cities are under PID 4043 (自治区直辖县级市) - Add test case for Xinjiang Shihezi city address parsing - Now supports parsing addresses like: 新疆石河子市北三路25小区 * docs: formated address data * fix: parse repeat address error * feat: update readme file --------- Co-authored-by: Jiawen <im@linjiawen.com>
This commit is contained in:
494
formatter/address.go
Normal file
494
formatter/address.go
Normal file
@@ -0,0 +1,494 @@
|
||||
// Package formatter implements some functions to format string, struct.
|
||||
package formatter
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// AddressInfo is the structured result of parsing a Chinese address string:
// the recipient's personal details plus the administrative location.
type AddressInfo struct {
	// Name is the recipient's name.
	Name string `json:"name"`
	// Mobile is a mobile phone number or a landline number.
	Mobile string `json:"mobile"`
	// IDN is the ID card number.
	IDN string `json:"idn"`
	// Postcode is the postal code.
	Postcode string `json:"postcode"`
	// Province is the province-level division.
	Province string `json:"province"`
	// City is the city-level division.
	City string `json:"city"`
	// Region is the district or county.
	Region string `json:"region"`
	// Street is the detailed street address.
	Street string `json:"street"`
	// Addr is the address string the location fields were parsed from.
	Addr string `json:"addr"`
}
|
||||
|
||||
// fuzzyResult carries the raw pieces produced by the fuzzy address scan,
// before they are matched against the region tables.
type fuzzyResult struct {
	// A1 is the province-level text.
	A1 string
	// A2 is the city-level text.
	A2 string
	// A3 is the district/county-level text.
	A3 string
	// Street is the street-address remainder.
	Street string
}
|
||||
|
||||
// ParseCNAddress parses a Chinese address string intelligently and extracts structured information.
|
||||
// It can parse addresses with or without user information (name, phone, ID card, etc.).
|
||||
// When withUser is true, it extracts user information from the address string.
|
||||
// When withUser is false, it only parses the location information.
|
||||
// The function handles various address formats including:
|
||||
// - Standard format: "Province City District Street"
|
||||
// - Compact format: "Name Phone Province City District Street"
|
||||
// - With keywords: "Name: xxx Phone: xxx Address: xxx"
|
||||
// - County-level cities: "Province City CountyCity District" (e.g., "河北省石家庄市新乐市")
|
||||
// ParseCNAddress 智能解析中国地址字符串并提取结构化信息。
|
||||
// 可以解析带或不带用户信息(姓名、电话、身份证等)的地址。
|
||||
// 当 withUser 为 true 时,从地址字符串中提取用户信息。
|
||||
// 当 withUser 为 false 时,仅解析位置信息。
|
||||
// 该函数处理多种地址格式,包括:
|
||||
// - 标准格式:"省 市 区 街道"
|
||||
// - 紧凑格式:"姓名 电话 省 市 区 街道"
|
||||
// - 带关键词:"姓名:xxx 电话:xxx 地址:xxx"
|
||||
// - 县级市:"省 市 县级市 区"(如"河北省石家庄市新乐市")
|
||||
func ParseCNAddress(str string, withUser bool) *AddressInfo {
|
||||
result := &AddressInfo{}
|
||||
|
||||
if withUser {
|
||||
ParsePersonInfo := ParsePersonInfo(str)
|
||||
result = ParsePersonInfo
|
||||
} else {
|
||||
result.Addr = str
|
||||
}
|
||||
|
||||
fuzz := fuzz(result.Addr)
|
||||
parse := parse(fuzz.A1, fuzz.A2, fuzz.A3)
|
||||
|
||||
result.Province = parse.Province
|
||||
result.City = parse.City
|
||||
result.Region = parse.Region
|
||||
|
||||
// 提取街道地址:从原始地址中找到区/县的位置,提取后面的内容
|
||||
if result.Region != "" && result.Addr != "" {
|
||||
// 在原始地址中查找区/县的位置(转换为rune数组以正确处理中文)
|
||||
addrRunes := []rune(result.Addr)
|
||||
regionRunes := []rune(result.Region)
|
||||
regionPos := mbStrpos(result.Addr, result.Region)
|
||||
|
||||
if regionPos != -1 {
|
||||
// 提取区/县后面的内容作为街道地址
|
||||
streetStart := regionPos + len(regionRunes)
|
||||
if streetStart < len(addrRunes) {
|
||||
result.Street = string(addrRunes[streetStart:])
|
||||
}
|
||||
} else if fuzz.Street != "" {
|
||||
// 如果没找到区/县,使用fuzz返回的街道
|
||||
result.Street = fuzz.Street
|
||||
}
|
||||
} else if fuzz.Street != "" {
|
||||
result.Street = fuzz.Street
|
||||
}
|
||||
|
||||
// 清理街道地址中的重复省市区信息(可能存在部分匹配的残留)
|
||||
result.Street = strings.ReplaceAll(result.Street, result.Region, "")
|
||||
result.Street = strings.ReplaceAll(result.Street, result.City, "")
|
||||
result.Street = strings.ReplaceAll(result.Street, result.Province, "")
|
||||
// 清理街道地址中的残留片段(如"自治区直辖县级市"被替换后的残留)
|
||||
result.Street = strings.ReplaceAll(result.Street, "自治区直辖县级市", "")
|
||||
result.Street = strings.ReplaceAll(result.Street, "直辖县级市", "")
|
||||
result.Street = strings.TrimSpace(result.Street)
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// ParsePersonInfo extracts user information (name, phone, ID card, postal code) from an address string.
|
||||
// It separates personal information from the address, supporting various formats:
|
||||
// - Labeled format: "Name: xxx Phone: xxx Address: xxx"
|
||||
// - Compact format: "Name Phone Address" (e.g., "张三13800138000北京市朝阳区")
|
||||
// - With separators: using colons, commas, newlines as delimiters
|
||||
// Returns an AddressInfo with extracted user information and cleaned address string in Addr field.
|
||||
// ParsePersonInfo 从地址字符串中提取用户信息(姓名、电话、身份证、邮编)。
|
||||
// 将个人信息与地址分离,支持多种格式:
|
||||
// - 带标签格式:"姓名:xxx 电话:xxx 地址:xxx"
|
||||
// - 紧凑格式:"姓名 电话 地址"(如"张三13800138000北京市朝阳区")
|
||||
// - 带分隔符:使用冒号、逗号、换行符作为分隔符
|
||||
// 返回包含提取的用户信息和清理后地址字符串(在 Addr 字段中)的 AddressInfo。
|
||||
func ParsePersonInfo(str string) *AddressInfo {
|
||||
compose := &AddressInfo{}
|
||||
|
||||
// 先尝试提取带标签的信息
|
||||
// 提取姓名 (支持: 姓名:xxx, 收货人:xxx, 收件人:xxx)
|
||||
nameRe := regexp.MustCompile(`(?:姓名|收货人|收件人)[::]\s*([^\s\d\n]+)`)
|
||||
if match := nameRe.FindStringSubmatch(str); len(match) > 1 {
|
||||
compose.Name = strings.TrimSpace(match[1])
|
||||
str = nameRe.ReplaceAllString(str, " ")
|
||||
}
|
||||
|
||||
// 提取手机号或座机号 (支持: 电话:xxx, 手机:xxx, 联系电话:xxx)
|
||||
phoneRe := regexp.MustCompile(`(?:电话|手机号码|手机|联系电话)[::]\s*([\d\-]+)`)
|
||||
if match := phoneRe.FindStringSubmatch(str); len(match) > 1 {
|
||||
compose.Mobile = strings.TrimSpace(match[1])
|
||||
str = phoneRe.ReplaceAllString(str, " ")
|
||||
}
|
||||
|
||||
// 提取所在地区 (支持: 所在地区:xxx)
|
||||
regionRe := regexp.MustCompile(`所在地区[::]\s*([^\n]+)`)
|
||||
if match := regionRe.FindStringSubmatch(str); len(match) > 1 {
|
||||
// 将所在地区保留在字符串中,不删除
|
||||
// str 保持不变,让后续的地址解析处理
|
||||
}
|
||||
|
||||
// 提取详细地址 (支持: 详细地址:xxx, 收货地址:xxx, 地址:xxx)
|
||||
addrRe := regexp.MustCompile(`(?:详细地址|收货地址|地址)[::]\s*([^\n]+)`)
|
||||
if match := addrRe.FindStringSubmatch(str); len(match) > 1 {
|
||||
// 保留详细地址在字符串中
|
||||
str = addrRe.ReplaceAllString(str, " "+match[1])
|
||||
}
|
||||
|
||||
// 如果还没有提取到姓名和手机号,尝试识别紧凑格式 (如: 马云13593464918陕西省...)
|
||||
if compose.Name == "" && compose.Mobile == "" {
|
||||
// 匹配: 2-4个汉字 + 7-12位数字 + 剩余内容
|
||||
compactRe := regexp.MustCompile(`^([\x{4e00}-\x{9fa5}]{2,4})(\d{7,12})(.*)$`)
|
||||
if match := compactRe.FindStringSubmatch(str); len(match) > 3 {
|
||||
compose.Name = match[1]
|
||||
compose.Mobile = match[2]
|
||||
str = match[3] // 保留剩余的地址部分
|
||||
}
|
||||
}
|
||||
|
||||
// 替换常见的地址关键词为空格
|
||||
replacements := map[string]string{
|
||||
"收货地址": " ", "详细地址": " ", "地址": " ", "收货人": " ",
|
||||
"收件人": " ", "收货": " ", "所在地区": " ", "邮编": " ",
|
||||
"电话": " ", "手机号码": " ", "身份证号码": " ", "身份证号": " ",
|
||||
"身份证": " ", "姓名": " ", "联系电话": " ", "手机": " ",
|
||||
":": " ", ":": " ", ";": " ", ";": " ",
|
||||
",": " ", ",": " ", "。": " ", "\n": " ", "\r": " ",
|
||||
}
|
||||
|
||||
for old, new := range replacements {
|
||||
str = strings.ReplaceAll(str, old, new)
|
||||
}
|
||||
|
||||
// 将多个空格合并为一个
|
||||
spaceRe := regexp.MustCompile(`\s{1,}`)
|
||||
str = spaceRe.ReplaceAllString(str, " ")
|
||||
|
||||
// 处理座机号格式 (如: 800-8585222)
|
||||
telRe := regexp.MustCompile(`(\d{3,4})-(\d{6,8})`)
|
||||
str = telRe.ReplaceAllString(str, "$1$2")
|
||||
|
||||
// 提取身份证号 (18位或17位+X)
|
||||
idnRe := regexp.MustCompile(`\d{18}|\d{17}[Xx]`)
|
||||
if match := idnRe.FindString(str); match != "" {
|
||||
compose.IDN = strings.ToUpper(match)
|
||||
str = strings.ReplaceAll(str, match, "")
|
||||
}
|
||||
|
||||
// 如果之前没有提取到手机号,现在提取
|
||||
if compose.Mobile == "" {
|
||||
mobileRe := regexp.MustCompile(`\d{7,12}`)
|
||||
if match := mobileRe.FindString(str); match != "" {
|
||||
compose.Mobile = match
|
||||
str = strings.ReplaceAll(str, match, "")
|
||||
}
|
||||
} else {
|
||||
// 已经提取过手机号,从字符串中删除
|
||||
str = strings.ReplaceAll(str, compose.Mobile, "")
|
||||
}
|
||||
|
||||
// 提取邮编
|
||||
postcodeRe := regexp.MustCompile(`\d{6}`)
|
||||
if match := postcodeRe.FindString(str); match != "" {
|
||||
compose.Postcode = match
|
||||
str = strings.ReplaceAll(str, match, "")
|
||||
}
|
||||
|
||||
// 清理多余空格
|
||||
str = strings.TrimSpace(spaceRe.ReplaceAllString(str, " "))
|
||||
|
||||
// 如果之前没有提取到姓名,现在提取
|
||||
if compose.Name == "" {
|
||||
// 提取姓名(取最短的词作为姓名,排除空字符串)
|
||||
splitArr := strings.Split(str, " ")
|
||||
if len(splitArr) > 0 {
|
||||
for _, value := range splitArr {
|
||||
value = strings.TrimSpace(value)
|
||||
if value == "" {
|
||||
continue
|
||||
}
|
||||
if compose.Name == "" {
|
||||
compose.Name = value
|
||||
} else if utf8.RuneCountInString(value) < utf8.RuneCountInString(compose.Name) && utf8.RuneCountInString(value) >= 2 {
|
||||
compose.Name = value
|
||||
}
|
||||
}
|
||||
if compose.Name != "" {
|
||||
str = strings.TrimSpace(strings.ReplaceAll(str, compose.Name, ""))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 已经提取过姓名,从字符串中删除
|
||||
str = strings.TrimSpace(strings.ReplaceAll(str, compose.Name, ""))
|
||||
}
|
||||
|
||||
compose.Addr = str
|
||||
return compose
|
||||
}
|
||||
|
||||
// fuzz 根据统计规律分析出二三级地址
|
||||
func fuzz(addr string) *fuzzyResult {
|
||||
addrOrigin := addr
|
||||
addr = strings.ReplaceAll(addr, " ", "")
|
||||
addr = strings.ReplaceAll(addr, ",", "")
|
||||
// 先替换"自治区直辖县级市"为"市",避免后续"自治区"替换时产生问题
|
||||
addr = strings.ReplaceAll(addr, "自治区直辖县级市", "市")
|
||||
addr = strings.ReplaceAll(addr, "自治区", "省")
|
||||
addr = strings.ReplaceAll(addr, "自治州", "州")
|
||||
addr = strings.ReplaceAll(addr, "小区", "")
|
||||
addr = strings.ReplaceAll(addr, "校区", "")
|
||||
// 过滤"市辖区" - 这是一个行政术语占位符,不是真正的区名
|
||||
addr = strings.ReplaceAll(addr, "市辖区", "")
|
||||
|
||||
a1 := ""
|
||||
a2 := ""
|
||||
a3 := ""
|
||||
street := ""
|
||||
|
||||
deep3KeywordPos := -1
|
||||
|
||||
// 判断是否包含县/区/旗
|
||||
countyPos := mbStrpos(addr, "县")
|
||||
districtPos := mbStrpos(addr, "区")
|
||||
bannerPos := mbStrpos(addr, "旗")
|
||||
|
||||
// 只要存在这些关键词就处理,不再限制位置
|
||||
hasEarlyCounty := countyPos != -1
|
||||
hasEarlyDistrict := districtPos != -1
|
||||
hasEarlyBanner := bannerPos != -1
|
||||
|
||||
if hasEarlyCounty || hasEarlyDistrict || hasEarlyBanner {
|
||||
// 优先检查是否存在县级市(如"新乐市")
|
||||
// 如果同时存在"XX市"和"XX区"/"XX县",优先处理"市"
|
||||
hasCountyLevelCity := false
|
||||
if mbStrstr(addr, "市") {
|
||||
// 查找所有"市"的位置
|
||||
cityCount := mbSubstrCount(addr, "市")
|
||||
if cityCount >= 2 {
|
||||
// 找到第二个"市"的位置(可能是县级市)
|
||||
firstCityPos := mbStrpos(addr, "市")
|
||||
// 从第一个"市"之后继续查找
|
||||
addrAfterFirstCity := mbSubstr(addr, firstCityPos+1, utf8.RuneCountInString(addr)-firstCityPos-1)
|
||||
secondCityPos := mbStrpos(addrAfterFirstCity, "市")
|
||||
if secondCityPos != -1 {
|
||||
secondCityAbsPos := firstCityPos + 1 + secondCityPos
|
||||
// 检查第二个"市"后面是否存在"区"或"县"
|
||||
addrAfterSecondCity := mbSubstr(addr, secondCityAbsPos+1, utf8.RuneCountInString(addr)-secondCityAbsPos-1)
|
||||
if mbStrstr(addrAfterSecondCity, "区") || mbStrstr(addrAfterSecondCity, "县") {
|
||||
// 提取两个"市"之间的内容
|
||||
betweenCities := mbSubstr(addr, firstCityPos+1, secondCityAbsPos-firstCityPos)
|
||||
// 检查是否是重复的地名(如"北京市北京市"或"杭州市西湖区杭州市")
|
||||
// 如果两个"市"之间包含"区"或"县",说明不是县级市,而是重复地名
|
||||
if !mbStrstr(betweenCities, "区") && !mbStrstr(betweenCities, "县") {
|
||||
// 第一个"市"及之前的内容
|
||||
firstCityFull := mbSubstr(addr, 0, firstCityPos+1)
|
||||
if betweenCities != firstCityFull {
|
||||
// 不是重复地名,这是县级市
|
||||
a3 = betweenCities
|
||||
deep3KeywordPos = secondCityAbsPos
|
||||
hasCountyLevelCity = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !hasCountyLevelCity {
|
||||
// 处理旗
|
||||
if mbStrstr(addr, "旗") {
|
||||
deep3KeywordPos = mbStrpos(addr, "旗")
|
||||
a3 = mbSubstr(addr, deep3KeywordPos-1, 2)
|
||||
}
|
||||
|
||||
// 处理区
|
||||
if mbStrstr(addr, "区") {
|
||||
// 使用第一个"区"(避免重复地名干扰,如"西湖区杭州市西湖区")
|
||||
deep3KeywordPos = mbStrpos(addr, "区")
|
||||
|
||||
if mbStrstr(addr, "市") {
|
||||
// 策略:找到"区"之前的最后一个"市"
|
||||
// 这样可以避免详细地址中的"市"字干扰(如"农贸市场")
|
||||
zonePos := deep3KeywordPos
|
||||
// 从开头到"区"的子串中,查找最后一个"市"
|
||||
addrBeforeZone := mbSubstr(addr, 0, zonePos)
|
||||
cityPos := mbStrripos(addrBeforeZone, "市")
|
||||
if cityPos != -1 {
|
||||
a3 = mbSubstr(addr, cityPos+1, zonePos-cityPos)
|
||||
} else {
|
||||
// 没有找到"市",使用默认逻辑
|
||||
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
|
||||
}
|
||||
} else {
|
||||
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
|
||||
}
|
||||
}
|
||||
|
||||
// 处理县
|
||||
if mbStrstr(addr, "县") {
|
||||
// 使用第一个"县"(避免重复地名干扰)
|
||||
deep3KeywordPos = mbStrpos(addr, "县")
|
||||
if mbStrstr(addr, "市") {
|
||||
// 从开头到"县"的子串中,查找最后一个"市"
|
||||
addrBeforeCounty := mbSubstr(addr, 0, deep3KeywordPos)
|
||||
cityPos := mbStrripos(addrBeforeCounty, "市")
|
||||
if cityPos != -1 {
|
||||
a3 = mbSubstr(addr, cityPos+1, deep3KeywordPos-cityPos)
|
||||
} else {
|
||||
if mbStrstr(addr, "自治县") {
|
||||
a3 = mbSubstr(addr, deep3KeywordPos-6, 7)
|
||||
firstChar := mbSubstr(a3, 0, 1)
|
||||
if firstChar == "省" || firstChar == "市" || firstChar == "州" {
|
||||
a3 = mbSubstr(a3, 1, utf8.RuneCountInString(a3)-1)
|
||||
}
|
||||
} else {
|
||||
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if mbStrstr(addr, "自治县") {
|
||||
a3 = mbSubstr(addr, deep3KeywordPos-6, 7)
|
||||
firstChar := mbSubstr(a3, 0, 1)
|
||||
if firstChar == "省" || firstChar == "市" || firstChar == "州" {
|
||||
a3 = mbSubstr(a3, 1, utf8.RuneCountInString(a3)-1)
|
||||
}
|
||||
} else {
|
||||
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if deep3KeywordPos != -1 {
|
||||
street = mbSubstr(addrOrigin, deep3KeywordPos+1, utf8.RuneCountInString(addrOrigin)-deep3KeywordPos-1)
|
||||
}
|
||||
} else {
|
||||
// 处理市
|
||||
if mbStrripos(addr, "市") != -1 {
|
||||
cityCount := mbSubstrCount(addr, "市")
|
||||
if cityCount == 1 {
|
||||
deep3KeywordPos = mbStrripos(addr, "市")
|
||||
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
|
||||
street = mbSubstr(addrOrigin, deep3KeywordPos+1, utf8.RuneCountInString(addrOrigin)-deep3KeywordPos-1)
|
||||
} else if cityCount >= 2 {
|
||||
deep3KeywordPos = mbStrripos(addr, "市")
|
||||
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
|
||||
street = mbSubstr(addrOrigin, deep3KeywordPos+1, utf8.RuneCountInString(addrOrigin)-deep3KeywordPos-1)
|
||||
}
|
||||
} else {
|
||||
a3 = ""
|
||||
street = addr
|
||||
}
|
||||
}
|
||||
|
||||
// 提取市级地址
|
||||
if mbStrpos(addr, "市") != -1 || mbStrstr(addr, "盟") || mbStrstr(addr, "州") {
|
||||
tmpPos := -1
|
||||
if tmpPos = mbStrpos(addr, "市"); tmpPos != -1 {
|
||||
// 使用第一个"市"(避免重复地名干扰,如"杭州市西湖区杭州市")
|
||||
// 向前查找省的位置,如果有省就从省后开始,否则从开头开始
|
||||
addrBeforeCity := mbSubstr(addr, 0, tmpPos)
|
||||
provincePos := mbStrripos(addrBeforeCity, "省")
|
||||
startPos := 0
|
||||
if provincePos != -1 {
|
||||
startPos = provincePos + 1
|
||||
}
|
||||
a2 = mbSubstr(addr, startPos, tmpPos-startPos+1)
|
||||
} else if tmpPos = mbStrpos(addr, "盟"); tmpPos != -1 {
|
||||
a2 = mbSubstr(addr, tmpPos-2, 3)
|
||||
} else if mbStrpos(addr, "州") != -1 {
|
||||
if tmpPos = mbStrpos(addr, "自治州"); tmpPos != -1 {
|
||||
a2 = mbSubstr(addr, tmpPos-4, 5)
|
||||
} else {
|
||||
tmpPos = mbStrpos(addr, "州")
|
||||
a2 = mbSubstr(addr, tmpPos-2, 3)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return &fuzzyResult{
|
||||
A1: a1,
|
||||
A2: a2,
|
||||
A3: a3,
|
||||
Street: street,
|
||||
}
|
||||
}
|
||||
|
||||
// parse 智能解析出省市区
|
||||
func parse(a1, a2, a3 string) *AddressInfo {
|
||||
r := &AddressInfo{}
|
||||
|
||||
if a3 == "" {
|
||||
return r
|
||||
}
|
||||
|
||||
// 在三级地址数据中查找匹配
|
||||
area3Matches := make(map[int]*Region)
|
||||
for id, v := range A3Data {
|
||||
if mbStrpos(v.Name, a3) != -1 {
|
||||
area3Matches[id] = v
|
||||
}
|
||||
}
|
||||
|
||||
// 多个匹配项,需要通过二级地址筛选
|
||||
if len(area3Matches) > 1 {
|
||||
if a2 != "" {
|
||||
area2Matches := make(map[int]*Region)
|
||||
for id, v := range A2Data {
|
||||
if mbStrpos(v.Name, a2) != -1 {
|
||||
area2Matches[id] = v
|
||||
}
|
||||
}
|
||||
|
||||
if len(area2Matches) > 0 {
|
||||
for _, v := range area3Matches {
|
||||
if city, ok := area2Matches[v.PID]; ok {
|
||||
r.City = city.Name
|
||||
r.Region = v.Name
|
||||
if province, ok := A1Data[city.PID]; ok {
|
||||
r.Province = province.Name
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
r.Province = ""
|
||||
r.City = ""
|
||||
r.Region = a3
|
||||
}
|
||||
} else if len(area3Matches) == 1 {
|
||||
// 唯一匹配
|
||||
for _, v := range area3Matches {
|
||||
r.Region = v.Name
|
||||
if city, ok := A2Data[v.PID]; ok {
|
||||
r.City = city.Name
|
||||
if province, ok := A1Data[city.PID]; ok {
|
||||
r.Province = province.Name
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if len(area3Matches) == 0 && a2 == a3 {
|
||||
// 没有匹配到三级地址,但二级地址等于三级地址,可能是直辖市
|
||||
shengID := 0
|
||||
for _, v := range A2Data {
|
||||
if mbStrpos(v.Name, a2) != -1 {
|
||||
r.City = v.Name
|
||||
shengID = v.PID
|
||||
break
|
||||
}
|
||||
}
|
||||
if province, ok := A1Data[shengID]; ok {
|
||||
r.Province = province.Name
|
||||
}
|
||||
r.Region = ""
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
4470
formatter/address_data.go
Normal file
4470
formatter/address_data.go
Normal file
File diff suppressed because it is too large
Load Diff
75
formatter/address_helper.go
Normal file
75
formatter/address_helper.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package formatter
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// mbStrpos returns the position of the first occurrence of needle in
// haystack, counted in UTF-8 characters rather than bytes. It returns 0 for
// an empty needle and -1 when needle does not occur.
func mbStrpos(haystack, needle string) int {
	byteIdx := strings.Index(haystack, needle)
	if byteIdx <= 0 {
		// -1: not found; 0: match at the very start (this also covers the
		// empty needle, for which strings.Index reports 0).
		return byteIdx
	}
	return utf8.RuneCountInString(haystack[:byteIdx])
}
|
||||
|
||||
// mbStrripos returns the position of the last occurrence of needle in
// haystack, counted in UTF-8 characters. An empty needle yields the character
// length of haystack; a missing needle yields -1. The comparison is
// case-sensitive.
func mbStrripos(haystack, needle string) int {
	// strings.LastIndex reports len(haystack) for an empty needle, so the
	// character count of the prefix is the full length in that case.
	byteIdx := strings.LastIndex(haystack, needle)
	if byteIdx < 0 {
		return -1
	}
	return utf8.RuneCountInString(haystack[:byteIdx])
}
|
||||
|
||||
// mbStrstr reports whether needle occurs anywhere in haystack.
func mbStrstr(haystack, needle string) bool {
	return strings.Index(haystack, needle) >= 0
}
|
||||
|
||||
// mbSubstr returns up to length characters of str starting at character
// index start (0-based), counting UTF-8 characters rather than bytes.
//
// A negative start counts from the end of the string and is clamped to 0.
// A start at or beyond the end, or a range that ends before it starts,
// yields "". A range running past the end is truncated.
func mbSubstr(str string, start, length int) string {
	chars := []rune(str)
	total := len(chars)

	if start < 0 {
		start += total
		if start < 0 {
			start = 0
		}
	}
	if start >= total {
		return ""
	}

	stop := start + length
	switch {
	case stop < start:
		return ""
	case stop > total:
		stop = total
	}
	return string(chars[start:stop])
}
|
||||
|
||||
// mbSubstrCount returns the number of non-overlapping occurrences of needle
// in haystack. An empty needle counts as zero occurrences.
func mbSubstrCount(haystack, needle string) int {
	if len(needle) > 0 {
		return strings.Count(haystack, needle)
	}
	return 0
}
|
||||
360
formatter/address_test.go
Normal file
360
formatter/address_test.go
Normal file
@@ -0,0 +1,360 @@
|
||||
package formatter
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseCNAddress(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
withUser bool
|
||||
want *AddressInfo
|
||||
}{
|
||||
{
|
||||
name: "完整地址信息",
|
||||
input: "张三 13800138000 北京市朝阳区建国路1号",
|
||||
withUser: true,
|
||||
want: &AddressInfo{
|
||||
Name: "张三",
|
||||
Mobile: "13800138000",
|
||||
Province: "北京",
|
||||
City: "北京市",
|
||||
Region: "朝阳区",
|
||||
Street: "建国路1号",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "带身份证和邮编",
|
||||
input: "李四 18612345678 110101199001011234 100000 上海市浦东新区世纪大道100号",
|
||||
withUser: true,
|
||||
want: &AddressInfo{
|
||||
Name: "李四",
|
||||
Mobile: "18612345678",
|
||||
IDN: "110101199001011234",
|
||||
Postcode: "100000",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "仅地址不含用户信息",
|
||||
input: "北京市海淀区中关村大街1号",
|
||||
withUser: false,
|
||||
want: &AddressInfo{
|
||||
Province: "北京",
|
||||
City: "北京市",
|
||||
Region: "海淀区",
|
||||
Street: "中关村大街1号",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "带收货关键词",
|
||||
input: "收货人:王五 电话:13900139000 收货地址:天津市河西区友谊路20号",
|
||||
withUser: true,
|
||||
want: &AddressInfo{
|
||||
Name: "王五",
|
||||
Mobile: "13900139000",
|
||||
Province: "天津",
|
||||
City: "天津市",
|
||||
Region: "河西区",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "紧凑格式地址",
|
||||
input: "马云13593464918陕西省西安市雁塔区丈八沟街道高新四路南江国际",
|
||||
withUser: true,
|
||||
want: &AddressInfo{
|
||||
Name: "马云",
|
||||
Mobile: "13593464918",
|
||||
Province: "陕西省",
|
||||
City: "西安市",
|
||||
Region: "雁塔区",
|
||||
Street: "丈八沟街道高新四路南江国际",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "带座机号格式",
|
||||
input: "姓名:马云\n联系电话:800-8585222\n所在地区:河北省石家庄市新华区\n详细地址:中华北大街68号鹿城商务中心6号楼1413室",
|
||||
withUser: true,
|
||||
want: &AddressInfo{
|
||||
Name: "马云",
|
||||
Mobile: "800-8585222",
|
||||
Province: "河北省",
|
||||
City: "石家庄市",
|
||||
Region: "新华区",
|
||||
Street: "中华北大街68号鹿城商务中心6号楼1413室",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "北京市重复格式",
|
||||
input: "北京市北京市市辖区东城区",
|
||||
withUser: false,
|
||||
want: &AddressInfo{
|
||||
Province: "北京",
|
||||
City: "北京市",
|
||||
Region: "东城区",
|
||||
Street: "",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "河北省新乐市地址",
|
||||
input: "河北省石家庄市新乐市经济开发区兴工街10号来优品仓库",
|
||||
withUser: false,
|
||||
want: &AddressInfo{
|
||||
Province: "河北省",
|
||||
City: "石家庄市",
|
||||
Region: "新乐市",
|
||||
Street: "经济开发区兴工街10号来优品仓库",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "江苏仪征市地址",
|
||||
input: "江苏省扬州市仪征市真州镇解放东路99号",
|
||||
withUser: false,
|
||||
want: &AddressInfo{
|
||||
Province: "江苏省",
|
||||
City: "扬州市",
|
||||
Region: "仪征市",
|
||||
Street: "真州镇解放东路99号",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "新疆石河子市地址",
|
||||
input: "新疆石河子市北三路25小区",
|
||||
withUser: false,
|
||||
want: &AddressInfo{
|
||||
Province: "新疆维吾尔自治区",
|
||||
City: "自治区直辖县级市",
|
||||
Region: "石河子市",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "新疆石河子市-简化格式省+县级市",
|
||||
input: "新疆维吾尔自治区石河子市",
|
||||
withUser: false,
|
||||
want: &AddressInfo{
|
||||
Province: "新疆维吾尔自治区",
|
||||
City: "自治区直辖县级市",
|
||||
Region: "石河子市",
|
||||
Street: "",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "新疆石河子市-完整行政区划表述",
|
||||
input: "新疆维吾尔自治区自治区直辖县级市石河子市",
|
||||
withUser: false,
|
||||
want: &AddressInfo{
|
||||
Province: "新疆维吾尔自治区",
|
||||
City: "自治区直辖县级市",
|
||||
Region: "石河子市",
|
||||
Street: "",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "浙江杭州西湖区重复地址",
|
||||
input: "浙江省杭州市西湖区杭州市西湖区人民政府109号",
|
||||
withUser: false,
|
||||
want: &AddressInfo{
|
||||
Province: "浙江省",
|
||||
City: "杭州市",
|
||||
Region: "西湖区",
|
||||
Street: "人民政府109号",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "湖南长沙市重复地址",
|
||||
input: "湖南省长沙市岳麓区银盆岭街道长沙市人民政府长沙市政府大楼",
|
||||
withUser: false,
|
||||
want: &AddressInfo{
|
||||
Province: "湖南省",
|
||||
City: "长沙市",
|
||||
Region: "岳麓区",
|
||||
Street: "银盆岭街道人民政府政府大楼",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := ParseCNAddress(tt.input, tt.withUser)
|
||||
|
||||
// 打印结果便于调试
|
||||
jsonData, _ := json.MarshalIndent(got, "", " ")
|
||||
t.Logf("Result: %s", jsonData)
|
||||
|
||||
// 验证主要字段
|
||||
if tt.want.Name != "" && got.Name != tt.want.Name {
|
||||
t.Errorf("Name = %v, want %v", got.Name, tt.want.Name)
|
||||
}
|
||||
if tt.want.Mobile != "" && got.Mobile != tt.want.Mobile {
|
||||
t.Errorf("Mobile = %v, want %v", got.Mobile, tt.want.Mobile)
|
||||
}
|
||||
if tt.want.Province != "" && got.Province != tt.want.Province {
|
||||
t.Errorf("Province = %v, want %v", got.Province, tt.want.Province)
|
||||
}
|
||||
if tt.want.City != "" && got.City != tt.want.City {
|
||||
t.Errorf("City = %v, want %v", got.City, tt.want.City)
|
||||
}
|
||||
if tt.want.Region != "" && got.Region != tt.want.Region {
|
||||
t.Errorf("Region = %v, want %v", got.Region, tt.want.Region)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParsePersonInfo(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
verify func(*testing.T, *AddressInfo)
|
||||
}{
|
||||
{
|
||||
name: "提取姓名和手机号",
|
||||
input: "张三 13800138000 北京市朝阳区",
|
||||
verify: func(t *testing.T, got *AddressInfo) {
|
||||
if got.Name != "张三" {
|
||||
t.Errorf("Name = %v, want 张三", got.Name)
|
||||
}
|
||||
if got.Mobile != "13800138000" {
|
||||
t.Errorf("Mobile = %v, want 13800138000", got.Mobile)
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "提取身份证号",
|
||||
input: "李四 110101199001011234 上海市",
|
||||
verify: func(t *testing.T, got *AddressInfo) {
|
||||
if got.Name != "李四" {
|
||||
t.Errorf("Name = %v, want 李四", got.Name)
|
||||
}
|
||||
if got.IDN != "110101199001011234" {
|
||||
t.Errorf("IDN = %v, want 110101199001011234", got.IDN)
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "提取邮编",
|
||||
input: "王五 100000 天津市",
|
||||
verify: func(t *testing.T, got *AddressInfo) {
|
||||
if got.Name != "王五" {
|
||||
t.Errorf("Name = %v, want 王五", got.Name)
|
||||
}
|
||||
if got.Postcode != "100000" {
|
||||
t.Errorf("Postcode = %v, want 100000", got.Postcode)
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := ParsePersonInfo(tt.input)
|
||||
jsonData, _ := json.MarshalIndent(got, "", " ")
|
||||
t.Logf("Result: %s", jsonData)
|
||||
tt.verify(t, got)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFuzz(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
input string
|
||||
want *fuzzyResult
|
||||
}{
|
||||
{
|
||||
name: "包含区",
|
||||
input: "北京市朝阳区建国路1号",
|
||||
want: &fuzzyResult{
|
||||
A2: "北京市",
|
||||
A3: "朝阳区",
|
||||
Street: "建国路1号",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "包含县",
|
||||
input: "河北省石家庄市正定县",
|
||||
want: &fuzzyResult{
|
||||
A2: "石家庄市",
|
||||
A3: "正定县",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "复杂街道地址",
|
||||
input: "浙江省杭州市拱墅区武林街道杭州锦麟宾馆中河片区",
|
||||
want: &fuzzyResult{
|
||||
A2: "杭州市",
|
||||
A3: "拱墅区",
|
||||
Street: "武林街道杭州锦麟宾馆中河片区",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "北京市重复格式",
|
||||
input: "北京市北京市市辖区东城区",
|
||||
want: &fuzzyResult{
|
||||
A2: "北京市",
|
||||
A3: "东城区",
|
||||
Street: "",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "详细地址包含市字",
|
||||
input: "北京市朝阳区建外大街1号国贸商城",
|
||||
want: &fuzzyResult{
|
||||
A2: "北京市",
|
||||
A3: "朝阳区",
|
||||
Street: "建外大街1号国贸商城",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "详细地址真的包含市字",
|
||||
input: "北京市朝阳区农贸市场路1号",
|
||||
want: &fuzzyResult{
|
||||
A2: "北京市",
|
||||
A3: "朝阳区",
|
||||
Street: "农贸市场路1号",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "河北省新乐市地址",
|
||||
input: "河北省石家庄市新乐市经济开发区兴工街10号来优品仓库",
|
||||
want: &fuzzyResult{
|
||||
A2: "石家庄市",
|
||||
A3: "新乐市",
|
||||
Street: "经济开发区兴工街10号来优品仓库",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := fuzz(tt.input)
|
||||
jsonData, _ := json.MarshalIndent(got, "", " ")
|
||||
t.Logf("Result: %s", jsonData)
|
||||
|
||||
if got.A2 != tt.want.A2 {
|
||||
t.Errorf("A2 = %v, want %v", got.A2, tt.want.A2)
|
||||
}
|
||||
if got.A3 != tt.want.A3 {
|
||||
t.Errorf("A3 = %v, want %v", got.A3, tt.want.A3)
|
||||
}
|
||||
if tt.want.Street != "" && got.Street != tt.want.Street {
|
||||
t.Errorf("Street = %v, want %v", got.Street, tt.want.Street)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func ExampleParseCNAddress() {
|
||||
// 解析包含用户信息的完整地址
|
||||
result := ParseCNAddress("张三 13800138000 北京市朝阳区建国路1号", true)
|
||||
jsonData, _ := json.MarshalIndent(result, "", " ")
|
||||
println(string(jsonData))
|
||||
}
|
||||
|
||||
func ExampleParsePersonInfo() {
|
||||
// 分离用户信息
|
||||
result := ParsePersonInfo("收货人:李四 电话:18612345678 地址:上海市浦东新区世纪大道100号")
|
||||
jsonData, _ := json.MarshalIndent(result, "", " ")
|
||||
println(string(jsonData))
|
||||
}
|
||||
Reference in New Issue
Block a user