1
0
mirror of https://github.com/duke-git/lancet.git synced 2026-02-04 12:52:28 +08:00

Compare commits

..

2 Commits

Author SHA1 Message Date
Javen
b3fd282b50 feat: add address.Smart and Decompose for parse CN address (#346)
* feat: add address.Smart and Decompose for parse CN address

* feat: add Xinjiang directly-administered county-level cities support

- Add '自治区直辖县级市' as a city-level unit (ID: 4043) in A2Data for Xinjiang
- Add 12 directly-administered county-level cities in A3Data (IDs: 4044-4055):
  * 石河子市 (Shihezi, 1976, 8th Division) - ID: 4044
  * 阿拉尔市 (Aral, 2002, 1st Division) - ID: 4045
  * 图木舒克市 (Tumxuk, 2002, 3rd Division) - ID: 4046
  * 五家渠市 (Wujiaqu, 2002, 6th Division) - ID: 4047
  * 北屯市 (Beitun, 2011, 10th Division) - ID: 4048
  * 铁门关市 (Tiemenguan, 2012, 2nd Division) - ID: 4049
  * 双河市 (Shuanghe, 2014, 5th Division) - ID: 4050
  * 可克达拉市 (Kokdala, 2015, 4th Division) - ID: 4051
  * 昆玉市 (Kunyu, 2016, 14th Division) - ID: 4052
  * 胡杨河市 (Huyanghe, 2019, 7th Division) - ID: 4053
  * 新星市 (Xinxing, 2021, 13th Division) - ID: 4054
  * 白杨市 (Baiyang, 2023, 9th Division) - ID: 4055
- All county-level cities are under PID 4043 (自治区直辖县级市)
- Add test case for Xinjiang Shihezi city address parsing
- Now supports parsing addresses like: 新疆石河子市北三路25小区

* docs: formated address data

* fix: parse repeat address error

* feat: update readme file

---------

Co-authored-by: Jiawen <im@linjiawen.com>
2026-01-13 14:00:44 +08:00
efinKiaC
a1cebec9f2 ReadFileByLine bugfix (#350) 2026-01-13 11:24:08 +08:00
9 changed files with 5688 additions and 11 deletions

View File

@@ -986,6 +986,12 @@ import "github.com/duke-git/lancet/v2/formatter"
- **<big>ParseBinaryBytes</big>** : converts a human-readable byte size string into the number of bytes it represents (base 1024).
[[doc](https://github.com/duke-git/lancet/blob/main/docs/en/api/packages/formatter.md#ParseBinaryBytes)]
[[play](https://go.dev/play/p/69v1tTT62x8)]
- **<big>ParseCNAddress</big>** : parses a Chinese address string intelligently and extracts structured information (province, city, district, street, name, phone, etc.). Supports various address formats including county-level cities.
[[doc](https://github.com/duke-git/lancet/blob/main/docs/en/api/packages/formatter.md#ParseCNAddress)]
[[play](https://go.dev/play/p/o5l09hQopEV)]
- **<big>ParsePersonInfo</big>** : extracts user information (name, phone, ID card, postal code) from an address string and separates it from the location address.
[[doc](https://github.com/duke-git/lancet/blob/main/docs/en/api/packages/formatter.md#ParsePersonInfo)]
[[play](https://go.dev/play/p/JO-uTlJlTy7)]
<h3 id="function"> 13. Function package can control the flow of function execution and support part of functional programming.&nbsp; &nbsp; &nbsp; &nbsp;<a href="#index">index</a></h3>

View File

@@ -996,6 +996,12 @@ import "github.com/duke-git/lancet/v2/formatter"
- **<big>ParseBinaryBytes</big>** : 将字节单位字符串转换成其所表示的字节数(以 1024 为基数)。
[[doc](https://github.com/duke-git/lancet/blob/main/docs/api/packages/formatter.md#ParseBinaryBytes)]
[[play](https://go.dev/play/p/69v1tTT62x8)]
- **<big>ParseCNAddress</big>** : 智能解析中国地址字符串并提取结构化信息(省、市、区、街道、姓名、电话等)。支持多种地址格式,包括县级市。
[[doc](https://github.com/duke-git/lancet/blob/main/docs/api/packages/formatter.md#ParseCNAddress)]
[[play](https://go.dev/play/p/o5l09hQopEV)]
- **<big>ParsePersonInfo</big>** : 从地址字符串中提取用户信息(姓名、电话、身份证、邮编)并将其与位置地址分离。
[[doc](https://github.com/duke-git/lancet/blob/main/docs/api/packages/formatter.md#ParsePersonInfo)]
[[play](https://go.dev/play/p/JO-uTlJlTy7)]
<h3 id="function"> 13. function 函数包控制函数执行流程,包含部分函数式编程。&nbsp; &nbsp; &nbsp; &nbsp;<a href="#index">回到目录</a></h3>

View File

@@ -8,6 +8,7 @@ formatter 格式化器包含一些数据格式化处理方法。
- [https://github.com/duke-git/lancet/blob/main/formatter/formatter.go](https://github.com/duke-git/lancet/blob/main/formatter/formatter.go)
- [https://github.com/duke-git/lancet/blob/main/formatter/byte.go](https://github.com/duke-git/lancet/blob/main/formatter/byte.go)
- [https://github.com/duke-git/lancet/blob/main/formatter/address.go](https://github.com/duke-git/lancet/blob/main/formatter/address.go)
<div STYLE="page-break-after: always;"></div>
@@ -30,6 +31,8 @@ import (
- [BinaryBytes](#BinaryBytes)
- [ParseDecimalBytes](#ParseDecimalBytes)
- [ParseBinaryBytes](#ParseBinaryBytes)
- [ParseCNAddress](#ParseCNAddress)
- [ParsePersonInfo](#ParsePersonInfo)
<div STYLE="page-break-after: always;"></div>
@@ -308,3 +311,134 @@ func main() {
// 12492
}
```
### <span id="ParseCNAddress">ParseCNAddress</span>
<p>智能解析中国地址字符串并提取结构化信息。可以解析带或不带用户信息(姓名、电话、身份证等)的地址。当 withUser 为 true 时,从地址字符串中提取用户信息。当 withUser 为 false 时,仅解析位置信息。支持多种地址格式:标准格式、紧凑格式、带关键词格式、县级市格式等。</p>
<b>函数签名:</b>
```go
func ParseCNAddress(str string, withUser bool) *AddressInfo
```
<b>示例:<span style="float:right;display:inline-block;">[运行](https://go.dev/play/p/o5l09hQopEV)</span></b>
```go
package main
import (
"encoding/json"
"fmt"
"github.com/duke-git/lancet/v2/formatter"
)
func main() {
// 解析包含用户信息的完整地址
result1 := formatter.ParseCNAddress("张三 13800138000 北京市朝阳区建国路1号", true)
jsonData1, _ := json.MarshalIndent(result1, "", " ")
fmt.Println("示例 1 - 带用户信息:")
fmt.Println(string(jsonData1))
// 仅解析地址,不提取用户信息
result2 := formatter.ParseCNAddress("北京市海淀区中关村大街1号", false)
fmt.Printf("\n示例 2 - 仅地址:\n")
fmt.Printf("省: %s, 市: %s, 区: %s, 街道: %s\n",
result2.Province, result2.City, result2.Region, result2.Street)
// 解析县级市地址
result3 := formatter.ParseCNAddress("河北省石家庄市新乐市经济开发区兴工街10号", false)
fmt.Printf("\n示例 3 - 县级市:\n")
fmt.Printf("省: %s, 市: %s, 区/县: %s, 街道: %s\n",
result3.Province, result3.City, result3.Region, result3.Street)
// 紧凑格式
result4 := formatter.ParseCNAddress("马云13593464918陕西省西安市雁塔区丈八沟街道", true)
fmt.Printf("\n示例 4 - 紧凑格式:\n")
fmt.Printf("姓名: %s, 电话: %s, 地址: %s%s%s%s\n",
result4.Name, result4.Mobile, result4.Province, result4.City, result4.Region, result4.Street)
// Output:
// 示例 1 - 带用户信息:
// {
// "name": "张三",
// "mobile": "13800138000",
// "idn": "",
// "postcode": "",
// "province": "北京",
// "city": "北京市",
// "region": "朝阳区",
// "street": "建国路1号",
// "addr": "北京市朝阳区建国路1号"
// }
//
// 示例 2 - 仅地址:
// 省: 北京, 市: 北京市, 区: 海淀区, 街道: 中关村大街1号
//
// 示例 3 - 县级市:
// 省: 河北省, 市: 石家庄市, 区/县: 新乐市, 街道: 经济开发区兴工街10号
//
// 示例 4 - 紧凑格式:
// 姓名: 马云, 电话: 13593464918, 地址: 陕西省西安市雁塔区丈八沟街道
}
```
### <span id="ParsePersonInfo">ParsePersonInfo</span>
<p>从地址字符串中提取用户信息(姓名、电话、身份证、邮编)。将个人信息与地址分离,支持带标签格式、紧凑格式、带分隔符格式。返回包含提取的用户信息和清理后地址字符串的 AddressInfo。</p>
<b>函数签名:</b>
```go
func ParsePersonInfo(str string) *AddressInfo
```
<b>示例:<span style="float:right;display:inline-block;">[运行](https://go.dev/play/p/JO-uTlJlTy7)</span></b>
```go
package main
import (
"encoding/json"
"fmt"
"github.com/duke-git/lancet/v2/formatter"
)
func main() {
// 提取姓名和手机号
result1 := formatter.ParsePersonInfo("张三 13800138000 北京市朝阳区")
fmt.Println("示例 1 - 姓名和手机号:")
fmt.Printf("姓名: %s, 手机: %s, 地址: %s\n", result1.Name, result1.Mobile, result1.Addr)
// 提取身份证号
result2 := formatter.ParsePersonInfo("李四 110101199001011234 上海市")
fmt.Println("\n示例 2 - 身份证号:")
fmt.Printf("姓名: %s, 身份证: %s, 地址: %s\n", result2.Name, result2.IDN, result2.Addr)
// 带标签格式
result3 := formatter.ParsePersonInfo("收货人:王五 电话13900139000 收货地址天津市河西区友谊路20号")
jsonData3, _ := json.MarshalIndent(result3, "", " ")
fmt.Println("\n示例 3 - 带标签格式:")
fmt.Println(string(jsonData3))
// Output:
// 示例 1 - 姓名和手机号:
// 姓名: 张三, 手机: 13800138000, 地址: 北京市朝阳区
//
// 示例 2 - 身份证号:
// 姓名: 李四, 身份证: 110101199001011234, 地址: 上海市
//
// 示例 3 - 带标签格式:
// {
// "name": "王五",
// "mobile": "13900139000",
// "idn": "",
// "postcode": "",
// "province": "",
// "city": "",
// "region": "",
// "street": "",
// "addr": "天津市河西区友谊路20号"
// }
}
```

View File

@@ -8,6 +8,7 @@ formatter contains some functions for data formatting.
- [https://github.com/duke-git/lancet/blob/main/formatter/formatter.go](https://github.com/duke-git/lancet/blob/main/formatter/formatter.go)
- [https://github.com/duke-git/lancet/blob/main/formatter/byte.go](https://github.com/duke-git/lancet/blob/main/formatter/byte.go)
- [https://github.com/duke-git/lancet/blob/main/formatter/address.go](https://github.com/duke-git/lancet/blob/main/formatter/address.go)
<div STYLE="page-break-after: always;"></div>
@@ -30,6 +31,8 @@ import (
- [BinaryBytes](#BinaryBytes)
- [ParseDecimalBytes](#ParseDecimalBytes)
- [ParseBinaryBytes](#ParseBinaryBytes)
- [ParseCNAddress](#ParseCNAddress)
- [ParsePersonInfo](#ParsePersonInfo)
<div STYLE="page-break-after: always;"></div>
@@ -308,3 +311,134 @@ func main() {
// 12492
}
```
### <span id="ParseCNAddress">ParseCNAddress</span>
<p>Parses a Chinese address string intelligently and extracts structured information. It can parse addresses with or without user information (name, phone, ID card, etc.). When withUser is true, it extracts user information from the address string. When withUser is false, it only parses the location information. Supports various address formats: standard format, compact format, labeled format, county-level cities format, etc.</p>
<b>Signature:</b>
```go
func ParseCNAddress(str string, withUser bool) *AddressInfo
```
<b>Example:<span style="float:right;display:inline-block;">[Run](https://go.dev/play/p/o5l09hQopEV)</span></b>
```go
package main
import (
"encoding/json"
"fmt"
"github.com/duke-git/lancet/v2/formatter"
)
func main() {
// Parse complete address with user information
result1 := formatter.ParseCNAddress("张三 13800138000 北京市朝阳区建国路1号", true)
jsonData1, _ := json.MarshalIndent(result1, "", " ")
fmt.Println("Example 1 - With user info:")
fmt.Println(string(jsonData1))
// Parse address only, without extracting user information
result2 := formatter.ParseCNAddress("北京市海淀区中关村大街1号", false)
fmt.Printf("\nExample 2 - Address only:\n")
fmt.Printf("Province: %s, City: %s, Region: %s, Street: %s\n",
result2.Province, result2.City, result2.Region, result2.Street)
// Parse county-level city address
result3 := formatter.ParseCNAddress("河北省石家庄市新乐市经济开发区兴工街10号", false)
fmt.Printf("\nExample 3 - County-level city:\n")
fmt.Printf("Province: %s, City: %s, Region: %s, Street: %s\n",
result3.Province, result3.City, result3.Region, result3.Street)
// Compact format
result4 := formatter.ParseCNAddress("马云13593464918陕西省西安市雁塔区丈八沟街道", true)
fmt.Printf("\nExample 4 - Compact format:\n")
fmt.Printf("Name: %s, Phone: %s, Address: %s%s%s%s\n",
result4.Name, result4.Mobile, result4.Province, result4.City, result4.Region, result4.Street)
// Output:
// Example 1 - With user info:
// {
// "name": "张三",
// "mobile": "13800138000",
// "idn": "",
// "postcode": "",
// "province": "北京",
// "city": "北京市",
// "region": "朝阳区",
// "street": "建国路1号",
// "addr": "北京市朝阳区建国路1号"
// }
//
// Example 2 - Address only:
// Province: 北京, City: 北京市, Region: 海淀区, Street: 中关村大街1号
//
// Example 3 - County-level city:
// Province: 河北省, City: 石家庄市, Region: 新乐市, Street: 经济开发区兴工街10号
//
// Example 4 - Compact format:
// Name: 马云, Phone: 13593464918, Address: 陕西省西安市雁塔区丈八沟街道
}
```
### <span id="ParsePersonInfo">ParsePersonInfo</span>
<p>Extracts user information (name, phone, ID card, postal code) from an address string. It separates personal information from the address, supporting labeled format, compact format, and formats with separators. Returns an AddressInfo with extracted user information and cleaned address string.</p>
<b>Signature:</b>
```go
func ParsePersonInfo(str string) *AddressInfo
```
<b>Example:<span style="float:right;display:inline-block;">[Run](https://go.dev/play/p/JO-uTlJlTy7)</span></b>
```go
package main
import (
"encoding/json"
"fmt"
"github.com/duke-git/lancet/v2/formatter"
)
func main() {
// Extract name and phone
result1 := formatter.ParsePersonInfo("张三 13800138000 北京市朝阳区")
fmt.Println("Example 1 - Name and phone:")
fmt.Printf("Name: %s, Phone: %s, Address: %s\n", result1.Name, result1.Mobile, result1.Addr)
// Extract ID card number
result2 := formatter.ParsePersonInfo("李四 110101199001011234 上海市")
fmt.Println("\nExample 2 - ID card number:")
fmt.Printf("Name: %s, ID Card: %s, Address: %s\n", result2.Name, result2.IDN, result2.Addr)
// Labeled format
result3 := formatter.ParsePersonInfo("收货人:王五 电话13900139000 收货地址天津市河西区友谊路20号")
jsonData3, _ := json.MarshalIndent(result3, "", " ")
fmt.Println("\nExample 3 - Labeled format:")
fmt.Println(string(jsonData3))
// Output:
// Example 1 - Name and phone:
// Name: 张三, Phone: 13800138000, Address: 北京市朝阳区
//
// Example 2 - ID card number:
// Name: 李四, ID Card: 110101199001011234, Address: 上海市
//
// Example 3 - Labeled format:
// {
// "name": "王五",
// "mobile": "13900139000",
// "idn": "",
// "postcode": "",
// "province": "",
// "city": "",
// "region": "",
// "street": "",
// "addr": "天津市河西区友谊路20号"
// }
}
```

View File

@@ -24,9 +24,10 @@ import (
"strings"
"sync"
"github.com/duke-git/lancet/v2/validator"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
"github.com/duke-git/lancet/v2/validator"
)
// FileReader is a reader supporting offset seeking and reading one
@@ -283,21 +284,18 @@ func ReadFileByLine(path string) ([]string, error) {
}
defer f.Close()
scanner := bufio.NewScanner(f)
result := make([]string, 0)
buf := bufio.NewReader(f)
for {
line, _, err := buf.ReadLine()
l := string(line)
if err == io.EOF {
break
}
if err != nil {
continue
}
for scanner.Scan() {
l := scanner.Text()
result = append(result, l)
}
if err := scanner.Err(); err != nil {
return nil, err
}
return result, nil
}

494
formatter/address.go Normal file
View File

@@ -0,0 +1,494 @@
// Package formatter implements functions for formatting strings and structs.
package formatter
import (
"regexp"
"strings"
"unicode/utf8"
)
// AddressInfo holds the result of parsing a Chinese address string: the
// recipient's personal details together with the province / city / district
// breakdown of the location.
type AddressInfo struct {
Name string `json:"name"` // Name of the recipient.
Mobile string `json:"mobile"` // Mobile phone number or landline number.
IDN string `json:"idn"` // ID card number.
Postcode string `json:"postcode"` // Postal code.
Province string `json:"province"` // Province.
City string `json:"city"` // City.
Region string `json:"region"` // District or county.
Street string `json:"street"` // Detailed street address.
Addr string `json:"addr"` // Address text; ParsePersonInfo stores here what is left after the user details have been removed.
}
// fuzzyResult is the output of the heuristic address split performed by fuzz,
// before the fragments are resolved against the region tables.
type fuzzyResult struct {
A1 string // Province-level fragment.
A2 string // City-level fragment.
A3 string // District/county-level fragment.
Street string // Street address fragment.
}
// ParseCNAddress parses a Chinese address string and returns its structured
// form.
//
// When withUser is true the input is first run through ParsePersonInfo, so the
// name, phone number, ID card number and postal code are split off and only
// the remaining text is treated as the address. When withUser is false the
// whole input is treated as the address.
//
// Supported inputs include:
//   - Standard format: "Province City District Street"
//   - Compact format: "Name Phone Province City District Street"
//   - Labeled format: "Name: xxx Phone: xxx Address: xxx"
//   - County-level cities: "Province City CountyCity ..." (e.g. "河北省石家庄市新乐市")
//
// Province, City and Region come from the fuzz and parse helpers. Street is
// the text that follows the first occurrence of the resolved Region in the
// address (falling back to the street reported by fuzz), with repeated
// region / city / province names and the "自治区直辖县级市" placeholder removed.
func ParseCNAddress(str string, withUser bool) *AddressInfo {
	var info *AddressInfo
	if withUser {
		info = ParsePersonInfo(str)
	} else {
		info = &AddressInfo{Addr: str}
	}

	guess := fuzz(info.Addr)
	resolved := parse(guess.A1, guess.A2, guess.A3)
	info.Province, info.City, info.Region = resolved.Province, resolved.City, resolved.Region

	// Locate the resolved district inside the address text (rune offset).
	regionAt := -1
	if info.Region != "" && info.Addr != "" {
		regionAt = mbStrpos(info.Addr, info.Region)
	}

	if regionAt != -1 {
		// Everything after the district name is the street address. Work on
		// runes so multi-byte characters are sliced correctly.
		addrRunes := []rune(info.Addr)
		if tail := regionAt + len([]rune(info.Region)); tail < len(addrRunes) {
			info.Street = string(addrRunes[tail:])
		}
	} else if guess.Street != "" {
		// District unknown or not present in the text: use fuzz's street.
		info.Street = guess.Street
	}

	// Drop repeated district/city/province names and what is left of the
	// "自治区直辖县级市" placeholder from the street, in this exact order.
	for _, dup := range []string{info.Region, info.City, info.Province, "自治区直辖县级市", "直辖县级市"} {
		info.Street = strings.ReplaceAll(info.Street, dup, "")
	}
	info.Street = strings.TrimSpace(info.Street)

	return info
}
// Regular expressions used by ParsePersonInfo, compiled once at package
// initialization instead of on every call. The label patterns accept both the
// ASCII colon and the full-width colon.
var (
	personNameLabelRe  = regexp.MustCompile(`(?:姓名|收货人|收件人)[:：]\s*([^\s\d\n]+)`)
	personPhoneLabelRe = regexp.MustCompile(`(?:电话|手机号码|手机|联系电话)[:：]\s*([\d\-]+)`)
	personAddrLabelRe  = regexp.MustCompile(`(?:详细地址|收货地址|地址)[:：]\s*([^\n]+)`)
	personCompactRe    = regexp.MustCompile(`^([\x{4e00}-\x{9fa5}]{2,4})(\d{7,12})(.*)$`)
	personSpacesRe     = regexp.MustCompile(`\s{1,}`)
	personLandlineRe   = regexp.MustCompile(`(\d{3,4})-(\d{6,8})`)
	personIDNRe        = regexp.MustCompile(`\d{18}|\d{17}[Xx]`)
	personMobileRe     = regexp.MustCompile(`\d{7,12}`)
	personPostcodeRe   = regexp.MustCompile(`\d{6}`)
)

// personKeywordReplacer turns address keywords and punctuation into spaces in
// a single pass. Longer keywords are listed before the shorter keywords they
// contain (e.g. "手机号码" before "手机"), so the result is deterministic and
// never leaves a fragment such as "号码" behind.
var personKeywordReplacer = strings.NewReplacer(
	"身份证号码", " ",
	"手机号码", " ",
	"收货地址", " ",
	"详细地址", " ",
	"所在地区", " ",
	"联系电话", " ",
	"身份证号", " ",
	"身份证", " ",
	"收货人", " ",
	"收件人", " ",
	"收货", " ",
	"地址", " ",
	"邮编", " ",
	"电话", " ",
	"姓名", " ",
	"手机", " ",
	"：", " ", ":", " ",
	"；", " ", ";", " ",
	"，", " ", ",", " ",
	"。", " ",
	"\n", " ", "\r", " ",
)

// ParsePersonInfo extracts user information (name, phone, ID card number,
// postal code) from an address string and separates it from the address.
//
// Supported inputs:
//   - Labeled format: "姓名:xxx 电话:xxx 地址:xxx"
//   - Compact format: name immediately followed by the phone number and the
//     address (e.g. "张三13800138000北京市朝阳区")
//   - Separated format: colons, commas, semicolons or newlines as delimiters
//
// The returned AddressInfo has Name, Mobile, IDN and Postcode filled in where
// found, and Addr set to the text that remains after removing them. The
// location fields (Province, City, Region, Street) are left empty.
func ParsePersonInfo(str string) *AddressInfo {
	info := &AddressInfo{}

	// Labeled name: "姓名:xxx", "收货人:xxx", "收件人:xxx".
	if m := personNameLabelRe.FindStringSubmatch(str); len(m) > 1 {
		info.Name = strings.TrimSpace(m[1])
		str = personNameLabelRe.ReplaceAllString(str, " ")
	}

	// Labeled mobile or landline number: "电话:xxx", "手机:xxx", "联系电话:xxx".
	if m := personPhoneLabelRe.FindStringSubmatch(str); len(m) > 1 {
		info.Mobile = strings.TrimSpace(m[1])
		str = personPhoneLabelRe.ReplaceAllString(str, " ")
	}

	// Labeled address: drop the label but keep the value in the string. The
	// "${1}" template keeps each match's own text; building the template from
	// the first match would expand any '$' in the address and overwrite
	// later matches with the first one.
	str = personAddrLabelRe.ReplaceAllString(str, " ${1}")

	// Nothing labeled was found: try the compact "name+digits+address" form,
	// i.e. 2-4 Chinese characters, 7-12 digits, then the rest.
	if info.Name == "" && info.Mobile == "" {
		if m := personCompactRe.FindStringSubmatch(str); len(m) > 3 {
			info.Name, info.Mobile, str = m[1], m[2], m[3]
		}
	}

	// Remaining keywords (including "所在地区") and punctuation become spaces,
	// so a labeled value itself stays in the string for address parsing.
	str = personKeywordReplacer.Replace(str)
	str = personSpacesRe.ReplaceAllString(str, " ")

	// Join landline numbers such as "800-8585222" into one digit run.
	str = personLandlineRe.ReplaceAllString(str, "${1}${2}")

	// ID card number: 18 digits, or 17 digits followed by X.
	if idn := personIDNRe.FindString(str); idn != "" {
		info.IDN = strings.ToUpper(idn)
		str = strings.ReplaceAll(str, idn, "")
	}

	// Phone number: take the first 7-12 digit run unless a labeled or compact
	// number was already found, in which case just remove that number.
	if info.Mobile == "" {
		if mobile := personMobileRe.FindString(str); mobile != "" {
			info.Mobile = mobile
			str = strings.ReplaceAll(str, mobile, "")
		}
	} else {
		str = strings.ReplaceAll(str, info.Mobile, "")
	}

	// Postal code: the first remaining 6-digit run.
	if postcode := personPostcodeRe.FindString(str); postcode != "" {
		info.Postcode = postcode
		str = strings.ReplaceAll(str, postcode, "")
	}

	str = strings.TrimSpace(personSpacesRe.ReplaceAllString(str, " "))

	// Name fallback: start with the first non-empty word, then prefer any
	// strictly shorter word that still has at least two characters.
	if info.Name == "" {
		for _, word := range strings.Split(str, " ") {
			word = strings.TrimSpace(word)
			if word == "" {
				continue
			}
			n := utf8.RuneCountInString(word)
			if info.Name == "" || (n >= 2 && n < utf8.RuneCountInString(info.Name)) {
				info.Name = word
			}
		}
	}
	if info.Name != "" {
		str = strings.TrimSpace(strings.ReplaceAll(str, info.Name, ""))
	}

	info.Addr = str
	return info
}
// fuzz heuristically splits an address string into a city-level fragment (A2),
// a district/county-level fragment (A3) and a street fragment, based on the
// position of administrative suffix characters ("市", "区", "县", "旗", "盟",
// "州"). A1 is always returned empty.
//
// All positions are rune offsets into a normalized copy of addr, from which
// spaces and commas are removed, "自治区直辖县级市" is rewritten to "市",
// "自治区" to "省", "自治州" to "州", and "小区", "校区" and "市辖区" are deleted.
//
// NOTE(review): Street is sliced from the original string using offsets that
// were computed on the normalized copy, so it can be shifted whenever
// normalization changed the length of the text before the keyword.
func fuzz(addr string) *fuzzyResult {
addrOrigin := addr
addr = strings.ReplaceAll(addr, " ", "")
addr = strings.ReplaceAll(addr, ",", "")
// Rewrite "自治区直辖县级市" to "市" first, so the "自治区" replacement below
// does not mangle it.
addr = strings.ReplaceAll(addr, "自治区直辖县级市", "市")
addr = strings.ReplaceAll(addr, "自治区", "省")
addr = strings.ReplaceAll(addr, "自治州", "州")
addr = strings.ReplaceAll(addr, "小区", "")
addr = strings.ReplaceAll(addr, "校区", "")
// Drop "市辖区": it is an administrative placeholder, not a real district name.
addr = strings.ReplaceAll(addr, "市辖区", "")
a1 := ""
a2 := ""
a3 := ""
street := ""
deep3KeywordPos := -1
// Check whether the address contains a county ("县"), district ("区") or
// banner ("旗") keyword.
countyPos := mbStrpos(addr, "县")
districtPos := mbStrpos(addr, "区")
bannerPos := mbStrpos(addr, "旗")
// The keyword only has to be present; its position is not restricted.
hasEarlyCounty := countyPos != -1
hasEarlyDistrict := districtPos != -1
hasEarlyBanner := bannerPos != -1
if hasEarlyCounty || hasEarlyDistrict || hasEarlyBanner {
// First check for a county-level city (e.g. "新乐市"): when both "XX市" and
// "XX区"/"XX县" are present, the "市" is handled with priority.
hasCountyLevelCity := false
if mbStrstr(addr, "市") {
// Count the occurrences of "市".
cityCount := mbSubstrCount(addr, "市")
if cityCount >= 2 {
// Locate the second "市", which may belong to a county-level city.
firstCityPos := mbStrpos(addr, "市")
// Continue searching after the first "市".
addrAfterFirstCity := mbSubstr(addr, firstCityPos+1, utf8.RuneCountInString(addr)-firstCityPos-1)
secondCityPos := mbStrpos(addrAfterFirstCity, "市")
if secondCityPos != -1 {
secondCityAbsPos := firstCityPos + 1 + secondCityPos
// Check whether a "区" or "县" follows the second "市".
addrAfterSecondCity := mbSubstr(addr, secondCityAbsPos+1, utf8.RuneCountInString(addr)-secondCityAbsPos-1)
if mbStrstr(addrAfterSecondCity, "区") || mbStrstr(addrAfterSecondCity, "县") {
// Text after the first "市" up to and including the second "市".
betweenCities := mbSubstr(addr, firstCityPos+1, secondCityAbsPos-firstCityPos)
// Detect repeated place names (e.g. "北京市北京市" or "杭州市西湖区杭州市"):
// if the text between the two "市" contains "区" or "县", this is a
// repeated name rather than a county-level city.
if !mbStrstr(betweenCities, "区") && !mbStrstr(betweenCities, "县") {
// Everything up to and including the first "市".
firstCityFull := mbSubstr(addr, 0, firstCityPos+1)
if betweenCities != firstCityFull {
// Not a repeated name, so treat it as a county-level city.
a3 = betweenCities
deep3KeywordPos = secondCityAbsPos
hasCountyLevelCity = true
}
}
}
}
}
}
if !hasCountyLevelCity {
// The banner, district and county steps below run in sequence; a later
// step that matches overwrites a3 and deep3KeywordPos from an earlier one.
// Banner ("旗"): take the two characters ending at "旗".
if mbStrstr(addr, "旗") {
deep3KeywordPos = mbStrpos(addr, "旗")
a3 = mbSubstr(addr, deep3KeywordPos-1, 2)
}
// District ("区").
if mbStrstr(addr, "区") {
// Use the first "区" to avoid interference from repeated names such as
// "西湖区杭州市西湖区".
deep3KeywordPos = mbStrpos(addr, "区")
if mbStrstr(addr, "市") {
// Strategy: find the last "市" before the "区", so that a "市" inside the
// detailed address (e.g. "农贸市场") does not interfere.
zonePos := deep3KeywordPos
// Search for the last "市" in the text before the "区".
addrBeforeZone := mbSubstr(addr, 0, zonePos)
cityPos := mbStrripos(addrBeforeZone, "市")
if cityPos != -1 {
a3 = mbSubstr(addr, cityPos+1, zonePos-cityPos)
} else {
// No "市" before the "区": fall back to the three characters ending at "区".
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
}
} else {
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
}
}
// County ("县").
if mbStrstr(addr, "县") {
// Use the first "县" to avoid interference from repeated names.
deep3KeywordPos = mbStrpos(addr, "县")
if mbStrstr(addr, "市") {
// Search for the last "市" in the text before the "县".
addrBeforeCounty := mbSubstr(addr, 0, deep3KeywordPos)
cityPos := mbStrripos(addrBeforeCounty, "市")
if cityPos != -1 {
a3 = mbSubstr(addr, cityPos+1, deep3KeywordPos-cityPos)
} else {
if mbStrstr(addr, "自治县") {
// Autonomous county: take the seven characters ending at "县", then
// strip a leading "省"/"市"/"州" that belongs to the parent level.
a3 = mbSubstr(addr, deep3KeywordPos-6, 7)
firstChar := mbSubstr(a3, 0, 1)
if firstChar == "省" || firstChar == "市" || firstChar == "州" {
a3 = mbSubstr(a3, 1, utf8.RuneCountInString(a3)-1)
}
} else {
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
}
}
} else {
if mbStrstr(addr, "自治县") {
// Same autonomous-county handling as above, for addresses without "市".
a3 = mbSubstr(addr, deep3KeywordPos-6, 7)
firstChar := mbSubstr(a3, 0, 1)
if firstChar == "省" || firstChar == "市" || firstChar == "州" {
a3 = mbSubstr(a3, 1, utf8.RuneCountInString(a3)-1)
}
} else {
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
}
}
}
}
if deep3KeywordPos != -1 {
// The street is whatever follows the district-level keyword.
street = mbSubstr(addrOrigin, deep3KeywordPos+1, utf8.RuneCountInString(addrOrigin)-deep3KeywordPos-1)
}
} else {
// No county/district/banner keyword: fall back to the last "市", which is
// then treated as the district-level unit.
if mbStrripos(addr, "市") != -1 {
cityCount := mbSubstrCount(addr, "市")
// Both branches below perform the same extraction.
if cityCount == 1 {
deep3KeywordPos = mbStrripos(addr, "市")
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
street = mbSubstr(addrOrigin, deep3KeywordPos+1, utf8.RuneCountInString(addrOrigin)-deep3KeywordPos-1)
} else if cityCount >= 2 {
deep3KeywordPos = mbStrripos(addr, "市")
a3 = mbSubstr(addr, deep3KeywordPos-2, 3)
street = mbSubstr(addrOrigin, deep3KeywordPos+1, utf8.RuneCountInString(addrOrigin)-deep3KeywordPos-1)
}
} else {
a3 = ""
street = addr
}
}
// Extract the city-level fragment.
if mbStrpos(addr, "市") != -1 || mbStrstr(addr, "盟") || mbStrstr(addr, "州") {
tmpPos := -1
if tmpPos = mbStrpos(addr, "市"); tmpPos != -1 {
// Use the first "市" to avoid interference from repeated names such as
// "杭州市西湖区杭州市". Look backwards for "省": if present the city starts
// right after it, otherwise at the beginning of the string.
addrBeforeCity := mbSubstr(addr, 0, tmpPos)
provincePos := mbStrripos(addrBeforeCity, "省")
startPos := 0
if provincePos != -1 {
startPos = provincePos + 1
}
a2 = mbSubstr(addr, startPos, tmpPos-startPos+1)
} else if tmpPos = mbStrpos(addr, "盟"); tmpPos != -1 {
// League ("盟"): take the three characters ending at "盟".
a2 = mbSubstr(addr, tmpPos-2, 3)
} else if mbStrpos(addr, "州") != -1 {
if tmpPos = mbStrpos(addr, "自治州"); tmpPos != -1 {
a2 = mbSubstr(addr, tmpPos-4, 5)
} else {
tmpPos = mbStrpos(addr, "州")
a2 = mbSubstr(addr, tmpPos-2, 3)
}
}
}
return &fuzzyResult{
A1: a1,
A2: a2,
A3: a3,
Street: street,
}
}
// parse resolves the fuzzy city-level fragment a2 and district-level fragment
// a3 against the region tables (A1Data, A2Data, A3Data) and returns an
// AddressInfo with Province, City and Region filled in where they could be
// determined. a1 is currently unused.
//
// Resolution rules:
//   - a3 empty: nothing is resolved.
//   - a3 matches exactly one district: its city and province are looked up
//     through the parent IDs.
//   - a3 matches several districts: a2 is used to pick the one whose parent
//     city name contains a2; without a2 only Region is set, to the raw a3.
//   - a3 matches nothing and a2 == a3: the fragment is treated as a city.
//
// Whenever several candidates remain, the one with the lowest ID is chosen so
// that the result does not depend on map iteration order.
func parse(a1, a2, a3 string) *AddressInfo {
	r := &AddressInfo{}
	if a3 == "" {
		return r
	}

	// Collect every district whose name contains a3.
	area3Matches := make(map[int]*Region)
	for id, v := range A3Data {
		if mbStrpos(v.Name, a3) != -1 {
			area3Matches[id] = v
		}
	}

	switch {
	case len(area3Matches) > 1 && a2 == "":
		// Ambiguous and no city to disambiguate with: report the raw fragment.
		r.Region = a3

	case len(area3Matches) > 1:
		// Ambiguous: keep only districts whose parent city matches a2.
		area2Matches := make(map[int]*Region)
		for id, v := range A2Data {
			if mbStrpos(v.Name, a2) != -1 {
				area2Matches[id] = v
			}
		}
		bestID, found := 0, false
		for id, v := range area3Matches {
			city, ok := area2Matches[v.PID]
			if !ok || (found && id > bestID) {
				continue
			}
			bestID, found = id, true
			r.City = city.Name
			r.Region = v.Name
			// Reset so a province from a previously considered candidate
			// cannot leak into this one.
			r.Province = ""
			if province, ok := A1Data[city.PID]; ok {
				r.Province = province.Name
			}
		}

	case len(area3Matches) == 1:
		// Unique match: walk up the parent chain.
		for _, v := range area3Matches {
			r.Region = v.Name
			if city, ok := A2Data[v.PID]; ok {
				r.City = city.Name
				if province, ok := A1Data[city.PID]; ok {
					r.Province = province.Name
				}
			}
		}

	case a2 == a3:
		// No district matched, but the district fragment equals the city
		// fragment: resolve it as a city instead.
		bestID, found, provinceID := 0, false, 0
		for id, v := range A2Data {
			if mbStrpos(v.Name, a2) == -1 || (found && id > bestID) {
				continue
			}
			bestID, found = id, true
			r.City = v.Name
			provinceID = v.PID
		}
		if found {
			if province, ok := A1Data[provinceID]; ok {
				r.Province = province.Name
			}
		}
	}

	return r
}

4470
formatter/address_data.go Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,75 @@
package formatter
import (
"strings"
"unicode/utf8"
)
// mbStrpos returns the position of the first occurrence of needle in
// haystack, counted in UTF-8 characters (runes) rather than bytes.
// It returns 0 for an empty needle and -1 when needle does not occur.
func mbStrpos(haystack, needle string) int {
	if len(needle) == 0 {
		return 0
	}
	byteOffset := strings.Index(haystack, needle)
	if byteOffset < 0 {
		return -1
	}
	// Convert the byte offset into a rune offset.
	prefix := haystack[:byteOffset]
	return utf8.RuneCountInString(prefix)
}
// mbStrripos returns the position of the last occurrence of needle in
// haystack, counted in UTF-8 characters (runes) rather than bytes.
// It returns the rune length of haystack for an empty needle and -1 when
// needle does not occur. The comparison is case-sensitive.
func mbStrripos(haystack, needle string) int {
	byteOffset := strings.LastIndex(haystack, needle)
	switch {
	case needle == "":
		return utf8.RuneCountInString(haystack)
	case byteOffset == -1:
		return -1
	}
	// Convert the byte offset into a rune offset.
	return utf8.RuneCountInString(haystack[:byteOffset])
}
// mbStrstr reports whether haystack contains needle.
func mbStrstr(haystack, needle string) bool {
	found := strings.Index(haystack, needle) >= 0
	return found
}
// mbSubstr returns the part of str that starts at rune index start and spans
// at most length runes.
//
//   - start: zero-based start position; a negative value counts back from the
//     end of the string and is clamped to 0.
//   - length: number of characters to take; the result is truncated at the
//     end of the string.
//
// An empty string is returned when start lies beyond the end of str or when
// length is negative.
func mbSubstr(str string, start, length int) string {
	chars := []rune(str)
	total := len(chars)

	if start < 0 {
		// Negative start is relative to the end of the string.
		if start += total; start < 0 {
			start = 0
		}
	}
	if start >= total {
		return ""
	}

	stop := start + length
	switch {
	case stop < start:
		return ""
	case stop > total:
		stop = total
	}
	return string(chars[start:stop])
}
// mbSubstrCount returns the number of non-overlapping occurrences of needle
// in haystack. An empty needle yields 0.
func mbSubstrCount(haystack, needle string) int {
	if len(needle) > 0 {
		return strings.Count(haystack, needle)
	}
	return 0
}

360
formatter/address_test.go Normal file
View File

@@ -0,0 +1,360 @@
package formatter
import (
"encoding/json"
"testing"
)
// TestParseCNAddress runs ParseCNAddress over a table of representative
// addresses. For every case each non-empty field of want is compared with the
// parsed result; empty fields in want are not checked.
func TestParseCNAddress(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		withUser bool
		want     *AddressInfo
	}{
		{
			name:     "完整地址信息",
			input:    "张三 13800138000 北京市朝阳区建国路1号",
			withUser: true,
			want: &AddressInfo{
				Name:     "张三",
				Mobile:   "13800138000",
				Province: "北京",
				City:     "北京市",
				Region:   "朝阳区",
				Street:   "建国路1号",
			},
		},
		{
			name:     "带身份证和邮编",
			input:    "李四 18612345678 110101199001011234 100000 上海市浦东新区世纪大道100号",
			withUser: true,
			want: &AddressInfo{
				Name:     "李四",
				Mobile:   "18612345678",
				IDN:      "110101199001011234",
				Postcode: "100000",
			},
		},
		{
			name:     "仅地址不含用户信息",
			input:    "北京市海淀区中关村大街1号",
			withUser: false,
			want: &AddressInfo{
				Province: "北京",
				City:     "北京市",
				Region:   "海淀区",
				Street:   "中关村大街1号",
			},
		},
		{
			name:     "带收货关键词",
			input:    "收货人:王五 电话13900139000 收货地址天津市河西区友谊路20号",
			withUser: true,
			want: &AddressInfo{
				Name:     "王五",
				Mobile:   "13900139000",
				Province: "天津",
				City:     "天津市",
				Region:   "河西区",
			},
		},
		{
			name:     "紧凑格式地址",
			input:    "马云13593464918陕西省西安市雁塔区丈八沟街道高新四路南江国际",
			withUser: true,
			want: &AddressInfo{
				Name:     "马云",
				Mobile:   "13593464918",
				Province: "陕西省",
				City:     "西安市",
				Region:   "雁塔区",
				Street:   "丈八沟街道高新四路南江国际",
			},
		},
		{
			name:     "带座机号格式",
			input:    "姓名:马云\n联系电话800-8585222\n所在地区河北省石家庄市新华区\n详细地址:中华北大街68号鹿城商务中心6号楼1413室",
			withUser: true,
			want: &AddressInfo{
				Name:     "马云",
				Mobile:   "800-8585222",
				Province: "河北省",
				City:     "石家庄市",
				Region:   "新华区",
				Street:   "中华北大街68号鹿城商务中心6号楼1413室",
			},
		},
		{
			name:     "北京市重复格式",
			input:    "北京市北京市市辖区东城区",
			withUser: false,
			want: &AddressInfo{
				Province: "北京",
				City:     "北京市",
				Region:   "东城区",
				Street:   "",
			},
		},
		{
			name:     "河北省新乐市地址",
			input:    "河北省石家庄市新乐市经济开发区兴工街10号来优品仓库",
			withUser: false,
			want: &AddressInfo{
				Province: "河北省",
				City:     "石家庄市",
				Region:   "新乐市",
				Street:   "经济开发区兴工街10号来优品仓库",
			},
		},
		{
			name:     "江苏仪征市地址",
			input:    "江苏省扬州市仪征市真州镇解放东路99号",
			withUser: false,
			want: &AddressInfo{
				Province: "江苏省",
				City:     "扬州市",
				Region:   "仪征市",
				Street:   "真州镇解放东路99号",
			},
		},
		{
			name:     "新疆石河子市地址",
			input:    "新疆石河子市北三路25小区",
			withUser: false,
			want: &AddressInfo{
				Province: "新疆维吾尔自治区",
				City:     "自治区直辖县级市",
				Region:   "石河子市",
			},
		},
		{
			name:     "新疆石河子市-简化格式省+县级市",
			input:    "新疆维吾尔自治区石河子市",
			withUser: false,
			want: &AddressInfo{
				Province: "新疆维吾尔自治区",
				City:     "自治区直辖县级市",
				Region:   "石河子市",
				Street:   "",
			},
		},
		{
			name:     "新疆石河子市-完整行政区划表述",
			input:    "新疆维吾尔自治区自治区直辖县级市石河子市",
			withUser: false,
			want: &AddressInfo{
				Province: "新疆维吾尔自治区",
				City:     "自治区直辖县级市",
				Region:   "石河子市",
				Street:   "",
			},
		},
		{
			name:     "浙江杭州西湖区重复地址",
			input:    "浙江省杭州市西湖区杭州市西湖区人民政府109号",
			withUser: false,
			want: &AddressInfo{
				Province: "浙江省",
				City:     "杭州市",
				Region:   "西湖区",
				Street:   "人民政府109号",
			},
		},
		{
			name:     "湖南长沙市重复地址",
			input:    "湖南省长沙市岳麓区银盆岭街道长沙市人民政府长沙市政府大楼",
			withUser: false,
			want: &AddressInfo{
				Province: "湖南省",
				City:     "长沙市",
				Region:   "岳麓区",
				Street:   "银盆岭街道人民政府政府大楼",
			},
		},
	}

	for _, tt := range tests {
		tt := tt
		t.Run(tt.name, func(t *testing.T) {
			got := ParseCNAddress(tt.input, tt.withUser)

			// Log the full result to make failures easier to debug.
			jsonData, err := json.MarshalIndent(got, "", " ")
			if err != nil {
				t.Fatalf("marshal result: %v", err)
			}
			t.Logf("Result: %s", jsonData)

			// Every expectation that is spelled out in the table is verified,
			// including IDN, Postcode and Street.
			checks := []struct {
				field string
				got   string
				want  string
			}{
				{"Name", got.Name, tt.want.Name},
				{"Mobile", got.Mobile, tt.want.Mobile},
				{"IDN", got.IDN, tt.want.IDN},
				{"Postcode", got.Postcode, tt.want.Postcode},
				{"Province", got.Province, tt.want.Province},
				{"City", got.City, tt.want.City},
				{"Region", got.Region, tt.want.Region},
				{"Street", got.Street, tt.want.Street},
			}
			for _, c := range checks {
				if c.want != "" && c.got != c.want {
					t.Errorf("%s = %v, want %v", c.field, c.got, c.want)
				}
			}
		})
	}
}
// TestParsePersonInfo checks that ParsePersonInfo extracts the name together
// with a mobile number, an ID card number or a postal code.
func TestParsePersonInfo(t *testing.T) {
	// expectation names one field of the result and the value it must have.
	type expectation struct {
		field string
		read  func(*AddressInfo) string
		want  string
	}
	readName := func(a *AddressInfo) string { return a.Name }
	readMobile := func(a *AddressInfo) string { return a.Mobile }
	readIDN := func(a *AddressInfo) string { return a.IDN }
	readPostcode := func(a *AddressInfo) string { return a.Postcode }

	cases := []struct {
		label  string
		input  string
		expect []expectation
	}{
		{
			label: "提取姓名和手机号",
			input: "张三 13800138000 北京市朝阳区",
			expect: []expectation{
				{"Name", readName, "张三"},
				{"Mobile", readMobile, "13800138000"},
			},
		},
		{
			label: "提取身份证号",
			input: "李四 110101199001011234 上海市",
			expect: []expectation{
				{"Name", readName, "李四"},
				{"IDN", readIDN, "110101199001011234"},
			},
		},
		{
			label: "提取邮编",
			input: "王五 100000 天津市",
			expect: []expectation{
				{"Name", readName, "王五"},
				{"Postcode", readPostcode, "100000"},
			},
		},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.label, func(t *testing.T) {
			got := ParsePersonInfo(tc.input)
			dump, _ := json.MarshalIndent(got, "", " ")
			t.Logf("Result: %s", dump)
			for _, e := range tc.expect {
				if actual := e.read(got); actual != e.want {
					t.Errorf("%s = %v, want %s", e.field, actual, e.want)
				}
			}
		})
	}
}
// TestFuzz checks the heuristic split performed by fuzz. A2 and A3 are always
// compared; Street is compared only when the case specifies one.
func TestFuzz(t *testing.T) {
	type fuzzCase struct {
		label  string
		addr   string
		expect fuzzyResult
	}
	cases := []fuzzCase{
		{
			label:  "包含区",
			addr:   "北京市朝阳区建国路1号",
			expect: fuzzyResult{A2: "北京市", A3: "朝阳区", Street: "建国路1号"},
		},
		{
			label:  "包含县",
			addr:   "河北省石家庄市正定县",
			expect: fuzzyResult{A2: "石家庄市", A3: "正定县"},
		},
		{
			label:  "复杂街道地址",
			addr:   "浙江省杭州市拱墅区武林街道杭州锦麟宾馆中河片区",
			expect: fuzzyResult{A2: "杭州市", A3: "拱墅区", Street: "武林街道杭州锦麟宾馆中河片区"},
		},
		{
			label:  "北京市重复格式",
			addr:   "北京市北京市市辖区东城区",
			expect: fuzzyResult{A2: "北京市", A3: "东城区", Street: ""},
		},
		{
			label:  "详细地址包含市字",
			addr:   "北京市朝阳区建外大街1号国贸商城",
			expect: fuzzyResult{A2: "北京市", A3: "朝阳区", Street: "建外大街1号国贸商城"},
		},
		{
			label:  "详细地址真的包含市字",
			addr:   "北京市朝阳区农贸市场路1号",
			expect: fuzzyResult{A2: "北京市", A3: "朝阳区", Street: "农贸市场路1号"},
		},
		{
			label:  "河北省新乐市地址",
			addr:   "河北省石家庄市新乐市经济开发区兴工街10号来优品仓库",
			expect: fuzzyResult{A2: "石家庄市", A3: "新乐市", Street: "经济开发区兴工街10号来优品仓库"},
		},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.label, func(t *testing.T) {
			got := fuzz(tc.addr)
			dump, _ := json.MarshalIndent(got, "", " ")
			t.Logf("Result: %s", dump)

			if want := tc.expect.A2; got.A2 != want {
				t.Errorf("A2 = %v, want %v", got.A2, want)
			}
			if want := tc.expect.A3; got.A3 != want {
				t.Errorf("A3 = %v, want %v", got.A3, want)
			}
			// An empty expected street means "not checked".
			if want := tc.expect.Street; want != "" && got.Street != want {
				t.Errorf("Street = %v, want %v", got.Street, want)
			}
		})
	}
}
// ExampleParseCNAddress parses a full address that also carries the
// recipient's name and phone number, and prints the result as indented JSON.
// It has no "Output:" comment, so it is compiled but its output is not
// verified.
func ExampleParseCNAddress() {
	info := ParseCNAddress("张三 13800138000 北京市朝阳区建国路1号", true)
	pretty, _ := json.MarshalIndent(info, "", " ")
	println(string(pretty))
}
// ExampleParsePersonInfo separates the user details from a labeled address
// string and prints the result as indented JSON. It has no "Output:" comment,
// so it is compiled but its output is not verified.
func ExampleParsePersonInfo() {
	info := ParsePersonInfo("收货人:李四 电话18612345678 地址上海市浦东新区世纪大道100号")
	pretty, _ := json.MarshalIndent(info, "", " ")
	println(string(pretty))
}