analysis-pinyin
analysis-pinyin copied to clipboard
设置 ignore_pinyin_offset = false 后，分词得到的 token 数组排列顺序偶发不一致，导致数据无法 index。
报错提示: startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards startOffset=1,endOffset=2,lastStartOffset=4 for field 'nickname.pinyin'
创建索引的命令:
put /crm_fans_v1
{
"settings": {
"number_of_shards": 1,
"analysis": {
"analyzer": {
"phone_analyzer": {
"tokenizer": "phone_tokenizer"
},
"pinyin_analyzer": {
"tokenizer": "pinyin_tokenizer"
},
"pinyin_search_analyzer": {
"tokenizer": "pinyin_search_tokenizer"
}
},
"tokenizer": {
"phone_tokenizer": {
"type": "ngram",
"min_gram": 2,
"max_gram": 3,
"token_chars": [
"digit"
]
},
"pinyin_tokenizer": {
"type": "pinyin",
"keep_first_letter": false,
"keep_separate_first_letter": true,
"keep_full_pinyin": false,
"lowercase": false,
"keep_joined_full_pinyin": false,
"remove_duplicated_term": false,
"keep_none_chinese": true,
"keep_none_chinese_together": false,
"keep_none_chinese_in_first_letter": false,
"ignore_pinyin_offset": false,
"trim_whitespace": true
},
"pinyin_search_tokenizer": {
"type": "standard",
"max_token_length": 1
}
}
}
},
"mappings": {
"_doc": {
"properties": {
"id": {
"type": "long"
},
"busId": {
"type": "long"
},
"deleted": {
"type": "boolean"
},
"nickname": {
"type": "text",
"term_vector": "with_positions_offsets",
"fields": {
"keyword": {
"type": "keyword"
},
"pinyin": {
"type": "text",
"analyzer": "pinyin_analyzer",
"search_analyzer": "pinyin_search_analyzer",
"term_vector": "with_positions_offsets"
}
}
},
"mobilePhone": {
"type": "text",
"analyzer": "phone_analyzer",
"term_vector": "with_positions_offsets",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"gender": {
"type": "keyword"
},
"customerId": {
"type": "long"
},
"customerType": {
"type": "keyword"
},
"subscribeTime": {
"type": "long"
},
"profile": {
"type": "keyword"
},
"openId": {
"type": "keyword"
},
"totalSendCouponTimes": {
"type": "integer"
},
"currentMonthSendCouponTimes": {
"type": "integer"
}
}
}
}
}
进行分词的命令: get /crm_fans_v1/_analyze { "text": ["Yang小波🔜🐑"], "analyzer": "pinyin_analyzer" }
分词得到的错误数据:
{
"tokens" : [
{
"token" : "Y",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "x",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 0
},
{
"token" : "a",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "b",
"start_offset" : 5,
"end_offset" : 6,
"type" : "word",
"position" : 1
},
{
"token" : "n",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "g",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 3
}
]
}
多试几次就得到正确的分词数据:
{
"tokens" : [
{
"token" : "Y",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "a",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "n",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 2
},
{
"token" : "g",
"start_offset" : 3,
"end_offset" : 4,
"type" : "word",
"position" : 3
},
{
"token" : "x",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 3
},
{
"token" : "b",
"start_offset" : 5,
"end_offset" : 6,
"type" : "word",
"position" : 4
}
]
}
在 master 分支的版本重新试一下看看。