analysis-pinyin
analysis-pinyin copied to clipboard
为什么 keep_first_letter 和 keep_joined_full_pinyin 这两个参数对有些文本生效，对有些文本却不生效？
例如分析“王楷”时，结果中缺少 “wangkai” 和 “wk”：
{
"tokens": [
{
"token": "w",
"start_offset": 0,
"end_offset": 1,
"type": "CN_CHAR",
"position": 0
},
{
"token": "wang",
"start_offset": 0,
"end_offset": 1,
"type": "CN_CHAR",
"position": 0
},
{
"token": "王",
"start_offset": 0,
"end_offset": 1,
"type": "CN_CHAR",
"position": 0
},
{
"token": "k",
"start_offset": 1,
"end_offset": 2,
"type": "CN_CHAR",
"position": 1
},
{
"token": "kai",
"start_offset": 1,
"end_offset": 2,
"type": "CN_CHAR",
"position": 1
},
{
"token": "楷",
"start_offset": 1,
"end_offset": 2,
"type": "CN_CHAR",
"position": 1
}
]
}
而“刘德华”的分析结果却是正常的：
{
"tokens": [
{
"token": "l",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 0
},
{
"token": "liu",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 0
},
{
"token": "d",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 1
},
{
"token": "de",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 1
},
{
"token": "h",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 2
},
{
"token": "hua",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 2
},
{
"token": "刘德华",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 2
},
{
"token": "liudehua",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 2
},
{
"token": "ldh",
"start_offset": 0,
"end_offset": 3,
"type": "CN_WORD",
"position": 2
}
]
}
你是用什么版本的 ES 测的？我这里用 6.0 的版本测了一下，配置和结果如下：
PUT pboos-map-adress-1
{
"settings": {
"number_of_shards": 6,
"index.refresh_interval": "5s",
"analysis": {
"analyzer": {
"pinyin_analyzer": {
"tokenizer": "my_pinyin"
}
},
"tokenizer": {
"my_pinyin": {
"type": "pinyin",
"keep_separate_first_letter": true,
"keep_full_pinyin": true,
"keep_original": true,
"limit_first_letter_length": 16,
"lowercase": true,
"remove_duplicated_term": true,
"ignore_pinyin_offset":false,
"keep_joined_full_pinyin":true
}
}
}
}
}
GET pboos-map-adress-1/_analyze
{
"text": ["王楷"],
"analyzer": "pinyin_analyzer"
}
{
"tokens": [
{
"token": "w",
"start_offset": 0,
"end_offset": 1,
"type": "word",
"position": 0
},
{
"token": "wang",
"start_offset": 0,
"end_offset": 1,
"type": "word",
"position": 0
},
{
"token": "王楷",
"start_offset": 0,
"end_offset": 2,
"type": "word",
"position": 0
},
{
"token": "wangkai",
"start_offset": 0,
"end_offset": 7,
"type": "word",
"position": 0
},
{
"token": "wk",
"start_offset": 0,
"end_offset": 2,
"type": "word",
"position": 0
},
{
"token": "k",
"start_offset": 1,
"end_offset": 2,
"type": "word",
"position": 1
},
{
"token": "kai",
"start_offset": 1,
"end_offset": 2,
"type": "word",
"position": 1
}
]
}