scws
scws copied to clipboard
根本分不了词啊,哪里错了?
root@wcjs-test:/usr/local/scws/bin# cat a.txt
奔驰 12.0 2.2 n
蓝天 11.2 2.2 n
每日一问 30.1 5.0 nz
root@wcjs-test:/usr/local/scws/bin# ./scws-gen-dict -c utf8 -i a.txt
Output file exists: Success
root@wcjs-test:/usr/local/scws/bin# ./scws -i '奔驰在每日一问里面好像有点厉害了' -c utf8 -d dict.xdb -A -U
奔驰/n 在/un 每日一问/n 里/un 面/un 好/un 像/un 有/un 点/un 厉/un 害/un 了/un
+--[scws(scws-cli/1.2.3)]----------+
| TextLen: 48 |
| Prepare: 0.0002 (sec) |
| Segment: 0.0003 (sec) |
+--------------------------------+
<?php
/**
 * Segment a fixed UTF-8 sentence with SCWS using a custom XDB dictionary and
 * print every token (word, byte offset, byte length, idf, attr).
 *
 * Return values of the setup calls are checked so that a dictionary which
 * fails to load is reported instead of being silently ignored.
 * NOTE(review): output consisting solely of single characters tagged "un"
 * looks like what SCWS produces when no dictionary is loaded -- confirm
 * that the PHP process can read the .xdb file (permissions, open_basedir).
 */
$dictPath = '/usr/local/scws/bin/dict.xdb';

$sh = scws_open();
if ($sh === false) {
    throw new RuntimeException('scws_open() failed');
}

scws_set_charset($sh, 'utf8');

// scws_set_dict() reports failure via its boolean return value; without this
// check segmentation would continue with no dictionary at all.
if (!scws_set_dict($sh, $dictPath)) {
    scws_close($sh);
    throw new RuntimeException("scws_set_dict() could not load dictionary: $dictPath");
}
//scws_set_rule($sh, '/path/to/rules.ini');

$text = "奔驰在每日一问里面好像有点厉害了";
scws_send_text($sh, $text);

// scws_get_result() returns the tokens in batches and yields false once the
// text is exhausted, so keep calling it until then to collect every token.
$top = array();
while ($batch = scws_get_result($sh)) {
    $top = array_merge($top, $batch);
}

scws_close($sh);
print_r($top);
?>
Array
(
    [0] => Array
        (
            [word] => 奔
            [off] => 0
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [1] => Array
        (
            [word] => 驰
            [off] => 3
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [2] => Array
        (
            [word] => 在
            [off] => 6
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [3] => Array
        (
            [word] => 每
            [off] => 9
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [4] => Array
        (
            [word] => 日
            [off] => 12
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [5] => Array
        (
            [word] => 一
            [off] => 15
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [6] => Array
        (
            [word] => 问
            [off] => 18
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [7] => Array
        (
            [word] => 里
            [off] => 21
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [8] => Array
        (
            [word] => 面
            [off] => 24
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [9] => Array
        (
            [word] => 好
            [off] => 27
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [10] => Array
        (
            [word] => 像
            [off] => 30
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [11] => Array
        (
            [word] => 有
            [off] => 33
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [12] => Array
        (
            [word] => 点
            [off] => 36
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [13] => Array
        (
            [word] => 厉
            [off] => 39
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [14] => Array
        (
            [word] => 害
            [off] => 42
            [len] => 3
            [idf] => 0
            [attr] => un
        )

    [15] => Array
        (
            [word] => 了
            [off] => 45
            [len] => 3
            [idf] => 0
            [attr] => un
        )

)

$ cat dict_aa.txt
奔驰 12.0 2.2 n
蓝天 11.2 2.2 n
每日一问 30.1 5.0 nz
$ scws-gen-dict -i dict_aa.txt -o dict_aa.xdb
Reading the input file: dict_aa.txt ...OK, total nodes=10
Optimizing... OK
Dump the tree data to: dict_aa.xdb ... OK, all been done!
$ scws -c utf8 -d dict_jieba1.xdb:dict_aa.xdb -N -i "奔驰在每日一问里面好像有点厉害了"
奔驰 在 每日一问 里面 好像 有点 厉害 了
dict_jieba1.xdb 是我用 https://github.com/fxsjy/jieba/tree/master/extra_dict (dict.txt.big和dict.txt.small合併) 編譯的。
这不是有分词吗?

在 2023年6月8日,00:04,Fung Cheok Yin @.***> 写道:

$ cat dict_aa.txt
奔驰 12.0 2.2 n
蓝天 11.2 2.2 n
每日一问 30.1 5.0 nz
$ scws-gen-dict -i dict_aa.txt -o dict_aa.xdb
Reading the input file: dict_aa.txt ...OK, total nodes=10
Optimizing... OK
Dump the tree data to: dict_aa.xdb ... OK, all been done!
$ scws -c utf8 -d dict_jieba1.xdb:dict_aa.xdb -N -i "奔驰在每日一问里面好像有点厉害了"
奔驰 在 每日一问 里面 好像 有点 厉害 了

dict_jieba1.xdb 是我用 https://github.com/fxsjy/jieba/tree/master/extra_dict (dict.txt.big和dict.txt.small合併) 編譯的。
—Reply to this email directly, view it on GitHub, or unsubscribe.You are receiving this because you are subscribed to this thread.Message ID: @.***>