tf-idf-keyword
tf-idf-keyword copied to clipboard
分词过程应该可以优化
def segment(sentence, cut_all=False):
    """Split *sentence* into a list of word tokens using jieba.

    Newlines, ideographic spaces (U+3000) and non-breaking spaces (U+00A0)
    are deleted before segmentation.  After segmentation, ASCII letters,
    digits and the listed punctuation marks are stripped from the
    space-joined token string, which is then split on whitespace; tokens
    that consisted only of stripped characters therefore disappear.

    Args:
        sentence: The text to segment.
        cut_all: Forwarded to ``jieba.cut`` as its ``cut_all`` argument.

    Returns:
        A list of the remaining non-empty token strings.
    """
    # Delete line breaks and the two special space characters up front.
    for unwanted in ('\n', '\u3000', '\u00A0'):
        sentence = sentence.replace(unwanted, '')

    tokens = jieba.cut(sentence, cut_all=cut_all)
    spaced = ' '.join(tokens)

    # NOTE (original author): the character stripping could be done first,
    # followed by segmentation.
    cleaned = re.sub('[a-zA-Z0-9.。::,,))((!!??”“\"]', '', spaced)
    return cleaned.split()