AssertionError while running NER
Hello, following the example on the official site I wrote a `get_nlp_result` function that iterates over the rows of `data_df` and feeds the text in `row[text_col]` through the `ws`, `pos`, and `ner` functions in turn. When running `ner` I sometimes hit an AssertionError. What causes this?
```python
import time

import pandas as pd

# ws, pos, ner are CkipTagger WS/POS/NER instances;
# Sentence_Segmentation, get_pos, get_ner, and dictionary2 are defined elsewhere.
def get_nlp_result(data_df, id_col, text_col):
    start = time.time()
    pos_list = []
    entity_list = []
    sentence_list = []
    for index, row in data_df.iterrows():  # document level
        # print(f"\ndocument {index}")
        # clean data
        result = []
        tmp = Sentence_Segmentation(row[text_col])
        flat_list = [item for sublist in tmp for item in sublist]
        # ckip
        w_sentence_list = ws(flat_list, coerce_dictionary=dictionary2)  # set dictionary
        pos_sentence_list = pos(w_sentence_list)
        entity_sentence_list = ner(w_sentence_list, pos_sentence_list)
        for i, sentence in enumerate(flat_list):  # sentence level
            # print(f"sentence {i}: {sentence}")
            sentence_list.append([row[id_col], sentence])
            temp_tokens = get_pos(row[id_col], w_sentence_list[i], pos_sentence_list[i])
            temp_entites = get_ner(row[id_col], entity_sentence_list[i])
            pos_list.append(temp_tokens)
            if len(temp_entites) != 0:
                entity_list.append(temp_entites)
    pos_flat = [item for sublist in pos_list for item in sublist]
    entity_flat = [item for sublist in entity_list for item in sublist]
    pos_table = pd.DataFrame(data=pos_flat, columns=[id_col, 'word', 'pos'])
    entity_table = pd.DataFrame(data=entity_flat, columns=[id_col, 'word', 'ner'])
    sentence_table = pd.DataFrame(data=sentence_list, columns=[id_col, 'sentence'])
    end = time.time()
    print("time costing: {}".format(end - start))
    return pos_table, entity_table, sentence_table
```
- CkipTagger enables character normalization (`character_normalization`) by default to help the model recognize characters that are encoded differently but semantically identical.
- However, a few words containing special symbols may differ from the original word after normalization; CkipTagger treats this as a serious problem and raises an AssertionError.
- You can catch this exception when calling `ner`, but it is recommended to disable character normalization only for the sentences that actually have this problem, e.g.:
```python
try:
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)
except AssertionError:
    entity_sentence_list = []
    for word_sentence, pos_sentence in zip(word_sentence_list, pos_sentence_list):
        try:
            singleton_entity_sentence_list = ner([word_sentence], [pos_sentence])
        except AssertionError:
            singleton_entity_sentence_list = ner([word_sentence], [pos_sentence], character_normalization=False)
        entity_sentence_list.append(singleton_entity_sentence_list[0])
```
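For reference, here is a minimal sketch of how that fallback could be folded into the question's `get_nlp_result`; `run_ner_with_fallback` is a hypothetical helper name introduced here for illustration, and it assumes the same `ner` instance as above.

```python
def run_ner_with_fallback(ner, word_sentence_list, pos_sentence_list):
    """Run NER on a batch; on AssertionError, retry sentence by sentence and
    disable character normalization only for the offending sentences."""
    try:
        return ner(word_sentence_list, pos_sentence_list)
    except AssertionError:
        entity_sentence_list = []
        for word_sentence, pos_sentence in zip(word_sentence_list, pos_sentence_list):
            try:
                singleton = ner([word_sentence], [pos_sentence])
            except AssertionError:
                # Only this sentence is re-run without character normalization.
                singleton = ner([word_sentence], [pos_sentence], character_normalization=False)
            entity_sentence_list.append(singleton[0])
        return entity_sentence_list

# Inside get_nlp_result, the direct call could then become:
# entity_sentence_list = run_ner_with_fallback(ner, w_sentence_list, pos_sentence_list)
```

This keeps character normalization enabled for every sentence except the ones that actually trigger the assertion.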