Langchain-Chatchat
Langchain-Chatchat copied to clipboard
针对编码问题(例如 'gbk' codec can't encode character '\xab' in position 14: illegal multibyte sequence)的一个粗浅解决方法
功能描述 / Feature Description:首先,文档必须是 docx 格式;然后可以使用以下代码对 docx 进行规范化处理,使其不再触发这个错误。pdf、txt 等格式也可以仿照这个程序处理,思路应该大同小异。注意:该脚本会把结果直接保存回原文件,运行前请先备份。
# -*- coding: utf-8 -*-
from docx import Document
import re
# Characters that are always stripped from paragraph text, independent of the
# whitelist defined below.
symbols = [
    '\u301c',  # WAVE DASH
    '\xbb',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    '\xa3',    # POUND SIGN
    '\xab',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    '\u2022',  # BULLET
    '\xae',    # REGISTERED SIGN
    '\xa5',    # YEN SIGN
    '\xa9',    # COPYRIGHT SIGN
]

# Whitelist of characters that may remain in paragraph text, flattened into a
# single string. The order and the duplicated entries are kept exactly as they
# were originally listed.
# NOTE: the string contains regex metacharacters (a backslash and '-'); it must
# be escaped before being placed inside a regex character class.
allowed_chars = ''.join((
    # Quotation marks and punctuation.
    '“”‘’""' "''" ':;:;《》,.?/?,。、\\+=-——…&*()()%$#@!',
    # ASCII lowercase letters, uppercase letters and digits.
    ''.join(map(chr, range(ord('a'), ord('z') + 1))),
    ''.join(map(chr, range(ord('A'), ord('Z') + 1))),
    ''.join(map(chr, range(ord('0'), ord('9') + 1))),
    # U+4E00..U+9FFF inclusive (CJK Unified Ideographs).
    ''.join(map(chr, range(0x4e00, 0x9fff + 1))),
    # Space, newline and form feed.
    ' \n\f',
))
def remove_header_footer_bullet_numbering(docx_file):
    """Sanitize a .docx file and save the result back over the same file.

    Steps, in order:

    1. Unlink every section's header and footer from the previous section and
       blank their paragraphs.
    2. Reset body paragraphs whose style name starts with ``List`` to the
       ``Normal`` style (drops bullets / numbering).
    3. Remove every entry of ``symbols`` from the paragraph text.
    4. Remove every character that is not in ``allowed_chars``.
    5. Remove single spaces that sit directly between two characters in the
       range U+4E00..U+9FFF.
    6. Collapse each run of consecutive blank paragraphs down to its first one.

    Args:
        docx_file: Path of the .docx file. It is passed to ``Document()`` for
            loading and to ``doc.save()`` for writing, so the original file is
            overwritten.
    """
    doc = Document(docx_file)

    # Headers and footers: unlink from the previous section, then blank them.
    for section in doc.sections:
        section.header.is_linked_to_previous = False
        section.footer.is_linked_to_previous = False
        for paragraph in section.header.paragraphs:
            paragraph.text = ""
        for paragraph in section.footer.paragraphs:
            paragraph.text = ""

    # Built once; used for O(1) membership tests on every character.
    whitelist = frozenset(allowed_chars)

    for paragraph in doc.paragraphs:
        # Bullets and numbering come from the List* styles.
        if paragraph.style.name.startswith('List'):
            paragraph.style = doc.styles['Normal']

        current = paragraph.text
        cleaned = _sanitize_text(current, whitelist)
        # Assigning paragraph.text replaces the paragraph content with a
        # single run, so write back only when the text really changed; this
        # leaves the runs of already-clean paragraphs untouched.
        if cleaned != current:
            paragraph.text = cleaned

    # Collapse consecutive blank paragraphs, keeping the first of each run.
    # NOTE(review): "blank" is judged on text only, so a paragraph holding
    # just non-text content (e.g. a picture) counts as blank - confirm this
    # is acceptable for the documents being processed.
    previous_blank = False
    for paragraph in list(doc.paragraphs):
        blank = not paragraph.text.strip()
        if blank and previous_blank:
            element = paragraph._element
            element.getparent().remove(element)
        previous_blank = blank

    doc.save(docx_file)


# A space whose immediate neighbours on both sides are in U+4E00..U+9FFF.
_CJK_INNER_SPACE = re.compile(r'(?<=[\u4e00-\u9fff]) (?=[\u4e00-\u9fff])')


def _sanitize_text(text, whitelist):
    """Return ``text`` with unwanted characters and CJK-internal spaces removed.

    Args:
        text: The paragraph text to clean.
        whitelist: Set of single characters that are allowed to remain.

    Returns:
        The cleaned string.
    """
    for symbol in symbols:
        text = text.replace(symbol, '')
    # Use a membership test rather than interpolating the whitelist into a
    # regex character class: the whitelist contains a backslash and '-',
    # which act as an escape and a range operator inside [...] and would
    # silently widen the set of accepted characters.
    text = ''.join(ch for ch in text if ch in whitelist)
    return _CJK_INNER_SPACE.sub('', text)
# Example invocation: cleans 'try.docx' and saves the result back to that file.
remove_header_footer_bullet_numbering('try.docx')
遇到了同样问题
我在向知识库加载pdf的时候出现了同样的问题,请问我应该如何解决呢?
很奇怪,我每次遇到这个报错后,只要重新部署一下,就又能正常上传了