KAG
KAG copied to clipboard
Bugs in mix_reader for determining file paths
Search before asking
- [X] I had searched in the issues and found no similar issues.
Operating system information
Linux
What happened
if os.path.exists(input):
raise FileNotFoundError(f"File {input} not found.")
This is the code in mix_reader. Is a `not` missing from the condition? As written, the error is raised when the file DOES exist. It should probably be:
if not os.path.exists(input):
return self.parse_map[file_suffix]._invoke(input)
Also, this line indexes parse_map with file_suffix. Should it use reader_type instead, i.e. should it be:
return self.parse_map[reader_type]._invoke(input)
How to reproduce
def build(self, **kwargs):
    """Assemble the reader >> splitter >> extractor >> vectorizer >> writer chain.

    Every stage is also stored on ``self`` (``reader``, ``splitter``,
    ``vectorizer``, ``extractor``, ``writer``).

    Args:
        **kwargs: Accepted but not read by this method.

    Returns:
        The object produced by composing the five stages with ``>>``.
    """
    # NOTE(review): api_key is empty here -- presumably redacted for the
    # report; a real key is needed to actually run the chain.
    llm = OpenAIClient(api_key='',
                       base_url='https://api.deepseek.com',
                       model='deepseek-chat',
                       )
    txt_reader = TXTReader()
    pdf_reader = PDFReader(llm=llm)
    dict_reader = DictReader()
    md_reader = MarkDownReader(llm=llm)
    docx_reader = DocxReader(llm=llm)
    # MixReader receives one reader per supported input type.
    self.reader = MixReader(txt_reader=txt_reader,
                            pdf_reader=pdf_reader,
                            dict_reader=dict_reader,
                            md_reader=md_reader,
                            docx_reader=docx_reader)
    self.splitter = LengthSplitter(split_length=1000, window_length=200)
    self.vectorizer = BatchVectorizer.from_config(
        KAG_CONFIG.all_config["chain_vectorizer"]
    )
    self.extractor = SchemaFreeExtractor(llm=llm, ner_prompt=OpenIENERPrompt(),
                                         std_prompt=OpenIEEntitystandardizationdPrompt(),
                                         triple_prompt=OpenIETriplePrompt())
    self.writer = KGWriter()
    chain = self.reader >> self.splitter >> self.extractor >> self.vectorizer >> self.writer
    return chain
def get_component_with_ckpts(self):
    """Return a one-element list holding ``self.vectorizer``.

    Returns:
        list: ``[self.vectorizer]``.
    """
    return [
        self.vectorizer,
    ]
def import_data():
    """Run a build chain on every supported document under ``data/``.

    Walks the ``data`` folder located next to this script. For each file
    whose name ends in ``.txt``, ``.pdf``, ``.docx`` or ``.md``, prints the
    file name, creates a fresh ``KagDemoBuildChain`` and invokes it with
    the file's full path.
    """
    # ``__file__`` rather than ``file``: the double underscores were lost
    # when the snippet was rendered as markdown.
    file_path = os.path.dirname(__file__)
    data_folder_path = os.path.join(file_path, 'data')
    supported_suffixes = ('.txt', '.pdf', '.docx', '.md')
    for root, _dirs, files in os.walk(data_folder_path):
        for file_name in files:
            # str.endswith accepts a tuple, so one call covers all suffixes.
            if file_name.endswith(supported_suffixes):
                file_fill_path = os.path.join(root, file_name)
                print(f'{file_name} exits')
                # A new chain is built per file, as in the original snippet.
                chain = KagDemoBuildChain()
                chain.invoke(file_path=file_fill_path)
# Script entry point: only import the data when run directly, not on import.
if __name__ == '__main__':
    import_data()
Are you willing to submit PR?
- [X] Yes I am willing to submit a PR!