graphrag
[Bug]: Filename gets replaced with empty string when loading input
Do you need to file an issue?
- [X] I have searched the existing issues and this bug is not already filed.
- [X] My model is hosted on OpenAI or Azure. If not, please look at the "model providers" issue and don't file a new one here.
- [X] I believe this is a legitimate bug, not just a question. If this is a question, please use the Discussions area.
Describe the bug
Using replace on the full input file path causes naming issues during indexing whenever a file name contains a substring that matches the input root directory, which in turn breaks reading the file. For example, with a root directory named "input" (sketched below):
- input/input.txt -> .txt
- input/sample-input.txt -> sample-.txt
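A minimal sketch of the suspected pattern, assuming the loader strips the root directory with a plain str.replace (the function name is illustrative, not graphrag's actual code):

```python
ROOT = "input"

def strip_root_buggy(path: str) -> str:
    # BUG: str.replace removes EVERY occurrence of the root string,
    # including substrings inside the file name itself.
    return path.replace(ROOT, "").lstrip("/")

print(strip_root_buggy("input/input.txt"))         # ".txt"
print(strip_root_buggy("input/sample-input.txt"))  # "sample-.txt"
print(strip_root_buggy("input/sample.txt"))        # "sample.txt" (unaffected)
```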
Steps to reproduce
- Run the create_base_text_units workflow.
- Run only the orderby workflow step.
- Notice that the parquet output lists the titles incorrectly whenever a file name contains a substring of the input root directory (a setup sketch follows this list).
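To set up a reproduction, something like the following creates input files whose names embed the root directory name (the file contents are arbitrary, and the indexer invocation in the comment follows the 0.3.x docs and may differ in your setup):

```python
from pathlib import Path

# Create an input root containing file names that embed the
# directory name "input" as a substring.
root = Path("input")
root.mkdir(exist_ok=True)
for name in ("xyz.txt", "sample-input.txt", "sample.txt"):
    (root / name).write_text("Some sample text for indexing.", encoding="utf-8")

# Then run the indexer, e.g.: python -m graphrag.index --root .
```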
Expected Behavior
The filename should be preserved so that the file can still be read.
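A possible fix is to strip the root only when it is a true path prefix, rather than replacing every occurrence of it; a minimal sketch (again illustrative, not the project's actual code):

```python
from pathlib import Path

ROOT = "input"

def strip_root_fixed(path: str) -> str:
    # Treat the root as a path prefix, leaving the file name itself
    # untouched; raises ValueError if path is not under ROOT.
    return str(Path(path).relative_to(ROOT))

print(strip_root_fixed("input/input.txt"))         # "input.txt"
print(strip_root_fixed("input/sample-input.txt"))  # "sample-input.txt"
```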
GraphRAG Config Used
"llm": {
"api_key": "==== REDACTED ====",
"type": "openai_chat",
"model": "gpt-4-turbo-preview",
"max_tokens": 4000,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"request_timeout": 180.0,
"api_base": null,
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": null,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"root_dir": ".",
"reporting": {
"type": "file",
"base_dir": "output/${timestamp}/reports",
"storage_account_blob_url": null
},
"storage": {
"type": "file",
"base_dir": "output/${timestamp}/artifacts",
"storage_account_blob_url": null
},
"cache": {
"type": "file",
"base_dir": "cache",
"storage_account_blob_url": null
},
"input": {
"type": "file",
"file_type": "text",
"base_dir": "input",
"storage_account_blob_url": null,
"encoding": "utf-8",
"file_pattern": ".*\\.txt$",
"file_filter": null,
"source_column": null,
"timestamp_column": null,
"timestamp_format": null,
"text_column": "text",
"title_column": null,
"document_attribute_columns": []
},
"embed_graph": {
"enabled": false,
"num_walks": 10,
"walk_length": 40,
"window_size": 2,
"iterations": 3,
"random_seed": 597832,
"strategy": null
},
"embeddings": {
"llm": {
"api_key": "==== REDACTED ====",
"type": "openai_embedding",
"model": "text-embedding-3-small",
"max_tokens": 4000,
"temperature": 0,
"top_p": 1,
"n": 1,
"request_timeout": 180.0,
"api_base": null,
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": null,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"batch_size": 16,
"batch_max_tokens": 8191,
"target": "required",
"skip": [],
"vector_store": null,
"strategy": null
},
"chunks": {
"size": 1200,
"overlap": 100,
"group_by_columns": [
"id"
],
"strategy": null,
"encoding_model": null
},
"snapshots": {
"graphml": false,
"raw_entities": false,
"top_level_nodes": false
},
"entity_extraction": {
"llm": {
"api_key": "==== REDACTED ====",
"type": "openai_chat",
"model": "gpt-4-turbo-preview",
"max_tokens": 4000,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"request_timeout": 180.0,
"api_base": null,
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": null,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"prompt": null,
"entity_types": [
"organization",
"person",
"geo",
"event"
],
"max_gleanings": 1,
"strategy": null,
"encoding_model": null
},
"summarize_descriptions": {
"llm": {
"api_key": "==== REDACTED ====",
"type": "openai_chat",
"model": "gpt-4-turbo-preview",
"max_tokens": 4000,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"request_timeout": 180.0,
"api_base": null,
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": null,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"prompt": null,
"max_length": 500,
"strategy": null
},
"community_reports": {
"llm": {
"api_key": "==== REDACTED ====",
"type": "openai_chat",
"model": "gpt-4-turbo-preview",
"max_tokens": 4000,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"request_timeout": 180.0,
"api_base": null,
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": null,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"prompt": null,
"max_length": 2000,
"max_input_length": 8000,
"strategy": null
},
"claim_extraction": {
"llm": {
"api_key": "==== REDACTED ====",
"type": "openai_chat",
"model": "gpt-4-turbo-preview",
"max_tokens": 4000,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"request_timeout": 180.0,
"api_base": null,
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": null,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"enabled": false,
"prompt": null,
"description": "Any claims or facts that could be relevant to information discovery.",
"max_gleanings": 1,
"strategy": null,
"encoding_model": null
},
"cluster_graph": {
"max_cluster_size": 10,
"strategy": null
},
"umap": {
"enabled": false
},
"local_search": {
"text_unit_prop": 0.5,
"community_prop": 0.1,
"conversation_history_max_turns": 5,
"top_k_entities": 10,
"top_k_relationships": 10,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"max_tokens": 12000,
"llm_max_tokens": 2000
},
"global_search": {
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"max_tokens": 12000,
"data_max_tokens": 12000,
"map_max_tokens": 1000,
"reduce_max_tokens": 2000,
"concurrency": 32
},
"encoding_model": "cl100k_base",
"skip_workflows": []
}
Logs and screenshots
Log output
03:49:54,278 graphrag.index.input.text INFO found text files from input, found [('xyz.txt', {}), ('sample-.txt', {}), ('sample.txt', {})]
The files actually present in my input directory are xyz.txt, sample-input.txt, and sample.txt, yet sample-input.txt is listed as sample-.txt.
Additional Information
- GraphRAG Version: 0.3.1
- Python Version: 3.12

It looks like a quick fix so far - I can work on it :)

Tested this today and it does not appear to be a problem in newer versions.