WARNING Warning! Error loading file book.txt. Skipping...
Is your feature request related to a problem? Please describe.
In my log:
12:35:02,358 graphrag.config.read_dotenv INFO Loading pipeline .env file
12:35:02,363 graphrag.index.cli INFO using default configuration: {
"llm": {
"api_key": "REDACTED, length 6",
"type": "openai_chat",
"model": "gemma",
"max_tokens": 4000,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"request_timeout": 180.0,
"api_base": "http://127.0.0.1:11434/v1",
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"root_dir": "./ragtest",
"reporting": {
"type": "file",
"base_dir": "output/${timestamp}/reports",
"storage_account_blob_url": null
},
"storage": {
"type": "file",
"base_dir": "output/${timestamp}/artifacts",
"storage_account_blob_url": null
},
"cache": {
"type": "file",
"base_dir": "cache",
"storage_account_blob_url": null
},
"input": {
"type": "file",
"file_type": "text",
"base_dir": "input",
"storage_account_blob_url": null,
"encoding": "utf-8",
"file_pattern": ".*\\.txt$",
"file_filter": null,
"source_column": null,
"timestamp_column": null,
"timestamp_format": null,
"text_column": "text",
"title_column": null,
"document_attribute_columns": []
},
"embed_graph": {
"enabled": false,
"num_walks": 10,
"walk_length": 40,
"window_size": 2,
"iterations": 3,
"random_seed": 597832,
"strategy": null
},
"embeddings": {
"llm": {
"api_key": "REDACTED, length 9",
"type": "openai_embedding",
"model": "/nomic-ai/nomic-embed-text-v1.5-GGUF/nomic-embed-text-v1.5.Q4_K_M.gguf",
"max_tokens": 4000,
"temperature": 0,
"top_p": 1,
"n": 1,
"request_timeout": 180.0,
"api_base": "http://localhost:1234/v1",
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": null,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"batch_size": 16,
"batch_max_tokens": 8191,
"target": "required",
"skip": [],
"vector_store": null,
"strategy": null
},
"chunks": {
"size": 1200,
"overlap": 100,
"group_by_columns": [
"id"
],
"strategy": null
},
"snapshots": {
"graphml": false,
"raw_entities": false,
"top_level_nodes": false
},
"entity_extraction": {
"llm": {
"api_key": "REDACTED, length 6",
"type": "openai_chat",
"model": "gemma",
"max_tokens": 4000,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"request_timeout": 180.0,
"api_base": "http://127.0.0.1:11434/v1",
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"prompt": "prompts/entity_extraction.txt",
"entity_types": [
"organization",
"person",
"geo",
"event"
],
"max_gleanings": 1,
"strategy": null
},
"summarize_descriptions": {
"llm": {
"api_key": "REDACTED, length 6",
"type": "openai_chat",
"model": "gemma",
"max_tokens": 4000,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"request_timeout": 180.0,
"api_base": "http://127.0.0.1:11434/v1",
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"prompt": "prompts/summarize_descriptions.txt",
"max_length": 500,
"strategy": null
},
"community_reports": {
"llm": {
"api_key": "REDACTED, length 6",
"type": "openai_chat",
"model": "gemma",
"max_tokens": 4000,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"request_timeout": 180.0,
"api_base": "http://127.0.0.1:11434/v1",
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"prompt": "prompts/community_report.txt",
"max_length": 2000,
"max_input_length": 8000,
"strategy": null
},
"claim_extraction": {
"llm": {
"api_key": "REDACTED, length 6",
"type": "openai_chat",
"model": "gemma",
"max_tokens": 4000,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"request_timeout": 180.0,
"api_base": "http://127.0.0.1:11434/v1",
"api_version": null,
"proxy": null,
"cognitive_services_endpoint": null,
"deployment_name": null,
"model_supports_json": true,
"tokens_per_minute": 0,
"requests_per_minute": 0,
"max_retries": 10,
"max_retry_wait": 10.0,
"sleep_on_rate_limit_recommendation": true,
"concurrent_requests": 25
},
"parallelization": {
"stagger": 0.3,
"num_threads": 50
},
"async_mode": "threaded",
"enabled": false,
"prompt": "prompts/claim_extraction.txt",
"description": "Any claims or facts that could be relevant to information discovery.",
"max_gleanings": 1,
"strategy": null
},
"cluster_graph": {
"max_cluster_size": 10,
"strategy": null
},
"umap": {
"enabled": false
},
"local_search": {
"text_unit_prop": 0.5,
"community_prop": 0.1,
"conversation_history_max_turns": 5,
"top_k_entities": 10,
"top_k_relationships": 10,
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"max_tokens": 12000,
"llm_max_tokens": 2000
},
"global_search": {
"temperature": 0.0,
"top_p": 1.0,
"n": 1,
"max_tokens": 12000,
"data_max_tokens": 12000,
"map_max_tokens": 1000,
"reduce_max_tokens": 2000,
"concurrency": 32
},
"encoding_model": "cl100k_base",
"skip_workflows": []
}
12:35:02,363 graphrag.index.create_pipeline_config INFO skipping workflows
12:35:02,373 graphrag.index.run INFO Running pipeline
12:35:02,373 graphrag.index.storage.file_pipeline_storage INFO Creating file storage at ragtest\output\20240722-123502\artifacts
12:35:02,373 graphrag.index.input.load_input INFO loading input from root_dir=input
12:35:02,373 graphrag.index.input.load_input INFO using file storage for input
12:35:02,373 graphrag.index.storage.file_pipeline_storage INFO search ragtest\input for files matching .*\.txt$
12:35:02,373 graphrag.index.input.text INFO found text files from input, found [('book.txt', {})]
12:35:02,373 graphrag.index.input.text WARNING Warning! Error loading file book.txt. Skipping...
12:35:02,373 graphrag.index.input.text INFO Found 1 files, loading 0
12:35:02,388 graphrag.index.workflows.load INFO Workflow Run Order: ['create_base_text_units', 'create_base_extracted_entities', 'create_summarized_entities', 'create_base_entity_graph', 'create_final_entities', 'create_final_nodes', 'create_final_communities', 'join_text_units_to_entity_ids', 'create_final_relationships', 'join_text_units_to_relationship_ids', 'create_final_community_reports', 'create_final_text_units', 'create_base_documents', 'create_final_documents']
12:35:02,388 graphrag.index.run INFO Final # of rows loaded: 0
12:35:02,574 graphrag.index.run INFO Running workflow: create_base_text_units...
12:35:02,574 graphrag.index.run INFO dependencies for create_base_text_units: []
12:35:02,584 datashaper.workflow.workflow INFO executing verb orderby
12:35:02,584 datashaper.workflow.workflow ERROR Error executing verb "orderby" in create_base_text_units: 'id'
Traceback (most recent call last):
File "C:\ProgramData\anaconda3\Lib\site-packages\datashaper\workflow\workflow.py", line 410, in _execute_verb
result = node.verb.func(**verb_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\ProgramData\anaconda3\Lib\site-packages\datashaper\engine\verbs\orderby.py", line 32, in orderby
output = input_table.sort_values(by=columns, ascending=ascending)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py", line 7189, in sort_values
k = self._get_label_or_level_values(by[0], axis=axis)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py", line 1911, in _get_label_or_level_values
raise KeyError(key)
KeyError: 'id'
12:35:02,594 graphrag.index.reporting.file_workflow_callbacks INFO Error executing verb "orderby" in create_base_text_units: 'id' details=None
12:35:02,594 graphrag.index.run ERROR error running workflow create_base_text_units
Traceback (most recent call last):
File "C:\Users\DELL\Desktop\graphrag-main\graphrag\index\run.py", line 323, in run_pipeline
result = await workflow.run(context, callbacks)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\ProgramData\anaconda3\Lib\site-packages\datashaper\workflow\workflow.py", line 369, in run
timing = await self._execute_verb(node, context, callbacks)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\ProgramData\anaconda3\Lib\site-packages\datashaper\workflow\workflow.py", line 410, in _execute_verb
result = node.verb.func(**verb_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\ProgramData\anaconda3\Lib\site-packages\datashaper\engine\verbs\orderby.py", line 32, in orderby
output = input_table.sort_values(by=columns, ascending=ascending)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\frame.py", line 7189, in sort_values
k = self._get_label_or_level_values(by[0], axis=axis)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\generic.py", line 1911, in _get_label_or_level_values
raise KeyError(key)
KeyError: 'id'
12:35:02,599 graphrag.index.reporting.file_workflow_callbacks INFO Error running pipeline! details=None
There is a warning:
WARNING Warning! Error loading file book.txt. Skipping...
I think this is the reason why (see the minimal reproduction after the console output below):
PS C:\Users\DELL\Desktop\graphrag-main> python -m graphrag.index --root ./ragtest
🚀 Reading settings from ragtest\settings.yaml
❌ create_base_text_units
None
⠸ GraphRAG Indexer
├── Loading Input (InputFileType.text) - 1 files loaded (0 filtered) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 0:00:00
└── create_base_text_units
❌ Errors occurred during the pipeline run, see logs for more details.
PS C:\Users\DELL\Desktop\graphrag-main>
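A minimal reproduction of my guess, assuming the pipeline builds its text-units table with pandas (the traceback suggests it does): because book.txt was skipped, zero rows were loaded, so the resulting frame has no `id` column and `sort_values` fails with the same `KeyError: 'id'` as in the log.

```python
import pandas as pd

# With zero input rows there is no 'id' column at all,
# so the 'orderby' verb's sort_values call fails immediately.
empty = pd.DataFrame()
empty.sort_values(by=["id"])  # raises KeyError: 'id'
```

So the `KeyError` looks like a downstream symptom; the real problem is the skipped file.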
Describe the solution you'd like
How do I resolve this warning?
WARNING Warning! Error loading file book.txt. Skipping...
Additional context
No response
Is there any solution for this one? I'm having the same issue.
It might be because of the encoding of book.txt. In that case, changing the file's encoding to UTF-8 should resolve the issue.
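For example, a one-off re-encode in Python (a sketch, assuming the file was saved as cp1252, which is common on Windows, and using the input path from the logs; substitute the encoding your file actually uses):

```python
from pathlib import Path

src = Path("ragtest/input/book.txt")
# Assumption: the file is cp1252-encoded; change this if yours differs.
text = src.read_bytes().decode("cp1252")
src.write_text(text, encoding="utf-8")
```

Alternatively, open the file in Notepad or VS Code and use "Save As" / "Save with Encoding" to pick UTF-8.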
PR https://github.com/microsoft/graphrag/pull/639 has been merged and included in GraphRAG 0.2.0, which enforces UTF-8 encoding. Please let us know if upgrading resolves your issue.
This issue has been marked stale due to inactivity after repo maintainer or community member responses that request more information or suggest a solution. It will be closed after five additional days.
Please, is there a solution to this problem?
We have resolved several issues related to text encoding and JSON parsing that are rolled up into version 0.2.2. Please try again with that version and re-open if this is still an issue.
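If the warning persists after upgrading, a quick way to confirm whether the file itself is the problem is to try decoding it as strict UTF-8 (the path shown is an assumption; adjust it to your --root):

```python
# Adjust the path to match your project layout.
try:
    with open("ragtest/input/book.txt", encoding="utf-8") as f:
        f.read()
    print("book.txt decodes cleanly as UTF-8")
except UnicodeDecodeError as exc:
    print(f"book.txt is not valid UTF-8: {exc}")
```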
I'm using version 0.2.2 of GraphRAG and I still get the same error:
🚀 Reading settings from ragtest\settings.yaml
❌ create_base_text_units
None
⠹ GraphRAG Indexer
├── Loading Input (InputFileType.text) - 1 files loaded (0 filtered) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:00 0:00:00
└── create_base_text_units
❌ Errors occurred during the pipeline run, see logs for more details.
14:50:16,78 graphrag.index.input.text INFO found text files from input, found [('book.txt', {})]
14:50:16,81 graphrag.index.input.text WARNING Warning! Error loading file book.txt. Skipping...
> It might be because of the encoding of book.txt. In that case, changing the file's encoding to UTF-8 should resolve the issue.
Thanks a lot! This was exactly my problem.