
[Bug]: Filename gets replaced with empty string when loading input

Open danielolamide opened this issue 1 year ago • 1 comment

Do you need to file an issue?

  • [X] I have searched the existing issues and this bug is not already filed.
  • [X] My model is hosted on OpenAI or Azure. If not, please look at the "model providers" issue and don't file a new one here.
  • [X] I believe this is a legitimate bug, not just a question. If this is a question, please use the Discussions area.

Describe the bug

During indexing, replace is applied to the full input file path to strip the input root directory, so any filename containing a substring that matches the root directory name gets mangled; this also breaks subsequent reading of the file (a sketch of the failure follows these examples). E.g.:

  • input/input.txt -> .txt
  • input/sample-input.txt -> sample-.txt
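
A minimal sketch of the failure mode and one possible fix, assuming the loader strips the root directory with a plain string replace (buggy_title and fixed_title are illustrative helpers, not GraphRAG's actual code):

    from pathlib import PurePosixPath

    root = "input"

    def buggy_title(path: str) -> str:
        # replace() removes every occurrence of the root dir name,
        # not just the leading component: "input/input.txt" -> ".txt"
        return path.replace(root, "").lstrip("/")

    def fixed_title(path: str) -> str:
        # Strip only the leading root directory component.
        return str(PurePosixPath(path).relative_to(root))

    for p in ["input/input.txt", "input/sample-input.txt"]:
        print(buggy_title(p), "->", fixed_title(p))
    # .txt -> input.txt
    # sample-.txt -> sample-input.txt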

Steps to reproduce

  1. Run the create_base_text_units workflow.
  2. Run only the orderby workflow step.
  3. Observe that the parquet output lists the titles wrongly whenever a file name contains a substring of the input root directory (one way to inspect this is sketched below).
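
One way to see the mangled titles is to load the workflow's parquet artifact. A minimal sketch, assuming the artifact is written as create_base_text_units.parquet under the configured storage base_dir and that a title-like column carries the source filenames (the exact path and column names may differ by version):

    import pandas as pd

    # Hypothetical artifact path; substitute the actual run's timestamp.
    df = pd.read_parquet("output/<timestamp>/artifacts/create_base_text_units.parquet")

    # List the columns, then peek at any title-like column.
    print(df.columns.tolist())
    print(df.filter(like="title").head())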

Expected Behavior

The filename should be preserved so that the file can be read.

GraphRAG Config Used

    "llm": {
        "api_key": "==== REDACTED ====",
        "type": "openai_chat",
        "model": "gpt-4-turbo-preview",
        "max_tokens": 4000,
        "temperature": 0.0,
        "top_p": 1.0,
        "n": 1,
        "request_timeout": 180.0,
        "api_base": null,
        "api_version": null,
        "proxy": null,
        "cognitive_services_endpoint": null,
        "deployment_name": null,
        "model_supports_json": null,
        "tokens_per_minute": 0,
        "requests_per_minute": 0,
        "max_retries": 10,
        "max_retry_wait": 10.0,
        "sleep_on_rate_limit_recommendation": true,
        "concurrent_requests": 25
    },
    "parallelization": {
        "stagger": 0.3,
        "num_threads": 50
    },
    "async_mode": "threaded",
    "root_dir": ".",
    "reporting": {
        "type": "file",
        "base_dir": "output/${timestamp}/reports",
        "storage_account_blob_url": null
    },
    "storage": {
        "type": "file",
        "base_dir": "output/${timestamp}/artifacts",
        "storage_account_blob_url": null
    },
    "cache": {
        "type": "file",
        "base_dir": "cache",
        "storage_account_blob_url": null
    },
    "input": {
        "type": "file",
        "file_type": "text",
        "base_dir": "input",
        "storage_account_blob_url": null,
        "encoding": "utf-8",
        "file_pattern": ".*\\.txt$",
        "file_filter": null,
        "source_column": null,
        "timestamp_column": null,
        "timestamp_format": null,
        "text_column": "text",
        "title_column": null,
        "document_attribute_columns": []
    },
    "embed_graph": {
        "enabled": false,
        "num_walks": 10,
        "walk_length": 40,
        "window_size": 2,
        "iterations": 3,
        "random_seed": 597832,
        "strategy": null
    },
    "embeddings": {
        "llm": {
            "api_key": "==== REDACTED ====",
            "type": "openai_embedding",
            "model": "text-embedding-3-small",
            "max_tokens": 4000,
            "temperature": 0,
            "top_p": 1,
            "n": 1,
            "request_timeout": 180.0,
            "api_base": null,
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": null,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 25
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "batch_size": 16,
        "batch_max_tokens": 8191,
        "target": "required",
        "skip": [],
        "vector_store": null,
        "strategy": null
    },
    "chunks": {
        "size": 1200,
        "overlap": 100,
        "group_by_columns": [
            "id"
        ],
        "strategy": null,
        "encoding_model": null
    },
    "snapshots": {
        "graphml": false,
        "raw_entities": false,
        "top_level_nodes": false
    },
    "entity_extraction": {
        "llm": {
            "api_key": "==== REDACTED ====",
            "type": "openai_chat",
            "model": "gpt-4-turbo-preview",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "n": 1,
            "request_timeout": 180.0,
            "api_base": null,
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": null,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 25
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "prompt": null,
        "entity_types": [
            "organization",
            "person",
            "geo",
            "event"
        ],
        "max_gleanings": 1,
        "strategy": null,
        "encoding_model": null
    },
    "summarize_descriptions": {
        "llm": {
            "api_key": "==== REDACTED ====",
            "type": "openai_chat",
            "model": "gpt-4-turbo-preview",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "n": 1,
            "request_timeout": 180.0,
            "api_base": null,
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": null,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 25
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "prompt": null,
        "max_length": 500,
        "strategy": null
    },
    "community_reports": {
        "llm": {
            "api_key": "==== REDACTED ====",
            "type": "openai_chat",
            "model": "gpt-4-turbo-preview",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "n": 1,
            "request_timeout": 180.0,
            "api_base": null,
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": null,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 25
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "prompt": null,
        "max_length": 2000,
        "max_input_length": 8000,
        "strategy": null
    },
    "claim_extraction": {
        "llm": {
            "api_key": "==== REDACTED ====",
            "type": "openai_chat",
            "model": "gpt-4-turbo-preview",
            "max_tokens": 4000,
            "temperature": 0.0,
            "top_p": 1.0,
            "n": 1,
            "request_timeout": 180.0,
            "api_base": null,
            "api_version": null,
            "proxy": null,
            "cognitive_services_endpoint": null,
            "deployment_name": null,
            "model_supports_json": null,
            "tokens_per_minute": 0,
            "requests_per_minute": 0,
            "max_retries": 10,
            "max_retry_wait": 10.0,
            "sleep_on_rate_limit_recommendation": true,
            "concurrent_requests": 25
        },
        "parallelization": {
            "stagger": 0.3,
            "num_threads": 50
        },
        "async_mode": "threaded",
        "enabled": false,
        "prompt": null,
        "description": "Any claims or facts that could be relevant to information discovery.",
        "max_gleanings": 1,
        "strategy": null,
        "encoding_model": null
    },
    "cluster_graph": {
        "max_cluster_size": 10,
        "strategy": null
    },
    "umap": {
        "enabled": false
    },
    "local_search": {
        "text_unit_prop": 0.5,
        "community_prop": 0.1,
        "conversation_history_max_turns": 5,
        "top_k_entities": 10,
        "top_k_relationships": 10,
        "temperature": 0.0,
        "top_p": 1.0,
        "n": 1,
        "max_tokens": 12000,
        "llm_max_tokens": 2000
    },
    "global_search": {
        "temperature": 0.0,
        "top_p": 1.0,
        "n": 1,
        "max_tokens": 12000,
        "data_max_tokens": 12000,
        "map_max_tokens": 1000,
        "reduce_max_tokens": 2000,
        "concurrency": 32
    },
    "encoding_model": "cl100k_base",
    "skip_workflows": []
}

Logs and screenshots

[Screenshot: Incorrect filename]
[Screenshot: Correct filename]

Log output:

    03:49:54,278 graphrag.index.input.text INFO found text files from input, found [('xyx.txt', {}), ('sample-.txt', {}), ('sample.txt', {})]

Given that the files in my input directory are xyz.txt, sample-input.txt, and sample.txt.

Additional Information

  • GraphRAG Version: 0.3.1
  • Python Version: 3.12

It looks like a quick fix so far - I can work on it :)

danielolamide avatar Aug 29 '24 03:08 danielolamide

Tested this today and it does not appear to be a problem in newer versions

natoverse avatar Nov 03 '25 23:11 natoverse