[Bug]: Extra LLM parameters (temperature, n, top_p) are ignored, causing vLLM to crash!
Describe the bug
GraphRAG's configuration parsing completely misses the LLM control parameters, such as temperature, n, and top_p. Although these are set in the settings.yaml file (shown below), running in verbose mode we found that these parameters were somehow dropped during parsing.
Steps to reproduce
The verbose-mode output shows that the 3 parameters that should be included (temperature, n, top_p) are completely missing:
🚀 Reading settings from ragtest/settings.yaml Using default configuration: { "llm": { "api_key": "REDACTED, length 3", "type": "openai_chat", "model": "Gemma2_9b_it", "max_tokens": 1500, "request_timeout": 180.0, "api_base": "http://localhost:8900/v1", "api_version": "gemma2-9b-it", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 1, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 1 }, "parallelization": { "stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "root_dir": "./ragtest", "reporting": { "type": "file", "base_dir": "output/${timestamp}/reports", "storage_account_blob_url": null }, "storage": { "type": "file", "base_dir": "output/${timestamp}/artifacts", "storage_account_blob_url": null }, "cache": { "type": "file", "base_dir": "cache", "storage_account_blob_url": null }, "input": { "type": "file", "file_type": "text", "base_dir": "input", "storage_account_blob_url": null, "encoding": "utf-8", "file_pattern": ".*\.txt$", "file_filter": null, "source_column": null, "timestamp_column": null, "timestamp_format": null, "text_column": "text", "title_column": null, "document_attribute_columns": [] }, "embed_graph": { "enabled": false, "num_walks": 10, "walk_length": 40, "window_size": 2, "iterations": 3, "random_seed": 597832, "strategy": null }, "embeddings": { "llm": { "api_key": "REDACTED, length 3", "type": "openai_embedding", "model": "intfloat/multilingual-e5-large", "max_tokens": 4000, "request_timeout": 180.0, "api_base": "http://localhost:8000/v1", "api_version": "gemma2-9b-it", "organization": "REDACTED, length 4", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": null, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 10, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, 
"concurrent_requests": 1 }, "parallelization": { "stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "batch_size": 16, "batch_max_tokens": 8191, "target": "required", "skip": [], "vector_store": null, "strategy": null }, "chunks": { "size": 512, "overlap": 100, "group_by_columns": [ "id" ], "strategy": null }, "snapshots": { "graphml": false, "raw_entities": false, "top_level_nodes": false }, "entity_extraction": { "llm": { "api_key": "REDACTED, length 3", "type": "openai_chat", "model": "Gemma2_9b_it", "max_tokens": 1500, "request_timeout": 180.0, "api_base": "http://localhost:8900/v1", "api_version": "gemma2-9b-it", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 1, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 1 }, "parallelization": { "stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "prompt": "prompts/entity_extraction.txt", "entity_types": [ "person", "partnership", "owner", "friend", "role", "technology", "equipment", "organization", "tax code", "event", "location", "date", "factory", "farm", "tower", "resort", "hotel", "real estate", "concept", "decision", "article", "creditor", "debtor", "stock owner", "bond owner", "fund raiser", "issuance", "guarantor", "investigator", "convicted", "arrested" ], "max_gleanings": 0, "strategy": null }, "summarize_descriptions": { "llm": { "api_key": "REDACTED, length 3", "type": "openai_chat", "model": "Gemma2_9b_it", "max_tokens": 1500, "request_timeout": 180.0, "api_base": "http://localhost:8900/v1", "api_version": "gemma2-9b-it", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 1, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 1 }, "parallelization": { 
"stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "prompt": "prompts/summarize_descriptions.txt", "max_length": 500, "strategy": null }, "community_reports": { "llm": { "api_key": "REDACTED, length 3", "type": "openai_chat", "model": "Gemma2_9b_it", "max_tokens": 1500, "request_timeout": 180.0, "api_base": "http://localhost:8900/v1", "api_version": "gemma2-9b-it", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 1, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 1 }, "parallelization": { "stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "prompt": null, "max_length": 2000, "max_input_length": 8000, "strategy": null }, "claim_extraction": { "llm": { "api_key": "REDACTED, length 3", "type": "openai_chat", "model": "Gemma2_9b_it", "max_tokens": 1500, "request_timeout": 180.0, "api_base": "http://localhost:8900/v1", "api_version": "gemma2-9b-it", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 1, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 1 }, "parallelization": { "stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "enabled": false, "prompt": "prompts/claim_extraction.txt", "description": "c\u00e1c kh\u1eb3ng \u0111\u1ecbnh c\u00f3 minh ch\u1ee9ng li\u00ean quan \u0111\u1ebfn c\u00e1c th\u1ef1c th\u1ec3 (entities), t\u1ed5 ch\u1ee9c (orgnizations) hay c\u00e1c quan h\u1ec7 (relationships).", "max_gleanings": 0, "strategy": null }, "cluster_graph": { "max_cluster_size": 10, "strategy": null }, "umap": { "enabled": false }, "local_search": { "text_unit_prop": 0.5, "community_prop": 0.1, "conversation_history_max_turns": 5, "top_k_entities": 10, "top_k_relationships": 10, "max_tokens": 12000, 
"llm_max_tokens": 2000 }, "global_search": { "max_tokens": 12000, "data_max_tokens": 12000, "map_max_tokens": 1000, "reduce_max_tokens": 2000, "concurrency": 32 }, "encoding_model": "cl100k_base", "skip_workflows": [] }
Expected Behavior
GraphRAG should correctly use all parameters that are described in the docs here: https://microsoft.github.io/graphrag/posts/config/json_yaml/
GraphRAG Config Used
encoding_model: cl100k_base skip_workflows: [] llm: api_key: ABC type: openai_chat # or azure_openai_chat model: Gemma2_9b_it model_supports_json: true # recommended if this is available for your model. max_tokens: 1500 request_timeout: 180.0 api_base: http://localhost:8900/v1 api_version: gemma2-9b-it organization: EraX deployment_name: gemma2 max_retries: 1 concurrent_requests: 1 # the number of parallel inflight requests that may be made temperature: 1.0 top_p: 0.95 n: 1 max_retry_wait: 10.0 sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times tokens_per_minute: 150_000 # set a leaky bucket throttle requests_per_minute: 10_000 # set a leaky bucket throttle
parallelization: stagger: 0.3 num_threads: 1 # the number of threads to use for parallel processing
async_mode: threaded # or asyncio
embeddings:
parallelization: override the global parallelization settings for embeddings
async_mode: threaded # or asyncio llm: api_key: ABC type: openai_embedding # or azure_openai_embedding model: intfloat/multilingual-e5-large api_base: http://localhost:8000/v1 api_version: gemma2-9b-it organization: EraX deployment_name: gemma2 temperature: 1.0 top_p: 0.95 n: 1 # tokens_per_minute: 150_000 # set a leaky bucket throttle # requests_per_minute: 10_000 # set a leaky bucket throttle max_retries: 10 # max_retry_wait: 10.0 # sleep_on_rate_limit_recommendation: true # whether to sleep when azure suggests wait-times concurrent_requests: 1 # the number of parallel inflight requests that may be made # batch_size: 16 # the number of documents to send in a single request # batch_max_tokens: 8191 # the maximum number of tokens to send in a single request # target: required # or optional
chunks: size: 512 overlap: 100 group_by_columns: [id] # by default, we don't allow chunks to cross documents
input: type: file # or blob file_type: text # or csv base_dir: "input" file_encoding: utf-8 file_pattern: ".*\.txt$"
cache: type: file # or blob base_dir: "cache"
connection_string: <azure_blob_storage_connection_string>
container_name: <azure_blob_storage_container_name>
storage: type: file # or blob base_dir: "output/${timestamp}/artifacts"
connection_string: <azure_blob_storage_connection_string>
container_name: <azure_blob_storage_container_name>
reporting: type: file # or console, blob base_dir: "output/${timestamp}/reports"
connection_string: <azure_blob_storage_connection_string>
container_name: <azure_blob_storage_container_name>
entity_extraction:
llm: override the global llm settings for this task
parallelization: override the global parallelization settings for this task
async_mode: override the global async_mode settings for this task
llm: n: 1 temperature: 1.0 frequency_penalty: 1.0 top_p: 0.95 prompt: "prompts/entity_extraction.txt" entity_types: [person, partnership, owner, friend, role, technology, equipment, organization, tax code, event, location, date, factory, farm, tower, resort, hotel, real estate, concept, decision, article, creditor, debtor, stock owner, bond owner, fund raiser, issuance, guarantor, investigator, convicted, arrested] max_gleanings: 0
summarize_descriptions:
llm: override the global llm settings for this task
parallelization: override the global parallelization settings for this task
async_mode: override the global async_mode settings for this task
llm: n: 1 temperature: 1.0 frequency_penalty: 1.0 top_p: 0.95 prompt: "prompts/summarize_descriptions.txt" max_length: 500
claim_extraction:
llm: override the global llm settings for this task
llm: n: 1 temperature: 1.0 frequency_penalty: 1.0 top_p: 0.95
parallelization: override the global parallelization settings for this task
async_mode: override the global async_mode settings for this task
enabled: true
prompt: "prompts/claim_extraction.txt" description: "các khẳng định có minh chứng liên quan đến các thực thể (entities), tổ chức (orgnizations) hay các quan hệ (relationships)." max_gleanings: 0
community_report:
llm: override the global llm settings for this task
parallelization: override the global parallelization settings for this task
async_mode: override the global async_mode settings for this task
llm: n: 1 temperature: 1.0 frequency_penalty: 1.0 top_p: 0.95 prompt: "prompts/community_report.txt" max_length: 2000 max_input_length: 8000
cluster_graph: max_cluster_size: 10
embed_graph: enabled: false # if true, will generate node2vec embeddings for nodes
num_walks: 10
walk_length: 40
window_size: 2
iterations: 3
random_seed: 597832
umap: enabled: false # if true, will generate UMAP embeddings for nodes
snapshots: graphml: false raw_entities: false top_level_nodes: false
local_search:
text_unit_prop: 0.5
community_prop: 0.1
conversation_history_max_turns: 5
top_k_mapped_entities: 10
top_k_relationships: 10
max_tokens: 12000
global_search:
max_tokens: 12000
data_max_tokens: 12000
map_max_tokens: 1000
reduce_max_tokens: 2000
concurrency: 32
Logs and screenshots
🚀 Reading settings from ragtest/settings.yaml Using default configuration: { "llm": { "api_key": "REDACTED, length 3", "type": "openai_chat", "model": "Gemma2_9b_it", "max_tokens": 1500, "request_timeout": 180.0, "api_base": "http://localhost:8900/v1", "api_version": "gemma2-9b-it", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 1, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 1 }, "parallelization": { "stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "root_dir": "./ragtest", "reporting": { "type": "file", "base_dir": "output/${timestamp}/reports", "storage_account_blob_url": null }, "storage": { "type": "file", "base_dir": "output/${timestamp}/artifacts", "storage_account_blob_url": null }, "cache": { "type": "file", "base_dir": "cache", "storage_account_blob_url": null }, "input": { "type": "file", "file_type": "text", "base_dir": "input", "storage_account_blob_url": null, "encoding": "utf-8", "file_pattern": ".*\.txt$", "file_filter": null, "source_column": null, "timestamp_column": null, "timestamp_format": null, "text_column": "text", "title_column": null, "document_attribute_columns": [] }, "embed_graph": { "enabled": false, "num_walks": 10, "walk_length": 40, "window_size": 2, "iterations": 3, "random_seed": 597832, "strategy": null }, "embeddings": { "llm": { "api_key": "REDACTED, length 3", "type": "openai_embedding", "model": "intfloat/multilingual-e5-large", "max_tokens": 4000, "request_timeout": 180.0, "api_base": "http://localhost:8000/v1", "api_version": "gemma2-9b-it", "organization": "REDACTED, length 4", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": null, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 10, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, 
"concurrent_requests": 1 }, "parallelization": { "stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "batch_size": 16, "batch_max_tokens": 8191, "target": "required", "skip": [], "vector_store": null, "strategy": null }, "chunks": { "size": 512, "overlap": 100, "group_by_columns": [ "id" ], "strategy": null }, "snapshots": { "graphml": false, "raw_entities": false, "top_level_nodes": false }, "entity_extraction": { "llm": { "api_key": "REDACTED, length 3", "type": "openai_chat", "model": "Gemma2_9b_it", "max_tokens": 1500, "request_timeout": 180.0, "api_base": "http://localhost:8900/v1", "api_version": "gemma2-9b-it", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 1, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 1 }, "parallelization": { "stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "prompt": "prompts/entity_extraction.txt", "entity_types": [ "person", "partnership", "owner", "friend", "role", "technology", "equipment", "organization", "tax code", "event", "location", "date", "factory", "farm", "tower", "resort", "hotel", "real estate", "concept", "decision", "article", "creditor", "debtor", "stock owner", "bond owner", "fund raiser", "issuance", "guarantor", "investigator", "convicted", "arrested" ], "max_gleanings": 0, "strategy": null }, "summarize_descriptions": { "llm": { "api_key": "REDACTED, length 3", "type": "openai_chat", "model": "Gemma2_9b_it", "max_tokens": 1500, "request_timeout": 180.0, "api_base": "http://localhost:8900/v1", "api_version": "gemma2-9b-it", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 1, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 1 }, "parallelization": { 
"stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "prompt": "prompts/summarize_descriptions.txt", "max_length": 500, "strategy": null }, "community_reports": { "llm": { "api_key": "REDACTED, length 3", "type": "openai_chat", "model": "Gemma2_9b_it", "max_tokens": 1500, "request_timeout": 180.0, "api_base": "http://localhost:8900/v1", "api_version": "gemma2-9b-it", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 1, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 1 }, "parallelization": { "stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "prompt": null, "max_length": 2000, "max_input_length": 8000, "strategy": null }, "claim_extraction": { "llm": { "api_key": "REDACTED, length 3", "type": "openai_chat", "model": "Gemma2_9b_it", "max_tokens": 1500, "request_timeout": 180.0, "api_base": "http://localhost:8900/v1", "api_version": "gemma2-9b-it", "proxy": null, "cognitive_services_endpoint": null, "deployment_name": "gemma2", "model_supports_json": true, "tokens_per_minute": 0, "requests_per_minute": 0, "max_retries": 1, "max_retry_wait": 10.0, "sleep_on_rate_limit_recommendation": true, "concurrent_requests": 1 }, "parallelization": { "stagger": 0.3, "num_threads": 1 }, "async_mode": "threaded", "enabled": false, "prompt": "prompts/claim_extraction.txt", "description": "c\u00e1c kh\u1eb3ng \u0111\u1ecbnh c\u00f3 minh ch\u1ee9ng li\u00ean quan \u0111\u1ebfn c\u00e1c th\u1ef1c th\u1ec3 (entities), t\u1ed5 ch\u1ee9c (orgnizations) hay c\u00e1c quan h\u1ec7 (relationships).", "max_gleanings": 0, "strategy": null }, "cluster_graph": { "max_cluster_size": 10, "strategy": null }, "umap": { "enabled": false }, "local_search": { "text_unit_prop": 0.5, "community_prop": 0.1, "conversation_history_max_turns": 5, "top_k_entities": 10, "top_k_relationships": 10, "max_tokens": 12000, 
"llm_max_tokens": 2000 }, "global_search": { "max_tokens": 12000, "data_max_tokens": 12000, "map_max_tokens": 1000, "reduce_max_tokens": 2000, "concurrency": 32 }, "encoding_model": "cl100k_base", "skip_workflows": [] }
Additional Information
Everything is at the latest version, installed today (14 July 2024).
Thanks, Steve
These parameters were not included in the configuration initialization. This will be fixed in the next release. In the meantime, try running from source code using poetry if you need to adjust the parameters. https://microsoft.github.io/graphrag/posts/developing
Consolidating alternate model issues here: https://github.com/microsoft/graphrag/issues/657