Consolidate and refactor configuration based on RAG Triad

Open Lep06fg opened this issue 2 years ago • 1 comments

As a RAG Experiment Accelerator user, I would like to have a meaningful, hierarchy-based config file structure so that I can easily understand the different features and settings on which I can conduct experiments.

Current config file structure


{
    "index_name_prefix": "idx",
    "experiment_name": "",
    "job_name": "",
    "job_description": "",
    "preprocess": false,
    "chunking": {
        "chunk_size": [1000],
        "overlap_size": [200],
        "generate_title": false,
        "generate_summary": false,
        "override_content_with_summary": false
    },
    "embedding_models": [
        {
            "type": "azure",
            "deployment_name": "text-embedding-ada-002"
        }
    ],
    "ef_construction": [400],
    "ef_search": [400],
    "language":{
        "analyzers": {
            "analyzer_name": "en.microsoft",
            "index_analyzer_name": "",
            "search_analyzer_name": "",
            "char_filters": [],
            "tokenizers": [],
            "token_filters": []
        },
        "query_language": "en-us"
    },
    "rerank": true,
    "rerank_type": "crossencoder",
    "llm_re_rank_threshold": 3,
    "cross_encoder_at_k": 4,
    "crossencoder_model": "cross-encoder/stsb-roberta-base",
    "search_types": [
        "search_for_manual_hybrid"
    ],
    "retrieve_num_of_documents": 5,
    "metric_types": [
        "cosine"
    ],
    "azure_oai_chat_deployment_name": "gpt-35-turbo",
    "azure_oai_eval_deployment_name": "gpt-35-turbo",
    "openai_temperature": 0,
    "search_relevancy_threshold": 0.8,
    "data_formats": "all",
    "eval_data_jsonl_file_path": "./artifacts/eval_data.jsonl",
    "chunking_strategy": "basic",
    "chain_of_thoughts": true,
    "hyde": "disabled",
    "query_expansion": false,
    "min_query_expansion_related_question_similarity_score": 90,
    "azure_document_intelligence_model": "prebuilt-read"
}

Desired config file structure

{
    "experiment": {
        "name": "",
        "job_name": "",
        "job_description": ""
    },
    "indexing": {
        "index_name_prefix": "idx",
        "data_formats": "all",
        "sampling": {
            "sample_data": true,
            "sample_percentage": 10,
            "optimum_k": "auto",
            "min_cluster": 2,
            "max_cluster": 30
        },
        "preprocess": false,
        "chunking": {
            "chunking_strategy": "basic",
            "chunk_size": [1000],
            "overlap_size": [200],
            "generate_title": false,
            "generate_summary": false,
            "override_content_with_summary": false,
            "azure_document_intelligence_model": "prebuilt-read"
        },
        "azure_search": {
            "hnsw": {
                "ef_construction": [400],
                "ef_search": [400]
            },
            "language":{
                "analyzers": {
                    "analyzer_name": "en.microsoft",
                    "index_analyzer_name": "",
                    "search_analyzer_name": "",
                    "char_filters": [],
                    "tokenizers": [],
                    "token_filters": []
                },
                "query_language": "en-us"
            }
        }
    },
    "embedding": {
        "embedding_models": [{
            "type": "azure",
            "deployment_name": "text-embedding-ada-002"
        }]
    },
    "retrieval": {
        "reranking": {
            "rerank": true,
            "rerank_type": "crossencoder",
            "llm_re_rank_threshold": 3,
            "cross_encoder_at_k": 4,
            "crossencoder_model": "cross-encoder/stsb-roberta-base"
        },
        "search_types": [
            "search_for_manual_hybrid"
        ],
        "max_num_of_documents": 5,
        "chain_of_thoughts": true,
        "hyde": "disabled",
        "query_expansion": {
            "enabled": false,
            "min_query_expansion_related_question_similarity_score": 90
        }
    },
    "generation": {
        "azure_oai_chat_deployment_name": "gpt-35-turbo",
        "openai_temperature": 0
    },
    "evaluation": {
        "metric_types": [
            "cosine"
        ],
        "azure_oai_eval_deployment_name": "gpt-35-turbo",
        "search_relevancy_threshold": 0.8,
        "eval_data_jsonl_file_path": "./artifacts/eval_data.jsonl"
    }
}

### Tasks
- [x] Design new configuration structure
- [ ] Refactor config structure
- [ ] Fix sample file
- [ ] Fix CI file

Thoughts:

indexing.json (indexing, chunking, preprocess)
experimentation.json (querying, generation)
retrieval evaluation
generation evaluation.json (iteratively)

May 02 '24 15:05 Lep06fg

@ritesh-modi @shanepeckham @martinpeck @julia-meshcheryakova, could you please share your thoughts about this design for the refactor?

May 12 '24 14:05 guybartal