azure-search-openai-demo
Reuse .env key values in prepdocs instead of building long argument command lines
Since we already maintain the IaC and Python app configuration in .env files anyway, why not reuse and extend them for the Python command-line programs via an env/config file parameter, instead of injecting dozens of arguments through a shell script or setting environment variables manually again? For example, in prepdocs.sh / prepdocs.py the code below would read all arguments from the .env file and reuse the existing IaC (Bicep) configuration.
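As a rough sketch of the env-file-parameter idea (the `--envfile` flag here is hypothetical, not an existing prepdocs option), the file to load could itself be passed on the command line before the remaining arguments are parsed:

```python
import argparse

from dotenv import load_dotenv

# Hypothetical pre-parser: read an optional --envfile argument first,
# then load that file so its values become available via os.getenv().
pre_parser = argparse.ArgumentParser(add_help=False)
pre_parser.add_argument("--envfile", default=".env", help="Path to the .env file to load")
pre_args, remaining_argv = pre_parser.parse_known_args()

# load_dotenv() does nothing if the file does not exist, so this stays optional.
load_dotenv(dotenv_path=pre_args.envfile)
```

The remaining arguments would then be parsed as today, with each value falling back to the loaded environment variable when its flag is omitted.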
Here’s the updated Python code to map the `.env` variables to the corresponding arguments without introducing any new arguments:
```python
import os
from dotenv import load_dotenv
import argparse
# Load environment variables from .env file
load_dotenv()
# Define your argument parser
parser = argparse.ArgumentParser(
description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
epilog="Example: prepdocs.py '.\\data\\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v",
)
# Add arguments as per your original code
parser.add_argument("files", nargs="?", help="Files to be processed")
parser.add_argument("--datalakestorageaccount", required=False, help="Optional. Azure Data Lake Storage Gen2 Account name")
parser.add_argument("--datalakefilesystem", required=False, default="gptkbcontainer", help="Optional. Azure Data Lake Storage Gen2 filesystem name")
parser.add_argument("--datalakepath", required=False, help="Optional. Azure Data Lake Storage Gen2 filesystem path containing files to index. If omitted, index the entire filesystem")
parser.add_argument("--datalakekey", required=False, help="Optional. Use this key when authenticating to Azure Data Lake Gen2")
parser.add_argument("--useacls", action="store_true", help="Store ACLs from Azure Data Lake Gen2 Filesystem in the search index")
parser.add_argument("--category", help="Value for the category field in the search index for all sections indexed in this run")
parser.add_argument("--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage")
parser.add_argument("--storageaccount", help="Azure Blob Storage account name")
parser.add_argument("--container", help="Azure Blob Storage container name")
parser.add_argument("--storageresourcegroup", help="Azure blob storage resource group")
parser.add_argument("--storagekey", required=False, help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)")
parser.add_argument("--tenantid", required=False, help="Optional. Use this to define the Azure directory where to authenticate")
parser.add_argument("--subscriptionid", required=False, help="Optional. Use this to define managed identity connection string in integrated vectorization")
parser.add_argument("--searchservice", help="Name of the Azure AI Search service where content should be indexed (must exist already)")
parser.add_argument("--searchserviceassignedid", required=False, help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)")
parser.add_argument("--index", help="Name of the Azure AI Search index where content should be indexed (will be created if it doesn't exist)")
parser.add_argument("--searchkey", required=False, help="Optional. Use this Azure AI Search account key instead of the current user identity to login (use az login to set current user for Azure)")
parser.add_argument("--searchanalyzername", required=False, default="en.microsoft", help="Optional. Name of the Azure AI Search analyzer to use for the content field in the index")
parser.add_argument("--openaihost", help="Host of the API used to compute embeddings ('azure' or 'openai')")
parser.add_argument("--openaiservice", help="Name of the Azure OpenAI service used to compute embeddings")
parser.add_argument("--openaideployment", help="Name of the Azure OpenAI model deployment for an embedding model ('text-embedding-ada-002' recommended)")
parser.add_argument("--openaimodelname", help="Name of the Azure OpenAI embedding model ('text-embedding-ada-002' recommended)")
parser.add_argument("--openaidimensions", required=False, default=1536, type=int, help="Dimensions for the embedding model (defaults to 1536 for 'text-embedding-ada-002')")
parser.add_argument("--novectors", action="store_true", help="Don't compute embeddings for the sections (e.g. don't call the OpenAI embeddings API during indexing)")
parser.add_argument("--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections")
parser.add_argument("--openaicustomurl", required=False, help="Optional. Use this custom OpenAI URL instead of the default OpenAI URL")
parser.add_argument("--openaikey", required=False, help="Optional. Use this OpenAI account key instead of the current Azure user identity to login.")
parser.add_argument("--openaiorg", required=False, help="This is required only when using non-Azure endpoints.")
parser.add_argument("--remove", action="store_true", help="Remove references to this document from blob storage and the search index")
parser.add_argument("--removeall", action="store_true", help="Remove all blobs from blob storage and documents from the search index")
parser.add_argument("--localpdfparser", action="store_true", help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Document Intelligence service to extract text, tables and layout from the documents")
parser.add_argument("--localhtmlparser", action="store_true", help="Use Beautiful soap local HTML parser instead of Azure Document Intelligence service to extract text, tables and layout from the documents")
parser.add_argument("--documentintelligenceservice", required=False, help="Optional. Name of the Azure Document Intelligence service which will be used to extract text, tables and layout from the documents (must exist already)")
parser.add_argument("--documentintelligencekey", required=False, help="Optional. Use this Azure Document Intelligence account key instead of the current user identity to login (use az login to set current user for Azure)")
parser.add_argument("--searchimages", action="store_true", required=False, help="Optional. Generate image embeddings to enable each page to be searched as an image")
parser.add_argument("--visionendpoint", required=False, help="Optional, required if --searchimages is specified. Endpoint of Azure AI Vision service to use when embedding images.")
parser.add_argument("--useintvectorization", required=False, help="Required if --useintvectorization is specified. Enable Integrated vectorizer indexer support which is in preview)")
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
# Parse the arguments
args = parser.parse_args()
# Map .env variables to args (CLI values take precedence; .env values are the fallback)
args.files = args.files or os.getenv('FILES')
args.datalakestorageaccount = args.datalakestorageaccount or os.getenv('AZURE_DATALAKE_STORAGE_ACCOUNT')
args.datalakefilesystem = args.datalakefilesystem or os.getenv('AZURE_DATALAKE_FILESYSTEM', 'gptkbcontainer')
args.datalakepath = args.datalakepath or os.getenv('AZURE_DATALAKE_PATH')
args.datalakekey = args.datalakekey or os.getenv('AZURE_DATALAKE_KEY')
args.useacls = args.useacls or os.getenv('USE_ACLS') == 'true'
args.category = args.category or os.getenv('CATEGORY')
args.skipblobs = args.skipblobs or os.getenv('SKIP_BLOBS') == 'true'
args.storageaccount = args.storageaccount or os.getenv('AZURE_STORAGE_ACCOUNT')
args.container = args.container or os.getenv('AZURE_STORAGE_CONTAINER')
args.storageresourcegroup = args.storageresourcegroup or os.getenv('AZURE_STORAGE_RESOURCE_GROUP')
args.storagekey = args.storagekey or os.getenv('AZURE_STORAGE_KEY')
args.tenantid = args.tenantid or os.getenv('AZURE_TENANT_ID')
args.subscriptionid = args.subscriptionid or os.getenv('AZURE_SUBSCRIPTION_ID')
args.searchservice = args.searchservice or os.getenv('AZURE_SEARCH_SERVICE')
args.searchserviceassignedid = args.searchserviceassignedid or os.getenv('AZURE_SEARCH_SERVICE_ASSIGNED_ID')
args.index = args.index or os.getenv('AZURE_SEARCH_INDEX')
args.searchkey = args.searchkey or os.getenv('AZURE_SEARCH_KEY')
args.searchanalyzername = args.searchanalyzername or os.getenv('AZURE_SEARCH_ANALYZER_NAME', 'en.microsoft')
args.openaihost = args.openaihost or os.getenv('OPENAI_HOST')
args.openaiservice = args.openaiservice or os.getenv('AZURE_OPENAI_SERVICE')
args.openaideployment = args.openaideployment or os.getenv('AZURE_OPENAI_DEPLOYMENT')
args.openaimodelname = args.openaimodelname or os.getenv('AZURE_OPENAI_MODEL_NAME')
args.openaidimensions = args.openaidimensions or int(os.getenv('AZURE_OPENAI_EMB_DIMENSIONS', 1536))
args.novectors = args.novectors or os.getenv('NO_VECTORS') == 'true'
args.disablebatchvectors = args.disablebatchvectors or os.getenv('DISABLE_BATCH_VECTORS') == 'true'
args.openaicustomurl = args.openaicustomurl or os.getenv('AZURE_OPENAI_CUSTOM_URL')
args.openaikey = args.openaikey or os.getenv('AZURE_OPENAI_API_KEY_OVERRIDE')
args.openaiorg = args.openaiorg or os.getenv('OPENAI_ORG')
args.remove = args.remove or os.getenv('REMOVE') == 'true'
args.removeall = args.removeall or os.getenv('REMOVE_ALL') == 'true'
args.localpdfparser = args.localpdfparser or os.getenv('LOCAL_PDF_PARSER') == 'true'
args.localhtmlparser = args.localhtmlparser or os.getenv('LOCAL_HTML_PARSER') == 'true'
args.documentintelligenceservice = args.documentintelligenceservice or os.getenv('AZURE_DOCUMENT_INTELLIGENCE_SERVICE')
args.documentintelligencekey = args.documentintelligencekey or os.getenv('AZURE_DOCUMENT_INTELLIGENCE_KEY')
# ...
```
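The per-option assignments above could probably be collapsed further; purely as a sketch (reusing the environment variable names from the mapping above), a table-driven loop would avoid repeating the fallback logic for every option:

```python
# Sketch: declare the argparse destination -> .env variable mapping once,
# then apply the fallback in a loop instead of one assignment per option.
ENV_MAPPING = {
    "storageaccount": "AZURE_STORAGE_ACCOUNT",
    "container": "AZURE_STORAGE_CONTAINER",
    "searchservice": "AZURE_SEARCH_SERVICE",
    "index": "AZURE_SEARCH_INDEX",
    "openaiservice": "AZURE_OPENAI_SERVICE",
    # ... extend with the remaining options from the assignments above
}

for dest, env_name in ENV_MAPPING.items():
    # Only fall back to the .env value when the flag was not supplied on the command line.
    if getattr(args, dest) is None and os.getenv(env_name) is not None:
        setattr(args, dest, os.getenv(env_name))
```

With this in place, a typical invocation could shrink to something like `python prepdocs.py './data/*' -v`, with everything else picked up from the existing .env file.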