Document ID 0 with page ID 1 already exists in the index
I used vidore/colqwen2-v1.0 for RAG on multiple pdf files with streamlit. The streamline interface inputs zip file, unzips them and RAG on the pdf files. I tried assigning 'index_name' parameter based on the individual unique pdf file name. However, I am getting the error 'Document ID 0 with page ID 1 already exists in the index' after the first pdf is indexed. The code is working fine if there is only 1 pdf file. However, when there are multiple pdf files, the error is generated.
Thankyou for your assistance.
import streamlit as st import base64 from huggingface_hub import notebook_login from byaldi import RAGMultiModalModel from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor from PIL import Image from io import BytesIO import torch import re import os from openai import OpenAI from dotenv import load_dotenv from zipfile import ZipFile import tempfile import uuid
def main(): load_dotenv() client = OpenAI() # Define a directory for file uploads within the current working directory #upload_dir = os.path.join(os.getcwd(), "uploaded_docs") #os.makedirs(upload_dir, exist_ok=True) # Create the directory if it doesn't exist # Define a temporary directory for file processing upload_dir = tempfile.mkdtemp()
# Set page layout to wide
st.set_page_config(layout="wide")
st.title("Colpali Based Multimodal RAG App")
# Create sidebar for configuration options
with st.sidebar:
st.header("Configuration Options")
# Dropdown for selecting Colpali model
colpali_model = st.selectbox(
"Select Colpali Model",
options=["vidore/colpali", "vidore/colpali-v1.2","vidore/colqwen2-v1.0"]
)
# Dropdown for selecting Multi-Model LLM
multi_model_llm = st.selectbox(
"Select Multi-Model LLM",
options=["gpt-4o", "Qwin", "Llama3.2"]
)
# File upload button
uploaded_file = st.file_uploader("Upload a zip file", type=["zip"])
# Main content layout
if uploaded_file is not None:
col1, col2 = st.columns([1, 2])
with col1:
st.write("### Uploaded ZIP file")
zip_path = os.path.join(upload_dir, uploaded_file.name)
with open(zip_path, "wb") as f:
f.write(uploaded_file.getbuffer())
st.success(f"File saved: {uploaded_file.name}")
# Extract the zip file
pdf_dir = os.path.join(upload_dir, "extracted_pdfs")
os.makedirs(pdf_dir, exist_ok=True)
with ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(pdf_dir)
# Get all PDF files in the extracted directory
pdf_files = [os.path.join(pdf_dir, file) for file in os.listdir(pdf_dir) if file.lower().endswith(".pdf")]
st.write(f"Found {len(pdf_files)} PDF files in the ZIP file.")
@st.cache_resource
def load_models(colpali_model):
RAG = RAGMultiModalModel.from_pretrained(colpali_model, verbose=10)
return RAG
RAG= load_models(colpali_model)
for pdf_file in pdf_files:
index_name = f"{os.path.basename(pdf_file).replace('.pdf', '')}_{uuid.uuid4().hex}" # Use the PDF name as index_name
print("Index name",index_name)
st.subheader(f"Processing: {os.path.basename(pdf_file)}")
with st.spinner(f"Indexing {index_name}..."):
try:
@st.cache_data
def create_rag_index(image_path,index_name):
RAG.index(
input_path=image_path,
index_name=index_name,
store_collection_with_index=True,
overwrite=True, # Ensure any existing index is cleared
)
create_rag_index(pdf_file,index_name)
except ValueError as e:
st.error(f"ValueError while indexing {os.path.basename(pdf_file)}: {e}")
except Exception as e:
st.error(f"Error while indexing {os.path.basename(pdf_file)}: {e}")
with col2:
# Text input for the user query
text_query = st.text_input("Enter your text query")
# Search and Extract Text button
if st.button("Search and Extract Text"):
if text_query:
results = RAG.search(text_query, k=1, return_base64_results=True)
image_data = base64.b64decode(results[0].base64)
image = Image.open(BytesIO(image_data))
thumbnail = image.resize((1, 1))
# Display the thumbnail
st.image(thumbnail, use_column_width=False)
# Create a button to show the full image
with st.expander("View Full Image"):
st.image(image, caption="Result Image", use_column_width=True)
response = client.chat.completions.create(
model=multi_model_llm,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": text_query},
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{results[0].base64}"
}
},
],
}
],
max_tokens=300,
)
# print(response)
output =response.choices[0].message.content
st.subheader("Query with LLM Model")
st.markdown(output,unsafe_allow_html=True)
# Placeholder for search results
# st.markdown(highlighted_output, unsafe_allow_html=True)
else:
st.warning("Please enter a query.")
else:
st.info("Upload a ZIP file to get started.")
if name == 'main': main()