Document ID 0 with page ID 1 already exists in the index

Open josephmattamana90 opened this issue 11 months ago • 0 comments

I used vidore/colqwen2-v1.0 for RAG on multiple pdf files with streamlit. The streamline interface inputs zip file, unzips them and RAG on the pdf files. I tried assigning 'index_name' parameter based on the individual unique pdf file name. However, I am getting the error 'Document ID 0 with page ID 1 already exists in the index' after the first pdf is indexed. The code is working fine if there is only 1 pdf file. However, when there are multiple pdf files, the error is generated.

Thankyou for your assistance.

import streamlit as st import base64 from huggingface_hub import notebook_login from byaldi import RAGMultiModalModel from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor from PIL import Image from io import BytesIO import torch import re import os from openai import OpenAI from dotenv import load_dotenv from zipfile import ZipFile import tempfile import uuid

def main(): load_dotenv() client = OpenAI() # Define a directory for file uploads within the current working directory #upload_dir = os.path.join(os.getcwd(), "uploaded_docs") #os.makedirs(upload_dir, exist_ok=True) # Create the directory if it doesn't exist # Define a temporary directory for file processing upload_dir = tempfile.mkdtemp()

# Set page layout to wide
st.set_page_config(layout="wide")

st.title("Colpali Based Multimodal RAG App")

# Create sidebar for configuration options
with st.sidebar:
    st.header("Configuration Options")
    
    # Dropdown for selecting Colpali model
    colpali_model = st.selectbox(
        "Select Colpali Model",
        options=["vidore/colpali", "vidore/colpali-v1.2","vidore/colqwen2-v1.0"]
    )
    
    # Dropdown for selecting Multi-Model LLM
    multi_model_llm = st.selectbox(
        "Select Multi-Model LLM",
        options=["gpt-4o", "Qwin", "Llama3.2"]
    )
    
    # File upload button
    uploaded_file = st.file_uploader("Upload a zip file", type=["zip"])

# Main content layout
if uploaded_file is not None:
    col1, col2 = st.columns([1, 2])
    
    with col1:
        st.write("### Uploaded ZIP file")
        zip_path = os.path.join(upload_dir, uploaded_file.name)
        with open(zip_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.success(f"File saved: {uploaded_file.name}")

        # Extract the zip file
        pdf_dir = os.path.join(upload_dir, "extracted_pdfs")
        os.makedirs(pdf_dir, exist_ok=True)

        with ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(pdf_dir)

        # Get all PDF files in the extracted directory
        pdf_files = [os.path.join(pdf_dir, file) for file in os.listdir(pdf_dir) if file.lower().endswith(".pdf")]
        st.write(f"Found {len(pdf_files)} PDF files in the ZIP file.")

        @st.cache_resource
        def load_models(colpali_model):
            RAG = RAGMultiModalModel.from_pretrained(colpali_model, verbose=10)
            return RAG
        RAG= load_models(colpali_model)

        for pdf_file in pdf_files:
            index_name = f"{os.path.basename(pdf_file).replace('.pdf', '')}_{uuid.uuid4().hex}"  # Use the PDF name as index_name
            print("Index name",index_name)
            
            st.subheader(f"Processing: {os.path.basename(pdf_file)}")
            with st.spinner(f"Indexing {index_name}..."):
                try:
                    @st.cache_data
                    def create_rag_index(image_path,index_name):
                        RAG.index(
                            input_path=image_path,
                            index_name=index_name,
                            store_collection_with_index=True,
                            overwrite=True,  # Ensure any existing index is cleared
                            )
                    create_rag_index(pdf_file,index_name)
                
                except ValueError as e:
                    st.error(f"ValueError while indexing {os.path.basename(pdf_file)}: {e}")
                except Exception as e:
                    st.error(f"Error while indexing {os.path.basename(pdf_file)}: {e}")

                
    
        
    with col2:
        # Text input for the user query
        text_query = st.text_input("Enter your text query")

        # Search and Extract Text button
        if st.button("Search and Extract Text"):
            if text_query:
                results = RAG.search(text_query, k=1, return_base64_results=True)

                image_data = base64.b64decode(results[0].base64)
                image = Image.open(BytesIO(image_data))
                thumbnail = image.resize((1, 1))
                # Display the thumbnail
                st.image(thumbnail, use_column_width=False)

                # Create a button to show the full image
                with st.expander("View Full Image"):
                    st.image(image, caption="Result Image", use_column_width=True)

                response = client.chat.completions.create(
                model=multi_model_llm,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": text_query},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{results[0].base64}"
                                }
                            },
                        ],
                    }
                ],
                max_tokens=300,
            )
                # print(response)
                output =response.choices[0].message.content
                st.subheader("Query with LLM Model")
                st.markdown(output,unsafe_allow_html=True)
                    # Placeholder for search results
                    # st.markdown(highlighted_output, unsafe_allow_html=True)
            else:
                st.warning("Please enter a query.")
else:
    st.info("Upload a ZIP file to get started.")

if name == 'main': main()

Jan 19 '25 14:01 josephmattamana90