
[FEATURE REQUEST]: How to create a Wikipedia Vector Store in your demo LLM-99

Open · Connor-Shen opened this issue 1 year ago · 1 comment

The problem you are trying to solve:

Thank you so much for your amazing work! I tried your LLM-99 physics demo and it's fantastic! I'm a little curious how you gathered the physics Wikipedia articles to build the vector database. Did you crawl all the physics Wikipedia pages? Thanks again for your great work!

Suggested new feature or change:

Vector Store building

Connor-Shen · Apr 04 '24 09:04

Yes, I used a scrappy script to crawl the Wikipedia pages:

# pylint: skip-file
from mediawiki import MediaWiki
import nltk
from nltk.tokenize import sent_tokenize
import os
from llmflows.llms import OpenAIEmbeddings
from llmflows.vectorstores import VectorDoc, Pinecone
from wikipedia_articles import wikipedia_pages
from tqdm import tqdm

nltk.download("punkt")

wikipedia = MediaWiki()

print(len(wikipedia_pages))
wiki_chunk_size = 200

# TODO: failed batches: 3600-3800, 4400-4600, 4800-5000, 5600-5800, 6200-6400, 7400-7600
# 8200-8400, 8400-8600, 8600-8800, 8800-9000, 9200-9400, 10200-10400, 10600-10800, 
# 11000-11200, 12200-12400, 12400-12600, 12600-12800, 12800-13000, 13000-13200, 
# 13200-13400, 13800-14000

for i in tqdm(range(0, len(wikipedia_pages), wiki_chunk_size)):
    print(f"working on wikipedia chunk {i} : {i+wiki_chunk_size}")
    chunk_pages = wikipedia_pages[i:i+wiki_chunk_size]

    superconductor_docs = []

    fails = []

    for page in tqdm(chunk_pages):
        try:
            page_obj = wikipedia.page(page)
            page_sentences = sent_tokenize(page_obj.content)
            metadata = {"page_title": page, "link": f"https://en.wikipedia.org/wiki/{page}"}

            # concatenate every 8 sentences with an overlap of 1 sentence
            vector_docs = [
                VectorDoc(doc=" ".join(page_sentences[j:j + 8]), metadata=metadata)
                for j in range(0, len(page_sentences), 7)
            ]

            # report and drop overly long docs; popping from the list while
            # iterating over it would skip elements
            for vd in vector_docs:
                if len(vd.doc) > 30000:
                    print(f"doc too long: {len(vd.doc)}")
                    print(vd.metadata["link"])
            vector_docs = [vd for vd in vector_docs if len(vd.doc) <= 30000]

            superconductor_docs += vector_docs
        except Exception:
            print(f"failed to get {page}")
            fails.append(page)


    pinecone_api_key = os.environ.get("PINECONE_API_KEY", "<your-api-key>")
    openai_api_key = os.environ.get("OPENAI_API_KEY", "<your-api-key>")

    # Create embeddings LLM
    embeddings_llm = OpenAIEmbeddings(api_key=openai_api_key, max_retries=100)

    # Generate embeddings for the VectorDocs
    embedded_docs = embeddings_llm.generate(superconductor_docs)

    # initialize Pinecone
    vector_db = Pinecone(
        index_name="llm-99",
        api_key=pinecone_api_key,
        environment="us-west4-gcp-free",
    )

    # Add the embedded documents to the vector database

    chunk_size = 50

    for j in range(0, len(embedded_docs), chunk_size):
        try:
            print(f"upserting chunk {j} : {j+chunk_size}")
            vector_db.upsert(docs=embedded_docs[j:j+chunk_size])
        except Exception as err:
            print(f"failed to upsert chunk {j} : {j+chunk_size}, reason: {str(err)}")

I hope this helps! Let me know if you have any more questions.
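
P.S. Once the index is populated, querying it looks roughly like the sketch below. The `vector_db.search` call and its parameters are assumed here, so double-check the llmflows docs for the exact signature and return value:

# pylint: skip-file
# Rough sketch of querying the populated llm-99 index. The vector_db.search call
# and its parameters are assumed; check the llmflows docs for the exact
# signature and return value.
import os

from llmflows.llms import OpenAIEmbeddings
from llmflows.vectorstores import VectorDoc, Pinecone

openai_api_key = os.environ.get("OPENAI_API_KEY", "<your-api-key>")
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "<your-api-key>")

embeddings_llm = OpenAIEmbeddings(api_key=openai_api_key)

vector_db = Pinecone(
    index_name="llm-99",
    api_key=pinecone_api_key,
    environment="us-west4-gcp-free",
)

question = "Why do superconductors expel magnetic fields?"

# embed the question the same way the crawled docs were embedded
embedded_question = embeddings_llm.generate([VectorDoc(doc=question)])[0]

# retrieve the most similar Wikipedia chunks (call assumed, see note above)
results = vector_db.search(query=embedded_question, top_k=3)
print(results)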

stoyan-stoyanov · Apr 09 '24 00:04