llmflows
[FEATURE REQUEST]: How to create a Wikipedia Vector Store in your LLM-99 demo
The problem you are trying to solve:
Thank you so much for your amazing work! I have tried your LLM-99 physics demo and it's fantastic! I'm a little curious about how you gathered the physics Wikipedia articles to build the vector database. Did you crawl all the physics Wikipedia pages? Thanks again for your great work!
Suggested new feature or change:
Vector Store building
Yes, I used a scrappy script to crawl the Wikipedia pages:
# pylint: skip-file
import os

import nltk
from mediawiki import MediaWiki
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

from llmflows.llms import OpenAIEmbeddings
from llmflows.vectorstores import VectorDoc, Pinecone
from wikipedia_articles import wikipedia_pages

nltk.download("punkt")

wikipedia = MediaWiki()
print(len(wikipedia_pages))

# Set up the clients once, before the crawl loop
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "<YOUR-API-KEY>")
openai_api_key = os.environ.get("OPENAI_API_KEY", "<YOUR-API-KEY>")

# Create embeddings LLM
embeddings_llm = OpenAIEmbeddings(api_key=openai_api_key, max_retries=100)

# Initialize Pinecone
vector_db = Pinecone(
    index_name="llm-99",
    api_key=pinecone_api_key,
    environment="us-west4-gcp-free",
)

wiki_chunk_size = 200

# TODO: failed batches: 3600-3800, 4400-4600, 4800-5000, 5600-5800, 6200-6400,
# 7400-7600, 8200-8400, 8400-8600, 8600-8800, 8800-9000, 9200-9400, 10200-10400,
# 10600-10800, 11000-11200, 12200-12400, 12400-12600, 12600-12800, 12800-13000,
# 13000-13200, 13200-13400, 13800-14000

for i in tqdm(range(0, len(wikipedia_pages), wiki_chunk_size)):
    print(f"working on wikipedia chunk {i} : {i + wiki_chunk_size}")
    chunk_pages = wikipedia_pages[i:i + wiki_chunk_size]
    superconductor_docs = []
    fails = []

    for page in tqdm(chunk_pages):
        try:
            page_obj = wikipedia.page(page)
            page_sentences = sent_tokenize(page_obj.content)
            metadata = {
                "page_title": page,
                "link": f"https://en.wikipedia.org/wiki/{page}",
            }
            # Join every 8 sentences into one doc, stepping by 7 so that
            # consecutive docs overlap by 1 sentence
            vector_docs = [
                VectorDoc(doc=" ".join(page_sentences[j:j + 8]), metadata=metadata)
                for j in range(0, len(page_sentences), 7)
            ]
            # Keep only docs short enough to embed (popping from the list
            # while iterating over it would skip elements)
            for vd in vector_docs:
                if len(vd.doc) > 30000:
                    print(f"doc too long: {len(vd.doc)}")
                    print(vd.metadata["link"])
                else:
                    superconductor_docs.append(vd)
        except Exception:
            print(f"failed to get {page}")
            fails.append(page)

    # Get embeddings for this chunk's VectorDocs
    embedded_docs = embeddings_llm.generate(superconductor_docs)

    # Add the embedded documents to the vector database in batches of 50
    upsert_chunk_size = 50
    for j in range(0, len(embedded_docs), upsert_chunk_size):
        try:
            print(f"upserting chunk {j} : {j + upsert_chunk_size}")
            vector_db.upsert(docs=embedded_docs[j:j + upsert_chunk_size])
        except Exception as err:
            print(f"failed to upsert chunk {j} : {j + upsert_chunk_size}, reason: {err}")
I hope this helps! Let me know if you have any more questions.