Update needed to Obtain_dataset Python script for incorporating throttling
This is what I used:

```python
import os
from dotenv import load_dotenv

print("Loading environment")
load_dotenv()

import pandas as pd

input_datapath = 'data/rpi-data-feed_1.csv'  # to save space, we provide a pre-filtered dataset
print("Reading csv = ", input_datapath)
df = pd.read_csv(input_datapath, index_col='ID', header=0)
print("Input rows: ", len(df))

print("Cleaning up and aggregating")
df = df.dropna()
df['combined'] = "Title: " + df.Title.str.strip() + "; Metadata: (" + df.Metadata.str.strip() + ")"
print("Input rows after cleaning: ", len(df))
print(df)

print("Sorting rows")
df = df.sort_values('ID').tail(1_100)

from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Remove reviews that are too long
print("Counting tokens")
df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))
print("Removing capped rows")
df = df[df.n_tokens < 8192]
print("Final Input rows: ", len(df))
input("Press Enter to continue...")

import openai
from openai.embeddings_utils import get_embedding

# Ensure you have your API key set in your environment per the README:
# https://github.com/openai/openai-python#usage

import time
import backoff  # for exponential backoff

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def get_embeddings_with_backoff(*args, **kwargs):
    time.sleep(1)  # 60000
    print("Processing: ", *args)
    return get_embedding(*args, **kwargs)

print("Calculating Embeddings")
df['ada_search'] = df.combined.apply(lambda x: get_embeddings_with_backoff(x, engine='text-embedding-ada-002'))

output_datapath = 'data/products_with_embeddings.csv'
print("writing output file: ", output_datapath)
df.to_csv(output_datapath)
```
Great suggestion. Is the issue that you were hitting rate limit errors with the Obtain_dataset notebook? The get_embedding function should have retries with exponential backoff built in.
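For reference, the built-in retry behaviour referred to here looks roughly like the following, a sketch of the cookbook's `embeddings_utils.get_embedding` helper based on its tenacity decorator (the exact wait/stop parameters may differ from the released version):

```python
from tenacity import retry, stop_after_attempt, wait_random_exponential
import openai

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, engine: str = "text-embedding-ada-002") -> list:
    # Newlines can degrade embedding quality, so collapse them before the call.
    text = text.replace("\n", " ")
    # On openai.error.RateLimitError (or other transient errors), tenacity
    # retries with randomized exponential backoff up to six attempts.
    return openai.Embedding.create(input=[text], engine=engine)["data"][0]["embedding"]
```

So an explicit `time.sleep` plus `backoff` wrapper shouldn't be strictly necessary, though it does make the throttling more predictable for large datasets.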