openai-cookbook
openai-cookbook copied to clipboard
Fine-tuning QA example uses legacy code and needs an update
Instead of using the deprecated search endpoint, the notebook should use embeddings-based search.
This is the diff needed to make this notebook work with the current version of the APIs.
diff --git a/examples/fine-tuned_qa/olympics-3-train-qa.ipynb b/examples/fine-tuned_qa/olympics-3-train-qa.ipynb
index c046e3c..15654d9 100644
--- a/examples/fine-tuned_qa/olympics-3-train-qa.ipynb
+++ b/examples/fine-tuned_qa/olympics-3-train-qa.ipynb
@@ -1,12 +1,5 @@
{
"cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "<span style=\"color:orange; font-weight:bold\">Note: To answer questions based on text documents, we recommend the procedure in <a href=\"https://github.com/openai/openai-cookbook/blob/main/examples/Question_answering_using_embeddings.ipynb\">Question Answering using Embeddings</a>. Some of the code below may rely on <a href=\"https://github.com/openai/openai-cookbook/tree/main/transition_guides_for_deprecated_API_endpoints\">deprecated API endpoints</a>.</span>"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -157,6 +150,19 @@
"df.head()"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_embedding(text, model=\"text-embedding-ada-002\"):\n",
+ " text = text.replace(\"\\n\", \" \")\n",
+ " return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']\n",
+ "\n",
+ "df['questions_embedding'] = df.questions.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -238,28 +244,22 @@
"outputs": [],
"source": [
"import random\n",
+ "from openai.embeddings_utils import cosine_similarity\n",
"\n",
- "def get_random_similar_contexts(question, context, file_id=olympics_search_fileid, search_model='ada', max_rerank=10):\n",
- " \"\"\"\n",
- " Find similar contexts to the given context using the search file\n",
+ "def get_random_similar_contexts(df, question, context, max_rerank=10, search_model='text-embedding-ada-002'):\n",
" \"\"\"\n",
- " try:\n",
- " results = openai.Engine(search_model).search(\n",
- " search_model=search_model, \n",
- " query=question, \n",
- " max_rerank=max_rerank,\n",
- " file=file_id\n",
- " )\n",
- " candidates = []\n",
- " for result in results['data'][:3]:\n",
- " if result['text'] == context:\n",
- " continue\n",
- " candidates.append(result['text'])\n",
- " random_candidate = random.choice(candidates)\n",
- " return random_candidate\n",
- " except Exception as e:\n",
- " print(e)\n",
- " return \"\"\n",
+ " Find similar contexts to the given context using question embeddings search\n",
+ " \"\"\" \n",
+ " embedding = get_embedding(question, search_model)\n",
+ " df['tmp_question_similarities'] = df.questions_embedding.apply(lambda x: cosine_similarity(x, embedding))\n",
+ " res = df.sort_values('tmp_question_similarities', ascending=False).head(max_rerank)\n",