openai/openai-python
Publicmirrored fromhttps://github.com/openai/openai-pythonAvailable
examples/embeddings/Obtain_dataset.ipynb
192lines · modecode
| 1 | { |
| 2 | "cells": [ |
| 3 | { |
| 4 | "cell_type": "markdown", |
| 5 | "metadata": {}, |
| 6 | "source": [ |
| 7 | "## 1. Load the dataset\n", |
| 8 | "\n", |
| 9 | "The dataset used in this example is [fine-food reviews](https://www.kaggle.com/snap/amazon-fine-food-reviews) from Amazon. The dataset contains a total of 568,454 food reviews Amazon users left up to October 2012. We will use a subset of this dataset, consisting of the 1,000 most recent reviews, for illustration purposes. The reviews are in English and tend to be positive or negative. Each review has a ProductId, UserId, Score, review title (Summary) and review body (Text).\n", |
| 10 | "\n", |
| 11 | "We will combine the review summary and review text into a single combined text. The model will encode this combined text and it will output a single vector embedding." |
| 12 | ] |
| 13 | }, |
| 14 | { |
| 15 | "cell_type": "code", |
| 16 | "execution_count": 1, |
| 17 | "metadata": {}, |
| 18 | "outputs": [ |
| 19 | { |
| 20 | "data": { |
| 21 | "text/html": [ |
| 22 | "<div>\n", |
| 23 | "<style scoped>\n", |
| 24 | " .dataframe tbody tr th:only-of-type {\n", |
| 25 | " vertical-align: middle;\n", |
| 26 | " }\n", |
| 27 | "\n", |
| 28 | " .dataframe tbody tr th {\n", |
| 29 | " vertical-align: top;\n", |
| 30 | " }\n", |
| 31 | "\n", |
| 32 | " .dataframe thead th {\n", |
| 33 | " text-align: right;\n", |
| 34 | " }\n", |
| 35 | "</style>\n", |
| 36 | "<table border=\"1\" class=\"dataframe\">\n", |
| 37 | " <thead>\n", |
| 38 | " <tr style=\"text-align: right;\">\n", |
| 39 | " <th></th>\n", |
| 40 | " <th>Time</th>\n", |
| 41 | " <th>ProductId</th>\n", |
| 42 | " <th>UserId</th>\n", |
| 43 | " <th>Score</th>\n", |
| 44 | " <th>Summary</th>\n", |
| 45 | " <th>Text</th>\n", |
| 46 | " <th>combined</th>\n", |
| 47 | " </tr>\n", |
| 48 | " <tr>\n", |
| 49 | " <th>Id</th>\n", |
| 50 | " <th></th>\n", |
| 51 | " <th></th>\n", |
| 52 | " <th></th>\n", |
| 53 | " <th></th>\n", |
| 54 | " <th></th>\n", |
| 55 | " <th></th>\n", |
| 56 | " <th></th>\n", |
| 57 | " </tr>\n", |
| 58 | " </thead>\n", |
| 59 | " <tbody>\n", |
| 60 | " <tr>\n", |
| 61 | " <th>1</th>\n", |
| 62 | " <td>1303862400</td>\n", |
| 63 | " <td>B001E4KFG0</td>\n", |
| 64 | " <td>A3SGXH7AUHU8GW</td>\n", |
| 65 | " <td>5</td>\n", |
| 66 | " <td>Good Quality Dog Food</td>\n", |
| 67 | " <td>I have bought several of the Vitality canned d...</td>\n", |
| 68 | " <td>Title: Good Quality Dog Food; Content: I have ...</td>\n", |
| 69 | " </tr>\n", |
| 70 | " <tr>\n", |
| 71 | " <th>2</th>\n", |
| 72 | " <td>1346976000</td>\n", |
| 73 | " <td>B00813GRG4</td>\n", |
| 74 | " <td>A1D87F6ZCVE5NK</td>\n", |
| 75 | " <td>1</td>\n", |
| 76 | " <td>Not as Advertised</td>\n", |
| 77 | " <td>Product arrived labeled as Jumbo Salted Peanut...</td>\n", |
| 78 | " <td>Title: Not as Advertised; Content: Product arr...</td>\n", |
| 79 | " </tr>\n", |
| 80 | " </tbody>\n", |
| 81 | "</table>\n", |
| 82 | "</div>" |
| 83 | ], |
| 84 | "text/plain": [ |
| 85 | " Time ProductId UserId Score Summary \\\n", |
| 86 | "Id \n", |
| 87 | "1 1303862400 B001E4KFG0 A3SGXH7AUHU8GW 5 Good Quality Dog Food \n", |
| 88 | "2 1346976000 B00813GRG4 A1D87F6ZCVE5NK 1 Not as Advertised \n", |
| 89 | "\n", |
| 90 | " Text \\\n", |
| 91 | "Id \n", |
| 92 | "1 I have bought several of the Vitality canned d... \n", |
| 93 | "2 Product arrived labeled as Jumbo Salted Peanut... \n", |
| 94 | "\n", |
| 95 | " combined \n", |
| 96 | "Id \n", |
| 97 | "1 Title: Good Quality Dog Food; Content: I have ... \n", |
| 98 | "2 Title: Not as Advertised; Content: Product arr... " |
| 99 | ] |
| 100 | }, |
| 101 | "execution_count": 1, |
| 102 | "metadata": {}, |
| 103 | "output_type": "execute_result" |
| 104 | } |
| 105 | ], |
| 106 | "source": [ |
| 107 | "import pandas as pd\n", |
| 108 | "\n", |
| 109 | "df = pd.read_csv('input/Reviews.csv', index_col=0)  # first CSV column becomes the index\n", |
| 110 | "df = df[['Time', 'ProductId', 'UserId', 'Score', 'Summary', 'Text']]  # discard all other columns\n", |
| 111 | "df = df.dropna()  # drop rows with a missing value in any kept column\n", |
| 112 | "df['combined'] = \"Title: \" + df.Summary.str.strip() + \"; Content: \" + df.Text.str.strip()  # single text field that gets embedded\n", |
| 113 | "df.head(2)" |
| 114 | ] |
| 115 | }, |
| 116 | { |
| 117 | "cell_type": "code", |
| 118 | "execution_count": 2, |
| 119 | "metadata": {}, |
| 120 | "outputs": [ |
| 121 | { |
| 122 | "data": { |
| 123 | "text/plain": [ |
| 124 | "1000" |
| 125 | ] |
| 126 | }, |
| 127 | "execution_count": 2, |
| 128 | "metadata": {}, |
| 129 | "output_type": "execute_result" |
| 130 | } |
| 131 | ], |
| 132 | "source": [ |
| 133 | "# take the 1,100 most recent reviews; the extra 100 are a buffer so up to 100 over-long reviews can be dropped below\n", |
| 134 | "df = df.sort_values('Time').tail(1_100)\n", |
| 135 | "df = df.drop(columns='Time')\n", |
| 136 | "\n", |
| 137 | "from transformers import GPT2TokenizerFast\n", |
| 138 | "tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n", |
| 139 | "\n", |
| 140 | "# drop reviews of 2,000 or more GPT-2 tokens, then keep the 1,000 most recent of the rest\n", |
| 141 | "df['n_tokens'] = df.combined.apply(lambda x: len(tokenizer.encode(x)))\n", |
| 142 | "df = df[df.n_tokens<2000].tail(1_000)\n", |
| 143 | "len(df)" |
| 144 | ] |
| 145 | }, |
| 146 | { |
| 147 | "cell_type": "markdown", |
| 148 | "metadata": {}, |
| 149 | "source": [ |
| 150 | "## 2. Get embeddings and save them for future reuse" |
| 151 | ] |
| 152 | }, |
| 153 | { |
| 154 | "cell_type": "code", |
| 155 | "execution_count": 3, |
| 156 | "metadata": {}, |
| 157 | "outputs": [], |
| 158 | "source": [ |
| 159 | "from openai.embeddings_utils import get_embedding\n", |
| 160 | "\n", |
| 161 | "# This will take just under 10 minutes: get_embedding is called once per review for each of the two engines\n", |
| 162 | "df['babbage_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-similarity-babbage-001'))\n", |
| 163 | "df['babbage_search'] = df.combined.apply(lambda x: get_embedding(x, engine='text-search-babbage-doc-001'))\n", |
| 164 | "df.to_csv('output/embedded_1k_reviews.csv')  # NOTE: assumes the output/ directory already exists\n" |
| 165 | ] |
| 166 | } |
| 167 | ], |
| 168 | "metadata": { |
| 169 | "interpreter": { |
| 170 | "hash": "be4b5d5b73a21c599de40d6deb1129796d12dc1cc33a738f7bac13269cfcafe8" |
| 171 | }, |
| 172 | "kernelspec": { |
| 173 | "display_name": "Python 3.7.3 64-bit ('base': conda)", |
| 174 | "name": "python3" |
| 175 | }, |
| 176 | "language_info": { |
| 177 | "codemirror_mode": { |
| 178 | "name": "ipython", |
| 179 | "version": 3 |
| 180 | }, |
| 181 | "file_extension": ".py", |
| 182 | "mimetype": "text/x-python", |
| 183 | "name": "python", |
| 184 | "nbconvert_exporter": "python", |
| 185 | "pygments_lexer": "ipython3", |
| 186 | "version": "3.9.9" |
| 187 | }, |
| 188 | "orig_nbformat": 4 |
| 189 | }, |
| 190 | "nbformat": 4, |
| 191 | "nbformat_minor": 2 |
| 192 | } |
| 193 | |