notebook work

traefikturkey · Apr 8, 2024 · 4aa7ee0 · 4aa7ee0
1 parent 1c0c999
commit 4aa7ee0
Show file tree

Hide file tree

Showing 5 changed files with 827 additions and 59 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -39,6 +39,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     curl \
     gosu \
     less \
+    libopenblas-dev \
     locales \
     make \
     sudo \
@@ -98,6 +99,7 @@ FROM base as build
 RUN apt-get update && apt-get install -y --no-install-recommends \
     binutils \
     build-essential \
+    pkg-config gfortran \
     cmake \
     coreutils \
     extra-cmake-modules \

diff --git a/notebooks/generate_and_store_embeddings.ipynb b/notebooks/generate_and_store_embeddings.ipynb
@@ -0,0 +1,138 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from IPython.display import display\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "import json\n",
+    "from ast import literal_eval\n",
+    "\n",
+    "import requests\n",
+    "\n",
+    "from scipy import sparse\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "\n",
+    "import numpy as np\n",
+    "import sklearn.metrics.pairwise as pw\n",
+    "\n",
+    "from sklearn.cluster import KMeans\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n",
+    "import scipy\n",
+    "\n",
+    "import math\n",
+    "import random\n",
+    "import sklearn\n",
+    "from nltk.corpus import stopwords\n",
+    "from scipy.sparse import csr_matrix\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "from scipy.sparse.linalg import svds\n",
+    "from sklearn.preprocessing import MinMaxScaler\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Embed every video in the watch history (title plus tags) with Ollama and\n",
+    "# write the enriched records to disk for the analysis cells that follow.\n",
+    "import os\n",
+    "from langchain_community.llms import Ollama\n",
+    "from langchain_community.embeddings import OllamaEmbeddings\n",
+    "\n",
+    "# os.getenv returns None when OLLAMA_URL is unset; fall back to the local\n",
+    "# Ollama endpoint instead of handing None to the embeddings client.\n",
+    "ollama_url = os.getenv('OLLAMA_URL') or 'http://localhost:11434'\n",
+    "model = OllamaEmbeddings(base_url=ollama_url, model=\"nomic-embed-text\")\n",
+    "\n",
+    "with open('./data/youtube_history.json', 'r', encoding='utf-8') as f:\n",
+    "    videos = json.load(f)\n",
+    "\n",
+    "for video in videos:\n",
+    "    # 'soup' is the text that gets embedded: the title, then the tags.\n",
+    "    text = video['title']\n",
+    "    if 'tags' in video:\n",
+    "        # `or []` tolerates an explicit null tag list; plain concatenation\n",
+    "        # avoids nested f-string quotes, which need Python 3.12+.\n",
+    "        text += ' ' + ' '.join(video['tags'] or [])\n",
+    "\n",
+    "    video['soup'] = text\n",
+    "    video['embeddings'] = model.embed_query(text)\n",
+    "\n",
+    "with open('./data/youtube_history_embeddings.json', 'w', encoding='utf-8') as f:\n",
+    "    json.dump(videos, f, ensure_ascii=False, indent=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 2810 entries, 0 to 2809\n",
+      "Data columns (total 11 columns):\n",
+      " #   Column           Non-Null Count  Dtype \n",
+      "---  ------           --------------  ----- \n",
+      " 0   url              2810 non-null   object\n",
+      " 1   title            2810 non-null   object\n",
+      " 2   visit_count      2810 non-null   int64 \n",
+      " 3   last_visit_time  2810 non-null   int64 \n",
+      " 4   publishedAt      2810 non-null   object\n",
+      " 5   description      2810 non-null   object\n",
+      " 6   channelTitle     2810 non-null   object\n",
+      " 7   channelId        2810 non-null   object\n",
+      " 8   tags             2162 non-null   object\n",
+      " 9   soup             2810 non-null   object\n",
+      " 10  embeddings       2810 non-null   object\n",
+      "dtypes: int64(2), object(9)\n",
+      "memory usage: 241.6+ KB\n",
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Reload the saved records as a DataFrame and summarise its columns.\n",
+    "videos = pd.read_json('./data/youtube_history_embeddings.json')\n",
+    "\n",
+    "# DataFrame.info() prints its own report and returns None, so call it\n",
+    "# directly; wrapping it in print() emitted a stray \"None\" line.\n",
+    "videos.info()\n",
+    "\n",
+    "# TODO: build a title -> row-index lookup, e.g.\n",
+    "#   pd.Series(videos.index, index=videos['title']).drop_duplicates()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}