Skip to content

Commit

Permalink
notebook work
Browse files Browse the repository at this point in the history
  • Loading branch information
ilude committed Apr 8, 2024
1 parent 1c0c999 commit 4aa7ee0
Show file tree
Hide file tree
Showing 5 changed files with 827 additions and 59 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
gosu \
less \
libopenblas-dev \
locales \
make \
sudo \
Expand Down Expand Up @@ -98,6 +99,7 @@ FROM base as build
RUN apt-get update && apt-get install -y --no-install-recommends \
binutils \
build-essential \
pkg-config gfortran \
cmake \
coreutils \
extra-cmake-modules \
Expand Down
138 changes: 138 additions & 0 deletions notebooks/generate_and_store_embeddings.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from IPython.display import display\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"import json\n",
"from ast import literal_eval\n",
"\n",
"import requests\n",
"\n",
"from scipy import sparse\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"import numpy as np\n",
"import sklearn.metrics.pairwise as pw\n",
"\n",
"from sklearn.cluster import KMeans\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n",
"import scipy\n",
"\n",
"import math\n",
"import random\n",
"import sklearn\n",
"from nltk.corpus import stopwords\n",
"from scipy.sparse import csr_matrix\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"from scipy.sparse.linalg import svds\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from langchain_community.embeddings import OllamaEmbeddings\n",
"from langchain_community.llms import Ollama  # unused in this cell; kept from the original imports\n",
"\n",
"# Input: watch-history records. Output: the same records, each extended with\n",
"# a 'soup' text field and its 'embeddings' vector.\n",
"HISTORY_PATH = './data/youtube_history.json'\n",
"EMBEDDINGS_PATH = './data/youtube_history_embeddings.json'\n",
"\n",
"# Fall back to Ollama's default local endpoint when OLLAMA_URL is unset, so that\n",
"# base_url is never passed as None.\n",
"ollama_url = os.getenv('OLLAMA_URL', 'http://localhost:11434')\n",
"model = OllamaEmbeddings(base_url=ollama_url, model=\"nomic-embed-text\")\n",
"\n",
"with open(HISTORY_PATH, 'r', encoding='utf-8') as f:\n",
"    videos = json.load(f)\n",
"\n",
"for video in videos:\n",
"    # 'soup' is the text that gets embedded: the title followed by the\n",
"    # space-joined tags. 'tags' may be absent or null, in which case only the\n",
"    # title is used.\n",
"    text = video['title']\n",
"    tags = video.get('tags')\n",
"    if tags is not None:\n",
"        # Plain concatenation instead of an f-string that reuses its own quote\n",
"        # character, which only parses on Python 3.12+.\n",
"        text += ' ' + ' '.join(tags)\n",
"\n",
"    video['soup'] = text\n",
"    video['embeddings'] = model.embed_query(text)  # one embed_query call per video\n",
"\n",
"with open(EMBEDDINGS_PATH, 'w', encoding='utf-8') as f:\n",
"    json.dump(videos, f, ensure_ascii=False, indent=2)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 2810 entries, 0 to 2809\n",
"Data columns (total 11 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 url 2810 non-null object\n",
" 1 title 2810 non-null object\n",
" 2 visit_count 2810 non-null int64 \n",
" 3 last_visit_time 2810 non-null int64 \n",
" 4 publishedAt 2810 non-null object\n",
" 5 description 2810 non-null object\n",
" 6 channelTitle 2810 non-null object\n",
" 7 channelId 2810 non-null object\n",
" 8 tags 2162 non-null object\n",
" 9 soup 2810 non-null object\n",
" 10 embeddings 2810 non-null object\n",
"dtypes: int64(2), object(9)\n",
"memory usage: 241.6+ KB\n"
]
}
],
"source": [
"# Reload the stored records as a DataFrame and summarise its columns.\n",
"# Note: this rebinds `videos` from the list of dicts to a DataFrame.\n",
"videos = pd.read_json('./data/youtube_history_embeddings.json')\n",
"\n",
"# DataFrame.info() writes its summary to stdout and returns None, so it is\n",
"# called directly; wrapping it in print() appended a stray \"None\" line.\n",
"videos.info()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 4aa7ee0

Please sign in to comment.