-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
827 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 17, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"from IPython.display import display\n", | ||
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | ||
"from sklearn.metrics.pairwise import cosine_similarity\n", | ||
"\n", | ||
"import json\n", | ||
"from ast import literal_eval\n", | ||
"\n", | ||
"import requests\n", | ||
"\n", | ||
"from scipy import sparse\n", | ||
"from sklearn.feature_extraction.text import CountVectorizer\n", | ||
"\n", | ||
"import numpy as np\n", | ||
"import sklearn.metrics.pairwise as pw\n", | ||
"\n", | ||
"from sklearn.cluster import KMeans\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n", | ||
"import scipy\n", | ||
"\n", | ||
"import math\n", | ||
"import random\n", | ||
"import sklearn\n", | ||
"from nltk.corpus import stopwords\n", | ||
"from scipy.sparse import csr_matrix\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"\n", | ||
"from scipy.sparse.linalg import svds\n", | ||
"from sklearn.preprocessing import MinMaxScaler\n", | ||
"import matplotlib.pyplot as plt" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 26, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Embed every watched video (title + tags) with an Ollama embedding model and\n", | ||
"# write the enriched records to a second JSON file.\n", | ||
"# `json` is provided by the imports cell at the top of the notebook.\n", | ||
"import os\n", | ||
"from langchain_community.llms import Ollama\n", | ||
"# Import from langchain_community: the `langchain.embeddings` path is deprecated.\n", | ||
"from langchain_community.embeddings import OllamaEmbeddings\n", | ||
"\n", | ||
"# Fall back to Ollama's default local endpoint when OLLAMA_URL is unset or empty,\n", | ||
"# instead of handing base_url=None to the client.\n", | ||
"ollama_url = os.getenv('OLLAMA_URL') or 'http://localhost:11434'\n", | ||
"model = OllamaEmbeddings(base_url=ollama_url, model=\"nomic-embed-text\")\n", | ||
"\n", | ||
"with open('./data/youtube_history.json', 'r', encoding='utf-8') as f:\n", | ||
"    videos = json.load(f)\n", | ||
"\n", | ||
"for video in videos:\n", | ||
"    # .get() plus `or []` covers both a missing 'tags' key and an explicit null.\n", | ||
"    tags = video.get('tags') or []\n", | ||
"\n", | ||
"    # Join outside of an f-string: reusing the same quote character inside an\n", | ||
"    # f-string expression only parses on Python 3.12+.\n", | ||
"    text = video['title']\n", | ||
"    if tags:\n", | ||
"        text += ' ' + ' '.join(tags)\n", | ||
"\n", | ||
"    video['soup'] = text\n", | ||
"    video['embeddings'] = model.embed_query(text)\n", | ||
"\n", | ||
"with open('./data/youtube_history_embeddings.json', 'w', encoding='utf-8') as f:\n", | ||
"    json.dump(videos, f, ensure_ascii=False, indent=2)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 28, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"<class 'pandas.core.frame.DataFrame'>\n", | ||
"RangeIndex: 2810 entries, 0 to 2809\n", | ||
"Data columns (total 11 columns):\n", | ||
" # Column Non-Null Count Dtype \n", | ||
"--- ------ -------------- ----- \n", | ||
" 0 url 2810 non-null object\n", | ||
" 1 title 2810 non-null object\n", | ||
" 2 visit_count 2810 non-null int64 \n", | ||
" 3 last_visit_time 2810 non-null int64 \n", | ||
" 4 publishedAt 2810 non-null object\n", | ||
" 5 description 2810 non-null object\n", | ||
" 6 channelTitle 2810 non-null object\n", | ||
" 7 channelId 2810 non-null object\n", | ||
" 8 tags 2162 non-null object\n", | ||
" 9 soup 2810 non-null object\n", | ||
" 10 embeddings 2810 non-null object\n", | ||
"dtypes: int64(2), object(9)\n", | ||
"memory usage: 241.6+ KB\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Load the embedded watch history written by the previous cell into a DataFrame.\n", | ||
"videos = pd.read_json('./data/youtube_history_embeddings.json')\n", | ||
"\n", | ||
"# DataFrame.info() writes its summary to stdout itself and returns None, so it is\n", | ||
"# called directly; wrapping it in print() appended a stray 'None' line.\n", | ||
"videos.info()\n", | ||
"\n", | ||
"# Disabled exploratory step: title -> row-index lookup.\n", | ||
"# indices = pd.Series(videos.index, index=videos['title']).drop_duplicates()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.