diff --git a/Dockerfile b/Dockerfile index e682d89..a263a9a 100755 --- a/Dockerfile +++ b/Dockerfile @@ -39,6 +39,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ gosu \ less \ + libopenblas-dev \ locales \ make \ sudo \ @@ -98,6 +99,8 @@ FROM base as build RUN apt-get update && apt-get install -y --no-install-recommends \ binutils \ build-essential \ + gfortran \ + pkg-config \ cmake \ coreutils \ extra-cmake-modules \ diff --git a/notebooks/generate_and_store_embeddings.ipynb b/notebooks/generate_and_store_embeddings.ipynb new file mode 100644 index 0000000..7d38336 --- /dev/null +++ b/notebooks/generate_and_store_embeddings.ipynb @@ -0,0 +1,135 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from IPython.display import display\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "import json\n", + "from ast import literal_eval\n", + "\n", + "import requests\n", + "\n", + "from scipy import sparse\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "import numpy as np\n", + "import sklearn.metrics.pairwise as pw\n", + "\n", + "from sklearn.cluster import KMeans\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n", + "import scipy\n", + "\n", + "import math\n", + "import random\n", + "import sklearn\n", + "from nltk.corpus import stopwords\n", + "from scipy.sparse import csr_matrix\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from scipy.sparse.linalg import svds" ] }, { "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from langchain_community.llms import Ollama\n", + "from langchain.embeddings import OllamaEmbeddings\n", + "\n", + "ollama_url = os.getenv('OLLAMA_URL')\n", + "model = OllamaEmbeddings(base_url=ollama_url, model=\"nomic-embed-text\")\n", + "\n", + "with open('./data/youtube_history.json', 'r', encoding='utf-8') as f:\n", + "    videos = json.load(f)\n", + "\n", + "for video in videos:\n", + "    text = video['title']\n", + "    if 'tags' in video:\n", + "        text += \" \" + \" \".join(video['tags'])\n", + "\n", + "    video['soup'] = text\n", + "    video['embeddings'] = model.embed_query(text)\n", + "\n", + "with open('./data/youtube_history_embeddings.json', 'w', encoding='utf-8') as f:\n", + "    json.dump(videos, f, ensure_ascii=False, indent=2)" ] }, { "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 2810 entries, 0 to 2809\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 url 2810 non-null object\n", + " 1 title 2810 non-null object\n", + " 2 visit_count 2810 non-null int64 \n", + " 3 last_visit_time 2810 non-null int64 \n", + " 4 publishedAt 2810 non-null object\n", + " 5 description 2810 non-null object\n", + " 6 channelTitle 2810 non-null object\n", + " 7 channelId 2810 non-null object\n", + " 8 tags 2162 non-null object\n", + " 9 soup 2810 non-null object\n", + " 10 embeddings 2810 non-null object\n", + "dtypes: int64(2), object(9)\n", + "memory usage: 241.6+ KB\n", +
"None\n" + ] + } + ], + "source": [ + "videos = pd.read_json('./data/youtube_history_embeddings.json') \n", + "#print(videos)\n", + "print(videos.info()) \n", + "\n", + "\n", + "#print(pd.Series(videos.index, index=videos['title']).drop_duplicates())\n", + "\n", + "# metadata = videos.reset_index()\n", + "# #print(metadata)\n", + "# indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()\n", + "# print(indices)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/history.ipynb b/notebooks/history.ipynb index b5f5021..1f2cd84 100644 --- a/notebooks/history.ipynb +++ b/notebooks/history.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -10,26 +10,46 @@ "output_type": "stream", "text": [ "Defaulting to user installation because normal site-packages is not writeable\n", - "Collecting pysqlite3\n", - " Using cached pysqlite3-0.5.2.tar.gz (40 kB)\n", - " Preparing metadata (setup.py) ... \u001b[?25ldone\n", - "\u001b[?25hBuilding wheels for collected packages: pysqlite3\n", - " Building wheel for pysqlite3 (setup.py) ... \u001b[?25ldone\n", - "\u001b[?25h Created wheel for pysqlite3: filename=pysqlite3-0.5.2-cp312-cp312-linux_x86_64.whl size=168069 sha256=e64bd34d91b91617e4aa91ef31ff133c3c5f873b8d89537cc7b31886fc5bd08b\n", - " Stored in directory: /home/anvil/.cache/pip/wheels/8e/74/94/b439e432c5dac1fa80bd4c6fc443aaaa7370606785abbf1590\n", - "Successfully built pysqlite3\n", - "Installing collected packages: pysqlite3\n", - "Successfully installed pysqlite3-0.5.2\n" + "Requirement already satisfied: pysqlite3 in /home/anvil/.local/lib/python3.12/site-packages (0.5.2)\n", + "Requirement already satisfied: scikit-learn in /home/anvil/.local/lib/python3.12/site-packages (1.4.1.post1)\n", + "Requirement already satisfied: torch in /home/anvil/.local/lib/python3.12/site-packages (2.2.2)\n", + "Requirement already satisfied: numpy in /dependencies (1.26.4)\n", + "Requirement already satisfied: pydantic in /dependencies (2.6.4)\n", + "Requirement already satisfied: scipy>=1.6.0 in /home/anvil/.local/lib/python3.12/site-packages (from scikit-learn) (1.13.0)\n", + "Requirement already satisfied: joblib>=1.2.0 in /home/anvil/.local/lib/python3.12/site-packages (from scikit-learn) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/anvil/.local/lib/python3.12/site-packages (from scikit-learn) (3.4.0)\n", + "Requirement already satisfied: filelock in /home/anvil/.local/lib/python3.12/site-packages (from torch) (3.13.3)\n", + "Requirement already satisfied: typing-extensions>=4.8.0 in /dependencies (from torch) (4.11.0)\n", + "Requirement already satisfied: sympy in /home/anvil/.local/lib/python3.12/site-packages (from torch) (1.12)\n", + "Requirement already satisfied: networkx in /home/anvil/.local/lib/python3.12/site-packages (from torch) (3.3)\n", + "Requirement already satisfied: jinja2 in /dependencies (from torch) (3.1.3)\n", + "Requirement already satisfied: fsspec in /home/anvil/.local/lib/python3.12/site-packages (from torch) (2024.3.1)\n", + "Requirement already satisfied: 
nvidia-cuda-nvrtc-cu12==12.1.105 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (12.1.105)\n", + "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (12.1.105)\n", + "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (12.1.105)\n", + "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (8.9.2.26)\n", + "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (12.1.3.1)\n", + "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (11.0.2.54)\n", + "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (10.3.2.106)\n", + "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (11.4.5.107)\n", + "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (12.1.0.106)\n", + "Requirement already satisfied: nvidia-nccl-cu12==2.19.3 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (2.19.3)\n", + "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /home/anvil/.local/lib/python3.12/site-packages (from torch) (12.1.105)\n", + "Requirement already satisfied: nvidia-nvjitlink-cu12 in /home/anvil/.local/lib/python3.12/site-packages (from nvidia-cusolver-cu12==11.4.5.107->torch) (12.4.127)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /dependencies (from pydantic) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.16.3 in /dependencies (from pydantic) (2.16.3)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /dependencies (from jinja2->torch) (2.1.5)\n", + "Requirement already satisfied: mpmath>=0.19 in /home/anvil/.local/lib/python3.12/site-packages (from sympy->torch) (1.3.0)\n" ] } ], "source": [ - "! pip install pysqlite3 \n" + "! 
pip install pysqlite3 scikit-learn torch numpy pydantic" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -56,7 +76,20 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import dotenv\n", + "import os\n", + "\n", + "dotenv.load_dotenv()\n", + "google_api_key = os.getenv('GOOGLE_API_KEY')" ] }, { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -68,31 +101,33 @@ "with open('./data/brave_history.json', 'r', encoding='utf-8') as f:\n", "    history = json.load(f)\n", "\n", - "# Function to fetch video title and transcript URL\n", - "def get_video_info(video_url):\n", - "    video_id = parse_qs(urlparse(video_url).query)['v'][0]\n", - "    api_url = f\"https://www.youtube.com/get_video_info?video_id={video_id}\"\n", - "    response = requests.get(api_url)\n", - "    data = parse_qs(response.text)\n", - "\n", - "    title = data.get('title', ['No video title'])[0]\n", - "    transcript_url = f\"https://www.youtube.com/api/timedtext?v={video_id}&lang=en\"\n", - "\n", - "    return title, transcript_url\n", + "video_title_url = \"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={google_api_key}\"\n", "\n", "# Iterate through the history and process YouTube URLs\n", "youtube_history = []\n", - "for entry in history:\n", - "    url = entry['url']\n", + "for video in history:\n", + "    url = video['url']\n", "    if 'youtube.com/watch' in url:\n", "        try:\n", - "            title, transcript_url = get_video_info(url)\n", - "            entry['title'] = title\n", - "            entry['transcript_url'] = transcript_url\n", - "            youtube_history.append(entry)\n", - "        except Exception as e:\n", - "            print(f\"Error processing URL: {url}\")\n", - "            print(e)\n", + "            video_id = parse_qs(urlparse(url).query)['v'][0]\n", + "            #print(f\"video_id: {video_id}\")\n", + "            json_result = requests.get(video_title_url.format(video_id=video_id, google_api_key=google_api_key)).json()\n", + "\n", + "            if json_result['items']:\n", + "                snippet = json_result['items'][0]['snippet']\n", + "                video['title'] = snippet['title']\n", + "\n", + "                video['publishedAt'] = snippet['publishedAt']\n", + "                video['description'] = snippet['description']\n", + "                video['channelTitle'] = snippet['channelTitle']\n", + "                video['channelId'] = snippet['channelId']\n", + "\n", + "                if 'tags' in snippet:\n", + "                    video['tags'] = snippet['tags']\n", + "                youtube_history.append(video)\n", + "        except Exception:\n", + "            #print(json.dumps(json_result, indent=2))\n", + "            continue\n", "\n", "# Save the modified entries to a new file\n", "with open('./data/youtube_history.json', 'w', encoding='utf-8') as f:\n", @@ -101,60 +136,503 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.llms import Ollama\n", + "from langchain.embeddings import OllamaEmbeddings\n", + "\n", + "ollama_url = os.getenv('OLLAMA_URL')\n", + "model = OllamaEmbeddings(base_url=ollama_url, model=\"nomic-embed-text\")\n", + "embedding = model.embed_documents(['Huginn: Free Open Source Automated Agents Platform open source software open source alternative elestio open source free software free open source software huginn platform huginn tutorial huginn platform overview'])\n", + "#print(embedding)" ] }, { +
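"cell_type": "markdown", + "metadata": {}, + "source": [ + "The cells below build one text document per watched video (title plus tags), embed the documents with the nomic-embed-text model served by Ollama, cluster the embeddings with KMeans, and later use the cluster centers to rank incoming feed articles by their cosine distance to the nearest cluster." + ] + }, + { +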
"cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "with open('./data/youtube_history.json', 'r', encoding='utf-8') as f:\n", + " videos = json.load(f)\n", + "\n", + "documents = []\n", + " \n", + "for video in videos:\n", + " text = video['title']\n", + " if 'tags' in video:\n", + " text += f\" {\" \".join(video['tags'])}\"\n", + " \n", + " documents.append(text) \n", + " #text = ' '.join(set(text.split()))\n", + " #print(text)\n", + " \n", + "embeddings = model.embed_documents(documents)\n", + "#print(embeddings) " + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", + "import numpy as np\n", + "import torch\n", + "import random\n", + "from sklearn.cluster import KMeans\n", + "from scipy.spatial.distance import cdist\n", + "import feedparser" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "X = np.array(embeddings)\n", + "kmeans = KMeans(n_clusters=10, random_state=42).fit(X)\n", + "#print(kmeans.cluster_centers_)" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass, field\n", + "from datetime import datetime\n", + "from typing import List\n", + "\n", + "@dataclass\n", + "class Article:\n", + " feed_name: str\n", + " title: str\n", + " link: str\n", + " summary: str\n", + " embedding: List[float]\n", + " pub_date: datetime\n", + " updated: datetime\n", + " distance: float = None\n", + "\n", + " def __init__(self, feed_name: str, title: str, link: str, summary: str, embedding: List[float], pub_date: datetime, updated: datetime):\n", + " self.feed_name = feed_name\n", + " self.title = title\n", + " self.link = link\n", + " self.summary = summary\n", + " self.embedding = embedding\n", + " self.pub_date = pub_date\n", + " self.updated = updated" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "import html\n", + "from bs4 import BeautifulSoup\n", + "\n", + "def clean_text(text):\n", + " text = text.replace('\\n', ' ').replace('\\r', ' ').strip()\n", + " text = BeautifulSoup(html.unescape(text), 'lxml').text\n", + " return text\n", + "\n", + "def parse_feed(url):\n", + "\tfeed_item = feedparser.parse(url)\n", + "\tentries = []\n", + "\tfor article in feed_item.entries:\n", + "\t\tentries.append(Article(\n", + "\t\t\tfeed_name = feed_item.feed.title,\n", + "\t\t\ttitle = article.get('title', ''),\n", + "\t\t\tlink = article.link,\n", + "\t\t\tsummary = article.get('summary', ''),\n", + "\t\t\tembedding = model.embed_query(f\"{article.get('title', '')} {article.get('summary', '')}\".strip()),\n", + "\t\t\tpub_date = datetime.now(),\n", + "\t\t\tupdated = datetime.now()\n", + "\t\t))\n", + " \n", + "\treturn entries\n", + "\n", + "entries = parse_feed('https://www.reddit.com/r/selfhosted/rising.rss')\n", + "entries += parse_feed(\"https://www.battleswarmblog.com/?feed=rss2\")\n", + "entries += parse_feed(\"https://cafehayek.com/feed\")\n", + "entries += parse_feed(\"https://rss.slashdot.org/Slashdot/slashdotMain\")\n", + "entries += parse_feed(\"https://www.realclearpolitics.com/index.xml\")\n", + "entries += parse_feed(\"https://instapundit.com/feed/\")\n", + "entries += parse_feed(\"https://twitchy.com/feed\")\n", + "entries += parse_feed(\"https://hnrss.org/frontpage\")\n", + "entries += 
parse_feed(\"https://www.reddit.com/r/selfhosted/top.rss?t=day\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [], + "source": [ + "def filter_articles(\n", + " articles: list[Article],\n", + " cluster_centers: np.ndarray,\n", + " filter_ratio: float = 0.5,\n", + " random_ratio: float = 0.1,\n", + ") -> list[Article]:\n", + " \"\"\"Filter out articles according to the user's preferences.\n", + "\n", + " This function tries to return a list of articles that are most relevant to\n", + " the user's interests. It does so by clustering the read articles and then\n", + " calculating the distance of each passed article to the closest cluster. The\n", + " articles are then sorted by this distance and the top `filter_ratio` fraction\n", + " of articles are returned. A small fraction of random articles are also included\n", + " to add some diversity and allow for discovery of new topics.\n", + "\n", + " If there aren't enough read articles to form clusters, the original list of\n", + " articles is returned identically.\n", + "\n", + " Args:\n", + " articles: List of articles to filter.\n", + " read_articles: List of articles that the user has read.\n", + " filter_ratio: Fraction of articles to return (default 0.5).\n", + " random_ratio: Fraction of random articles to include (default 0.1).\n", + "\n", + " Returns:\n", + " List of articles sorted by relevance\n", + " \"\"\"\n", + " random.Random(42).shuffle(articles)\n", + " n_random = int(len(articles) * random_ratio)\n", + " random_articles = articles[:n_random]\n", + " del articles[:n_random]\n", + "\n", + " articles_embeddings_list = [article.embedding for article in articles if article.embedding]\n", + "\n", + " if not articles_embeddings_list:\n", + " print(\"No embeddings found for articles. 
Returning articles as is.\")\n", + " return articles\n", + " # Calculate distance of each passed article to the closest cluster\n", + " articles_embeddings = np.array(articles_embeddings_list)\n", + " distances = cdist(articles_embeddings, cluster_centers, metric=\"cosine\")\n", + " print(distances)\n", + " min_distances = distances.min(axis=1)\n", + "\n", + " # Sort articles by distance to the closest cluster\n", + " sorted_articles_with_distance = sorted(\n", + " zip(min_distances, articles), key=lambda x: x[0]\n", + " )\n", + " \n", + " for distance, article in sorted_articles_with_distance:\n", + " article.distance = distance\n", + " \n", + " sorted_articles = [article for _, article in sorted_articles_with_distance]\n", + "\n", + " # Filter out articles based on the filter_ratio\n", + " num_to_filter = int(len(sorted_articles) * filter_ratio)\n", + " print(num_to_filter)\n", + " return sorted(\n", + " sorted_articles[:num_to_filter] + random_articles,\n", + " key=lambda x: x.pub_date or x.updated,\n", + " reverse=True,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filtered_articles = filter_articles(\n", + " articles=entries,\n", + " cluster_centers=kmeans.cluster_centers_,\n", + " filter_ratio=0.5,\n", + " random_ratio=0,\n", + " )\n", + "\n", + "for article in filtered_articles:\n", + "\tprint(f\"{article.distance} - {article.feed_name} - {article.title}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "def calculate_article_distances(kmeans, articles):\n", + " # Extract the embeddings from the articles\n", + " embeddings = [article.embedding for article in articles]\n", + " X = np.array(embeddings)\n", + "\n", + " # Calculate the distances from each article embedding to the KMeans centers\n", + " distances = []\n", + " for embedding in X:\n", + " min_distance = float('inf')\n", + " for center in kmeans.cluster_centers_:\n", + " distance = 1 - np.dot(embedding, center) / (np.linalg.norm(embedding) * np.linalg.norm(center))\n", + " min_distance = min(min_distance, distance)\n", + " distances.append(min_distance)\n", + "\n", + " # Set the distance attribute for each article\n", + " for i, article in enumerate(articles):\n", + " article.distance = distances[i]\n", + "\n", + " # Sort the articles by the calculated distances\n", + " sorted_articles = sorted(articles, key=lambda x: x.distance)\n", + "\n", + " return sorted_articles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "filtered_articles = calculate_article_distances(kmeans, entries)\n", + "\n", + "\n", + "for article in filtered_articles:\n", + "\tprint(f\"{article.distance} - {article.feed_name} - {article.title}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 136, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "http://gdata.youtube.com/feeds/api/videos/LKiBlGDfRU8?alt=json&v=2\n" + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: pandas in /home/anvil/.local/lib/python3.12/site-packages (2.2.1)\n", + "Requirement already satisfied: matplotlib in /home/anvil/.local/lib/python3.12/site-packages (3.8.4)\n", + "Collecting nltk\n", + " Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)\n", + "Requirement already satisfied: numpy<2,>=1.26.0 in /dependencies (from pandas) 
(1.26.4)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /home/anvil/.local/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /home/anvil/.local/lib/python3.12/site-packages (from pandas) (2024.1)\n", + "Requirement already satisfied: tzdata>=2022.7 in /home/anvil/.local/lib/python3.12/site-packages (from pandas) (2024.1)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /home/anvil/.local/lib/python3.12/site-packages (from matplotlib) (1.2.1)\n", + "Requirement already satisfied: cycler>=0.10 in /home/anvil/.local/lib/python3.12/site-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /home/anvil/.local/lib/python3.12/site-packages (from matplotlib) (4.51.0)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /home/anvil/.local/lib/python3.12/site-packages (from matplotlib) (1.4.5)\n", + "Requirement already satisfied: packaging>=20.0 in /dependencies (from matplotlib) (23.2)\n", + "Requirement already satisfied: pillow>=8 in /home/anvil/.local/lib/python3.12/site-packages (from matplotlib) (10.3.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /home/anvil/.local/lib/python3.12/site-packages (from matplotlib) (3.1.2)\n", + "Requirement already satisfied: click in /dependencies (from nltk) (8.1.7)\n", + "Requirement already satisfied: joblib in /home/anvil/.local/lib/python3.12/site-packages (from nltk) (1.3.2)\n", + "Collecting regex>=2021.8.3 (from nltk)\n", + " Downloading regex-2023.12.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting tqdm (from nltk)\n", + " Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: six>=1.5 in /dependencies (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", + "Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hDownloading regex-2023.12.25-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (789 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m789.1/789.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0mm\n", + "\u001b[?25hDownloading tqdm-4.66.2-py3-none-any.whl (78 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.3/78.3 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: tqdm, regex, nltk\n", + "Successfully installed nltk-3.8.1 regex-2023.12.25 tqdm-4.66.2\n" ] } ], "source": [ - "video_id = \"LKiBlGDfRU8\"\n", - "url = f\"http://gdata.youtube.com/feeds/api/videos/{video_id}?alt=json&v=2\"\n", - "print(url)" + "! 
pip install pandas matplotlib nltk" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 137, "metadata": {}, "outputs": [], "source": [ - "import dotenv\n", - "import os\n", + "import pandas as pd\n", + "from IPython.display import display\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", - "dotenv.load_dotenv()\n", - "google_api_key = os.getenv('GOOGLE_API_KEY')" + "import json\n", + "from ast import literal_eval\n", + "\n", + "import requests\n", + "\n", + "from scipy import sparse\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "import numpy as np\n", + "import sklearn.metrics.pairwise as pw\n", + "\n", + "from sklearn.cluster import KMeans\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.preprocessing import LabelEncoder, MinMaxScaler\n", + "import scipy\n", + "\n", + "import math\n", + "import random\n", + "import sklearn\n", + "from nltk.corpus import stopwords\n", + "from scipy.sparse import csr_matrix\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from scipy.sparse.linalg import svds\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [], + "source": [ + "tfidf = TfidfVectorizer(stop_words='english')" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 156, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'publishedAt': '2024-04-05T15:00:30Z', 'channelId': 'UC1yNl2E66ZzKApQdRuTQ4tw', 'title': \"My dream died, and now I'm here\", 'description': 'This is my contribution to bring the \"you\" back into YouTube #uinutube\\n\\nYou can support me on Patreon ➜ https://www.patreon.com/Sabine', 'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/LKiBlGDfRU8/default.jpg', 'width': 120, 'height': 90}, 'medium': {'url': 'https://i.ytimg.com/vi/LKiBlGDfRU8/mqdefault.jpg', 'width': 320, 'height': 180}, 'high': {'url': 'https://i.ytimg.com/vi/LKiBlGDfRU8/hqdefault.jpg', 'width': 480, 'height': 360}, 'standard': {'url': 'https://i.ytimg.com/vi/LKiBlGDfRU8/sddefault.jpg', 'width': 640, 'height': 480}, 'maxres': {'url': 'https://i.ytimg.com/vi/LKiBlGDfRU8/maxresdefault.jpg', 'width': 1280, 'height': 720}}, 'channelTitle': 'Sabine Hossenfelder', 'tags': ['science without the gobbledygoook', 'hossenfelder', 'academia', 'academia is broken', 'academia problems', 'academia crisis', 'science', 'science problems', 'physics', 'physics problems', 'sabine hossenfelder', 'fail', 'UINUTUBE', 'uinutube', 'the you in YouTube', 'youtube', 'you in youtube', 'youtube creators'], 'categoryId': '28', 'liveBroadcastContent': 'none', 'defaultLanguage': 'en', 'localized': {'title': \"My dream died, and now I'm here\", 'description': 'This is my contribution to bring the \"you\" back into YouTube #uinutube\\n\\nYou can support me on Patreon ➜ https://www.patreon.com/Sabine'}, 'defaultAudioLanguage': 'en'}\n", - "Title: My dream died, and now I'm here\n" + "(2810, 12324)\n" ] } ], "source": [ - "import requests\n", + "data = pd.DataFrame(documents)\n", + "tfidf_matrix = tfidf.fit_transform(data[0])\n", "\n", - "url = \"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={google_api_key}\"\n", - "r = requests.get(url.format(video_id=video_id, google_api_key=google_api_key))\n", - "json = r.json()\n", - "#print(json)\n", - 
"snippet = json['items'][0]['snippet']\n", - "print(snippet)\n", - "print(f\"Title: {snippet['title']}\")" + "# Output the shape of tfidf_matrix\n", + "print(tfidf_matrix.shape)\n", + "# print(tfidf.get_stop_words())\n", + "\n", + "# count vectorize\n", + "count = CountVectorizer(stop_words='english')\n", + "count_matrix = count.fit_transform(data[0])\n", + "cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix, True)\n", + "# display(cosine_sim.shape)\n", + "# display(cosine_sim)" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2810,)\n" + ] + }, + { + "data": { + "text/plain": [ + "0\n", + "Build ENTIRE Apps With A Single Prompt - FREE Open-Source Devika Tutorial devika ai coding open ai llm ai open source 0\n", + "Let's build a room sensor - Part 1 - Temperature, Humidity, and Bluetooth ESPHome DHT22 ESP ESP32 Home Assistant Bluetooth BLE 1\n", + "Everyone's Racing To Replace Redis - Who Will Win? web development full stack typescript javascript react programming programmer theo t3 stack t3 t3.gg t3dotgg 2\n", + "Anthropic Claude Prompt Generator 3\n", + "Huginn: Free Open Source Automated Agents Platform open source software open source alternative elestio open source free software free open source software huginn platform huginn tutorial huginn platform overview zapier alternative free zapier alternative free zapier automation tool automation platform open-source automation 4\n", + "Learn AI Engineer Skills For Beginners: OpenAI API + Python ai chatgpt ai engineer ai engineering ai engineer tutorial openai api tutorial openai api python tutorial learn ai engineering ai engineer skills openai api open ai api p ai tools ai career path ai career ai future ai tutorial learn ai ai engineer course ai course 5\n", + "Learn AI Engineer Skills For Beginners: OpenAI API + Python ai chatgpt ai engineer ai engineering ai engineer tutorial openai api tutorial openai api python tutorial learn ai engineering ai engineer skills openai api open ai api p ai tools ai career path ai career ai future ai tutorial learn ai ai engineer course ai course 6\n", + "How To Export Bookmarks From Brave Browser - Tutorial how to export bookmarks in brave how to export bookmarks from brave browser how to export bookmarks from brave brave browser export bookmarks brave export bookmarks brave browser bookmarks export bookmarks brave export bookmarks brave browser brave 7\n", + "Forever Alone - A New Russian Tank Tactic? 
- (YouTube Cut) 8\n", + "LlamaIndex Webinar: Building an LLM-powered Browser Agent 9\n", + "dtype: int64" ] }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Construct a reverse map of indices and video titles\n", + "# Reset index of your main DataFrame and construct reverse mapping as before\n", + "metadata = data.reset_index()\n", + "# indices = pd.Series(metadata.index, index=metadata['title'])\n", + "indices = pd.Series(metadata.index, index=metadata[0]).drop_duplicates()\n", + "# print(indices.shape)\n", + "# display(indices[:10])\n" ] }, { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [], + "source": [ + "# Function that takes in a video title as input and outputs the most similar videos\n", + "def get_recommendations(title, indices, cosine_sim, data):\n", + "    # Get the index of the video that matches the title\n", + "    idx = indices[title]\n", + "    # print(idx)\n", + "    # return 0\n", + "    # Get the pairwise similarity scores of all videos with that video\n", + "    sim_scores = list(enumerate(cosine_sim[idx]))\n", + "    # print(sim_scores)\n", + "    # return 0\n", + "    # Sort the videos based on the similarity scores\n", + "    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n", + "    # print(sim_scores)\n", + "    # return 0\n", + "    # Get the scores of the 10 most similar videos\n", + "    sim_scores = sim_scores[1:11]\n", + "    # print(sim_scores)\n", + "    # return 0\n", + "    # Get the video indices\n", + "    video_indices = [i[0] for i in sim_scores]\n", + "    # print(video_indices)\n", + "    # return 0\n", + "    # Return the top 10 most similar videos\n", + "    return data[0].iloc[video_indices]" ] }, { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'The Rise And Growth of Ethereum Gets Mainstream Coverage'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/.local/lib/python3.12/site-packages/pandas/core/indexes/base.py:3805\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3805\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:167\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:191\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:234\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine._get_loc_duplicates\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:242\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine._maybe_get_bool_indexer\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:134\u001b[0m, in \u001b[0;36mpandas._libs.index._unpack_bool_indexer\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'The Rise And Growth of Ethereum Gets Mainstream Coverage'", + "\nThe above exception was the 
direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[164], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mget_recommendations\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mThe Rise And Growth of Ethereum Gets Mainstream Coverage\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindices\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcosine_sim\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[43m)\u001b[49m)\n", + "Cell \u001b[0;32mIn[161], line 4\u001b[0m, in \u001b[0;36mget_recommendations\u001b[0;34m(title, indices, cosine_sim, data)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_recommendations\u001b[39m(title, indices, cosine_sim, data):\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Get the index of the article that matches the title\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m idx \u001b[38;5;241m=\u001b[39m \u001b[43mindices\u001b[49m\u001b[43m[\u001b[49m\u001b[43mtitle\u001b[49m\u001b[43m]\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# print(idx)\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# return 0\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# Get the pairwsie similarity scores of all movies with that movie\u001b[39;00m\n\u001b[1;32m 8\u001b[0m sim_scores \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28menumerate\u001b[39m(cosine_sim[idx]))\n", + "File \u001b[0;32m~/.local/lib/python3.12/site-packages/pandas/core/series.py:1112\u001b[0m, in \u001b[0;36mSeries.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1109\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[key]\n\u001b[1;32m 1111\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m key_is_scalar:\n\u001b[0;32m-> 1112\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_value\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1114\u001b[0m \u001b[38;5;66;03m# Convert generator to list before going through hashable part\u001b[39;00m\n\u001b[1;32m 1115\u001b[0m \u001b[38;5;66;03m# (We will iterate through the generator there to check for slices)\u001b[39;00m\n\u001b[1;32m 1116\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n", + "File \u001b[0;32m~/.local/lib/python3.12/site-packages/pandas/core/series.py:1228\u001b[0m, in \u001b[0;36mSeries._get_value\u001b[0;34m(self, label, takeable)\u001b[0m\n\u001b[1;32m 1225\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[label]\n\u001b[1;32m 1227\u001b[0m \u001b[38;5;66;03m# Similar to Index.get_value, but we do not fall back to positional\u001b[39;00m\n\u001b[0;32m-> 1228\u001b[0m loc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlabel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1230\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(loc):\n\u001b[1;32m 1231\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_values[loc]\n", + "File 
\u001b[0;32m~/.local/lib/python3.12/site-packages/pandas/core/indexes/base.py:3812\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3810\u001b[0m ):\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3812\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3815\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3816\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3817\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'The Rise And Growth of Ethereum Gets Mainstream Coverage'" + ] + } + ], + "source": [ + "print(get_recommendations('The Rise And Growth of Ethereum Gets Mainstream Coverage', indices, cosine_sim, metadata))" ] }, { @@ -162,7 +646,10 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# print(get_recommendations('Google Data Center 360° Tour', indices, cosine_sim, metadata))\n", + "# print(get_recommendations('Intel\\'s internal IoT platform for real-time enterprise analytics', indices, cosine_sim, metadata))" + ] } ], "metadata": { diff --git a/notebooks/u.ipynb b/notebooks/u.ipynb new file mode 100644 index 0000000..02a12cc --- /dev/null +++ b/notebooks/u.ipynb @@ -0,0 +1,132 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Collecting scipy==1.11.0\n", + " Using cached scipy-1.11.0.tar.gz (56.0 MB)\n", + " Installing build dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n", + "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Preparing metadata (pyproject.toml) ... 
\u001b[?25lerror\n", + " \u001b[1;31merror\u001b[0m: \u001b[1msubprocess-exited-with-error\u001b[0m\n", + " \n", + " \u001b[31m×\u001b[0m \u001b[32mPreparing metadata \u001b[0m\u001b[1;32m(\u001b[0m\u001b[32mpyproject.toml\u001b[0m\u001b[1;32m)\u001b[0m did not run successfully.\n", + " \u001b[31m│\u001b[0m exit code: \u001b[1;36m1\u001b[0m\n", + " \u001b[31m╰─>\u001b[0m \u001b[31m[43 lines of output]\u001b[0m\n", + " \u001b[31m \u001b[0m \u001b[36m\u001b[1m+ meson setup /tmp/pip-install-yhyoc3xz/scipy_f9121692c8e74fe0bb7d7ed58a4b2c89 /tmp/pip-install-yhyoc3xz/scipy_f9121692c8e74fe0bb7d7ed58a4b2c89/.mesonpy-3s1j1x0y/build -Dbuildtype=release -Db_ndebug=if-release -Db_vscrt=md --native-file=/tmp/pip-install-yhyoc3xz/scipy_f9121692c8e74fe0bb7d7ed58a4b2c89/.mesonpy-3s1j1x0y/build/meson-python-native-file.ini\u001b[0m\n", + " \u001b[31m \u001b[0m The Meson build system\n", + " \u001b[31m \u001b[0m Version: 1.4.0\n", + " \u001b[31m \u001b[0m Source dir: /tmp/pip-install-yhyoc3xz/scipy_f9121692c8e74fe0bb7d7ed58a4b2c89\n", + " \u001b[31m \u001b[0m Build dir: /tmp/pip-install-yhyoc3xz/scipy_f9121692c8e74fe0bb7d7ed58a4b2c89/.mesonpy-3s1j1x0y/build\n", + " \u001b[31m \u001b[0m Build type: native build\n", + " \u001b[31m \u001b[0m Project name: SciPy\n", + " \u001b[31m \u001b[0m Project version: 1.11.0\n", + " \u001b[31m \u001b[0m C compiler for the host machine: cc (gcc 12.2.0 \"cc (Debian 12.2.0-14) 12.2.0\")\n", + " \u001b[31m \u001b[0m C linker for the host machine: cc ld.bfd 2.40\n", + " \u001b[31m \u001b[0m C++ compiler for the host machine: c++ (gcc 12.2.0 \"c++ (Debian 12.2.0-14) 12.2.0\")\n", + " \u001b[31m \u001b[0m C++ linker for the host machine: c++ ld.bfd 2.40\n", + " \u001b[31m \u001b[0m Cython compiler for the host machine: cython (cython 0.29.37)\n", + " \u001b[31m \u001b[0m Host machine cpu family: x86_64\n", + " \u001b[31m \u001b[0m Host machine cpu: x86_64\n", + " \u001b[31m \u001b[0m Program python found: YES (/usr/local/bin/python)\n", + " \u001b[31m \u001b[0m Found pkg-config: YES (/usr/bin/pkg-config) 1.8.1\n", + " \u001b[31m \u001b[0m Run-time dependency python found: YES 3.12\n", + " \u001b[31m \u001b[0m Program cython found: YES (/tmp/pip-build-env-y8bdawz7/overlay/bin/cython)\n", + " \u001b[31m \u001b[0m Compiler for C supports arguments -Wno-unused-but-set-variable: YES\n", + " \u001b[31m \u001b[0m Compiler for C supports arguments -Wno-unused-function: YES\n", + " \u001b[31m \u001b[0m Compiler for C supports arguments -Wno-conversion: YES\n", + " \u001b[31m \u001b[0m Compiler for C supports arguments -Wno-misleading-indentation: YES\n", + " \u001b[31m \u001b[0m Library m found: YES\n", + " \u001b[31m \u001b[0m Fortran compiler for the host machine: gfortran (gcc 12.2.0 \"GNU Fortran (Debian 12.2.0-14) 12.2.0\")\n", + " \u001b[31m \u001b[0m Fortran linker for the host machine: gfortran ld.bfd 2.40\n", + " \u001b[31m \u001b[0m Compiler for Fortran supports arguments -Wno-conversion: YES\n", + " \u001b[31m \u001b[0m Checking if \"-Wl,--version-script\" : links: YES\n", + " \u001b[31m \u001b[0m Program pythran found: YES (/tmp/pip-build-env-y8bdawz7/overlay/bin/pythran)\n", + " \u001b[31m \u001b[0m Found CMake: /usr/bin/cmake (3.25.1)\n", + " \u001b[31m \u001b[0m WARNING: CMake Toolchain: Failed to determine CMake compilers state\n", + " \u001b[31m \u001b[0m Run-time dependency xsimd found: NO (tried pkgconfig and cmake)\n", + " \u001b[31m \u001b[0m Run-time dependency threads found: YES\n", + " \u001b[31m \u001b[0m Library npymath found: YES\n", + " \u001b[31m \u001b[0m 
Library npyrandom found: YES\n", + " \u001b[31m \u001b[0m pybind11-config found: YES (/tmp/pip-build-env-y8bdawz7/overlay/bin/pybind11-config) 2.10.4\n", + " \u001b[31m \u001b[0m Run-time dependency pybind11 found: YES 2.10.4\n", + " \u001b[31m \u001b[0m Run-time dependency openblas found: NO (tried pkgconfig and cmake)\n", + " \u001b[31m \u001b[0m Run-time dependency openblas found: NO (tried pkgconfig and cmake)\n", + " \u001b[31m \u001b[0m \n", + " \u001b[31m \u001b[0m ../../scipy/meson.build:159:9: ERROR: Dependency \"OpenBLAS\" not found, tried pkgconfig and cmake\n", + " \u001b[31m \u001b[0m \n", + " \u001b[31m \u001b[0m A full log can be found at /tmp/pip-install-yhyoc3xz/scipy_f9121692c8e74fe0bb7d7ed58a4b2c89/.mesonpy-3s1j1x0y/build/meson-logs/meson-log.txt\n", + " \u001b[31m \u001b[0m \u001b[31m[end of output]\u001b[0m\n", + " \n", + " \u001b[1;35mnote\u001b[0m: This error originates from a subprocess, and is likely not a problem with pip.\n", + "\u001b[?25h\u001b[1;31merror\u001b[0m: \u001b[1mmetadata-generation-failed\u001b[0m\n", + "\n", + "\u001b[31m×\u001b[0m Encountered error while generating package metadata.\n", + "\u001b[31m╰─>\u001b[0m See above for output.\n", + "\n", + "\u001b[1;35mnote\u001b[0m: This is an issue with the package mentioned above, not pip.\n", + "\u001b[1;36mhint\u001b[0m: See above for details.\n" + ] + } + ], + "source": [ + "! pip install --prefer-binary scipy==1.11.0 gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'triu' from 'scipy.linalg' (/home/anvil/.local/lib/python3.12/site-packages/scipy/linalg/__init__.py)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mgensim\u001b[39;00m\n", + "File \u001b[0;32m~/.local/lib/python3.12/site-packages/gensim/__init__.py:11\u001b[0m\n\u001b[1;32m 7\u001b[0m __version__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m4.3.2\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mlogging\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgensim\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m parsing, corpora, matutils, interfaces, models, similarities, utils \u001b[38;5;66;03m# noqa:F401\u001b[39;00m\n\u001b[1;32m 14\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mgetLogger(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgensim\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m logger\u001b[38;5;241m.\u001b[39mhandlers: \u001b[38;5;66;03m# To ensure reload() doesn't add another one\u001b[39;00m\n", + "File \u001b[0;32m~/.local/lib/python3.12/site-packages/gensim/corpora/__init__.py:6\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124;03mThis package contains implementations of various streaming corpus I/O format.\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# bring corpus classes directly into package namespace, to save some 
typing\u001b[39;00m\n\u001b[0;32m----> 6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mindexedcorpus\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IndexedCorpus \u001b[38;5;66;03m# noqa:F401 must appear before the other classes\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmmcorpus\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m MmCorpus \u001b[38;5;66;03m# noqa:F401\u001b[39;00m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbleicorpus\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BleiCorpus \u001b[38;5;66;03m# noqa:F401\u001b[39;00m\n", + "File \u001b[0;32m~/.local/lib/python3.12/site-packages/gensim/corpora/indexedcorpus.py:14\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mlogging\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m\n\u001b[0;32m---> 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgensim\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m interfaces, utils\n\u001b[1;32m 16\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mgetLogger(\u001b[38;5;18m__name__\u001b[39m)\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mIndexedCorpus\u001b[39;00m(interfaces\u001b[38;5;241m.\u001b[39mCorpusABC):\n", + "File \u001b[0;32m~/.local/lib/python3.12/site-packages/gensim/interfaces.py:19\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;124;03m\"\"\"Basic interfaces used across the whole Gensim package.\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \n\u001b[1;32m 9\u001b[0m \u001b[38;5;124;03mThese interfaces are used for building corpora, model transformation and similarity queries.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 14\u001b[0m \n\u001b[1;32m 15\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mlogging\u001b[39;00m\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgensim\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m utils, matutils\n\u001b[1;32m 22\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mgetLogger(\u001b[38;5;18m__name__\u001b[39m)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mCorpusABC\u001b[39;00m(utils\u001b[38;5;241m.\u001b[39mSaveLoad):\n", + "File \u001b[0;32m~/.local/lib/python3.12/site-packages/gensim/matutils.py:20\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mscipy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msparse\u001b[39;00m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mscipy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mstats\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m entropy\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mscipy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlinalg\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m get_blas_funcs, triu\n\u001b[1;32m 21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mscipy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlinalg\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlapack\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m 
get_lapack_funcs\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mscipy\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mspecial\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m psi \u001b[38;5;66;03m# gamma function utils\u001b[39;00m\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'triu' from 'scipy.linalg' (/home/anvil/.local/lib/python3.12/site-packages/scipy/linalg/__init__.py)" ] } ], "source": [ + "import gensim\n" ] } ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt index 35a6033..8c81f56 100755 --- a/requirements.txt +++ b/requirements.txt @@ -3,17 +3,26 @@ argcomplete bs4 docker feedparser flask-admin flask[async] flask-bcrypt flask-caching -#flask-htmx flask-minify flask-sqlalchemy +gensim hypercorn==0.15.0 langchain langchain-community lxml==5.1.1 +matplotlib +nltk +numpy +pandas +pydantic +pysqlite3 python-dotenv pyyaml requests +scikit-learn +scipy==1.11.0 +torch
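Why the pin and the new build packages fit together: gensim 4.3.2 does `from scipy.linalg import triu`, and `triu` was removed from SciPy in 1.13 — exactly the ImportError captured in u.ipynb — so requirements.txt pins `scipy==1.11.0`. Because that SciPy version has to compile from source under this image's Python 3.12 (the meson log above), the build needs OpenBLAS, gfortran, and pkg-config, which the Dockerfile now installs. A minimal sanity check along these lines could confirm the rebuilt image, assuming the pinned versions above:

```python
# Sanity-check sketch for the scipy/gensim pairing (assumes the pins above).
import scipy.linalg

# gensim 4.3.x does `from scipy.linalg import triu`; SciPy removed `triu`
# in 1.13, which is the ImportError shown in u.ipynb.
if not hasattr(scipy.linalg, "triu"):
    raise RuntimeError(
        f"scipy {scipy.__version__} lacks scipy.linalg.triu; gensim 4.3.x cannot import"
    )

import gensim  # would raise the same ImportError if the check above failed

print(f"gensim {gensim.__version__} OK with scipy {scipy.__version__}")
```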