iiab · deldesir · Mar 11, 2024 · Mar 12, 2024 · Mar 12, 2024 · Mar 12, 2024
diff --git a/cps/constants.py b/cps/constants.py
@@ -48,6 +48,12 @@
 # an initial metadata manifest (prior to downloading videos or media) here:
 XKLB_DB_FILE      = "/library/calibre-web/xklb-metadata.db"
 
+# Maximum number of videos to download when adding a new video playlist
+MAX_VIDEOS_PER_DOWNLOAD = 100
+
+# Maximum number of gigabytes to download when adding a new video playlist
+MAX_GB_PER_DOWNLOAD = 10
+
 if HOME_CONFIG:
     home_dir = os.path.join(os.path.expanduser("~"), ".calibre-web")
     if not os.path.exists(home_dir):

diff --git a/cps/tasks/metadata_extract.py b/cps/tasks/metadata_extract.py
@@ -5,7 +5,7 @@
 from datetime import datetime
 from flask_babel import lazy_gettext as N_, gettext as _
 
-from cps.constants import XKLB_DB_FILE
+from cps.constants import XKLB_DB_FILE, MAX_VIDEOS_PER_DOWNLOAD
 from cps.services.worker import WorkerThread
 from cps.tasks.download import TaskDownload
 from cps.services.worker import CalibreTask, STAT_FINISH_SUCCESS, STAT_FAIL, STAT_STARTED, STAT_WAITING
@@ -29,7 +29,6 @@ def __init__(self, task_message, media_url, original_url, current_user_name):
         self.columns = None
         self.shelf_title = None
         self.shelf_id = None
-        self.playlist_id = None
         self.main_message = None
 
     def run(self, worker_thread):
@@ -48,13 +47,12 @@ def run(self, worker_thread):
             subprocess_args = [lb_executable, "tubeadd", self.media_url]
             log.info("Subprocess args: %s", subprocess_args)
 
-            # Execute the download process using process_open
+            # Execute the metadata fetching process using process_open
             try:
                 p = process_open(subprocess_args, newlines=True)
-
                 p.wait()
                 self_main_message = f"{self.media_url_link}"
-                self.message = self_main_message
+                self.message = self_main_message + "..."
 
                 # Database operations
                 requested_urls = {}
@@ -63,11 +61,10 @@ def run(self, worker_thread):
                         cursor = conn.execute("PRAGMA table_info(media)")
                         self.columns = [column[1] for column in cursor.fetchall()]
                         if "error" in self.columns:
-                            rows = conn.execute("SELECT path, duration FROM media WHERE error IS NULL AND path LIKE 'http%'").fetchall()
+                            rows = conn.execute("SELECT path, duration, time_uploaded, view_count, size FROM media WHERE error IS NULL AND path LIKE 'http%'").fetchall()
                         else:
-                            rows = conn.execute("SELECT path, duration FROM media WHERE path LIKE 'http%'").fetchall()
+                            rows = conn.execute("SELECT path, duration, time_uploaded, view_count, size FROM media WHERE path LIKE 'http%'").fetchall()
 
-                        # Abort if there are no urls
                         if not rows:
                             log.info("No urls found in the database")
                             error = conn.execute("SELECT error, webpath FROM media WHERE error IS NOT NULL AND webpath = ?", (self.media_url,)).fetchone()
@@ -80,13 +77,27 @@ def run(self, worker_thread):
                         for row in rows:
                             path = row[0]
                             duration = row[1]
+                            time_uploaded = row[2]
+                            view_count = row[3]
+                            size = row[4]
+
+                            time_uploaded = datetime.utcfromtimestamp(time_uploaded)
+                            now = datetime.now()
+                            days_since_publish = (now - time_uploaded).days or 1
+                            views_per_day = view_count / days_since_publish
+
                             is_playlist_video = False
                             if "playlists_id" in self.columns:
                                 playlist_id = conn.execute("SELECT playlists_id FROM media WHERE path = ?", (path,)).fetchone()
                                 if playlist_id:
-                                    is_playlist_video = True
+                                    is_playlist_video = True 
+
                             requested_urls[path] = {
                                 "duration": duration,
+                                "time_uploaded": time_uploaded,
+                                "view_count": view_count,
+                                "size": size,
+                                "views_per_day": views_per_day,
                                 "is_playlist_video": is_playlist_video
                             }
 
@@ -95,36 +106,40 @@ def run(self, worker_thread):
                         self.message = f"{self.media_url_link} failed: {db_error}"
 
                     # get the shelf title
-                    if any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]):
+                    if "list=" in self.media_url or "@" in self.media_url or any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]):
+                        url_part = self.media_url.split("/")[-1]
+                        if "list=" in url_part:
+                            url_part = url_part.split("list=")[-1]
+                            try:
+                                self.shelf_title = conn.execute("SELECT title FROM playlists WHERE extractor_playlist_id = ?", (url_part,)).fetchone()[0]
+                            except sqlite3.Error as db_error:
+                                log.error("An error occurred while trying to connect to the database: %s", db_error)
+                        elif "@" in url_part:
+                            self.shelf_title = url_part.split("@")[-1]
+                        else:
+                            self.shelf_title = "Unnamed Bookshelf"
+                        response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title})
+                        if response.status_code == 200:
+                            self.shelf_id = response.json()["shelf_id"]
+                        else:
+                            log.error("An error occurred while trying to send the shelf title to %s", self.original_url)
+
+                        # drop every URL containing the substring "shorts" from the requested_urls dict
+                        requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if "shorts" not in url}
+
+                        # rank the videos by views per day (highest first) and keep at most MAX_VIDEOS_PER_DOWNLOAD of them
+                        requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(MAX_VIDEOS_PER_DOWNLOAD, len(requested_urls))])
+                        log.debug("Videos sorted by views per day: \n%s", "\n".join([f"{index + 1}-{conn.execute('SELECT title FROM media WHERE path = ?', (requested_url,)).fetchone()[0]}:{requested_urls[requested_url]['views_per_day']}" for index, requested_url in enumerate(requested_urls.keys())]))
+                    else:
                         try:
-                            self.playlist_id = self.media_url.split("/")[-1]
-                            if "list=" in self.playlist_id:
-                                self.playlist_id = self.playlist_id.split("list=")[-1]
-                                self.shelf_title = conn.execute("SELECT title FROM playlists WHERE extractor_playlist_id = ?", (self.playlist_id,)).fetchone()[0]
-                            elif "@" in self.playlist_id:
-                                self.shelf_title = self.playlist_id.split("@")[-1]
-                            else:
-                                self.shelf_title = "Unnamed Bookshelf"
+                            extractor_id = conn.execute("SELECT extractor_id FROM media WHERE ? LIKE '%' || extractor_id || '%'", (self.media_url,)).fetchone()[0]
+                            requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if extractor_id in url} # filter the requested_urls dict
                         except sqlite3.Error as db_error:
-                            if "no such table: playlists" in str(db_error):
-                                log.info("No playlists table found in the database")
-                                self.playlist_id = None
-                            else:
-                                log.error("An error occurred while trying to connect to the database: %s", db_error)
-                                self.message = f"{self.media_url_link} failed to download: {db_error}"
-                                self.progress = 0
-                        finally:
-                            log.info("Shelf title: %s", self.shelf_title)
+                            log.error("An error occurred while trying to connect to the database: %s", db_error)
+                            self.message = f"{self.media_url_link} failed to download: {db_error}"    
 
                 conn.close()
 
-                if self.shelf_title:
-                    response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title})
-                    if response.status_code == 200:
-                        self.shelf_id = response.json()["shelf_id"]
-                    else:
-                        log.error("An error occurred while trying to send the shelf title to %s", self.original_url)
-
                 num_requested_urls = len(requested_urls.keys())
                 total_duration = 0
 
@@ -135,14 +150,17 @@ def run(self, worker_thread):
                                                     )
                     WorkerThread.add(self.current_user_name, task_download)
 
-                    self.progress = (index + 1) / num_requested_urls
                     if requested_urls[requested_url]["duration"] is not None:
                         total_duration += requested_urls[requested_url]["duration"]
                     self.message = self_main_message + f"<br><br>Number of Videos: {index + 1}/{num_requested_urls}<br>Total Duration: {datetime.utcfromtimestamp(total_duration).strftime('%H:%M:%S')}"
+                    self.progress = (index + 1) / num_requested_urls
+
+                self.end_time = datetime.now()
 
             except Exception as e:
                 log.error("An error occurred during the subprocess execution: %s", e)
                 self.message = f"{self.media_url_link} failed: {e}"
+                self.end_time = datetime.now()
 
             finally:
                 if p.returncode == 0 or self.progress == 1.0:

diff --git a/scripts/lb-wrapper b/scripts/lb-wrapper
@@ -47,7 +47,7 @@ fi
 # fetching metadata. This will prevent hanging for playlist URLs or short URLs.
 # "...to be able to list videos that are not downloaded yet"
 if [[ $XKLB_INTERNAL_CMD == "tubeadd" ]]; then
-    xklb_full_cmd="${XKLB_EXECUTABLE} tubeadd ${XKLB_DB_FILE} ${URL} --force ${VERBOSITY}"
+    xklb_full_cmd="${XKLB_EXECUTABLE} tubeadd ${XKLB_DB_FILE} ${URL} --force --extra ${VERBOSITY}"
 elif [[ $XKLB_INTERNAL_CMD == "dl" ]]; then
     xklb_full_cmd="${XKLB_EXECUTABLE} dl ${XKLB_DB_FILE} --prefix ${TMP_DOWNLOADS_DIR} --video --search ${URL} ${FORMAT_OPTIONS} --write-thumbnail ${VERBOSITY}"
 else