Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download "top 100 videos" from YouTube channel or playlist, sorted by views-per-day #139

Closed
wants to merge 9 commits into from
6 changes: 6 additions & 0 deletions cps/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@
# an initial metadata manifest (prior to downloading videos or media) here:
XKLB_DB_FILE = "/library/calibre-web/xklb-metadata.db"

# Maximum number of videos to download when adding a new video playlist
MAX_VIDEOS_PER_DOWNLOAD = 100

# Maximum number of gigabytes to download when adding a new video playlist
MAX_GB_PER_DOWNLOAD = 10

if HOME_CONFIG:
home_dir = os.path.join(os.path.expanduser("~"), ".calibre-web")
if not os.path.exists(home_dir):
Expand Down
88 changes: 53 additions & 35 deletions cps/tasks/metadata_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from datetime import datetime
from flask_babel import lazy_gettext as N_, gettext as _

from cps.constants import XKLB_DB_FILE
from cps.constants import XKLB_DB_FILE, MAX_VIDEOS_PER_DOWNLOAD
from cps.services.worker import WorkerThread
from cps.tasks.download import TaskDownload
from cps.services.worker import CalibreTask, STAT_FINISH_SUCCESS, STAT_FAIL, STAT_STARTED, STAT_WAITING
Expand All @@ -29,7 +29,6 @@ def __init__(self, task_message, media_url, original_url, current_user_name):
self.columns = None
self.shelf_title = None
self.shelf_id = None
self.playlist_id = None
self.main_message = None

def run(self, worker_thread):
Expand All @@ -48,13 +47,12 @@ def run(self, worker_thread):
subprocess_args = [lb_executable, "tubeadd", self.media_url]
log.info("Subprocess args: %s", subprocess_args)

# Execute the download process using process_open
# Execute the metadata fetching process using process_open
try:
p = process_open(subprocess_args, newlines=True)

p.wait()
self_main_message = f"{self.media_url_link}"
self.message = self_main_message
self.message = self_main_message + "..."

# Database operations
requested_urls = {}
Expand All @@ -63,11 +61,10 @@ def run(self, worker_thread):
cursor = conn.execute("PRAGMA table_info(media)")
self.columns = [column[1] for column in cursor.fetchall()]
if "error" in self.columns:
rows = conn.execute("SELECT path, duration FROM media WHERE error IS NULL AND path LIKE 'http%'").fetchall()
rows = conn.execute("SELECT path, duration, time_uploaded, view_count, size FROM media WHERE error IS NULL AND path LIKE 'http%'").fetchall()
else:
rows = conn.execute("SELECT path, duration FROM media WHERE path LIKE 'http%'").fetchall()
rows = conn.execute("SELECT path, duration, time_uploaded, view_count, size FROM media WHERE path LIKE 'http%'").fetchall()

# Abort if there are no URLs
if not rows:
log.info("No urls found in the database")
error = conn.execute("SELECT error, webpath FROM media WHERE error IS NOT NULL AND webpath = ?", (self.media_url,)).fetchone()
Expand All @@ -80,13 +77,27 @@ def run(self, worker_thread):
for row in rows:
path = row[0]
duration = row[1]
time_uploaded = row[2]
view_count = row[3]
size = row[4]

time_uploaded = datetime.utcfromtimestamp(time_uploaded)
now = datetime.now()
days_since_publish = (now - time_uploaded).days or 1
views_per_day = view_count / days_since_publish

is_playlist_video = False
if "playlists_id" in self.columns:
playlist_id = conn.execute("SELECT playlists_id FROM media WHERE path = ?", (path,)).fetchone()
if playlist_id:
is_playlist_video = True
is_playlist_video = True

requested_urls[path] = {
"duration": duration,
"time_uploaded": time_uploaded,
"view_count": view_count,
"size": size,
"views_per_day": views_per_day,
"is_playlist_video": is_playlist_video
}

Expand All @@ -95,36 +106,40 @@ def run(self, worker_thread):
self.message = f"{self.media_url_link} failed: {db_error}"

# get the shelf title
if any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]):
if "list=" in self.media_url or "@" in self.media_url or any([requested_urls[url]["is_playlist_video"] for url in requested_urls.keys()]):
url_part = self.media_url.split("/")[-1]
if "list=" in url_part:
url_part = url_part.split("list=")[-1]
try:
self.shelf_title = conn.execute("SELECT title FROM playlists WHERE extractor_playlist_id = ?", (url_part,)).fetchone()[0]
except sqlite3.Error as db_error:
log.error("An error occurred while trying to connect to the database: %s", db_error)
elif "@" in url_part:
self.shelf_title = url_part.split("@")[-1]
else:
self.shelf_title = "Unnamed Bookshelf"
response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title})
if response.status_code == 200:
self.shelf_id = response.json()["shelf_id"]
else:
log.error("An error occurred while trying to send the shelf title to %s", self.original_url)

# remove shorts from the requested_urls dict
requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if "shorts" not in url}

# sort the videos by views per day, highest first, and keep only the top MAX_VIDEOS_PER_DOWNLOAD (or all of them if there are fewer)
requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(MAX_VIDEOS_PER_DOWNLOAD, len(requested_urls))])
log.debug("Videos sorted by views per day: \n%s", "\n".join([f"{index + 1}-{conn.execute('SELECT title FROM media WHERE path = ?', (requested_url,)).fetchone()[0]}:{requested_urls[requested_url]['views_per_day']}" for index, requested_url in enumerate(requested_urls.keys())]))
else:
try:
self.playlist_id = self.media_url.split("/")[-1]
if "list=" in self.playlist_id:
self.playlist_id = self.playlist_id.split("list=")[-1]
self.shelf_title = conn.execute("SELECT title FROM playlists WHERE extractor_playlist_id = ?", (self.playlist_id,)).fetchone()[0]
elif "@" in self.playlist_id:
self.shelf_title = self.playlist_id.split("@")[-1]
else:
self.shelf_title = "Unnamed Bookshelf"
extractor_id = conn.execute("SELECT extractor_id FROM media WHERE ? LIKE '%' || extractor_id || '%'", (self.media_url,)).fetchone()[0]
requested_urls = {url: requested_urls[url] for url in requested_urls.keys() if extractor_id in url} # filter the requested_urls dict
except sqlite3.Error as db_error:
if "no such table: playlists" in str(db_error):
log.info("No playlists table found in the database")
self.playlist_id = None
else:
log.error("An error occurred while trying to connect to the database: %s", db_error)
self.message = f"{self.media_url_link} failed to download: {db_error}"
self.progress = 0
finally:
log.info("Shelf title: %s", self.shelf_title)
log.error("An error occurred while trying to connect to the database: %s", db_error)
self.message = f"{self.media_url_link} failed to download: {db_error}"

conn.close()

if self.shelf_title:
response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title})
if response.status_code == 200:
self.shelf_id = response.json()["shelf_id"]
else:
log.error("An error occurred while trying to send the shelf title to %s", self.original_url)

num_requested_urls = len(requested_urls.keys())
total_duration = 0

Expand All @@ -135,14 +150,17 @@ def run(self, worker_thread):
)
WorkerThread.add(self.current_user_name, task_download)

self.progress = (index + 1) / num_requested_urls
if requested_urls[requested_url]["duration"] is not None:
total_duration += requested_urls[requested_url]["duration"]
self.message = self_main_message + f"<br><br>Number of Videos: {index + 1}/{num_requested_urls}<br>Total Duration: {datetime.utcfromtimestamp(total_duration).strftime('%H:%M:%S')}"
self.progress = (index + 1) / num_requested_urls

self.end_time = datetime.now()

except Exception as e:
log.error("An error occurred during the subprocess execution: %s", e)
self.message = f"{self.media_url_link} failed: {e}"
self.end_time = datetime.now()

finally:
if p.returncode == 0 or self.progress == 1.0:
Expand Down
2 changes: 1 addition & 1 deletion scripts/lb-wrapper
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ fi
# fetching metadata. This will prevent hanging for playlist URLs or short URLs.
# "...to be able to list videos that are not downloaded yet"
if [[ $XKLB_INTERNAL_CMD == "tubeadd" ]]; then
xklb_full_cmd="${XKLB_EXECUTABLE} tubeadd ${XKLB_DB_FILE} ${URL} --force ${VERBOSITY}"
xklb_full_cmd="${XKLB_EXECUTABLE} tubeadd ${XKLB_DB_FILE} ${URL} --force --extra ${VERBOSITY}"
elif [[ $XKLB_INTERNAL_CMD == "dl" ]]; then
xklb_full_cmd="${XKLB_EXECUTABLE} dl ${XKLB_DB_FILE} --prefix ${TMP_DOWNLOADS_DIR} --video --search ${URL} ${FORMAT_OPTIONS} --write-thumbnail ${VERBOSITY}"
else
Expand Down