Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Download "top 100 videos" from YouTube channel or playlist, sorted by views-per-day #139

Closed
wants to merge 9 commits into from
3 changes: 3 additions & 0 deletions cps/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@
# an initial metadata manifest (prior to downloading videos or media) here:
XKLB_DB_FILE = "/library/calibre-web/xklb-metadata.db"

# Maximum number of videos to download (default: 100); videos are ranked by
# views per day and the top NUMBER_OF_VIDEOS are kept.
NUMBER_OF_VIDEOS = 100

if HOME_CONFIG:
home_dir = os.path.join(os.path.expanduser("~"), ".calibre-web")
if not os.path.exists(home_dir):
Expand Down
43 changes: 34 additions & 9 deletions cps/tasks/metadata_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from datetime import datetime
from flask_babel import lazy_gettext as N_, gettext as _

from cps.constants import XKLB_DB_FILE
from cps.constants import XKLB_DB_FILE, NUMBER_OF_VIDEOS
from cps.services.worker import WorkerThread
from cps.tasks.download import TaskDownload
from cps.services.worker import CalibreTask, STAT_FINISH_SUCCESS, STAT_FAIL, STAT_STARTED, STAT_WAITING
Expand Down Expand Up @@ -48,7 +48,7 @@ def run(self, worker_thread):
subprocess_args = [lb_executable, "tubeadd", self.media_url]
log.info("Subprocess args: %s", subprocess_args)

# Execute the download process using process_open
# Execute the metadata fetching process using process_open
try:
p = process_open(subprocess_args, newlines=True)

Expand Down Expand Up @@ -115,16 +115,41 @@ def run(self, worker_thread):
self.progress = 0
finally:
log.info("Shelf title: %s", self.shelf_title)
response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title})
if response.status_code == 200:
self.shelf_id = response.json()["shelf_id"]
else:
log.error("An error occurred while trying to send the shelf title to %s", self.original_url)

# update the metadata of every video in the shelf
for index, requested_url in enumerate(requested_urls.keys()):
try:
p = process_open([lb_executable, "tubeadd", requested_url], newlines=True)
p.wait()
except Exception as e:
log.error("An error occurred during updating the metadata of %s: %s", requested_url, e)
self.message = f"{requested_url} failed: {e}"
for index, requested_url in enumerate(requested_urls.keys()):
try:
view_count = conn.execute("SELECT view_count FROM media WHERE path = ?", (requested_url,)).fetchone()[0]
time_uploaded = conn.execute("SELECT time_uploaded FROM media WHERE path = ?", (requested_url,)).fetchone()[0]
time_uploaded = datetime.utcfromtimestamp(time_uploaded)
now = datetime.now()
# calculate views per day
days_since_publish = (now - time_uploaded).days
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we force days_since_publish to be 1 at minimum (i.e. 1 day) to avoid divide-by-zero errors?

try:
requested_urls[requested_url]["views_per_day"] = view_count / days_since_publish
except ZeroDivisionError:
requested_urls[requested_url]["views_per_day"] = 0
except Exception as e:
log.error("An error occurred during the subprocess execution: %s", e)
self.message = f"{requested_url} failed: {e}"

# sort the videos by views per day and keep the top NUMBER_OF_VIDEOS (or fewer, if the dictionary has fewer entries)
requested_urls = dict(sorted(requested_urls.items(), key=lambda item: item[1]["views_per_day"], reverse=True)[:min(NUMBER_OF_VIDEOS, len(requested_urls))])

conn.close()

if self.shelf_title:
response = requests.get(self.original_url, params={"current_user_name": self.current_user_name, "shelf_title": self.shelf_title})
if response.status_code == 200:
self.shelf_id = response.json()["shelf_id"]
else:
log.error("An error occurred while trying to send the shelf title to %s", self.original_url)

num_requested_urls = len(requested_urls.keys())
total_duration = 0

Expand Down