From ccf5f857eff4d1317e21cc1fe99a15cfb8ac0d5c Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Sun, 25 Feb 2024 15:14:17 +0000
Subject: [PATCH] adds configurable limits to instagram/youtube

---
 .../archivers/instagram_api_archiver.py      | 34 +++++++++++++------
 .../archivers/youtubedl_archiver.py          |  6 +++-
 src/auto_archiver/version.py                 |  2 +-
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/src/auto_archiver/archivers/instagram_api_archiver.py b/src/auto_archiver/archivers/instagram_api_archiver.py
index 5afaab2..5f68f59 100644
--- a/src/auto_archiver/archivers/instagram_api_archiver.py
+++ b/src/auto_archiver/archivers/instagram_api_archiver.py
@@ -22,6 +22,7 @@ def __init__(self, config: dict) -> None:
         super().__init__(config)
         self.assert_valid_string("access_token")
         self.assert_valid_string("api_endpoint")
+        self.full_profile_max_posts = int(self.full_profile_max_posts)
         if self.api_endpoint[-1] == "/":
             self.api_endpoint = self.api_endpoint[:-1]
         self.full_profile = bool(self.full_profile)
@@ -33,6 +34,7 @@ def configs() -> dict:
             "access_token": {"default": None, "help": "a valid instagrapi-api token"},
             "api_endpoint": {"default": None, "help": "API endpoint to use"},
             "full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."},
+            "full_profile_max_posts": {"default": 0, "help": "use to limit the number of posts to download when full_profile is true; 0 means no limit. The limit is applied softly, since posts are fetched in batches, and it applies separately to each of: posts, tagged posts, and highlights."},
             "minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"},
         }

@@ -117,16 +119,7 @@ def download_profile(self, result: Metadata, username: str) -> Metadata:
         # download all highlights
         try:
-            count_highlights = 0
-            highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
-            for h in highlights:
-                try:
-                    h_info = self._download_highlights_reusable(result, h.get("pk"))
-                    count_highlights += len(h_info.get("items", []))
-                except Exception as e:
-                    result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
-                    logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
-            result.set("#highlights", count_highlights)
+            self.download_all_highlights(result, username, user_id)
         except Exception as e:
             result.append("errors", f"Error downloading highlights for {username}")
             logger.error(f"Error downloading highlights for {username}: {e}")

@@ -135,6 +128,21 @@
         result.set_url(url) # reset as scrape_item modifies it
         return result.success("insta profile")

+    def download_all_highlights(self, result, username, user_id):
+        count_highlights = 0
+        highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
+        for h in highlights:
+            try:
+                h_info = self._download_highlights_reusable(result, h.get("pk"))
+                count_highlights += len(h_info.get("items", []))
+            except Exception as e:
+                result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
+                logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
+            if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
+                logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                break
+        result.set("#highlights", count_highlights)
+
     def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
         if id:
             post = self.call_api(f"v1/media/by/id", {"id": id})
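A note on the new limit's semantics: count_highlights is only compared against full_profile_max_posts after a whole highlight's items have been counted, so the final tally can overshoot the configured value by up to the size of the last batch. A standalone sketch of this soft-limit pattern (hypothetical names, not code from this patch):

    def count_with_soft_limit(batches: list, max_items: int = 0) -> int:
        # each batch is counted whole before the limit is checked,
        # so the result may overshoot max_items; 0 disables the limit
        count = 0
        for batch in batches:
            count += len(batch)
            if max_items and count >= max_items:
                break
        return count

    print(count_with_soft_limit([[1, 2], [3, 4, 5]], max_items=3))  # 5, not 3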
@@ -211,6 +219,9 @@ def download_all_posts(self, result: Metadata, user_id: str):
                     logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
                 pbar.update(1)
                 post_count+=1
+                if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
+                    logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                    break
         result.set("#posts", post_count)

     def download_all_tagged(self, result: Metadata, user_id: str):
@@ -233,6 +244,9 @@
                     logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
                 pbar.update(1)
                 tagged_count+=1
+                if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
+                    logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
+                    break
         result.set("#tagged", tagged_count)
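Since the same cap is applied independently in download_all_posts, download_all_tagged, and download_all_highlights, a run with full_profile_max_posts=10 can archive up to roughly 30 items in total, plus any batch overshoot. A quick sanity check of that behavior, ignoring overshoot and with hypothetical counts:

    full_profile_max_posts = 10
    available = {"posts": 37, "tagged": 12, "highlights": 4}

    # each category is capped on its own; the caps do not share one budget
    archived = {name: min(n, full_profile_max_posts) for name, n in available.items()}
    print(archived)                # {'posts': 10, 'tagged': 10, 'highlights': 4}
    print(sum(archived.values()))  # 24 items in total, not 10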
diff --git a/src/auto_archiver/archivers/youtubedl_archiver.py b/src/auto_archiver/archivers/youtubedl_archiver.py
index 06c43cd..ab65bcc 100644
--- a/src/auto_archiver/archivers/youtubedl_archiver.py
+++ b/src/auto_archiver/archivers/youtubedl_archiver.py
@@ -15,6 +15,8 @@ def __init__(self, config: dict) -> None:
         self.livestreams = bool(self.livestreams)
         self.live_from_start = bool(self.live_from_start)
         self.end_means_success = bool(self.end_means_success)
+        self.allow_playlist = bool(self.allow_playlist)
+        self.max_downloads = self.max_downloads

     @staticmethod
     def configs() -> dict:
@@ -26,6 +28,8 @@ def configs() -> dict:
             "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
             "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
             "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
+            "allow_playlist": {"default": False, "help": "if True, will also download playlists; set to False if the expectation is to download a single video."},
+            "max_downloads": {"default": "inf", "help": "use to limit the number of videos to download when a channel or long page is being extracted; 'inf' means no limit."},
         }

     def download(self, item: Metadata) -> Metadata:
@@ -35,7 +39,7 @@ def download(self, item: Metadata) -> Metadata:
             logger.debug('Using Facebook cookie')
             yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie

-        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': True, 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy}
+        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist, 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
         ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"

         try:
diff --git a/src/auto_archiver/version.py b/src/auto_archiver/version.py
index cfca40c..d30622e 100644
--- a/src/auto_archiver/version.py
+++ b/src/auto_archiver/version.py
@@ -3,7 +3,7 @@
 _MINOR = "9"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "10"
+_PATCH = "11"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""
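For reference, the three yt-dlp options wired up in youtubedl_archiver.py play different roles: noplaylist skips playlist expansion when a URL points at both a single video and a playlist, playlistend stops playlist extraction after the n-th entry, and max_downloads aborts once that many files have been downloaded (yt-dlp signals this by raising yt_dlp.utils.MaxDownloadsReached). A minimal standalone sketch of the resulting options, with hardcoded illustrative values and a placeholder URL:

    import yt_dlp

    allow_playlist = True
    max_downloads = 5  # the config default of "inf" stands for no limit

    ydl_options = {
        "outtmpl": "%(id)s.%(ext)s",
        "quiet": False,
        "noplaylist": not allow_playlist,  # False: playlist URLs are expanded
        "playlistend": max_downloads,      # stop extracting entries past this index
        "max_downloads": max_downloads,    # abort after this many downloads
    }

    with yt_dlp.YoutubeDL(ydl_options) as ydl:
        # download=False extracts metadata only, which is enough to see
        # playlistend truncating the list of entries
        info = ydl.extract_info("https://www.youtube.com/playlist?list=EXAMPLE", download=False)
        print(len(list(info.get("entries", []))))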