Commit

adds configurable limits to instagram/youtube
msramalho committed Feb 25, 2024
1 parent 7de317d commit ccf5f85
Showing 3 changed files with 30 additions and 12 deletions.
34 changes: 24 additions & 10 deletions src/auto_archiver/archivers/instagram_api_archiver.py
@@ -22,6 +22,7 @@ def __init__(self, config: dict) -> None:
         super().__init__(config)
         self.assert_valid_string("access_token")
         self.assert_valid_string("api_endpoint")
+        self.full_profile_max_posts = int(self.full_profile_max_posts)
         if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1]
 
         self.full_profile = bool(self.full_profile)
@@ -33,6 +34,7 @@ def configs() -> dict:
             "access_token": {"default": None, "help": "a valid instagrapi-api token"},
             "api_endpoint": {"default": None, "help": "API endpoint to use"},
             "full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."},
+            "full_profile_max_posts": {"default": 0, "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. The limit is applied softly, since posts are fetched in batches, and is applied separately to posts, tagged posts, and highlights."},
             "minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"},
         }

@@ -117,16 +119,7 @@ def download_profile(self, result: Metadata, username: str) -> Metadata:
 
         # download all highlights
         try:
-            count_highlights = 0
-            highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
-            for h in highlights:
-                try:
-                    h_info = self._download_highlights_reusable(result, h.get("pk"))
-                    count_highlights += len(h_info.get("items", []))
-                except Exception as e:
-                    result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
-                    logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
-            result.set("#highlights", count_highlights)
+            self.download_all_highlights(result, username, user_id)
         except Exception as e:
             result.append("errors", f"Error downloading highlights for {username}")
             logger.error(f"Error downloading highlights for {username}: {e}")
@@ -135,6 +128,21 @@ def download_profile(self, result: Metadata, username: str) -> Metadata:
         result.set_url(url) # reset as scrape_item modifies it
         return result.success("insta profile")
 
+    def download_all_highlights(self, result, username, user_id):
+        count_highlights = 0
+        highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
+        for h in highlights:
+            try:
+                h_info = self._download_highlights_reusable(result, h.get("pk"))
+                count_highlights += len(h_info.get("items", []))
+            except Exception as e:
+                result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
+                logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
+            if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
+                logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                break
+        result.set("#highlights", count_highlights)
+
     def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
         if id:
             post = self.call_api(f"v1/media/by/id", {"id": id})
@@ -211,6 +219,9 @@ def download_all_posts(self, result: Metadata, user_id: str):
                     logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
                 pbar.update(1)
                 post_count+=1
+                if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
+                    logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                    break
         result.set("#posts", post_count)
 
     def download_all_tagged(self, result: Metadata, user_id: str):
@@ -233,6 +244,9 @@ def download_all_tagged(self, result: Metadata, user_id: str):
                     logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
                 pbar.update(1)
                 tagged_count+=1
+                if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
+                    logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
+                    break
         result.set("#tagged", tagged_count)


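Note on the new limit: the cap is deliberately soft. In download_all_highlights above, count_highlights grows by len(h_info.get("items", [])) per highlight and the check runs only after a whole highlight has been processed, so the final count can overshoot full_profile_max_posts; the limit is also applied independently to posts, tagged posts, and highlights. A minimal sketch of the pattern, with a hypothetical fetch_batches iterable standing in for the paginated API:

    # Soft limit: a whole batch is consumed before the check, so the total
    # may overshoot the cap; max_items=0 (falsy) disables the limit entirely.
    def collect_with_soft_limit(fetch_batches, max_items: int = 0) -> list:
        collected = []
        for batch in fetch_batches:
            collected.extend(batch)  # batch lands in full before the check
            if max_items and len(collected) >= max_items:
                break
        return collected

    # Batches of 3 with a cap of 4 stop after the second batch: prints 6.
    print(len(collect_with_soft_limit([[1, 2, 3], [4, 5, 6], [7, 8, 9]], max_items=4)))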
6 changes: 5 additions & 1 deletion src/auto_archiver/archivers/youtubedl_archiver.py
@@ -15,6 +15,8 @@ def __init__(self, config: dict) -> None:
         self.livestreams = bool(self.livestreams)
         self.live_from_start = bool(self.live_from_start)
         self.end_means_success = bool(self.end_means_success)
+        self.allow_playlist = bool(self.allow_playlist)
+        self.max_downloads = self.max_downloads
 
     @staticmethod
     def configs() -> dict:
@@ -26,6 +28,8 @@ def configs() -> dict:
             "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
             "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
             "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when yt-dlp will archive a video but ignore other types of content, like images or text-only pages, that the subsequent archivers can retrieve."},
+            "allow_playlist": {"default": False, "help": "If True, will also download playlists; set to False if the expectation is to download a single video."},
+            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
         }
 
     def download(self, item: Metadata) -> Metadata:
@@ -35,7 +39,7 @@ def download(self, item: Metadata) -> Metadata:
             logger.debug('Using Facebook cookie')
             yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
 
-        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': True, 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy}
+        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist, 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
         ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
 
         try:
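For reference, the two new yt-dlp parameters map onto documented YoutubeDL options: noplaylist suppresses playlist expansion, max_downloads aborts after that many videos have been downloaded, and playlistend stops enumerating playlist entries past that index. A minimal standalone sketch, outside the archiver (the channel URL is a placeholder and the concrete limit of 5 is illustrative):

    import yt_dlp

    allow_playlist = True  # mirrors the new 'allow_playlist' config option
    max_downloads = 5      # mirrors 'max_downloads'; the config default 'inf' means no limit

    ydl_options = {
        'outtmpl': '%(id)s.%(ext)s',
        'noplaylist': not allow_playlist,  # False lets yt-dlp expand playlists/channels
        'max_downloads': max_downloads,    # stop after this many downloads
        'playlistend': max_downloads,      # also stop enumerating entries past the cap
    }

    with yt_dlp.YoutubeDL(ydl_options) as ydl:
        # download=False extracts metadata only; set download=True to fetch media
        info = ydl.extract_info('https://www.youtube.com/@example/videos', download=False)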
2 changes: 1 addition & 1 deletion src/auto_archiver/version.py
@@ -3,7 +3,7 @@
 _MINOR = "9"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "10"
+_PATCH = "11"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""

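Taken together, a hypothetical configuration exercising both new limits might look like the following; the nesting by archiver name is illustrative, as the exact shape of the config auto_archiver consumes is not shown in this diff:

    config = {
        "instagram_api_archiver": {
            "full_profile": True,
            "full_profile_max_posts": 50,  # soft cap, applied per category
        },
        "youtubedl_archiver": {
            "allow_playlist": True,
            "max_downloads": 10,  # the default "inf" disables the cap
        },
    }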