Commit

adds configurable limits to instagram/youtube
msramalho committed Feb 25, 2024
1 parent 7de317d commit ccf5f85
Showing 3 changed files with 30 additions and 12 deletions.
34 changes: 24 additions & 10 deletions src/auto_archiver/archivers/instagram_api_archiver.py
@@ -22,6 +22,7 @@ def __init__(self, config: dict) -> None:
         super().__init__(config)
         self.assert_valid_string("access_token")
         self.assert_valid_string("api_endpoint")
+        self.full_profile_max_posts = int(self.full_profile_max_posts)
         if self.api_endpoint[-1] == "/": self.api_endpoint = self.api_endpoint[:-1]
 
         self.full_profile = bool(self.full_profile)
@@ -33,6 +34,7 @@ def configs() -> dict:
             "access_token": {"default": None, "help": "a valid instagrapi-api token"},
             "api_endpoint": {"default": None, "help": "API endpoint to use"},
             "full_profile": {"default": False, "help": "if true, will download all posts, tagged posts, stories, and highlights for a profile, if false, will only download the profile pic and information."},
+            "full_profile_max_posts": {"default": 0, "help": "Use to limit the number of posts to download when full_profile is true. 0 means no limit. The limit is applied softly, since posts are fetched in batches, and is applied separately to posts, tagged posts, and highlights."},
             "minimize_json_output": {"default": True, "help": "if true, will remove empty values from the json output"},
         }

@@ -117,16 +119,7 @@ def download_profile(self, result: Metadata, username: str) -> Metadata:
 
         # download all highlights
         try:
-            count_highlights = 0
-            highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
-            for h in highlights:
-                try:
-                    h_info = self._download_highlights_reusable(result, h.get("pk"))
-                    count_highlights += len(h_info.get("items", []))
-                except Exception as e:
-                    result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
-                    logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
-            result.set("#highlights", count_highlights)
+            self.download_all_highlights(result, username, user_id)
         except Exception as e:
             result.append("errors", f"Error downloading highlights for {username}")
             logger.error(f"Error downloading highlights for {username}: {e}")
@@ -135,6 +128,21 @@ def download_profile(self, result: Metadata, username: str) -> Metadata:
         result.set_url(url) # reset as scrape_item modifies it
         return result.success("insta profile")
 
+    def download_all_highlights(self, result, username, user_id):
+        count_highlights = 0
+        highlights = self.call_api(f"v1/user/highlights", {"user_id": user_id})
+        for h in highlights:
+            try:
+                h_info = self._download_highlights_reusable(result, h.get("pk"))
+                count_highlights += len(h_info.get("items", []))
+            except Exception as e:
+                result.append("errors", f"Error downloading highlight id{h.get('pk')} for {username}")
+                logger.error(f"Error downloading highlight id{h.get('pk')} for {username}: {e}")
+            if self.full_profile_max_posts and count_highlights >= self.full_profile_max_posts:
+                logger.info(f"HIGHLIGHTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                break
+        result.set("#highlights", count_highlights)
+
     def download_post(self, result: Metadata, code: str = None, id: str = None, context: str = None) -> Metadata:
         if id:
             post = self.call_api(f"v1/media/by/id", {"id": id})
@@ -211,6 +219,9 @@ def download_all_posts(self, result: Metadata, user_id: str):
                     logger.error(f"Error downloading post, skipping {p.get('id')}: {e}")
                 pbar.update(1)
                 post_count+=1
+                if self.full_profile_max_posts and post_count >= self.full_profile_max_posts:
+                    logger.info(f"POSTS reached full_profile_max_posts={self.full_profile_max_posts}")
+                    break
         result.set("#posts", post_count)
 
     def download_all_tagged(self, result: Metadata, user_id: str):
@@ -233,6 +244,9 @@ def download_all_tagged(self, result: Metadata, user_id: str):
                     logger.error(f"Error downloading tagged post, skipping {p.get('id')}: {e}")
                 pbar.update(1)
                 tagged_count+=1
+                if self.full_profile_max_posts and tagged_count >= self.full_profile_max_posts:
+                    logger.info(f"TAGS reached full_profile_max_posts={self.full_profile_max_posts}")
+                    break
         result.set("#tagged", tagged_count)


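Note on the new limit: the cap is deliberately soft. In download_all_highlights above, count_highlights grows by len(h_info.get("items", [])) per highlight and the check runs only after a whole highlight has been processed, so the final count can overshoot full_profile_max_posts; the limit is also applied independently to posts, tagged posts, and highlights. A minimal sketch of the pattern, with a hypothetical fetch_batches iterable standing in for the paginated API:

    # Soft limit: a whole batch is consumed before the check, so the total
    # may overshoot the cap; max_items=0 (falsy) disables the limit entirely.
    def collect_with_soft_limit(fetch_batches, max_items: int = 0) -> list:
        collected = []
        for batch in fetch_batches:
            collected.extend(batch)  # batch lands in full before the check
            if max_items and len(collected) >= max_items:
                break
        return collected

    # Batches of 3 with a cap of 4 stop after the second batch: prints 6.
    print(len(collect_with_soft_limit([[1, 2, 3], [4, 5, 6], [7, 8, 9]], max_items=4)))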
6 changes: 5 additions & 1 deletion src/auto_archiver/archivers/youtubedl_archiver.py
@@ -15,6 +15,8 @@ def __init__(self, config: dict) -> None:
         self.livestreams = bool(self.livestreams)
         self.live_from_start = bool(self.live_from_start)
         self.end_means_success = bool(self.end_means_success)
+        self.allow_playlist = bool(self.allow_playlist)
+        self.max_downloads = self.max_downloads
 
     @staticmethod
     def configs() -> dict:
@@ -26,6 +28,8 @@ def configs() -> dict:
             "live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
             "proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
             "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when yt-dlp will archive a video but ignore other types of content, like images or text-only pages, that the subsequent archivers can retrieve."},
+            "allow_playlist": {"default": False, "help": "If True, will also download playlists; set to False if the expectation is to download a single video."},
+            "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
         }
 
     def download(self, item: Metadata) -> Metadata:
@@ -35,7 +39,7 @@ def download(self, item: Metadata) -> Metadata:
             logger.debug('Using Facebook cookie')
             yt_dlp.utils.std_headers['cookie'] = self.facebook_cookie
 
-        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': True, 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy}
+        ydl_options = {'outtmpl': os.path.join(ArchivingContext.get_tmp_dir(), f'%(id)s.%(ext)s'), 'quiet': False, 'noplaylist': not self.allow_playlist, 'writesubtitles': self.subtitles, 'writeautomaticsub': self.subtitles, "live_from_start": self.live_from_start, "proxy": self.proxy, "max_downloads": self.max_downloads, "playlistend": self.max_downloads}
         ydl = yt_dlp.YoutubeDL(ydl_options) # allsubtitles and subtitleslangs not working as expected, so default lang is always "en"
 
         try:
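For reference, the two new yt-dlp parameters map onto documented YoutubeDL options: noplaylist suppresses playlist expansion, max_downloads aborts after that many videos have been downloaded, and playlistend stops enumerating playlist entries past that index. A minimal standalone sketch, outside the archiver (the channel URL is a placeholder and the concrete limit of 5 is illustrative):

    import yt_dlp

    allow_playlist = True  # mirrors the new 'allow_playlist' config option
    max_downloads = 5      # mirrors 'max_downloads'; the config default 'inf' means no limit

    ydl_options = {
        'outtmpl': '%(id)s.%(ext)s',
        'noplaylist': not allow_playlist,  # False lets yt-dlp expand playlists/channels
        'max_downloads': max_downloads,    # stop after this many downloads
        'playlistend': max_downloads,      # also stop enumerating entries past the cap
    }

    with yt_dlp.YoutubeDL(ydl_options) as ydl:
        # download=False extracts metadata only; set download=True to fetch media
        info = ydl.extract_info('https://www.youtube.com/@example/videos', download=False)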
2 changes: 1 addition & 1 deletion src/auto_archiver/version.py
@@ -3,7 +3,7 @@
 _MINOR = "9"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "10"
+_PATCH = "11"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""

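Taken together, a hypothetical configuration exercising both new limits might look like the following; the nesting by archiver name is illustrative, as the exact shape of the config auto_archiver consumes is not shown in this diff:

    config = {
        "instagram_api_archiver": {
            "full_profile": True,
            "full_profile_max_posts": 50,  # soft cap, applied per category
        },
        "youtubedl_archiver": {
            "allow_playlist": True,
            "max_downloads": 10,  # the default "inf" disables the cap
        },
    }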