diff --git a/src/webtoon_downloader.log b/src/webtoon_downloader.log
deleted file mode 100644
index 0c3f95b..0000000
--- a/src/webtoon_downloader.log
+++ /dev/null
@@ -1,3 +0,0 @@
-Directory Exists: [#80BBA6]Crawling_Dreams[/]
-Directory Exists: [#80BBA6]Crawling_Dreams[/]
-2023-06-14 21:48:59,718 - __main__ - WARNING - Directory Exists: [#80BBA6]Crawling_Dreams[/]
diff --git a/src/webtoon_downloader.py b/src/webtoon_downloader.py
index d75548a..0ddaaff 100644
--- a/src/webtoon_downloader.py
+++ b/src/webtoon_downloader.py
@@ -13,6 +13,7 @@
 from logging.handlers import RotatingFileHandler
 from threading import Event
 from typing import List, Union
+from urllib.parse import parse_qs, urlparse
 
 import requests
 import rich
@@ -112,31 +113,36 @@ def render(self, task) -> Text:
         )
 
 ######################## Log Configuration ################################
 console = Console()
-traceback.install(console=console, show_locals=True)
+traceback.install(console=console, show_locals=False)
 logging.getLogger("urllib3").setLevel(logging.CRITICAL)
+# create logger
 log = logging.getLogger(__name__)
-FILE_FORMAT = "%(asctime)s - %(name)-15s - %(levelname)-8s - %(filename)s - %(lineno)d - %(message)s"  # added filename and lineno
+log.setLevel(logging.DEBUG)  # set log level
+
+# create formatter
 CLI_FORMAT = "%(message)s"
+FILE_FORMAT = "%(asctime)s - %(levelname)-8s - %(message)s - %(filename)s - %(lineno)d - %(name)s"  # rearranged
+
+# create file handler
 LOG_FILENAME = "webtoon_downloader.log"
-file_handler = RotatingFileHandler(LOG_FILENAME, maxBytes=10000, backupCount=10)
-file_handler.setFormatter(logging.Formatter(FILE_FORMAT))
+file_handler = RotatingFileHandler(
+    LOG_FILENAME, maxBytes=1000000, backupCount=10, encoding="utf-8"
+)
 file_handler.setLevel(logging.DEBUG)
+file_handler.setFormatter(logging.Formatter(FILE_FORMAT))
-logging.basicConfig(
-    level="WARNING",
-    format=CLI_FORMAT,
-    datefmt="[%X]",
-    handlers=[
-        file_handler,
-        RichHandler(
-            console=progress.console,
-            rich_tracebacks=True,
-            tracebacks_show_locals=True,
-            markup=True,
-        ),
-    ],
+console_handler = RichHandler(
+    level=logging.WARNING,
+    console=console,
+    rich_tracebacks=True,
+    tracebacks_show_locals=False,
+    markup=True,
 )
-###########################################################################
+console_handler.setLevel(logging.WARNING)
+console_handler.setFormatter(logging.Formatter(CLI_FORMAT))
+log.addHandler(file_handler)
+log.addHandler(console_handler)
+##################################################################
 
 done_event = Event()
 N_CONCURRENT_CHAPTERS_DOWNLOAD = 4
@@ -195,21 +201,48 @@ def get_chapter_viewer_url(html: Union[str, BeautifulSoup]) -> str:
             .find("a")["href"]
             .split("&")[0]
         )
-    elif isinstance(html, BeautifulSoup):
+
+    if isinstance(html, BeautifulSoup):
         return (
             html.find("li", attrs={"data-episode-no": True})
             .find("a")["href"]
             .split("&")[0]
         )
-    else:
-        raise TypeError(
-            "variable passed is neither a string nor a BeautifulSoup object"
+
+    raise TypeError("variable passed is neither a string nor a BeautifulSoup object")
+
+
+def get_first_chapter_episode_no(session: requests.session, series_url: str) -> int:
+    """
+    Fetches the 'episode_no' field that is used in a Webtoon's viewer. Most series should start with episode 1
+    but some start at 2, and others might even be 80+.
+
+    Args:
+        session: The session with which to send HTTP requests.
+        series_url: The URL of the Webtoon series.
+
+    Returns:
+        The episode number of the first chapter of the Webtoon series.
+    """
+
+    try:
+        # First attempt to fetch the episode_no
+        soup = BeautifulSoup(session.get(series_url).text, "html.parser")
+        href = soup.find("a", id="_btnEpisode")["href"]
+        return int(parse_qs(urlparse(href).query)["episode_no"][0])
+    except (TypeError, KeyError):
+        # If the first attempt fails, try to get the first episode from the list
+        soup = BeautifulSoup(session.get(f"{series_url}&page=9999").text, "html.parser")
+        return min(
+            int(episode["data-episode-no"])
+            for episode in soup.find_all("li", {"class": "_episodeItem"})
         )
 
 
 def get_chapters_details(
     session: requests.session,
     viewer_url: str,
+    series_url: str,
     start_chapter: int = 1,
     end_chapter: int = None,
 ) -> List[ChapterInfo]:
@@ -236,10 +269,10 @@ def get_chapters_details(
     ----------
     (list[ChapterInfo]): list of all chapter details extracted.
     """
-    resp = session.get(f"{viewer_url}&episode_no=1")
-    if resp.status_code == 404:
-        resp = session.get(f"{viewer_url}&episode_no=2")
-
+    first_chapter = get_first_chapter_episode_no(session, series_url)
+    episode_url = f"{viewer_url}&episode_no={first_chapter}"
+    log.info("Sending request to '%s'", episode_url)
+    resp = session.get(episode_url)
     soup = BeautifulSoup(resp.text, "lxml")
     chapter_details = [
         ChapterInfo(
@@ -390,7 +423,9 @@ def download_chapter(
         destination folder path to store the downloaded image files.
         (default: current working directory)
     """
-    log.debug("[italic red]Accessing[/italic red] chapter %d", chapter_info)
+    log.debug(
+        "[italic red]Accessing[/italic red] chapter %d", chapter_info.chapter_number
+    )
     img_urls = get_img_urls(
         session=session,
         viewer_url=viewer_url,
@@ -416,6 +451,7 @@ def download_chapter(
         )
         if done_event.is_set():
             return
+
     log.info(
         "Chapter %d download complete with a total of %d pages [green]✓",
         chapter_info.chapter_number,
@@ -464,8 +500,8 @@ def download_webtoon(
     session.cookies.set("needGDPR", "FALSE", domain=".webtoons.com")
     session.cookies.set("needCCPA", "FALSE", domain=".webtoons.com")
     session.cookies.set("needCOPPA", "FALSE", domain=".webtoons.com")
-    r = session.get(f"{series_url}", headers=headers)
-    soup = BeautifulSoup(r.text, "lxml")
+    resp = session.get(series_url, headers=headers)
+    soup = BeautifulSoup(resp.text, "lxml")
     viewer_url = get_chapter_viewer_url(soup)
     series_title = get_series_title(series_url, soup)
     if not (dest):
@@ -483,11 +519,13 @@ def download_webtoon(
     )
     with progress:
         if download_latest_chapter:
-            chapters_to_download = [get_chapters_details(session, viewer_url)[-1]]
+            chapters_to_download = [
+                get_chapters_details(session, viewer_url, series_url)[-1]
+            ]
 
         else:
             chapters_to_download = get_chapters_details(
-                session, viewer_url, start_chapter, end_chapter
+                session, viewer_url, series_url, start_chapter, end_chapter
             )
 
         start_chapter, end_chapter = (
@@ -608,15 +646,19 @@ def main():
         return
     series_url = args.url
     separate = args.seperate or args.separate
-    download_webtoon(
-        series_url,
-        args.start,
-        args.end,
-        args.dest,
-        args.images_format,
-        args.latest,
-        separate,
-    )
+    try:
+        download_webtoon(
+            series_url,
+            args.start,
+            args.end,
+            args.dest,
+            args.images_format,
+            args.latest,
+            separate,
+        )
+    except Exception as exc:
+        log.exception(exc)
+        sys.exit(1)
 
 
 if __name__ == "__main__":
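The episode-number lookup added by this patch can be exercised without any network access: the new helper only parses a query string with `urllib.parse` and, as a fallback, scans `data-episode-no` attributes with BeautifulSoup. Below is a minimal sketch of those two paths, using a hypothetical href and a hand-written HTML snippet in place of the live responses fetched via `session.get`.

```python
# Minimal sketch of the two extraction paths used by get_first_chapter_episode_no.
# The href and the HTML snippet are hypothetical stand-ins for real Webtoon pages.
from urllib.parse import parse_qs, urlparse

from bs4 import BeautifulSoup

# Path 1: read episode_no from the "_btnEpisode" link's query string.
href = "https://www.webtoons.com/en/fantasy/example-series/episode-1/viewer?title_no=1234&episode_no=7"
print(int(parse_qs(urlparse(href).query)["episode_no"][0]))  # -> 7

# Path 2 (fallback): take the smallest data-episode-no from the episode list.
html = """
<ul>
  <li class="_episodeItem" data-episode-no="9"></li>
  <li class="_episodeItem" data-episode-no="7"></li>
  <li class="_episodeItem" data-episode-no="8"></li>
</ul>
"""
soup = BeautifulSoup(html, "html.parser")
print(min(int(li["data-episode-no"]) for li in soup.find_all("li", {"class": "_episodeItem"})))  # -> 7
```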