diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d9352fedd871..55cf3b3a271c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -360,7 +360,7 @@ jobs: - name: Install Requirements run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build - python devscripts/install_deps.py --include py2exe --include curl-cffi + python devscripts/install_deps.py --include curl-cffi python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.8.0-py3-none-any.whl" - name: Prepare @@ -369,12 +369,20 @@ jobs: python devscripts/make_lazy_extractors.py - name: Build run: | - python -m bundle.py2exe - Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_min.exe python -m bundle.pyinstaller python -m bundle.pyinstaller --onedir + Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_real.exe Compress-Archive -Path ./dist/yt-dlp/* -DestinationPath ./dist/yt-dlp_win.zip + - name: Install Requirements (py2exe) + run: | + python devscripts/install_deps.py --include py2exe + - name: Build (py2exe) + run: | + python -m bundle.py2exe + Move-Item ./dist/yt-dlp.exe ./dist/yt-dlp_min.exe + Move-Item ./dist/yt-dlp_real.exe ./dist/yt-dlp.exe + - name: Verify --update-to if: vars.UPDATE_TO_VERIFICATION run: | diff --git a/README.md b/README.md index e3257682b56c..887cfde2319a 100644 --- a/README.md +++ b/README.md @@ -263,7 +263,7 @@ You can also run `make yt-dlp` instead to compile only the binary without updati ### Standalone Py2Exe Builds (Windows) -While we provide the option to build with [py2exe](https://www.py2exe.org), it is recommended to build [using PyInstaller](#standalone-pyinstaller-builds) instead since the py2exe builds **cannot contain `pycryptodomex`/`certifi` and needs VC++14** on the target computer to run. +While we provide the option to build with [py2exe](https://www.py2exe.org), it is recommended to build [using PyInstaller](#standalone-pyinstaller-builds) instead since the py2exe builds **cannot contain `pycryptodomex`/`certifi` and need VC++14** on the target computer to run. If you wish to build it anyway, install Python (if it is not already installed) and you can run the following commands: @@ -666,7 +666,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git The name of the browser to load cookies from. Currently supported browsers are: brave, chrome, chromium, edge, firefox, - opera, safari, vivaldi. Optionally, the + opera, safari, vivaldi, whale. Optionally, the KEYRING used for decrypting Chromium cookies on Linux, the name/path of the PROFILE to load cookies from, and the CONTAINER name @@ -1760,7 +1760,7 @@ The following extractors use this feature: #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. 
The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen`, `mediaconnect` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. The `android` clients will always be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) @@ -1813,8 +1813,9 @@ The following extractors use this feature: * `app_name`: Default app name to use with mobile API calls, e.g. `trill` * `app_version`: Default app version to use with mobile API calls - should be set along with `manifest_app_version`, e.g. `34.1.2` * `manifest_app_version`: Default numeric app version to use with mobile API calls, e.g. `2023401020` -* `aid`: Default app ID to use with API calls, e.g. `1180` -* `app_info`: One or more app info strings in the format of `/[app_name]/[app_version]/[manifest_app_version]/[aid]`, where `iid` is the unique app install ID. `iid` is the only required value; all other values and their `/` separators can be omitted, e.g. `tiktok:app_info=1234567890123456789` or `tiktok:app_info=123,456/trill///1180,789//34.0.1/340001` +* `aid`: Default app ID to use with mobile API calls, e.g. `1180` +* `app_info`: Enable mobile API extraction with one or more app info strings in the format of `/[app_name]/[app_version]/[manifest_app_version]/[aid]`, where `iid` is the unique app install ID. `iid` is the only required value; all other values and their `/` separators can be omitted, e.g. `tiktok:app_info=1234567890123456789` or `tiktok:app_info=123,456/trill///1180,789//34.0.1/340001` +* `device_id`: Enable mobile API extraction with a genuine device ID to be used with mobile API calls. Default is a random 19-digit string #### rokfinchannel * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` @@ -1840,6 +1841,9 @@ The following extractors use this feature: #### afreecatvlive * `cdn`: One or more CDN IDs to use with the API call for stream URLs, e.g. `gcp_cdn`, `gs_cdn_pc_app`, `gs_cdn_mobile_web`, `gs_cdn_pc_web` +#### soundcloud +* `formats`: Formats to request from the API. 
Requested values should be in the format of `{protocol}_{extension}` (omitting the bitrate), e.g. `hls_opus,http_aac`. The `*` character functions as a wildcard, e.g. `*_mp3`, and can be passed by itself to request all formats. Known protocols include `http`, `hls` and `hls-aes`; known extensions include `aac`, `opus` and `mp3`. Original `download` formats are always extracted. Default is `http_aac,hls_aac,http_opus,hls_opus,http_mp3,hls_mp3` + **Note**: These options may be changed/removed in the future without concern for backward compatibility diff --git a/pyproject.toml b/pyproject.toml index 5fadd14495ae..8e3bce4bfc48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ build = [ "build", "hatchling", "pip", + "setuptools>=66.1.0,<70", "wheel", ] dev = [ @@ -73,7 +74,10 @@ pyinstaller = [ "pyinstaller>=6.3; sys_platform!='darwin'", "pyinstaller==5.13.2; sys_platform=='darwin'", # needed for curl_cffi ] -py2exe = ["py2exe>=0.12"] +py2exe = [ + "py2exe>=0.12", + "requests==2.31.*", +] [project.urls] Documentation = "https://github.com/yt-dlp/yt-dlp#readme" diff --git a/test/test_networking.py b/test/test_networking.py index 994467014d36..d127cbb94c46 100644 --- a/test/test_networking.py +++ b/test/test_networking.py @@ -6,7 +6,7 @@ import pytest -from yt_dlp.networking.common import Features +from yt_dlp.networking.common import Features, DEFAULT_TIMEOUT sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -523,20 +523,17 @@ def test_read_timeout(self, handler): def test_connect_timeout(self, handler): # nothing should be listening on this port connect_timeout_url = 'http://10.255.255.255' - with handler(timeout=0.01) as rh: + with handler(timeout=0.01) as rh, pytest.raises(TransportError): now = time.time() - with pytest.raises(TransportError): - validate_and_send( - rh, Request(connect_timeout_url)) - assert 0.01 <= time.time() - now < 20 + validate_and_send(rh, Request(connect_timeout_url)) + assert time.time() - now < DEFAULT_TIMEOUT - with handler() as rh: - with pytest.raises(TransportError): - # Per request timeout, should override handler timeout - now = time.time() - validate_and_send( - rh, Request(connect_timeout_url, extensions={'timeout': 0.01})) - assert 0.01 <= time.time() - now < 20 + # Per request timeout, should override handler timeout + request = Request(connect_timeout_url, extensions={'timeout': 0.01}) + with handler() as rh, pytest.raises(TransportError): + now = time.time() + validate_and_send(rh, request) + assert time.time() - now < DEFAULT_TIMEOUT def test_source_address(self, handler): source_address = f'127.0.0.{random.randint(5, 255)}'
extensions=extensions)) + def test_connect_timeout(self, handler): + # nothing should be listening on this port + connect_timeout_url = 'ws://10.255.255.255' + with handler(timeout=0.01) as rh, pytest.raises(TransportError): + now = time.time() + ws_validate_and_send(rh, Request(connect_timeout_url)) + assert time.time() - now < DEFAULT_TIMEOUT + + # Per request timeout, should override handler timeout + request = Request(connect_timeout_url, extensions={'timeout': 0.01}) + with handler() as rh, pytest.raises(TransportError): + now = time.time() + ws_validate_and_send(rh, request) + assert time.time() - now < DEFAULT_TIMEOUT + def test_cookies(self, handler): cookiejar = YoutubeDLCookieJar() cookiejar.set_cookie(http.cookiejar.Cookie( diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 0de0672e1236..815897d5a5ac 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -46,7 +46,7 @@ from .utils._utils import _YDLLogger from .utils.networking import normalize_url -CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} +CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'whale'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} @@ -219,6 +219,7 @@ def _get_chromium_based_browser_settings(browser_name): 'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'), 'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'), 'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'), + 'whale': os.path.join(appdata_local, R'Naver\Naver Whale\User Data'), }[browser_name] elif sys.platform == 'darwin': @@ -230,6 +231,7 @@ def _get_chromium_based_browser_settings(browser_name): 'edge': os.path.join(appdata, 'Microsoft Edge'), 'opera': os.path.join(appdata, 'com.operasoftware.Opera'), 'vivaldi': os.path.join(appdata, 'Vivaldi'), + 'whale': os.path.join(appdata, 'Naver/Whale'), }[browser_name] else: @@ -241,6 +243,7 @@ def _get_chromium_based_browser_settings(browser_name): 'edge': os.path.join(config, 'microsoft-edge'), 'opera': os.path.join(config, 'opera'), 'vivaldi': os.path.join(config, 'vivaldi'), + 'whale': os.path.join(config, 'naver-whale'), }[browser_name] # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE: @@ -252,6 +255,7 @@ def _get_chromium_based_browser_settings(browser_name): 'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium', 'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium', 'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome', + 'whale': 'Whale', }[browser_name] browsers_without_profiles = {'opera'} diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index 1c180b1fd5b6..46fe006cc93b 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -5,6 +5,7 @@ ExtractorError, GeoRestrictedError, int_or_none, + join_nonempty, parse_iso8601, parse_qs, strip_or_none, @@ -31,20 +32,6 @@ class ArteTVIE(ArteTVBaseIE): _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', 'only_matching': True, - }, { - 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', - 'info_dict': { - 'id': '100103-000-A', - 'title': 'USA: Dyskryminacja na porodówce', - 'description': 'md5:242017b7cce59ffae340a54baefcafb1', - 'alt_title': 'ARTE Reportage', - 'upload_date': '20201103', - 'duration': 554, - 'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530', - 'timestamp': 1604417980, - 'ext': 'mp4', - }, - 'params': 
{'skip_download': 'm3u8'} }, { 'note': 'No alt_title', 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/', @@ -58,6 +45,23 @@ class ArteTVIE(ArteTVBaseIE): }, { 'url': 'https://www.arte.tv/de/videos/110203-006-A/zaz/', 'only_matching': True, + }, { + 'url': 'https://www.arte.tv/fr/videos/109067-000-A/la-loi-de-teheran/', + 'info_dict': { + 'id': '109067-000-A', + 'ext': 'mp4', + 'description': 'md5:d2ca367b8ecee028dddaa8bd1aebc739', + 'timestamp': 1713927600, + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/3rR6PLzfbigSkkeHtkCZNF/940x530', + 'duration': 7599, + 'title': 'La loi de Téhéran', + 'upload_date': '20240424', + 'subtitles': { + 'fr': 'mincount:1', + 'fr-acc': 'mincount:1', + 'fr-forced': 'mincount:1', + }, + }, }, { 'note': 'age-restricted', 'url': 'https://www.arte.tv/de/videos/006785-000-A/the-element-of-crime/', @@ -71,23 +75,7 @@ class ArteTVIE(ArteTVBaseIE): 'upload_date': '20230930', 'ext': 'mp4', }, - }, { - 'url': 'https://www.arte.tv/de/videos/085374-003-A/im-hohen-norden-geboren/', - 'info_dict': { - 'id': '085374-003-A', - 'ext': 'mp4', - 'description': 'md5:ab79ec7cc472a93164415b4e4916abf9', - 'timestamp': 1702872000, - 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/TnyHBfPxv3v2GEY3suXGZP/940x530', - 'duration': 2594, - 'title': 'Die kurze Zeit der Jugend', - 'alt_title': 'Im hohen Norden geboren', - 'upload_date': '20231218', - 'subtitles': { - 'fr': 'mincount:1', - 'fr-acc': 'mincount:1', - }, - }, + 'skip': '404 Not Found', }] _GEO_BYPASS = True @@ -143,16 +131,18 @@ def _fix_accessible_subs_locale(subs): updated_subs = {} for lang, sub_formats in subs.items(): for fmt in sub_formats: - if fmt.get('url', '').endswith('-MAL.m3u8'): - lang += '-acc' - updated_subs.setdefault(lang, []).append(fmt) + url = fmt.get('url') or '' + suffix = ('acc' if url.endswith('-MAL.m3u8') + else 'forced' if '_VO' not in url + else None) + updated_subs.setdefault(join_nonempty(lang, suffix), []).append(fmt) return updated_subs def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') lang = mobj.group('lang') or mobj.group('lang_2') - langauge_code = self._LANG_MAP.get(lang) + language_code = self._LANG_MAP.get(lang) config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id, headers={ 'x-validated-age': '18' @@ -180,10 +170,10 @@ def _real_extract(self, url): m = self._VERSION_CODE_RE.match(stream_version_code) if m: lang_pref = int(''.join('01'[x] for x in ( - m.group('vlang') == langauge_code, # we prefer voice in the requested language + m.group('vlang') == language_code, # we prefer voice in the requested language not m.group('audio_desc'), # and not the audio description version bool(m.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice - m.group('sub_lang') == langauge_code, # if subtitles are present, we prefer them in the requested language + m.group('sub_lang') == language_code, # if subtitles are present, we prefer them in the requested language not m.group('has_sub'), # but we prefer no subtitles otherwise not m.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles ))) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 015af9e1d616..f6b58b361f87 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'url': 'http://www.bbc.com/news/world-europe-32668511', 
'info_dict': { 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade', + 'title': 'Russia stages massive WW2 parade despite Western boycott', 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 'playlist_count': 2, @@ -623,6 +623,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': '3662a707-0af9-3149-963f-47bea720b460', 'title': 'BUGGER', + 'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$', }, 'playlist_count': 18, }, { @@ -631,14 +632,14 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': 'p02mprgb', 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'md5:2868290467291b37feda7863f7a83f54', + 'title': 'Germanwings crash site aerial video', + 'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$', 'duration': 47, 'timestamp': 1427219242, 'upload_date': '20150324', + 'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg', }, 'params': { - # rtmp download 'skip_download': True, } }, { @@ -656,21 +657,24 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'params': { 'skip_download': True, - } + }, + 'skip': 'now SIMORGH_DATA with no video', }, { # single video embedded with data-playable containing XML playlists (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'info_dict': { - 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', + 'id': '39275083', + 'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', - 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', + 'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', 'timestamp': 1434713142, 'upload_date': '20150619', + 'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg', }, 'params': { 'skip_download': True, - } + }, }, { # single video from video playlist embedded with vxp-playlist-data JSON 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', @@ -683,22 +687,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE }, 'params': { 'skip_download': True, - } + }, + 'skip': '404 Not Found', }, { - # single video story with digitalData + # single video story with __PWA_PRELOADED_STATE__ 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', 'info_dict': { 'id': 'p02q6gc4', - 'ext': 'flv', - 'title': 'Sri Lanka’s spicy secret', - 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', - 'timestamp': 1437674293, - 'upload_date': '20150723', + 'ext': 'mp4', + 'title': 'Tasting the spice of life in Jaffna', + 'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$', + 'timestamp': 1646058397, + 'upload_date': '20220228', + 'duration': 255, + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg', }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { # single video story without digitalData 'url': 
'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', @@ -710,12 +713,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'timestamp': 1415867444, 'upload_date': '20141113', }, - 'params': { - # rtmp download - 'skip_download': True, - } + 'skip': 'redirects to TopGear home page', }, { # single video embedded with Morph + # TODO: replacement test page 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', 'info_dict': { 'id': 'p041vhd0', @@ -726,27 +727,22 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'uploader': 'BBC Sport', 'uploader_id': 'bbc_sport', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'Georestricted to UK', + 'skip': 'Video no longer in page', }, { - # single video with playlist.sxml URL in playlist param + # single video in __INITIAL_DATA__ 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.', + 'title': 'Ronaldo to Man Utd, Arsenal to spend?', + 'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$', + 'timestamp': 1437750175, + 'upload_date': '20150724', + 'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png', 'duration': 140, }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { - # article with multiple videos embedded with playlist.sxml in playlist param + # article with multiple videos embedded with Morph.setPayload 'url': 'http://www.bbc.com/sport/0/football/34475836', 'info_dict': { 'id': '34475836', @@ -754,6 +750,21 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.', }, 'playlist_count': 3, + }, { + # Testing noplaylist + 'url': 'http://www.bbc.com/sport/0/football/34475836', + 'info_dict': { + 'id': 'p034ppnv', + 'ext': 'mp4', + 'title': 'All you need to know about Jurgen Klopp', + 'timestamp': 1444335081, + 'upload_date': '20151008', + 'duration': 122.0, + 'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg', + }, + 'params': { + 'noplaylist': True, + }, }, { # school report article with single video 'url': 'http://www.bbc.co.uk/schoolreport/35744779', @@ -762,6 +773,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'title': 'School which breaks down barriers in Jerusalem', }, 'playlist_count': 1, + 'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt', }, { # single video with playlist URL from weather section 'url': 'http://www.bbc.com/weather/features/33601775', @@ -778,18 +790,33 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1437785037, 'upload_date': '20150725', + 'duration': 105, }, }, { # video with window.__INITIAL_DATA__ and value as JSON string 'url': 'https://www.bbc.com/news/av/world-europe-59468682', 'info_dict': { - 'id': 'p0b71qth', + 'id': 'p0b779gc', 'ext': 'mp4', 'title': 'Why France is making this woman a national hero', - 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'description': r're:(?s)France is honouring the US-born 20th Century singer and 
activist Josephine .{208} Second World War.', 'thumbnail': r're:https?://.+/.+\.jpg', - 'timestamp': 1638230731, - 'upload_date': '20211130', + 'timestamp': 1638215626, + 'upload_date': '20211129', + 'duration': 125, + }, + }, { + # video with script id __NEXT_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/uk-68546268', + 'info_dict': { + 'id': 'p0hj0lq7', + 'ext': 'mp4', + 'title': 'Nasser Hospital doctor describes his treatment by IDF', + 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1710188248, + 'upload_date': '20240311', + 'duration': 104, }, }, { # single video article embedded with data-media-vpid @@ -817,6 +844,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'uploader': 'Radio 3', 'uploader_id': 'bbc_radio_three', }, + 'skip': '404 Not Found', }, { 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', 'info_dict': { @@ -824,6 +852,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'ext': 'mp4', 'title': 'md5:2fabf12a726603193a2879a055f72514', 'description': 'Learn English words and phrases from this story', + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg', }, 'add_ie': [BBCCoUkIE.ie_key()], }, { @@ -832,28 +861,30 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'info_dict': { 'id': 'p07c6sb9', 'ext': 'mp4', - 'title': 'How positive thinking is harming your happiness', - 'alt_title': 'The downsides of positive thinking', - 'description': 'md5:fad74b31da60d83b8265954ee42d85b4', + 'title': 'The downsides of positive thinking', + 'description': 'The downsides of positive thinking', 'duration': 235, - 'thumbnail': r're:https?://.+/p07c9dsr.jpg', - 'upload_date': '20190604', - 'categories': ['Psychology'], + 'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)', + 'upload_date': '20220223', + 'timestamp': 1645632746, }, }, { # BBC Sounds - 'url': 'https://www.bbc.co.uk/sounds/play/m001q78b', + 'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx', 'info_dict': { - 'id': 'm001q789', + 'id': 'p0hrw4nr', 'ext': 'mp4', - 'title': 'The Night Tracks Mix - Music for the darkling hour', - 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg', - 'chapters': 'count:8', - 'description': 'md5:815fb51cbdaa270040aab8145b3f1d67', - 'uploader': 'Radio 3', - 'duration': 1800, - 'uploader_id': 'bbc_radio_three', - }, + 'title': 'Are our coastlines being washed away?', + 'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$', + 'timestamp': 1713556800, + 'upload_date': '20240419', + 'duration': 1588, + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg', + 'uploader': 'World Service', + 'uploader_id': 'bbc_world_service', + 'series': 'CrowdScience', + 'chapters': [], + } }, { # onion routes 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', 'only_matching': True, @@ -1008,8 +1039,7 @@ def _real_extract(self, url): webpage, 'group id', default=None) if group_id: return self.url_result( - 'https://www.bbc.co.uk/programmes/%s' % group_id, - ie=BBCCoUkIE.ie_key()) + f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE) # single video story (e.g. 
http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( @@ -1069,83 +1099,133 @@ def _real_extract(self, url): } # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) - # There are several setPayload calls may be present but the video - # seems to be always related to the first one - morph_payload = self._parse_json( - self._search_regex( - r'Morph\.setPayload\([^,]+,\s*({.+?})\);', - webpage, 'morph payload', default='{}'), - playlist_id, fatal=False) + # Several setPayload calls may be present but the video(s) + # should be in one that mentions leadMedia or videoData + morph_payload = self._search_json( + r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id, + contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}', + default={}) if morph_payload: - components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] - for component in components: - if not isinstance(component, dict): - continue - lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) - if not lead_media: - continue - identifiers = lead_media.get('identifiers') - if not identifiers or not isinstance(identifiers, dict): - continue - programme_id = identifiers.get('vpid') or identifiers.get('playablePid') + for lead_media in traverse_obj(morph_payload, ( + 'body', 'components', ..., 'props', 'leadMedia', {dict})): + programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any)) if not programme_id: continue - title = lead_media.get('title') or self._og_search_title(webpage) formats, subtitles = self._download_media_selector(programme_id) - description = lead_media.get('summary') - uploader = lead_media.get('masterBrand') - uploader_id = lead_media.get('mid') - duration = None - duration_d = lead_media.get('duration') - if isinstance(duration_d, dict): - duration = parse_duration(dict_get( - duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) return { 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'title': lead_media.get('title') or self._og_search_title(webpage), + **traverse_obj(lead_media, { + 'description': ('summary', {str}), + 'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}), + 'uploader': ('masterBrand', {str}), + 'uploader_id': ('mid', {str}), + }), 'formats': formats, 'subtitles': subtitles, } + body = self._parse_json(traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'body')), playlist_id, fatal=False) + for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')): + if video_data.get('vpid'): + video_id = video_data['vpid'] + formats, subtitles = self._download_media_selector(video_id) + entry = { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + } + else: + video_id = video_data['pid'] + entry = self.url_result( + f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE, + video_id, url_transparent=True) + entry.update({ + 'timestamp': traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601}) + ), + **traverse_obj(video_data, { + 'thumbnail': (('iChefImage', 'image'), {url_or_none}, any), + 'title': (('title', 'caption'), {str}, any), + 'duration': ('duration', {parse_duration}), + }), + }) + if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id): 
+ return entry + entries.append(entry) + if entries: + playlist_title = traverse_obj(morph_payload, ( + 'body', 'content', 'article', 'headline', {str})) or playlist_title + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) - preload_state = self._parse_json(self._search_regex( - r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) - if preload_state: - current_programme = preload_state.get('programmes', {}).get('current') or {} - programme_id = current_programme.get('id') - if current_programme and programme_id and current_programme.get('type') == 'playable_item': - title = current_programme.get('titles', {}).get('tertiary') or playlist_title - formats, subtitles = self._download_media_selector(programme_id) - synopses = current_programme.get('synopses') or {} - network = current_programme.get('network') or {} - duration = int_or_none( - current_programme.get('duration', {}).get('value')) - thumbnail = None - image_url = current_programme.get('image_url') - if image_url: - thumbnail = image_url.replace('{recipe}', 'raw') + # various PRELOADED_STATE JSON + preload_state = self._search_json( + r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage, + 'preload state', playlist_id, transform_source=js_to_json, default={}) + # PRELOADED_STATE with current programmme + current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict})) + programme_id = traverse_obj(current_programme, ('id', {str})) + if programme_id and current_programme.get('type') == 'playable_item': + title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title + formats, subtitles = self._download_media_selector(programme_id) + return { + 'id': programme_id, + 'title': title, + 'formats': formats, + **traverse_obj(current_programme, { + 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), + 'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}), + 'duration': ('duration', 'value', {int_or_none}), + 'uploader': ('network', 'short_title', {str}), + 'uploader_id': ('network', 'id', {str}), + 'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any), + 'series': ('titles', 'primary', {str}), + }), + 'subtitles': subtitles, + 'chapters': traverse_obj(preload_state, ( + 'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), { + 'title': ('titles', {lambda x: join_nonempty( + 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), + 'start_time': ('offset', 'start', {float_or_none}), + 'end_time': ('offset', 'end', {float_or_none}), + }) + ), + } + + # PWA_PRELOADED_STATE with article video asset + asset_id = traverse_obj(preload_state, ( + 'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id, + 'assetVideo', 0, {str}, any)) + if asset_id: + video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str})) + if video_id: + article = traverse_obj(preload_state, ( + 'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any)) + + def image_url(image_id): + return traverse_obj(preload_state, ( + 'entities', 'images', image_id, 'url', + {lambda u: url_or_none(u.replace('$recipe', 'raw'))})) + + formats, subtitles = self._download_media_selector(video_id) return { - 'id': programme_id, - 'title': title, - 'description': dict_get(synopses, ('long', 'medium', 'short')), - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': 
network.get('short_title'), - 'uploader_id': network.get('id'), + 'id': video_id, + **traverse_obj(preload_state, ('entities', 'videos', asset_id, { + 'title': ('title', {str}), + 'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any), + 'thumbnail': (0, {image_url}), + 'duration': ('duration', {int_or_none}), + })), 'formats': formats, 'subtitles': subtitles, - 'chapters': traverse_obj(preload_state, ( - 'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), { - 'title': ('titles', {lambda x: join_nonempty( - 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), - 'start_time': ('offset', 'start', {float_or_none}), - 'end_time': ('offset', 'end', {float_or_none}), - })) or None, + 'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})), } + else: + return self.url_result( + f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE, + asset_id, playlist_title, display_id=playlist_id, + description=playlist_description) bbc3_config = self._parse_json( self._search_regex( @@ -1191,6 +1271,28 @@ def _real_extract(self, url): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + def parse_model(model): + """Extract single video from model structure""" + item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) + if not item_id: + return + formats, subtitles = self._download_media_selector(item_id) + return { + 'id': item_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any), + 'duration': ('versions', 0, 'duration', {int}), + 'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}), + }) + } + + def is_type(*types): + return lambda _, v: v['type'] in types + initial_data = self._search_regex( r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, 'quoted preload state', default=None) @@ -1202,6 +1304,19 @@ def _real_extract(self, url): initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: + for video_data in traverse_obj(initial_data, ( + 'stores', 'article', 'articleBodyContent', is_type('video'))): + model = traverse_obj(video_data, ( + 'model', 'blocks', is_type('aresMedia'), + 'model', 'blocks', is_type('aresMediaMetadata'), + 'model', {dict}, any)) + entry = parse_model(model) + if entry: + entries.append(entry) + if entries: + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def parse_media(media): if not media: return @@ -1234,27 +1349,90 @@ def parse_media(media): 'subtitles': subtitles, 'timestamp': item_time, 'description': strip_or_none(item_desc), + 'duration': int_or_none(item.get('duration')), }) - for resp in (initial_data.get('data') or {}).values(): - name = resp.get('name') + + for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])): + name = resp['name'] if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in (try_get(resp, - (lambda x: x['data']['blocks'], - lambda x: x['data']['content']['model']['blocks'],), - list) or []): - if block.get('type') not in ['media', 'video']: - continue - parse_media(block.get('model')) + for 
block in traverse_obj(resp, ( + 'data', (None, ('content', 'model')), 'blocks', + is_type('media', 'video'), 'model', {dict})): + parse_media(block) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + # extract from SIMORGH_DATA hydration JSON + simorgh_data = self._search_json( + r'window\s*\.\s*SIMORGH_DATA\s*=', webpage, + 'simorgh data', playlist_id, default={}) + if simorgh_data: + done = False + for video_data in traverse_obj(simorgh_data, ( + 'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))): + model = traverse_obj(video_data, ( + 'model', 'blocks', is_type('aresMedia'), + 'model', 'blocks', is_type('aresMediaMetadata'), + 'model', {dict}, any)) + if video_data['type'] == 'video': + entry = parse_model(model) + else: # legacyMedia: no duration, subtitles + block_id, entry = traverse_obj(model, ('blockId', {str})), None + media_data = traverse_obj(simorgh_data, ( + 'pageData', 'promo', 'media', + {lambda x: x if x['id'] == block_id else None})) + formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), { + 'url': ('url', {url_or_none}), + 'ext': ('format', {str}), + 'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}), + })) + if formats: + entry = { + 'id': block_id, + 'display_id': playlist_id, + 'formats': formats, + 'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})), + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ('synopses', ('long', 'medium', 'short'), {str}, any), + 'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}), + }), + } + done = True + if entry: + entries.append(entry) + if done: + break + if entries: + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), re.findall(pattern, webpage)))) + # US accessed article with single embedded video (e.g. + # https://www.bbc.com/news/uk-68546268) + next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), + ('props', 'pageProps', 'page')) + model = traverse_obj(next_data, ( + ..., 'contents', is_type('video'), + 'model', 'blocks', is_type('media'), + 'model', 'blocks', is_type('mediaMetadata'), + 'model', {dict}, any)) + if model and (entry := parse_model(model)): + if not entry.get('timestamp'): + entry['timestamp'] = traverse_obj(next_data, ( + ..., 'contents', is_type('timestamp'), 'model', + 'timestamp', {functools.partial(int_or_none, scale=1000)}, any)) + entries.append(entry) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' 
% self._ID_REGEX diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 90b4d082e2d3..0a5a524c16ae 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -16,7 +16,6 @@ merge_dicts, multipart_encode, parse_duration, - random_birthday, traverse_obj, try_call, try_get, @@ -63,38 +62,57 @@ class CDAIE(InfoExtractor): 'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'crash404', - 'view_count': int, 'average_rating': float, 'duration': 137, 'age_limit': 0, + 'upload_date': '20160220', + 'timestamp': 1455968218, } }, { - # Age-restricted - 'url': 'http://www.cda.pl/video/1273454c4', + # Age-restricted with vfilm redirection + 'url': 'https://www.cda.pl/video/8753244c4', + 'md5': 'd8eeb83d63611289507010d3df3bb8b3', 'info_dict': { - 'id': '1273454c4', + 'id': '8753244c4', 'ext': 'mp4', - 'title': 'Bronson (2008) napisy HD 1080p', - 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', + 'title': '[18+] Bez Filtra: Rezerwowe Psy czyli... najwulgarniejsza polska gra?', + 'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e', 'height': 1080, - 'uploader': 'boniek61', + 'uploader': 'arhn eu', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 5554, + 'duration': 991, 'age_limit': 18, - 'view_count': int, 'average_rating': float, - }, + 'timestamp': 1633888264, + 'upload_date': '20211010', + } + }, { + # Age-restricted without vfilm redirection + 'url': 'https://www.cda.pl/video/17028157b8', + 'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992', + 'info_dict': { + 'id': '17028157b8', + 'ext': 'mp4', + 'title': 'STENDUPY MICHAŁ OGIŃSKI', + 'description': 'md5:5851f3272bfc31f762d616040a1d609a', + 'height': 480, + 'uploader': 'oginski', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 18855, + 'age_limit': 18, + 'average_rating': float, + 'timestamp': 1699705901, + 'upload_date': '20231111', + } }, { 'url': 'http://ebd.cda.pl/0x0/5749950c', 'only_matching': True, }] def _download_age_confirm_page(self, url, video_id, *args, **kwargs): - form_data = random_birthday('rok', 'miesiac', 'dzien') - form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) - data, content_type = multipart_encode(form_data) + data, content_type = multipart_encode({'age_confirm': ''}) return self._download_webpage( - urljoin(url, '/a/validatebirth'), video_id, *args, + url, video_id, *args, data=data, headers={ 'Referer': url, 'Content-Type': content_type, @@ -164,7 +182,7 @@ def _real_extract(self, url): if 'Authorization' in self._API_HEADERS: return self._api_extract(video_id) else: - return self._web_extract(video_id, url) + return self._web_extract(video_id) def _api_extract(self, video_id): meta = self._download_json( @@ -197,9 +215,9 @@ def _api_extract(self, video_id): 'view_count': meta.get('views'), } - def _web_extract(self, video_id, url): + def _web_extract(self, video_id): self._set_cookie('cda.pl', 'cda.player', 'html5') - webpage = self._download_webpage( + webpage, urlh = self._download_webpage_handle( f'{self._BASE_URL}/video/{video_id}/vfilm', video_id) if 'Ten film jest dostępny dla użytkowników premium' in webpage: @@ -209,10 +227,10 @@ def _web_extract(self, video_id): self.raise_geo_restricted() need_confirm_age = False - if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")', + if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")', webpage, 'birthday validate form', default=None): webpage = self._download_age_confirm_page( - url, video_id, note='Confirming age') + urlh.url, 
video_id, note='Confirming age') + need_confirm_age = True formats = [] @@ -222,9 +240,6 @@ (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*? <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3> ''', webpage, 'uploader', default=None, group='uploader') - view_count = self._search_regex( - r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage, - 'view_count', default=None) average_rating = self._search_regex( (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False, @@ -235,7 +250,6 @@ 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), 'uploader': uploader, - 'view_count': int_or_none(view_count), 'average_rating': float_or_none(average_rating), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index bebbc6b43f90..a952828fba1d 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote= if urlh is False: assert not fatal return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, + encoding=encoding, data=data) return (content, urlh) @staticmethod @@ -1005,8 +1006,10 @@ def __check_blocked(self, content): 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', expected=True) - def _request_dump_filename(self, url, video_id): - basen = f'{video_id}_{url}' + def _request_dump_filename(self, url, video_id, data=None): + if data is not None: + data = hashlib.md5(data).hexdigest() + basen = join_nonempty(video_id, data, url, delim='_') trim_length = self.get_param('trim_file_name') or 240 if len(basen) > trim_length: h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() @@ -1028,7 +1031,8 @@ def __decode_webpage(self, webpage_bytes, encoding, headers): except LookupError: return webpage_bytes.decode('utf-8', 'replace') - def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, + prefix=None, encoding=None, data=None): webpage_bytes = urlh.read() if prefix is not None: webpage_bytes = prefix + webpage_bytes @@ -1037,7 +1041,9 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self.get_param('write_pages'): - filename = self._request_dump_filename(urlh.url, video_id) + if isinstance(url_or_request, Request): + data = self._create_request(url_or_request, data).data + filename = self._request_dump_filename(urlh.url, video_id, data) self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) @@ -1098,7 +1104,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote, impersonate=None, require_impersonation=False): if self.get_param('load_pages'): url_or_request = self._create_request(url_or_request, data, headers, query) - filename = self._request_dump_filename(url_or_request.url, video_id) + filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data) 
self.to_screen(f'Loading request from {filename}') try: with open(filename, 'rb') as dumpf: diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index cb2c9c11b7e1..9e7400e522a5 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -3,6 +3,7 @@ import uuid from .common import InfoExtractor +from ..networking import Request from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, @@ -27,6 +28,7 @@ class CrunchyrollBaseIE(InfoExtractor): _BASE_URL = 'https://www.crunchyroll.com' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' + _SWITCH_USER_AGENT = 'Crunchyroll/1.8.0 Nintendo Switch/12.3.12.0 UE4/4.27' _REFRESH_TOKEN = None _AUTH_HEADERS = None _AUTH_EXPIRY = None @@ -286,10 +288,19 @@ def _extract_stream(self, identifier, display_id=None): display_id = identifier self._update_auth() - stream_response = self._download_json( - f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play', - display_id, note='Downloading stream info', errnote='Failed to download stream info', - headers=CrunchyrollBaseIE._AUTH_HEADERS) + headers = {**CrunchyrollBaseIE._AUTH_HEADERS, 'User-Agent': self._SWITCH_USER_AGENT} + try: + stream_response = self._download_json( + f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play', + display_id, note='Downloading stream info', errnote='Failed to download stream info', headers=headers) + except ExtractorError as error: + if self.get_param('ignore_no_formats_error'): + self.report_warning(error.orig_msg) + return [], {} + elif isinstance(error.cause, HTTPError) and error.cause.status == 420: + raise ExtractorError( + 'You have reached the rate-limit for active streams; try again later', expected=True) + raise available_formats = {'': ('', '', stream_response['url'])} for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])): @@ -318,7 +329,7 @@ def _extract_stream(self, identifier, display_id=None): fatal=False, note=f'Downloading {f"{format_id} " if hardsub_lang else ""}MPD manifest') self._merge_subtitles(dash_subs, target=subtitles) else: - continue # XXX: Update this if/when meta mpd formats are working + continue # XXX: Update this if meta mpd formats work; will be tricky with token invalidation for f in adaptive_formats: if f.get('acodec') != 'none': f['language'] = audio_locale @@ -328,6 +339,15 @@ def _extract_stream(self, identifier, display_id=None): for locale, subtitle in traverse_obj(stream_response, (('subtitles', 'captions'), {dict.items}, ...)): subtitles.setdefault(locale, []).append(traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})) + # Invalidate stream token to avoid rate-limit + error_msg = 'Unable to invalidate stream token; you may experience rate-limiting' + if stream_token := stream_response.get('token'): + self._request_webpage(Request( + f'https://cr-play-service.prd.crunchyrollsvc.com/v1/token/{identifier}/{stream_token}/inactive', + headers=headers, method='PATCH'), display_id, 'Invalidating stream token', error_msg, fatal=False) + else: + self.report_warning(error_msg) + return formats, subtitles @staticmethod diff --git a/yt_dlp/extractor/eplus.py b/yt_dlp/extractor/eplus.py index 88a8d5a949af..d2ad5b441e73 100644 --- a/yt_dlp/extractor/eplus.py +++ b/yt_dlp/extractor/eplus.py @@ -16,13 +16,31 @@ class EplusIbIE(InfoExtractor): _VALID_URL = [r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)',
r'https?://live\.eplus\.jp/(?P<id>sample|\d+)'] _TESTS = [{ - 'url': 'https://live.eplus.jp/ex/player?ib=YEFxb3Vyc2Dombnjg7blkrLlrablnJLjgrnjgq%2Fjg7zjg6vjgqLjgqTjg4njg6vlkIzlpb3kvJpgTGllbGxhIQ%3D%3D', + 'url': 'https://live.eplus.jp/ex/player?ib=41K6Wzbr3PlcMD%2FOKHFlC%2FcZCe2Eaw7FK%2BpJS1ooUHki8d0vGSy2mYqxillQBe1dSnOxU%2B8%2FzXKls4XPBSb3vw%3D%3D', 'info_dict': { - 'id': '354502-0001-002', - 'title': 'LoveLive!Series Presents COUNTDOWN LoveLive! 2021→2022~LIVE with a smile!~【Streaming+(配信)】', + 'id': '335699-0001-006', + 'title': '少女☆歌劇 レヴュースタァライト -The LIVE 青嵐- BLUE GLITTER <定点映像配信>【Streaming+(配信)】', 'live_status': 'was_live', - 'release_date': '20211231', - 'release_timestamp': 1640952000, + 'release_date': '20201221', + 'release_timestamp': 1608544800, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': [ + 'This event may not be accessible', + 'No video formats found', + 'Requested format is not available', + ], + }, { + 'url': 'https://live.eplus.jp/ex/player?ib=6QSsQdyRAwOFZrEHWlhRm7vocgV%2FO0YzBZ%2BaBEBg1XR%2FmbLn0R%2F048dUoAY038%2F%2F92MJ73BsoAtvUpbV6RLtDQ%3D%3D&show_id=2371511', + 'info_dict': { + 'id': '348021-0054-001', + 'title': 'ラブライブ!スーパースター!! Liella! First LoveLive! Tour ~Starlines~【東京/DAY.1】', + 'live_status': 'was_live', + 'release_date': '20220115', + 'release_timestamp': 1642233600, 'description': str, }, 'params': { @@ -124,6 +142,10 @@ def _real_extract(self, url): if data_json.get('drm_mode') == 'ON': self.report_drm(video_id) + if data_json.get('is_pass_ticket') == 'YES': + raise ExtractorError( + 'This URL is for a pass ticket instead of a player page', expected=True) + delivery_status = data_json.get('delivery_status') archive_mode = data_json.get('archive_mode') release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400) diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 29a3e43cc122..d94f28ceb176 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -97,7 +97,7 @@ def is_logged(webpage): login_form = self._hidden_inputs(login_page) login_form.update({ - 'username': username, + 'email': username, 'password': password, }) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index c9ca41a5cdc9..358146171f12 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -1,3 +1,4 @@ +import functools import itertools import json import re @@ -12,6 +13,7 @@ error_to_compat_str, float_or_none, int_or_none, + join_nonempty, mimetype2ext, parse_qs, str_or_none, @@ -68,6 +70,16 @@ class SoundcloudBaseIE(InfoExtractor): 'original': 0, } + _DEFAULT_FORMATS = ['http_aac', 'hls_aac', 'http_opus', 'hls_opus', 'http_mp3', 'hls_mp3'] + + @functools.cached_property + def _is_requested(self): + return re.compile(r'|'.join(set( + re.escape(pattern).replace(r'\*', r'.*') if pattern != 'default' + else '|'.join(map(re.escape, self._DEFAULT_FORMATS)) + for pattern in self._configuration_arg('formats', ['default'], ie_key=SoundcloudIE) + ))).fullmatch + def _store_client_id(self, client_id): self.cache.store('soundcloud', 'client_id', client_id) @@ -216,7 +228,7 @@ def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_f redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') if redirect_url: urlh = self._request_webpage( - HEADRequest(redirect_url), track_id, fatal=False) + HEADRequest(redirect_url), track_id, 'Checking for original download 
format', fatal=False) if urlh: format_url = urlh.url format_urls.add(format_url) @@ -258,7 +270,7 @@ def add_format(f, protocol, is_preview=False): abr = f.get('abr') if abr: f['abr'] = int(abr) - if protocol == 'hls': + if protocol in ('hls', 'hls-aes'): protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' else: protocol = 'http' @@ -274,11 +286,32 @@ def add_format(f, protocol, is_preview=False): if extract_flat: break format_url = t['url'] - stream = None + protocol = traverse_obj(t, ('format', 'protocol', {str})) + if protocol == 'progressive': + protocol = 'http' + if protocol != 'hls' and '/hls' in format_url: + protocol = 'hls' + if protocol == 'encrypted-hls' or '/encrypted-hls' in format_url: + protocol = 'hls-aes' + + ext = None + if preset := traverse_obj(t, ('preset', {str_or_none})): + ext = preset.split('_')[0] + if ext not in KNOWN_EXTENSIONS: + ext = mimetype2ext(traverse_obj(t, ('format', 'mime_type', {str}))) + + identifier = join_nonempty(protocol, ext, delim='_') + if not self._is_requested(identifier): + self.write_debug(f'"{identifier}" is not a requested format, skipping') + continue + + stream = None for retry in self.RetryManager(fatal=False): try: - stream = self._download_json(format_url, track_id, query=query, headers=self._HEADERS) + stream = self._download_json( + format_url, track_id, f'Downloading {identifier} format info JSON', + query=query, headers=self._HEADERS) except ExtractorError as e: if isinstance(e.cause, HTTPError) and e.cause.status == 429: self.report_warning( @@ -289,27 +322,14 @@ def add_format(f, protocol, is_preview=False): else: self.report_warning(e.msg) - if not isinstance(stream, dict): - continue - stream_url = url_or_none(stream.get('url')) + stream_url = traverse_obj(stream, ('url', {url_or_none})) if invalid_url(stream_url): continue format_urls.add(stream_url) - stream_format = t.get('format') or {} - protocol = stream_format.get('protocol') - if protocol != 'hls' and '/hls' in format_url: - protocol = 'hls' - ext = None - preset = str_or_none(t.get('preset')) - if preset: - ext = preset.split('_')[0] - if ext not in KNOWN_EXTENSIONS: - ext = mimetype2ext(stream_format.get('mime_type')) add_format({ 'url': stream_url, 'ext': ext, - }, 'http' if protocol == 'progressive' else protocol, - t.get('snipped') or '/preview/' in format_url) + }, protocol, t.get('snipped') or '/preview/' in format_url) for f in formats: f['vcodec'] = 'none' diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 3d965dd4529f..7772dd1f281f 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -1,8 +1,8 @@ +import functools import itertools import json import random import re -import string import time import uuid @@ -15,10 +15,13 @@ UnsupportedError, UserNotLive, determine_ext, + filter_dict, format_field, int_or_none, join_nonempty, merge_dicts, + mimetype2ext, + parse_qs, qualities, remove_start, srt_subtitles_timecode, @@ -45,19 +48,28 @@ class TikTokBaseIE(InfoExtractor): # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0 'aid': '0', } - _KNOWN_APP_INFO = [ - '7351144126450059040', - '7351149742343391009', - '7351153174894626592', - ] _APP_INFO_POOL = None _APP_INFO = None _APP_USER_AGENT = None - @property + @functools.cached_property + def _KNOWN_APP_INFO(self): + # If we have a genuine device ID, we may not need any IID + default = [''] if self._KNOWN_DEVICE_ID else [] + return self._configuration_arg('app_info', default, ie_key=TikTokIE) + + @functools.cached_property + def 
_KNOWN_DEVICE_ID(self): + return self._configuration_arg('device_id', [None], ie_key=TikTokIE)[0] + + @functools.cached_property + def _DEVICE_ID(self): + return self._KNOWN_DEVICE_ID or str(random.randint(7250000000000000000, 7351147085025500000)) + + @functools.cached_property def _API_HOSTNAME(self): return self._configuration_arg( - 'api_hostname', ['api22-normal-c-useast2a.tiktokv.com'], ie_key=TikTokIE)[0] + 'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0] def _get_next_app_info(self): if self._APP_INFO_POOL is None: @@ -66,13 +78,10 @@ def _get_next_app_info(self): for key, default in self._APP_INFO_DEFAULTS.items() if key != 'iid' } - app_info_list = ( - self._configuration_arg('app_info', ie_key=TikTokIE) - or random.sample(self._KNOWN_APP_INFO, len(self._KNOWN_APP_INFO))) self._APP_INFO_POOL = [ {**defaults, **dict( (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v - )} for app_info in app_info_list + )} for app_info in self._KNOWN_APP_INFO ] if not self._APP_INFO_POOL: @@ -119,7 +128,7 @@ def _call_api_impl(self, ep, query, video_id, fatal=True, }, query=query) def _build_api_query(self, query): - return { + return filter_dict({ **query, 'device_platform': 'android', 'os': 'android', @@ -160,10 +169,10 @@ def _build_api_query(self, query): 'build_number': self._APP_INFO['app_version'], 'region': 'US', 'ts': int(time.time()), - 'iid': self._APP_INFO['iid'], - 'device_id': random.randint(7250000000000000000, 7351147085025500000), + 'iid': self._APP_INFO.get('iid'), + 'device_id': self._DEVICE_ID, 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), - } + }) def _call_api(self, ep, query, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): @@ -203,7 +212,31 @@ def _extract_aweme_app(self, aweme_id): raise ExtractorError('Unable to find video in feed', video_id=aweme_id) return self._parse_aweme_video_app(aweme_detail) - def _get_subtitles(self, aweme_detail, aweme_id): + def _extract_web_data_and_status(self, url, video_id, fatal=True): + webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=fatal) or '' + video_data, status = {}, None + + if universal_data := self._get_universal_data(webpage, video_id): + self.write_debug('Found universal data for rehydration') + status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0 + video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict})) + + elif sigi_data := self._get_sigi_state(webpage, video_id): + self.write_debug('Found sigi state data') + status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 + video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) + + elif next_data := self._search_nextjs_data(webpage, video_id, default={}): + self.write_debug('Found next.js data') + status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 + video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})) + + elif fatal: + raise ExtractorError('Unable to extract webpage video data') + + return video_data, status + + def _get_subtitles(self, aweme_detail, aweme_id, user_url): # TODO: Extract text positioning info subtitles = {} # aweme/detail endpoint subs @@ -234,32 +267,32 @@ def _get_subtitles(self, aweme_detail, aweme_id): }) # webpage subs if not subtitles: - for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', ...), 
expected_type=dict): - if not caption.get('Url'): - continue + if user_url: # only _parse_aweme_video_app needs to extract the webpage here + aweme_detail, _ = self._extract_web_data_and_status( + f'{user_url}/video/{aweme_id}', aweme_id, fatal=False) + for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])): subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({ 'ext': remove_start(caption.get('Format'), 'web'), 'url': caption['Url'], }) return subtitles + def _parse_url_key(self, url_key): + format_id, codec, res, bitrate = self._search_regex( + r'v[^_]+_(?P(?P[^_]+)_(?P\d+p)_(?P\d+))', url_key, + 'url key', default=(None, None, None, None), group=('id', 'codec', 'res', 'bitrate')) + if not format_id: + return {}, None + return { + 'format_id': format_id, + 'vcodec': 'h265' if codec == 'bytevc1' else codec, + 'tbr': int_or_none(bitrate, scale=1000) or None, + 'quality': qualities(self.QUALITIES)(res), + }, res + def _parse_aweme_video_app(self, aweme_detail): aweme_id = aweme_detail['aweme_id'] video_info = aweme_detail['video'] - - def parse_url_key(url_key): - format_id, codec, res, bitrate = self._search_regex( - r'v[^_]+_(?P(?P[^_]+)_(?P\d+p)_(?P\d+))', url_key, - 'url key', default=(None, None, None, None), group=('id', 'codec', 'res', 'bitrate')) - if not format_id: - return {}, None - return { - 'format_id': format_id, - 'vcodec': 'h265' if codec == 'bytevc1' else codec, - 'tbr': int_or_none(bitrate, scale=1000) or None, - 'quality': qualities(self.QUALITIES)(res), - }, res - known_resolutions = {} def audio_meta(url): @@ -274,7 +307,7 @@ def audio_meta(url): } if ext == 'mp3' or '-music-' in url else {} def extract_addr(addr, add_meta={}): - parsed_meta, res = parse_url_key(addr.get('url_key', '')) + parsed_meta, res = self._parse_url_key(addr.get('url_key', '')) is_bytevc2 = parsed_meta.get('vcodec') == 'bytevc2' if res: known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height'))) @@ -288,7 +321,7 @@ def extract_addr(addr, add_meta={}): 'acodec': 'aac', 'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked **add_meta, **parsed_meta, - # bytevc2 is bytedance's proprietary (unplayable) video codec + # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable 'preference': -100 if is_bytevc2 else -1, 'format_note': join_nonempty( add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, @@ -300,6 +333,7 @@ def extract_addr(addr, add_meta={}): formats = [] width = int_or_none(video_info.get('width')) height = int_or_none(video_info.get('height')) + ratio = try_call(lambda: width / height) or 0.5625 if video_info.get('play_addr'): formats.extend(extract_addr(video_info['play_addr'], { 'format_id': 'play_addr', @@ -316,8 +350,8 @@ def extract_addr(addr, add_meta={}): 'format_id': 'download_addr', 'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''), 'vcodec': 'h264', - 'width': dl_width or width, - 'height': try_call(lambda: int(dl_width / 0.5625)) or height, # download_addr['height'] is wrong + 'width': dl_width, + 'height': try_call(lambda: int(dl_width / ratio)), # download_addr['height'] is wrong 'preference': -2 if video_info.get('has_watermark') else -1, })) if video_info.get('play_addr_h264'): @@ -403,7 +437,7 @@ def extract_addr(addr, add_meta={}): 'album': str_or_none(music_info.get('album')) or None, 'artists': re.split(r'(?:, | & )', music_author) if music_author else None, 
'formats': formats, - 'subtitles': self.extract_subtitles(aweme_detail, aweme_id), + 'subtitles': self.extract_subtitles(aweme_detail, aweme_id, user_url), 'thumbnails': thumbnails, 'duration': int_or_none(traverse_obj(video_info, 'duration', ('download_addr', 'duration')), scale=1000), 'availability': self._availability( @@ -424,26 +458,88 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id): formats = [] width = int_or_none(video_info.get('width')) height = int_or_none(video_info.get('height')) + ratio = try_call(lambda: width / height) or 0.5625 + COMMON_FORMAT_INFO = { + 'ext': 'mp4', + 'vcodec': 'h264', + 'acodec': 'aac', + } + + for bitrate_info in traverse_obj(video_info, ('bitrateInfo', lambda _, v: v['PlayAddr']['UrlList'])): + format_info, res = self._parse_url_key( + traverse_obj(bitrate_info, ('PlayAddr', 'UrlKey', {str})) or '') + # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable + is_bytevc2 = format_info.get('vcodec') == 'bytevc2' + format_info.update({ + 'format_note': 'UNPLAYABLE' if is_bytevc2 else None, + 'preference': -100 if is_bytevc2 else -1, + 'filesize': traverse_obj(bitrate_info, ('PlayAddr', 'DataSize', {int_or_none})), + }) + + if dimension := (res and int(res[:-1])): + if dimension == 540: # '540p' is actually 576p + dimension = 576 + if ratio < 1: # portrait: res/dimension is width + y = int(dimension / ratio) + format_info.update({ + 'width': dimension, + 'height': y - (y % 2), + }) + else: # landscape: res/dimension is height + x = int(dimension * ratio) + format_info.update({ + 'width': x - (x % 2), + 'height': dimension, + }) + + for video_url in traverse_obj(bitrate_info, ('PlayAddr', 'UrlList', ..., {url_or_none})): + formats.append({ + **COMMON_FORMAT_INFO, + **format_info, + 'url': self._proto_relative_url(video_url), + }) + + # We don't have res string for play formats, but need quality for sorting & de-duplication + play_quality = traverse_obj(formats, (lambda _, v: v['width'] == width, 'quality', any)) for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})): formats.append({ + **COMMON_FORMAT_INFO, + 'format_id': 'play', 'url': self._proto_relative_url(play_url), - 'ext': 'mp4', 'width': width, 'height': height, + 'quality': play_quality, }) for download_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})): formats.append({ + **COMMON_FORMAT_INFO, 'format_id': 'download', 'url': self._proto_relative_url(download_url), - 'ext': 'mp4', - 'width': width, - 'height': height, }) self._remove_duplicate_formats(formats) + for f in traverse_obj(formats, lambda _, v: 'unwatermarked' not in v['url']): + f.update({ + 'format_note': join_nonempty(f.get('format_note'), 'watermarked', delim=', '), + 'preference': f.get('preference') or -2, + }) + + # Is it a slideshow with only audio for download? 
+ if not formats and traverse_obj(music_info, ('playUrl', {url_or_none})): + audio_url = music_info['playUrl'] + ext = traverse_obj(parse_qs(audio_url), ( + 'mime_type', -1, {lambda x: x.replace('_', '/')}, {mimetype2ext})) or 'm4a' + formats.append({ + 'format_id': 'audio', + 'url': self._proto_relative_url(audio_url), + 'ext': ext, + 'acodec': 'aac' if ext == 'm4a' else ext, + 'vcodec': 'none', + }) + thumbnails = [] for thumb_url in traverse_obj(aweme_detail, ( (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {url_or_none})): @@ -455,10 +551,17 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id): return { 'id': video_id, + **traverse_obj(music_info, { + 'track': ('title', {str}), + 'album': ('album', {str}, {lambda x: x or None}), + 'artists': ('authorName', {str}, {lambda x: [x] if x else None}), + 'duration': ('duration', {int_or_none}), + }), **traverse_obj(aweme_detail, { 'title': ('desc', {str}), 'description': ('desc', {str}), - 'duration': ('video', 'duration', {int_or_none}), + # audio-only slideshows have a video duration of 0 and an actual audio duration + 'duration': ('video', 'duration', {int_or_none}, {lambda x: x or None}), 'timestamp': ('createTime', {int_or_none}), }), **traverse_obj(author_info or aweme_detail, { @@ -473,14 +576,10 @@ def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id): 'repost_count': 'shareCount', 'comment_count': 'commentCount', }, expected_type=int_or_none), - **traverse_obj(music_info, { - 'track': ('title', {str}), - 'album': ('album', {str}, {lambda x: x or None}), - 'artists': ('authorName', {str}, {lambda x: [x] if x else None}), - }), 'channel_id': channel_id, 'uploader_url': user_url, 'formats': formats, + 'subtitles': self.extract_subtitles(aweme_detail, video_id, None), 'thumbnails': thumbnails, 'http_headers': { 'Referer': webpage_url, @@ -757,32 +856,16 @@ class TikTokIE(TikTokBaseIE): def _real_extract(self, url): video_id, user_id = self._match_valid_url(url).group('id', 'user_id') - try: - return self._extract_aweme_app(video_id) - except ExtractorError as e: - e.expected = True - self.report_warning(f'{e}; trying with webpage') - url = self._create_url(user_id, video_id) - webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'}) - - if universal_data := self._get_universal_data(webpage, video_id): - self.write_debug('Found universal data for rehydration') - status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0 - video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict})) - - elif sigi_data := self._get_sigi_state(webpage, video_id): - self.write_debug('Found sigi state data') - status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 - video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) - - elif next_data := self._search_nextjs_data(webpage, video_id, default={}): - self.write_debug('Found next.js data') - status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 - video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})) + if self._KNOWN_APP_INFO: + try: + return self._extract_aweme_app(video_id) + except ExtractorError as e: + e.expected = True + self.report_warning(f'{e}; trying with webpage') - else: - raise ExtractorError('Unable to extract webpage video data') + url = self._create_url(user_id, video_id) + video_data, status = self._extract_web_data_and_status(url, 
video_id) if video_data and status == 0: return self._parse_aweme_video_web(video_data, url, video_id) @@ -850,7 +933,7 @@ def _video_entries_api(self, webpage, user_id, username): 'max_cursor': 0, 'min_cursor': 0, 'retry_type': 'no_retry', - 'device_id': ''.join(random.choices(string.digits, k=19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. + 'device_id': self._DEVICE_ID, # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. } for page in itertools.count(1): @@ -898,7 +981,7 @@ def _entries(self, list_id, display_id): 'cursor': 0, 'count': 20, 'type': 5, - 'device_id': ''.join(random.choices(string.digits, k=19)) + 'device_id': self._DEVICE_ID, } for page in itertools.count(1): diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index ecc865655dc1..fc80dade8f12 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -34,9 +34,9 @@ class TwitterBaseIE(InfoExtractor): _NETRC_MACHINE = 'twitter' - _API_BASE = 'https://api.twitter.com/1.1/' - _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' - _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' + _API_BASE = 'https://api.x.com/1.1/' + _GRAPHQL_API_BASE = 'https://x.com/i/api/graphql/' + _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:(?:twitter|x)\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' _AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' _LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE' _flow_token = None @@ -153,6 +153,14 @@ def _search_dimensions_in_video_url(a_format, video_url): def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) + # XXX: Temporary workaround until twitter.com => x.com migration is completed + def _real_initialize(self): + if self.is_logged_in or not self._get_cookies('https://twitter.com/').get('auth_token'): + return + # User has not yet been migrated to x.com and has passed twitter.com cookies + TwitterBaseIE._API_BASE = 'https://api.twitter.com/1.1/' + TwitterBaseIE._GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' + @functools.cached_property def _selected_api(self): return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0] @@ -196,17 +204,15 @@ def _perform_login(self, username, password): if self.is_logged_in: return - webpage = self._download_webpage('https://twitter.com/', None, 'Downloading login page') - guest_token = self._search_regex( - r'\.cookie\s*=\s*["\']gt=(\d+);', webpage, 'gt', default=None) or self._fetch_guest_token(None) + guest_token = self._fetch_guest_token(None) headers = { **self._set_base_headers(), 'content-type': 'application/json', 'x-guest-token': guest_token, 'x-twitter-client-language': 'en', 'x-twitter-active-user': 'yes', - 'Referer': 'https://twitter.com/', - 'Origin': 'https://twitter.com', + 'Referer': 'https://x.com/', + 'Origin': 'https://x.com', } def build_login_json(*subtask_inputs): @@ -1191,6 +1197,31 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 0, '_old_archive_ids': ['twitter 1724884212803834154'], }, + }, { + # x.com + 'url': 'https://x.com/historyinmemes/status/1790637656616943991', + 'md5': 'daca3952ba0defe2cfafb1276d4c1ea5', + 'info_dict': { + 'id': '1790637589910654976', + 'ext': 'mp4', + 'title': 'Historic Vids - One of 
the most intense moments in history', + 'description': 'One of the most intense moments in history https://t.co/Zgzhvix8ES', + 'display_id': '1790637656616943991', + 'uploader': 'Historic Vids', + 'uploader_id': 'historyinmemes', + 'uploader_url': 'https://twitter.com/historyinmemes', + 'channel_id': '855481986290524160', + 'upload_date': '20240515', + 'timestamp': 1715756260.0, + 'duration': 15.488, + 'tags': [], + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', + 'age_limit': 0, + '_old_archive_ids': ['twitter 1790637656616943991'], + } }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a5fe179c293c..e676c5cde24c 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2353,6 +2353,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'format': '17', # 3gp format available on android 'extractor_args': {'youtube': {'player_client': ['android']}}, }, + 'skip': 'android client broken', }, { # Skip download of additional client configs (remix client config in this case) @@ -2730,7 +2731,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'heatmap': 'count:100', }, 'params': { - 'extractor_args': {'youtube': {'player_client': ['android'], 'player_skip': ['webpage']}}, + 'extractor_args': {'youtube': {'player_client': ['ios'], 'player_skip': ['webpage']}}, }, }, ] @@ -3317,7 +3318,36 @@ def _extract_heatmap(self, data): 'value': ('intensityScoreNormalized', {float_or_none}), })) or None - def _extract_comment(self, comment_renderer, parent=None): + def _extract_comment(self, entities, parent=None): + comment_entity_payload = get_first(entities, ('payload', 'commentEntityPayload', {dict})) + if not (comment_id := traverse_obj(comment_entity_payload, ('properties', 'commentId', {str}))): + return + + toolbar_entity_payload = get_first(entities, ('payload', 'engagementToolbarStateEntityPayload', {dict})) + time_text = traverse_obj(comment_entity_payload, ('properties', 'publishedTime', {str})) or '' + + return { + 'id': comment_id, + 'parent': parent or 'root', + **traverse_obj(comment_entity_payload, { + 'text': ('properties', 'content', 'content', {str}), + 'like_count': ('toolbar', 'likeCountA11y', {parse_count}), + 'author_id': ('author', 'channelId', {self.ucid_or_none}), + 'author': ('author', 'displayName', {str}), + 'author_thumbnail': ('author', 'avatarThumbnailUrl', {url_or_none}), + 'author_is_uploader': ('author', 'isCreator', {bool}), + 'author_is_verified': ('author', 'isVerified', {bool}), + 'author_url': ('author', 'channelCommand', 'innertubeCommand', ( + ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url') + ), {lambda x: urljoin('https://www.youtube.com', x)}), + }, get_all=False), + 'is_favorited': (None if toolbar_entity_payload is None else + toolbar_entity_payload.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED'), + '_time_text': time_text, # FIXME: non-standard, but we need a way of showing that it is an estimate. 
+ 'timestamp': self._parse_time_text(time_text), + } + + def _extract_comment_old(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') if not comment_id: return @@ -3398,21 +3428,39 @@ def extract_header(contents): break return _continuation - def extract_thread(contents): + def extract_thread(contents, entity_payloads): if not parent: tracker['current_page_thread'] = 0 for content in contents: if not parent and tracker['total_parent_comments'] >= max_parents: yield comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer']) - comment_renderer = get_first( - (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]], - expected_type=dict, default={}) - comment = self._extract_comment(comment_renderer, parent) + # old comment format + if not entity_payloads: + comment_renderer = get_first( + (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]], + expected_type=dict, default={}) + + comment = self._extract_comment_old(comment_renderer, parent) + + # new comment format + else: + view_model = ( + traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel', {dict})) + or traverse_obj(content, ('commentViewModel', {dict}))) + comment_keys = traverse_obj(view_model, (('commentKey', 'toolbarStateKey'), {str})) + if not comment_keys: + continue + entities = traverse_obj(entity_payloads, lambda _, v: v['entityKey'] in comment_keys) + comment = self._extract_comment(entities, parent) + if comment: + comment['is_pinned'] = traverse_obj(view_model, ('pinnedText', {str})) is not None + if not comment: continue comment_id = comment['id'] + if comment.get('is_pinned'): tracker['pinned_comment_ids'].add(comment_id) # Sometimes YouTube may break and give us infinite looping comments. 
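
Note on the rewritten YouTube comment path above: `_extract_comment` no longer reads a `commentRenderer`. Each `commentViewModel` only carries a `commentKey`/`toolbarStateKey`, and the actual text, author and heart state live in the `frameworkUpdates.entityBatchUpdate.mutations` list as `commentEntityPayload` / `engagementToolbarStateEntityPayload` entities. A minimal standalone sketch of that key-based lookup follows; the payload shapes are trimmed, made-up samples rather than real InnerTube responses:

```python
# Simplified model of the new comment extraction: the view model holds only
# entity keys; the data itself comes from frameworkUpdates mutations keyed by
# entityKey. Payloads below are hypothetical samples for illustration only.
view_model = {'commentKey': 'key-comment-1', 'toolbarStateKey': 'key-toolbar-1'}

mutations = [
    {'entityKey': 'key-comment-1', 'payload': {'commentEntityPayload': {
        'properties': {'commentId': 'UgzExample', 'publishedTime': '2 weeks ago',
                       'content': {'content': 'Nice video!'}},
        'author': {'channelId': 'UC123', 'displayName': 'someone'},
        'toolbar': {'likeCountA11y': '12 likes'},
    }}},
    {'entityKey': 'key-toolbar-1', 'payload': {'engagementToolbarStateEntityPayload': {
        'heartState': 'TOOLBAR_HEART_STATE_HEARTED',
    }}},
]


def extract_comment(view_model, mutations):
    # Select the mutations whose entityKey matches either key of the view model
    keys = {view_model.get('commentKey'), view_model.get('toolbarStateKey')}
    entities = [m for m in mutations if m.get('entityKey') in keys]
    comment = next((m['payload']['commentEntityPayload'] for m in entities
                    if 'commentEntityPayload' in m['payload']), None)
    toolbar = next((m['payload']['engagementToolbarStateEntityPayload'] for m in entities
                    if 'engagementToolbarStateEntityPayload' in m['payload']), None)
    if not comment:
        return None
    return {
        'id': comment['properties']['commentId'],
        'text': comment['properties']['content']['content'],
        'author': comment['author']['displayName'],
        '_time_text': comment['properties']['publishedTime'],
        'is_favorited': toolbar is not None
                        and toolbar.get('heartState') == 'TOOLBAR_HEART_STATE_HEARTED',
    }


print(extract_comment(view_model, mutations))
# {'id': 'UgzExample', 'text': 'Nice video!', 'author': 'someone', ...}
```
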
@@ -3505,7 +3553,7 @@ def extract_thread(contents): check_get_keys = None if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0): check_get_keys = [[*continuation_items_path, ..., ( - 'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]] + 'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]] try: response = self._extract_response( item_id=None, query=continuation, @@ -3529,6 +3577,7 @@ def extract_thread(contents): raise is_forced_continuation = False continuation = None + mutations = traverse_obj(response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', ..., {dict})) for continuation_items in traverse_obj(response, continuation_items_path, expected_type=list, default=[]): if is_first_continuation: continuation = extract_header(continuation_items) @@ -3537,7 +3586,7 @@ def extract_thread(contents): break continue - for entry in extract_thread(continuation_items): + for entry in extract_thread(continuation_items, mutations): if not entry: return yield entry @@ -3614,8 +3663,6 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, yt_query = { 'videoId': video_id, } - if _split_innertube_client(client)[0] in ('android', 'android_embedscreen'): - yt_query['params'] = 'CgIIAQ==' pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0] if pp_arg: @@ -3631,19 +3678,24 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, def _get_requested_clients(self, url, smuggled_data): requested_clients = [] - default = ['ios', 'android', 'web'] + android_clients = [] + default = ['ios', 'web'] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) for client in self._configuration_arg('player_client'): - if client in allowed_clients: - requested_clients.append(client) - elif client == 'default': + if client == 'default': requested_clients.extend(default) elif client == 'all': requested_clients.extend(allowed_clients) - else: + elif client not in allowed_clients: self.report_warning(f'Skipping unsupported client {client}') + elif client.startswith('android'): + android_clients.append(client) + else: + requested_clients.append(client) + # Force deprioritization of broken Android clients for format de-duplication + requested_clients.extend(android_clients) if not requested_clients: requested_clients = default @@ -3862,6 +3914,14 @@ def build_fragments(f): f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) + # Android client formats are broken due to integrity check enforcement + # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554 + is_broken = client_name and client_name.startswith(short_client_name('android')) + if is_broken: + self.report_warning( + f'{video_id}: Android client formats are broken and may yield HTTP Error 403. 
' + 'They will be deprioritized', only_once=True) + name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' fps = int_or_none(fmt.get('fps')) or 0 dct = { @@ -3874,7 +3934,7 @@ def build_fragments(f): name, fmt.get('isDrc') and 'DRC', try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), - throttled and 'THROTTLED', is_damaged and 'DAMAGED', + throttled and 'THROTTLED', is_damaged and 'DAMAGED', is_broken and 'BROKEN', (self.get_param('verbose') or all_formats) and client_name, delim=', '), # Format 22 is likely to be damaged. See https://github.com/yt-dlp/yt-dlp/issues/3372 @@ -3892,8 +3952,8 @@ def build_fragments(f): 'language': join_nonempty(audio_track.get('id', '').split('.')[0], 'desc' if language_preference < -1 else '') or None, 'language_preference': language_preference, - # Strictly de-prioritize damaged and 3gp formats - 'preference': -10 if is_damaged else -2 if itag == '17' else None, + # Strictly de-prioritize broken, damaged and 3gp formats + 'preference': -20 if is_broken else -10 if is_damaged else -2 if itag == '17' else None, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py index e3edc77f3803..6397a2c0ca92 100644 --- a/yt_dlp/networking/_requests.py +++ b/yt_dlp/networking/_requests.py @@ -28,6 +28,7 @@ import requests.utils import urllib3.connection import urllib3.exceptions +import urllib3.util from ._helper import ( InstanceStoreMixin, @@ -180,10 +181,25 @@ def proxy_manager_for(self, proxy, **proxy_kwargs): extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs) + # Skip `requests` internal verification; we use our own SSLContext + # requests 2.31.0+ def cert_verify(*args, **kwargs): - # lean on SSLContext for cert verification pass + # requests 2.31.0-2.32.1 + def _get_connection(self, request, *_, proxies=None, **__): + return self.get_connection(request.url, proxies) + + # requests 2.32.2+: Reimplementation without `_urllib3_request_context` + def get_connection_with_tls_context(self, request, verify, proxies=None, cert=None): + url = urllib3.util.parse_url(request.url).url + + manager = self.poolmanager + if proxy := select_proxy(url, proxies): + manager = self.proxy_manager_for(proxy) + + return manager.connection_from_url(url) + class RequestsSession(requests.sessions.Session): """ diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index a2217034c90c..d473e16c5a50 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -31,6 +31,8 @@ ) from ..utils.networking import HTTPHeaderDict, normalize_url +DEFAULT_TIMEOUT = 20 + def register_preference(*handlers: type[RequestHandler]): assert all(issubclass(handler, RequestHandler) for handler in handlers) @@ -235,7 +237,7 @@ def __init__( self._logger = logger self.headers = headers or {} self.cookiejar = cookiejar if cookiejar is not None else YoutubeDLCookieJar() - self.timeout = float(timeout or 20) + self.timeout = float(timeout or DEFAULT_TIMEOUT) self.proxies = proxies or {} self.source_address = source_address self.verbose = verbose
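
Note on the `_requests.py` hunk above: `get_connection_with_tls_context` is reimplemented so that requests 2.32.2+ keeps using yt-dlp's own SSLContext (already attached to the pool manager) instead of rebuilding a TLS context per request. A rough, self-contained approximation of that idea as a plain `HTTPAdapter` subclass is shown below; it uses only public requests/urllib3 helpers and is not the actual yt-dlp handler:

```python
import urllib3.util
import requests
from requests.adapters import HTTPAdapter
from requests.utils import select_proxy


class DirectPoolAdapter(HTTPAdapter):
    """Resolve the connection pool straight from the URL, bypassing requests'
    per-request TLS-context plumbing; the pool manager's own SSLContext wins."""

    def get_connection_with_tls_context(self, request, verify, proxies=None, cert=None):
        url = urllib3.util.parse_url(request.url).url
        manager = self.poolmanager
        if proxy := select_proxy(url, proxies or {}):
            manager = self.proxy_manager_for(proxy)
        return manager.connection_from_url(url)


session = requests.Session()
session.mount('https://', DirectPoolAdapter())
# Requests sent through this session now pick their (proxy) pool directly from
# the URL; `verify`/`cert` are deliberately ignored here, mirroring the patch's
# reliance on an SSLContext configured elsewhere.
```
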
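
Earlier in this diff, the SoundCloud extractor starts skipping transcodings whose `protocol`/`ext` identifier (e.g. `hls_aac`) is not matched by the `formats` extractor argument: the requested patterns are compiled into a single regex where `*` acts as a wildcard and `default` expands to `_DEFAULT_FORMATS`, then tested with `fullmatch`. A small standalone sketch of that matching, with a hypothetical user invocation:

```python
import re

DEFAULT_FORMATS = ['http_aac', 'hls_aac', 'http_opus', 'hls_opus', 'http_mp3', 'hls_mp3']


def build_format_matcher(patterns):
    # '*' becomes a wildcard; 'default' expands to all default format identifiers
    return re.compile('|'.join(set(
        re.escape(pattern).replace(r'\*', '.*') if pattern != 'default'
        else '|'.join(map(re.escape, DEFAULT_FORMATS))
        for pattern in patterns
    ))).fullmatch


# e.g. --extractor-args "soundcloud:formats=http_mp3,hls_*"  (hypothetical input)
is_requested = build_format_matcher(['http_mp3', 'hls_*'])

for identifier in ('http_mp3', 'http_aac', 'hls_opus'):
    print(identifier, bool(is_requested(identifier)))
# http_mp3 True / http_aac False / hls_opus True
```
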