[youtube] Extract formats from multiple DASH manifests (Closes #6093)
The DASH manifest pointed to by dashmpd from the video webpage and the one pointed to by get_video_info's dashmpd may differ (namely, in their itag sets): some itags are missing from the DASH manifest pointed to by the webpage's dashmpd, and some from the DASH manifest pointed to by get_video_info's dashmpd. The general idea is to take the union of the itags of both DASH manifests (for an example video with such 'manifest behavior', see https://github.com/rg3/youtube-dl/issues/6093).
This commit is contained in:
parent
2988835af5
commit
d8d24a922a
1 changed file with 46 additions and 31 deletions
|
@ -853,6 +853,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
else:
|
else:
|
||||||
player_url = None
|
player_url = None
|
||||||
|
|
||||||
|
dash_mpds = []
|
||||||
|
|
||||||
|
def add_dash_mpd(video_info):
|
||||||
|
dash_mpd = video_info.get('dashmpd')
|
||||||
|
if dash_mpd and dash_mpd[0] not in dash_mpds:
|
||||||
|
dash_mpds.append(dash_mpd[0])
|
||||||
|
|
||||||
# Get video info
|
# Get video info
|
||||||
embed_webpage = None
|
embed_webpage = None
|
||||||
if re.search(r'player-age-gate-content">', video_webpage) is not None:
|
if re.search(r'player-age-gate-content">', video_webpage) is not None:
|
||||||
|
@ -873,34 +880,40 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
note='Refetching age-gated info webpage',
|
note='Refetching age-gated info webpage',
|
||||||
errnote='unable to download video info webpage')
|
errnote='unable to download video info webpage')
|
||||||
video_info = compat_parse_qs(video_info_webpage)
|
video_info = compat_parse_qs(video_info_webpage)
|
||||||
|
add_dash_mpd(video_info)
|
||||||
else:
|
else:
|
||||||
age_gate = False
|
age_gate = False
|
||||||
try:
|
# Try looking directly into the video webpage
|
||||||
# Try looking directly into the video webpage
|
mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
|
||||||
mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
|
if mobj:
|
||||||
if not mobj:
|
|
||||||
raise ValueError('Could not find ytplayer.config') # caught below
|
|
||||||
json_code = uppercase_escape(mobj.group(1))
|
json_code = uppercase_escape(mobj.group(1))
|
||||||
ytplayer_config = json.loads(json_code)
|
ytplayer_config = json.loads(json_code)
|
||||||
args = ytplayer_config['args']
|
args = ytplayer_config['args']
|
||||||
# Convert to the same format returned by compat_parse_qs
|
if args.get('url_encoded_fmt_stream_map'):
|
||||||
video_info = dict((k, [v]) for k, v in args.items())
|
# Convert to the same format returned by compat_parse_qs
|
||||||
if not args.get('url_encoded_fmt_stream_map'):
|
video_info = dict((k, [v]) for k, v in args.items())
|
||||||
raise ValueError('No stream_map present') # caught below
|
add_dash_mpd(video_info)
|
||||||
except ValueError:
|
# We also try looking in get_video_info since it may contain different dashmpd
|
||||||
# We fallback to the get_video_info pages (used by the embed page)
|
# URL that points to a DASH manifest with possibly different itag set (some itags
|
||||||
self.report_video_info_webpage_download(video_id)
|
# are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
|
||||||
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
|
# manifest pointed by get_video_info's dashmpd).
|
||||||
video_info_url = (
|
# The general idea is to take a union of itags of both DASH manifests (for example
|
||||||
'%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
|
# video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
|
||||||
% (proto, video_id, el_type))
|
self.report_video_info_webpage_download(video_id)
|
||||||
video_info_webpage = self._download_webpage(
|
for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
|
||||||
video_info_url,
|
video_info_url = (
|
||||||
video_id, note=False,
|
'%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
|
||||||
errnote='unable to download video info webpage')
|
% (proto, video_id, el_type))
|
||||||
video_info = compat_parse_qs(video_info_webpage)
|
video_info_webpage = self._download_webpage(
|
||||||
if 'token' in video_info:
|
video_info_url,
|
||||||
break
|
video_id, note=False,
|
||||||
|
errnote='unable to download video info webpage')
|
||||||
|
get_video_info = compat_parse_qs(video_info_webpage)
|
||||||
|
add_dash_mpd(get_video_info)
|
||||||
|
if not video_info:
|
||||||
|
video_info = get_video_info
|
||||||
|
if 'token' in get_video_info:
|
||||||
|
break
|
||||||
if 'token' not in video_info:
|
if 'token' not in video_info:
|
||||||
if 'reason' in video_info:
|
if 'reason' in video_info:
|
||||||
raise ExtractorError(
|
raise ExtractorError(
|
||||||
|
@ -1118,24 +1131,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
|
|
||||||
# Look for the DASH manifest
|
# Look for the DASH manifest
|
||||||
if self._downloader.params.get('youtube_include_dash_manifest', True):
|
if self._downloader.params.get('youtube_include_dash_manifest', True):
|
||||||
dash_mpd = video_info.get('dashmpd')
|
for dash_manifest_url in dash_mpds:
|
||||||
if dash_mpd:
|
dash_formats = {}
|
||||||
dash_manifest_url = dash_mpd[0]
|
|
||||||
try:
|
try:
|
||||||
dash_formats = self._parse_dash_manifest(
|
for df in self._parse_dash_manifest(
|
||||||
video_id, dash_manifest_url, player_url, age_gate)
|
video_id, dash_manifest_url, player_url, age_gate):
|
||||||
|
# Do not overwrite DASH format found in some previous DASH manifest
|
||||||
|
if df['format_id'] not in dash_formats:
|
||||||
|
dash_formats[df['format_id']] = df
|
||||||
except (ExtractorError, KeyError) as e:
|
except (ExtractorError, KeyError) as e:
|
||||||
self.report_warning(
|
self.report_warning(
|
||||||
'Skipping DASH manifest: %r' % e, video_id)
|
'Skipping DASH manifest: %r' % e, video_id)
|
||||||
else:
|
if dash_formats:
|
||||||
# Remove the formats we found through non-DASH, they
|
# Remove the formats we found through non-DASH, they
|
||||||
# contain less info and it can be wrong, because we use
|
# contain less info and it can be wrong, because we use
|
||||||
# fixed values (for example the resolution). See
|
# fixed values (for example the resolution). See
|
||||||
# https://github.com/rg3/youtube-dl/issues/5774 for an
|
# https://github.com/rg3/youtube-dl/issues/5774 for an
|
||||||
# example.
|
# example.
|
||||||
dash_keys = set(df['format_id'] for df in dash_formats)
|
dash_keys = set(df['format_id'] for df in dash_formats.values())
|
||||||
formats = [f for f in formats if f['format_id'] not in dash_keys]
|
formats = [f for f in formats if f['format_id'] not in dash_keys]
|
||||||
formats.extend(dash_formats)
|
formats.extend(dash_formats.values())
|
||||||
|
|
||||||
# Check for malformed aspect ratio
|
# Check for malformed aspect ratio
|
||||||
stretched_m = re.search(
|
stretched_m = re.search(
|
||||||
|
|
Loading…
Reference in a new issue