From aabc2be69316550cbec486bbde5695d1ae13ee9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 30 Aug 2015 08:07:07 +0600 Subject: [PATCH] [youtube] Simplify and extract more metadata from url_encoded_fmt_stream_map (Closes #5993) --- youtube_dl/extractor/youtube.py | 53 ++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3a2c7c562..030ec70ca 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1304,32 +1304,49 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'ratebypass' not in url: url += '&ratebypass=yes' - width = None - height = None - size_str = url_data.get('size', [''])[0] - if size_str.count('x') == 1: - width, height = [int_or_none(x) for x in size_str.split('x')] - - format_url = { + # Some itags are not included in DASH manifest thus corresponding formats will + # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). + # Trying to extract metadata from url_encoded_fmt_stream_map entry. 
+ mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) + width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + dct = { 'format_id': format_id, 'url': url, 'player_url': player_url, - # As of this writing these are only defined for DASH formats: 'filesize': int_or_none(url_data.get('clen', [None])[0]), - 'tbr': float_or_none(url_data.get('bitrate', [None])[0], scale=1024), + 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'width': width, 'height': height, 'fps': int_or_none(url_data.get('fps', [None])[0]), + 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], } - - # drop Nones so they do not overwrite the defaults from self._formats - format_url = dict((k, v) for k, v in format_url.items() if v is not None) - - format_full = self._formats.get(format_id, {}).copy() - format_full.update(format_url) - - formats.append(format_full) - + type_ = url_data.get('type', [None])[0] + if type_: + type_split = type_.split(';') + kind_ext = type_split[0].split('/') + if len(kind_ext) == 2: + kind, ext = kind_ext + dct['ext'] = ext + if kind in ('audio', 'video'): + codecs = None + for mobj in re.finditer( + r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_): + if mobj.group('key') == 'codecs': + codecs = mobj.group('val') + break + if codecs: + codecs = codecs.split(',') + if len(codecs) == 2: + acodec, vcodec = codecs[0], codecs[1] + else: + acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0]) + dct.update({ + 'acodec': acodec, + 'vcodec': vcodec, + }) + if format_id in self._formats: + dct.update(self._formats[format_id]) + formats.append(dct) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id)