Merge pull request #8876 from remitamine/html5_media
[extractor/common] add helper method to extract html5 media entries
This commit is contained in:
commit
0b68de3cc1
3 changed files with 118 additions and 0 deletions
|
@ -81,6 +81,7 @@ from youtube_dl.utils import (
|
|||
cli_option,
|
||||
cli_valueless_option,
|
||||
cli_bool_option,
|
||||
parse_codecs,
|
||||
)
|
||||
from youtube_dl.compat import (
|
||||
compat_chr,
|
||||
|
@ -608,6 +609,29 @@ class TestUtil(unittest.TestCase):
|
|||
limit_length('foo bar baz asd', 12).startswith('foo bar'))
|
||||
self.assertTrue('...' in limit_length('foo bar baz asd', 12))
|
||||
|
||||
def test_parse_codecs(self):
    # Each pair is (codecs string, dict parse_codecs is expected to return).
    # The cases are checked in order, so the first mismatch fails the test.
    expectations = (
        ('', {}),
        ('avc1.77.30, mp4a.40.2', {
            'vcodec': 'avc1.77.30',
            'acodec': 'mp4a.40.2',
        }),
        # Audio codec only: the video slot is reported as 'none'.
        ('mp4a.40.2', {
            'vcodec': 'none',
            'acodec': 'mp4a.40.2',
        }),
        # The order of the codecs in the string does not matter.
        ('mp4a.40.5,avc1.42001e', {
            'vcodec': 'avc1.42001e',
            'acodec': 'mp4a.40.5',
        }),
        # Video codec only: the audio slot is reported as 'none'.
        ('avc3.640028', {
            'vcodec': 'avc3.640028',
            'acodec': 'none',
        }),
        # Empty items and an unrecognised codec name are tolerated.
        (', h264,,newcodec,aac', {
            'vcodec': 'h264',
            'acodec': 'aac',
        }),
    )
    for codecs_str, expected in expectations:
        self.assertEqual(parse_codecs(codecs_str), expected)
|
||||
|
||||
def test_escape_rfc3986(self):
|
||||
reserved = "!*'();:@&=+$,/?#[]"
|
||||
unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~'
|
||||
|
|
|
@ -55,6 +55,8 @@ from ..utils import (
|
|||
update_Request,
|
||||
update_url_query,
|
||||
parse_m3u8_attributes,
|
||||
extract_attributes,
|
||||
parse_codecs,
|
||||
)
|
||||
|
||||
|
||||
|
@ -1635,6 +1637,62 @@ class InfoExtractor(object):
|
|||
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
|
||||
return formats
|
||||
|
||||
def _parse_html5_media_entries(self, base_url, webpage):
    """Collect info dicts for the <video>/<audio> elements found in webpage.

    base_url is the URL that relative references in the markup are resolved
    against; webpage is the HTML text to scan.

    Returns a list with one dict per media element that yielded at least
    one format. Each dict has the keys 'formats' (list of format dicts),
    'subtitles' (language -> list of {'url': ...}) and 'thumbnail' (the
    absolute poster URL, or None when the element has no poster).
    """
    def absolute_url(video_url):
        return compat_urlparse.urljoin(base_url, video_url)

    def parse_content_type(content_type):
        # Turn e.g. 'video/mp4; codecs="avc1.42E01E, mp4a.40.2"' into a
        # partial format dict; the codecs parameter is optional.
        if not content_type:
            return {}
        ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
        if not ctr:
            return {}
        mimetype, codecs = ctr.groups()
        f = parse_codecs(codecs)
        f['ext'] = mimetype2ext(mimetype)
        return f

    entries = []
    for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
        is_audio = media_type == 'audio'
        media_info = {
            'formats': [],
            'subtitles': {},
        }
        media_attributes = extract_attributes(media_tag)

        # A src attribute directly on the media tag is a format of its own.
        src = media_attributes.get('src')
        if src:
            media_info['formats'].append({
                'url': absolute_url(src),
                'vcodec': 'none' if is_audio else None,
            })

        # The poster is a URL like any other and may be relative, so it is
        # resolved against base_url instead of being stored verbatim.
        poster = media_attributes.get('poster')
        media_info['thumbnail'] = absolute_url(poster) if poster else None

        if media_content:
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                source_attributes = extract_attributes(source_tag)
                src = source_attributes.get('src')
                if not src:
                    continue
                f = parse_content_type(source_attributes.get('type'))
                f['url'] = absolute_url(src)
                if is_audio:
                    # <audio> never carries a video track.
                    f['vcodec'] = 'none'
                else:
                    # Keep a video codec parsed from the type attribute;
                    # only fall back to None (unknown) when none was given.
                    f.setdefault('vcodec', None)
                media_info['formats'].append(f)

            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                # Tracks without a kind are handled like subtitle tracks;
                # every other kind is ignored.
                if kind and kind != 'subtitles':
                    continue
                src = track_attributes.get('src')
                if not src:
                    continue
                lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                media_info['subtitles'].setdefault(lang, []).append({
                    'url': absolute_url(src),
                })

        # Elements without any playable source are not reported.
        if media_info['formats']:
            entries.append(media_info)
    return entries
|
||||
|
||||
def _live_title(self, name):
|
||||
""" Generate the title for a live video """
|
||||
now = datetime.datetime.now()
|
||||
|
|
|
@ -2126,6 +2126,42 @@ def mimetype2ext(mt):
|
|||
}.get(res, res)
|
||||
|
||||
|
||||
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into a video and an audio codec.

    http://tools.ietf.org/html/rfc6381

    codecs_str is a comma-separated list such as 'avc1.77.30, mp4a.40.2'.
    Whitespace around items and empty items are ignored.

    Returns a dict with the keys 'vcodec' and 'acodec':
      * when at least one codec is recognised, the first recognised video
        and audio codecs, with 'none' standing in for a missing kind;
      * when nothing is recognised but exactly two codecs are listed, the
        two codecs in the order given (video first, then audio).
    Returns {} for empty input and whenever the codecs cannot be
    classified, e.g. a single unrecognised codec.
    """
    if not codecs_str:
        return {}
    split_codecs = [
        item.strip() for item in codecs_str.split(',') if item.strip()]
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        # Only the part before the first dot names the codec family.
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    if len(split_codecs) == 2:
        # Nothing was recognised, so vcodec/acodec are both None here;
        # fall back to the listed codecs, taken as video then audio.
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    # Any other count of unrecognised codecs cannot be told apart as video
    # or audio, so no codec information is reported.
    return {}
|
||||
|
||||
|
||||
def urlhandle_detect_ext(url_handle):
|
||||
getheader = url_handle.headers.get
|
||||
|
||||
|
|
Loading…
Reference in a new issue