[extractor/common] Improve HTML5 entries extraction and add some realworld tests
This commit is contained in:
		
							parent
							
								
									0dc41787af
								
							
						
					
					
						commit
						d493f15c11
					
				
					 2 changed files with 214 additions and 7 deletions
				
			
		| 
						 | 
				
			
			@ -107,6 +107,184 @@ class TestInfoExtractor(unittest.TestCase):
 | 
			
		|||
        self.assertRaises(ExtractorError, self.ie._download_json, uri, None)
 | 
			
		||||
        self.assertEqual(self.ie._download_json(uri, None, fatal=False), None)
 | 
			
		||||
 | 
			
		||||
    def test_parse_html5_media_entries(self):
        # Each case feeds a real-world <video> snippet to
        # _parse_html5_media_entries and compares the first returned entry
        # with the expected dict through expect_dict.  The snippets cover
        # <source> attributes seen in the wild: res, label, width/height,
        # data-src and data-video-src.

        # from https://www.r18.com/
        # with kbps in label
        expect_dict(
            self,
            self.ie._parse_html5_media_entries(
                'https://www.r18.com/',
                r'''
                <video id="samplevideo_amateur" class="js-samplevideo video-js vjs-default-skin vjs-big-play-centered" controls preload="auto" width="400" height="225" poster="//pics.r18.com/digital/amateur/mgmr105/mgmr105jp.jpg">
                    <source id="video_source" src="https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_sm_w.mp4" type="video/mp4"  res="240" label="300kbps">
                    <source id="video_source" src="https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dm_w.mp4" type="video/mp4"  res="480" label="1000kbps">
                    <source id="video_source" src="https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dmb_w.mp4" type="video/mp4"  res="740" label="1500kbps">
                    <p>Your browser does not support the video tag.</p>
                </video>
                ''', None)[0],
            {
                'formats': [{
                    'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_sm_w.mp4',
                    'ext': 'mp4',
                    'format_id': '300kbps',
                    'height': 240,
                    'tbr': 300,
                }, {
                    'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dm_w.mp4',
                    'ext': 'mp4',
                    'format_id': '1000kbps',
                    'height': 480,
                    'tbr': 1000,
                }, {
                    'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dmb_w.mp4',
                    'ext': 'mp4',
                    'format_id': '1500kbps',
                    'height': 740,
                    'tbr': 1500,
                }],
                'thumbnail': '//pics.r18.com/digital/amateur/mgmr105/mgmr105jp.jpg'
            })

        # from https://www.csfd.cz/
        # with width and height
        expect_dict(
            self,
            self.ie._parse_html5_media_entries(
                'https://www.csfd.cz/',
                r'''
                <video width="770" height="328" preload="none" controls  poster="https://img.csfd.cz/files/images/film/video/preview/163/344/163344118_748d20.png?h360" >
                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327358_eac647.mp4" type="video/mp4" width="640" height="360">
                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327360_3d2646.mp4" type="video/mp4" width="1280" height="720">
                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327356_91f258.mp4" type="video/mp4" width="1920" height="1080">
                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327359_962b4a.webm" type="video/webm" width="640" height="360">
                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327361_6feee0.webm" type="video/webm" width="1280" height="720">
                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327357_8ab472.webm" type="video/webm" width="1920" height="1080">
                    <track src="https://video.csfd.cz/files/subtitles/163/344/163344115_4c388b.srt" type="text/x-srt" kind="subtitles" srclang="cs" label="cs">
                </video>
                ''', None)[0],
            {
                'formats': [{
                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327358_eac647.mp4',
                    'ext': 'mp4',
                    'width': 640,
                    'height': 360,
                }, {
                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327360_3d2646.mp4',
                    'ext': 'mp4',
                    'width': 1280,
                    'height': 720,
                }, {
                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327356_91f258.mp4',
                    'ext': 'mp4',
                    'width': 1920,
                    'height': 1080,
                }, {
                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327359_962b4a.webm',
                    'ext': 'webm',
                    'width': 640,
                    'height': 360,
                }, {
                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327361_6feee0.webm',
                    'ext': 'webm',
                    'width': 1280,
                    'height': 720,
                }, {
                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327357_8ab472.webm',
                    'ext': 'webm',
                    'width': 1920,
                    'height': 1080,
                }],
                'subtitles': {
                    'cs': [{'url': 'https://video.csfd.cz/files/subtitles/163/344/163344115_4c388b.srt'}]
                },
                'thumbnail': 'https://img.csfd.cz/files/images/film/video/preview/163/344/163344118_748d20.png?h360'
            })

        # from https://tamasha.com/v/Kkdjw
        # with height in label
        expect_dict(
            self,
            self.ie._parse_html5_media_entries(
                'https://tamasha.com/v/Kkdjw',
                r'''
                <video crossorigin="anonymous">
                        <source src="https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4" type="video/mp4" label="AUTO" res="0"/>
                                <source src="https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4" type="video/mp4"
                                        label="240p" res="240"/>
                                <source src="https://s-v2.tamasha.com/statics/videos_file/20/00/Kkdjw_200041c66f657fc967db464d156eafbc1ed9fe6f_n_144.mp4" type="video/mp4"
                                        label="144p" res="144"/>
                </video>
                ''', None)[0],
            {
                'formats': [{
                    # the "AUTO" source only has its URL pinned here
                    'url': 'https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4',
                }, {
                    'url': 'https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4',
                    'ext': 'mp4',
                    'format_id': '240p',
                    'height': 240,
                }, {
                    'url': 'https://s-v2.tamasha.com/statics/videos_file/20/00/Kkdjw_200041c66f657fc967db464d156eafbc1ed9fe6f_n_144.mp4',
                    'ext': 'mp4',
                    'format_id': '144p',
                    'height': 144,
                }]
            })

        # from https://www.directvnow.com
        # with data-src
        expect_dict(
            self,
            self.ie._parse_html5_media_entries(
                'https://www.directvnow.com',
                r'''
                <video id="vid1" class="header--video-masked active" muted playsinline>
                    <source data-src="https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4" type="video/mp4" />
                </video>
                ''', None)[0],
            {
                'formats': [{
                    'url': 'https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4',
                    'ext': 'mp4',
                }]
            })

        # from https://www.klarna.com/uk/
        # with data-video-src
        expect_dict(
            self,
            self.ie._parse_html5_media_entries(
                'https://www.klarna.com/uk/',
                r'''
                <video loop autoplay muted class="responsive-video block-kl__video video-on-medium">
                    <source src="" data-video-desktop data-video-src="https://www.klarna.com/uk/wp-content/uploads/sites/11/2019/01/KL062_Smooth3_0_DogWalking_5s_920x080_.mp4" type="video/mp4" />
                </video>
                ''', None)[0],
            {
                'formats': [{
                    'url': 'https://www.klarna.com/uk/wp-content/uploads/sites/11/2019/01/KL062_Smooth3_0_DogWalking_5s_920x080_.mp4',
                    'ext': 'mp4',
                }],
            })
 | 
			
		||||
 | 
			
		||||
    def test_extract_jwplayer_data_realworld(self):
 | 
			
		||||
        # from http://www.suffolk.edu/sjc/
 | 
			
		||||
        expect_dict(
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -44,6 +44,7 @@ from ..utils import (
 | 
			
		|||
    compiled_regex_type,
 | 
			
		||||
    determine_ext,
 | 
			
		||||
    determine_protocol,
 | 
			
		||||
    dict_get,
 | 
			
		||||
    error_to_compat_str,
 | 
			
		||||
    ExtractorError,
 | 
			
		||||
    extract_attributes,
 | 
			
		||||
| 
						 | 
				
			
			@ -56,13 +57,16 @@ from ..utils import (
 | 
			
		|||
    JSON_LD_RE,
 | 
			
		||||
    mimetype2ext,
 | 
			
		||||
    orderedSet,
 | 
			
		||||
    parse_bitrate,
 | 
			
		||||
    parse_codecs,
 | 
			
		||||
    parse_duration,
 | 
			
		||||
    parse_iso8601,
 | 
			
		||||
    parse_m3u8_attributes,
 | 
			
		||||
    parse_resolution,
 | 
			
		||||
    RegexNotFoundError,
 | 
			
		||||
    sanitized_Request,
 | 
			
		||||
    sanitize_filename,
 | 
			
		||||
    str_or_none,
 | 
			
		||||
    unescapeHTML,
 | 
			
		||||
    unified_strdate,
 | 
			
		||||
    unified_timestamp,
 | 
			
		||||
| 
						 | 
				
			
			@ -2481,18 +2485,43 @@ class InfoExtractor(object):
 | 
			
		|||
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
 | 
			
		||||
            if media_content:
 | 
			
		||||
                for source_tag in re.findall(r'<source[^>]+>', media_content):
 | 
			
		||||
                    source_attributes = extract_attributes(source_tag)
 | 
			
		||||
                    src = source_attributes.get('src')
 | 
			
		||||
                    s_attr = extract_attributes(source_tag)
 | 
			
		||||
                    # data-video-src and data-src are non-standard but seen
 | 
			
		||||
                    # several times in the wild
 | 
			
		||||
                    src = dict_get(s_attr, ('src', 'data-video-src', 'data-src'))
 | 
			
		||||
                    if not src:
 | 
			
		||||
                        continue
 | 
			
		||||
                    f = parse_content_type(source_attributes.get('type'))
 | 
			
		||||
                    f = parse_content_type(s_attr.get('type'))
 | 
			
		||||
                    is_plain_url, formats = _media_formats(src, media_type, f)
 | 
			
		||||
                    if is_plain_url:
 | 
			
		||||
                        # res attribute is not standard but seen several times
 | 
			
		||||
                        # in the wild
 | 
			
		||||
                        # width, height, res, label and title attributes are
 | 
			
		||||
                        # all non-standard but seen several times in the wild
 | 
			
		||||
                        labels = [
 | 
			
		||||
                            s_attr.get(lbl)
 | 
			
		||||
                            for lbl in ('label', 'title')
 | 
			
		||||
                            if str_or_none(s_attr.get(lbl))
 | 
			
		||||
                        ]
 | 
			
		||||
                        width = int_or_none(s_attr.get('width'))
 | 
			
		||||
                        height = (int_or_none(s_attr.get('height')) or
 | 
			
		||||
                                  int_or_none(s_attr.get('res')))
 | 
			
		||||
                        if not width or not height:
 | 
			
		||||
                            for lbl in labels:
 | 
			
		||||
                                resolution = parse_resolution(lbl)
 | 
			
		||||
                                if not resolution:
 | 
			
		||||
                                    continue
 | 
			
		||||
                                width = width or resolution.get('width')
 | 
			
		||||
                                height = height or resolution.get('height')
 | 
			
		||||
                        for lbl in labels:
 | 
			
		||||
                            tbr = parse_bitrate(lbl)
 | 
			
		||||
                            if tbr:
 | 
			
		||||
                                break
 | 
			
		||||
                        else:
 | 
			
		||||
                            tbr = None
 | 
			
		||||
                        f.update({
 | 
			
		||||
                            'height': int_or_none(source_attributes.get('res')),
 | 
			
		||||
                            'format_id': source_attributes.get('label'),
 | 
			
		||||
                            'width': width,
 | 
			
		||||
                            'height': height,
 | 
			
		||||
                            'tbr': tbr,
 | 
			
		||||
                            'format_id': s_attr.get('label') or s_attr.get('title'),
 | 
			
		||||
                        })
 | 
			
		||||
                        f.update(formats[0])
 | 
			
		||||
                        media_info['formats'].append(f)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue