Refactor fragments interface and dash segments downloader
- Eliminate segment_urls and initialization_url
+ Introduce manifest_url (a manifest may contain unfragmented data; in this case url is used for the direct media URL and manifest_url for the manifest itself)
* Rewrite the dashsegments downloader to use fragments data
* Improve generic mpd extraction
This commit is contained in:
		
							parent
							
								
									21d21b0c72
								
							
						
					
					
						commit
						86f4d14f81
					
				
					 3 changed files with 26 additions and 44 deletions
				
			
		| 
						 | 
				
			
			@ -1,7 +1,6 @@
 | 
			
		|||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
import os
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
from .fragment import FragmentFD
 | 
			
		||||
from ..compat import compat_urllib_error
 | 
			
		||||
| 
						 | 
				
			
			@ -19,34 +18,32 @@ class DashSegmentsFD(FragmentFD):
 | 
			
		|||
    FD_NAME = 'dashsegments'
 | 
			
		||||
 | 
			
		||||
    def real_download(self, filename, info_dict):
 | 
			
		||||
        base_url = info_dict['url']
 | 
			
		||||
        segment_urls = [info_dict['segment_urls'][0]] if self.params.get('test', False) else info_dict['segment_urls']
 | 
			
		||||
        initialization_url = info_dict.get('initialization_url')
 | 
			
		||||
        segments = info_dict['fragments'][:1] if self.params.get(
 | 
			
		||||
            'test', False) else info_dict['fragments']
 | 
			
		||||
 | 
			
		||||
        ctx = {
 | 
			
		||||
            'filename': filename,
 | 
			
		||||
            'total_frags': len(segment_urls) + (1 if initialization_url else 0),
 | 
			
		||||
            'total_frags': len(segments),
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        self._prepare_and_start_frag_download(ctx)
 | 
			
		||||
 | 
			
		||||
        def combine_url(base_url, target_url):
 | 
			
		||||
            if re.match(r'^https?://', target_url):
 | 
			
		||||
                return target_url
 | 
			
		||||
            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
 | 
			
		||||
 | 
			
		||||
        segments_filenames = []
 | 
			
		||||
 | 
			
		||||
        fragment_retries = self.params.get('fragment_retries', 0)
 | 
			
		||||
        skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
 | 
			
		||||
 | 
			
		||||
        def process_segment(segment, tmp_filename, fatal):
 | 
			
		||||
            target_url, segment_name = segment
 | 
			
		||||
        def process_segment(segment, tmp_filename, num):
 | 
			
		||||
            segment_url = segment['url']
 | 
			
		||||
            segment_name = 'Frag%d' % num
 | 
			
		||||
            target_filename = '%s-%s' % (tmp_filename, segment_name)
 | 
			
		||||
            # In DASH, the first segment contains necessary headers to
 | 
			
		||||
            # generate a valid MP4 file, so always abort for the first segment
 | 
			
		||||
            fatal = num == 0 or not skip_unavailable_fragments
 | 
			
		||||
            count = 0
 | 
			
		||||
            while count <= fragment_retries:
 | 
			
		||||
                try:
 | 
			
		||||
                    success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)})
 | 
			
		||||
                    success = ctx['dl'].download(target_filename, {'url': segment_url})
 | 
			
		||||
                    if not success:
 | 
			
		||||
                        return False
 | 
			
		||||
                    down, target_sanitized = sanitize_open(target_filename, 'rb')
 | 
			
		||||
| 
						 | 
				
			
			@ -72,16 +69,8 @@ class DashSegmentsFD(FragmentFD):
 | 
			
		|||
                return False
 | 
			
		||||
            return True
 | 
			
		||||
 | 
			
		||||
        segments_to_download = [(initialization_url, 'Init')] if initialization_url else []
 | 
			
		||||
        segments_to_download.extend([
 | 
			
		||||
            (segment_url, 'Seg%d' % i)
 | 
			
		||||
            for i, segment_url in enumerate(segment_urls)])
 | 
			
		||||
 | 
			
		||||
        for i, segment in enumerate(segments_to_download):
 | 
			
		||||
            # In DASH, the first segment contains necessary headers to
 | 
			
		||||
            # generate a valid MP4 file, so always abort for the first segment
 | 
			
		||||
            fatal = i == 0 or not skip_unavailable_fragments
 | 
			
		||||
            if not process_segment(segment, ctx['tmpfilename'], fatal):
 | 
			
		||||
        for i, segment in enumerate(segments):
 | 
			
		||||
            if not process_segment(segment, ctx['tmpfilename'], i):
 | 
			
		||||
                return False
 | 
			
		||||
 | 
			
		||||
        self._finish_frag_download(ctx)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -86,9 +86,10 @@ class InfoExtractor(object):
 | 
			
		|||
                    from worst to best quality.
 | 
			
		||||
 | 
			
		||||
                    Potential fields:
 | 
			
		||||
                    * url        Mandatory. The URL of the video file or URL of
 | 
			
		||||
                                 the manifest file in case of fragmented media
 | 
			
		||||
                                 (DASH, hls, hds).
 | 
			
		||||
                    * url        Mandatory. The URL of the video file
 | 
			
		||||
                    * manifest_url
 | 
			
		||||
                                 The URL of the manifest file in case of
 | 
			
		||||
                                 fragmented media (DASH, hls, hds)
 | 
			
		||||
                    * ext        Will be calculated from URL if missing
 | 
			
		||||
                    * format     A human-readable description of the format
 | 
			
		||||
                                 ("mp4 container with h264/opus").
 | 
			
		||||
| 
						 | 
				
			
			@ -1528,9 +1529,10 @@ class InfoExtractor(object):
 | 
			
		|||
        mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
 | 
			
		||||
 | 
			
		||||
        return self._parse_mpd_formats(
 | 
			
		||||
            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
 | 
			
		||||
            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
 | 
			
		||||
            formats_dict=formats_dict, mpd_url=mpd_url)
 | 
			
		||||
 | 
			
		||||
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
 | 
			
		||||
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
 | 
			
		||||
        """
 | 
			
		||||
        Parse formats from MPD manifest.
 | 
			
		||||
        References:
 | 
			
		||||
| 
						 | 
				
			
			@ -1654,6 +1656,7 @@ class InfoExtractor(object):
 | 
			
		|||
                        f = {
 | 
			
		||||
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
 | 
			
		||||
                            'url': base_url,
 | 
			
		||||
                            'manifest_url': mpd_url,
 | 
			
		||||
                            'ext': mimetype2ext(mime_type),
 | 
			
		||||
                            'width': int_or_none(representation_attrib.get('width')),
 | 
			
		||||
                            'height': int_or_none(representation_attrib.get('height')),
 | 
			
		||||
| 
						 | 
				
			
			@ -1682,14 +1685,6 @@ class InfoExtractor(object):
 | 
			
		|||
                                if 'total_number' not in representation_ms_info and 'segment_duration':
 | 
			
		||||
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
 | 
			
		||||
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
 | 
			
		||||
                                representation_ms_info['segment_urls'] = [
 | 
			
		||||
                                    media_template % {
 | 
			
		||||
                                        'Number': segment_number,
 | 
			
		||||
                                        'Bandwidth': representation_attrib.get('bandwidth'),
 | 
			
		||||
                                    }
 | 
			
		||||
                                    for segment_number in range(
 | 
			
		||||
                                        representation_ms_info['start_number'],
 | 
			
		||||
                                        representation_ms_info['total_number'] + representation_ms_info['start_number'])]
 | 
			
		||||
                                representation_ms_info['fragments'] = [{
 | 
			
		||||
                                    'url': media_template % {
 | 
			
		||||
                                        'Number': segment_number,
 | 
			
		||||
| 
						 | 
				
			
			@ -1703,7 +1698,6 @@ class InfoExtractor(object):
 | 
			
		|||
                                # $Number*$ or $Time$ in media template with S list available
 | 
			
		||||
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
 | 
			
		||||
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
 | 
			
		||||
                                representation_ms_info['segment_urls'] = []
 | 
			
		||||
                                representation_ms_info['fragments'] = []
 | 
			
		||||
                                segment_time = 0
 | 
			
		||||
                                segment_d = None
 | 
			
		||||
| 
						 | 
				
			
			@ -1715,7 +1709,6 @@ class InfoExtractor(object):
 | 
			
		|||
                                        'Bandwidth': representation_attrib.get('bandwidth'),
 | 
			
		||||
                                        'Number': segment_number,
 | 
			
		||||
                                    }
 | 
			
		||||
                                    representation_ms_info['segment_urls'].append(segment_url)
 | 
			
		||||
                                    representation_ms_info['fragments'].append({
 | 
			
		||||
                                        'url': segment_url,
 | 
			
		||||
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
 | 
			
		||||
| 
						 | 
				
			
			@ -1745,17 +1738,15 @@ class InfoExtractor(object):
 | 
			
		|||
                                        'duration': float_or_none(s['d'], representation_ms_info['timescale']),
 | 
			
		||||
                                    })
 | 
			
		||||
                            representation_ms_info['fragments'] = fragments
 | 
			
		||||
                        if 'segment_urls' in representation_ms_info:
 | 
			
		||||
                        # NB: MPD manifest may contain direct URLs to unfragmented media.
 | 
			
		||||
                        # No fragments key is present in this case.
 | 
			
		||||
                        if 'fragments' in representation_ms_info:
 | 
			
		||||
                            f.update({
 | 
			
		||||
                                'segment_urls': representation_ms_info['segment_urls'],
 | 
			
		||||
                                'fragments': [],
 | 
			
		||||
                                'protocol': 'http_dash_segments',
 | 
			
		||||
                            })
 | 
			
		||||
                            if 'initialization_url' in representation_ms_info:
 | 
			
		||||
                                initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
 | 
			
		||||
                                f.update({
 | 
			
		||||
                                    'initialization_url': initialization_url,
 | 
			
		||||
                                })
 | 
			
		||||
                                if not f.get('url'):
 | 
			
		||||
                                    f['url'] = initialization_url
 | 
			
		||||
                                f['fragments'].append({'url': initialization_url})
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1657,7 +1657,9 @@ class GenericIE(InfoExtractor):
 | 
			
		|||
                return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
 | 
			
		||||
            elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
 | 
			
		||||
                info_dict['formats'] = self._parse_mpd_formats(
 | 
			
		||||
                    doc, video_id, mpd_base_url=url.rpartition('/')[0])
 | 
			
		||||
                    doc, video_id,
 | 
			
		||||
                    mpd_base_url=full_response.geturl().rpartition('/')[0],
 | 
			
		||||
                    mpd_url=url)
 | 
			
		||||
                self._sort_formats(info_dict['formats'])
 | 
			
		||||
                return info_dict
 | 
			
		||||
            elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue