[arte] Add support for playlists and rework tests (Closes #9632)
This commit is contained in:
		
							parent
							
								
									6a1df4fb5f
								
							
						
					
					
						commit
						6e6b9f600f
					
				
					 2 changed files with 110 additions and 64 deletions
				
			
		| 
						 | 
				
			
			@ -61,10 +61,7 @@ class ArteTvIE(InfoExtractor):
 | 
			
		|||
        }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ArteTVPlus7IE(InfoExtractor):
 | 
			
		||||
    IE_NAME = 'arte.tv:+7'
 | 
			
		||||
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)'
 | 
			
		||||
 | 
			
		||||
class ArteTVBaseIE(InfoExtractor):
 | 
			
		||||
    @classmethod
 | 
			
		||||
    def _extract_url_info(cls, url):
 | 
			
		||||
        mobj = re.match(cls._VALID_URL, url)
 | 
			
		||||
| 
						 | 
				
			
			@ -78,60 +75,6 @@ class ArteTVPlus7IE(InfoExtractor):
 | 
			
		|||
            video_id = mobj.group('id')
 | 
			
		||||
        return video_id, lang
 | 
			
		||||
 | 
			
		||||
    def _real_extract(self, url):
        """Download the page behind *url* and extract the video from it.

        The id and language are taken from the URL via
        ``_extract_url_info``; the actual extraction is delegated to
        ``_extract_from_webpage``.
        """
        video_id, lang = self._extract_url_info(url)
        page = self._download_webpage(url, video_id)
        return self._extract_from_webpage(page, video_id, lang)
 | 
			
		||||
 | 
			
		||||
    def _extract_from_webpage(self, webpage, video_id, lang):
        """Locate the player JSON URL in *webpage* and extract the video.

        Lookup order:
          1. ``arte_vp_url`` / ``data-url`` attributes, preferring values
             that contain *video_id*;
          2. an ``<iframe>`` whose ``src`` carries a ``json_url`` parameter,
             searched in the page itself, then in the oEmbed player HTML,
             then in the ``embed_html`` of the page's ``program`` object;
          3. as a last resort, the first ``<iframe>`` ``src`` on the page,
             which is returned via ``url_result`` instead of being
             extracted here.

        Returns whatever ``_extract_from_json_url`` or ``url_result``
        returns.
        """
        templates = (
            r'arte_vp_url=["\'](.*?%s.*?)["\']',
            r'data-url=["\']([^"]+%s[^"]+)["\']',
        )
        # Some pages contain several videos (like
        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
        # so patterns containing the video id (from the 'vid' parameter) are
        # tried first; the empty id yields the generic fallback patterns.
        patterns = []
        for candidate_id in (video_id, ''):
            escaped_id = re.escape(candidate_id)
            for template in templates:
                patterns.append(template % escaped_id)

        json_url = self._html_search_regex(
            patterns, webpage, 'json vp url', default=None)

        if not json_url:
            def search_player_iframe(html, default=NO_DEFAULT):
                # Find an iframe whose src carries a json_url parameter.
                return self._html_search_regex(
                    r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
                    html, 'iframe url', group='url', default=default)

            iframe_url = search_player_iframe(webpage, None)

            if not iframe_url:
                oembed_url = self._html_search_regex(
                    r'arte_vp_url_oembed=\'([^\']+?)\'',
                    webpage, 'embed url', default=None)
                if oembed_url:
                    player = self._download_json(
                        oembed_url, video_id, 'Downloading player page')
                    # No default here: the search helper is called with
                    # NO_DEFAULT for the player HTML.
                    iframe_url = search_player_iframe(player['html'])

            # en and es URLs produce react-based pages with a different
            # layout (e.g.
            # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
            if not iframe_url:
                program_json = self._search_regex(
                    r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
                    webpage, 'program', default=None)
                if program_json:
                    program = self._parse_json(program_json, video_id)
                    if program:
                        iframe_url = search_player_iframe(program['embed_html'])

            if iframe_url:
                iframe_query = compat_parse_qs(
                    compat_urllib_parse_urlparse(iframe_url).query)
                json_url = iframe_query['json_url'][0]

        if not json_url:
            # Different kind of embed URL (e.g.
            # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
            fallback_url = self._search_regex(
                r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
                webpage, 'embed url', group='url')
            return self.url_result(fallback_url)

        title = self._search_regex(
            r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
            webpage, 'title', default=None, group='title')
        return self._extract_from_json_url(
            json_url, video_id, lang, title=title)
 | 
			
		||||
 | 
			
		||||
    def _extract_from_json_url(self, json_url, video_id, lang, title=None):
 | 
			
		||||
        info = self._download_json(json_url, video_id)
 | 
			
		||||
        player_info = info['videoJsonPlayer']
 | 
			
		||||
| 
						 | 
				
			
			@ -235,6 +178,74 @@ class ArteTVPlus7IE(InfoExtractor):
 | 
			
		|||
        return info_dict
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ArteTVPlus7IE(ArteTVBaseIE):
    """Extractor for single videos under arte.tv/guide/<lang>/...

    URLs that also match ``ArteTVPlaylistIE`` are rejected by ``suitable``.
    """

    IE_NAME = 'arte.tv:+7'
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)'

    _TESTS = [{
        'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return False for playlist URLs, else defer to the parent check."""
        if ArteTVPlaylistIE.suitable(url):
            return False
        return super(ArteTVPlus7IE, cls).suitable(url)

    def _real_extract(self, url):
        """Download the page behind *url* and extract the video from it."""
        video_id, lang = self._extract_url_info(url)
        page = self._download_webpage(url, video_id)
        return self._extract_from_webpage(page, video_id, lang)

    def _extract_from_webpage(self, webpage, video_id, lang):
        """Locate the player JSON URL in *webpage* and extract the video.

        Lookup order:
          1. ``arte_vp_url`` / ``data-url`` attributes, preferring values
             that contain *video_id*;
          2. an ``<iframe>`` whose ``src`` carries a ``json_url`` parameter,
             searched in the page itself, then in the oEmbed player HTML,
             then in the ``embed_html`` of the page's ``program`` object;
          3. as a last resort, the first ``<iframe>`` ``src`` on the page,
             which is returned via ``url_result`` instead of being
             extracted here.
        """
        templates = (
            r'arte_vp_url=["\'](.*?%s.*?)["\']',
            r'data-url=["\']([^"]+%s[^"]+)["\']',
        )
        # Some pages contain several videos (like
        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
        # so patterns containing the video id (from the 'vid' parameter) are
        # tried first; the empty id yields the generic fallback patterns.
        patterns = []
        for candidate_id in (video_id, ''):
            escaped_id = re.escape(candidate_id)
            for template in templates:
                patterns.append(template % escaped_id)

        json_url = self._html_search_regex(
            patterns, webpage, 'json vp url', default=None)

        if not json_url:
            def search_player_iframe(html, default=NO_DEFAULT):
                # Find an iframe whose src carries a json_url parameter.
                return self._html_search_regex(
                    r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
                    html, 'iframe url', group='url', default=default)

            iframe_url = search_player_iframe(webpage, None)

            if not iframe_url:
                oembed_url = self._html_search_regex(
                    r'arte_vp_url_oembed=\'([^\']+?)\'',
                    webpage, 'embed url', default=None)
                if oembed_url:
                    player = self._download_json(
                        oembed_url, video_id, 'Downloading player page')
                    # No default here: the search helper is called with
                    # NO_DEFAULT for the player HTML.
                    iframe_url = search_player_iframe(player['html'])

            # en and es URLs produce react-based pages with a different
            # layout (e.g.
            # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
            if not iframe_url:
                program_json = self._search_regex(
                    r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
                    webpage, 'program', default=None)
                if program_json:
                    program = self._parse_json(program_json, video_id)
                    if program:
                        iframe_url = search_player_iframe(program['embed_html'])

            if iframe_url:
                iframe_query = compat_parse_qs(
                    compat_urllib_parse_urlparse(iframe_url).query)
                json_url = iframe_query['json_url'][0]

        if not json_url:
            # Different kind of embed URL (e.g.
            # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
            fallback_url = self._search_regex(
                r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
                webpage, 'embed url', group='url')
            return self.url_result(fallback_url)

        title = self._search_regex(
            r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
            webpage, 'title', default=None, group='title')
        return self._extract_from_json_url(
            json_url, video_id, lang, title=title)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# ArteTVCreativeIE also uses the arte_vp_url URL from the webpage to extract the information
 | 
			
		||||
class ArteTVCreativeIE(ArteTVPlus7IE):
 | 
			
		||||
    IE_NAME = 'arte.tv:creative'
 | 
			
		||||
| 
						 | 
				
			
			@ -267,7 +278,7 @@ class ArteTVInfoIE(ArteTVPlus7IE):
 | 
			
		|||
    IE_NAME = 'arte.tv:info'
 | 
			
		||||
    _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 | 
			
		||||
 | 
			
		||||
    _TEST = {
 | 
			
		||||
    _TESTS = [{
 | 
			
		||||
        'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere',
 | 
			
		||||
        'info_dict': {
 | 
			
		||||
            'id': '067528-000-A',
 | 
			
		||||
| 
						 | 
				
			
			@ -275,7 +286,7 @@ class ArteTVInfoIE(ArteTVPlus7IE):
 | 
			
		|||
            'title': 'Service civique, un cache misère ?',
 | 
			
		||||
            'upload_date': '20160403',
 | 
			
		||||
        },
 | 
			
		||||
    }
 | 
			
		||||
    }]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ArteTVFutureIE(ArteTVPlus7IE):
 | 
			
		||||
| 
						 | 
				
			
			@ -300,6 +311,8 @@ class ArteTVDDCIE(ArteTVPlus7IE):
 | 
			
		|||
    IE_NAME = 'arte.tv:ddc'
 | 
			
		||||
    _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)'
 | 
			
		||||
 | 
			
		||||
    _TESTS = []
 | 
			
		||||
 | 
			
		||||
    def _real_extract(self, url):
 | 
			
		||||
        video_id, lang = self._extract_url_info(url)
 | 
			
		||||
        if lang == 'folge':
 | 
			
		||||
| 
						 | 
				
			
			@ -318,7 +331,7 @@ class ArteTVConcertIE(ArteTVPlus7IE):
 | 
			
		|||
    IE_NAME = 'arte.tv:concert'
 | 
			
		||||
    _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
 | 
			
		||||
 | 
			
		||||
    _TEST = {
 | 
			
		||||
    _TESTS = [{
 | 
			
		||||
        'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
 | 
			
		||||
        'md5': '9ea035b7bd69696b67aa2ccaaa218161',
 | 
			
		||||
        'info_dict': {
 | 
			
		||||
| 
						 | 
				
			
			@ -328,14 +341,14 @@ class ArteTVConcertIE(ArteTVPlus7IE):
 | 
			
		|||
            'upload_date': '20140128',
 | 
			
		||||
            'description': 'md5:486eb08f991552ade77439fe6d82c305',
 | 
			
		||||
        },
 | 
			
		||||
    }
 | 
			
		||||
    }]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ArteTVCinemaIE(ArteTVPlus7IE):
 | 
			
		||||
    IE_NAME = 'arte.tv:cinema'
 | 
			
		||||
    _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)'
 | 
			
		||||
 | 
			
		||||
    _TEST = {
 | 
			
		||||
    _TESTS = [{
 | 
			
		||||
        'url': 'http://cinema.arte.tv/de/node/38291',
 | 
			
		||||
        'md5': '6b275511a5107c60bacbeeda368c3aa1',
 | 
			
		||||
        'info_dict': {
 | 
			
		||||
| 
						 | 
				
			
			@ -345,7 +358,7 @@ class ArteTVCinemaIE(ArteTVPlus7IE):
 | 
			
		|||
            'upload_date': '20160122',
 | 
			
		||||
            'description': 'md5:7f749bbb77d800ef2be11d54529b96bc',
 | 
			
		||||
        },
 | 
			
		||||
    }
 | 
			
		||||
    }]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ArteTVMagazineIE(ArteTVPlus7IE):
 | 
			
		||||
| 
						 | 
				
			
			@ -390,9 +403,41 @@ class ArteTVEmbedIE(ArteTVPlus7IE):
 | 
			
		|||
        )
 | 
			
		||||
    '''
 | 
			
		||||
 | 
			
		||||
    _TESTS = []
 | 
			
		||||
 | 
			
		||||
    def _real_extract(self, url):
        """Extract the video using the fields captured by ``_VALID_URL``.

        The id, language and player JSON URL are all read from the URL
        match, so no webpage is downloaded here.
        """
        match = re.match(self._VALID_URL, url)
        video_id, lang, json_url = match.group('id', 'lang', 'json_url')
        return self._extract_from_json_url(json_url, video_id, lang)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for arte.tv guide collections (``#collection/PL-...``)."""

    IE_NAME = 'arte.tv:playlist'
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)'

    _TESTS = [{
        'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV',
        'info_dict': {
            'id': 'PL-013263',
            'title': 'Areva & Uramin',
        },
        'playlist_mincount': 6,
    }, {
        'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the collection metadata and extract each listed video.

        Videos without a ``jsonUrl`` are skipped. Each remaining entry is
        extracted through ``_extract_from_json_url``, using the video's
        ``programId`` as id and falling back to the playlist id.
        """
        playlist_id, lang = self._extract_url_info(url)

        api_url = (
            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
            % (lang, playlist_id))
        collection = self._download_json(api_url, playlist_id)

        title = collection.get('title')
        description = (
            collection.get('shortDescription')
            or collection.get('teaserText'))

        entries = []
        for video in collection['videos']:
            json_url = video.get('jsonUrl')
            if not json_url:
                continue
            entry_id = video.get('programId') or playlist_id
            entries.append(
                self._extract_from_json_url(json_url, entry_id, lang))

        return self.playlist_result(entries, playlist_id, title, description)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -56,6 +56,7 @@ from .arte import (
 | 
			
		|||
    ArteTVDDCIE,
 | 
			
		||||
    ArteTVMagazineIE,
 | 
			
		||||
    ArteTVEmbedIE,
 | 
			
		||||
    ArteTVPlaylistIE,
 | 
			
		||||
)
 | 
			
		||||
from .atresplayer import AtresPlayerIE
 | 
			
		||||
from .atttechchannel import ATTTechChannelIE
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue