ArteTVIE: extract the video with the correct language
Some urls from the French version of the page could download the German version. Also instead of extracting the json url from the webpage, build it to skip the download
This commit is contained in:
parent
24a267b562
commit
9826925a20
1 changed files with 15 additions and 5 deletions
|
@ -16,7 +16,7 @@ class ArteTvIE(InfoExtractor):
|
||||||
www.arte.tv/guide, the extraction process is different for each one.
|
www.arte.tv/guide, the extraction process is different for each one.
|
||||||
The videos expire in 7 days, so we can't add tests.
|
The videos expire in 7 days, so we can't add tests.
|
||||||
"""
|
"""
|
||||||
_EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
|
_EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
|
||||||
_VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
|
_VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
|
||||||
_LIVE_URL = r'index-[0-9]+\.html$'
|
_LIVE_URL = r'index-[0-9]+\.html$'
|
||||||
|
|
||||||
|
@ -57,10 +57,11 @@ class ArteTvIE(InfoExtractor):
|
||||||
mobj = re.match(self._EMISSION_URL, url)
|
mobj = re.match(self._EMISSION_URL, url)
|
||||||
if mobj is not None:
|
if mobj is not None:
|
||||||
name = mobj.group('name')
|
name = mobj.group('name')
|
||||||
|
lang = mobj.group('lang')
|
||||||
# This is not a real id, it can be for example AJT for the news
|
# This is not a real id, it can be for example AJT for the news
|
||||||
# http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
|
# http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
|
||||||
video_id = mobj.group('id')
|
video_id = mobj.group('id')
|
||||||
return self._extract_emission(url, video_id)
|
return self._extract_emission(url, video_id, lang)
|
||||||
|
|
||||||
mobj = re.match(self._VIDEOS_URL, url)
|
mobj = re.match(self._VIDEOS_URL, url)
|
||||||
if mobj is not None:
|
if mobj is not None:
|
||||||
|
@ -72,10 +73,9 @@ class ArteTvIE(InfoExtractor):
|
||||||
# self.extractLiveStream(url)
|
# self.extractLiveStream(url)
|
||||||
# return
|
# return
|
||||||
|
|
||||||
def _extract_emission(self, url, video_id):
|
def _extract_emission(self, url, video_id, lang):
|
||||||
"""Extract from www.arte.tv/guide"""
|
"""Extract from www.arte.tv/guide"""
|
||||||
webpage = self._download_webpage(url, video_id)
|
json_url = 'http://org-www.arte.tv/papi/tvguide/videos/stream/player/F/%s_PLUS7-F/ALL/ALL.json' % video_id
|
||||||
json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
|
|
||||||
|
|
||||||
json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
|
json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
|
||||||
self.report_extraction(video_id)
|
self.report_extraction(video_id)
|
||||||
|
@ -91,6 +91,16 @@ class ArteTvIE(InfoExtractor):
|
||||||
}
|
}
|
||||||
|
|
||||||
formats = player_info['VSR'].values()
|
formats = player_info['VSR'].values()
|
||||||
|
def _match_lang(f):
|
||||||
|
# Return true if that format is in the language of the url
|
||||||
|
if lang == 'fr':
|
||||||
|
l = 'F'
|
||||||
|
elif lang == 'de':
|
||||||
|
l = 'A'
|
||||||
|
regexes = [r'VO?%s' % l, r'V%s-ST.' % l]
|
||||||
|
return any(re.match(r, f['versionCode']) for r in regexes)
|
||||||
|
# Some formats may not be in the same language as the url
|
||||||
|
formats = filter(_match_lang, formats)
|
||||||
# We order the formats by quality
|
# We order the formats by quality
|
||||||
formats = sorted(formats, key=lambda f: int(f['height']))
|
formats = sorted(formats, key=lambda f: int(f['height']))
|
||||||
# Pick the best quality
|
# Pick the best quality
|
||||||
|
|
Loading…
Reference in a new issue