[viafree] Improve video id extraction (Closes #10615)

This commit is contained in:
Sergey M․ 2016-09-11 14:59:14 +07:00
parent bfcda07a27
commit 2cb93afcd8
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
1 changed file with 31 additions and 5 deletions

View File

@ -16,6 +16,7 @@ from ..utils import (
parse_iso8601, parse_iso8601,
qualities, qualities,
try_get, try_get,
js_to_json,
update_url_query, update_url_query,
) )
@ -367,6 +368,10 @@ class ViafreeIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
'add_ie': [TVPlayIE.ie_key()], 'add_ie': [TVPlayIE.ie_key()],
}, {
# Different og:image URL schema
'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2',
'only_matching': True,
}, { }, {
'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1',
'only_matching': True, 'only_matching': True,
@ -384,14 +389,35 @@ class ViafreeIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
data = self._parse_json(
self._search_regex(
r'(?s)window\.App\s*=\s*({.+?})\s*;\s*</script',
webpage, 'data', default='{}'),
video_id, transform_source=lambda x: re.sub(
r'(?s)function\s+[a-zA-Z_][\da-zA-Z_]*\s*\([^)]*\)\s*{[^}]*}\s*',
'null', x), fatal=False)
video_id = None video_id = None
if data:
video_id = try_get(
data, lambda x: x['context']['dispatcher']['stores'][
'ContentPageProgramStore']['currentVideo']['id'],
compat_str)
# Fallback #1 (extract from og:image URL schema)
if not video_id:
thumbnail = self._og_search_thumbnail(webpage, default=None) thumbnail = self._og_search_thumbnail(webpage, default=None)
if thumbnail: if thumbnail:
video_id = self._search_regex( video_id = self._search_regex(
r'https?://[^/]+/imagecache/(?:[^/]+/)+seasons/\d+/(\d{6,})/', # Patterns seen:
# http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/inbox/765166/a2e95e5f1d735bab9f309fa345cc3f25.jpg
# http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/seasons/15204/758770/4a5ba509ca8bc043e1ebd1a76131cdf2.jpg
r'https?://[^/]+/imagecache/(?:[^/]+/)+(\d{6,})/',
thumbnail, 'video id', default=None) thumbnail, 'video id', default=None)
# Fallback #2. Extract from raw JSON string.
# May extract wrong video id if relatedClips is present.
if not video_id: if not video_id:
video_id = self._search_regex( video_id = self._search_regex(
r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})', r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})',