[teamcoco] Fix extraction for full episodes(closes #16573)

2018-05-30 13:21:07 +01:00 · 2018-05-30 13:21:07 +01:00 · e0d42dd4b2
commit e0d42dd4b2
parent a07879d6b2
3 changed files with 122 additions and 88 deletions
--- a/youtube_dl/extractor/tbs.py
+++ b/youtube_dl/extractor/tbs.py
@ -4,6 +4,10 @@ from __future__ import unicode_literals
 import re

 from .turner import TurnerBaseIE
+from ..compat import (
+    compat_urllib_parse_urlparse,
+    compat_parse_qs,
+)
 from ..utils import (
    float_or_none,
    int_or_none,
@ -38,48 +42,22 @@ class TBSIE(TurnerBaseIE):
    def _real_extract(self, url):
        site, display_id = re.match(self._VALID_URL, url).groups()
        webpage = self._download_webpage(url, display_id)
-        video_data = self._parse_json(self._search_regex(
+        drupal_settings = self._parse_json(self._search_regex(
            r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>',
-            webpage, 'drupal setting'), display_id)['turner_playlist'][0]
+            webpage, 'drupal setting'), display_id)
+        video_data = drupal_settings['turner_playlist'][0]

        media_id = video_data['mediaID']
        title = video_data['title']
+        tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse(
+            drupal_settings['ngtv_token_url']).query)

-        streams_data = self._download_json(
-            'http://medium.ngtv.io/media/%s/tv' % media_id,
-            media_id)['media']['tv']
-        duration = None
-        chapters = []
-        formats = []
-        for supported_type in ('unprotected', 'bulkaes'):
-            stream_data = streams_data.get(supported_type, {})
-            m3u8_url = stream_data.get('secureUrl') or stream_data.get('url')
-            if not m3u8_url:
-                continue
-            if stream_data.get('playlistProtection') == 'spe':
-                m3u8_url = self._add_akamai_spe_token(
-                    'http://token.vgtf.net/token/token_spe',
-                    m3u8_url, media_id, {
-                        'url': url,
-                        'site_name': site[:3].upper(),
-                        'auth_required': video_data.get('authRequired') == '1',
-                    })
-            formats.extend(self._extract_m3u8_formats(
-                m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
-
-            duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration'))
-
-            if not chapters:
-                for chapter in stream_data.get('contentSegments', []):
-                    start_time = float_or_none(chapter.get('start'))
-                    duration = float_or_none(chapter.get('duration'))
-                    if start_time is None or duration is None:
-                        continue
-                    chapters.append({
-                        'start_time': start_time,
-                        'end_time': start_time + duration,
-                    })
-        self._sort_formats(formats)
+        info = self._extract_ngtv_info(
+            media_id, tokenizer_query, {
+                'url': url,
+                'site_name': site[:3].upper(),
+                'auth_required': video_data.get('authRequired') == '1',
+            })

        thumbnails = []
        for image_id, image in video_data.get('images', {}).items():
@ -98,15 +76,14 @@ class TBSIE(TurnerBaseIE):
                })
            thumbnails.append(i)

-        return {
+        info.update({
            'id': media_id,
            'title': title,
            'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')),
-            'duration': duration,
+            'duration': float_or_none(video_data.get('duration')) or info.get('duration'),
            'timestamp': int_or_none(video_data.get('created')),
            'season_number': int_or_none(video_data.get('season')),
            'episode_number': int_or_none(video_data.get('episode')),
-            'cahpters': chapters,
            'thumbnails': thumbnails,
-            'formats': formats,
-        }
+        })
+        return info
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@ -3,7 +3,7 @@ from __future__ import unicode_literals

 import json

-from .common import InfoExtractor
+from .turner import TurnerBaseIE
 from ..utils import (
    determine_ext,
    ExtractorError,
@ -15,7 +15,7 @@ from ..utils import (
 )


-class TeamcocoIE(InfoExtractor):
+class TeamcocoIE(TurnerBaseIE):
    _VALID_URL = r'https?://teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)'
    _TESTS = [
        {
@ -110,6 +110,8 @@ class TeamcocoIE(InfoExtractor):
          name
        }
        duration
+        turnerMediaId
+        turnerMediaAuthToken
      }
    }
    ... on NotFoundSlug {
@ -123,53 +125,65 @@ class TeamcocoIE(InfoExtractor):
        record = response['record']
        video_id = record['id']

-        video_sources = self._graphql_call('''{
-  %s(id: "%s") {
-    src
-  }
-}''', 'RecordVideoSource', video_id) or {}
-
-        formats = []
-        get_quality = qualities(['low', 'sd', 'hd', 'uhd'])
-        for format_id, src in video_sources.get('src', {}).items():
-            if not isinstance(src, dict):
-                continue
-            src_url = src.get('src')
-            if not src_url:
-                continue
-            ext = determine_ext(src_url, mimetype2ext(src.get('type')))
-            if format_id == 'hls' or ext == 'm3u8':
-                # compat_urllib_parse.urljoin does not work here
-                if src_url.startswith('/'):
-                    src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url
-                formats.extend(self._extract_m3u8_formats(
-                    src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
-            else:
-                if src_url.startswith('/mp4:protected/'):
-                    # TODO Correct extraction for these files
-                    continue
-                tbr = int_or_none(self._search_regex(
-                    r'(\d+)k\.mp4', src_url, 'tbr', default=None))
-
-                formats.append({
-                    'url': src_url,
-                    'ext': ext,
-                    'tbr': tbr,
-                    'format_id': format_id,
-                    'quality': get_quality(format_id),
-                })
-        if not formats:
-            formats = self._extract_m3u8_formats(
-                record['file']['url'], video_id, 'mp4', fatal=False)
-        self._sort_formats(formats)
-
-        return {
+        info = {
            'id': video_id,
            'display_id': display_id,
-            'formats': formats,
            'title': record['title'],
            'thumbnail': record.get('thumb', {}).get('preview'),
            'description': record.get('teaser'),
            'duration': parse_duration(record.get('duration')),
            'timestamp': parse_iso8601(record.get('publishOn')),
        }
+
+        media_id = record.get('turnerMediaId')
+        if media_id:
+            self._initialize_geo_bypass({
+                'countries': ['US'],
+            })
+            info.update(self._extract_ngtv_info(media_id, {
+                'accessToken': record['turnerMediaAuthToken'],
+                'accessTokenType': 'jws',
+            }))
+        else:
+            video_sources = self._graphql_call('''{
+  %s(id: "%s") {
+    src
+  }
+}''', 'RecordVideoSource', video_id) or {}
+
+            formats = []
+            get_quality = qualities(['low', 'sd', 'hd', 'uhd'])
+            for format_id, src in video_sources.get('src', {}).items():
+                if not isinstance(src, dict):
+                    continue
+                src_url = src.get('src')
+                if not src_url:
+                    continue
+                ext = determine_ext(src_url, mimetype2ext(src.get('type')))
+                if format_id == 'hls' or ext == 'm3u8':
+                    # compat_urllib_parse.urljoin does not work here
+                    if src_url.startswith('/'):
+                        src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url
+                    formats.extend(self._extract_m3u8_formats(
+                        src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
+                else:
+                    if src_url.startswith('/mp4:protected/'):
+                        # TODO Correct extraction for these files
+                        continue
+                    tbr = int_or_none(self._search_regex(
+                        r'(\d+)k\.mp4', src_url, 'tbr', default=None))
+
+                    formats.append({
+                        'url': src_url,
+                        'ext': ext,
+                        'tbr': tbr,
+                        'format_id': format_id,
+                        'quality': get_quality(format_id),
+                    })
+            if not formats:
+                formats = self._extract_m3u8_formats(
+                    record['file']['url'], video_id, 'mp4', fatal=False)
+            self._sort_formats(formats)
+            info['formats'] = formats
+
+        return info
--- a/youtube_dl/extractor/turner.py
+++ b/youtube_dl/extractor/turner.py
@ -9,6 +9,7 @@ from ..utils import (
    xpath_text,
    int_or_none,
    determine_ext,
+    float_or_none,
    parse_duration,
    xpath_attr,
    update_url_query,
@ -23,14 +24,17 @@ class TurnerBaseIE(AdobePassIE):
    def _extract_timestamp(self, video_data):
        return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts'))

-    def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data):
+    def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None):
        secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*'
        token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path)
        if not token:
            query = {
                'path': secure_path,
-                'videoId': content_id,
            }
+            if custom_tokenizer_query:
+                query.update(custom_tokenizer_query)
+            else:
+                query['videoId'] = content_id
            if ap_data.get('auth_required'):
                query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name'])
            auth = self._download_xml(
@ -188,3 +192,42 @@ class TurnerBaseIE(AdobePassIE):
            'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
            'is_live': is_live,
        }
+
+    def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None):
+        streams_data = self._download_json(
+            'http://medium.ngtv.io/media/%s/tv' % media_id,
+            media_id)['media']['tv']
+        duration = None
+        chapters = []
+        formats = []
+        for supported_type in ('unprotected', 'bulkaes'):
+            stream_data = streams_data.get(supported_type, {})
+            m3u8_url = stream_data.get('secureUrl') or stream_data.get('url')
+            if not m3u8_url:
+                continue
+            if stream_data.get('playlistProtection') == 'spe':
+                m3u8_url = self._add_akamai_spe_token(
+                    'http://token.ngtv.io/token/token_spe',
+                    m3u8_url, media_id, ap_data or {}, tokenizer_query)
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
+
+            duration = float_or_none(stream_data.get('totalRuntime'))
+
+            if not chapters:
+                for chapter in stream_data.get('contentSegments', []):
+                    start_time = float_or_none(chapter.get('start'))
+                    chapter_duration = float_or_none(chapter.get('duration'))
+                    if start_time is None or chapter_duration is None:
+                        continue
+                    chapters.append({
+                        'start_time': start_time,
+                        'end_time': start_time + chapter_duration,
+                    })
+        self._sort_formats(formats)
+
+        return {
+            'formats': formats,
+            'chapters': chapters,
+            'duration': duration,
+        }