[r7] Fix extraction and add support for articles (Closes #9826)

2016-06-19 02:25:34 +07:00 · 2016-06-19 02:25:34 +07:00 · 7577d849a6
commit 7577d849a6
parent cb23192bc4
2 changed files with 64 additions and 36 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -631,7 +631,10 @@ from .qqmusic import (
    QQMusicToplistIE,
    QQMusicPlaylistIE,
 )
-from .r7 import R7IE
+from .r7 import (
+    R7IE,
+    R7ArticleIE,
+)
 from .radiocanada import (
    RadioCanadaIE,
    RadioCanadaAudioVideoIE,
--- a/youtube_dl/extractor/r7.py
+++ b/youtube_dl/extractor/r7.py
@ -2,15 +2,12 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..utils import (
-    js_to_json,
-    unescapeHTML,
-    int_or_none,
-)
+from ..utils import int_or_none


 class R7IE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://
+    _VALID_URL = r'''(?x)
+                        https?://
                        (?:
                            (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
                            noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
@ -25,6 +22,7 @@ class R7IE(InfoExtractor):
            'id': '54e7050b0cf2ff57e0279389',
            'ext': 'mp4',
            'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
+            'description': 'md5:01812008664be76a6479aa58ec865b72',
            'thumbnail': 're:^https?://.*\.jpg$',
            'duration': 98,
            'like_count': int,
@ -44,45 +42,72 @@ class R7IE(InfoExtractor):
    def _real_extract(self, url):
+        # New flow: read the video's metadata as JSON from the player-api
+        # endpoint and build the info dict directly from its fields.
        video_id = self._match_id(url)

-        webpage = self._download_webpage(
-            'http://player.r7.com/video/i/%s' % video_id, video_id)
+        video = self._download_json(
+            'http://player-api.r7.com/video/i/%s' % video_id, video_id)

-        item = self._parse_json(js_to_json(self._search_regex(
-            r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id)
-
-        title = unescapeHTML(item['title'])
-        thumbnail = item.get('init', {}).get('thumbUri')
-        duration = None
-
-        statistics = item.get('statistics', {})
-        like_count = int_or_none(statistics.get('likes'))
-        view_count = int_or_none(statistics.get('views'))
+        # 'title' is read with a plain subscript, unlike the .get() lookups
+        # used for every other field below.
+        title = video['title']

        formats = []
-        for format_key, format_dict in item['playlist'][0].items():
-            src = format_dict.get('src')
-            if not src:
-                continue
-            format_id = format_dict.get('format') or format_key
-            if duration is None:
-                duration = format_dict.get('duration')
-            if '.f4m' in src:
-                formats.extend(self._extract_f4m_formats(src, video_id, preference=-1))
-            elif src.endswith('.m3u8'):
-                formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2))
-            else:
-                formats.append({
-                    'url': src,
-                    'format_id': format_id,
-                })
+        # Both media URLs are optional; each is only processed when present.
+        # The HLS manifest is expanded via the m3u8 helper, called with
+        # fatal=False.
+        media_url_hls = video.get('media_url_hls')
+        if media_url_hls:
+            formats.extend(self._extract_m3u8_formats(
+                media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        media_url = video.get('media_url')
+        if media_url:
+            f = {
+                'url': media_url,
+                'format_id': 'http',
+            }
+            # The m3u8 format always matches the http format, so copy the
+            # metadata from one to the other.
+            # Candidates are the formats collected so far whose vcodec is not
+            # 'none' and whose resolution is not 'multiple'. The lambda's `f`
+            # is its own parameter, not the dict built above.
+            m3u8_formats = list(filter(
+                lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+                formats))
+            # Metadata is borrowed only when exactly one candidate remains.
+            # update(f) then lets the http 'url' and 'format_id' override the
+            # copied values, and 'protocol' is reset to 'http'.
+            if len(m3u8_formats) == 1:
+                f_copy = m3u8_formats[0].copy()
+                f_copy.update(f)
+                f_copy['protocol'] = 'http'
+                f = f_copy
+            formats.append(f)
        self._sort_formats(formats)

+        # Remaining fields come straight from the JSON object via .get();
+        # duration and the two counters additionally go through int_or_none.
+        description = video.get('description')
+        thumbnail = video.get('thumb')
+        duration = int_or_none(video.get('media_duration'))
+        like_count = int_or_none(video.get('likes'))
+        view_count = int_or_none(video.get('views'))
+
        return {
            'id': video_id,
            'title': title,
+            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'like_count': like_count,
            'view_count': view_count,
            'formats': formats,
        }
+
+
+class R7ArticleIE(InfoExtractor):
+    # Extractor for r7.com article pages: it finds the embedded player's
+    # video id in the page markup and returns a url_result for the player
+    # URL, tagged with R7IE's key.
+
+    # Matches any alphabetic subdomain of r7.com, one or more path segments,
+    # and a final slug ending in "-<digits>"; the digits are captured as "id".
+    _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        # Decline every URL that R7IE already accepts; otherwise defer to the
+        # inherited suitability check.
+        return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        # NOTE(review): _match_id presumably returns the "id" group of
+        # _VALID_URL (the trailing digits) - confirm in InfoExtractor.
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        # The video id is 24 characters drawn from digits and a-f, taken
+        # either from a <div> whose id starts with "player-" or from the id
+        # attribute of a <div> whose class is "embed".
+        video_id = self._search_regex(
+            r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})',
+            webpage, 'video id')
+
+        # NOTE(review): the player URL below is presumably accepted by
+        # R7IE._VALID_URL; that part of the pattern is not visible in this hunk.
+        return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key())