[r7] Fix extraction and add support for articles (Closes #9826)
This commit is contained in:
		
							parent
							
								
									cb23192bc4
								
							
						
					
					
						commit
						7577d849a6
					
				
					 2 changed files with 64 additions and 36 deletions
				
			
		| 
						 | 
				
			
			@ -631,7 +631,10 @@ from .qqmusic import (
 | 
			
		|||
    QQMusicToplistIE,
 | 
			
		||||
    QQMusicPlaylistIE,
 | 
			
		||||
)
 | 
			
		||||
from .r7 import R7IE
 | 
			
		||||
from .r7 import (
 | 
			
		||||
    R7IE,
 | 
			
		||||
    R7ArticleIE,
 | 
			
		||||
)
 | 
			
		||||
from .radiocanada import (
 | 
			
		||||
    RadioCanadaIE,
 | 
			
		||||
    RadioCanadaAudioVideoIE,
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -2,22 +2,19 @@
 | 
			
		|||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
from .common import InfoExtractor
 | 
			
		||||
from ..utils import (
 | 
			
		||||
    js_to_json,
 | 
			
		||||
    unescapeHTML,
 | 
			
		||||
    int_or_none,
 | 
			
		||||
)
 | 
			
		||||
from ..utils import int_or_none
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class R7IE(InfoExtractor):
 | 
			
		||||
    _VALID_URL = r'''(?x)https?://
 | 
			
		||||
    _VALID_URL = r'''(?x)
 | 
			
		||||
                        https?://
 | 
			
		||||
                        (?:
 | 
			
		||||
                            (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
 | 
			
		||||
                            noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
 | 
			
		||||
                            player\.r7\.com/video/i/
 | 
			
		||||
                        )
 | 
			
		||||
                        (?P<id>[\da-f]{24})
 | 
			
		||||
                        '''
 | 
			
		||||
                    '''
 | 
			
		||||
    _TESTS = [{
 | 
			
		||||
        'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html',
 | 
			
		||||
        'md5': '403c4e393617e8e8ddc748978ee8efde',
 | 
			
		||||
| 
						 | 
				
			
			@ -25,6 +22,7 @@ class R7IE(InfoExtractor):
 | 
			
		|||
            'id': '54e7050b0cf2ff57e0279389',
 | 
			
		||||
            'ext': 'mp4',
 | 
			
		||||
            'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
 | 
			
		||||
            'description': 'md5:01812008664be76a6479aa58ec865b72',
 | 
			
		||||
            'thumbnail': 're:^https?://.*\.jpg$',
 | 
			
		||||
            'duration': 98,
 | 
			
		||||
            'like_count': int,
 | 
			
		||||
| 
						 | 
				
			
			@ -44,45 +42,72 @@ class R7IE(InfoExtractor):
 | 
			
		|||
    def _real_extract(self, url):
 | 
			
		||||
        video_id = self._match_id(url)
 | 
			
		||||
 | 
			
		||||
        webpage = self._download_webpage(
 | 
			
		||||
            'http://player.r7.com/video/i/%s' % video_id, video_id)
 | 
			
		||||
        video = self._download_json(
 | 
			
		||||
            'http://player-api.r7.com/video/i/%s' % video_id, video_id)
 | 
			
		||||
 | 
			
		||||
        item = self._parse_json(js_to_json(self._search_regex(
 | 
			
		||||
            r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id)
 | 
			
		||||
 | 
			
		||||
        title = unescapeHTML(item['title'])
 | 
			
		||||
        thumbnail = item.get('init', {}).get('thumbUri')
 | 
			
		||||
        duration = None
 | 
			
		||||
 | 
			
		||||
        statistics = item.get('statistics', {})
 | 
			
		||||
        like_count = int_or_none(statistics.get('likes'))
 | 
			
		||||
        view_count = int_or_none(statistics.get('views'))
 | 
			
		||||
        title = video['title']
 | 
			
		||||
 | 
			
		||||
        formats = []
 | 
			
		||||
        for format_key, format_dict in item['playlist'][0].items():
 | 
			
		||||
            src = format_dict.get('src')
 | 
			
		||||
            if not src:
 | 
			
		||||
                continue
 | 
			
		||||
            format_id = format_dict.get('format') or format_key
 | 
			
		||||
            if duration is None:
 | 
			
		||||
                duration = format_dict.get('duration')
 | 
			
		||||
            if '.f4m' in src:
 | 
			
		||||
                formats.extend(self._extract_f4m_formats(src, video_id, preference=-1))
 | 
			
		||||
            elif src.endswith('.m3u8'):
 | 
			
		||||
                formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2))
 | 
			
		||||
            else:
 | 
			
		||||
                formats.append({
 | 
			
		||||
                    'url': src,
 | 
			
		||||
                    'format_id': format_id,
 | 
			
		||||
                })
 | 
			
		||||
        media_url_hls = video.get('media_url_hls')
 | 
			
		||||
        if media_url_hls:
 | 
			
		||||
            formats.extend(self._extract_m3u8_formats(
 | 
			
		||||
                media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native',
 | 
			
		||||
                m3u8_id='hls', fatal=False))
 | 
			
		||||
        media_url = video.get('media_url')
 | 
			
		||||
        if media_url:
 | 
			
		||||
            f = {
 | 
			
		||||
                'url': media_url,
 | 
			
		||||
                'format_id': 'http',
 | 
			
		||||
            }
 | 
			
		||||
            # m3u8 format always matches the http format, let's copy metadata from
 | 
			
		||||
            # one to another
 | 
			
		||||
            m3u8_formats = list(filter(
 | 
			
		||||
                lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
 | 
			
		||||
                formats))
 | 
			
		||||
            if len(m3u8_formats) == 1:
 | 
			
		||||
                f_copy = m3u8_formats[0].copy()
 | 
			
		||||
                f_copy.update(f)
 | 
			
		||||
                f_copy['protocol'] = 'http'
 | 
			
		||||
                f = f_copy
 | 
			
		||||
            formats.append(f)
 | 
			
		||||
        self._sort_formats(formats)
 | 
			
		||||
 | 
			
		||||
        description = video.get('description')
 | 
			
		||||
        thumbnail = video.get('thumb')
 | 
			
		||||
        duration = int_or_none(video.get('media_duration'))
 | 
			
		||||
        like_count = int_or_none(video.get('likes'))
 | 
			
		||||
        view_count = int_or_none(video.get('views'))
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            'id': video_id,
 | 
			
		||||
            'title': title,
 | 
			
		||||
            'description': description,
 | 
			
		||||
            'thumbnail': thumbnail,
 | 
			
		||||
            'duration': duration,
 | 
			
		||||
            'like_count': like_count,
 | 
			
		||||
            'view_count': view_count,
 | 
			
		||||
            'formats': formats,
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class R7ArticleIE(InfoExtractor):
    """Extractor for r7.com article pages that embed an R7 player.

    The article page is downloaded, the 24-hex-digit id of the embedded
    player is located in its markup, and the actual extraction is delegated
    to R7IE through the player URL.
    """

    _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)'
    _TEST = {
        'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015',
        'only_matching': True,
    }

    @classmethod
    def suitable(cls, url):
        """Return whether this extractor should handle ``url``.

        URLs that R7IE accepts are always rejected here so that the
        dedicated video extractor takes precedence over the article one.
        """
        if R7IE.suitable(url):
            return False
        return super(R7ArticleIE, cls).suitable(url)

    def _real_extract(self, url):
        """Find the embedded player id on the article page and defer to R7IE."""
        article_id = self._match_id(url)

        page = self._download_webpage(url, article_id)

        # The id appears either as id="player-<id>" or as the id attribute
        # of a <div class="embed" ...> element.
        embedded_id = self._search_regex(
            r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})',
            page, 'video id')

        player_url = 'http://player.r7.com/video/i/%s' % embedded_id
        return self.url_result(player_url, R7IE.ie_key())
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue