[9gag] Add extractor

2013-12-05 14:29:08 +01:00 · 2013-12-05 14:29:08 +01:00 · 7fc3fa0545
commit 7fc3fa0545
parent 29030c0a4c
3 changed files with 73 additions and 3 deletions
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -405,7 +405,8 @@ class YoutubeDL(object):
        for key, value in extra_info.items():
            info_dict.setdefault(key, value)
-    def extract_info(self, url, download=True, ie_key=None, extra_info={}):
+    def extract_info(self, url, download=True, ie_key=None, extra_info={},
                     process=True):
        '''
        Returns a list with a dictionary for each video we find.
        If 'download', also downloads the videos.
@ -441,7 +442,10 @@ class YoutubeDL(object):
                        'webpage_url': url,
                        'extractor_key': ie.ie_key(),
                    })
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
                    return ie_result
            except ExtractorError as de: # An error we somewhat expected
                self.report_error(compat_str(de), de.format_traceback())
                break
@ -474,8 +478,32 @@ class YoutubeDL(object):
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
-        elif result_type == 'playlist':
+        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)
            def make_result(embedded_info):
                """Combine the embedding page's result with the embedded media.

                Works on a copy of the enclosing ``ie_result``. For every
                media-specific key the embedding page's value is discarded,
                and the value from ``embedded_info`` is used when it has one.
                All other keys of ``ie_result`` are kept unchanged.
                """
                media_keys = (
                    '_type', 'url', 'ext', 'player_url', 'formats', 'entries',
                    'urlhandle', 'ie_key', 'duration', 'subtitles',
                    'annotations', 'format',
                )
                merged = ie_result.copy()
                for key in media_keys:
                    # Drop the embedding page's value even when the embedded
                    # result has nothing to put in its place.
                    merged.pop(key, None)
                    if key in embedded_info:
                        merged[key] = embedded_info[key]
                return merged
            new_result = make_result(info)
            assert new_result.get('_type') != 'url_transparent'
            if new_result.get('_type') == 'compat_list':
                new_result['entries'] = [
                    make_result(e) for e in new_result['entries']]
            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type == 'playlist':
            # We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen(u'[download] Downloading playlist: %s' % playlist)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -102,6 +102,7 @@ from .nbc import NBCNewsIE
 from .newgrounds import NewgroundsIE
 from .nhl import NHLIE, NHLVideocenterIE
 from .niconico import NiconicoIE
 from .ninegag import NineGagIE
 from .nowvideo import NowVideoIE
 from .ooyala import OoyalaIE
 from .orf import ORFIE
--- a/youtube_dl/extractor/ninegag.py
+++ b/youtube_dl/extractor/ninegag.py
@ -0,0 +1,41 @@
 import json
 import re
 from .common import InfoExtractor
 class NineGagIE(InfoExtractor):
    """Extractor for 9gag.tv video pages that embed a YouTube video.

    The page carries a JSON blob in the ``data-video-meta`` attribute of the
    ``tv-video`` element. The extractor returns a ``url_transparent`` result
    that points at the YouTube video while keeping the 9gag id, title and
    other page metadata.
    """
    IE_NAME = '9gag'
    _VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)'
    _TEST = {
        u"url": u"http://9gag.tv/v/1912",
        u"file": u"1912.mp4",
        u"info_dict": {
            u"description": u"This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
            u"title": u"\"People Are Awesome 2013\" Is Absolutely Awesome"
        },
        u'add_ie': [u'Youtube']
    }
    def _real_extract(self, url):
        """Build a ``url_transparent`` result for the 9gag.tv page at *url*.

        ``youtubeVideoId`` and ``title`` are required in the page metadata and
        raise ``KeyError`` when absent. ``description``, ``view_count`` and
        ``thumbnail_url`` are optional and yield ``None`` when missing or,
        for the view count, when not convertible to an integer.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        # Only matches pages whose player is marked data-video-source="youtube".
        data_json = self._html_search_regex(r'''(?x)
            <div\s*id="tv-video"\s*data-video-source="youtube"\s*
                data-video-meta="([^"]+)"''', webpage, u'video metadata')
        data = json.loads(data_json)

        # The view count is informational only; a missing or malformed value
        # must not abort an otherwise successful extraction.
        try:
            view_count = int(data.get('view_count'))
        except (TypeError, ValueError):
            view_count = None

        return {
            '_type': 'url_transparent',
            'url': data['youtubeVideoId'],
            'ie_key': 'Youtube',
            'id': video_id,
            'title': data['title'],
            'description': data.get('description'),
            'view_count': view_count,
            'thumbnail': data.get('thumbnail_url'),
        }