[instagram] Add support for user profiles (Fixes #2606)

2014-03-23 16:06:03 +01:00 · 2014-03-23 16:06:03 +01:00 · ea38e55fff
commit ea38e55fff
parent 257cfebfe6
6 changed files with 124 additions and 29 deletions
--- a/test/helper.py
+++ b/test/helper.py
@ -110,3 +110,21 @@ def expect_info_dict(self, expected_dict, got_dict):
            self.assertEqual(expected, got,
                u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
    # Check for the presence of mandatory fields
    for key in ('id', 'url', 'title', 'ext'):
        self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
    # Check for mandatory fields that are automatically set by YoutubeDL
    for key in ['webpage_url', 'extractor', 'extractor_key']:
        self.assertTrue(got_dict.get(key), u'Missing field: %s' % key)
    # Are checkable fields missing from the test case definition?
    test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
        for key, value in got_dict.items()
        if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
    missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
    if missing_keys:
        sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
        self.assertFalse(
            missing_keys,
            'Missing keys in test definition: %s' % (
                ', '.join(sorted(missing_keys))))
--- a/test/test_download.py
+++ b/test/test_download.py
@ -137,25 +137,6 @@ def generator(test_case):
                    info_dict = json.load(infof)
                expect_info_dict(self, tc.get('info_dict', {}), info_dict)
                # Check for the presence of mandatory fields
                for key in ('id', 'url', 'title', 'ext'):
                    self.assertTrue(key in info_dict.keys() and info_dict[key])
                # Check for mandatory fields that are automatically set by YoutubeDL
                for key in ['webpage_url', 'extractor', 'extractor_key']:
                    self.assertTrue(info_dict.get(key), u'Missing field: %s' % key)
                # Are checkable fields missing from the test case definition?
                test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
                    for key, value in info_dict.items()
                    if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
                missing_keys = set(test_info_dict.keys()) - set(tc.get('info_dict', {}).keys())
                if missing_keys:
                    sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
                    self.assertFalse(
                        missing_keys,
                        'Missing keys in test definition: %s' % (
                            ','.join(sorted(missing_keys))))
        finally:
            try_rm_tcs_files()
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@ -9,8 +9,10 @@ import sys
 import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import FakeYDL
+from test.helper import (
-
+    expect_info_dict,
    FakeYDL,
 )
 from youtube_dl.extractor import (
    AcademicEarthCourseIE,
@ -39,6 +41,7 @@ from youtube_dl.extractor import (
    TEDIE,
    ToypicsUserIE,
    XTubeUserIE,
    InstagramUserIE,
 )
@ -287,5 +290,28 @@ class TestPlaylists(unittest.TestCase):
        self.assertEqual(result['id'], 'greenshowers')
        self.assertTrue(len(result['entries']) >= 155)
    def test_InstagramUser(self):
        # Extract the 'porsche' profile as a playlist, then validate one
        # known video entry against the expected info dict.
        dl = FakeYDL()
        ie = InstagramUserIE(dl)
        result = ie.extract('http://instagram.com/porsche')
        self.assertIsPlaylist(result)
        self.assertEqual(result['id'], 'porsche')
        self.assertTrue(len(result['entries']) >= 2)

        # Look the reference video up with a default so that a missing entry
        # produces a readable assertion failure instead of a StopIteration.
        video_id = '614605558512799803_462752227'
        test_video = next(
            (e for e in result['entries'] if e['id'] == video_id),
            None)
        self.assertTrue(
            test_video is not None,
            'Video %s not found in playlist entries' % video_id)

        # Playlist entries are not passed through extract_info here, so the
        # extractor/webpage_url fields are filled in by hand before the
        # entry is processed and checked.
        dl.add_default_extra_info(test_video, ie, '(irrelevant URL)')
        dl.process_video_result(test_video, download=False)
        EXPECTED = {
            'id': '614605558512799803_462752227',
            'ext': 'mp4',
            'title': '#Porsche Intelligent Performance.',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^https?://.*\.jpg',
            'uploader': 'Porsche',
            'uploader_id': 'porsche',
        }
        expect_info_dict(self, EXPECTED, test_video)
 if __name__ == '__main__':
    unittest.main()
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -512,13 +512,7 @@ class YoutubeDL(object):
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
-                self.add_extra_info(ie_result,
+                self.add_default_extra_info(ie_result, ie, url)
                    {
                        'extractor': ie.IE_NAME,
                        'webpage_url': url,
                        'webpage_url_basename': url_basename(url),
                        'extractor_key': ie.ie_key(),
                    })
                if process:
                    return self.process_ie_result(ie_result, download, extra_info)
                else:
@ -537,6 +531,14 @@ class YoutubeDL(object):
        else:
            self.report_error('no suitable InfoExtractor for URL %s' % url)
    def add_default_extra_info(self, ie_result, ie, url):
        """
        Add the extractor name/key and the originating URL (plus its
        basename) to ie_result by delegating to add_extra_info.
        """
        merge_into_result = self.add_extra_info
        default_fields = {
            'extractor': ie.IE_NAME,
            'webpage_url': url,
            'webpage_url_basename': url_basename(url),
            'extractor_key': ie.ie_key(),
        }
        merge_into_result(ie_result, default_fields)
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -112,7 +112,7 @@ from .imdb import (
 )
 from .ina import InaIE
 from .infoq import InfoQIE
-from .instagram import InstagramIE
+from .instagram import InstagramIE, InstagramUserIE
 from .internetvideoarchive import InternetVideoArchiveIE
 from .iprima import IPrimaIE
 from .ivi import (
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@ -3,6 +3,9 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    int_or_none,
 )
 class InstagramIE(InfoExtractor):
@ -37,3 +40,68 @@ class InstagramIE(InfoExtractor):
            'uploader_id': uploader_id,
            'description': desc,
        }
 class InstagramUserIE(InfoExtractor):
    """
    Extractor for Instagram user profiles.

    Walks the paginated ``/<username>/media`` JSON feed of a profile and
    returns every item of type 'video' as an entry of a playlist whose id
    and title are the username.
    """
    # Accept both http and https, with or without the "www." prefix.
    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
    IE_DESC = 'Instagram user profile'
    IE_NAME = 'instagram:user'

    def _real_extract(self, url):
        """
        Build a playlist result from all videos of the profile at url.

        Pages are requested until one comes back with no items; each
        following page is addressed with the id of the last item seen
        (the max_id query parameter).
        """
        mobj = re.match(self._VALID_URL, url)
        uploader_id = mobj.group('username')

        entries = []
        page_count = 0
        media_url = 'http://instagram.com/%s/media' % uploader_id
        while True:
            page = self._download_json(
                media_url, uploader_id,
                note='Downloading page %d ' % (page_count + 1),
            )
            page_count += 1

            items = page['items']
            if not items:
                # An empty page marks the end of the feed.
                break

            for it in items:
                if it.get('type') != 'video':
                    continue

                # The feed may carry explicit nulls (e.g. a post without a
                # caption), in which case dict.get(key, {}) returns None
                # rather than the default.  "or {}" covers both the missing
                # and the null case.
                likes = it.get('likes') or {}
                user = it.get('user') or {}
                images = it.get('images') or {}
                caption = it.get('caption') or {}

                formats = [{
                    'format_id': k,
                    'height': v.get('height'),
                    'width': v.get('width'),
                    'url': v['url'],
                } for k, v in it['videos'].items()]
                self._sort_formats(formats)

                # Fall back to the id when the caption is missing or empty,
                # so the entry never ends up with an empty title.
                title = caption.get('text') or it['id']

                entries.append({
                    'id': it['id'],
                    'title': title,
                    'formats': formats,
                    'thumbnail': (images.get('thumbnail') or {}).get('url'),
                    'webpage_url': it.get('link'),
                    'uploader': user.get('full_name'),
                    'uploader_id': user.get('username'),
                    'like_count': int_or_none(likes.get('count')),
                    # NOTE(review): the standard info-dict key appears to be
                    # 'timestamp'; 'upload_timestamp' is kept unchanged so
                    # existing test expectations stay valid -- confirm and
                    # rename together with the tests.
                    'upload_timestamp': int_or_none(it.get('created_time')),
                })

            max_id = items[-1]['id']
            media_url = (
                'http://instagram.com/%s/media?max_id=%s' % (
                    uploader_id, max_id))

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': uploader_id,
            'title': uploader_id,
        }