[toypics] Separate user and video extraction (#2601)

This commit is contained in:
Philipp Hagemeister 2014-03-22 15:15:01 +01:00
parent 55442a7812
commit 231f76b530
3 changed files with 59 additions and 29 deletions

View file

@ -37,6 +37,7 @@ from youtube_dl.extractor import (
GoogleSearchIE, GoogleSearchIE,
GenericIE, GenericIE,
TEDIE, TEDIE,
ToypicsUserIE,
) )
@ -269,5 +270,13 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], 'Who are the hackers?') self.assertEqual(result['title'], 'Who are the hackers?')
self.assertTrue(len(result['entries']) >= 6) self.assertTrue(len(result['entries']) >= 6)
def test_toypics_user(self):
    """Check that a Toypics user profile URL is extracted as a playlist.

    Runs ToypicsUserIE against the 'Mikey' profile and verifies the
    playlist id and that at least 17 entries are returned.
    """
    extractor = ToypicsUserIE(FakeYDL())
    playlist = extractor.extract('http://videos.toypics.net/Mikey')

    self.assertIsPlaylist(playlist)
    self.assertEqual(playlist['id'], 'Mikey')
    self.assertTrue(len(playlist['entries']) >= 17)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View file

@ -239,7 +239,7 @@ from .theplatform import ThePlatformIE
from .thisav import ThisAVIE from .thisav import ThisAVIE
from .tinypic import TinyPicIE from .tinypic import TinyPicIE
from .toutv import TouTvIE from .toutv import TouTvIE
from .toypics import ToypicsIE from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE from .trilulilu import TriluliluIE
from .trutube import TruTubeIE from .trutube import TruTubeIE

View file

@ -2,43 +2,26 @@ from .common import InfoExtractor
from math import ceil from math import ceil
import re import re
class ToypicsIE(InfoExtractor): class ToypicsIE(InfoExtractor):
_VALID_URL = r'(?:http://)?videos\.toypics\.net/.*' IE_DESC = 'Toypics user profile'
_VALID_URL = r'http://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'
_TEST = { _TEST = {
'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/', 'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',
#'md5': '8a8b546956bbd0e769dbe28f6e80abb3', == $head -c10K 12929646011616163504.mp4 |md5sum //no idea why it fails 'md5': '16e806ad6d6f58079d210fe30985e08b',
'info_dict': { 'info_dict': {
'id': '514', 'id': '514',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Chance-Bulge\'d, 2', 'title': 'Chance-Bulge\'d, 2',
'age_limit': 18 'age_limit': 18,
'uploader': 'kidsune',
} }
} }
PAGINATED=8
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(r'(http://)?videos\.toypics\.net/(?P<username>[^/?]+)$', url) mobj = re.match(self._VALID_URL, url)
if not mobj: video_id = mobj.group('id')
return self.extract_one(url) page = self._download_webpage(url, video_id)
return [self.extract_one(u) for u in self.process_paginated(url,
r'public/">Public Videos \((?P<videos_count>[0-9]+)\)</a></li>',
r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">'
)]
def process_paginated(self, profile_url, re_total, re_video_page):
profile_page = self._download_webpage(profile_url, 'profile' , 'getting profile page: '+profile_url)
videos_count = self._html_search_regex(re_total, profile_page, 'videos count')
lst = []
for n in xrange(1,int(ceil(float(videos_count)/self.PAGINATED)) +1):
lpage_url = profile_url +'/public/%d'%n
lpage = self._download_webpage(lpage_url, 'page %d'%n)
lst.extend(re.findall(re_video_page, lpage))
return lst
def extract_one(self,url):
mobj = re.match(r'(http://)?videos\.toypics\.net/view/(?P<videoid>[0-9]+)/.*', url)
video_id = mobj.group('videoid')
page = self._download_webpage(url, video_id, 'getting page: '+url)
video_url = self._html_search_regex( video_url = self._html_search_regex(
r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL') r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL')
title = self._html_search_regex( title = self._html_search_regex(
@ -48,8 +31,46 @@ class ToypicsIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'url': video_url, 'url': video_url,
'ext': video_url[-3:],
'title': title, 'title': title,
'uploader': username, 'uploader': username,
'age_limit': 18 'age_limit': 18,
}
class ToypicsUserIE(InfoExtractor):
    """Extract the public videos of a Toypics user profile as a playlist."""

    IE_DESC = 'Toypics user profile'
    _VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])'
    # Number of videos assumed per "/public/<n>" listing page; used only to
    # work out how many listing pages have to be fetched.
    _PAGE_SIZE = 8

    def _real_extract(self, url):
        """Return a playlist result for the user profile at *url*.

        The profile page is downloaded to read the advertised number of
        public videos, then every "/public/<n>" listing page is downloaded
        and scanned for links to individual video pages.

        Returns a dict with '_type' 'playlist', 'id' set to the username and
        'entries' holding one 'url'-type entry (ie_key 'Toypics') per video
        link found.
        """
        mobj = re.match(self._VALID_URL, url)
        username = mobj.group('username')

        profile_page = self._download_webpage(
            url, username, note='Retrieving profile page')

        video_count = int(self._search_regex(
            r'public/">Public Videos \(([0-9]+)\)</a></li>', profile_page,
            'video count'))

        # Ceiling division: the previous "+ PAGE_SIZE + 1" over-counted by one
        # page whenever video_count was a multiple of the page size or one
        # below it (e.g. 8 videos -> 2 pages, 0 videos -> 1 page).
        page_count = (video_count + self._PAGE_SIZE - 1) // self._PAGE_SIZE

        # Build listing URLs from the matched username rather than appending
        # to the raw URL, so a trailing query string accepted by _VALID_URL
        # does not end up in front of the "/public/<n>" path.
        profile_url = 'http://videos.toypics.net/%s' % username

        video_urls = []
        for n in range(1, page_count + 1):
            lpage = self._download_webpage(
                '%s/public/%d' % (profile_url, n), username,
                note='Downloading page %d/%d' % (n, page_count))
            video_urls.extend(re.findall(
                r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">',
                lpage))

        return {
            '_type': 'playlist',
            'id': username,
            # Distinct loop name: avoids shadowing (and, on Python 2,
            # overwriting) the ``url`` parameter.
            'entries': [{
                '_type': 'url',
                'url': video_url,
                'ie_key': 'Toypics',
            } for video_url in video_urls]
        }