Add infrastructure for paged lists
This commit allows playlist pages to be downloaded as needed instead of all at once. Before this commit, youtube-dl http://www.youtube.com/user/ANNnewsCH/videos --playlist-end 2 --skip-download took quite some time; now it is almost instantaneous. As an example, the youtube:user extractor has been converted. Fixes #2175
This commit is contained in:
parent
c91778f8c0
commit
b7ab059084
4 changed files with 92 additions and 25 deletions
|
@ -18,6 +18,7 @@ from youtube_dl.utils import (
|
||||||
find_xpath_attr,
|
find_xpath_attr,
|
||||||
get_meta_content,
|
get_meta_content,
|
||||||
orderedSet,
|
orderedSet,
|
||||||
|
PagedList,
|
||||||
parse_duration,
|
parse_duration,
|
||||||
sanitize_filename,
|
sanitize_filename,
|
||||||
shell_quote,
|
shell_quote,
|
||||||
|
@ -200,5 +201,26 @@ class TestUtil(unittest.TestCase):
|
||||||
self.assertEqual(parse_duration('9:12:43'), 33163)
|
self.assertEqual(parse_duration('9:12:43'), 33163)
|
||||||
self.assertEqual(parse_duration('x:y'), None)
|
self.assertEqual(parse_duration('x:y'), None)
|
||||||
|
|
||||||
|
def test_paged_list(self):
    """Compare PagedList.getslice results with the expected id lists."""

    def check_slice(size, pagesize, sliceargs, expected):
        # Fake data source: ids 0..size-1 served in pages of `pagesize`.
        def fetch_page(pagenum):
            page_start = pagenum * pagesize
            page_stop = min(size, page_start + pagesize)
            for item in range(page_start, page_stop):
                yield item

        paged = PagedList(fetch_page, pagesize)
        self.assertEqual(paged.getslice(*sliceargs), expected)

    # (slice arguments, expected result) for 5 items in pages of 2
    cases = (
        ((), [0, 1, 2, 3, 4]),
        ((1,), [1, 2, 3, 4]),
        ((2,), [2, 3, 4]),
        ((4,), [4]),
        ((0, 3), [0, 1, 2]),
        ((1, 4), [1, 2, 3]),
        ((2, 99), [2, 3, 4]),
        ((20, 99), []),
    )
    for sliceargs, expected in cases:
        check_slice(5, 2, sliceargs, expected)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
@ -39,6 +39,7 @@ from .utils import (
|
||||||
locked_file,
|
locked_file,
|
||||||
make_HTTPS_handler,
|
make_HTTPS_handler,
|
||||||
MaxDownloadsReached,
|
MaxDownloadsReached,
|
||||||
|
PagedList,
|
||||||
PostProcessingError,
|
PostProcessingError,
|
||||||
platform_name,
|
platform_name,
|
||||||
preferredencoding,
|
preferredencoding,
|
||||||
|
@ -575,19 +576,27 @@ class YoutubeDL(object):
|
||||||
|
|
||||||
playlist_results = []
|
playlist_results = []
|
||||||
|
|
||||||
n_all_entries = len(ie_result['entries'])
|
|
||||||
playliststart = self.params.get('playliststart', 1) - 1
|
playliststart = self.params.get('playliststart', 1) - 1
|
||||||
playlistend = self.params.get('playlistend', None)
|
playlistend = self.params.get('playlistend', None)
|
||||||
# For backwards compatibility, interpret -1 as whole list
|
# For backwards compatibility, interpret -1 as whole list
|
||||||
if playlistend == -1:
|
if playlistend == -1:
|
||||||
playlistend = None
|
playlistend = None
|
||||||
|
|
||||||
entries = ie_result['entries'][playliststart:playlistend]
|
if isinstance(ie_result['entries'], list):
|
||||||
n_entries = len(entries)
|
n_all_entries = len(ie_result['entries'])
|
||||||
|
entries = ie_result['entries'][playliststart:playlistend]
|
||||||
self.to_screen(
|
n_entries = len(entries)
|
||||||
"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
|
self.to_screen(
|
||||||
(ie_result['extractor'], playlist, n_all_entries, n_entries))
|
"[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
|
||||||
|
(ie_result['extractor'], playlist, n_all_entries, n_entries))
|
||||||
|
else:
|
||||||
|
assert isinstance(ie_result['entries'], PagedList)
|
||||||
|
entries = ie_result['entries'].getslice(
|
||||||
|
playliststart, playlistend)
|
||||||
|
n_entries = len(entries)
|
||||||
|
self.to_screen(
|
||||||
|
"[%s] playlist %s: Downloading %d videos" %
|
||||||
|
(ie_result['extractor'], playlist, n_entries))
|
||||||
|
|
||||||
for i, entry in enumerate(entries, 1):
|
for i, entry in enumerate(entries, 1):
|
||||||
self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
|
self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
|
||||||
|
|
|
@ -27,6 +27,7 @@ from ..utils import (
|
||||||
get_element_by_id,
|
get_element_by_id,
|
||||||
get_element_by_attribute,
|
get_element_by_attribute,
|
||||||
ExtractorError,
|
ExtractorError,
|
||||||
|
PagedList,
|
||||||
RegexNotFoundError,
|
RegexNotFoundError,
|
||||||
unescapeHTML,
|
unescapeHTML,
|
||||||
unified_strdate,
|
unified_strdate,
|
||||||
|
@ -1580,44 +1581,35 @@ class YoutubeUserIE(InfoExtractor):
|
||||||
# page by page until there are no video ids - it means we got
|
# page by page until there are no video ids - it means we got
|
||||||
# all of them.
|
# all of them.
|
||||||
|
|
||||||
url_results = []
|
def download_page(pagenum):
|
||||||
|
|
||||||
for pagenum in itertools.count(0):
|
|
||||||
start_index = pagenum * self._GDATA_PAGE_SIZE + 1
|
start_index = pagenum * self._GDATA_PAGE_SIZE + 1
|
||||||
|
|
||||||
gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
|
gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
|
||||||
page = self._download_webpage(gdata_url, username,
|
page = self._download_webpage(
|
||||||
u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
|
gdata_url, username,
|
||||||
|
u'Downloading video ids from %d to %d' % (
|
||||||
|
start_index, start_index + self._GDATA_PAGE_SIZE))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = json.loads(page)
|
response = json.loads(page)
|
||||||
except ValueError as err:
|
except ValueError as err:
|
||||||
raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
|
raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
|
||||||
if 'entry' not in response['feed']:
|
if 'entry' not in response['feed']:
|
||||||
# Number of videos is a multiple of self._MAX_RESULTS
|
return
|
||||||
break
|
|
||||||
|
|
||||||
# Extract video identifiers
|
# Extract video identifiers
|
||||||
entries = response['feed']['entry']
|
entries = response['feed']['entry']
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
title = entry['title']['$t']
|
title = entry['title']['$t']
|
||||||
video_id = entry['id']['$t'].split('/')[-1]
|
video_id = entry['id']['$t'].split('/')[-1]
|
||||||
url_results.append({
|
yield {
|
||||||
'_type': 'url',
|
'_type': 'url',
|
||||||
'url': video_id,
|
'url': video_id,
|
||||||
'ie_key': 'Youtube',
|
'ie_key': 'Youtube',
|
||||||
'id': 'video_id',
|
'id': 'video_id',
|
||||||
'title': title,
|
'title': title,
|
||||||
})
|
}
|
||||||
|
url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
|
||||||
# A little optimization - if current page is not
|
|
||||||
# "full", ie. does not contain PAGE_SIZE video ids then
|
|
||||||
# we can assume that this page is the last one - there
|
|
||||||
# are no more ids on further pages - no need to query
|
|
||||||
# again.
|
|
||||||
|
|
||||||
if len(entries) < self._GDATA_PAGE_SIZE:
|
|
||||||
break
|
|
||||||
|
|
||||||
return self.playlist_result(url_results, playlist_title=username)
|
return self.playlist_result(url_results, playlist_title=username)
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,7 @@ import datetime
|
||||||
import email.utils
|
import email.utils
|
||||||
import errno
|
import errno
|
||||||
import gzip
|
import gzip
|
||||||
|
import itertools
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import locale
|
import locale
|
||||||
|
@ -1161,3 +1162,46 @@ def check_executable(exe, args=[]):
|
||||||
except OSError:
|
except OSError:
|
||||||
return False
|
return False
|
||||||
return exe
|
return exe
|
||||||
|
|
||||||
|
|
||||||
|
class PagedList(object):
    """A list-like view over results that are fetched one page at a time.

    pagefunc -- callable taking a 0-based page number and returning an
                iterable with the entries of that page.
    pagesize -- number of entries on a full page. A page that yields
                fewer entries is treated as the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return entries [start:end] as a list, fetching only needed pages.

        start -- 0-based index of the first entry to return
                 (negative indices are not supported).
        end   -- index one past the last entry to return, or None for
                 "until the last page".

        Like list slicing, an empty or inverted range (end <= start)
        yields an empty list; no page is fetched in that case.
        """
        if end is not None and end <= start:
            return []

        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = list(self._pagefunc(pagenum))
            page_len = len(page_results)

            # Only the first fetched page can begin before `start`
            startv = max(start - firstid, 0)
            # Only the page containing `end` needs its tail cut off
            endv = (
                end - firstid
                if end is not None and end < nextfirstid
                else None)
            res.extend(page_results[startv:endv])

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if page_len < self._pagesize:
                break

            # The requested range ends within (or exactly at the end of)
            # this page, so later pages are not interesting.
            if end is not None and end <= nextfirstid:
                break
        return res
|
||||||
|
|
Loading…
Reference in a new issue