[generic] Handle audio streams that do not implement HEAD (Fixes #4032)

This commit is contained in:
Philipp Hagemeister 2014-10-26 17:05:44 +01:00
parent 488447455d
commit 23be51d8ce
2 changed files with 32 additions and 28 deletions

View file

@ -242,7 +242,6 @@ class InfoExtractor(object):
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns a tuple (page content as string, URL handle) """ """ Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038) # Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)): if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0] url_or_request = url_or_request.partition('#')[0]
@ -251,6 +250,10 @@ class InfoExtractor(object):
if urlh is False: if urlh is False:
assert not fatal assert not fatal
return False return False
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
return (content, urlh)
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
content_type = urlh.headers.get('Content-Type', '') content_type = urlh.headers.get('Content-Type', '')
webpage_bytes = urlh.read() webpage_bytes = urlh.read()
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@ -309,7 +312,7 @@ class InfoExtractor(object):
msg += ' Visit %s for more details' % blocked_iframe msg += ' Visit %s for more details' % blocked_iframe
raise ExtractorError(msg, expected=True) raise ExtractorError(msg, expected=True)
return (content, urlh) return content
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
""" Returns the data of the page as a string """ """ Returns the data of the page as a string """

View file

@ -503,14 +503,14 @@ class GenericIE(InfoExtractor):
self.to_screen('%s: Requesting header' % video_id) self.to_screen('%s: Requesting header' % video_id)
head_req = HEADRequest(url) head_req = HEADRequest(url)
response = self._request_webpage( head_response = self._request_webpage(
head_req, video_id, head_req, video_id,
note=False, errnote='Could not send HEAD request to %s' % url, note=False, errnote='Could not send HEAD request to %s' % url,
fatal=False) fatal=False)
if response is not False: if head_response is not False:
# Check for redirect # Check for redirect
new_url = response.geturl() new_url = head_response.geturl()
if url != new_url: if url != new_url:
self.report_following_redirect(new_url) self.report_following_redirect(new_url)
if force_videoid: if force_videoid:
@ -518,13 +518,17 @@ class GenericIE(InfoExtractor):
new_url, {'force_videoid': force_videoid}) new_url, {'force_videoid': force_videoid})
return self.url_result(new_url) return self.url_result(new_url)
full_response = None
if head_response is False:
full_response = self._request_webpage(url, video_id)
head_response = full_response
# Check for direct link to a video # Check for direct link to a video
content_type = response.headers.get('Content-Type', '') content_type = head_response.headers.get('Content-Type', '')
m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type) m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
if m: if m:
upload_date = response.headers.get('Last-Modified') upload_date = unified_strdate(
if upload_date: head_response.headers.get('Last-Modified'))
upload_date = unified_strdate(upload_date)
return { return {
'id': video_id, 'id': video_id,
'title': os.path.splitext(url_basename(url))[0], 'title': os.path.splitext(url_basename(url))[0],
@ -539,13 +543,10 @@ class GenericIE(InfoExtractor):
if not self._downloader.params.get('test', False) and not is_intentional: if not self._downloader.params.get('test', False) and not is_intentional:
self._downloader.report_warning('Falling back on generic information extractor.') self._downloader.report_warning('Falling back on generic information extractor.')
try: if full_response:
webpage = _webpage_read_content(url, video_id)
else:
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
except ValueError:
# since this is the last-resort InfoExtractor, if
# this error is thrown, it'll be thrown here
raise ExtractorError('Failed to download URL: %s' % url)
self.report_extraction(video_id) self.report_extraction(video_id)
# Is it an RSS feed? # Is it an RSS feed?