[extractor/common] Add the encoding parameter

The QQMusic info extractor needs a forced encoding to work correctly.
This commit is contained in:
Yen Chi Hsuan 2015-03-21 12:21:27 +08:00
parent a685ae511a
commit c9a779695d
2 changed files with 23 additions and 15 deletions

View file

@ -324,7 +324,7 @@ class InfoExtractor(object):
self._downloader.report_warning(errmsg) self._downloader.report_warning(errmsg)
return False return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
""" Returns a tuple (page content as string, URL handle) """ """ Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038) # Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)): if isinstance(url_or_request, (compat_str, str)):
@ -334,14 +334,11 @@ class InfoExtractor(object):
if urlh is False: if urlh is False:
assert not fatal assert not fatal
return False return False
content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
return (content, urlh) return (content, urlh)
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None): @staticmethod
content_type = urlh.headers.get('Content-Type', '') def _guess_encoding_from_content(content_type, webpage_bytes):
webpage_bytes = urlh.read()
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
if m: if m:
encoding = m.group(1) encoding = m.group(1)
@ -354,6 +351,16 @@ class InfoExtractor(object):
encoding = 'utf-16' encoding = 'utf-16'
else: else:
encoding = 'utf-8' encoding = 'utf-8'
return encoding
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
content_type = urlh.headers.get('Content-Type', '')
webpage_bytes = urlh.read()
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
if not encoding:
encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
if self._downloader.params.get('dump_intermediate_pages', False): if self._downloader.params.get('dump_intermediate_pages', False):
try: try:
url = url_or_request.get_full_url() url = url_or_request.get_full_url()
@ -410,13 +417,13 @@ class InfoExtractor(object):
return content return content
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5): def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
""" Returns the data of the page as a string """ """ Returns the data of the page as a string """
success = False success = False
try_count = 0 try_count = 0
while success is False: while success is False:
try: try:
res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
success = True success = True
except compat_http_client.IncompleteRead as e: except compat_http_client.IncompleteRead as e:
try_count += 1 try_count += 1
@ -431,10 +438,10 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id, def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML', note='Downloading XML', errnote='Unable to download XML',
transform_source=None, fatal=True): transform_source=None, fatal=True, encoding=None):
"""Return the xml as an xml.etree.ElementTree.Element""" """Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage( xml_string = self._download_webpage(
url_or_request, video_id, note, errnote, fatal=fatal) url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
if xml_string is False: if xml_string is False:
return xml_string return xml_string
if transform_source: if transform_source:
@ -445,9 +452,10 @@ class InfoExtractor(object):
note='Downloading JSON metadata', note='Downloading JSON metadata',
errnote='Unable to download JSON metadata', errnote='Unable to download JSON metadata',
transform_source=None, transform_source=None,
fatal=True): fatal=True, encoding=None):
json_string = self._download_webpage( json_string = self._download_webpage(
url_or_request, video_id, note, errnote, fatal=fatal) url_or_request, video_id, note, errnote, fatal=fatal,
encoding=encoding)
if (not fatal) and json_string is False: if (not fatal) and json_string is False:
return None return None
return self._parse_json( return self._parse_json(

View file

@ -24,7 +24,7 @@ class QQMusicIE(InfoExtractor):
'title': '可惜没如果', 'title': '可惜没如果',
'upload_date': '20141227', 'upload_date': '20141227',
'creator': '林俊杰', 'creator': '林俊杰',
'description': 'md5:242c97c2847e0495583b7b13764f7106', 'description': 'md5:4348ff1dd24036906baa7b6f973f8d30',
} }
}] }]
@ -41,7 +41,7 @@ class QQMusicIE(InfoExtractor):
detail_info_page = self._download_webpage( detail_info_page = self._download_webpage(
'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid, 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
mid, note='Download song detail info', mid, note='Download song detail info',
errnote='Unable to get song detail info') errnote='Unable to get song detail info', encoding='gbk')
song_name = self._html_search_regex( song_name = self._html_search_regex(
r"songname:\s*'([^']+)'", detail_info_page, 'song name') r"songname:\s*'([^']+)'", detail_info_page, 'song name')