Use _download_xml
in more extractors
This commit is contained in:
parent
a0088bdf93
commit
1825836235
4 changed files with 25 additions and 22 deletions
|
@@ -1,5 +1,4 @@
|
||||||
import re
|
import re
|
||||||
import xml.etree.ElementTree
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
|
@@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor):
|
||||||
uploader_id = mobj.group('company')
|
uploader_id = mobj.group('company')
|
||||||
|
|
||||||
playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
|
playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
|
||||||
playlist_snippet = self._download_webpage(playlist_url, movie)
|
def fix_html(s):
|
||||||
playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
|
s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
|
||||||
playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
|
s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
|
||||||
# The ' in the onClick attributes are not escaped, it couldn't be parsed
|
# The ' in the onClick attributes are not escaped, it couldn't be parsed
|
||||||
# with xml.etree.ElementTree.fromstring
|
|
||||||
# like: http://trailers.apple.com/trailers/wb/gravity/
|
# like: http://trailers.apple.com/trailers/wb/gravity/
|
||||||
def _clean_json(m):
|
def _clean_json(m):
|
||||||
return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
|
return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
|
||||||
playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
|
s = re.sub(self._JSON_RE, _clean_json, s)
|
||||||
playlist_html = u'<html>' + playlist_cleaned + u'</html>'
|
s = u'<html>' + s + u'</html>'
|
||||||
|
return s
|
||||||
|
doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
|
||||||
|
|
||||||
doc = xml.etree.ElementTree.fromstring(playlist_html)
|
|
||||||
playlist = []
|
playlist = []
|
||||||
for li in doc.findall('./div/ul/li'):
|
for li in doc.findall('./div/ul/li'):
|
||||||
on_click = li.find('.//a').attrib['onClick']
|
on_click = li.find('.//a').attrib['onClick']
|
||||||
|
|
|
@@ -1,9 +1,9 @@
|
||||||
import re
|
import re
|
||||||
import xml.etree.ElementTree
|
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import (
|
from ..utils import (
|
||||||
find_xpath_attr,
|
find_xpath_attr,
|
||||||
|
fix_xml_all_ampersand,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor):
|
||||||
# it includes a required token
|
# it includes a required token
|
||||||
flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
|
flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
|
||||||
|
|
||||||
playlist_page = self._download_webpage(
|
pdoc = self._download_xml(
|
||||||
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
|
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
|
||||||
video_id, u'Downloading video info')
|
video_id, u'Downloading video info',
|
||||||
# Fix broken xml
|
transform_source=fix_xml_all_ampersand)
|
||||||
playlist_page = re.sub('&amp;', '&', playlist_page)
|
|
||||||
pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
|
|
||||||
|
|
||||||
track_doc = pdoc.find('trackList/track')
|
track_doc = pdoc.find('trackList/track')
|
||||||
def find_param(name):
|
def find_param(name):
|
||||||
|
|
|
@@ -1,8 +1,10 @@
|
||||||
import re
|
import re
|
||||||
import xml.etree.ElementTree
|
|
||||||
import operator
|
import operator
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
|
from ..utils import (
|
||||||
|
fix_xml_all_ampersand,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class MetacriticIE(InfoExtractor):
|
class MetacriticIE(InfoExtractor):
|
||||||
|
@@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor):
|
||||||
video_id = mobj.group('id')
|
video_id = mobj.group('id')
|
||||||
webpage = self._download_webpage(url, video_id)
|
webpage = self._download_webpage(url, video_id)
|
||||||
# The xml is not well formatted, there are raw '&'
|
# The xml is not well formatted, there are raw '&'
|
||||||
info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
|
info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
|
||||||
video_id, u'Downloading info xml').replace('&', '&amp;')
|
video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand)
|
||||||
info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
|
|
||||||
|
|
||||||
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
|
clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
|
||||||
formats = []
|
formats = []
|
||||||
|
|
|
@@ -1057,3 +1057,8 @@ def month_by_name(name):
|
||||||
return ENGLISH_NAMES.index(name) + 1
|
return ENGLISH_NAMES.index(name) + 1
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def fix_xml_all_ampersand(xml_str):
|
||||||
|
"""Replace all the '&' by '&amp;' in XML"""
|
||||||
|
return xml_str.replace(u'&', u'&amp;')
|
||||||
|
|
Loading…
Reference in a new issue