Extractor for non-password-protected GDC Vault videos

This commit is contained in:
David Wagner 2014-02-26 22:25:04 +00:00
parent 2acea5c03d
commit 9e68f9fdf1
2 changed files with 62 additions and 0 deletions

View file

@ -89,6 +89,7 @@ from .funnyordie import FunnyOrDieIE
from .gamekings import GamekingsIE
from .gamespot import GameSpotIE
from .gametrailers import GametrailersIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE

View file

@ -0,0 +1,61 @@
from __future__ import unicode_literals
import re
import json
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import unified_strdate
class GDCVaultIE(InfoExtractor):
    """Extractor for GDC Vault talk pages that are not password protected.

    The talk page embeds its player through an ``<iframe>`` whose ``src``
    carries both the player's base URL and the name of an XML description
    file.  That XML file supplies the title and the list of MP4 renditions.
    """

    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
    # ``unicode_literals`` is in effect for this module, so explicit ``u''``
    # prefixes are unnecessary.
    _TEST = {
        'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
        'md5': '7ce8388f544c88b7ac11c7ab1b593704',
        'info_dict': {
            'id': '1019721',
            'ext': 'mp4',
            'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)',
        },
    }

    def _real_extract(self, url):
        """Extract the title and MP4 formats for the talk at *url*.

        Returns a one-element list holding a dict with the keys ``id``,
        ``formats`` (a list of ``{'url', 'vbr'}`` dicts) and ``title``.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # The page is fetched by numeric id only; the slug part of the
        # incoming URL is not used.
        webpage_url = 'http://www.gdcvault.com/play/' + video_id
        start_page = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # Both values come from the same iframe ``src`` attribute: everything
        # before "player.html" is the base URL, and the ``xml=`` query
        # argument names the description file.
        xml_root = self._html_search_regex(
            r'<iframe src="(?P<xml_root>.*?)player\.html.*?".*?</iframe>',
            start_page, 'xml root')
        xml_name = self._html_search_regex(
            r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>',
            start_page, 'xml filename')

        xml_description_url = xml_root + 'xml/' + xml_name
        xml_description = self._download_xml(xml_description_url, video_id)

        video_title = xml_description.find('./metadata/title').text
        mp4_video = xml_description.find('./metadata/mp4video').text

        # NOTE(review): relies on InfoExtractor._search_regex (the plain-text
        # counterpart of _html_search_regex used above) so that a mismatch is
        # reported by the extractor helper instead of surfacing as an
        # AttributeError on a ``None`` match object - confirm it is available
        # on the base class.  The leading ``^`` keeps the start-of-string
        # anchoring that ``re.match`` provided.
        video_root = self._search_regex(
            r'^(?P<root>https?://.*?/)', mp4_video, 'video root')

        video_formats = []
        for rendition in xml_description.findall('./metadata/MBRVideos/MBRVideo'):
            # Stream names look like "mp4:<path>"; the path is relative to
            # the server root taken from the mp4video URL.
            stream_name = rendition.find('streamName').text
            stream_path = self._search_regex(
                r'^mp4:(?P<path>.*)', stream_name, 'stream path')
            bitrate = rendition.find('bitrate').text
            video_formats.append({
                'url': video_root + stream_path,
                'vbr': int(bitrate),
            })

        return [{
            'id': video_id,
            'formats': video_formats,
            'title': video_title,
        }]