[CSpan] Add detection for Senate ISVP. Closes #5302

This commit is contained in:
Yen Chi Hsuan 2015-04-21 03:18:38 +08:00
parent f91e1a8739
commit 2fe1b5bd2a
3 changed files with 41 additions and 3 deletions

View file

@ -7,7 +7,9 @@ from ..utils import (
int_or_none, int_or_none,
unescapeHTML, unescapeHTML,
find_xpath_attr, find_xpath_attr,
smuggle_url,
) )
from .senateisvp import SenateISVPIE
class CSpanIE(InfoExtractor): class CSpanIE(InfoExtractor):
@ -40,6 +42,15 @@ class CSpanIE(InfoExtractor):
'title': 'General Motors Ignition Switch Recall', 'title': 'General Motors Ignition Switch Recall',
}, },
'playlist_duration_sum': 14855, 'playlist_duration_sum': 14855,
}, {
# Video from senate.gov
'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',
'md5': '7314c4b96dad66dd8e63dc3518ceaa6f',
'info_dict': {
'id': 'judiciary031715',
'ext': 'flv',
'title': 'Immigration Reforms Needed to Protect Skilled American Workers',
}
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -56,7 +67,7 @@ class CSpanIE(InfoExtractor):
# present, otherwise this is a stripped version # present, otherwise this is a stripped version
r'<p class=\'initial\'>(.*?)</p>' r'<p class=\'initial\'>(.*?)</p>'
], ],
webpage, 'description', flags=re.DOTALL) webpage, 'description', flags=re.DOTALL, default=None)
info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
data = self._download_json(info_url, video_id) data = self._download_json(info_url, video_id)
@ -68,6 +79,11 @@ class CSpanIE(InfoExtractor):
title = find_xpath_attr(doc, './/string', 'name', 'title').text title = find_xpath_attr(doc, './/string', 'name', 'title').text
thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text
senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
if senate_isvp_url:
surl = smuggle_url(senate_isvp_url, {'force_title': title})
return self.url_result(surl, 'SenateISVP', video_id, title)
files = data['video']['files'] files = data['video']['files']
entries = [{ entries = [{

View file

@ -35,6 +35,7 @@ from .rutv import RUTVIE
from .smotri import SmotriIE from .smotri import SmotriIE
from .condenast import CondeNastIE from .condenast import CondeNastIE
from .udn import UDNEmbedIE from .udn import UDNEmbedIE
from .senateisvp import SenateISVPIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -1365,6 +1366,11 @@ class GenericIE(InfoExtractor):
return self.url_result( return self.url_result(
compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
# Look for Senate ISVP iframe
senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
if senate_isvp_url:
return self.url_result(surl, 'SenateISVP')
def check_video(vurl): def check_video(vurl):
if YoutubeIE.suitable(vurl): if YoutubeIE.suitable(vurl):
return True return True

View file

@ -3,7 +3,10 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ExtractorError from ..utils import (
ExtractorError,
unsmuggle_url,
)
from ..compat import ( from ..compat import (
compat_parse_qs, compat_parse_qs,
compat_urlparse, compat_urlparse,
@ -73,12 +76,22 @@ class SenateISVPIE(InfoExtractor):
} }
}] }]
@staticmethod
def _search_iframe_url(webpage):
mobj = re.search(
r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]",
webpage)
if mobj:
return mobj.group('url')
def _get_info_for_comm(self, committee): def _get_info_for_comm(self, committee):
for entry in self._COMM_MAP: for entry in self._COMM_MAP:
if entry[0] == committee: if entry[0] == committee:
return entry[1:] return entry[1:]
def _real_extract(self, url): def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs')) qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
raise ExtractorError('Invalid URL', expected=True) raise ExtractorError('Invalid URL', expected=True)
@ -87,6 +100,9 @@ class SenateISVPIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
if smuggled_data.get('force_title'):
title = smuggled_data['force_title']
else:
title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id) title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id)
poster = qs.get('poster') poster = qs.get('poster')
if poster: if poster: