[wsj] Improve and modernize (closes #12558)

This commit is contained in:
Sergey M․ 2017-04-15 20:51:47 +07:00
parent 3266d08af2
commit b2a19e3829
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@@ -12,11 +12,12 @@ from ..utils import (
class WSJIE(InfoExtractor): class WSJIE(InfoExtractor):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?: (?:
https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
https?://(?:www\.)?wsj\.com/video/[^/]+/| https?://(?:www\.)?wsj\.com/video/[^/]+/|
wsj: wsj:
) )
(?P<id>[a-zA-Z0-9-]+)''' (?P<id>[a-fA-F0-9-]{36})
'''
IE_DESC = 'Wall Street Journal' IE_DESC = 'Wall Street Journal'
_TESTS = [{ _TESTS = [{
'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
@@ -39,12 +40,17 @@ class WSJIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
api_url = ( info = self._download_json(
'http://video-api.wsj.com/api-video/find_all_videos.asp?' 'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,' query={
'thumbnailList,author,description,name,duration,videoURL,' 'type': 'guid',
'titletag,formattedCreationDate,keywords,editor' % video_id) 'count': 1,
info = self._download_json(api_url, video_id)['items'][0] 'query': video_id,
'fields': ','.join((
'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
'description', 'name', 'duration', 'videoURL', 'titletag',
'formattedCreationDate', 'keywords', 'editor')),
})['items'][0]
title = info.get('name', info.get('titletag')) title = info.get('name', info.get('titletag'))
formats = [] formats = []
@@ -91,8 +97,8 @@ class WSJIE(InfoExtractor):
class WSJArticleIE(InfoExtractor): class WSJArticleIE(InfoExtractor):
_VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>\w[^/]+)' _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
_TESTS = [{ _TEST = {
'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?', 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
'info_dict': { 'info_dict': {
'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362', 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
@@ -101,11 +107,11 @@ class WSJArticleIE(InfoExtractor):
'uploader_id': 'ralcaraz', 'uploader_id': 'ralcaraz',
'title': 'Bao Bao the Panda Leaves for China', 'title': 'Bao Bao the Panda Leaves for China',
} }
}] }
def _real_extract(self, url): def _real_extract(self, url):
article_id = self._match_id(url) article_id = self._match_id(url)
webpage = self._download_webpage(url, article_id) webpage = self._download_webpage(url, article_id)
video_id = self._search_regex(r'data-src=["\']([A-Z0-9\-]+)', video_id = self._search_regex(
webpage, 'video id') r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id) return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)