[ellentube] Fix extraction (closes #14407)

This commit is contained in:
Alex Seiler 2017-10-23 21:15:48 +02:00 committed by Sergey M․
parent 1115271ac6
commit e2707a832c
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
3 changed files with 145 additions and 104 deletions

View file

@ -0,0 +1,140 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
clean_html,
int_or_none,
)
class EllenTubeIE(InfoExtractor):
    """Extract a single ellentube video from the item API by its UUID.

    Matches either the API item URL or the internal ``ellentube:<uuid>``
    scheme that the page-level extractors in this file delegate to.
    """

    _VALID_URL = r'''(?x)
                    (?:
                        https://api-prod\.ellentube\.com/ellenapi/api/item/
                        |ellentube:
                    )
                    (?P<id>
                        [\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}
                    )'''
    _TESTS = [{
        'url': 'https://api-prod.ellentube.com/ellenapi/api/item/75c64c16-aefd-4558-b4f5-3de09b22e6fc',
        'match_only': True,
    }, {
        'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0',
        'match_only': True,
    }]

    def _real_extract(self, url):
        """Download the item JSON for the matched UUID and build the info dict.

        The HLS formats and the duration are taken from the first ``media``
        entry whose ``id`` is ``'m3u8'``; other entries are ignored.
        """
        video_id = self._match_id(url)
        data = self._download_json(
            'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id,
            video_id)

        # The title is mandatory: a missing key fails the extraction here.
        title = data['title']

        formats = []
        duration = None
        # 'media' may be missing or null; iterate an empty list in that case
        # so the failure surfaces through the regular "no formats" path
        # instead of a TypeError.
        for entry in data.get('media') or []:
            if entry.get('id') == 'm3u8':
                formats = self._extract_m3u8_formats(
                    entry.get('url'), video_id, 'mp4',
                    entry_protocol='m3u8_native', m3u8_id='hls')
                duration = int_or_none(entry.get('duration'))
                break
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': data.get('description'),
            'duration': duration,
            'thumbnail': data.get('thumbnail'),
            # publishTime is reported in milliseconds (13-digit values such
            # as 1508505120000), whereas 'timestamp' is a UNIX timestamp in
            # seconds, so scale it down.
            'timestamp': int_or_none(data.get('publishTime'), scale=1000),
            'formats': formats,
        }


class EllenTubeVideoIE(InfoExtractor):
    """Resolve an ellentube.com video page to its ``ellentube:<uuid>`` item."""

    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P<id>.+)\.html'
    _TEST = {
        'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html',
        'md5': '2fabc277131bddafdd120e0fc0f974c9',
        'info_dict': {
            'id': '0822171c-3829-43bf-b99f-d77358ae75e3',
            'ext': 'mp4',
            'title': 'Ellen Meets Las Vegas Survivors Jesus Campos and Stephen Schuck',
            'description': 'md5:76e3355e2242a78ad9e3858e5616923f',
            'duration': 514,
            # Seconds since the epoch (the API value divided by 1000).
            'timestamp': 1508505120,
            'upload_date': '20171020',
            'thumbnail': 'https://warnerbros-h.assetsadobe.com/is/image/content/dam/ellen/videos/episodes/season15/32/video--2728751654987218111',
        }
    }

    def _real_extract(self, url):
        """Scrape the video UUID from the page and delegate to EllenTubeIE."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)
        # The greedy '.+' means the last UUID following 'data-config' after
        # the CONTENT marker is the one captured.
        video_id = self._html_search_regex(
            r'(?s)<!--\s*CONTENT\s*-->.*data-config.+([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
            webpage, 'video id')
        return self.url_result('ellentube:%s' % video_id, 'EllenTube')
class EllenTubePlaylistIE(InfoExtractor):
    """Shared helpers for ellentube pages that list several videos.

    This class declares no ``_VALID_URL`` of its own; subclasses supply the
    URL pattern and call ``_extract_playlist``.
    """

    def _extract_videos_from_json(self, data, display_id):
        """Return url_result entries for every feed element of type VIDEO.

        ``display_id`` is accepted for interface compatibility and is not
        used.
        """
        entries = []
        for item in data:
            if item.get('type') != 'VIDEO':
                continue
            entries.append(
                self.url_result('ellentube:%s' % item['id'], 'EllenTube'))
        return entries

    def _extract_playlist(self, url, display_id, extract_description=True):
        """Build a playlist result from the page at ``url``.

        Title, optional description and the feed query are read from the
        page's "Details" component; the entries come from the feed API
        queried with that filter string.
        """
        page = self._download_webpage(url, display_id)
        details = self._html_search_regex(
            r'<div\s+data-component\s*=\s*"Details"(.+)</div>', page, 'playlist data')

        title = self._search_regex(
            r'"title"\s*:\s*"(.+?)"', details, 'playlist title')

        if extract_description:
            # Non-fatal: a missing description yields None.
            description = clean_html(self._search_regex(
                r'"description"\s*:\s*"(.+?)"', details, 'playlist description',
                fatal=False))
        else:
            description = None

        feed_query = self._search_regex(
            r'"filter"\s*:\s*"(.+?)"', details, 'playlist api request')
        feed = self._download_json(
            'https://api-prod.ellentube.com/ellenapi/api/feed/?%s' % feed_query,
            display_id)

        entries = self._extract_videos_from_json(feed, display_id)
        return self.playlist_result(entries, display_id, title, description)
class EllenTubeEpisodeIE(EllenTubePlaylistIE):
    """Playlist extractor for ellentube.com episode pages."""

    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/episode/(?P<id>.+)\.html'
    _TEST = {
        'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html',
        'info_dict': {
            'id': 'dax-shepard-jordan-fisher-haim',
            'title': 'Dax Shepard, \'DWTS\' Team Jordan Fisher & Lindsay Arnold, HAIM',
            'description': 'md5:aed85d42892f6126e71ec5ed2aea2a0d'
        },
        'playlist_count': 6,
    }

    def _real_extract(self, url):
        """Extract the episode playlist, keeping the default description lookup."""
        return self._extract_playlist(url, self._match_id(url))
class EllenTubeStudioIE(EllenTubePlaylistIE):
    """Playlist extractor for ellentube.com studio pages."""

    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/studios/(?P<id>.+)\.html'
    _TEST = {
        'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html',
        'info_dict': {
            'id': 'macey-goes-rving0',
            'title': 'Macey Goes RVing',
        },
        'playlist_mincount': 3,
    }

    def _real_extract(self, url):
        """Extract the studio playlist without looking up a description."""
        studio_id = self._match_id(url)
        return self._extract_playlist(
            url, studio_id, extract_description=False)

View file

@ -1,101 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .kaltura import KalturaIE
from ..utils import NO_DEFAULT
class EllenTVIE(InfoExtractor):
    """Resolve an ellentv.com / ellentube.com video page to its Kaltura entry."""

    _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)'
    _TESTS = [{
        'url': 'http://www.ellentv.com/videos/0-ipq1gsai/',
        'md5': '4294cf98bc165f218aaa0b89e0fd8042',
        'info_dict': {
            'id': '0_ipq1gsai',
            'ext': 'mov',
            'title': 'Fast Fingers of Fate',
            'description': 'md5:3539013ddcbfa64b2a6d1b38d910868a',
            'timestamp': 1428035648,
            'upload_date': '20150403',
            'uploader_id': 'batchUser',
        },
    }, {
        # not available via http://widgets.ellentube.com/
        'url': 'http://www.ellentv.com/videos/1-szkgu2m2/',
        'info_dict': {
            'id': '1_szkgu2m2',
            'ext': 'flv',
            'title': "Ellen's Amazingly Talented Audience",
            'description': 'md5:86ff1e376ff0d717d7171590e273f0a5',
            'timestamp': 1255140900,
            'upload_date': '20091010',
            'uploader_id': 'ellenkaltura@gmail.com',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Find the Kaltura partner and entry ids and delegate to KalturaIE.

        The widgets page is tried first and the original URL second. Only
        the last candidate is fatal: its download failure, or a missing id
        on it, raises; earlier candidates are skipped silently.
        """
        video_id = self._match_id(url)
        URLS = ('http://widgets.ellentube.com/videos/%s' % video_id, url)
        for num, url_ in enumerate(URLS, 1):
            is_last = num == len(URLS)
            webpage = self._download_webpage(url_, video_id, fatal=is_last)
            if not webpage:
                # A non-fatal download failure yields a falsy value rather
                # than page text; move on to the next candidate instead of
                # running the regex searches on it.
                continue
            # On the last candidate a missing id must raise, hence NO_DEFAULT.
            default = NO_DEFAULT if is_last else None
            partner_id = self._search_regex(
                r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id',
                default=default)
            kaltura_id = self._search_regex(
                [r'id="kaltura_player_([^"]+)"',
                 r"_wb_entry_id\s*:\s*'([^']+)",
                 r'data-kaltura-entry-id="([^"]+)'],
                webpage, 'kaltura id', default=default)
            if partner_id and kaltura_id:
                break
        return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), KalturaIE.ie_key())
class EllenTVClipsIE(InfoExtractor):
    """Playlist extractor for ellentv.com episode pages backed by Kaltura clips."""

    IE_NAME = 'EllenTV:clips'
    _VALID_URL = r'https?://(?:www\.)?ellentv\.com/episodes/(?P<id>[a-z0-9_-]+)'
    _TEST = {
        'url': 'http://www.ellentv.com/episodes/meryl-streep-vanessa-hudgens/',
        'info_dict': {
            'id': 'meryl-streep-vanessa-hudgens',
            'title': 'Meryl Streep, Vanessa Hudgens',
        },
        'playlist_mincount': 5,
    }

    def _real_extract(self, url):
        """Return a playlist dict whose entries are the clips embedded in the page."""
        playlist_id = self._match_id(url)
        page = self._download_webpage(url, playlist_id)
        clips = self._extract_playlist(page, playlist_id)

        result = {'_type': 'playlist', 'id': playlist_id}
        result['title'] = self._og_search_title(page)
        result['entries'] = self._extract_entries(clips)
        return result

    def _extract_playlist(self, webpage, playlist_id):
        """Parse the clip list passed to ``playerView.addClips`` in the page."""
        # The capture excludes the outer '[{' and '}]', so they are restored
        # before the text is parsed as JSON.
        inner = self._search_regex(
            r'playerView.addClips\(\[\{(.*?)\}\]\);', webpage, 'json')
        return self._parse_json('[{%s}]' % inner, playlist_id)

    def _extract_entries(self, playlist):
        """Turn each clip record into a url_result handled by KalturaIE."""
        entries = []
        for clip in playlist:
            entry_id = clip['kaltura_entry_id']
            entries.append(self.url_result(
                'kaltura:%s:%s' % (clip['kaltura_partner_id'], entry_id),
                KalturaIE.ie_key(), video_id=entry_id))
        return entries

View file

@ -311,9 +311,11 @@ from .ehow import EHowIE
from .eighttracks import EightTracksIE from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE from .einthusan import EinthusanIE
from .eitb import EitbIE from .eitb import EitbIE
from .ellentv import ( from .ellentube import (
EllenTVIE, EllenTubeIE,
EllenTVClipsIE, EllenTubeEpisodeIE,
EllenTubeStudioIE,
EllenTubeVideoIE,
) )
from .elpais import ElPaisIE from .elpais import ElPaisIE
from .embedly import EmbedlyIE from .embedly import EmbedlyIE