[funnyordie] move extraction to VoxMedia extractor and improve vox volume embed extraction(closes #16846)

This commit is contained in:
Remita Amine 2019-07-10 16:47:37 +01:00
parent cfe781d4fa
commit e4d53148f5
3 changed files with 67 additions and 197 deletions

View File

@ -396,7 +396,6 @@ from .frontendmasters import (
) )
from .funimation import FunimationIE from .funimation import FunimationIE
from .funk import FunkIE from .funk import FunkIE
from .funnyordie import FunnyOrDieIE
from .fusion import FusionIE from .fusion import FusionIE
from .fxnetworks import FXNetworksIE from .fxnetworks import FXNetworksIE
from .gaia import GaiaIE from .gaia import GaiaIE

View File

@ -1,162 +0,0 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
unified_timestamp,
)
class FunnyOrDieIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|articles|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])'
_TESTS = [{
'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version',
'md5': 'bcd81e0c4f26189ee09be362ad6e6ba9',
'info_dict': {
'id': '0732f586d7',
'ext': 'mp4',
'title': 'Heart-Shaped Box: Literal Video Version',
'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
'thumbnail': r're:^http:.*\.jpg$',
'uploader': 'DASjr',
'timestamp': 1317904928,
'upload_date': '20111006',
'duration': 318.3,
},
}, {
'url': 'http://www.funnyordie.com/embed/e402820827',
'info_dict': {
'id': 'e402820827',
'ext': 'mp4',
'title': 'Please Use This Song (Jon Lajoie)',
'description': 'Please use this to sell something. www.jonlajoie.com',
'thumbnail': r're:^http:.*\.jpg$',
'timestamp': 1398988800,
'upload_date': '20140502',
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.funnyordie.com/articles/ebf5e34fc8/10-hours-of-walking-in-nyc-as-a-man',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
links = re.findall(r'<source src="([^"]+/v)[^"]+\.([^"]+)" type=\'video', webpage)
if not links:
raise ExtractorError('No media links available for %s' % video_id)
links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0)
m3u8_url = self._search_regex(
r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8[^"\']*)\1',
webpage, 'm3u8 url', group='url')
formats = []
m3u8_formats = self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
source_formats = list(filter(
lambda f: f.get('vcodec') != 'none', m3u8_formats))
bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)]
bitrates.sort()
if source_formats:
self._sort_formats(source_formats)
for bitrate, f in zip(bitrates, source_formats or [{}] * len(bitrates)):
for path, ext in links:
ff = f.copy()
if ff:
if ext != 'mp4':
ff = dict(
[(k, v) for k, v in ff.items()
if k in ('height', 'width', 'format_id')])
ff.update({
'format_id': ff['format_id'].replace('hls', ext),
'ext': ext,
'protocol': 'http',
})
else:
ff.update({
'format_id': '%s-%d' % (ext, bitrate),
'vbr': bitrate,
})
ff['url'] = self._proto_relative_url(
'%s%d.%s' % (path, bitrate, ext))
formats.append(ff)
self._check_formats(formats, video_id)
formats.extend(m3u8_formats)
self._sort_formats(
formats, field_preference=('height', 'width', 'tbr', 'format_id'))
subtitles = {}
for src, src_lang in re.findall(r'<track kind="captions" src="([^"]+)" srclang="([^"]+)"', webpage):
subtitles[src_lang] = [{
'ext': src.split('/')[-1],
'url': 'http://www.funnyordie.com%s' % src,
}]
timestamp = unified_timestamp(self._html_search_meta(
'uploadDate', webpage, 'timestamp', default=None))
uploader = self._html_search_regex(
r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
webpage, 'uploader', default=None)
title, description, thumbnail, duration = [None] * 4
medium = self._parse_json(
self._search_regex(
r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium',
default='{}'),
video_id, fatal=False)
if medium:
title = medium.get('title')
duration = float_or_none(medium.get('duration'))
if not timestamp:
timestamp = unified_timestamp(medium.get('publishDate'))
post = self._parse_json(
self._search_regex(
r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details',
default='{}'),
video_id, fatal=False)
if post:
if not title:
title = post.get('name')
description = post.get('description')
thumbnail = post.get('picture')
if not title:
title = self._og_search_title(webpage)
if not description:
description = self._og_search_description(webpage)
if not duration:
duration = int_or_none(self._html_search_meta(
('video:duration', 'duration'), webpage, 'duration', default=False))
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
'timestamp': timestamp,
'duration': duration,
'formats': formats,
'subtitles': subtitles,
}

View File

@ -4,7 +4,10 @@ from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from .once import OnceIE from .once import OnceIE
from ..compat import compat_urllib_parse_unquote from ..compat import compat_urllib_parse_unquote
from ..utils import ExtractorError from ..utils import (
ExtractorError,
int_or_none,
)
class VoxMediaVolumeIE(OnceIE): class VoxMediaVolumeIE(OnceIE):
@ -13,18 +16,43 @@ class VoxMediaVolumeIE(OnceIE):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
video_data = self._parse_json(self._search_regex(
r'Volume\.createVideo\(({.+})\s*,\s*{.*}\s*,\s*\[.*\]\s*,\s*{.*}\);', webpage, 'video data'), video_id) setup = self._parse_json(self._search_regex(
for provider_video_type in ('ooyala', 'youtube', 'brightcove'): r'setup\s*=\s*({.+});', webpage, 'setup'), video_id)
provider_video_id = video_data.get('%s_id' % provider_video_type) video_data = setup.get('video') or {}
if not provider_video_id:
continue
info = { info = {
'id': video_id, 'id': video_id,
'title': video_data.get('title_short'), 'title': video_data.get('title_short'),
'description': video_data.get('description_long') or video_data.get('description_short'), 'description': video_data.get('description_long') or video_data.get('description_short'),
'thumbnail': video_data.get('brightcove_thumbnail') 'thumbnail': video_data.get('brightcove_thumbnail')
} }
asset = setup.get('asset') or setup.get('params') or {}
formats = []
hls_url = asset.get('hls_url')
if hls_url:
formats.extend(self._extract_m3u8_formats(
hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
mp4_url = asset.get('mp4_url')
if mp4_url:
tbr = self._search_regex(r'-(\d+)k\.', mp4_url, 'bitrate', default=None)
format_id = 'http'
if tbr:
format_id += '-' + tbr
formats.append({
'format_id': format_id,
'url': mp4_url,
'tbr': int_or_none(tbr),
})
if formats:
self._sort_formats(formats)
info['formats'] = formats
return info
for provider_video_type in ('ooyala', 'youtube', 'brightcove'):
provider_video_id = video_data.get('%s_id' % provider_video_type)
if not provider_video_id:
continue
if provider_video_type == 'brightcove': if provider_video_type == 'brightcove':
info['formats'] = self._extract_once_formats(provider_video_id) info['formats'] = self._extract_once_formats(provider_video_id)
self._sort_formats(info['formats']) self._sort_formats(info['formats'])
@ -39,46 +67,49 @@ class VoxMediaVolumeIE(OnceIE):
class VoxMediaIE(InfoExtractor): class VoxMediaIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:(?:theverge|vox|sbnation|eater|polygon|curbed|racked)\.com|recode\.net)/(?:[^/]+/)*(?P<id>[^/?]+)' _VALID_URL = r'https?://(?:www\.)?(?:(?:theverge|vox|sbnation|eater|polygon|curbed|racked|funnyordie)\.com|recode\.net)/(?:[^/]+/)*(?P<id>[^/?]+)'
_TESTS = [{ _TESTS = [{
# Volume embed, Youtube
'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of', 'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of',
'info_dict': { 'info_dict': {
'id': '11eXZobjrG8DCSTgrNjVinU-YmmdYjhe', 'id': 'j4mLW6x17VM',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Google\'s new material design direction', 'title': 'Material world: how Google discovered what software is made of',
'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2', 'description': 'md5:dfc17e7715e3b542d66e33a109861382',
'upload_date': '20190710',
'uploader_id': 'TheVerge',
'uploader': 'The Verge',
}, },
'params': { 'add_ie': ['Youtube'],
# m3u8 download
'skip_download': True,
},
'add_ie': ['Ooyala'],
}, { }, {
# data-ooyala-id # Volume embed, Youtube
'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet',
'md5': 'd744484ff127884cd2ba09e3fa604e4b', 'md5': '4c8f4a0937752b437c3ebc0ed24802b5',
'info_dict': { 'info_dict': {
'id': 'RkZXU4cTphOCPDMZg5oEounJyoFI0g-B', 'id': 'Gy8Md3Eky38',
'ext': 'mp4', 'ext': 'mp4',
'title': 'The Nexus 6: hands-on with Google\'s phablet', 'title': 'The Nexus 6: hands-on with Google\'s phablet',
'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af', 'description': 'md5:d9f0216e5fb932dd2033d6db37ac3f1d',
'uploader_id': 'TheVerge',
'upload_date': '20141021',
'uploader': 'The Verge',
}, },
'add_ie': ['Ooyala'], 'add_ie': ['Youtube'],
'skip': 'Video Not Found', 'skip': 'similar to the previous test',
}, { }, {
# volume embed # Volume embed, Youtube
'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill',
'info_dict': { 'info_dict': {
'id': 'wydzk3dDpmRz7PQoXRsTIX6XTkPjYL0b', 'id': 'YCjDnX-Xzhg',
'ext': 'mp4', 'ext': 'mp4',
'title': 'The new frontier of LGBTQ civil rights, explained', 'title': "Mississippi's laws are so bad that its anti-LGBTQ law isn't needed to allow discrimination",
'description': 'md5:0dc58e94a465cbe91d02950f770eb93f', 'description': 'md5:fc1317922057de31cd74bce91eb1c66c',
'uploader_id': 'voxdotcom',
'upload_date': '20150915',
'uploader': 'Vox',
}, },
'params': { 'add_ie': ['Youtube'],
# m3u8 download 'skip': 'similar to the previous test',
'skip_download': True,
},
'add_ie': ['Ooyala'],
}, { }, {
# youtube embed # youtube embed
'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance', 'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance',
@ -93,6 +124,7 @@ class VoxMediaIE(InfoExtractor):
'uploader': 'Vox', 'uploader': 'Vox',
}, },
'add_ie': ['Youtube'], 'add_ie': ['Youtube'],
'skip': 'Page no longer contain videos',
}, { }, {
# SBN.VideoLinkset.entryGroup multiple ooyala embeds # SBN.VideoLinkset.entryGroup multiple ooyala embeds
'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
@ -118,10 +150,11 @@ class VoxMediaIE(InfoExtractor):
'description': 'md5:e02d56b026d51aa32c010676765a690d', 'description': 'md5:e02d56b026d51aa32c010676765a690d',
}, },
}], }],
'skip': 'Page no longer contain videos',
}, { }, {
# volume embed, Brightcove Once # volume embed, Brightcove Once
'url': 'https://www.recode.net/2014/6/17/11628066/post-post-pc-ceo-the-full-code-conference-video-of-microsofts-satya', 'url': 'https://www.recode.net/2014/6/17/11628066/post-post-pc-ceo-the-full-code-conference-video-of-microsofts-satya',
'md5': '01571a896281f77dc06e084138987ea2', 'md5': '2dbc77b8b0bff1894c2fce16eded637d',
'info_dict': { 'info_dict': {
'id': '1231c973d', 'id': '1231c973d',
'ext': 'mp4', 'ext': 'mp4',