From 2e90dff2c2ecade8afb444b086fbc0ad6d2c812d Mon Sep 17 00:00:00 2001 From: felix Date: Mon, 16 Mar 2015 20:05:02 +0100 Subject: [PATCH 1/8] The Daily Show Podcast support --- youtube_dl/extractor/__init__.py | 3 +- youtube_dl/extractor/comedycentral.py | 21 ++++++++++++++ youtube_dl/extractor/libsyn.py | 41 +++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/libsyn.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1bb3e1a1c..e94779d40 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -84,7 +84,7 @@ from .cnn import ( ) from .collegehumor import CollegeHumorIE from .collegerama import CollegeRamaIE -from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE +from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE, TheDailyShowPodcastIE from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .condenast import CondeNastIE @@ -250,6 +250,7 @@ from .letv import ( LetvPlaylistIE ) from .lifenews import LifeNewsIE +from .libsyn import LibsynIE from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index e5edcc84b..e427b9821 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re +from .common import InfoExtractor from .mtv import MTVServicesInfoExtractor from ..compat import ( compat_str, @@ -272,3 +273,23 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): 'title': show_name + ' ' + title, 'description': description, } + +class TheDailyShowPodcastIE(InfoExtractor): + _VALID_URL = r'(?Phttps?:)?//thedailyshow\.cc\.com/podcast/(?P[a-z\-]+)' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + player_url = self._search_regex(r']+)?\s*src="((?:https?:)?//html5-player\.libsyn\.com/embed/episode/id/[0-9]+)', webpage, 'player URL') + if player_url.startswith('//'): + mobj = re.match(self._VALID_URL, url) + scheme = mobj.group('scheme') + if not scheme: + scheme = 'https:' + player_url = scheme + player_url + + return { + '_type': 'url_transparent', + 'url': player_url, + } diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py new file mode 100644 index 000000000..4b5029f89 --- /dev/null +++ b/youtube_dl/extractor/libsyn.py @@ -0,0 +1,41 @@ +# encoding: utf-8 +from .common import InfoExtractor +from ..utils import ( + unified_strdate, +) + +class LibsynIE(InfoExtractor): + _VALID_URL = r'(?:https?:)?//html5-player\.libsyn\.com/embed/episode/id/(?P[0-9]+)(?:/.*)?' + + def _real_extract(self, url): + if url.startswith('//'): + url = 'https:' + url + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + podcast_title = self._search_regex(r'

(.*?)

', webpage, 'show title') + podcast_episode_title = self._search_regex(r'

(.*?)

', webpage, 'episode title') + podcast_date = unified_strdate(self._search_regex(r'
Released: (.*?)
', webpage, 'release date')) + podcast_description = self._search_regex(r'
(.*?)
', webpage, 'description') + + url0 = self._search_regex(r'var mediaURLLibsyn = "(?Phttps?://.*)";', webpage, 'first media URL') + url1 = self._search_regex(r'var mediaURL = "(?Phttps?://.*)";', webpage, 'second media URL') + + if url0 != url1: + formats = [{ + 'url': url0 + }, { + 'url': url1 + }] + else: + formats = [{ + 'url': url0 + }] + + return { + 'id': display_id, + 'title': podcast_episode_title, + 'description': podcast_description, + 'upload_date': podcast_date, + 'formats': formats, + } From 9ef4f12b534578ae3d3e47815492c90826c03c36 Mon Sep 17 00:00:00 2001 From: felix Date: Tue, 17 Mar 2015 18:54:36 +0100 Subject: [PATCH 2/8] testcases for libsyn and The Daily Show Podcast extractors --- youtube_dl/extractor/comedycentral.py | 4 ++++ youtube_dl/extractor/libsyn.py | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index e427b9821..bd3817b56 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -276,6 +276,10 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): class TheDailyShowPodcastIE(InfoExtractor): _VALID_URL = r'(?Phttps?:)?//thedailyshow\.cc\.com/podcast/(?P[a-z\-]+)' + _TESTS = [{ + "url": "http://thedailyshow.cc.com/podcast/episodetwelve", + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index 4b5029f89..88379f276 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -6,6 +6,15 @@ from ..utils import ( class LibsynIE(InfoExtractor): _VALID_URL = r'(?:https?:)?//html5-player\.libsyn\.com/embed/episode/id/(?P[0-9]+)(?:/.*)?' + _TESTS = [{ + 'url': "http://html5-player.libsyn.com/embed/episode/id/3377616/", + 'info_dict': { + 'id': "3377616", + 'ext': "mp3", + 'title': "Episode 12: Bassem Youssef: Egypt's Jon Stewart", + 'description': "

Bassem Youssef joins executive producer Steve Bodow and senior producer Sara Taksler for a conversation about how The Daily Show inspired Bassem to create Al-Bernameg, his massively popular (and now banned) Egyptian news satire program. Sara discusses her soon-to-be-released documentary, Tickling Giants, which chronicles how Bassem and his staff risked their safety every day to tell jokes.

", + }, + }] def _real_extract(self, url): if url.startswith('//'): From 49aeedb8cb4dcf317c970a58c590d42e37904720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Mar 2015 08:11:10 +0600 Subject: [PATCH 3/8] [libsyn] Improve and simplify --- youtube_dl/extractor/libsyn.py | 81 +++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index 88379f276..6bf741db8 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -1,50 +1,59 @@ -# encoding: utf-8 +# coding: utf-8 +from __future__ import unicode_literals + +import re + from .common import InfoExtractor -from ..utils import ( - unified_strdate, -) +from ..utils import unified_strdate + class LibsynIE(InfoExtractor): - _VALID_URL = r'(?:https?:)?//html5-player\.libsyn\.com/embed/episode/id/(?P[0-9]+)(?:/.*)?' - _TESTS = [{ - 'url': "http://html5-player.libsyn.com/embed/episode/id/3377616/", + _VALID_URL = r'https?://html5-player\.libsyn\.com/embed/episode/id/(?P[0-9]+)' + + _TEST = { + 'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/', + 'md5': '443360ee1b58007bc3dcf09b41d093bb', 'info_dict': { - 'id': "3377616", - 'ext': "mp3", - 'title': "Episode 12: Bassem Youssef: Egypt's Jon Stewart", - 'description': "

Bassem Youssef joins executive producer Steve Bodow and senior producer Sara Taksler for a conversation about how The Daily Show inspired Bassem to create Al-Bernameg, his massively popular (and now banned) Egyptian news satire program. Sara discusses her soon-to-be-released documentary, Tickling Giants, which chronicles how Bassem and his staff risked their safety every day to tell jokes.

", + 'id': '3377616', + 'ext': 'mp3', + 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", + 'description': 'md5:601cb790edd05908957dae8aaa866465', + 'upload_date': '20150220', }, - }] + } def _real_extract(self, url): - if url.startswith('//'): - url = 'https:' + url - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + video_id = self._match_id(url) - podcast_title = self._search_regex(r'

(.*?)

', webpage, 'show title') - podcast_episode_title = self._search_regex(r'

(.*?)

', webpage, 'episode title') - podcast_date = unified_strdate(self._search_regex(r'
Released: (.*?)
', webpage, 'release date')) - podcast_description = self._search_regex(r'
(.*?)
', webpage, 'description') + webpage = self._download_webpage(url, video_id) - url0 = self._search_regex(r'var mediaURLLibsyn = "(?Phttps?://.*)";', webpage, 'first media URL') - url1 = self._search_regex(r'var mediaURL = "(?Phttps?://.*)";', webpage, 'second media URL') + formats = [{ + 'url': media_url, + } for media_url in set(re.findall('var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))] - if url0 != url1: - formats = [{ - 'url': url0 - }, { - 'url': url1 - }] - else: - formats = [{ - 'url': url0 - }] + podcast_title = self._search_regex( + r'

([^<]+)

', webpage, 'title') + episode_title = self._search_regex( + r'

([^<]+)

', webpage, 'title', default=None) + + title = '%s - %s' %(podcast_title, episode_title) if podcast_title else episode_title + + description = self._html_search_regex( + r'
(.+?)
', webpage, + 'description', fatal=False) + + thumbnail = self._search_regex( + r']+class="info-show-icon"[^>]+src="([^"]+)"', + webpage, 'thumbnail', fatal=False) + + release_date = unified_strdate(self._search_regex( + r'
Released: ([^<]+)<', webpage, 'release date', fatal=False)) return { - 'id': display_id, - 'title': podcast_episode_title, - 'description': podcast_description, - 'upload_date': podcast_date, + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': release_date, 'formats': formats, } From a1d0aa7b882484685a1a02185d0dafd51c545701 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Mar 2015 08:11:47 +0600 Subject: [PATCH 4/8] [libsyn] Fix extractor alphabetic order --- youtube_dl/extractor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a20492fc3..82b75a144 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -250,8 +250,8 @@ from .letv import ( LetvTvIE, LetvPlaylistIE ) -from .lifenews import LifeNewsIE from .libsyn import LibsynIE +from .lifenews import LifeNewsIE from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, From cefdf970ccd8017cd67e949004e5e4c770aacdb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Mar 2015 08:18:13 +0600 Subject: [PATCH 5/8] [extractor/generic] Support Libsyn embeds --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8716e4503..84e8f14b2 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1013,6 +1013,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) + # Look for Libsyn player + mobj = re.search( + r']+src=(["\'])(?P(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + # Look for Ooyala videos mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P.{32})[\'"]', webpage) or From 2051acdeb2ed9a0edf3b6b70682699c37d19d851 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Mar 2015 08:20:27 +0600 Subject: [PATCH 6/8] [extractor/generic] Add test for Libsyn embed --- youtube_dl/extractor/generic.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 84e8f14b2..8a49b0b54 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -527,6 +527,17 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Viddler'], }, + # Libsyn embed + { + 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve', + 'info_dict': { + 'id': '3377616', + 'ext': 'mp3', + 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", + 'description': 'md5:601cb790edd05908957dae8aaa866465', + 'upload_date': '20150220', + }, + }, # jwplayer YouTube { 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/', From cf2e2eb1c0b626f2d5f210ffd14642aceb0358e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Mar 2015 08:23:20 +0600 Subject: [PATCH 7/8] [comedycentral] Drop thedailyshow podcast extractor Generic extractor is just fine for Libsyn embeds --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/comedycentral.py | 24 ------------------------ 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 82b75a144..d73826d44 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -84,7 +84,7 @@ from .cnn import ( ) from .collegehumor import CollegeHumorIE from .collegerama import CollegeRamaIE -from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE, TheDailyShowPodcastIE +from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .condenast import CondeNastIE diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index bd3817b56..648a6f990 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -273,27 +273,3 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): 'title': show_name + ' ' + title, 'description': description, } - -class TheDailyShowPodcastIE(InfoExtractor): - _VALID_URL = r'(?Phttps?:)?//thedailyshow\.cc\.com/podcast/(?P[a-z\-]+)' - _TESTS = [{ - "url": "http://thedailyshow.cc.com/podcast/episodetwelve", - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - player_url = self._search_regex(r']+)?\s*src="((?:https?:)?//html5-player\.libsyn\.com/embed/episode/id/[0-9]+)', webpage, 'player URL') - if player_url.startswith('//'): - mobj = re.match(self._VALID_URL, url) - scheme = mobj.group('scheme') - if not scheme: - scheme = 'https:' - player_url = scheme + player_url - - return { - '_type': 'url_transparent', - 'url': player_url, - } From 1a4123de04d0168ef4a14e6064148eb248d65dc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 22 Mar 2015 08:23:38 +0600 Subject: [PATCH 8/8] [comedycentral] Remove unused import --- youtube_dl/extractor/comedycentral.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 648a6f990..e5edcc84b 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor from .mtv import MTVServicesInfoExtractor from ..compat import ( compat_str,