From faab1d3836ca6c2a3c28ee02efe25d211282f45f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
Date: Fri, 6 Sep 2013 14:38:41 +0200
Subject: [PATCH 1/5] [youtube] Fix detection of feeds urls (fixes #1294)
URLs like https://www.youtube.com/feed/watch_later were being detected as users (before the last changes to YoutubeUserIE, as videos)
---
test/test_all_urls.py | 6 ++++++
youtube_dl/extractor/youtube.py | 4 ++--
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index fe4090d18..219c453af 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -44,6 +44,12 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_user_matching(self):
self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
+ def test_youtube_feeds(self):
+ self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watch_later'])
+ self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
+ self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
+ self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
+
def test_justin_tv_channelid_matching(self):
self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 98a44f333..62aecea02 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1015,14 +1015,14 @@ class YoutubeChannelIE(InfoExtractor):
class YoutubeUserIE(InfoExtractor):
IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)([A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
IE_NAME = u'youtube:user'
def suitable(cls, url):
- if YoutubeIE.suitable(url): return False
+ if YoutubeIE.suitable(url) or YoutubeFavouritesIE.suitable(url): return False
else: return super(YoutubeUserIE, cls).suitable(url)
def _real_extract(self, url):
From e3ea47908747bff4b46b4000fb1de944b400c21a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
Date: Fri, 6 Sep 2013 16:24:24 +0200
Subject: [PATCH 2/5] [youtube] Fix some issues with the detection of
playlist/channel urls (reported in #1374)
They were being caught by YoutubeUserIE; now it only extracts a URL if none of the other extractors is suitable.
Now the URL tests check that each URL can only be extracted with a specific extractor.
---
test/test_all_urls.py | 27 ++++++++++++++++-----------
youtube_dl/extractor/youtube.py | 8 ++++++--
2 files changed, 22 insertions(+), 13 deletions(-)
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index 219c453af..5d8d93e0e 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -21,14 +21,15 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertEqual(self.matching_ies(url), ie_list)
def test_youtube_playlist_matching(self):
- self.assertTrue(YoutubePlaylistIE.suitable(u'ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8'))
- self.assertTrue(YoutubePlaylistIE.suitable(u'UUBABnxM4Ar9ten8Mdjj1j0Q')) #585
- self.assertTrue(YoutubePlaylistIE.suitable(u'PL63F0C78739B09958'))
- self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q'))
- self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8'))
- self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC'))
- self.assertTrue(YoutubePlaylistIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
- self.assertFalse(YoutubePlaylistIE.suitable(u'PLtS2H6bU1M'))
+ assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
+ assertPlaylist(u'ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
+ assertPlaylist(u'UUBABnxM4Ar9ten8Mdjj1j0Q') #585
+ assertPlaylist(u'PL63F0C78739B09958')
+ assertPlaylist(u'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
+ assertPlaylist(u'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
+ assertPlaylist(u'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
+ assertPlaylist(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') #668
+ self.assertFalse('youtube:playlist' in self.matching_ies(u'PLtS2H6bU1M'))
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable(u'PLtS2H6bU1M'))
@@ -37,9 +38,10 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
def test_youtube_channel_matching(self):
- self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM'))
- self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec'))
- self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM/videos'))
+ assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
+ assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')
+ assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
+ assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
def test_youtube_user_matching(self):
self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
@@ -50,6 +52,9 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
+ def test_youtube_show_matching(self):
+ self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
+
def test_justin_tv_channelid_matching(self):
self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 62aecea02..423a5e973 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -386,7 +386,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
- if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False
+ if YoutubePlaylistIE.suitable(url): return False
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_video_webpage_download(self, video_id):
@@ -1021,8 +1021,12 @@ class YoutubeUserIE(InfoExtractor):
_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
IE_NAME = u'youtube:user'
+ @classmethod
def suitable(cls, url):
- if YoutubeIE.suitable(url) or YoutubeFavouritesIE.suitable(url): return False
+        # Don't return True if the url can be extracted with another youtube
+        # extractor; the regex is too permissive and would match anyway.
+ other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
+ if any(ie.suitable(url) for ie in other_ies): return False
else: return super(YoutubeUserIE, cls).suitable(url)
def _real_extract(self, url):
From 7e77275293bac0514253c1d38b8d19f926a69d8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
Date: Fri, 6 Sep 2013 18:08:07 +0200
Subject: [PATCH 3/5] Add an extractor for Metacritic
---
youtube_dl/extractor/__init__.py | 1 +
youtube_dl/extractor/metacritic.py | 55 ++++++++++++++++++++++++++++++
2 files changed, 56 insertions(+)
create mode 100644 youtube_dl/extractor/metacritic.py
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 70ebd29e2..fbe0b8cb7 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -52,6 +52,7 @@ from .keek import KeekIE
from .liveleak import LiveLeakIE
from .livestream import LivestreamIE
from .metacafe import MetacafeIE
+from .metacritic import MetacriticIE
from .mit import TechTVMITIE, MITIE
from .mixcloud import MixcloudIE
from .mtv import MTVIE
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
new file mode 100644
index 000000000..449138b56
--- /dev/null
+++ b/youtube_dl/extractor/metacritic.py
@@ -0,0 +1,55 @@
+import re
+import xml.etree.ElementTree
+import operator
+
+from .common import InfoExtractor
+
+
+class MetacriticIE(InfoExtractor):  # Extractor for metacritic.com trailer pages
+    _VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P<id>\d+)'  # 'id' group is read in _real_extract
+
+    _TEST = {
+        u'url': u'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222',
+        u'file': u'3698222.mp4',
+        u'info_dict': {
+            u'title': u'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors',
+            u'description': u'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.',
+            u'duration': 221,
+        },
+    }
+
+    def _real_extract(self, url):  # Returns the info dict (id, title, formats, description, duration)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        # The XML is not well-formed: it contains raw '&', so escape them before parsing
+        info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id,
+            video_id, u'Downloading info xml').replace('&', '&amp;')
+        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+
+        clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
+        formats = []
+        for videoFile in clip.findall('httpURI/videoFile'):
+            rate_str = videoFile.find('rate').text
+            video_url = videoFile.find('filePath').text
+            formats.append({
+                'url': video_url,
+                'ext': 'mp4',
+                'format_id': rate_str,
+                'rate': int(rate_str),
+            })
+        formats.sort(key=operator.itemgetter('rate'))  # ascending, so formats[-1] has the highest rate
+
+        description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>',
+            webpage, u'description', flags=re.DOTALL)  # NOTE(review): confirm the tags against the page markup
+
+        info = {
+            'id': video_id,
+            'title': clip.find('title').text,
+            'formats': formats,
+            'description': description,
+            'duration': int(clip.find('duration').text),
+        }
+        # TODO: Remove when #980 has been merged
+        info.update(formats[-1])
+        return info
From a490fda7464a3cb9d7b5938305241740bae69efb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
Date: Fri, 6 Sep 2013 18:36:07 +0200
Subject: [PATCH 4/5] [dailymotion] accept embed urls (fixes #1386)
---
youtube_dl/extractor/dailymotion.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 439033d23..3c616e089 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -14,7 +14,7 @@ from ..utils import (
class DailymotionIE(InfoExtractor):
"""Information Extractor for Dailymotion"""
- _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
+ _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)'
IE_NAME = u'dailymotion'
_TEST = {
u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech',
@@ -33,6 +33,7 @@ class DailymotionIE(InfoExtractor):
video_id = mobj.group(1).split('_')[0].split('?')[0]
video_extension = 'mp4'
+ url = 'http://www.dailymotion.com/video/%s' % video_id
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url)
From a7130543fa0368175740f5fa173ef920671db866 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?=
Date: Fri, 6 Sep 2013 18:39:35 +0200
Subject: [PATCH 5/5] [generic] If the url doesn't specify the protocol, then
try to extract prepending 'http://'
---
youtube_dl/extractor/generic.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index de7379a92..f92e61fea 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -109,6 +109,11 @@ class GenericIE(InfoExtractor):
return new_url
def _real_extract(self, url):
+ parsed_url = compat_urlparse.urlparse(url)
+ if not parsed_url.scheme:
+ self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
+ return self.url_result('http://' + url)
+
try:
new_url = self._test_redirect(url)
if new_url: