[jamendo] improve extraction
- fix album extraction (closes #18564)
- improve metadata extraction (closes #18565) (closes #21379)
This commit is contained in:
parent
bf45295c53
commit
e452345fc5
1 changed files with 99 additions and 63 deletions
|
@ -1,38 +1,26 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import re
|
import hashlib
|
||||||
|
import random
|
||||||
|
|
||||||
from ..compat import compat_urlparse
|
from ..compat import compat_str
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..utils import parse_duration
|
from ..utils import (
|
||||||
|
clean_html,
|
||||||
|
int_or_none,
|
||||||
|
try_get,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class JamendoBaseIE(InfoExtractor):
|
class JamendoIE(InfoExtractor):
|
||||||
def _extract_meta(self, webpage, fatal=True):
|
|
||||||
title = self._og_search_title(
|
|
||||||
webpage, default=None) or self._search_regex(
|
|
||||||
r'<title>([^<]+)', webpage,
|
|
||||||
'title', default=None)
|
|
||||||
if title:
|
|
||||||
title = self._search_regex(
|
|
||||||
r'(.+?)\s*\|\s*Jamendo Music', title, 'title', default=None)
|
|
||||||
if not title:
|
|
||||||
title = self._html_search_meta(
|
|
||||||
'name', webpage, 'title', fatal=fatal)
|
|
||||||
mobj = re.search(r'(.+) - (.+)', title or '')
|
|
||||||
artist, second = mobj.groups() if mobj else [None] * 2
|
|
||||||
return title, artist, second
|
|
||||||
|
|
||||||
|
|
||||||
class JamendoIE(JamendoBaseIE):
|
|
||||||
_VALID_URL = r'''(?x)
|
_VALID_URL = r'''(?x)
|
||||||
https?://
|
https?://
|
||||||
(?:
|
(?:
|
||||||
licensing\.jamendo\.com/[^/]+|
|
licensing\.jamendo\.com/[^/]+|
|
||||||
(?:www\.)?jamendo\.com
|
(?:www\.)?jamendo\.com
|
||||||
)
|
)
|
||||||
/track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+)
|
/track/(?P<id>[0-9]+)(?:/(?P<display_id>[^/?#&]+))?
|
||||||
'''
|
'''
|
||||||
_TESTS = [{
|
_TESTS = [{
|
||||||
'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i',
|
'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i',
|
||||||
|
@ -45,7 +33,9 @@ class JamendoIE(JamendoBaseIE):
|
||||||
'artist': 'Maya Filipič',
|
'artist': 'Maya Filipič',
|
||||||
'track': 'Stories from Emona I',
|
'track': 'Stories from Emona I',
|
||||||
'duration': 210,
|
'duration': 210,
|
||||||
'thumbnail': r're:^https?://.*\.jpg'
|
'thumbnail': r're:^https?://.*\.jpg',
|
||||||
|
'timestamp': 1217438117,
|
||||||
|
'upload_date': '20080730',
|
||||||
}
|
}
|
||||||
}, {
|
}, {
|
||||||
'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock',
|
'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock',
|
||||||
|
@ -53,15 +43,19 @@ class JamendoIE(JamendoBaseIE):
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
mobj = self._VALID_URL_RE.match(url)
|
track_id, display_id = self._VALID_URL_RE.match(url).groups()
|
||||||
track_id = mobj.group('id')
|
webpage = self._download_webpage(url, track_id)
|
||||||
display_id = mobj.group('display_id')
|
models = self._parse_json(self._html_search_regex(
|
||||||
|
r"data-bundled-models='([^']+)",
|
||||||
webpage = self._download_webpage(
|
webpage, 'bundled models'), track_id)
|
||||||
'https://www.jamendo.com/track/%s/%s' % (track_id, display_id),
|
track = models['track']['models'][0]
|
||||||
display_id)
|
title = track_name = track['name']
|
||||||
|
get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
|
||||||
title, artist, track = self._extract_meta(webpage)
|
artist = get_model('artist')
|
||||||
|
artist_name = artist.get('name')
|
||||||
|
if artist_name:
|
||||||
|
title = '%s - %s' % (artist_name, title)
|
||||||
|
album = get_model('album')
|
||||||
|
|
||||||
formats = [{
|
formats = [{
|
||||||
'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294'
|
'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294'
|
||||||
|
@ -77,31 +71,58 @@ class JamendoIE(JamendoBaseIE):
|
||||||
))]
|
))]
|
||||||
self._sort_formats(formats)
|
self._sort_formats(formats)
|
||||||
|
|
||||||
thumbnail = self._html_search_meta(
|
urls = []
|
||||||
'image', webpage, 'thumbnail', fatal=False)
|
thumbnails = []
|
||||||
duration = parse_duration(self._search_regex(
|
for _, covers in track.get('cover', {}).items():
|
||||||
r'<span[^>]+itemprop=["\']duration["\'][^>]+content=["\'](.+?)["\']',
|
for cover_id, cover_url in covers.items():
|
||||||
webpage, 'duration', fatal=False))
|
if not cover_url or cover_url in urls:
|
||||||
|
continue
|
||||||
|
urls.append(cover_url)
|
||||||
|
size = int_or_none(cover_id.lstrip('size'))
|
||||||
|
thumbnails.append({
|
||||||
|
'id': cover_id,
|
||||||
|
'url': cover_url,
|
||||||
|
'width': size,
|
||||||
|
'height': size,
|
||||||
|
})
|
||||||
|
|
||||||
|
tags = []
|
||||||
|
for tag in track.get('tags', []):
|
||||||
|
tag_name = tag.get('name')
|
||||||
|
if not tag_name:
|
||||||
|
continue
|
||||||
|
tags.append(tag_name)
|
||||||
|
|
||||||
|
stats = track.get('stats') or {}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'id': track_id,
|
'id': track_id,
|
||||||
'display_id': display_id,
|
'display_id': display_id,
|
||||||
'thumbnail': thumbnail,
|
'thumbnails': thumbnails,
|
||||||
'title': title,
|
'title': title,
|
||||||
'duration': duration,
|
'description': track.get('description'),
|
||||||
'artist': artist,
|
'duration': int_or_none(track.get('duration')),
|
||||||
'track': track,
|
'artist': artist_name,
|
||||||
'formats': formats
|
'track': track_name,
|
||||||
|
'album': album.get('name'),
|
||||||
|
'formats': formats,
|
||||||
|
'license': '-'.join(track.get('licenseCC', [])) or None,
|
||||||
|
'timestamp': int_or_none(track.get('dateCreated')),
|
||||||
|
'view_count': int_or_none(stats.get('listenedAll')),
|
||||||
|
'like_count': int_or_none(stats.get('favorited')),
|
||||||
|
'average_rating': int_or_none(stats.get('averageNote')),
|
||||||
|
'tags': tags,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class JamendoAlbumIE(JamendoBaseIE):
|
class JamendoAlbumIE(InfoExtractor):
|
||||||
_VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)/(?P<display_id>[\w-]+)'
|
_VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)'
|
||||||
_TEST = {
|
_TEST = {
|
||||||
'url': 'https://www.jamendo.com/album/121486/duck-on-cover',
|
'url': 'https://www.jamendo.com/album/121486/duck-on-cover',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '121486',
|
'id': '121486',
|
||||||
'title': 'Shearer - Duck On Cover'
|
'title': 'Duck On Cover',
|
||||||
|
'description': 'md5:c2920eaeef07d7af5b96d7c64daf1239',
|
||||||
},
|
},
|
||||||
'playlist': [{
|
'playlist': [{
|
||||||
'md5': 'e1a2fcb42bda30dfac990212924149a8',
|
'md5': 'e1a2fcb42bda30dfac990212924149a8',
|
||||||
|
@ -111,6 +132,8 @@ class JamendoAlbumIE(JamendoBaseIE):
|
||||||
'title': 'Shearer - Warmachine',
|
'title': 'Shearer - Warmachine',
|
||||||
'artist': 'Shearer',
|
'artist': 'Shearer',
|
||||||
'track': 'Warmachine',
|
'track': 'Warmachine',
|
||||||
|
'timestamp': 1368089771,
|
||||||
|
'upload_date': '20130509',
|
||||||
}
|
}
|
||||||
}, {
|
}, {
|
||||||
'md5': '1f358d7b2f98edfe90fd55dac0799d50',
|
'md5': '1f358d7b2f98edfe90fd55dac0799d50',
|
||||||
|
@ -120,6 +143,8 @@ class JamendoAlbumIE(JamendoBaseIE):
|
||||||
'title': 'Shearer - Without Your Ghost',
|
'title': 'Shearer - Without Your Ghost',
|
||||||
'artist': 'Shearer',
|
'artist': 'Shearer',
|
||||||
'track': 'Without Your Ghost',
|
'track': 'Without Your Ghost',
|
||||||
|
'timestamp': 1368089771,
|
||||||
|
'upload_date': '20130509',
|
||||||
}
|
}
|
||||||
}],
|
}],
|
||||||
'params': {
|
'params': {
|
||||||
|
@ -127,24 +152,35 @@ class JamendoAlbumIE(JamendoBaseIE):
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _call_api(self, resource, resource_id):
|
||||||
|
path = '/api/%ss' % resource
|
||||||
|
rand = compat_str(random.random())
|
||||||
|
return self._download_json(
|
||||||
|
'https://www.jamendo.com' + path, resource_id, query={
|
||||||
|
'id[]': resource_id,
|
||||||
|
}, headers={
|
||||||
|
'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand)
|
||||||
|
})[0]
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
mobj = self._VALID_URL_RE.match(url)
|
album_id = self._match_id(url)
|
||||||
album_id = mobj.group('id')
|
album = self._call_api('album', album_id)
|
||||||
|
album_name = album.get('name')
|
||||||
|
|
||||||
webpage = self._download_webpage(url, mobj.group('display_id'))
|
entries = []
|
||||||
|
for track in album.get('tracks', []):
|
||||||
|
track_id = track.get('id')
|
||||||
|
if not track_id:
|
||||||
|
continue
|
||||||
|
track_id = compat_str(track_id)
|
||||||
|
entries.append({
|
||||||
|
'_type': 'url_transparent',
|
||||||
|
'url': 'https://www.jamendo.com/track/' + track_id,
|
||||||
|
'ie_key': JamendoIE.ie_key(),
|
||||||
|
'id': track_id,
|
||||||
|
'album': album_name,
|
||||||
|
})
|
||||||
|
|
||||||
title, artist, album = self._extract_meta(webpage, fatal=False)
|
return self.playlist_result(
|
||||||
|
entries, album_id, album_name,
|
||||||
entries = [{
|
clean_html(try_get(album, lambda x: x['description']['en'], compat_str)))
|
||||||
'_type': 'url_transparent',
|
|
||||||
'url': compat_urlparse.urljoin(url, m.group('path')),
|
|
||||||
'ie_key': JamendoIE.ie_key(),
|
|
||||||
'id': self._search_regex(
|
|
||||||
r'/track/(\d+)', m.group('path'), 'track id', default=None),
|
|
||||||
'artist': artist,
|
|
||||||
'album': album,
|
|
||||||
} for m in re.finditer(
|
|
||||||
r'<a[^>]+href=(["\'])(?P<path>(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link',
|
|
||||||
webpage)]
|
|
||||||
|
|
||||||
return self.playlist_result(entries, album_id, title)
|
|
||||||
|
|
Loading…
Reference in a new issue