e4e50f60b1
Since Python 3.6, invalid escape sequences are deprecated. It's likely that there are invalid escape sequences somewhere on the webpage, so instead of unescaping the whole webpage, just unescape the URL. See https://bugs.python.org/issue27364. That change was designed for string literals, but it affects the 'unicode_escape' codec as well. The code path is: str.decode('unicode_escape') -> codecs.unicode_escape_decode() -> PyUnicode_DecodeUnicodeEscape()
93 lines
3.1 KiB
Python
93 lines
3.1 KiB
Python
from __future__ import unicode_literals
|
|
|
|
import re
|
|
|
|
from .common import InfoExtractor
|
|
from ..utils import (
|
|
ExtractorError,
|
|
int_or_none,
|
|
lowercase_escape,
|
|
)
|
|
|
|
|
|
class GoogleDriveIE(InfoExtractor):
    # Extractor for videos served from docs.google.com / drive.google.com
    # file pages and from video.google.com/get_player embeds.  The video id
    # is the run of 28 or more [a-zA-Z0-9_-] characters following the
    # ``id=``, ``file/d/`` or ``docid=`` part of the URL.
    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': 'd109872761f7e7ecf353fa108c0dbe1e',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
            'duration': 45,
        }
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'only_matching': True,
    }]
    # Maps the format id found in front of the '|' of each "fmt_stream_map"
    # entry to the container extension reported for that format.
    _FORMATS_EXT = {
        '5': 'flv',
        '6': 'flv',
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '34': 'flv',
        '35': 'flv',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
        '59': 'mp4',
    }

    @staticmethod
    def _extract_url(webpage):
        """Find a Google Drive / Google Video player iframe in *webpage*.

        Returns the canonical ``https://drive.google.com/file/d/<id>`` URL
        for the first matching ``<iframe src="...">``, or None when the page
        embeds no such iframe.
        """
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
            webpage)
        if mobj:
            return 'https://drive.google.com/file/d/%s' % mobj.group('id')

    def _real_extract(self, url):
        """Download the file page for *url* and build the info dict.

        Title, duration, "fmt_stream_map" and "fmt_list" are scraped from
        ``"key","value"`` pairs in the page.  Raises ExtractorError with the
        page's own message when the page carries a "reason" entry.

        Returns a dict with 'id', 'title', 'thumbnail', 'duration' and
        'formats'.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://docs.google.com/file/d/%s' % video_id, video_id)

        reason = self._search_regex(
            r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
        if reason:
            raise ExtractorError(reason)

        title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title')
        duration = int_or_none(self._search_regex(
            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None))
        fmt_stream_map = self._search_regex(
            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',')
        fmt_list = self._search_regex(
            r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',')

        # Index the resolutions by format id instead of pairing the two lists
        # by position, so a missing or reordered entry in either list cannot
        # attach the wrong resolution to a stream.
        # NOTE(review): assumes each fmt_list entry has the shape
        # '<format id>/<width>x<height>/...' -- confirm against a live page.
        resolutions = {}
        for fmt in fmt_list:
            fields = fmt.split('/')
            if len(fields) >= 2:
                resolutions.setdefault(fields[0], fields[1])

        formats = []
        for fmt_stream in fmt_stream_map:
            # Each entry is '<format id>|<escaped url>'; skip anything else
            # rather than failing the whole extraction on one bad entry.
            fmt_id, separator, fmt_url = fmt_stream.partition('|')
            if not separator or not fmt_url:
                continue
            fmt_info = {
                # Only the URL is unescaped, never the whole page.
                'url': lowercase_escape(fmt_url),
                'format_id': fmt_id,
            }
            ext = self._FORMATS_EXT.get(fmt_id)
            if ext:
                fmt_info['ext'] = ext
            # else: unknown format id; 'ext' is left unset instead of raising
            # KeyError.  NOTE(review): presumably the core then derives the
            # extension from the URL -- confirm.
            resolution = resolutions.get(fmt_id)
            if resolution:
                fmt_info['resolution'] = resolution
                size = re.match(r'(\d+)[xX](\d+)$', resolution)
                if size:
                    fmt_info['width'] = int_or_none(size.group(1))
                    fmt_info['height'] = int_or_none(size.group(2))
            formats.append(fmt_info)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'duration': duration,
            'formats': formats,
        }
|