Merge remote-tracking branch 'origin/master'
This commit is contained in:
		
						commit
						ef66b0c6ef
					
				
					 25 changed files with 323 additions and 131 deletions
				
			
		| 
						 | 
				
			
			@ -32,9 +32,9 @@ tests = [
 | 
			
		|||
    # 83
 | 
			
		||||
    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
 | 
			
		||||
     ".>/?;}[{=+_)(*&^%<#!MNBVCXZASPFGHJKLwOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytreq"),
 | 
			
		||||
    # 82 - vflZK4ZYR 2013/08/23
 | 
			
		||||
    # 82 - vflGNjMhJ 2013/09/12
 | 
			
		||||
    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
 | 
			
		||||
     "wertyuioplkjhgfdsaqxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&z(-+={[};?/>.<"),
 | 
			
		||||
     ".>/?;}[<=+-(*&^%$#@!MNBVCXeASDFGHKLPOqUYTREWQ0987654321mnbvcxzasdfghjklpoiuytrIwZ"),
 | 
			
		||||
    # 81 - vflLC8JvQ 2013/07/25
 | 
			
		||||
    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.",
 | 
			
		||||
     "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"),
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -36,6 +36,7 @@ class TestAllURLsMatching(unittest.TestCase):
 | 
			
		|||
        self.assertFalse(YoutubeIE.suitable(u'https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) #668
 | 
			
		||||
        self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
 | 
			
		||||
        self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
 | 
			
		||||
        self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
 | 
			
		||||
 | 
			
		||||
    def test_youtube_channel_matching(self):
 | 
			
		||||
        assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -40,6 +40,7 @@ class TestDailymotionSubtitles(unittest.TestCase):
 | 
			
		|||
        subtitles = self.getSubtitles()
 | 
			
		||||
        self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792')
 | 
			
		||||
    def test_allsubtitles(self):
 | 
			
		||||
        self.DL.params['writesubtitles'] = True
 | 
			
		||||
        self.DL.params['allsubtitles'] = True
 | 
			
		||||
        subtitles = self.getSubtitles()
 | 
			
		||||
        self.assertEqual(len(subtitles.keys()), 5)
 | 
			
		||||
| 
						 | 
				
			
			@ -54,6 +55,7 @@ class TestDailymotionSubtitles(unittest.TestCase):
 | 
			
		|||
        self.assertTrue(len(subtitles.keys()) == 0)
 | 
			
		||||
    def test_nosubtitles(self):
 | 
			
		||||
        self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv'
 | 
			
		||||
        self.DL.params['writesubtitles'] = True
 | 
			
		||||
        self.DL.params['allsubtitles'] = True
 | 
			
		||||
        subtitles = self.getSubtitles()
 | 
			
		||||
        self.assertEqual(len(subtitles), 0)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,7 +8,7 @@ import json
 | 
			
		|||
import os
 | 
			
		||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 | 
			
		||||
 | 
			
		||||
from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE
 | 
			
		||||
from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE, UstreamChannelIE, SoundcloudUserIE
 | 
			
		||||
from youtube_dl.utils import *
 | 
			
		||||
 | 
			
		||||
from helper import FakeYDL
 | 
			
		||||
| 
						 | 
				
			
			@ -34,5 +34,21 @@ class TestPlaylists(unittest.TestCase):
 | 
			
		|||
        self.assertEqual(result['title'], u'Vimeo Tributes')
 | 
			
		||||
        self.assertTrue(len(result['entries']) > 24)
 | 
			
		||||
 | 
			
		||||
    def test_ustream_channel(self):
        # Run the Ustream channel extractor against a fixed channel URL and
        # check that the result is a playlist with the expected id and size.
        channel_url = 'http://www.ustream.tv/channel/young-americans-for-liberty'
        extractor = UstreamChannelIE(FakeYDL())
        playlist = extractor.extract(channel_url)

        self.assertIsPlaylist(playlist)
        self.assertEqual(playlist['id'], u'5124905')
        # Only a lower bound is checked on the number of entries.
        entry_count = len(playlist['entries'])
        self.assertTrue(entry_count >= 11)
 | 
			
		||||
 | 
			
		||||
    def test_soundcloud_user(self):
        # Run the SoundCloud user extractor against a fixed profile URL and
        # check that the result is a playlist with the expected id and size.
        profile_url = 'https://soundcloud.com/the-concept-band'
        extractor = SoundcloudUserIE(FakeYDL())
        playlist = extractor.extract(profile_url)

        self.assertIsPlaylist(playlist)
        self.assertEqual(playlist['id'], u'9615865')
        # Only a lower bound is checked on the number of entries.
        entry_count = len(playlist['entries'])
        self.assertTrue(entry_count >= 12)
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
    unittest.main()
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -11,13 +11,16 @@ import os
 | 
			
		|||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 | 
			
		||||
 | 
			
		||||
#from youtube_dl.utils import htmlentity_transform
 | 
			
		||||
from youtube_dl.utils import timeconvert
 | 
			
		||||
from youtube_dl.utils import sanitize_filename
 | 
			
		||||
from youtube_dl.utils import unescapeHTML
 | 
			
		||||
from youtube_dl.utils import orderedSet
 | 
			
		||||
from youtube_dl.utils import DateRange
 | 
			
		||||
from youtube_dl.utils import unified_strdate
 | 
			
		||||
from youtube_dl.utils import find_xpath_attr
 | 
			
		||||
from youtube_dl.utils import (
 | 
			
		||||
    timeconvert,
 | 
			
		||||
    sanitize_filename,
 | 
			
		||||
    unescapeHTML,
 | 
			
		||||
    orderedSet,
 | 
			
		||||
    DateRange,
 | 
			
		||||
    unified_strdate,
 | 
			
		||||
    find_xpath_attr,
 | 
			
		||||
    get_meta_content,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
if sys.version_info < (3, 0):
 | 
			
		||||
    _compat_str = lambda b: b.decode('unicode-escape')
 | 
			
		||||
| 
						 | 
				
			
			@ -127,5 +130,16 @@ class TestUtil(unittest.TestCase):
 | 
			
		|||
        self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
 | 
			
		||||
        self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
 | 
			
		||||
 | 
			
		||||
    def test_meta_parser(self):
        # Two <meta> tags that differ in attribute order and quoting style;
        # get_meta_content must return the content of each, looked up by name.
        testhtml = u'''
        <head>
            <meta name="description" content="foo & bar">
            <meta content='Plato' name='author'/>
        </head>
        '''

        def get_meta(name):
            return get_meta_content(name, testhtml)

        expectations = (
            ('description', u'foo & bar'),
            ('author', 'Plato'),
        )
        for meta_name, expected in expectations:
            self.assertEqual(get_meta(meta_name), expected)
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
    unittest.main()
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -41,6 +41,7 @@ class TestYoutubeSubtitles(unittest.TestCase):
 | 
			
		|||
        subtitles = self.getSubtitles()
 | 
			
		||||
        self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
 | 
			
		||||
    def test_youtube_allsubtitles(self):
 | 
			
		||||
        self.DL.params['writesubtitles'] = True
 | 
			
		||||
        self.DL.params['allsubtitles'] = True
 | 
			
		||||
        subtitles = self.getSubtitles()
 | 
			
		||||
        self.assertEqual(len(subtitles.keys()), 13)
 | 
			
		||||
| 
						 | 
				
			
			@ -66,6 +67,7 @@ class TestYoutubeSubtitles(unittest.TestCase):
 | 
			
		|||
        self.assertTrue(subtitles['it'] is not None)
 | 
			
		||||
    def test_youtube_nosubtitles(self):
 | 
			
		||||
        self.url = 'sAjKT8FhjI8'
 | 
			
		||||
        self.DL.params['writesubtitles'] = True
 | 
			
		||||
        self.DL.params['allsubtitles'] = True
 | 
			
		||||
        subtitles = self.getSubtitles()
 | 
			
		||||
        self.assertEqual(len(subtitles), 0)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -74,6 +74,7 @@ class YoutubeDL(object):
 | 
			
		|||
    writesubtitles:    Write the video subtitles to a file
 | 
			
		||||
    writeautomaticsub: Write the automatic subtitles to a file
 | 
			
		||||
    allsubtitles:      Downloads all the subtitles of the video
 | 
			
		||||
                       (requires writesubtitles or writeautomaticsub)
 | 
			
		||||
    listsubtitles:     Lists all available subtitles for the video
 | 
			
		||||
    subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
 | 
			
		||||
    subtitleslangs:    List of languages of the subtitles to download
 | 
			
		||||
| 
						 | 
				
			
			@ -492,13 +493,14 @@ class YoutubeDL(object):
 | 
			
		|||
                self.report_writedescription(descfn)
 | 
			
		||||
                with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
 | 
			
		||||
                    descfile.write(info_dict['description'])
 | 
			
		||||
            except (KeyError, TypeError):
 | 
			
		||||
                self.report_warning(u'There\'s no description to write.')
 | 
			
		||||
            except (OSError, IOError):
 | 
			
		||||
                self.report_error(u'Cannot write description file ' + descfn)
 | 
			
		||||
                return
 | 
			
		||||
 | 
			
		||||
        subtitles_are_requested = any([self.params.get('writesubtitles', False),
 | 
			
		||||
                                       self.params.get('writeautomaticsub'),
 | 
			
		||||
                                       self.params.get('allsubtitles', False)])
 | 
			
		||||
                                       self.params.get('writeautomaticsub')])
 | 
			
		||||
 | 
			
		||||
        if  subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
 | 
			
		||||
            # subtitles download errors are already managed as troubles in relevant IE
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -533,6 +533,11 @@ def _real_main(argv=None):
 | 
			
		|||
    else:
 | 
			
		||||
        date = DateRange(opts.dateafter, opts.datebefore)
 | 
			
		||||
 | 
			
		||||
    # --all-sub automatically sets --write-sub if --write-auto-sub is not given
 | 
			
		||||
    # this was the old behaviour if only --all-sub was given.
 | 
			
		||||
    if opts.allsubtitles and (opts.writeautomaticsub == False):
 | 
			
		||||
        opts.writesubtitles = True
 | 
			
		||||
 | 
			
		||||
    if sys.version_info < (3,):
 | 
			
		||||
        # In Python 2, sys.argv is a bytestring (also note http://bugs.python.org/issue2128 for Windows systems)
 | 
			
		||||
        if opts.outtmpl is not None:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -52,6 +52,7 @@ from .jeuxvideo import JeuxVideoIE
 | 
			
		|||
from .jukebox import JukeboxIE
 | 
			
		||||
from .justintv import JustinTVIE
 | 
			
		||||
from .kankan import KankanIE
 | 
			
		||||
from .kickstarter import KickStarterIE
 | 
			
		||||
from .keek import KeekIE
 | 
			
		||||
from .liveleak import LiveLeakIE
 | 
			
		||||
from .livestream import LivestreamIE
 | 
			
		||||
| 
						 | 
				
			
			@ -81,7 +82,8 @@ from .sina import SinaIE
 | 
			
		|||
from .slashdot import SlashdotIE
 | 
			
		||||
from .slideshare import SlideshareIE
 | 
			
		||||
from .sohu import SohuIE
 | 
			
		||||
from .soundcloud import SoundcloudIE, SoundcloudSetIE
 | 
			
		||||
from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
 | 
			
		||||
from .southparkstudios import SouthParkStudiosIE
 | 
			
		||||
from .spiegel import SpiegelIE
 | 
			
		||||
from .stanfordoc import StanfordOpenClassroomIE
 | 
			
		||||
from .statigram import StatigramIE
 | 
			
		||||
| 
						 | 
				
			
			@ -96,7 +98,7 @@ from .tudou import TudouIE
 | 
			
		|||
from .tumblr import TumblrIE
 | 
			
		||||
from .tutv import TutvIE
 | 
			
		||||
from .unistra import UnistraIE
 | 
			
		||||
from .ustream import UstreamIE
 | 
			
		||||
from .ustream import UstreamIE, UstreamChannelIE
 | 
			
		||||
from .vbox7 import Vbox7IE
 | 
			
		||||
from .veehd import VeeHDIE
 | 
			
		||||
from .veoh import VeohIE
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -46,6 +46,8 @@ class ArchiveOrgIE(InfoExtractor):
 | 
			
		|||
            for fn,fdata in data['files'].items()
 | 
			
		||||
            if 'Video' in fdata['format']]
 | 
			
		||||
        formats.sort(key=lambda fdata: fdata['file_size'])
 | 
			
		||||
        for f in formats:
 | 
			
		||||
            f['ext'] = determine_ext(f['url'])
 | 
			
		||||
 | 
			
		||||
        info = {
 | 
			
		||||
            '_type': 'video',
 | 
			
		||||
| 
						 | 
				
			
			@ -61,7 +63,6 @@ class ArchiveOrgIE(InfoExtractor):
 | 
			
		|||
            info['thumbnail'] = thumbnail
 | 
			
		||||
 | 
			
		||||
        # TODO: Remove when #980 has been merged
 | 
			
		||||
        info['url'] = formats[-1]['url']
 | 
			
		||||
        info['ext'] = determine_ext(formats[-1]['url'])
 | 
			
		||||
        info.update(formats[-1])
 | 
			
		||||
 | 
			
		||||
        return info
 | 
			
		||||
        return info
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,3 +1,4 @@
 | 
			
		|||
# encoding: utf-8
 | 
			
		||||
import re
 | 
			
		||||
import xml.etree.ElementTree
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -5,24 +6,29 @@ from .common import InfoExtractor
 | 
			
		|||
from ..utils import unified_strdate
 | 
			
		||||
 | 
			
		||||
class CanalplusIE(InfoExtractor):
 | 
			
		||||
    _VALID_URL = r'https?://(www\.canalplus\.fr/.*?\?vid=|player\.canalplus\.fr/#/)(?P<id>\d+)'
 | 
			
		||||
    _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'
 | 
			
		||||
    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
 | 
			
		||||
    IE_NAME = u'canalplus.fr'
 | 
			
		||||
 | 
			
		||||
    _TEST = {
 | 
			
		||||
        u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861',
 | 
			
		||||
        u'file': u'889861.flv',
 | 
			
		||||
        u'md5': u'590a888158b5f0d6832f84001fbf3e99',
 | 
			
		||||
        u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
 | 
			
		||||
        u'file': u'922470.flv',
 | 
			
		||||
        u'info_dict': {
 | 
			
		||||
            u'title': u'Le Petit Journal 20/06/13 - La guerre des drone',
 | 
			
		||||
            u'upload_date': u'20130620',
 | 
			
		||||
            u'title': u'Zapping - 26/08/13',
 | 
			
		||||
            u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
 | 
			
		||||
            u'upload_date': u'20130826',
 | 
			
		||||
        },
 | 
			
		||||
        u'params': {
 | 
			
		||||
            u'skip_download': True,
 | 
			
		||||
        },
 | 
			
		||||
        u'skip': u'Requires rtmpdump'
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    def _real_extract(self, url):
 | 
			
		||||
        mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
        video_id = mobj.group('id')
 | 
			
		||||
        if video_id is None:
 | 
			
		||||
            webpage = self._download_webpage(url, mobj.group('path'))
 | 
			
		||||
            video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
 | 
			
		||||
        info_url = self._VIDEO_INFO_TEMPLATE % video_id
 | 
			
		||||
        info_page = self._download_webpage(info_url,video_id, 
 | 
			
		||||
                                           u'Downloading video info')
 | 
			
		||||
| 
						 | 
				
			
			@ -43,4 +49,6 @@ class CanalplusIE(InfoExtractor):
 | 
			
		|||
                'ext': 'flv',
 | 
			
		||||
                'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
 | 
			
		||||
                'thumbnail': media.find('IMAGES/GRAND').text,
 | 
			
		||||
                'description': infos.find('DESCRIPTION').text,
 | 
			
		||||
                'view_count': int(infos.find('NB_VUES').text),
 | 
			
		||||
                }
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -54,6 +54,7 @@ class DreiSatIE(InfoExtractor):
 | 
			
		|||
            'width': int(fe.find('./width').text),
 | 
			
		||||
            'height': int(fe.find('./height').text),
 | 
			
		||||
            'url': fe.find('./url').text,
 | 
			
		||||
            'ext': determine_ext(fe.find('./url').text),
 | 
			
		||||
            'filesize': int(fe.find('./filesize').text),
 | 
			
		||||
            'video_bitrate': int(fe.find('./videoBitrate').text),
 | 
			
		||||
            '3sat_qualityname': fe.find('./quality').text,
 | 
			
		||||
| 
						 | 
				
			
			@ -79,7 +80,6 @@ class DreiSatIE(InfoExtractor):
 | 
			
		|||
        }
 | 
			
		||||
 | 
			
		||||
        # TODO: Remove when #980 has been merged
 | 
			
		||||
        info['url'] = formats[-1]['url']
 | 
			
		||||
        info['ext'] = determine_ext(formats[-1]['url'])
 | 
			
		||||
        info.update(formats[-1])
 | 
			
		||||
 | 
			
		||||
        return info
 | 
			
		||||
        return info
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -21,7 +21,7 @@ class FunnyOrDieIE(InfoExtractor):
 | 
			
		|||
        video_id = mobj.group('id')
 | 
			
		||||
        webpage = self._download_webpage(url, video_id)
 | 
			
		||||
 | 
			
		||||
        video_url = self._search_regex(r'type: "video/mp4", src: "(.*?)"',
 | 
			
		||||
        video_url = self._search_regex(r'type="video/mp4" src="(.*?)"',
 | 
			
		||||
            webpage, u'video URL', flags=re.DOTALL)
 | 
			
		||||
 | 
			
		||||
        info = {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -14,7 +14,7 @@ class GameSpotIE(InfoExtractor):
 | 
			
		|||
        u"file": u"6410818.mp4",
 | 
			
		||||
        u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
 | 
			
		||||
        u"info_dict": {
 | 
			
		||||
            u"title": u"Arma III - Community Guide: SITREP I",
 | 
			
		||||
            u"title": u"Arma 3 - Community Guide: SITREP I",
 | 
			
		||||
            u"upload_date": u"20130627", 
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -40,7 +40,8 @@ class GooglePlusIE(InfoExtractor):
 | 
			
		|||
        self.report_extraction(video_id)
 | 
			
		||||
 | 
			
		||||
        # Extract upload date
 | 
			
		||||
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
 | 
			
		||||
        upload_date = self._html_search_regex(
 | 
			
		||||
            ['title="Timestamp">(.*?)</a>', r'<a.+?class="g-M.+?>(.+?)</a>'],
 | 
			
		||||
            webpage, u'upload date', fatal=False)
 | 
			
		||||
        if upload_date:
 | 
			
		||||
            # Convert timestring to a format suitable for filename
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										37
									
								
								youtube_dl/extractor/kickstarter.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								youtube_dl/extractor/kickstarter.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,37 @@
 | 
			
		|||
import re
 | 
			
		||||
 | 
			
		||||
from .common import InfoExtractor
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class KickStarterIE(InfoExtractor):
    """Information extractor for project videos on www.kickstarter.com."""

    # The numeric path component after /projects/ is used as the video id.
    _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>\d*)/.*'
    _TEST = {
        u"url": u"https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location",
        u"file": u"1404461844.mp4",
        u"md5": u"c81addca81327ffa66c642b5d8b08cab",
        u"info_dict": {
            u"title": u"Intersection: The Story of Josh Grant by Kyle Cowling",
        },
    }

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the project video.

        The project page is downloaded, the media URL is read from its
        ``data-video`` attribute and the title from the ``<title>`` tag.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')
        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'data-video="(.*?)">',
            webpage_src, u'video URL')
        # The extension is guessed from the URL text; anything that does not
        # mention mp4 is treated as flv.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        page_title = self._html_search_regex(r"<title>(.*?)</title>",
            webpage_src, u'title')
        # Drop the trailing site name (em dash + "Kickstarter").  rpartition
        # returns an empty head when the separator is missing, so fall back to
        # the whole page title instead of producing an empty video title.
        head, separator, _tail = page_title.rpartition(u'\u2014 Kickstarter')
        video_title = (head if separator else page_title).strip()

        results = [{
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'ext': ext,
                    }]
        return results
 | 
			
		||||
| 
						 | 
				
			
			@ -5,34 +5,27 @@ import socket
 | 
			
		|||
from .common import InfoExtractor
 | 
			
		||||
from ..utils import (
 | 
			
		||||
    compat_http_client,
 | 
			
		||||
    compat_str,
 | 
			
		||||
    compat_urllib_error,
 | 
			
		||||
    compat_urllib_request,
 | 
			
		||||
 | 
			
		||||
    ExtractorError,
 | 
			
		||||
    unified_strdate,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class MixcloudIE(InfoExtractor):
 | 
			
		||||
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
 | 
			
		||||
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
 | 
			
		||||
    IE_NAME = u'mixcloud'
 | 
			
		||||
 | 
			
		||||
    def report_download_json(self, file_id):
 | 
			
		||||
        """Report JSON download."""
 | 
			
		||||
        self.to_screen(u'Downloading json')
 | 
			
		||||
 | 
			
		||||
    def get_urls(self, jsonData, fmt, bitrate='best'):
 | 
			
		||||
        """Get urls from 'audio_formats' section in json"""
 | 
			
		||||
        try:
 | 
			
		||||
            bitrate_list = jsonData[fmt]
 | 
			
		||||
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
 | 
			
		||||
                bitrate = max(bitrate_list) # select highest
 | 
			
		||||
 | 
			
		||||
            url_list = jsonData[fmt][bitrate]
 | 
			
		||||
        except TypeError: # we have no bitrate info.
 | 
			
		||||
            url_list = jsonData[fmt]
 | 
			
		||||
        return url_list
 | 
			
		||||
    _TEST = {
 | 
			
		||||
        u'url': u'http://www.mixcloud.com/dholbach/cryptkeeper/',
 | 
			
		||||
        u'file': u'dholbach-cryptkeeper.mp3',
 | 
			
		||||
        u'info_dict': {
 | 
			
		||||
            u'title': u'Cryptkeeper',
 | 
			
		||||
            u'description': u'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
 | 
			
		||||
            u'uploader': u'Daniel Holbach',
 | 
			
		||||
            u'uploader_id': u'dholbach',
 | 
			
		||||
            u'upload_date': u'20111115',
 | 
			
		||||
        },
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    def check_urls(self, url_list):
 | 
			
		||||
        """Returns 1st active url from list"""
 | 
			
		||||
| 
						 | 
				
			
			@ -45,71 +38,32 @@ class MixcloudIE(InfoExtractor):
 | 
			
		|||
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
    def _print_formats(self, formats):
 | 
			
		||||
        print('Available formats:')
 | 
			
		||||
        for fmt in formats.keys():
 | 
			
		||||
            for b in formats[fmt]:
 | 
			
		||||
                try:
 | 
			
		||||
                    ext = formats[fmt][b][0]
 | 
			
		||||
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
 | 
			
		||||
                except TypeError: # we have no bitrate info
 | 
			
		||||
                    ext = formats[fmt][0]
 | 
			
		||||
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
 | 
			
		||||
                    break
 | 
			
		||||
 | 
			
		||||
    def _real_extract(self, url):
 | 
			
		||||
        mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
        if mobj is None:
 | 
			
		||||
            raise ExtractorError(u'Invalid URL: %s' % url)
 | 
			
		||||
        # extract uploader & filename from url
 | 
			
		||||
        uploader = mobj.group(1).decode('utf-8')
 | 
			
		||||
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')
 | 
			
		||||
 | 
			
		||||
        # construct API request
 | 
			
		||||
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
 | 
			
		||||
        # retrieve .json file with links to files
 | 
			
		||||
        request = compat_urllib_request.Request(file_url)
 | 
			
		||||
        try:
 | 
			
		||||
            self.report_download_json(file_url)
 | 
			
		||||
            jsonData = compat_urllib_request.urlopen(request).read()
 | 
			
		||||
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 | 
			
		||||
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
 | 
			
		||||
        uploader = mobj.group(1)
 | 
			
		||||
        cloudcast_name = mobj.group(2)
 | 
			
		||||
        track_id = '-'.join((uploader, cloudcast_name))
 | 
			
		||||
        api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name)
 | 
			
		||||
        webpage = self._download_webpage(url, track_id)
 | 
			
		||||
        json_data = self._download_webpage(api_url, track_id,
 | 
			
		||||
            u'Downloading cloudcast info')
 | 
			
		||||
        info = json.loads(json_data)
 | 
			
		||||
 | 
			
		||||
        # parse JSON
 | 
			
		||||
        json_data = json.loads(jsonData)
 | 
			
		||||
        player_url = json_data['player_swf_url']
 | 
			
		||||
        formats = dict(json_data['audio_formats'])
 | 
			
		||||
        preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url')
 | 
			
		||||
        song_url = preview_url.replace('/previews/', '/cloudcasts/originals/')
 | 
			
		||||
        template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
 | 
			
		||||
        final_song_url = self.check_urls(template_url % i for i in range(30))
 | 
			
		||||
 | 
			
		||||
        req_format = self._downloader.params.get('format', None)
 | 
			
		||||
 | 
			
		||||
        if self._downloader.params.get('listformats', None):
 | 
			
		||||
            self._print_formats(formats)
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
        if req_format is None or req_format == 'best':
 | 
			
		||||
            for format_param in formats.keys():
 | 
			
		||||
                url_list = self.get_urls(formats, format_param)
 | 
			
		||||
                # check urls
 | 
			
		||||
                file_url = self.check_urls(url_list)
 | 
			
		||||
                if file_url is not None:
 | 
			
		||||
                    break # got it!
 | 
			
		||||
        else:
 | 
			
		||||
            if req_format not in formats:
 | 
			
		||||
                raise ExtractorError(u'Format is not available')
 | 
			
		||||
 | 
			
		||||
            url_list = self.get_urls(formats, req_format)
 | 
			
		||||
            file_url = self.check_urls(url_list)
 | 
			
		||||
            format_param = req_format
 | 
			
		||||
 | 
			
		||||
        return [{
 | 
			
		||||
            'id': file_id.decode('utf-8'),
 | 
			
		||||
            'url': file_url.decode('utf-8'),
 | 
			
		||||
            'uploader': uploader.decode('utf-8'),
 | 
			
		||||
            'upload_date': None,
 | 
			
		||||
            'title': json_data['name'],
 | 
			
		||||
            'ext': file_url.split('.')[-1].decode('utf-8'),
 | 
			
		||||
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
 | 
			
		||||
            'thumbnail': json_data['thumbnail_url'],
 | 
			
		||||
            'description': json_data['description'],
 | 
			
		||||
            'player_url': player_url.decode('utf-8'),
 | 
			
		||||
        }]
 | 
			
		||||
        return {
 | 
			
		||||
            'id': track_id,
 | 
			
		||||
            'title': info['name'],
 | 
			
		||||
            'url': final_song_url,
 | 
			
		||||
            'ext': 'mp3',
 | 
			
		||||
            'description': info['description'],
 | 
			
		||||
            'thumbnail': info['pictures'].get('extra_large'),
 | 
			
		||||
            'uploader': info['user']['name'],
 | 
			
		||||
            'uploader_id': info['user']['username'],
 | 
			
		||||
            'upload_date': unified_strdate(info['created_time']),
 | 
			
		||||
            'view_count': info['play_count'],
 | 
			
		||||
        }
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,10 +1,12 @@
 | 
			
		|||
import json
 | 
			
		||||
import re
 | 
			
		||||
import itertools
 | 
			
		||||
 | 
			
		||||
from .common import InfoExtractor
 | 
			
		||||
from ..utils import (
 | 
			
		||||
    compat_str,
 | 
			
		||||
    compat_urlparse,
 | 
			
		||||
    compat_urllib_parse,
 | 
			
		||||
 | 
			
		||||
    ExtractorError,
 | 
			
		||||
    unified_strdate,
 | 
			
		||||
| 
						 | 
				
			
			@ -53,10 +55,11 @@ class SoundcloudIE(InfoExtractor):
 | 
			
		|||
    def _resolv_url(cls, url):
 | 
			
		||||
        return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
 | 
			
		||||
 | 
			
		||||
    def _extract_info_dict(self, info, full_title=None):
 | 
			
		||||
    def _extract_info_dict(self, info, full_title=None, quiet=False):
 | 
			
		||||
        video_id = info['id']
 | 
			
		||||
        name = full_title or video_id
 | 
			
		||||
        self.report_extraction(name)
 | 
			
		||||
        if quiet == False:
 | 
			
		||||
            self.report_extraction(name)
 | 
			
		||||
 | 
			
		||||
        thumbnail = info['artwork_url']
 | 
			
		||||
        if thumbnail is not None:
 | 
			
		||||
| 
						 | 
				
			
			@ -198,3 +201,41 @@ class SoundcloudSetIE(SoundcloudIE):
 | 
			
		|||
                'id': info['id'],
 | 
			
		||||
                'title': info['title'],
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class SoundcloudUserIE(SoundcloudIE):
    """Extractor that turns a SoundCloud user page into a playlist of tracks."""
    # The dot in "soundcloud.com" is escaped so that it only matches a
    # literal dot instead of any character.
    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)(/?(tracks/)?)?(\?.*)?$'
    IE_NAME = u'soundcloud:user'

    # it's in tests/test_playlists.py
    _TEST = None

    def _real_extract(self, url):
        """Resolve the user behind *url* and collect all of their tracks.

        Returns a playlist dict whose 'id' is the user's id as a string,
        whose 'title' is the username and whose 'entries' are the info
        dicts built by _extract_info_dict for every track.
        """
        mobj = re.match(self._VALID_URL, url)
        uploader = mobj.group('user')

        # Normalize to the canonical profile URL before resolving it.
        url = 'http://soundcloud.com/%s/' % uploader
        resolv_url = self._resolv_url(url)
        user_json = self._download_webpage(resolv_url, uploader,
            u'Downloading user info')
        user = json.loads(user_json)

        # NOTE(review): assumes the tracks endpoint returns pages of 50
        # items by default -- confirm against the API documentation.
        page_size = 50
        tracks = []
        for i in itertools.count():
            data = compat_urllib_parse.urlencode({'offset': i*page_size,
                                                  'client_id': self._CLIENT_ID,
                                                  })
            tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data
            response = self._download_webpage(tracks_url, uploader,
                u'Downloading tracks page %s' % (i+1))
            new_tracks = json.loads(response)
            # quiet=True avoids one "extracting" message per track.
            tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks)
            # A page shorter than page_size is treated as the last one.
            if len(new_tracks) < page_size:
                break

        return {
            '_type': 'playlist',
            'id': compat_str(user['id']),
            'title': user['username'],
            'entries': tracks,
        }
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										34
									
								
								youtube_dl/extractor/southparkstudios.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								youtube_dl/extractor/southparkstudios.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,34 @@
 | 
			
		|||
import re
 | 
			
		||||
 | 
			
		||||
from .mtv import MTVIE, _media_xml_tag
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class SouthParkStudiosIE(MTVIE):
    """Extractor for clips hosted on southparkstudios.com, built on MTVIE."""
    IE_NAME = u'southparkstudios.com'
    _VALID_URL = r'https?://www\.southparkstudios\.com/clips/(?P<id>\d+)'

    _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'

    _TEST = {
        u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
        u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',
        u'info_dict': {
            u'title': u'Bat Daded',
            u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.',
        },
    }

    # Overwrite MTVIE properties we don't want
    _TESTS = []

    def _get_thumbnail_url(self, uri, itemdoc):
        """Return the 'url' attribute of the thumbnail nested in the media group."""
        group_tag = _media_xml_tag('group')
        thumbnail_tag = _media_xml_tag('thumbnail')
        thumbnail_node = itemdoc.find('%s/%s' % (group_tag, thumbnail_tag))
        return thumbnail_node.attrib['url']

    def _real_extract(self, url):
        """Download the clip page, locate its mgid and delegate to MTVIE."""
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, video_id)
        # The mgid is embedded in the first argument of the swfobject call.
        mgid = self._search_regex(
            r'swfobject.embedSWF\(".*?(mgid:.*?)"', webpage, u'mgid')
        return self._get_videos_info(mgid)
 | 
			
		||||
| 
						 | 
				
			
			@ -10,8 +10,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
 | 
			
		|||
    @property
 | 
			
		||||
    def _have_to_download_any_subtitles(self):
 | 
			
		||||
        return any([self._downloader.params.get('writesubtitles', False),
 | 
			
		||||
                    self._downloader.params.get('writeautomaticsub'),
 | 
			
		||||
                    self._downloader.params.get('allsubtitles', False)])
 | 
			
		||||
                    self._downloader.params.get('writeautomaticsub')])
 | 
			
		||||
 | 
			
		||||
    def _list_available_subtitles(self, video_id, webpage=None):
 | 
			
		||||
        """ outputs the available subtitles for the video """
 | 
			
		||||
| 
						 | 
				
			
			@ -34,7 +33,7 @@ class SubtitlesInfoExtractor(InfoExtractor):
 | 
			
		|||
        available_subs_list = {}
 | 
			
		||||
        if self._downloader.params.get('writeautomaticsub', False):
 | 
			
		||||
            available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage))
 | 
			
		||||
        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
 | 
			
		||||
        if self._downloader.params.get('writesubtitles', False):
 | 
			
		||||
            available_subs_list.update(self._get_available_subtitles(video_id))
 | 
			
		||||
 | 
			
		||||
        if not available_subs_list:  # error, it didn't get the available subtitles
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -52,6 +52,7 @@ class TriluliluIE(InfoExtractor):
 | 
			
		|||
            {
 | 
			
		||||
                'format': fnode.text,
 | 
			
		||||
                'url': video_url_template % fnode.text,
 | 
			
		||||
                'ext': fnode.text.partition('-')[0]
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            for fnode in format_doc.findall('./formats/format')
 | 
			
		||||
| 
						 | 
				
			
			@ -67,7 +68,6 @@ class TriluliluIE(InfoExtractor):
 | 
			
		|||
        }
 | 
			
		||||
 | 
			
		||||
        # TODO: Remove when #980 has been merged
 | 
			
		||||
        info['url'] = formats[-1]['url']
 | 
			
		||||
        info['ext'] = formats[-1]['format'].partition('-')[0]
 | 
			
		||||
        info.update(formats[-1])
 | 
			
		||||
 | 
			
		||||
        return info
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,6 +1,11 @@
 | 
			
		|||
import json
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
from .common import InfoExtractor
 | 
			
		||||
from ..utils import (
 | 
			
		||||
    compat_urlparse,
 | 
			
		||||
    get_meta_content,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class UstreamIE(InfoExtractor):
 | 
			
		||||
| 
						 | 
				
			
			@ -43,3 +48,25 @@ class UstreamIE(InfoExtractor):
 | 
			
		|||
                'thumbnail': thumbnail,
 | 
			
		||||
               }
 | 
			
		||||
        return info
 | 
			
		||||
 | 
			
		||||
class UstreamChannelIE(InfoExtractor):
    """Extractor that lists the recorded videos of a ustream.tv channel."""
    _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)'
    IE_NAME = u'ustream:channel'

    def _real_extract(self, url):
        """Return a playlist of url results, one per video id found.

        The channel id is read from the 'ustream:channel_id' meta tag of
        the channel page; the paginated JSON listing is then followed
        until a reply carries a falsy 'nextUrl'.
        """
        m = re.match(self._VALID_URL, url)
        slug = m.group('slug')
        webpage = self._download_webpage(url, slug)
        channel_id = get_meta_content('ustream:channel_id', webpage)

        BASE = 'http://www.ustream.tv'
        next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
        video_ids = []
        while next_url:
            reply = json.loads(self._download_webpage(compat_urlparse.urljoin(BASE, next_url), channel_id))
            # Capture digits only: a greedy ".*" would run up to the last
            # double quote on the line and swallow any markup after the id.
            # NOTE(review): reply['data'] is presumably an HTML fragment --
            # confirm against a live response.
            video_ids.extend(re.findall(r'data-content-id="(\d+)"', reply['data']))
            next_url = reply['nextUrl']

        url_entries = [
            self.url_result('http://www.ustream.tv/recorded/' + vid, 'Ustream')
            for vid in video_ids]
        return self.playlist_result(url_entries, channel_id)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -27,7 +27,7 @@ class XHamsterIE(InfoExtractor):
 | 
			
		|||
        mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
 | 
			
		||||
        video_id = mobj.group('id')
 | 
			
		||||
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 | 
			
		||||
        mrss_url = 'http://xhamster.com/movies/%s/.html?hd' % video_id
 | 
			
		||||
        webpage = self._download_webpage(mrss_url, video_id)
 | 
			
		||||
 | 
			
		||||
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -139,7 +139,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 | 
			
		|||
                     (
 | 
			
		||||
                         (?:https?://)?                                       # http(s):// (optional)
 | 
			
		||||
                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 | 
			
		||||
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 | 
			
		||||
                            tube\.majestyc\.net/|
 | 
			
		||||
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 | 
			
		||||
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 | 
			
		||||
                         (?:                                                  # the various things that can precede the ID:
 | 
			
		||||
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 | 
			
		||||
| 
						 | 
				
			
			@ -434,7 +435,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 | 
			
		|||
        elif len(s) == 83:
 | 
			
		||||
            return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
 | 
			
		||||
        elif len(s) == 82:
 | 
			
		||||
            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82]
 | 
			
		||||
            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
 | 
			
		||||
        elif len(s) == 81:
 | 
			
		||||
            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
 | 
			
		||||
        elif len(s) == 80:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -249,7 +249,17 @@ def htmlentity_transform(matchobj):
 | 
			
		|||
    return (u'&%s;' % entity)
 | 
			
		||||
 | 
			
		||||
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 | 
			
		||||
class AttrParser(compat_html_parser.HTMLParser):
 | 
			
		||||
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was last asked to parse."""

    def __init__(self):
        # Must be spelled __init__ to act as the constructor; otherwise
        # self.html stays undefined until loads() is called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store *html*, feed it to the parser in one go and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
 | 
			
		||||
 | 
			
		||||
class AttrParser(BaseHTMLParser):
 | 
			
		||||
    """Modified HTMLParser that isolates a tag with the specified attribute"""
 | 
			
		||||
    def __init__(self, attribute, value):
 | 
			
		||||
        self.attribute = attribute
 | 
			
		||||
| 
						 | 
				
			
			@ -257,10 +267,9 @@ class AttrParser(compat_html_parser.HTMLParser):
 | 
			
		|||
        self.result = None
 | 
			
		||||
        self.started = False
 | 
			
		||||
        self.depth = {}
 | 
			
		||||
        self.html = None
 | 
			
		||||
        self.watch_startpos = False
 | 
			
		||||
        self.error_count = 0
 | 
			
		||||
        compat_html_parser.HTMLParser.__init__(self)
 | 
			
		||||
        BaseHTMLParser.__init__(self)
 | 
			
		||||
 | 
			
		||||
    def error(self, message):
 | 
			
		||||
        if self.error_count > 10 or self.started:
 | 
			
		||||
| 
						 | 
				
			
			@ -269,11 +278,6 @@ class AttrParser(compat_html_parser.HTMLParser):
 | 
			
		|||
        self.error_count += 1
 | 
			
		||||
        self.goahead(1)
 | 
			
		||||
 | 
			
		||||
    def loads(self, html):
 | 
			
		||||
        self.html = html
 | 
			
		||||
        self.feed(html)
 | 
			
		||||
        self.close()
 | 
			
		||||
 | 
			
		||||
    def handle_starttag(self, tag, attrs):
 | 
			
		||||
        attrs = dict(attrs)
 | 
			
		||||
        if self.started:
 | 
			
		||||
| 
						 | 
				
			
			@ -334,6 +338,38 @@ def get_element_by_attribute(attribute, value, html):
 | 
			
		|||
        pass
 | 
			
		||||
    return parser.get_result()
 | 
			
		||||
 | 
			
		||||
class MetaParser(BaseHTMLParser):
    """
    HTML parser that picks out the content attribute of the meta tag whose
    name attribute equals the requested name.

    Every matching meta start tag overwrites the stored result, so the last
    match seen in the document is the one returned.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        # 'content' is assigned here but not read anywhere in this class.
        self.content = self.result = None

    def handle_starttag(self, tag, attrs):
        """Record the content attribute of a matching meta tag."""
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                # None is stored when the matching tag has no content attribute.
                self.result = attributes.get('content')

    def get_result(self):
        """Return the content recorded so far, or None if nothing matched."""
        return self.result
 | 
			
		||||
 | 
			
		||||
def get_meta_content(name, html):
    """
    Parse html with a MetaParser and return the content attribute of the
    meta tag whose name attribute equals name (None if no tag matched).
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was recorded before parsing failed.
        pass
    return meta_parser.get_result()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def clean_html(html):
 | 
			
		||||
    """Clean an HTML snippet into a readable string"""
 | 
			
		||||
| 
						 | 
				
			
			@ -664,7 +700,16 @@ def unified_strdate(date_str):
 | 
			
		|||
    date_str = date_str.replace(',',' ')
 | 
			
		||||
    # %z (UTC offset) is only supported in python>=3.2
 | 
			
		||||
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 | 
			
		||||
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
 | 
			
		||||
    format_expressions = [
 | 
			
		||||
        '%d %B %Y',
 | 
			
		||||
        '%B %d %Y',
 | 
			
		||||
        '%b %d %Y',
 | 
			
		||||
        '%Y-%m-%d',
 | 
			
		||||
        '%d/%m/%Y',
 | 
			
		||||
        '%Y/%m/%d %H:%M:%S',
 | 
			
		||||
        '%d.%m.%Y %H:%M',
 | 
			
		||||
        '%Y-%m-%dT%H:%M:%SZ',
 | 
			
		||||
    ]
 | 
			
		||||
    for expression in format_expressions:
 | 
			
		||||
        try:
 | 
			
		||||
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue