OpenClassRoom IE (Closes: #234)
This commit is contained in:
		
							parent
							
								
									dd17922afc
								
							
						
					
					
						commit
						0b14e0b367
					
				
					 1 changed files with 78 additions and 24 deletions
				
			
		| 
						 | 
					@ -282,6 +282,14 @@ def _simplify_title(title):
 | 
				
			||||||
	expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 | 
						expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 | 
				
			||||||
	return expr.sub(u'_', title).strip(u'_')
 | 
						return expr.sub(u'_', title).strip(u'_')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _orderedSet(iterable):
 | 
				
			||||||
 | 
						""" Remove all duplicates from the input iterable """
 | 
				
			||||||
 | 
						res = []
 | 
				
			||||||
 | 
						for el in iterable:
 | 
				
			||||||
 | 
							if el not in res:
 | 
				
			||||||
 | 
								res.append(el)
 | 
				
			||||||
 | 
						return res
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class DownloadError(Exception):
 | 
					class DownloadError(Exception):
 | 
				
			||||||
	"""Download Error exception.
 | 
						"""Download Error exception.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -711,25 +719,6 @@ class FileDownloader(object):
 | 
				
			||||||
			return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 | 
								return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 | 
				
			||||||
		return None
 | 
							return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	def process_dict(self, info_dict):
 | 
					 | 
				
			||||||
		""" Download and handle the extracted information.
 | 
					 | 
				
			||||||
		For details on the specification of the various types of content, refer to the _process_* functions. """
 | 
					 | 
				
			||||||
		if info_dict['type'] == 'playlist':
 | 
					 | 
				
			||||||
			self._process_playlist(info_dict)
 | 
					 | 
				
			||||||
		elif info_dict['type'] == 'legacy-video':
 | 
					 | 
				
			||||||
			self.process_info(info_dict)
 | 
					 | 
				
			||||||
		else:
 | 
					 | 
				
			||||||
			raise ValueError('Invalid item type')
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	def _process_playlist(self, info_dict):
 | 
					 | 
				
			||||||
		assert info_dict['type'] == 'playlist'
 | 
					 | 
				
			||||||
		assert 'title' in info_dict
 | 
					 | 
				
			||||||
		assert 'stitle' in info_dict
 | 
					 | 
				
			||||||
		entries = info_dict['list']
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		for e in entries:
 | 
					 | 
				
			||||||
			self.process_dict(e)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	def process_info(self, info_dict):
 | 
						def process_info(self, info_dict):
 | 
				
			||||||
		"""Process a single dictionary returned by an InfoExtractor."""
 | 
							"""Process a single dictionary returned by an InfoExtractor."""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -3766,9 +3755,13 @@ class MixcloudIE(InfoExtractor):
 | 
				
			||||||
class StanfordOpenClassroomIE(InfoExtractor):
 | 
					class StanfordOpenClassroomIE(InfoExtractor):
 | 
				
			||||||
	"""Information extractor for Stanford's Open ClassRoom"""
 | 
						"""Information extractor for Stanford's Open ClassRoom"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
 | 
						_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
 | 
				
			||||||
	IE_NAME = u'stanfordoc'
 | 
						IE_NAME = u'stanfordoc'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						def report_download_webpage(self, objid):
 | 
				
			||||||
 | 
							"""Report information extraction."""
 | 
				
			||||||
 | 
							self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	def report_extraction(self, video_id):
 | 
						def report_extraction(self, video_id):
 | 
				
			||||||
		"""Report information extraction."""
 | 
							"""Report information extraction."""
 | 
				
			||||||
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 | 
							self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 | 
				
			||||||
| 
						 | 
					@ -3792,7 +3785,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
 | 
				
			||||||
			try:
 | 
								try:
 | 
				
			||||||
				metaXml = urllib2.urlopen(xmlUrl).read()
 | 
									metaXml = urllib2.urlopen(xmlUrl).read()
 | 
				
			||||||
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 | 
								except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 | 
				
			||||||
				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
 | 
									self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
 | 
				
			||||||
				return
 | 
									return
 | 
				
			||||||
			mdoc = xml.etree.ElementTree.fromstring(metaXml)
 | 
								mdoc = xml.etree.ElementTree.fromstring(metaXml)
 | 
				
			||||||
			try:
 | 
								try:
 | 
				
			||||||
| 
						 | 
					@ -3809,13 +3802,74 @@ class StanfordOpenClassroomIE(InfoExtractor):
 | 
				
			||||||
				self._downloader.process_info(info)
 | 
									self._downloader.process_info(info)
 | 
				
			||||||
			except UnavailableVideoError, err:
 | 
								except UnavailableVideoError, err:
 | 
				
			||||||
				self._downloader.trouble(u'\nERROR: unable to download video')
 | 
									self._downloader.trouble(u'\nERROR: unable to download video')
 | 
				
			||||||
		else:
 | 
							elif mobj.group('course'): # A course page
 | 
				
			||||||
			print('TODO: Not yet implemented')
 | 
								unescapeHTML = HTMLParser.HTMLParser().unescape
 | 
				
			||||||
			1/0
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								course = mobj.group('course')
 | 
				
			||||||
 | 
								info = {
 | 
				
			||||||
 | 
									'id': _simplify_title(course),
 | 
				
			||||||
 | 
									'type': 'playlist',
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								self.report_download_webpage(info['id'])
 | 
				
			||||||
 | 
								try:
 | 
				
			||||||
 | 
									coursepage = urllib2.urlopen(url).read()
 | 
				
			||||||
 | 
								except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 | 
				
			||||||
 | 
									self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
 | 
				
			||||||
 | 
									return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								m = re.search('<h1>([^<]+)</h1>', coursepage)
 | 
				
			||||||
 | 
								if m:
 | 
				
			||||||
 | 
									info['title'] = unescapeHTML(m.group(1))
 | 
				
			||||||
 | 
								else:
 | 
				
			||||||
 | 
									info['title'] = info['id']
 | 
				
			||||||
 | 
								info['stitle'] = _simplify_title(info['title'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								m = re.search('<description>([^<]+)</description>', coursepage)
 | 
				
			||||||
 | 
								if m:
 | 
				
			||||||
 | 
									info['description'] = unescapeHTML(m.group(1))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
 | 
				
			||||||
 | 
								info['list'] = [
 | 
				
			||||||
 | 
									{
 | 
				
			||||||
 | 
										'type': 'reference',
 | 
				
			||||||
 | 
										'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
										for vpage in links]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								for entry in info['list']:
 | 
				
			||||||
 | 
									assert entry['type'] == 'reference'
 | 
				
			||||||
 | 
									self.extract(entry['url'])
 | 
				
			||||||
 | 
							else: # Root page
 | 
				
			||||||
 | 
								unescapeHTML = HTMLParser.HTMLParser().unescape
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								info = {
 | 
				
			||||||
 | 
									'id': 'Stanford OpenClassroom',
 | 
				
			||||||
 | 
									'type': 'playlist',
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								self.report_download_webpage(info['id'])
 | 
				
			||||||
 | 
								rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
 | 
				
			||||||
 | 
								try:
 | 
				
			||||||
 | 
									rootpage = urllib2.urlopen(rootURL).read()
 | 
				
			||||||
 | 
								except (urllib2.URLError, httplib.HTTPException, socket.error), err:
 | 
				
			||||||
 | 
									self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
 | 
				
			||||||
 | 
									return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								info['title'] = info['id']
 | 
				
			||||||
 | 
								info['stitle'] = _simplify_title(info['title'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
 | 
				
			||||||
 | 
								info['list'] = [
 | 
				
			||||||
 | 
									{
 | 
				
			||||||
 | 
										'type': 'reference',
 | 
				
			||||||
 | 
										'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
 | 
				
			||||||
 | 
									}
 | 
				
			||||||
 | 
										for cpage in links]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
								for entry in info['list']:
 | 
				
			||||||
 | 
									assert entry['type'] == 'reference'
 | 
				
			||||||
 | 
									self.extract(entry['url'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class PostProcessor(object):
 | 
					class PostProcessor(object):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue