Removed dependency on lxml: added IDParser
This commit is contained in:
		
							parent
							
								
									d6a9615347
								
							
						
					
					
						commit
						c6f45d4314
					
				
					 2 changed files with 155 additions and 76 deletions
				
			
		
							
								
								
									
										131
									
								
								youtube-dl
									
										
									
									
									
								
							
							
						
						
									
										131
									
								
								youtube-dl
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -15,6 +15,7 @@ __authors__  = (
 | 
			
		|||
	'Kevin Ngo',
 | 
			
		||||
	'Ori Avtalion',
 | 
			
		||||
	'shizeeg',
 | 
			
		||||
	'Filippo Valsorda',
 | 
			
		||||
	)
 | 
			
		||||
 | 
			
		||||
__license__ = 'Public Domain'
 | 
			
		||||
| 
						 | 
				
			
			@ -66,11 +67,6 @@ try:
 | 
			
		|||
except ImportError:
 | 
			
		||||
	from cgi import parse_qs
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
	import lxml.etree
 | 
			
		||||
except ImportError:
 | 
			
		||||
	pass # Handled below
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
	import xml.etree.ElementTree
 | 
			
		||||
except ImportError: # Python<2.5: Not officially supported, but let it slip
 | 
			
		||||
| 
						 | 
				
			
			@ -197,6 +193,69 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr
 | 
			
		|||
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 | 
			
		||||
			return res
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class IDParser(HTMLParser.HTMLParser):
	"""Modified HTMLParser that isolates a tag with the specified id.

	Feed a document with loads(); get_result() then returns the markup found
	between the opening and the closing tag of the first element whose id
	attribute equals the requested one.
	"""

	def __init__(self, id):
		self.id = id
		# None until a matching tag is seen; then [tag, start_pos, end_pos],
		# where both positions are (line, column) pairs taken from getpos().
		self.result = None
		# True while the parser is inside the matching element
		self.started = False
		# Number of currently open tags, per tag name, inside the element
		self.depth = {}
		self.html = None
		# True between the matching opening tag and the next parser event
		self.watch_startpos = False
		HTMLParser.HTMLParser.__init__(self)

	def loads(self, html):
		"""Parse the complete document and keep it for get_result()."""
		self.html = html
		self.feed(html)
		self.close()

	def handle_starttag(self, tag, attrs):
		attrs = dict(attrs)
		if self.started:
			self.find_startpos(None)
		# Only the first element carrying the id is captured; a later (or
		# nested) tag repeating the id must not discard what was found.
		if self.result is None and 'id' in attrs and attrs['id'] == self.id:
			self.result = [tag]
			self.started = True
			self.watch_startpos = True
		if self.started:
			self.depth[tag] = self.depth.get(tag, 0) + 1

	def handle_endtag(self, tag):
		if self.started:
			# The closing tag may directly follow the opening one (element
			# without content), so the start position can still be pending.
			self.find_startpos(None)
			if tag in self.depth: self.depth[tag] -= 1
			if self.depth[self.result[0]] == 0:
				self.started = False
				self.result.append(self.getpos())

	def find_startpos(self, x):
		"""Needed to put the start position of the result (self.result[1])
		after the opening tag with the requested id"""
		if self.watch_startpos:
			self.watch_startpos = False
			self.result.append(self.getpos())
	handle_entityref = handle_charref = handle_data = handle_comment = \
	handle_decl = handle_pi = unknown_decl = find_startpos

	def get_result(self):
		"""Return the isolated content, or None if no complete element was found."""
		if self.result is None: return None
		if len(self.result) != 3: return None
		(start_line, start_col), (end_line, end_col) = self.result[1:]
		# getpos() counts lines from 1 and columns from 0
		lines = self.html.split('\n')[start_line - 1:end_line]
		# Cut the tail before the head: when both positions share one line the
		# end column still refers to the untouched line.
		lines[-1] = lines[-1][:end_col]
		lines[0] = lines[0][start_col:]
		return '\n'.join(lines).strip()
 | 
			
		||||
 | 
			
		||||
def get_element_by_id(id, html):
	"""Return the content of the tag with the specified id in the passed HTML document.

	Returns None when no complete element with that id could be isolated.
	"""
	parser = IDParser(id)
	try:
		parser.loads(html)
	except HTMLParser.HTMLParseError:
		# Malformed markup makes the parser give up; whatever was completely
		# isolated before that point is still returned below.
		pass
	return parser.get_result()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def preferredencoding():
 | 
			
		||||
	"""Get preferred encoding.
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -241,6 +300,18 @@ def htmlentity_transform(matchobj):
 | 
			
		|||
	return (u'&%s;' % entity)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def clean_html(html):
	"""Clean an HTML snippet into a readable string"""
	# Newline vs <br />: literal newlines become spaces, only <br> tags
	# (with any surrounding whitespace) end up as line breaks
	html = html.replace('\n', ' ')
	html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
	# Strip html tags
	html = re.sub(r'<.*?>', '', html)
	# Replace html entities
	html = re.sub(u'(?u)&(.+?);', htmlentity_transform, html)
	return html
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def sanitize_title(utitle):
 | 
			
		||||
	"""Sanitizes a video title so it could be used as part of a filename."""
 | 
			
		||||
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
 | 
			
		||||
| 
						 | 
				
			
			@ -1419,18 +1490,9 @@ class YoutubeIE(InfoExtractor):
 | 
			
		|||
					pass
 | 
			
		||||
 | 
			
		||||
		# description
 | 
			
		||||
		try:
 | 
			
		||||
			lxml.etree
 | 
			
		||||
		except NameError:
 | 
			
		||||
			video_description = u'No description available.'
 | 
			
		||||
			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
 | 
			
		||||
			if mobj is not None:
 | 
			
		||||
				video_description = mobj.group(1).decode('utf-8')
 | 
			
		||||
		else:
 | 
			
		||||
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
 | 
			
		||||
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
 | 
			
		||||
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
 | 
			
		||||
			# TODO use another parser
 | 
			
		||||
		video_description = get_element_by_id("eow-description", video_webpage)
 | 
			
		||||
		if video_description: video_description = clean_html(video_description.decode('utf8'))
 | 
			
		||||
		else: video_description = ''
 | 
			
		||||
			
 | 
			
		||||
		# closed captions
 | 
			
		||||
		video_subtitles = None
 | 
			
		||||
| 
						 | 
				
			
			@ -2164,18 +2226,9 @@ class VimeoIE(InfoExtractor):
 | 
			
		|||
		video_thumbnail = config["video"]["thumbnail"]
 | 
			
		||||
 | 
			
		||||
		# Extract video description
 | 
			
		||||
		try:
 | 
			
		||||
			lxml.etree
 | 
			
		||||
		except NameError:
 | 
			
		||||
			video_description = u'No description available.'
 | 
			
		||||
			mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
 | 
			
		||||
			if mobj is not None:
 | 
			
		||||
				video_description = mobj.group(1)
 | 
			
		||||
		else:
 | 
			
		||||
			html_parser = lxml.etree.HTMLParser()
 | 
			
		||||
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
 | 
			
		||||
			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
 | 
			
		||||
			# TODO use another parser
 | 
			
		||||
		video_description = get_element_by_id("description", webpage)
 | 
			
		||||
		if video_description: video_description = clean_html(video_description.decode('utf8'))
 | 
			
		||||
		else: video_description = ''
 | 
			
		||||
 | 
			
		||||
		# Extract upload date
 | 
			
		||||
		video_upload_date = u'NA'
 | 
			
		||||
| 
						 | 
				
			
			@ -3342,8 +3395,6 @@ class EscapistIE(InfoExtractor):
 | 
			
		|||
		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
 | 
			
		||||
 | 
			
		||||
	def _real_extract(self, url):
 | 
			
		||||
		htmlParser = HTMLParser.HTMLParser()
 | 
			
		||||
 | 
			
		||||
		mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
		if mobj is None:
 | 
			
		||||
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 | 
			
		||||
| 
						 | 
				
			
			@ -3359,11 +3410,11 @@ class EscapistIE(InfoExtractor):
 | 
			
		|||
			return
 | 
			
		||||
 | 
			
		||||
		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
 | 
			
		||||
		description = htmlParser.unescape(descMatch.group(1))
 | 
			
		||||
		description = unescapeHTML(descMatch.group(1))
 | 
			
		||||
		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
 | 
			
		||||
		imgUrl = htmlParser.unescape(imgMatch.group(1))
 | 
			
		||||
		imgUrl = unescapeHTML(imgMatch.group(1))
 | 
			
		||||
		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
 | 
			
		||||
		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
 | 
			
		||||
		playerUrl = unescapeHTML(playerUrlMatch.group(1))
 | 
			
		||||
		configUrlMatch = re.search('config=(.*)$', playerUrl)
 | 
			
		||||
		configUrl = urllib2.unquote(configUrlMatch.group(1))
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -3422,8 +3473,6 @@ class CollegeHumorIE(InfoExtractor):
 | 
			
		|||
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 | 
			
		||||
 | 
			
		||||
	def _real_extract(self, url):
 | 
			
		||||
		htmlParser = HTMLParser.HTMLParser()
 | 
			
		||||
 | 
			
		||||
		mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
		if mobj is None:
 | 
			
		||||
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 | 
			
		||||
| 
						 | 
				
			
			@ -3494,8 +3543,6 @@ class XVideosIE(InfoExtractor):
 | 
			
		|||
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 | 
			
		||||
 | 
			
		||||
	def _real_extract(self, url):
 | 
			
		||||
		htmlParser = HTMLParser.HTMLParser()
 | 
			
		||||
 | 
			
		||||
		mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
		if mobj is None:
 | 
			
		||||
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 | 
			
		||||
| 
						 | 
				
			
			@ -3584,8 +3631,6 @@ class SoundcloudIE(InfoExtractor):
 | 
			
		|||
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 | 
			
		||||
 | 
			
		||||
	def _real_extract(self, url):
 | 
			
		||||
		htmlParser = HTMLParser.HTMLParser()
 | 
			
		||||
 | 
			
		||||
		mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
		if mobj is None:
 | 
			
		||||
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 | 
			
		||||
| 
						 | 
				
			
			@ -3673,8 +3718,6 @@ class InfoQIE(InfoExtractor):
 | 
			
		|||
		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
 | 
			
		||||
 | 
			
		||||
	def _real_extract(self, url):
 | 
			
		||||
		htmlParser = HTMLParser.HTMLParser()
 | 
			
		||||
 | 
			
		||||
		mobj = re.match(self._VALID_URL, url)
 | 
			
		||||
		if mobj is None:
 | 
			
		||||
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
 | 
			
		||||
| 
						 | 
				
			
			@ -3908,8 +3951,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
 | 
			
		|||
			except UnavailableVideoError, err:
 | 
			
		||||
				self._downloader.trouble(u'\nERROR: unable to download video')
 | 
			
		||||
		elif mobj.group('course'): # A course page
 | 
			
		||||
			unescapeHTML = HTMLParser.HTMLParser().unescape
 | 
			
		||||
 | 
			
		||||
			course = mobj.group('course')
 | 
			
		||||
			info = {
 | 
			
		||||
				'id': _simplify_title(course),
 | 
			
		||||
| 
						 | 
				
			
			@ -3946,8 +3987,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
 | 
			
		|||
				assert entry['type'] == 'reference'
 | 
			
		||||
				self.extract(entry['url'])
 | 
			
		||||
		else: # Root page
 | 
			
		||||
			unescapeHTML = HTMLParser.HTMLParser().unescape
 | 
			
		||||
 | 
			
		||||
			info = {
 | 
			
		||||
				'id': 'Stanford OpenClassroom',
 | 
			
		||||
				'type': 'playlist',
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -67,11 +67,6 @@ try:
 | 
			
		|||
except ImportError:
 | 
			
		||||
	from cgi import parse_qs
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
	import lxml.etree
 | 
			
		||||
except ImportError:
 | 
			
		||||
	pass # Handled below
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
	import xml.etree.ElementTree
 | 
			
		||||
except ImportError: # Python<2.5: Not officially supported, but let it slip
 | 
			
		||||
| 
						 | 
				
			
			@ -198,6 +193,69 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr
 | 
			
		|||
				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
 | 
			
		||||
			return res
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class IDParser(HTMLParser.HTMLParser):
	"""Modified HTMLParser that isolates a tag with the specified id.

	Feed a document with loads(); get_result() then returns the markup found
	between the opening and the closing tag of the first element whose id
	attribute equals the requested one.
	"""

	def __init__(self, id):
		self.id = id
		# None until a matching tag is seen; then [tag, start_pos, end_pos],
		# where both positions are (line, column) pairs taken from getpos().
		self.result = None
		# True while the parser is inside the matching element
		self.started = False
		# Number of currently open tags, per tag name, inside the element
		self.depth = {}
		self.html = None
		# True between the matching opening tag and the next parser event
		self.watch_startpos = False
		HTMLParser.HTMLParser.__init__(self)

	def loads(self, html):
		"""Parse the complete document and keep it for get_result()."""
		self.html = html
		self.feed(html)
		self.close()

	def handle_starttag(self, tag, attrs):
		attrs = dict(attrs)
		if self.started:
			self.find_startpos(None)
		# Only the first element carrying the id is captured; a later (or
		# nested) tag repeating the id must not discard what was found.
		if self.result is None and 'id' in attrs and attrs['id'] == self.id:
			self.result = [tag]
			self.started = True
			self.watch_startpos = True
		if self.started:
			self.depth[tag] = self.depth.get(tag, 0) + 1

	def handle_endtag(self, tag):
		if self.started:
			# The closing tag may directly follow the opening one (element
			# without content), so the start position can still be pending.
			self.find_startpos(None)
			if tag in self.depth: self.depth[tag] -= 1
			if self.depth[self.result[0]] == 0:
				self.started = False
				self.result.append(self.getpos())

	def find_startpos(self, x):
		"""Needed to put the start position of the result (self.result[1])
		after the opening tag with the requested id"""
		if self.watch_startpos:
			self.watch_startpos = False
			self.result.append(self.getpos())
	handle_entityref = handle_charref = handle_data = handle_comment = \
	handle_decl = handle_pi = unknown_decl = find_startpos

	def get_result(self):
		"""Return the isolated content, or None if no complete element was found."""
		if self.result is None: return None
		if len(self.result) != 3: return None
		(start_line, start_col), (end_line, end_col) = self.result[1:]
		# getpos() counts lines from 1 and columns from 0
		lines = self.html.split('\n')[start_line - 1:end_line]
		# Cut the tail before the head: when both positions share one line the
		# end column still refers to the untouched line.
		lines[-1] = lines[-1][:end_col]
		lines[0] = lines[0][start_col:]
		return '\n'.join(lines).strip()
 | 
			
		||||
 | 
			
		||||
def get_element_by_id(id, html):
	"""Return the content of the tag with the specified id in the passed HTML document.

	Returns None when no complete element with that id could be isolated.
	"""
	parser = IDParser(id)
	try:
		parser.loads(html)
	except HTMLParser.HTMLParseError:
		# Malformed markup makes the parser give up; whatever was completely
		# isolated before that point is still returned below.
		pass
	return parser.get_result()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def preferredencoding():
 | 
			
		||||
	"""Get preferred encoding.
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -246,7 +304,7 @@ def clean_html(html):
 | 
			
		|||
	"""Clean an HTML snippet into a readable string"""
 | 
			
		||||
	# Newline vs <br />
 | 
			
		||||
	html = html.replace('\n', ' ')
 | 
			
		||||
	html = re.sub('<\s*br\s*/?\s*>', '\n', html)
 | 
			
		||||
	html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 | 
			
		||||
	# Strip html tags
 | 
			
		||||
	html = re.sub('<.*?>', '', html)
 | 
			
		||||
	# Replace html entities
 | 
			
		||||
| 
						 | 
				
			
			@ -1432,18 +1490,9 @@ class YoutubeIE(InfoExtractor):
 | 
			
		|||
					pass
 | 
			
		||||
 | 
			
		||||
		# description
 | 
			
		||||
		try:
 | 
			
		||||
			lxml.etree
 | 
			
		||||
		except NameError:
 | 
			
		||||
			video_description = u'No description available.'
 | 
			
		||||
			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
 | 
			
		||||
			if mobj is not None:
 | 
			
		||||
				video_description = mobj.group(1).decode('utf-8')
 | 
			
		||||
		else:
 | 
			
		||||
			html_parser = lxml.etree.HTMLParser(encoding='utf-8')
 | 
			
		||||
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
 | 
			
		||||
			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
 | 
			
		||||
			# TODO use another parser
 | 
			
		||||
		video_description = get_element_by_id("eow-description", video_webpage)
 | 
			
		||||
		if video_description: video_description = clean_html(video_description.decode('utf8'))
 | 
			
		||||
		else: video_description = ''
 | 
			
		||||
			
 | 
			
		||||
		# closed captions
 | 
			
		||||
		video_subtitles = None
 | 
			
		||||
| 
						 | 
				
			
			@ -2177,18 +2226,9 @@ class VimeoIE(InfoExtractor):
 | 
			
		|||
		video_thumbnail = config["video"]["thumbnail"]
 | 
			
		||||
 | 
			
		||||
		# Extract video description
 | 
			
		||||
		try:
 | 
			
		||||
			lxml.etree
 | 
			
		||||
		except NameError:
 | 
			
		||||
			video_description = u'No description available.'
 | 
			
		||||
			mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
 | 
			
		||||
			if mobj is not None:
 | 
			
		||||
				video_description = mobj.group(1)
 | 
			
		||||
		else:
 | 
			
		||||
			html_parser = lxml.etree.HTMLParser()
 | 
			
		||||
			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
 | 
			
		||||
			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
 | 
			
		||||
			# TODO use another parser
 | 
			
		||||
		video_description = get_element_by_id("description", webpage)
 | 
			
		||||
		if video_description: video_description = clean_html(video_description.decode('utf8'))
 | 
			
		||||
		else: video_description = ''
 | 
			
		||||
 | 
			
		||||
		# Extract upload date
 | 
			
		||||
		video_upload_date = u'NA'
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue