[generic] Suppress warning about doctypes in RSS parser

This commit is contained in:
Philipp Hagemeister 2014-03-10 17:31:32 +01:00
parent e3899d0e00
commit bcf89ce62c
2 changed files with 13 additions and 2 deletions

View file

@ -4,7 +4,6 @@ from __future__ import unicode_literals
import os import os
import re import re
import xml.etree.ElementTree
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE from .youtube import YoutubeIE
@ -17,6 +16,7 @@ from ..utils import (
ExtractorError, ExtractorError,
HEADRequest, HEADRequest,
parse_xml,
smuggle_url, smuggle_url,
unescapeHTML, unescapeHTML,
unified_strdate, unified_strdate,
@ -274,7 +274,7 @@ class GenericIE(InfoExtractor):
# Is it an RSS feed? # Is it an RSS feed?
try: try:
doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8')) doc = parse_xml(webpage)
if doc.tag == 'rss': if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc) return self._extract_rss(url, video_id, doc)
except compat_xml_parse_error: except compat_xml_parse_error:

View file

@ -22,6 +22,7 @@ import struct
import subprocess import subprocess
import sys import sys
import traceback import traceback
import xml.etree.ElementTree
import zlib import zlib
try: try:
@ -1267,3 +1268,13 @@ def read_batch_urls(batch_fd):
def urlencode_postdata(*args, **kargs):
    """Url-encode the given arguments and return the result as ASCII bytes.

    Every positional and keyword argument is forwarded unchanged to
    ``compat_urllib_parse.urlencode``; the resulting query string is then
    encoded with the ``ascii`` codec.
    """
    query_string = compat_urllib_parse.urlencode(*args, **kargs)
    return query_string.encode('ascii')
def parse_xml(s):
    """Parse the XML document held in the text string *s*.

    DOCTYPE declarations are ignored: the parser is given a ``TreeBuilder``
    subclass whose ``doctype`` hook does nothing.

    Args:
        s: The XML document as text. It is encoded to UTF-8 before being
            handed to the parser.

    Returns:
        The root element produced by ``xml.etree.ElementTree.XML``.

    Parse errors raised by ElementTree propagate to the caller.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    data = s.encode('utf-8')

    if sys.version_info < (2, 7):
        # Same fallback as before: on these versions XML() is called without
        # a custom parser.
        return xml.etree.ElementTree.XML(data)

    # The bytes handed to the parser are always UTF-8, whatever the XML
    # declaration inside the text claims (e.g. encoding="ISO-8859-1").
    # Passing the encoding explicitly overrides the declared one, so
    # non-ASCII characters are not decoded with the wrong charset.
    parser = xml.etree.ElementTree.XMLParser(
        target=TreeBuilder(), encoding='utf-8')
    return xml.etree.ElementTree.XML(data, parser=parser)