[utils] Add extract_attributes for extracting html tag attributes
This is much more robust than just using regexps, and handles all the common scenarios, such as empty/no values, repeated attributes, entity decoding, mixed case names, and the different possible value quoting schemes.
This commit is contained in:
parent
03879ff054
commit
8bb56eeeea
3 changed files with 76 additions and 0 deletions
|
@ -28,6 +28,7 @@ from youtube_dl.utils import (
|
|||
encodeFilename,
|
||||
escape_rfc3986,
|
||||
escape_url,
|
||||
extract_attributes,
|
||||
ExtractorError,
|
||||
find_xpath_attr,
|
||||
fix_xml_ampersands,
|
||||
|
@ -75,6 +76,7 @@ from youtube_dl.utils import (
|
|||
cli_bool_option,
|
||||
)
|
||||
from youtube_dl.compat import (
|
||||
compat_chr,
|
||||
compat_etree_fromstring,
|
||||
)
|
||||
|
||||
|
@ -591,6 +593,44 @@ class TestUtil(unittest.TestCase):
|
|||
on = js_to_json('{"abc": "def",}')
|
||||
self.assertEqual(json.loads(on), {'abc': 'def'})
|
||||
|
||||
def test_extract_attributes(self):
|
||||
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
|
||||
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
|
||||
self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
|
||||
self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
|
||||
self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})
|
||||
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
|
||||
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
|
||||
self.assertEqual(extract_attributes('<e x="&">'), {'x': '&'}) # XML
|
||||
self.assertEqual(extract_attributes('<e x=""">'), {'x': '"'})
|
||||
self.assertEqual(extract_attributes('<e x="£">'), {'x': '£'}) # HTML 3.2
|
||||
self.assertEqual(extract_attributes('<e x="λ">'), {'x': 'λ'}) # HTML 4.0
|
||||
self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
|
||||
self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
|
||||
self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
|
||||
self.assertEqual(extract_attributes('<e x >'), {'x': None})
|
||||
self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
|
||||
self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
|
||||
self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})
|
||||
self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
|
||||
self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
|
||||
self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
|
||||
self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
|
||||
self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'}) # Names lowercased
|
||||
self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
|
||||
self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
|
||||
self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
|
||||
self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
|
||||
self.assertEqual(extract_attributes('<e x="décomposé">'), {'x': 'décompose\u0301'})
|
||||
# "Narrow" Python builds don't support unicode code points outside BMP.
|
||||
try:
|
||||
compat_chr(0x10000)
|
||||
supports_outside_bmp = True
|
||||
except ValueError:
|
||||
supports_outside_bmp = False
|
||||
if supports_outside_bmp:
|
||||
self.assertEqual(extract_attributes('<e x="Smile 😀!">'), {'x': 'Smile \U0001f600!'})
|
||||
|
||||
def test_clean_html(self):
|
||||
self.assertEqual(clean_html('a:\nb'), 'a: b')
|
||||
self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
|
||||
|
|
|
@ -77,6 +77,11 @@ try:
|
|||
except ImportError: # Python 2
|
||||
from urllib import urlretrieve as compat_urlretrieve
|
||||
|
||||
try:
|
||||
from html.parser import HTMLParser as compat_HTMLParser
|
||||
except ImportError: # Python 2
|
||||
from HTMLParser import HTMLParser as compat_HTMLParser
|
||||
|
||||
|
||||
try:
|
||||
from subprocess import DEVNULL
|
||||
|
@ -540,6 +545,7 @@ else:
|
|||
from tokenize import generate_tokens as compat_tokenize_tokenize
|
||||
|
||||
__all__ = [
|
||||
'compat_HTMLParser',
|
||||
'compat_HTTPError',
|
||||
'compat_basestring',
|
||||
'compat_chr',
|
||||
|
|
|
@ -35,6 +35,7 @@ import xml.etree.ElementTree
|
|||
import zlib
|
||||
|
||||
from .compat import (
|
||||
compat_HTMLParser,
|
||||
compat_basestring,
|
||||
compat_chr,
|
||||
compat_etree_fromstring,
|
||||
|
@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html):
|
|||
|
||||
return unescapeHTML(res)
|
||||
|
||||
class HTMLAttributeParser(compat_HTMLParser):
|
||||
"""Trivial HTML parser to gather the attributes for a single element"""
|
||||
def __init__(self):
|
||||
self.attrs = { }
|
||||
compat_HTMLParser.__init__(self)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
self.attrs = dict(attrs)
|
||||
|
||||
def extract_attributes(html_element):
|
||||
"""Given a string for an HTML element such as
|
||||
<el
|
||||
a="foo" B="bar" c="&98;az" d=boz
|
||||
empty= noval entity="&"
|
||||
sq='"' dq="'"
|
||||
>
|
||||
Decode and return a dictionary of attributes.
|
||||
{
|
||||
'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
|
||||
'empty': '', 'noval': None, 'entity': '&',
|
||||
'sq': '"', 'dq': '\''
|
||||
}.
|
||||
NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
|
||||
but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
|
||||
"""
|
||||
parser = HTMLAttributeParser()
|
||||
parser.feed(html_element)
|
||||
parser.close()
|
||||
return parser.attrs
|
||||
|
||||
def clean_html(html):
|
||||
"""Clean an HTML snippet into a readable string"""
|
||||
|
|
Loading…
Reference in a new issue