[extractor/common] Support multiple properties in _og_search_property

This commit is contained in:
Sergey M․ 2016-08-02 22:55:14 +07:00
parent ce28252c48
commit b070564efb
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
2 changed files with 10 additions and 2 deletions

View file

@@ -48,6 +48,9 @@ class TestInfoExtractor(unittest.TestCase):
self.assertEqual(ie._og_search_property('foobar', html), 'Foo') self.assertEqual(ie._og_search_property('foobar', html), 'Foo')
self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar')
self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar')
self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar')
self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True)
self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True)
def test_html_search_meta(self): def test_html_search_meta(self):
ie = self.ie ie = self.ie

View file

@@ -727,9 +727,14 @@ class InfoExtractor(object):
[^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
def _og_search_property(self, prop, html, name=None, **kargs): def _og_search_property(self, prop, html, name=None, **kargs):
if not isinstance(prop, (list, tuple)):
prop = [prop]
if name is None: if name is None:
name = 'OpenGraph %s' % prop name = 'OpenGraph %s' % prop[0]
escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) og_regexes = []
for p in prop:
og_regexes.extend(self._og_regexes(p))
escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
if escaped is None: if escaped is None:
return None return None
return unescapeHTML(escaped) return unescapeHTML(escaped)