[extractor/common] Extract multiple JSON-LD entries
This commit is contained in:
		
							parent
							
								
									e40c758c2a
								
							
						
					
					
						commit
						4433bb0245
					
				
					 1 changed files with 32 additions and 9 deletions
				
			
		| 
						 | 
				
			
			@ -1182,16 +1182,33 @@ class InfoExtractor(object):
 | 
			
		|||
                                      'twitter card player')
 | 
			
		||||
 | 
			
		||||
    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 | 
			
		||||
        json_ld = self._search_regex(
 | 
			
		||||
            JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
 | 
			
		||||
        json_ld_list = list(re.finditer(JSON_LD_RE, html))
 | 
			
		||||
        default = kwargs.get('default', NO_DEFAULT)
 | 
			
		||||
        if not json_ld:
 | 
			
		||||
            return default if default is not NO_DEFAULT else {}
 | 
			
		||||
        # JSON-LD may be malformed and thus `fatal` should be respected.
 | 
			
		||||
        # At the same time `default` may be passed that assumes `fatal=False`
 | 
			
		||||
        # for _search_regex. Let's simulate the same behavior here as well.
 | 
			
		||||
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 | 
			
		||||
        return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 | 
			
		||||
        json_ld = []
 | 
			
		||||
        for mobj in json_ld_list:
 | 
			
		||||
            json_ld_item = self._parse_json(
 | 
			
		||||
                mobj.group('json_ld'), video_id, fatal=fatal)
 | 
			
		||||
            if not json_ld_item:
 | 
			
		||||
                continue
 | 
			
		||||
            if isinstance(json_ld_item, dict):
 | 
			
		||||
                json_ld.append(json_ld_item)
 | 
			
		||||
            elif isinstance(json_ld_item, (list, tuple)):
 | 
			
		||||
                json_ld.extend(json_ld_item)
 | 
			
		||||
        if json_ld:
 | 
			
		||||
            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 | 
			
		||||
        if json_ld:
 | 
			
		||||
            return json_ld
 | 
			
		||||
        if default is not NO_DEFAULT:
 | 
			
		||||
            return default
 | 
			
		||||
        elif fatal:
 | 
			
		||||
            raise RegexNotFoundError('Unable to extract JSON-LD')
 | 
			
		||||
        else:
 | 
			
		||||
            self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
 | 
			
		||||
            return {}
 | 
			
		||||
 | 
			
		||||
    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 | 
			
		||||
        if isinstance(json_ld, compat_str):
 | 
			
		||||
| 
						 | 
				
			
			@ -1256,10 +1273,10 @@ class InfoExtractor(object):
 | 
			
		|||
            extract_interaction_statistic(e)
 | 
			
		||||
 | 
			
		||||
        for e in json_ld:
 | 
			
		||||
            if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
 | 
			
		||||
            if '@context' in e:
 | 
			
		||||
                item_type = e.get('@type')
 | 
			
		||||
                if expected_type is not None and expected_type != item_type:
 | 
			
		||||
                    return info
 | 
			
		||||
                    continue
 | 
			
		||||
                if item_type in ('TVEpisode', 'Episode'):
 | 
			
		||||
                    episode_name = unescapeHTML(e.get('name'))
 | 
			
		||||
                    info.update({
 | 
			
		||||
| 
						 | 
				
			
			@ -1293,11 +1310,17 @@ class InfoExtractor(object):
 | 
			
		|||
                    })
 | 
			
		||||
                elif item_type == 'VideoObject':
 | 
			
		||||
                    extract_video_object(e)
 | 
			
		||||
                    continue
 | 
			
		||||
                    if expected_type is None:
 | 
			
		||||
                        continue
 | 
			
		||||
                    else:
 | 
			
		||||
                        break
 | 
			
		||||
                video = e.get('video')
 | 
			
		||||
                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
 | 
			
		||||
                    extract_video_object(video)
 | 
			
		||||
                break
 | 
			
		||||
                if expected_type is None:
 | 
			
		||||
                    continue
 | 
			
		||||
                else:
 | 
			
		||||
                    break
 | 
			
		||||
        return dict((k, v) for k, v in info.items() if v is not None)
 | 
			
		||||
 | 
			
		||||
    @staticmethod
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue