[ytsearch] Fix extraction (closes #26920)
This commit is contained in:
		
							parent
							
								
									48c5663c5f
								
							
						
					
					
						commit
						416da574ec
					
				
					 1 changed file with 78 additions and 38 deletions
				
			
		| 
						 | 
					@ -3181,54 +3181,94 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
 | 
				
			||||||
    _MAX_RESULTS = float('inf')
 | 
					    _MAX_RESULTS = float('inf')
 | 
				
			||||||
    IE_NAME = 'youtube:search'
 | 
					    IE_NAME = 'youtube:search'
 | 
				
			||||||
    _SEARCH_KEY = 'ytsearch'
 | 
					    _SEARCH_KEY = 'ytsearch'
 | 
				
			||||||
    _EXTRA_QUERY_ARGS = {}
 | 
					    _SEARCH_PARAMS = None
 | 
				
			||||||
    _TESTS = []
 | 
					    _TESTS = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _entries(self, query, n):
 | 
				
			||||||
 | 
					        data = {
 | 
				
			||||||
 | 
					            'context': {
 | 
				
			||||||
 | 
					                'client': {
 | 
				
			||||||
 | 
					                    'clientName': 'WEB',
 | 
				
			||||||
 | 
					                    'clientVersion': '2.20201021.03.00',
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            'query': query,
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        if self._SEARCH_PARAMS:
 | 
				
			||||||
 | 
					            data['params'] = self._SEARCH_PARAMS
 | 
				
			||||||
 | 
					        total = 0
 | 
				
			||||||
 | 
					        for page_num in itertools.count(1):
 | 
				
			||||||
 | 
					            search = self._download_json(
 | 
				
			||||||
 | 
					                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
 | 
				
			||||||
 | 
					                video_id='query "%s"' % query,
 | 
				
			||||||
 | 
					                note='Downloading page %s' % page_num,
 | 
				
			||||||
 | 
					                errnote='Unable to download API page', fatal=False,
 | 
				
			||||||
 | 
					                data=json.dumps(data).encode('utf8'),
 | 
				
			||||||
 | 
					                headers={'content-type': 'application/json'})
 | 
				
			||||||
 | 
					            if not search:
 | 
				
			||||||
 | 
					                break
 | 
				
			||||||
 | 
					            slr_contents = try_get(
 | 
				
			||||||
 | 
					                search,
 | 
				
			||||||
 | 
					                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
 | 
				
			||||||
 | 
					                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
 | 
				
			||||||
 | 
					                list)
 | 
				
			||||||
 | 
					            if not slr_contents:
 | 
				
			||||||
 | 
					                break
 | 
				
			||||||
 | 
					            isr_contents = try_get(
 | 
				
			||||||
 | 
					                slr_contents,
 | 
				
			||||||
 | 
					                lambda x: x[0]['itemSectionRenderer']['contents'],
 | 
				
			||||||
 | 
					                list)
 | 
				
			||||||
 | 
					            if not isr_contents:
 | 
				
			||||||
 | 
					                break
 | 
				
			||||||
 | 
					            for content in isr_contents:
 | 
				
			||||||
 | 
					                if not isinstance(content, dict):
 | 
				
			||||||
 | 
					                    continue
 | 
				
			||||||
 | 
					                video = content.get('videoRenderer')
 | 
				
			||||||
 | 
					                if not isinstance(video, dict):
 | 
				
			||||||
 | 
					                    continue
 | 
				
			||||||
 | 
					                video_id = video.get('videoId')
 | 
				
			||||||
 | 
					                if not video_id:
 | 
				
			||||||
 | 
					                    continue
 | 
				
			||||||
 | 
					                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
 | 
				
			||||||
 | 
					                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
 | 
				
			||||||
 | 
					                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
 | 
				
			||||||
 | 
					                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
 | 
				
			||||||
 | 
					                view_count = int_or_none(self._search_regex(
 | 
				
			||||||
 | 
					                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
 | 
				
			||||||
 | 
					                    'view count', default=None))
 | 
				
			||||||
 | 
					                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
 | 
				
			||||||
 | 
					                total += 1
 | 
				
			||||||
 | 
					                yield {
 | 
				
			||||||
 | 
					                    '_type': 'url_transparent',
 | 
				
			||||||
 | 
					                    'ie_key': YoutubeIE.ie_key(),
 | 
				
			||||||
 | 
					                    'id': video_id,
 | 
				
			||||||
 | 
					                    'url': video_id,
 | 
				
			||||||
 | 
					                    'title': title,
 | 
				
			||||||
 | 
					                    'description': description,
 | 
				
			||||||
 | 
					                    'duration': duration,
 | 
				
			||||||
 | 
					                    'view_count': view_count,
 | 
				
			||||||
 | 
					                    'uploader': uploader,
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					                if total == n:
 | 
				
			||||||
 | 
					                    return
 | 
				
			||||||
 | 
					            token = try_get(
 | 
				
			||||||
 | 
					                slr_contents,
 | 
				
			||||||
 | 
					                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
 | 
				
			||||||
 | 
					                compat_str)
 | 
				
			||||||
 | 
					            if not token:
 | 
				
			||||||
 | 
					                break
 | 
				
			||||||
 | 
					            data['continuation'] = token
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _get_n_results(self, query, n):
 | 
					    def _get_n_results(self, query, n):
 | 
				
			||||||
        """Get a specified number of results for a query"""
 | 
					        """Get a specified number of results for a query"""
 | 
				
			||||||
 | 
					        return self.playlist_result(self._entries(query, n), query)
 | 
				
			||||||
        videos = []
 | 
					 | 
				
			||||||
        limit = n
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        url_query = {
 | 
					 | 
				
			||||||
            'search_query': query.encode('utf-8'),
 | 
					 | 
				
			||||||
        }
 | 
					 | 
				
			||||||
        url_query.update(self._EXTRA_QUERY_ARGS)
 | 
					 | 
				
			||||||
        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        for pagenum in itertools.count(1):
 | 
					 | 
				
			||||||
            data = self._download_json(
 | 
					 | 
				
			||||||
                result_url, video_id='query "%s"' % query,
 | 
					 | 
				
			||||||
                note='Downloading page %s' % pagenum,
 | 
					 | 
				
			||||||
                errnote='Unable to download API page',
 | 
					 | 
				
			||||||
                query={'spf': 'navigate'})
 | 
					 | 
				
			||||||
            html_content = data[1]['body']['content']
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            if 'class="search-message' in html_content:
 | 
					 | 
				
			||||||
                raise ExtractorError(
 | 
					 | 
				
			||||||
                    '[youtube] No video results', expected=True)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            new_videos = list(self._process_page(html_content))
 | 
					 | 
				
			||||||
            videos += new_videos
 | 
					 | 
				
			||||||
            if not new_videos or len(videos) > limit:
 | 
					 | 
				
			||||||
                break
 | 
					 | 
				
			||||||
            next_link = self._html_search_regex(
 | 
					 | 
				
			||||||
                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
 | 
					 | 
				
			||||||
                html_content, 'next link', default=None)
 | 
					 | 
				
			||||||
            if next_link is None:
 | 
					 | 
				
			||||||
                break
 | 
					 | 
				
			||||||
            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        if len(videos) > n:
 | 
					 | 
				
			||||||
            videos = videos[:n]
 | 
					 | 
				
			||||||
        return self.playlist_result(videos, query)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class YoutubeSearchDateIE(YoutubeSearchIE):
 | 
					class YoutubeSearchDateIE(YoutubeSearchIE):
 | 
				
			||||||
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
 | 
					    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
 | 
				
			||||||
    _SEARCH_KEY = 'ytsearchdate'
 | 
					    _SEARCH_KEY = 'ytsearchdate'
 | 
				
			||||||
    IE_DESC = 'YouTube.com searches, newest videos first'
 | 
					    IE_DESC = 'YouTube.com searches, newest videos first'
 | 
				
			||||||
    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
 | 
					    _SEARCH_PARAMS = 'CAI%3D'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
 | 
					class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue