[youtube] Extract chapters from JSON (closes #24819)
This commit is contained in:
		
							parent
							
								
									562de77f41
								
							
						
					
					
						commit
						84213ea8d4
					
				
					 2 changed files with 62 additions and 3 deletions
				
			
		| 
						 | 
					@ -267,7 +267,7 @@ class TestYoutubeChapters(unittest.TestCase):
 | 
				
			||||||
        for description, duration, expected_chapters in self._TEST_CASES:
 | 
					        for description, duration, expected_chapters in self._TEST_CASES:
 | 
				
			||||||
            ie = YoutubeIE()
 | 
					            ie = YoutubeIE()
 | 
				
			||||||
            expect_value(
 | 
					            expect_value(
 | 
				
			||||||
                self, ie._extract_chapters(description, duration),
 | 
					                self, ie._extract_chapters_from_description(description, duration),
 | 
				
			||||||
                expected_chapters, None)
 | 
					                expected_chapters, None)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1652,8 +1652,63 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			||||||
        video_id = mobj.group(2)
 | 
					        video_id = mobj.group(2)
 | 
				
			||||||
        return video_id
 | 
					        return video_id
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _extract_chapters_from_json(self, webpage, video_id, duration):
					        """Build the chapter list from the player JSON embedded in the page.

					        Searches webpage for the RELATED_PLAYER_ARGS object, decodes its
					        'watch_next_response' string as JSON and reads the chapter
					        renderers found under the player bar. A chapter ends where the
					        following entry starts; the last entry ends at duration.

					        Returns a list of dicts with 'start_time', 'end_time' and 'title'
					        keys (possibly empty when every entry was skipped), or None when
					        the page, the player args, the nested response or the chapter
					        renderers are missing or of an unexpected type.
					        """
					        if not webpage:
					            return
					        # Falls back to an empty JSON object when the pattern is absent,
					        # which the emptiness check below then rejects.
					        player_args = self._search_regex(
					            r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
					            'player args', default='{}')
					        player = self._parse_json(player_args, video_id, fatal=False)
					        if not (player and isinstance(player, dict)):
					            return
					        # The watch-next payload is itself a JSON document stored as a string.
					        raw_response = player.get('watch_next_response')
					        if not isinstance(raw_response, compat_str):
					            return
					        response = self._parse_json(raw_response, video_id, fatal=False)
					        if not (response and isinstance(response, dict)):
					            return

					        def player_bar_chapters(data):
					            overlay = data['playerOverlays']['playerOverlayRenderer']
					            bar = overlay['decoratedPlayerBarRenderer']['decoratedPlayerBarRenderer']
					            return bar['playerBar']['chapteredPlayerBarRenderer']['chapters']

					        renderers = try_get(response, player_bar_chapters, list)
					        if not renderers:
					            return

					        def start_seconds(entry):
					            # NOTE(review): float_or_none with scale=1000 presumably turns
					            # the millisecond offset into seconds - confirm against utils.
					            millis = try_get(
					                entry,
					                lambda x: x['chapterRenderer']['timeRangeStartMillis'],
					                int)
					            return float_or_none(millis, scale=1000)

					        total = len(renderers)
					        chapters = []
					        for index, entry in enumerate(renderers):
					            start_time = start_seconds(entry)
					            if start_time is None:
					                continue
					            following = index + 1
					            if following < total:
					                end_time = start_seconds(renderers[following])
					            else:
					                # Last entry: it runs until the end of the video.
					                end_time = duration
					            if end_time is None:
					                continue
					            title = try_get(
					                entry, lambda x: x['chapterRenderer']['title']['simpleText'],
					                compat_str)
					            chapters.append({
					                'start_time': start_time,
					                'end_time': end_time,
					                'title': title,
					            })
					        return chapters
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					    @staticmethod
 | 
				
			||||||
    def _extract_chapters(description, duration):
 | 
					    def _extract_chapters_from_description(description, duration):
 | 
				
			||||||
        if not description:
 | 
					        if not description:
 | 
				
			||||||
            return None
 | 
					            return None
 | 
				
			||||||
        chapter_lines = re.findall(
 | 
					        chapter_lines = re.findall(
 | 
				
			||||||
| 
						 | 
					@ -1687,6 +1742,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			||||||
            })
 | 
					            })
 | 
				
			||||||
        return chapters
 | 
					        return chapters
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _extract_chapters(self, webpage, description, video_id, duration):
					        """Return chapters from the page JSON, else from the description.

					        The JSON-based extraction is tried first; when it yields a falsy
					        result (None or an empty list) the result of the description-based
					        extraction is returned instead.
					        """
					        chapters = self._extract_chapters_from_json(webpage, video_id, duration)
					        if chapters:
					            return chapters
					        return self._extract_chapters_from_description(description, duration)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _real_extract(self, url):
 | 
					    def _real_extract(self, url):
 | 
				
			||||||
        url, smuggled_data = unsmuggle_url(url, {})
 | 
					        url, smuggled_data = unsmuggle_url(url, {})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2324,7 +2383,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 | 
				
			||||||
                    errnote='Unable to download video annotations', fatal=False,
 | 
					                    errnote='Unable to download video annotations', fatal=False,
 | 
				
			||||||
                    data=urlencode_postdata({xsrf_field_name: xsrf_token}))
 | 
					                    data=urlencode_postdata({xsrf_field_name: xsrf_token}))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        chapters = self._extract_chapters(description_original, video_duration)
 | 
					        chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Look for the DASH manifest
 | 
					        # Look for the DASH manifest
 | 
				
			||||||
        if self._downloader.params.get('youtube_include_dash_manifest', True):
 | 
					        if self._downloader.params.get('youtube_include_dash_manifest', True):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue