[youtube] fix automatic captions extraction (closes #27162) (closes #27388)

This commit is contained in:
Remita Amine 2020-12-24 16:05:03 +01:00
parent f9e6aa1dcf
commit 4ef1fc9707

View file

@@ -1322,17 +1322,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self._parse_json( return self._parse_json(
uppercase_escape(config), video_id, fatal=False) uppercase_escape(config), video_id, fatal=False)
def _get_automatic_captions(self, video_id, webpage): def _get_automatic_captions(self, video_id, player_response, player_config):
"""We need the webpage for getting the captions url, pass it as an """We need the webpage for getting the captions url, pass it as an
argument to speed up the process.""" argument to speed up the process."""
self.to_screen('%s: Looking for automatic captions' % video_id) self.to_screen('%s: Looking for automatic captions' % video_id)
player_config = self._get_ytplayer_config(video_id, webpage)
err_msg = 'Couldn\'t find automatic captions for %s' % video_id err_msg = 'Couldn\'t find automatic captions for %s' % video_id
if not player_config: if not (player_response or player_config):
self._downloader.report_warning(err_msg) self._downloader.report_warning(err_msg)
return {} return {}
try: try:
args = player_config['args'] args = player_config.get('args') if player_config else {}
caption_url = args.get('ttsurl') caption_url = args.get('ttsurl')
if caption_url: if caption_url:
timestamp = args['timestamp'] timestamp = args['timestamp']
@@ -1391,19 +1390,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return captions return captions
# New captions format as of 22.06.2017 # New captions format as of 22.06.2017
player_response = args.get('player_response') if player_response:
if player_response and isinstance(player_response, compat_str): renderer = player_response['captions']['playerCaptionsTracklistRenderer']
player_response = self._parse_json( base_url = renderer['captionTracks'][0]['baseUrl']
player_response, video_id, fatal=False) sub_lang_list = []
if player_response: for lang in renderer['translationLanguages']:
renderer = player_response['captions']['playerCaptionsTracklistRenderer'] lang_code = lang.get('languageCode')
base_url = renderer['captionTracks'][0]['baseUrl'] if lang_code:
sub_lang_list = [] sub_lang_list.append(lang_code)
for lang in renderer['translationLanguages']: return make_captions(base_url, sub_lang_list)
lang_code = lang.get('languageCode')
if lang_code:
sub_lang_list.append(lang_code)
return make_captions(base_url, sub_lang_list)
# Some videos don't provide ttsurl but rather caption_tracks and # Some videos don't provide ttsurl but rather caption_tracks and
# caption_translation_languages (e.g. 20LmZk1hakA) # caption_translation_languages (e.g. 20LmZk1hakA)
@@ -1652,6 +1647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Get video info # Get video info
video_info = {} video_info = {}
embed_webpage = None embed_webpage = None
ytplayer_config = None
if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None: if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None:
age_gate = True age_gate = True
@@ -2276,7 +2272,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# subtitles # subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage) video_subtitles = self.extract_subtitles(video_id, video_webpage)
automatic_captions = self.extract_automatic_captions(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
video_duration = try_get( video_duration = try_get(
video_info, lambda x: int_or_none(x['length_seconds'][0])) video_info, lambda x: int_or_none(x['length_seconds'][0]))