From 4759543f6e5d532795eb1d5434692bb6d5e1f0ec Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 11 Jan 2021 17:35:13 +0100 Subject: [PATCH] [youtube:search] fix view_count and try to extract all video sections(closes #27588)(closes #27604) --- youtube_dl/extractor/youtube.py | 117 ++++++++++++++------------------ 1 file changed, 50 insertions(+), 67 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 87bdc1677..f57099f8c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -308,6 +308,36 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg', default='{}'), video_id, fatal=False) + def _extract_video(self, renderer): + video_id = renderer['videoId'] + title = try_get( + renderer, + (lambda x: x['title']['runs'][0]['text'], + lambda x: x['title']['simpleText']), compat_str) + description = try_get( + renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], + compat_str) + duration = parse_duration(try_get( + renderer, lambda x: x['lengthText']['simpleText'], compat_str)) + view_count_text = try_get( + renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' + view_count = str_to_int(self._search_regex( + r'^([\d,]+)', re.sub(r'\s', '', view_count_text), + 'view count', default=None)) + uploader = try_get( + renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) + return { + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'id': video_id, + 'url': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'uploader': uploader, + } + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' @@ -2765,36 +2795,6 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if renderer: return renderer - def _extract_video(self, renderer): - video_id = renderer.get('videoId') - title = try_get( - renderer, - (lambda x: x['title']['runs'][0]['text'], - lambda x: x['title']['simpleText']), compat_str) - description = try_get( - renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], - compat_str) - duration = parse_duration(try_get( - renderer, lambda x: x['lengthText']['simpleText'], compat_str)) - view_count_text = try_get( - renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = str_to_int(self._search_regex( - r'^([\d,]+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) - uploader = try_get( - renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) - return { - '_type': 'url_transparent', - 'ie_key': YoutubeIE.ie_key(), - 'id': video_id, - 'url': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, - } - def _grid_entries(self, grid_renderer): for item in grid_renderer['items']: if not isinstance(item, dict): @@ -3417,46 +3417,29 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor): list) if not slr_contents: break - isr_contents = try_get( - slr_contents, - lambda x: x[0]['itemSectionRenderer']['contents'], - list) - if not isr_contents: - break - for content in isr_contents: - if not isinstance(content, dict): + for slr_content in slr_contents: + isr_contents = try_get( + slr_content, + lambda x: x['itemSectionRenderer']['contents'], + list) + if not isr_contents: continue - video = content.get('videoRenderer') - if not isinstance(video, dict): - continue - video_id = video.get('videoId') - if not video_id: - continue - title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str) - description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str) - duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str)) - view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or '' - view_count = int_or_none(self._search_regex( - r'^(\d+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) - uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str) - total += 1 - yield { - '_type': 'url_transparent', - 'ie_key': YoutubeIE.ie_key(), - 'id': video_id, - 'url': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, - } - if total == n: - return + for content in isr_contents: + if not isinstance(content, dict): + continue + video = content.get('videoRenderer') + if not isinstance(video, dict): + continue + video_id = video.get('videoId') + if not video_id: + continue + yield self._extract_video(video) + total += 1 + if total == n: + return token = try_get( slr_contents, - lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], + lambda x: x[-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'], compat_str) if not token: break