[youtube] Move metadata extraction after video availability check
This commit is contained in:
parent
936784b272
commit
7e72694b5e
1 changed file with 128 additions and 131 deletions
|
@ -530,7 +530,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
|
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
|
||||||
'license': 'Standard YouTube License',
|
'license': 'Standard YouTube License',
|
||||||
'creator': 'Justin Timberlake',
|
'creator': 'Justin Timberlake',
|
||||||
'track': 'Tunnel Vision`',
|
'track': 'Tunnel Vision',
|
||||||
'artist': 'Justin Timberlake',
|
'artist': 'Justin Timberlake',
|
||||||
'age_limit': 18,
|
'age_limit': 18,
|
||||||
}
|
}
|
||||||
|
@ -1698,136 +1698,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
|
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
|
||||||
raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
|
raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)
|
||||||
|
|
||||||
# Start extracting information
|
|
||||||
self.report_information_extraction(video_id)
|
|
||||||
|
|
||||||
# uploader
|
|
||||||
video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
|
|
||||||
if video_uploader:
|
|
||||||
video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
|
|
||||||
else:
|
|
||||||
self._downloader.report_warning('unable to extract uploader name')
|
|
||||||
|
|
||||||
# uploader_id
|
|
||||||
video_uploader_id = None
|
|
||||||
video_uploader_url = None
|
|
||||||
mobj = re.search(
|
|
||||||
r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
|
|
||||||
video_webpage)
|
|
||||||
if mobj is not None:
|
|
||||||
video_uploader_id = mobj.group('uploader_id')
|
|
||||||
video_uploader_url = mobj.group('uploader_url')
|
|
||||||
else:
|
|
||||||
self._downloader.report_warning('unable to extract uploader nickname')
|
|
||||||
|
|
||||||
# thumbnail image
|
|
||||||
# We try first to get a high quality image:
|
|
||||||
m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
|
|
||||||
video_webpage, re.DOTALL)
|
|
||||||
if m_thumb is not None:
|
|
||||||
video_thumbnail = m_thumb.group(1)
|
|
||||||
elif 'thumbnail_url' not in video_info:
|
|
||||||
self._downloader.report_warning('unable to extract video thumbnail')
|
|
||||||
video_thumbnail = None
|
|
||||||
else: # don't panic if we can't find it
|
|
||||||
video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
|
|
||||||
|
|
||||||
# upload date
|
|
||||||
upload_date = self._html_search_meta(
|
|
||||||
'datePublished', video_webpage, 'upload date', default=None)
|
|
||||||
if not upload_date:
|
|
||||||
upload_date = self._search_regex(
|
|
||||||
[r'(?s)id="eow-date.*?>(.*?)</span>',
|
|
||||||
r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
|
|
||||||
video_webpage, 'upload date', default=None)
|
|
||||||
upload_date = unified_strdate(upload_date)
|
|
||||||
|
|
||||||
video_license = self._html_search_regex(
|
|
||||||
r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
|
|
||||||
video_webpage, 'license', default=None)
|
|
||||||
|
|
||||||
m_music = re.search(
|
|
||||||
r'''(?x)
|
|
||||||
<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
|
|
||||||
<ul[^>]*>\s*
|
|
||||||
<li>(?P<title>.+?)
|
|
||||||
by (?P<creator>.+?)
|
|
||||||
(?:
|
|
||||||
\(.+?\)|
|
|
||||||
<a[^>]*
|
|
||||||
(?:
|
|
||||||
\bhref=["\']/red[^>]*>| # drop possible
|
|
||||||
>\s*Listen ad-free with YouTube Red # YouTube Red ad
|
|
||||||
)
|
|
||||||
.*?
|
|
||||||
)?</li
|
|
||||||
''',
|
|
||||||
video_webpage)
|
|
||||||
if m_music:
|
|
||||||
video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
|
|
||||||
video_creator = clean_html(m_music.group('creator'))
|
|
||||||
else:
|
|
||||||
video_alt_title = video_creator = None
|
|
||||||
|
|
||||||
def extract_meta(field):
|
|
||||||
return self._html_search_regex(
|
|
||||||
r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
|
|
||||||
video_webpage, field, default=None)
|
|
||||||
|
|
||||||
track = extract_meta('Song')
|
|
||||||
artist = extract_meta('Artist')
|
|
||||||
|
|
||||||
m_episode = re.search(
|
|
||||||
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
|
|
||||||
video_webpage)
|
|
||||||
if m_episode:
|
|
||||||
series = m_episode.group('series')
|
|
||||||
season_number = int(m_episode.group('season'))
|
|
||||||
episode_number = int(m_episode.group('episode'))
|
|
||||||
else:
|
|
||||||
series = season_number = episode_number = None
|
|
||||||
|
|
||||||
m_cat_container = self._search_regex(
|
|
||||||
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
|
|
||||||
video_webpage, 'categories', default=None)
|
|
||||||
if m_cat_container:
|
|
||||||
category = self._html_search_regex(
|
|
||||||
r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
|
|
||||||
default=None)
|
|
||||||
video_categories = None if category is None else [category]
|
|
||||||
else:
|
|
||||||
video_categories = None
|
|
||||||
|
|
||||||
video_tags = [
|
|
||||||
unescapeHTML(m.group('content'))
|
|
||||||
for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
|
|
||||||
|
|
||||||
def _extract_count(count_name):
|
|
||||||
return str_to_int(self._search_regex(
|
|
||||||
r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
|
|
||||||
% re.escape(count_name),
|
|
||||||
video_webpage, count_name, default=None))
|
|
||||||
|
|
||||||
like_count = _extract_count('like')
|
|
||||||
dislike_count = _extract_count('dislike')
|
|
||||||
|
|
||||||
# subtitles
|
|
||||||
video_subtitles = self.extract_subtitles(video_id, video_webpage)
|
|
||||||
automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
|
|
||||||
|
|
||||||
video_duration = try_get(
|
|
||||||
video_info, lambda x: int_or_none(x['length_seconds'][0]))
|
|
||||||
if not video_duration:
|
|
||||||
video_duration = parse_duration(self._html_search_meta(
|
|
||||||
'duration', video_webpage, 'video duration'))
|
|
||||||
|
|
||||||
# annotations
|
|
||||||
video_annotations = None
|
|
||||||
if self._downloader.params.get('writeannotations', False):
|
|
||||||
video_annotations = self._extract_annotations(video_id)
|
|
||||||
|
|
||||||
chapters = self._extract_chapters(description_original, video_duration)
|
|
||||||
|
|
||||||
def _extract_filesize(media_url):
|
def _extract_filesize(media_url):
|
||||||
return int_or_none(self._search_regex(
|
return int_or_none(self._search_regex(
|
||||||
r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
|
r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
|
||||||
|
@ -2002,6 +1872,133 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|
||||||
raise ExtractorError(error_message, expected=True)
|
raise ExtractorError(error_message, expected=True)
|
||||||
raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
|
raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
|
||||||
|
|
||||||
|
# uploader
|
||||||
|
video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
|
||||||
|
if video_uploader:
|
||||||
|
video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
|
||||||
|
else:
|
||||||
|
self._downloader.report_warning('unable to extract uploader name')
|
||||||
|
|
||||||
|
# uploader_id
|
||||||
|
video_uploader_id = None
|
||||||
|
video_uploader_url = None
|
||||||
|
mobj = re.search(
|
||||||
|
r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
|
||||||
|
video_webpage)
|
||||||
|
if mobj is not None:
|
||||||
|
video_uploader_id = mobj.group('uploader_id')
|
||||||
|
video_uploader_url = mobj.group('uploader_url')
|
||||||
|
else:
|
||||||
|
self._downloader.report_warning('unable to extract uploader nickname')
|
||||||
|
|
||||||
|
# thumbnail image
|
||||||
|
# We try first to get a high quality image:
|
||||||
|
m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
|
||||||
|
video_webpage, re.DOTALL)
|
||||||
|
if m_thumb is not None:
|
||||||
|
video_thumbnail = m_thumb.group(1)
|
||||||
|
elif 'thumbnail_url' not in video_info:
|
||||||
|
self._downloader.report_warning('unable to extract video thumbnail')
|
||||||
|
video_thumbnail = None
|
||||||
|
else: # don't panic if we can't find it
|
||||||
|
video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
|
||||||
|
|
||||||
|
# upload date
|
||||||
|
upload_date = self._html_search_meta(
|
||||||
|
'datePublished', video_webpage, 'upload date', default=None)
|
||||||
|
if not upload_date:
|
||||||
|
upload_date = self._search_regex(
|
||||||
|
[r'(?s)id="eow-date.*?>(.*?)</span>',
|
||||||
|
r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
|
||||||
|
video_webpage, 'upload date', default=None)
|
||||||
|
upload_date = unified_strdate(upload_date)
|
||||||
|
|
||||||
|
video_license = self._html_search_regex(
|
||||||
|
r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
|
||||||
|
video_webpage, 'license', default=None)
|
||||||
|
|
||||||
|
m_music = re.search(
|
||||||
|
r'''(?x)
|
||||||
|
<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
|
||||||
|
<ul[^>]*>\s*
|
||||||
|
<li>(?P<title>.+?)
|
||||||
|
by (?P<creator>.+?)
|
||||||
|
(?:
|
||||||
|
\(.+?\)|
|
||||||
|
<a[^>]*
|
||||||
|
(?:
|
||||||
|
\bhref=["\']/red[^>]*>| # drop possible
|
||||||
|
>\s*Listen ad-free with YouTube Red # YouTube Red ad
|
||||||
|
)
|
||||||
|
.*?
|
||||||
|
)?</li
|
||||||
|
''',
|
||||||
|
video_webpage)
|
||||||
|
if m_music:
|
||||||
|
video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
|
||||||
|
video_creator = clean_html(m_music.group('creator'))
|
||||||
|
else:
|
||||||
|
video_alt_title = video_creator = None
|
||||||
|
|
||||||
|
def extract_meta(field):
|
||||||
|
return self._html_search_regex(
|
||||||
|
r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
|
||||||
|
video_webpage, field, default=None)
|
||||||
|
|
||||||
|
track = extract_meta('Song')
|
||||||
|
artist = extract_meta('Artist')
|
||||||
|
|
||||||
|
m_episode = re.search(
|
||||||
|
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
|
||||||
|
video_webpage)
|
||||||
|
if m_episode:
|
||||||
|
series = m_episode.group('series')
|
||||||
|
season_number = int(m_episode.group('season'))
|
||||||
|
episode_number = int(m_episode.group('episode'))
|
||||||
|
else:
|
||||||
|
series = season_number = episode_number = None
|
||||||
|
|
||||||
|
m_cat_container = self._search_regex(
|
||||||
|
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
|
||||||
|
video_webpage, 'categories', default=None)
|
||||||
|
if m_cat_container:
|
||||||
|
category = self._html_search_regex(
|
||||||
|
r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
|
||||||
|
default=None)
|
||||||
|
video_categories = None if category is None else [category]
|
||||||
|
else:
|
||||||
|
video_categories = None
|
||||||
|
|
||||||
|
video_tags = [
|
||||||
|
unescapeHTML(m.group('content'))
|
||||||
|
for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
|
||||||
|
|
||||||
|
def _extract_count(count_name):
|
||||||
|
return str_to_int(self._search_regex(
|
||||||
|
r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
|
||||||
|
% re.escape(count_name),
|
||||||
|
video_webpage, count_name, default=None))
|
||||||
|
|
||||||
|
like_count = _extract_count('like')
|
||||||
|
dislike_count = _extract_count('dislike')
|
||||||
|
|
||||||
|
# subtitles
|
||||||
|
video_subtitles = self.extract_subtitles(video_id, video_webpage)
|
||||||
|
automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
|
||||||
|
|
||||||
|
video_duration = try_get(
|
||||||
|
video_info, lambda x: int_or_none(x['length_seconds'][0]))
|
||||||
|
if not video_duration:
|
||||||
|
video_duration = parse_duration(self._html_search_meta(
|
||||||
|
'duration', video_webpage, 'video duration'))
|
||||||
|
|
||||||
|
# annotations
|
||||||
|
video_annotations = None
|
||||||
|
if self._downloader.params.get('writeannotations', False):
|
||||||
|
video_annotations = self._extract_annotations(video_id)
|
||||||
|
|
||||||
|
chapters = self._extract_chapters(description_original, video_duration)
|
||||||
|
|
||||||
# Look for the DASH manifest
|
# Look for the DASH manifest
|
||||||
if self._downloader.params.get('youtube_include_dash_manifest', True):
|
if self._downloader.params.get('youtube_include_dash_manifest', True):
|
||||||
dash_mpd_fatal = True
|
dash_mpd_fatal = True
|
||||||
|
|
Loading…
Reference in a new issue