[common:jwplayer] Improve jwplayer extraction and parsing (#31000)
* don't crash parser if jwplayer_data is invalid (empty, or no formats)
* use `label` in `sources[n]` as `format_id`
* relax `jwplayer().setup(...)` RE (also rework PR #27274 enhancement)
* detect more manifest formats in _parse_jwplayer_formats() (from PR #29596)
* improve metadata extraction (from PR #25433)
* remember URLs in a set
* use parse_resolution() in format
* extract filesize in format (from yt-dlp)

Co-authored-by: kikuyan <kikuyan@users.noreply.github.com>
Co-authored-by: martin54 <martin54@users.noreply.github.com>
This commit is contained in:
parent
47e70fff8b
commit
604762a9f8
1 changed file with 29 additions and 11 deletions
|
@ -70,6 +70,7 @@ from ..utils import (
|
||||||
str_or_none,
|
str_or_none,
|
||||||
str_to_int,
|
str_to_int,
|
||||||
strip_or_none,
|
strip_or_none,
|
||||||
|
try_get,
|
||||||
unescapeHTML,
|
unescapeHTML,
|
||||||
unified_strdate,
|
unified_strdate,
|
||||||
unified_timestamp,
|
unified_timestamp,
|
||||||
|
@ -2713,7 +2714,7 @@ class InfoExtractor(object):
|
||||||
|
|
||||||
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
|
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
|
||||||
mobj = re.search(
|
mobj = re.search(
|
||||||
r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
|
r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
|
||||||
webpage)
|
webpage)
|
||||||
if mobj:
|
if mobj:
|
||||||
try:
|
try:
|
||||||
|
@ -2734,9 +2735,14 @@ class InfoExtractor(object):
|
||||||
|
|
||||||
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
|
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
|
||||||
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
|
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
|
||||||
|
flat_pl = try_get(jwplayer_data, lambda x: x.get('playlist') or True)
|
||||||
|
if flat_pl is None:
|
||||||
|
# not even a dict
|
||||||
|
return []
|
||||||
|
|
||||||
# JWPlayer backward compatibility: flattened playlists
|
# JWPlayer backward compatibility: flattened playlists
|
||||||
# https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
|
# https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
|
||||||
if 'playlist' not in jwplayer_data:
|
if flat_pl is True:
|
||||||
jwplayer_data = {'playlist': [jwplayer_data]}
|
jwplayer_data = {'playlist': [jwplayer_data]}
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
|
@ -2784,6 +2790,13 @@ class InfoExtractor(object):
|
||||||
'timestamp': int_or_none(video_data.get('pubdate')),
|
'timestamp': int_or_none(video_data.get('pubdate')),
|
||||||
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
|
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
|
||||||
'subtitles': subtitles,
|
'subtitles': subtitles,
|
||||||
|
'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
|
||||||
|
'genre': clean_html(video_data.get('genre')),
|
||||||
|
'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
|
||||||
|
'season_number': int_or_none(video_data.get('season')),
|
||||||
|
'episode_number': int_or_none(video_data.get('episode')),
|
||||||
|
'release_year': int_or_none(video_data.get('releasedate')),
|
||||||
|
'age_limit': int_or_none(video_data.get('age_restriction')),
|
||||||
}
|
}
|
||||||
# https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
|
# https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
|
||||||
if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
|
if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
|
||||||
|
@ -2792,7 +2805,9 @@ class InfoExtractor(object):
|
||||||
'url': formats[0]['url'],
|
'url': formats[0]['url'],
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
self._sort_formats(formats)
|
# avoid exception in case of only sttls
|
||||||
|
if formats:
|
||||||
|
self._sort_formats(formats)
|
||||||
entry['formats'] = formats
|
entry['formats'] = formats
|
||||||
entries.append(entry)
|
entries.append(entry)
|
||||||
if len(entries) == 1:
|
if len(entries) == 1:
|
||||||
|
@ -2802,7 +2817,7 @@ class InfoExtractor(object):
|
||||||
|
|
||||||
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
|
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
|
||||||
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
|
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
|
||||||
urls = []
|
urls = set()
|
||||||
formats = []
|
formats = []
|
||||||
for source in jwplayer_sources_data:
|
for source in jwplayer_sources_data:
|
||||||
if not isinstance(source, dict):
|
if not isinstance(source, dict):
|
||||||
|
@ -2811,14 +2826,14 @@ class InfoExtractor(object):
|
||||||
base_url, self._proto_relative_url(source.get('file')))
|
base_url, self._proto_relative_url(source.get('file')))
|
||||||
if not source_url or source_url in urls:
|
if not source_url or source_url in urls:
|
||||||
continue
|
continue
|
||||||
urls.append(source_url)
|
urls.add(source_url)
|
||||||
source_type = source.get('type') or ''
|
source_type = source.get('type') or ''
|
||||||
ext = mimetype2ext(source_type) or determine_ext(source_url)
|
ext = mimetype2ext(source_type) or determine_ext(source_url)
|
||||||
if source_type == 'hls' or ext == 'm3u8':
|
if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
|
||||||
formats.extend(self._extract_m3u8_formats(
|
formats.extend(self._extract_m3u8_formats(
|
||||||
source_url, video_id, 'mp4', entry_protocol='m3u8_native',
|
source_url, video_id, 'mp4', entry_protocol='m3u8_native',
|
||||||
m3u8_id=m3u8_id, fatal=False))
|
m3u8_id=m3u8_id, fatal=False))
|
||||||
elif source_type == 'dash' or ext == 'mpd':
|
elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
|
||||||
formats.extend(self._extract_mpd_formats(
|
formats.extend(self._extract_mpd_formats(
|
||||||
source_url, video_id, mpd_id=mpd_id, fatal=False))
|
source_url, video_id, mpd_id=mpd_id, fatal=False))
|
||||||
elif ext == 'smil':
|
elif ext == 'smil':
|
||||||
|
@ -2833,20 +2848,23 @@ class InfoExtractor(object):
|
||||||
'ext': ext,
|
'ext': ext,
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
|
format_id = str_or_none(source.get('label'))
|
||||||
height = int_or_none(source.get('height'))
|
height = int_or_none(source.get('height'))
|
||||||
if height is None:
|
if height is None and format_id:
|
||||||
# Often no height is provided but there is a label in
|
# Often no height is provided but there is a label in
|
||||||
# format like "1080p", "720p SD", or 1080.
|
# format like "1080p", "720p SD", or 1080.
|
||||||
height = int_or_none(self._search_regex(
|
height = parse_resolution(format_id).get('height')
|
||||||
r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
|
|
||||||
'height', default=None))
|
|
||||||
a_format = {
|
a_format = {
|
||||||
'url': source_url,
|
'url': source_url,
|
||||||
'width': int_or_none(source.get('width')),
|
'width': int_or_none(source.get('width')),
|
||||||
'height': height,
|
'height': height,
|
||||||
'tbr': int_or_none(source.get('bitrate'), scale=1000),
|
'tbr': int_or_none(source.get('bitrate'), scale=1000),
|
||||||
|
'filesize': int_or_none(source.get('filesize')),
|
||||||
'ext': ext,
|
'ext': ext,
|
||||||
}
|
}
|
||||||
|
if format_id:
|
||||||
|
a_format['format_id'] = format_id
|
||||||
|
|
||||||
if source_url.startswith('rtmp'):
|
if source_url.startswith('rtmp'):
|
||||||
a_format['ext'] = 'flv'
|
a_format['ext'] = 'flv'
|
||||||
# See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
|
# See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
|
||||||
|
|
Loading…
Reference in a new issue