[bbc] Extract description and timestamp from __INITIAL_DATA__ (#28774)

This commit is contained in:
dirkf 2021-04-20 20:51:55 +01:00 committed by GitHub
parent 9f6c03a006
commit 41920fc80e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -11,6 +11,7 @@ from ..compat import (
compat_etree_Element, compat_etree_Element,
compat_HTTPError, compat_HTTPError,
compat_parse_qs, compat_parse_qs,
compat_str,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
compat_urlparse, compat_urlparse,
) )
@@ -25,8 +26,10 @@ from ..utils import (
js_to_json, js_to_json,
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
strip_or_none,
try_get, try_get,
unescapeHTML, unescapeHTML,
unified_timestamp,
url_or_none, url_or_none,
urlencode_postdata, urlencode_postdata,
urljoin, urljoin,
@@ -761,8 +764,17 @@ class BBCIE(BBCCoUkIE):
'only_matching': True, 'only_matching': True,
}, { }, {
# custom redirection to www.bbc.com # custom redirection to www.bbc.com
# also, video with window.__INITIAL_DATA__
'url': 'http://www.bbc.co.uk/news/science-environment-33661876', 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
'only_matching': True, 'info_dict': {
'id': 'p02xzws1',
'ext': 'mp4',
'title': "Pluto may have 'nitrogen glaciers'",
'description': "Pluto could have glaciers of nitrogen ice, new photographs from Nasa's New Horizons probe suggest.",
'thumbnail': r're:https?://.+/.+\.jpg',
'timestamp': 1437785037,
'upload_date': '20150725',
},
}, { }, {
# single video article embedded with data-media-vpid # single video article embedded with data-media-vpid
'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
@@ -1164,12 +1176,23 @@ class BBCIE(BBCCoUkIE):
continue continue
formats, subtitles = self._download_media_selector(item_id) formats, subtitles = self._download_media_selector(item_id)
self._sort_formats(formats) self._sort_formats(formats)
item_desc = try_get(
media,
lambda x: x['summary']['blocks'][0]['model']['text'],
compat_str)
item_time = None
for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
if try_get(meta, lambda x: x['label']) == 'Published':
item_time = unified_timestamp(meta.get('timestamp'))
break
entries.append({ entries.append({
'id': item_id, 'id': item_id,
'title': item_title, 'title': item_title,
'thumbnail': item.get('holdingImageUrl'), 'thumbnail': item.get('holdingImageUrl'),
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
'timestamp': item_time,
'description': strip_or_none(item_desc),
}) })
for resp in (initial_data.get('data') or {}).values(): for resp in (initial_data.get('data') or {}).values():
name = resp.get('name') name = resp.get('name')