[extractor/common] Improve JSON-LD interaction statistic extraction (refs #23306)
This commit is contained in:
parent
45b0a0d11b
commit
1727541315
2 changed files with 60 additions and 2 deletions
|
@ -98,6 +98,56 @@ class TestInfoExtractor(unittest.TestCase):
|
||||||
self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
|
self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
|
||||||
self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
|
self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
|
||||||
|
|
||||||
|
def test_search_json_ld_realworld(self):
    # Regression test built from a real page's JSON-LD, see
    # https://github.com/ytdl-org/youtube-dl/issues/23306
    #
    # In this sample "interactionStatistic" is a single object (not a list)
    # and its "interactionType" is itself an object carrying "@type".
    webpage = r'''<script type="application/ld+json">
{
"@context": "http://schema.org/",
"@type": "VideoObject",
"name": "1 On 1 With Kleio",
"url": "https://www.eporner.com/hd-porn/xN49A1cT3eB/1-On-1-With-Kleio/",
"duration": "PT0H12M23S",
"thumbnailUrl": ["https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", "https://imggen.eporner.com/780814/1920/1080/9.jpg"],
"contentUrl": "https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4",
"embedUrl": "https://www.eporner.com/embed/xN49A1cT3eB/1-On-1-With-Kleio/",
"image": "https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg",
"width": "1920",
"height": "1080",
"encodingFormat": "mp4",
"bitrate": "6617kbps",
"isFamilyFriendly": "False",
"description": "Kleio Valentien",
"uploadDate": "2015-12-05T21:24:35+01:00",
"interactionStatistic": {
"@type": "InteractionCounter",
"interactionType": { "@type": "http://schema.org/WatchAction" },
"userInteractionCount": 1120958
}, "aggregateRating": {
"@type": "AggregateRating",
"ratingValue": "88",
"ratingCount": "630",
"bestRating": "100",
"worstRating": "0"
}, "actor": [{
"@type": "Person",
"name": "Kleio Valentien",
"url": "https://www.eporner.com/pornstar/kleio-valentien/"
}]}
</script>'''

    # Run the extractor on the fixture; the second argument (video id) is None.
    extracted = self.ie._search_json_ld(webpage, None)

    # Fields the extractor is expected to derive from the fixture above.
    expected = {
        'title': '1 On 1 With Kleio',
        'description': 'Kleio Valentien',
        'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
        'timestamp': 1449347075,
        'duration': 743.0,
        'view_count': 1120958,
        'width': 1920,
        'height': 1080,
    }

    expect_dict(self, extracted, expected)
|
||||||
|
|
||||||
|
|
||||||
def test_download_json(self):
|
def test_download_json(self):
|
||||||
uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
|
uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
|
||||||
self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'})
|
self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'})
|
||||||
|
|
|
@ -1237,8 +1237,16 @@ class InfoExtractor(object):
|
||||||
'ViewAction': 'view',
|
'ViewAction': 'view',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def extract_interaction_type(e):
    # Look up the 'interactionType' entry of an interaction counter.
    # It may be stored directly, or wrapped in a dict whose '@type' key
    # holds the value; in the latter case unwrap it first.
    # The result is passed through str_or_none before being returned.
    raw_type = e.get('interactionType')
    unwrapped = raw_type.get('@type') if isinstance(raw_type, dict) else raw_type
    return str_or_none(unwrapped)
|
||||||
|
|
||||||
def extract_interaction_statistic(e):
|
def extract_interaction_statistic(e):
|
||||||
interaction_statistic = e.get('interactionStatistic')
|
interaction_statistic = e.get('interactionStatistic')
|
||||||
|
if isinstance(interaction_statistic, dict):
|
||||||
|
interaction_statistic = [interaction_statistic]
|
||||||
if not isinstance(interaction_statistic, list):
|
if not isinstance(interaction_statistic, list):
|
||||||
return
|
return
|
||||||
for is_e in interaction_statistic:
|
for is_e in interaction_statistic:
|
||||||
|
@ -1246,8 +1254,8 @@ class InfoExtractor(object):
|
||||||
continue
|
continue
|
||||||
if is_e.get('@type') != 'InteractionCounter':
|
if is_e.get('@type') != 'InteractionCounter':
|
||||||
continue
|
continue
|
||||||
interaction_type = is_e.get('interactionType')
|
interaction_type = extract_interaction_type(is_e)
|
||||||
if not isinstance(interaction_type, compat_str):
|
if not interaction_type:
|
||||||
continue
|
continue
|
||||||
# For interaction count some sites provide string instead of
|
# For interaction count some sites provide string instead of
|
||||||
# an integer (as per spec) with non digit characters (e.g. ",")
|
# an integer (as per spec) with non digit characters (e.g. ",")
|
||||||
|
|
Loading…
Reference in a new issue