[extractor/common] Fix inline HTML5 media tags processing and add test (closes #27345)
This commit is contained in:
parent
e2bdf8bf4f
commit
5a1fbbf8b7
2 changed files with 15 additions and 3 deletions
|
@ -108,6 +108,18 @@ class TestInfoExtractor(unittest.TestCase):
|
||||||
self.assertEqual(self.ie._download_json(uri, None, fatal=False), None)
|
self.assertEqual(self.ie._download_json(uri, None, fatal=False), None)
|
||||||
|
|
||||||
def test_parse_html5_media_entries(self):
|
def test_parse_html5_media_entries(self):
|
||||||
|
# inline video tag
|
||||||
|
expect_dict(
|
||||||
|
self,
|
||||||
|
self.ie._parse_html5_media_entries(
|
||||||
|
'https://127.0.0.1/video.html',
|
||||||
|
r'<html><video src="/vid.mp4" /></html>', None)[0],
|
||||||
|
{
|
||||||
|
'formats': [{
|
||||||
|
'url': 'https://127.0.0.1/vid.mp4',
|
||||||
|
}],
|
||||||
|
})
|
||||||
|
|
||||||
# from https://www.r18.com/
|
# from https://www.r18.com/
|
||||||
# with kbps in label
|
# with kbps in label
|
||||||
expect_dict(
|
expect_dict(
|
||||||
|
|
|
@ -2515,9 +2515,9 @@ class InfoExtractor(object):
|
||||||
# https://www.ampproject.org/docs/reference/components/amp-video)
|
# https://www.ampproject.org/docs/reference/components/amp-video)
|
||||||
# For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
|
# For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
|
||||||
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
|
_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
|
||||||
media_tags = [(media_tag, media_type, '')
|
media_tags = [(media_tag, media_tag_name, media_type, '')
|
||||||
for media_tag, media_type
|
for media_tag, media_tag_name, media_type
|
||||||
in re.findall(r'(?s)(<%s[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
|
in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
|
||||||
media_tags.extend(re.findall(
|
media_tags.extend(re.findall(
|
||||||
# We only allow video|audio followed by a whitespace or '>'.
|
# We only allow video|audio followed by a whitespace or '>'.
|
||||||
# Allowing more characters may end up in significant slow down (see
|
# Allowing more characters may end up in significant slow down (see
|
||||||
|
|
Loading…
Reference in a new issue