[arte] Add support for playlists and rework tests (Closes #9632)
This commit is contained in:
parent
6a1df4fb5f
commit
6e6b9f600f
2 changed files with 110 additions and 64 deletions
|
@ -61,10 +61,7 @@ class ArteTvIE(InfoExtractor):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class ArteTVPlus7IE(InfoExtractor):
|
class ArteTVBaseIE(InfoExtractor):
|
||||||
IE_NAME = 'arte.tv:+7'
|
|
||||||
_VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)'
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _extract_url_info(cls, url):
|
def _extract_url_info(cls, url):
|
||||||
mobj = re.match(cls._VALID_URL, url)
|
mobj = re.match(cls._VALID_URL, url)
|
||||||
|
@ -78,60 +75,6 @@ class ArteTVPlus7IE(InfoExtractor):
|
||||||
video_id = mobj.group('id')
|
video_id = mobj.group('id')
|
||||||
return video_id, lang
|
return video_id, lang
|
||||||
|
|
||||||
def _real_extract(self, url):
    """Download the page at `url` and extract the video it describes.

    The video id and language come from `_extract_url_info`; the
    downloaded page is handed to `_extract_from_webpage`, whose result
    is returned unchanged.
    """
    video_id, lang = self._extract_url_info(url)
    return self._extract_from_webpage(
        self._download_webpage(url, video_id), video_id, lang)
|
|
||||||
|
|
||||||
def _extract_from_webpage(self, webpage, video_id, lang):
    """Find the player JSON URL in `webpage` and extract from it.

    When a JSON URL is found (directly, or through the `json_url` query
    parameter of an iframe), the result of `_extract_from_json_url` is
    returned, together with the title scraped from an `<h3 title=...>`
    tag when one is present. Otherwise the first iframe `src` in the
    page is returned through `url_result`.
    """
    def iframe_with_json_url(html, default=NO_DEFAULT):
        # Iframe whose src carries a json_url query parameter.
        return self._html_search_regex(
            r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
            html, 'iframe url', group='url', default=default)

    def iframe_via_oembed():
        # The page may only reference an oEmbed document whose 'html'
        # field contains the iframe.
        oembed_url = self._html_search_regex(
            r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None)
        if not oembed_url:
            return None
        player = self._download_json(
            oembed_url, video_id, 'Downloading player page')
        return iframe_with_json_url(player['html'])

    def iframe_via_program():
        # en and es URLs produce react-based pages with different layout (e.g.
        # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
        program = self._search_regex(
            r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
            webpage, 'program', default=None)
        if not program:
            return None
        program_data = self._parse_json(program, video_id)
        if not program_data:
            return None
        return iframe_with_json_url(program_data['embed_html'])

    # some pages contain multiple videos (like
    # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
    # so we first try to look for json URLs that contain the video id from
    # the 'vid' parameter.
    templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']')
    patterns = []
    for candidate_id in (video_id, ''):
        for template in templates:
            patterns.append(template % re.escape(candidate_id))
    json_url = self._html_search_regex(
        patterns, webpage, 'json vp url', default=None)

    if not json_url:
        # Each fallback only runs when the previous one found nothing.
        iframe_url = (
            iframe_with_json_url(webpage, None)
            or iframe_via_oembed()
            or iframe_via_program())
        if iframe_url:
            query = compat_urllib_parse_urlparse(iframe_url).query
            json_url = compat_parse_qs(query)['json_url'][0]

    if not json_url:
        # Different kind of embed URL (e.g.
        # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
        fallback_url = self._search_regex(
            r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
            webpage, 'embed url', group='url')
        return self.url_result(fallback_url)

    title = self._search_regex(
        r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
        webpage, 'title', default=None, group='title')
    return self._extract_from_json_url(json_url, video_id, lang, title=title)
|
|
||||||
|
|
||||||
def _extract_from_json_url(self, json_url, video_id, lang, title=None):
|
def _extract_from_json_url(self, json_url, video_id, lang, title=None):
|
||||||
info = self._download_json(json_url, video_id)
|
info = self._download_json(json_url, video_id)
|
||||||
player_info = info['videoJsonPlayer']
|
player_info = info['videoJsonPlayer']
|
||||||
|
@ -235,6 +178,74 @@ class ArteTVPlus7IE(InfoExtractor):
|
||||||
return info_dict
|
return info_dict
|
||||||
|
|
||||||
|
|
||||||
|
class ArteTVPlus7IE(ArteTVBaseIE):
    """Extractor for arte.tv guide pages (`arte.tv:+7`)."""

    IE_NAME = 'arte.tv:+7'
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)'

    _TESTS = [{
        'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return whether this extractor handles `url`.

        Any URL accepted by ArteTVPlaylistIE is rejected here; every
        other URL is decided by the inherited `suitable`.
        """
        if ArteTVPlaylistIE.suitable(url):
            return False
        return super(ArteTVPlus7IE, cls).suitable(url)

    def _real_extract(self, url):
        """Download the page at `url` and extract the video it describes."""
        video_id, lang = self._extract_url_info(url)
        return self._extract_from_webpage(
            self._download_webpage(url, video_id), video_id, lang)

    def _extract_from_webpage(self, webpage, video_id, lang):
        """Find the player JSON URL in `webpage` and extract from it.

        When a JSON URL is found (directly, or through the `json_url`
        query parameter of an iframe), the result of
        `_extract_from_json_url` is returned, together with the title
        scraped from an `<h3 title=...>` tag when one is present.
        Otherwise the first iframe `src` in the page is returned through
        `url_result`.
        """
        def iframe_with_json_url(html, default=NO_DEFAULT):
            # Iframe whose src carries a json_url query parameter.
            return self._html_search_regex(
                r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
                html, 'iframe url', group='url', default=default)

        def iframe_via_oembed():
            # The page may only reference an oEmbed document whose
            # 'html' field contains the iframe.
            oembed_url = self._html_search_regex(
                r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None)
            if not oembed_url:
                return None
            player = self._download_json(
                oembed_url, video_id, 'Downloading player page')
            return iframe_with_json_url(player['html'])

        def iframe_via_program():
            # en and es URLs produce react-based pages with different layout (e.g.
            # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
            program = self._search_regex(
                r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
                webpage, 'program', default=None)
            if not program:
                return None
            program_data = self._parse_json(program, video_id)
            if not program_data:
                return None
            return iframe_with_json_url(program_data['embed_html'])

        # some pages contain multiple videos (like
        # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
        # so we first try to look for json URLs that contain the video id from
        # the 'vid' parameter.
        templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']')
        patterns = []
        for candidate_id in (video_id, ''):
            for template in templates:
                patterns.append(template % re.escape(candidate_id))
        json_url = self._html_search_regex(
            patterns, webpage, 'json vp url', default=None)

        if not json_url:
            # Each fallback only runs when the previous one found nothing.
            iframe_url = (
                iframe_with_json_url(webpage, None)
                or iframe_via_oembed()
                or iframe_via_program())
            if iframe_url:
                query = compat_urllib_parse_urlparse(iframe_url).query
                json_url = compat_parse_qs(query)['json_url'][0]

        if not json_url:
            # Different kind of embed URL (e.g.
            # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
            fallback_url = self._search_regex(
                r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
                webpage, 'embed url', group='url')
            return self.url_result(fallback_url)

        title = self._search_regex(
            r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
            webpage, 'title', default=None, group='title')
        return self._extract_from_json_url(json_url, video_id, lang, title=title)
|
||||||
|
|
||||||
|
|
||||||
# It also uses the arte_vp_url url from the webpage to extract the information
|
# It also uses the arte_vp_url url from the webpage to extract the information
|
||||||
class ArteTVCreativeIE(ArteTVPlus7IE):
|
class ArteTVCreativeIE(ArteTVPlus7IE):
|
||||||
IE_NAME = 'arte.tv:creative'
|
IE_NAME = 'arte.tv:creative'
|
||||||
|
@ -267,7 +278,7 @@ class ArteTVInfoIE(ArteTVPlus7IE):
|
||||||
IE_NAME = 'arte.tv:info'
|
IE_NAME = 'arte.tv:info'
|
||||||
_VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
|
_VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
|
||||||
|
|
||||||
_TEST = {
|
_TESTS = [{
|
||||||
'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere',
|
'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
'id': '067528-000-A',
|
'id': '067528-000-A',
|
||||||
|
@ -275,7 +286,7 @@ class ArteTVInfoIE(ArteTVPlus7IE):
|
||||||
'title': 'Service civique, un cache misère ?',
|
'title': 'Service civique, un cache misère ?',
|
||||||
'upload_date': '20160403',
|
'upload_date': '20160403',
|
||||||
},
|
},
|
||||||
}
|
}]
|
||||||
|
|
||||||
|
|
||||||
class ArteTVFutureIE(ArteTVPlus7IE):
|
class ArteTVFutureIE(ArteTVPlus7IE):
|
||||||
|
@ -300,6 +311,8 @@ class ArteTVDDCIE(ArteTVPlus7IE):
|
||||||
IE_NAME = 'arte.tv:ddc'
|
IE_NAME = 'arte.tv:ddc'
|
||||||
_VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)'
|
_VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)'
|
||||||
|
|
||||||
|
_TESTS = []
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id, lang = self._extract_url_info(url)
|
video_id, lang = self._extract_url_info(url)
|
||||||
if lang == 'folge':
|
if lang == 'folge':
|
||||||
|
@ -318,7 +331,7 @@ class ArteTVConcertIE(ArteTVPlus7IE):
|
||||||
IE_NAME = 'arte.tv:concert'
|
IE_NAME = 'arte.tv:concert'
|
||||||
_VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
|
_VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
|
||||||
|
|
||||||
_TEST = {
|
_TESTS = [{
|
||||||
'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
|
'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
|
||||||
'md5': '9ea035b7bd69696b67aa2ccaaa218161',
|
'md5': '9ea035b7bd69696b67aa2ccaaa218161',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
|
@ -328,14 +341,14 @@ class ArteTVConcertIE(ArteTVPlus7IE):
|
||||||
'upload_date': '20140128',
|
'upload_date': '20140128',
|
||||||
'description': 'md5:486eb08f991552ade77439fe6d82c305',
|
'description': 'md5:486eb08f991552ade77439fe6d82c305',
|
||||||
},
|
},
|
||||||
}
|
}]
|
||||||
|
|
||||||
|
|
||||||
class ArteTVCinemaIE(ArteTVPlus7IE):
|
class ArteTVCinemaIE(ArteTVPlus7IE):
|
||||||
IE_NAME = 'arte.tv:cinema'
|
IE_NAME = 'arte.tv:cinema'
|
||||||
_VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)'
|
_VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)'
|
||||||
|
|
||||||
_TEST = {
|
_TESTS = [{
|
||||||
'url': 'http://cinema.arte.tv/de/node/38291',
|
'url': 'http://cinema.arte.tv/de/node/38291',
|
||||||
'md5': '6b275511a5107c60bacbeeda368c3aa1',
|
'md5': '6b275511a5107c60bacbeeda368c3aa1',
|
||||||
'info_dict': {
|
'info_dict': {
|
||||||
|
@ -345,7 +358,7 @@ class ArteTVCinemaIE(ArteTVPlus7IE):
|
||||||
'upload_date': '20160122',
|
'upload_date': '20160122',
|
||||||
'description': 'md5:7f749bbb77d800ef2be11d54529b96bc',
|
'description': 'md5:7f749bbb77d800ef2be11d54529b96bc',
|
||||||
},
|
},
|
||||||
}
|
}]
|
||||||
|
|
||||||
|
|
||||||
class ArteTVMagazineIE(ArteTVPlus7IE):
|
class ArteTVMagazineIE(ArteTVPlus7IE):
|
||||||
|
@ -390,9 +403,41 @@ class ArteTVEmbedIE(ArteTVPlus7IE):
|
||||||
)
|
)
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
_TESTS = []
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
mobj = re.match(self._VALID_URL, url)
|
mobj = re.match(self._VALID_URL, url)
|
||||||
video_id = mobj.group('id')
|
video_id = mobj.group('id')
|
||||||
lang = mobj.group('lang')
|
lang = mobj.group('lang')
|
||||||
json_url = mobj.group('json_url')
|
json_url = mobj.group('json_url')
|
||||||
return self._extract_from_json_url(json_url, video_id, lang)
|
return self._extract_from_json_url(json_url, video_id, lang)
|
||||||
|
|
||||||
|
|
||||||
|
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for arte.tv guide collection URLs (`arte.tv:playlist`)."""

    IE_NAME = 'arte.tv:playlist'
    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)'

    _TESTS = [{
        'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV',
        'info_dict': {
            'id': 'PL-013263',
            'title': 'Areva & Uramin',
        },
        'playlist_mincount': 6,
    }, {
        'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the collection behind `url` and return it as a playlist.

        The collection JSON is downloaded from the player API. Each
        video with a non-empty 'jsonUrl' becomes one entry, extracted
        through `_extract_from_json_url`; videos without one are skipped.
        """
        playlist_id, lang = self._extract_url_info(url)
        api_url = (
            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
            % (lang, playlist_id))
        collection = self._download_json(api_url, playlist_id)

        title = collection.get('title')
        description = collection.get('shortDescription') or collection.get('teaserText')

        entries = []
        for video in collection['videos']:
            if not video.get('jsonUrl'):
                continue
            # Use the playlist id as the entry id when the video has no
            # 'programId' of its own.
            entry_id = video.get('programId') or playlist_id
            entries.append(
                self._extract_from_json_url(video['jsonUrl'], entry_id, lang))

        return self.playlist_result(entries, playlist_id, title, description)
|
||||||
|
|
|
@ -56,6 +56,7 @@ from .arte import (
|
||||||
ArteTVDDCIE,
|
ArteTVDDCIE,
|
||||||
ArteTVMagazineIE,
|
ArteTVMagazineIE,
|
||||||
ArteTVEmbedIE,
|
ArteTVEmbedIE,
|
||||||
|
ArteTVPlaylistIE,
|
||||||
)
|
)
|
||||||
from .atresplayer import AtresPlayerIE
|
from .atresplayer import AtresPlayerIE
|
||||||
from .atttechchannel import ATTTechChannelIE
|
from .atttechchannel import ATTTechChannelIE
|
||||||
|
|
Loading…
Reference in a new issue