diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 919f4f987..f2c577f98 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -91,6 +91,7 @@ from .anvato import AnvatoIE
 from .washingtonpost import WashingtonPostIE
 from .wistia import WistiaIE
 from .mediaset import MediasetIE
+from .joj import JojIE
 
 
 class GenericIE(InfoExtractor):
@@ -1770,6 +1771,16 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': [MediasetIE.ie_key()],
         },
+        {
+            # JOJ.sk embeds
+            'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+            'info_dict': {
+                'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+                'title': 'Slovenskom sa prehnala vlna silných búrok',
+            },
+            'playlist_mincount': 5,
+            'add_ie': [JojIE.ie_key()],
+        },
         {
             # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
             'url': 'https://tvrain.ru/amp/418921/',
@@ -2722,6 +2733,12 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(
                 mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
 
+        # Look for JOJ.sk embeds
+        joj_urls = JojIE._extract_urls(webpage)
+        if joj_urls:
+            return self.playlist_from_matches(
+                joj_urls, video_id, video_title, ie=JojIE.ie_key())
+
         def merge_dicts(dict1, dict2):
             merged = {}
             for k, v in dict1.items():
diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py
index 2ebfec902..a764023e9 100755
--- a/youtube_dl/extractor/joj.py
+++ b/youtube_dl/extractor/joj.py
@@ -1,56 +1,103 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor
 import re
 
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    try_get,
+)
+
 
 class JojIE(InfoExtractor):
-    _VALID_URL = r'https?://[a-z0-9]+\.joj\.sk/([^/]+/)*(?P<title_query>(?P<release_date>[0-9]{4}(-[0-9]{2}){2}).*)'  # noqa
+    _VALID_URL = r'''(?x)
+                    (?:
+                        joj:|
+                        https?://media\.joj\.sk/embed/
+                    )
+                    (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
+                '''
     _TESTS = [{
-        'url': 'https://www.joj.sk/nove-byvanie/archiv/2017-05-28-nove-byvanie',  # noqa
+        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
         'info_dict': {
             'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
             'ext': 'mp4',
-            'title': 'Nové Bývanie',
-            'release_date': '20170528'
+            'title': 'NOVÉ BÝVANIE',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 3118,
         }
     }, {
-        'url': 'http://nasi.joj.sk/epizody/2016-09-06-stari-rodicia',
-        'info_dict': {
-            'id': 'f18b2c5f-9ea8-4941-a164-a814c53306ad',
-            'ext': 'mp4',
-            'title': 'Starí Rodičia',
-            'release_date': '20160906'
-        }
+        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
+        'only_matching': True,
     }]
 
-    media_src_url = 'http://n16.joj.sk/storage/'
-    xml_source_url = 'https://media.joj.sk/services/Video.php?clip='
+    @staticmethod
+    def _extract_urls(webpage):
+        """Return the media.joj.sk embed URLs found in iframe src attributes of webpage."""
+        return re.findall(
+            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+            webpage)
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        release_date = mobj.group('release_date').replace('-', '')
-        webpage = self._download_webpage(url, 'id')
-        video_id = self._html_search_regex(
-            r'https?://([a-z0-9]+\.)joj\.sk/embed/(?P<video_id>[a-f0-9\-]+)',
-            webpage, 'id', group='video_id')
-        xml_playlist_url = self.xml_source_url + video_id
-        xml_playlist_et = self._download_xml(xml_playlist_url, 'XML playlist')
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://media.joj.sk/embed/%s' % video_id, video_id)
+
+        title = self._search_regex(
+            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'<title>(?P<title>[^<]+)'), webpage, 'title',
+            default=None, group='title') or self._og_search_title(webpage)
+
+        bitrates = self._parse_json(
+            self._search_regex(
+                r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates',
+                default='{}'),
+            video_id, transform_source=js_to_json, fatal=False)
+
         formats = []
-        for file_el in xml_playlist_et.findall('files/file'):
-            try:
-                height = int(file_el.attrib['id'].replace('p', ''))
-            except ValueError:
-                height = 0
-            formats.append({'height': height,
-                            'url': self.media_src_url + file_el.attrib['path'].replace(  # noqa
-                                'dat/', '', 1)})
+        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
+            if isinstance(format_url, compat_str):
+                height = self._search_regex(
+                    r'(\d+)[pP]\.', format_url, 'height', default=None)
+                formats.append({
+                    'url': format_url,
+                    'format_id': '%sp' % height if height else None,
+                    # height is None when the URL has no "<digits>p." marker; int(None) would raise TypeError
+                    'height': int_or_none(height),
+                })
+        # Fall back to the XML playlist service when the embed page yielded no mp4 formats
+        if not formats:
+            playlist = self._download_xml(
+                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
+                video_id)
+            for file_el in playlist.findall('./files/file'):
+                path = file_el.get('path')
+                if not path:
+                    continue
+                format_id = file_el.get('id') or file_el.get('label')
+                formats.append({
+                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
+                        'dat/', '', 1),
+                    'format_id': format_id,
+                    'height': int_or_none(self._search_regex(
+                        r'(\d+)[pP]', format_id or path, 'height',
+                        default=None)),
+                })
         self._sort_formats(formats)
 
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        duration = int_or_none(self._search_regex(
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+
         return {
             'id': video_id,
-            'title': self._og_search_title(webpage).title(),
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
             'formats': formats,
-            'release_date': release_date
         }