From 87f78946a56d19fe3696725fe7329767fd910320 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 21 Aug 2013 03:50:56 +0200 Subject: [PATCH 1/7] [collegehumor] Allow old-style videos (Fixes #1285) --- youtube_dl/extractor/collegehumor.py | 52 ++++++++++++++++++---------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 30b9c7549..8d4c93d6d 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -4,6 +4,7 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( compat_urllib_parse_urlparse, + determine_ext, ExtractorError, ) @@ -12,7 +13,7 @@ from ..utils import ( class CollegeHumorIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P[0-9]+)/?(?P.*)$' - _TEST = { + _TESTS = [{ u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', u'file': u'6902724.mp4', u'md5': u'1264c12ad95dca142a9f0bf7968105a0', @@ -20,7 +21,16 @@ class CollegeHumorIE(InfoExtractor): u'title': u'Comic-Con Cosplay Catastrophe', u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.', }, - } + }, + { + u'url': u'http://www.collegehumor.com/video/3505939/font-conference', + u'file': u'3505939.mp4', + u'md5': u'c51ca16b82bb456a4397987791a835f5', + u'info_dict': { + u'title': u'Font Conference', + u'description': u'This video wasn\'t long enough, so we made it double-spaced.', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -49,25 +59,29 @@ class CollegeHumorIE(InfoExtractor): info['description'] = videoNode.findall('./description')[0].text info['title'] = videoNode.findall('./caption')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text - manifest_url = videoNode.findall('./file')[0].text + next_url = videoNode.findall('./file')[0].text except IndexError: raise ExtractorError(u'Invalid metadata XML file') - manifest_url += '?hdcore=2.10.3' - manifestXml = self._download_webpage(manifest_url, video_id, - u'Downloading XML manifest', - u'Unable to download video info XML') + if next_url.endswith(u'manifest.f4m'): + manifest_url = next_url + '?hdcore=2.10.3' + manifestXml = self._download_webpage(manifest_url, video_id, + u'Downloading XML manifest', + u'Unable to download video info XML') - adoc = xml.etree.ElementTree.fromstring(manifestXml) - try: - media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] - node_id = media_node.attrib['url'] - video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text - except IndexError as err: - raise ExtractorError(u'Invalid manifest file') + adoc = xml.etree.ElementTree.fromstring(manifestXml) + try: + media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0] + node_id = media_node.attrib['url'] + video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text + except IndexError as err: + raise ExtractorError(u'Invalid manifest file') + url_pr = compat_urllib_parse_urlparse(info['thumbnail']) + info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') + info['ext'] = 'mp4' + else: + # Old-style direct links + info['url'] = next_url + info['ext'] = determine_ext(info['url']) - url_pr = compat_urllib_parse_urlparse(info['thumbnail']) - - info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') - info['ext'] = 'mp4' - return [info] + return info From 79cb25776f46e0b9b1e95052fbd84a59440fa34f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 21 Aug 2013 04:06:46 +0200 Subject: [PATCH 2/7] Cache suitable regular expressions This speeds up TestAllURLsMatching.test_no_duplicates by about 8000% at the cost of minimal memory overhead. --- youtube_dl/extractor/common.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index da50abfc1..8009c2d85 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -77,7 +77,13 @@ class InfoExtractor(object): @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url) is not None + + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for *this* class, whereas getattr would also + # match the superclass + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + return cls._VALID_URL_RE.match(url) is not None @classmethod def working(cls): From 3093468977e5c04d7f39016bbe983c483e47707f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 21 Aug 2013 04:31:57 +0200 Subject: [PATCH 3/7] [generic] Ignore stupid HTTP servers (#1284) --- youtube_dl/extractor/generic.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b633e896c..1c468f8f6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -107,8 +107,13 @@ class GenericIE(InfoExtractor): return new_url def _real_extract(self, url): - new_url = self._test_redirect(url) - if new_url: return [self.url_result(new_url)] + try: + new_url = self._test_redirect(url) + if new_url: + return [self.url_result(new_url)] + except compat_urllib_error.HTTPError: + # This may be a stupid server that doesn't like HEAD, our UA, or so + pass video_id = url.split('/')[-1] try: From 7fea7156cb41d4706059174f1fd00faa02278c8c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 21 Aug 2013 04:32:22 +0200 Subject: [PATCH 4/7] [generic] support HTML5 video --- youtube_dl/extractor/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1c468f8f6..da016f7ee 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -149,6 +149,9 @@ class GenericIE(InfoExtractor): # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: mobj = re.search(r'.*? Date: Wed, 21 Aug 2013 04:33:57 +0200 Subject: [PATCH 5/7] release 2013.08.21 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8c93a275c..58e26bc49 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.08.17' +__version__ = '2013.08.21' From 739674cd77d6a6c7025878701939d987fac5b446 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 21 Aug 2013 05:24:58 +0200 Subject: [PATCH 6/7] [rtlnow] Add support for error message for queries from outside of Germany --- youtube_dl/extractor/rtlnow.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index d993a990a..2f134e6a7 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -2,7 +2,10 @@ import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + clean_html, + ExtractorError, +) class RTLnowIE(InfoExtractor): """Information Extractor for RTLnow, RTL2now and VOXnow""" @@ -18,6 +21,7 @@ class RTLnowIE(InfoExtractor): u'params': { u'skip_download': True, }, + u'skip': u'Only works from Germany', }, { u'url': u'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', @@ -31,6 +35,7 @@ class RTLnowIE(InfoExtractor): u'params': { u'skip_download': True, }, + u'skip': u'Only works from Germany', }, { u'url': u'www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', @@ -53,6 +58,14 @@ class RTLnowIE(InfoExtractor): video_id = mobj.group(u'video_id') webpage = self._download_webpage(webpage_url, video_id) + + note_m = re.search(r'''(?sx) + (.*?) + ''', webpage) + if note_m: + msg = clean_html(note_m.group(1)) + raise ExtractorError(msg) + video_title = self._html_search_regex(r'(?P<title>[^<]+)', webpage, u'title') playerdata_url = self._html_search_regex(r'\'playerdata\': \'(?P[^\']+)\'', From 6c3e6e88d3aaaea64ca3d96c005da654c89c8a3a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 21 Aug 2013 05:44:19 +0200 Subject: [PATCH 7/7] Allow hours in ETA display (Fixes #1280) --- youtube_dl/FileDownloader.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index ea6b9d626..217c4a52f 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -79,9 +79,13 @@ class FileDownloader(object): rate = float(current) / dif eta = int((float(total) - float(current)) / rate) (eta_mins, eta_secs) = divmod(eta, 60) - if eta_mins > 99: - return '--:--' - return '%02d:%02d' % (eta_mins, eta_secs) + (eta_hours, eta_mins) = divmod(eta_mins, 60) + if eta_hours > 99: + return '--:--:--' + if eta_hours == 0: + return '%02d:%02d' % (eta_mins, eta_secs) + else: + return '%02d:%02d:%02d' % (eta_hours, eta_mins, eta_secs) @staticmethod def calc_speed(start, now, bytes):