added support for HTTP redirects. Closes #315

This commit is contained in:
Filippo Valsorda 2012-03-18 22:15:58 +01:00
parent ceba827e9a
commit 6af22cf0ef
2 changed files with 121 additions and 0 deletions

View file

@ -15,6 +15,7 @@ __authors__ = (
'Kevin Ngo', 'Kevin Ngo',
'Ori Avtalion', 'Ori Avtalion',
'shizeeg', 'shizeeg',
'Filippo Valsorda',
) )
__license__ = 'Public Domain' __license__ = 'Public Domain'
@ -2240,7 +2241,67 @@ class GenericIE(InfoExtractor):
"""Report information extraction.""" """Report information extraction."""
self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id) self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
def report_following_redirect(self, new_url):
    """Tell the user which URL a redirect is being followed to."""
    message = u'[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
def _test_redirect(self, url):
    """Resolve HTTP redirects for url and restart the download chain if it moved.

    A HEAD request is sent through a dedicated opener (falling back to a
    plain request when the server answers 405) so that redirectors such
    as URL shorteners can be resolved before extraction starts.

    Returns True when url redirected to a different URL, in which case
    the final URL has already been handed to the downloader; returns
    False when there is nothing to follow.
    """
    def strip_body_headers(headers):
        # The follow-up requests carry no body, so headers describing one
        # must not be forwarded.
        return dict((k, v) for k, v in headers.items()
                    if k.lower() not in ("content-length", "content-type"))

    class HeadRequest(urllib2.Request):
        def get_method(self):
            return "HEAD"

    class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
        """
        Subclass the HTTPRedirectHandler to make it use our
        HeadRequest also on the redirected URL
        """
        def redirect_request(self, req, fp, code, msg, headers, newurl):
            if code not in (301, 302, 303, 307):
                raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
            newurl = newurl.replace(' ', '%20')
            return HeadRequest(newurl,
                               headers=strip_body_headers(req.headers),
                               origin_req_host=req.get_origin_req_host(),
                               unverifiable=True)

    class HTTPMethodFallback(urllib2.BaseHandler):
        """
        Fallback to GET if HEAD is not allowed (405 HTTP error)
        """
        def http_error_405(self, req, fp, code, msg, headers):
            # Drain and release the error response before retrying.
            fp.read()
            fp.close()
            return self.parent.open(urllib2.Request(req.get_full_url(),
                                                    headers=strip_body_headers(req.headers),
                                                    origin_req_host=req.get_origin_req_host(),
                                                    unverifiable=True))

    # Build our opener
    opener = urllib2.OpenerDirector()
    for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                    HTTPMethodFallback, HEADRedirectHandler,
                    urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
        opener.add_handler(handler())

    response = opener.open(HeadRequest(url))
    if response is None:
        # Only http/https handlers are installed; for any other scheme the
        # opener yields no response, so there is no redirect to follow.
        return False
    try:
        new_url = response.geturl()
    finally:
        # Only the final URL is needed; release the connection right away.
        response.close()

    if url == new_url:
        return False

    self.report_following_redirect(new_url)
    self._downloader.download([new_url])
    return True
def _real_extract(self, url): def _real_extract(self, url):
if self._test_redirect(url): return
# At this point we have a new video # At this point we have a new video
self._downloader.increment_downloads() self._downloader.increment_downloads()

View file

@ -2241,7 +2241,67 @@ class GenericIE(InfoExtractor):
"""Report information extraction.""" """Report information extraction."""
self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id) self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
def report_following_redirect(self, new_url):
    """Tell the user which URL a redirect is being followed to."""
    message = u'[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
def _test_redirect(self, url):
    """Resolve HTTP redirects for url and restart the download chain if it moved.

    A HEAD request is sent through a dedicated opener (falling back to a
    plain request when the server answers 405) so that redirectors such
    as URL shorteners can be resolved before extraction starts.

    Returns True when url redirected to a different URL, in which case
    the final URL has already been handed to the downloader; returns
    False when there is nothing to follow.
    """
    def strip_body_headers(headers):
        # The follow-up requests carry no body, so headers describing one
        # must not be forwarded.
        return dict((k, v) for k, v in headers.items()
                    if k.lower() not in ("content-length", "content-type"))

    class HeadRequest(urllib2.Request):
        def get_method(self):
            return "HEAD"

    class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
        """
        Subclass the HTTPRedirectHandler to make it use our
        HeadRequest also on the redirected URL
        """
        def redirect_request(self, req, fp, code, msg, headers, newurl):
            if code not in (301, 302, 303, 307):
                raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
            newurl = newurl.replace(' ', '%20')
            return HeadRequest(newurl,
                               headers=strip_body_headers(req.headers),
                               origin_req_host=req.get_origin_req_host(),
                               unverifiable=True)

    class HTTPMethodFallback(urllib2.BaseHandler):
        """
        Fallback to GET if HEAD is not allowed (405 HTTP error)
        """
        def http_error_405(self, req, fp, code, msg, headers):
            # Drain and release the error response before retrying.
            fp.read()
            fp.close()
            return self.parent.open(urllib2.Request(req.get_full_url(),
                                                    headers=strip_body_headers(req.headers),
                                                    origin_req_host=req.get_origin_req_host(),
                                                    unverifiable=True))

    # Build our opener
    opener = urllib2.OpenerDirector()
    for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                    HTTPMethodFallback, HEADRedirectHandler,
                    urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
        opener.add_handler(handler())

    response = opener.open(HeadRequest(url))
    if response is None:
        # Only http/https handlers are installed; for any other scheme the
        # opener yields no response, so there is no redirect to follow.
        return False
    try:
        new_url = response.geturl()
    finally:
        # Only the final URL is needed; release the connection right away.
        response.close()

    if url == new_url:
        return False

    self.report_following_redirect(new_url)
    self._downloader.download([new_url])
    return True
def _real_extract(self, url): def _real_extract(self, url):
if self._test_redirect(url): return
# At this point we have a new video # At this point we have a new video
self._downloader.increment_downloads() self._downloader.increment_downloads()