[pornhub] Apply scrape detection bypass for all extractors
This commit is contained in:
parent
6510a3aa97
commit
71a1f61700
1 changed file with 24 additions and 22 deletions
|
@ -24,7 +24,29 @@ from ..utils import (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class PornHubIE(InfoExtractor):
|
class PornHubBaseIE(InfoExtractor):
    """Shared base for extractors that must get past the scrape-detection page.

    Overrides page downloading so that a response matching one of the
    challenge markers below is handed to PhantomJS and then re-requested.
    """

    def _download_webpage_handle(self, *args, **kwargs):
        """Download a page, retrying once via PhantomJS on a challenge page.

        All arguments are forwarded unchanged to the parent implementation.

        Returns the parent's result: a ``(webpage, urlh)`` pair, or the
        parent's falsy result passed through untouched (presumably a
        non-fatal download failure; confirm against InfoExtractor).
        """
        def dl(*args, **kwargs):
            return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)

        ret = dl(*args, **kwargs)

        # Do not unpack a falsy result: it is not a (webpage, urlh) pair and
        # unpacking it would raise TypeError instead of reporting the failure.
        if not ret:
            return ret

        webpage, urlh = ret

        # Any of these markers means the scrape-detection page was served
        # instead of the requested content.
        if any(re.search(p, webpage) for p in (
                r'<body\b[^>]+\bonload=["\']go\(\)',
                r'document\.cookie\s*=\s*["\']RNKEY=',
                r'document\.location\.reload\(true\)')):
            # The URL (or Request object) is taken from the first positional
            # argument; this assumes callers pass it positionally.
            url_or_request = args[0]
            url = (url_or_request.get_full_url()
                   if isinstance(url_or_request, compat_urllib_request.Request)
                   else url_or_request)
            # Let PhantomJS process the challenge page, then repeat the
            # original request.
            phantom = PhantomJSwrapper(self, required_version='2.0')
            phantom.get(url, html=webpage)
            ret = dl(*args, **kwargs)
            if not ret:
                return ret
            webpage, urlh = ret

        return webpage, urlh
|
||||||
|
|
||||||
|
|
||||||
|
class PornHubIE(PornHubBaseIE):
|
||||||
IE_DESC = 'PornHub and Thumbzilla'
|
IE_DESC = 'PornHub and Thumbzilla'
|
||||||
_VALID_URL = r'''(?x)
|
_VALID_URL = r'''(?x)
|
||||||
https?://
|
https?://
|
||||||
|
@ -128,26 +150,6 @@ class PornHubIE(InfoExtractor):
|
||||||
'only_matching': True,
|
'only_matching': True,
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def _download_webpage_handle(self, *args, **kwargs):
    """Download a page, retrying once via PhantomJS on a challenge page.

    All arguments are forwarded unchanged to the parent implementation.

    Returns the parent's result: a ``(webpage, urlh)`` pair, or the
    parent's falsy result passed through untouched (presumably a
    non-fatal download failure; confirm against InfoExtractor).
    """
    def dl(*args, **kwargs):
        return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs)

    ret = dl(*args, **kwargs)

    # Do not unpack a falsy result: it is not a (webpage, urlh) pair and
    # unpacking it would raise TypeError instead of reporting the failure.
    if not ret:
        return ret

    webpage, urlh = ret

    # Any of these markers means the scrape-detection page was served
    # instead of the requested content.
    if any(re.search(p, webpage) for p in (
            r'<body\b[^>]+\bonload=["\']go\(\)',
            r'document\.cookie\s*=\s*["\']RNKEY=',
            r'document\.location\.reload\(true\)')):
        # The URL (or Request object) is taken from the first positional
        # argument; this assumes callers pass it positionally.
        url_or_request = args[0]
        url = (url_or_request.get_full_url()
               if isinstance(url_or_request, compat_urllib_request.Request)
               else url_or_request)
        # Let PhantomJS process the challenge page, then repeat the
        # original request.
        phantom = PhantomJSwrapper(self, required_version='2.0')
        phantom.get(url, html=webpage)
        ret = dl(*args, **kwargs)
        if not ret:
            return ret
        webpage, urlh = ret

    return webpage, urlh
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _extract_urls(webpage):
|
def _extract_urls(webpage):
|
||||||
return re.findall(
|
return re.findall(
|
||||||
|
@ -329,7 +331,7 @@ class PornHubIE(InfoExtractor):
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class PornHubPlaylistBaseIE(InfoExtractor):
|
class PornHubPlaylistBaseIE(PornHubBaseIE):
|
||||||
def _extract_entries(self, webpage, host):
|
def _extract_entries(self, webpage, host):
|
||||||
# Only process container div with main playlist content skipping
|
# Only process container div with main playlist content skipping
|
||||||
# drop-down menu that uses similar pattern for videos (see
|
# drop-down menu that uses similar pattern for videos (see
|
||||||
|
|
Loading…
Reference in a new issue