[pornhub:user] Add support for URLs unavailable via /videos page and improve paging (closes #27853)
This commit is contained in:
parent
1b731ebcaa
commit
83031d749b
1 changed files with 47 additions and 9 deletions
|
@ -22,6 +22,7 @@ from ..utils import (
|
||||||
orderedSet,
|
orderedSet,
|
||||||
remove_quotes,
|
remove_quotes,
|
||||||
str_to_int,
|
str_to_int,
|
||||||
|
update_url_query,
|
||||||
url_or_none,
|
url_or_none,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -405,6 +406,10 @@ class PornHubIE(PornHubBaseIE):
|
||||||
|
|
||||||
|
|
||||||
class PornHubPlaylistBaseIE(PornHubBaseIE):
|
class PornHubPlaylistBaseIE(PornHubBaseIE):
|
||||||
|
def _extract_page(self, url):
|
||||||
|
return int_or_none(self._search_regex(
|
||||||
|
r'\bpage=(\d+)', url, 'page', default=None))
|
||||||
|
|
||||||
def _extract_entries(self, webpage, host):
|
def _extract_entries(self, webpage, host):
|
||||||
# Only process container div with main playlist content skipping
|
# Only process container div with main playlist content skipping
|
||||||
# drop-down menu that uses similar pattern for videos (see
|
# drop-down menu that uses similar pattern for videos (see
|
||||||
|
@ -463,14 +468,27 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
|
||||||
}, {
|
}, {
|
||||||
'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
|
'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
|
||||||
'only_matching': True,
|
'only_matching': True,
|
||||||
|
}, {
|
||||||
|
# Unavailable via /videos page, but available with direct pagination
|
||||||
|
# on pornstar page (see [1]), requires premium
|
||||||
|
# 1. https://github.com/ytdl-org/youtube-dl/issues/27853
|
||||||
|
'url': 'https://www.pornhubpremium.com/pornstar/sienna-west',
|
||||||
|
'only_matching': True,
|
||||||
|
}, {
|
||||||
|
# Same as before, multi page
|
||||||
|
'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
|
||||||
|
'only_matching': True,
|
||||||
}]
|
}]
|
||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
mobj = re.match(self._VALID_URL, url)
|
mobj = re.match(self._VALID_URL, url)
|
||||||
user_id = mobj.group('id')
|
user_id = mobj.group('id')
|
||||||
|
videos_url = '%s/videos' % mobj.group('url')
|
||||||
|
page = self._extract_page(url)
|
||||||
|
if page:
|
||||||
|
videos_url = update_url_query(videos_url, {'page': page})
|
||||||
return self.url_result(
|
return self.url_result(
|
||||||
'%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(),
|
videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id)
|
||||||
video_id=user_id)
|
|
||||||
|
|
||||||
|
|
||||||
class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
|
class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
|
||||||
|
@ -488,17 +506,37 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
|
||||||
host = mobj.group('host')
|
host = mobj.group('host')
|
||||||
item_id = mobj.group('id')
|
item_id = mobj.group('id')
|
||||||
|
|
||||||
page = int_or_none(self._search_regex(
|
page = self._extract_page(url)
|
||||||
r'\bpage=(\d+)', url, 'page', default=None))
|
|
||||||
|
VIDEOS = '/videos'
|
||||||
|
|
||||||
|
def download_page(base_url, num):
|
||||||
|
note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num)
|
||||||
|
return self._download_webpage(
|
||||||
|
base_url, item_id, note, query={'page': num})
|
||||||
|
|
||||||
|
def is_404(e):
|
||||||
|
return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
|
||||||
|
|
||||||
entries = []
|
entries = []
|
||||||
for page_num in (page, ) if page is not None else itertools.count(1):
|
base_url = url
|
||||||
|
has_page = page is not None
|
||||||
|
first_page = page if has_page else 1
|
||||||
|
for page_num in (first_page, ) if has_page else itertools.count(first_page):
|
||||||
try:
|
try:
|
||||||
webpage = self._download_webpage(
|
try:
|
||||||
url, item_id, 'Downloading page %d' % page_num,
|
webpage = download_page(base_url, page_num)
|
||||||
query={'page': page_num})
|
|
||||||
except ExtractorError as e:
|
except ExtractorError as e:
|
||||||
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
|
# Some sources may not be available via /videos page,
|
||||||
|
# trying to fallback to main page pagination (see [1])
|
||||||
|
# 1. https://github.com/ytdl-org/youtube-dl/issues/27853
|
||||||
|
if is_404(e) and page_num == first_page and VIDEOS in base_url:
|
||||||
|
base_url = base_url.replace(VIDEOS, '')
|
||||||
|
webpage = download_page(base_url, page_num)
|
||||||
|
else:
|
||||||
|
raise
|
||||||
|
except ExtractorError as e:
|
||||||
|
if is_404(e) and page_num != first_page:
|
||||||
break
|
break
|
||||||
raise
|
raise
|
||||||
page_entries = self._extract_entries(webpage, host)
|
page_entries = self._extract_entries(webpage, host)
|
||||||
|
|
Loading…
Reference in a new issue