[YouTube] Refresh compat/utils usage

* import parse_qs() from utils, replacing the module-local definition
* import parse_qs in lazy_extractors (clears old TODO)
* clean up old compiled lazy_extractors for Py2
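* use update_url() to rewrite query parameters, replacing the manual urlparse()/parse_qs()/urlencode()/urlunparse() sequence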
This commit is contained in:
dirkf 2023-02-06 16:19:21 +00:00
parent 4e04f10499
commit bafb6dec72
3 changed files with 39 additions and 44 deletions

View file

@ -13,6 +13,11 @@ sys.path.insert(0, dirn(dirn((os.path.abspath(__file__)))))
lazy_extractors_filename = sys.argv[1] lazy_extractors_filename = sys.argv[1]
if os.path.exists(lazy_extractors_filename): if os.path.exists(lazy_extractors_filename):
os.remove(lazy_extractors_filename) os.remove(lazy_extractors_filename)
# Py2: may be confused by leftover lazy_extractors.pyc
try:
os.remove(lazy_extractors_filename + 'c')
except OSError:
pass
from youtube_dl.extractor import _ALL_CLASSES from youtube_dl.extractor import _ALL_CLASSES
from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor
@ -22,7 +27,10 @@ with open('devscripts/lazy_load_template.py', 'rt') as f:
module_contents = [ module_contents = [
module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', module_template + '\n' + getsource(InfoExtractor.suitable) + '\n',
'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n',
# needed for suitable() methods of Youtube extractor (see #28780)
'from youtube_dl.utils import parse_qs\n',
]
ie_template = ''' ie_template = '''
class {name}({bases}): class {name}({bases}):

View file

@ -40,14 +40,16 @@ class TestExecution(unittest.TestCase):
self.assertFalse(stderr) self.assertFalse(stderr)
def test_lazy_extractors(self): def test_lazy_extractors(self):
lazy_extractors = 'youtube_dl/extractor/lazy_extractors.py'
try: try:
subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', lazy_extractors], cwd=rootDir, stdout=_DEV_NULL)
subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL)
finally: finally:
try: for x in ['', 'c'] if sys.version_info[0] < 3 else ['']:
os.remove('youtube_dl/extractor/lazy_extractors.py') try:
except (IOError, OSError): os.remove(lazy_extractors + x)
pass except (IOError, OSError):
pass
if __name__ == '__main__': if __name__ == '__main__':

View file

@ -14,12 +14,11 @@ from ..compat import (
compat_chr, compat_chr,
compat_HTTPError, compat_HTTPError,
compat_map as map, compat_map as map,
compat_parse_qs,
compat_str, compat_str,
compat_urllib_parse,
compat_urllib_parse_parse_qs as compat_parse_qs,
compat_urllib_parse_unquote_plus, compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
compat_urlparse,
) )
from ..jsinterp import JSInterpreter from ..jsinterp import JSInterpreter
from ..utils import ( from ..utils import (
@ -33,6 +32,7 @@ from ..utils import (
mimetype2ext, mimetype2ext,
parse_codecs, parse_codecs,
parse_duration, parse_duration,
parse_qs,
qualities, qualities,
remove_start, remove_start,
smuggle_url, smuggle_url,
@ -50,10 +50,6 @@ from ..utils import (
) )
def parse_qs(url):
return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors""" """Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
@ -636,6 +632,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
'duration': 142, 'duration': 142,
'uploader': 'The Witcher', 'uploader': 'The Witcher',
'uploader_id': 'WitcherGame',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
'upload_date': '20140605', 'upload_date': '20140605',
'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg', 'thumbnail': 'https://i.ytimg.com/vi/HtVdAasjOgU/maxresdefault.jpg',
'age_limit': 18, 'age_limit': 18,
@ -671,7 +669,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}, },
}, },
{ {
'note': 'Age-gated video embedable only with clientScreen=EMBED', 'note': 'Age-gated video embeddable only with clientScreen=EMBED',
'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg', 'url': 'https://youtube.com/watch?v=Tq92D6wQ1mg',
'info_dict': { 'info_dict': {
'id': 'Tq92D6wQ1mg', 'id': 'Tq92D6wQ1mg',
@ -1392,11 +1390,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
@classmethod @classmethod
def suitable(cls, url): def suitable(cls, url):
# Hack for lazy extractors until more generic solution is implemented if parse_qs(url).get('list', [None])[0]:
# (see #28780)
from .youtube import parse_qs
qs = parse_qs(url)
if qs.get('list', [None])[0]:
return False return False
return super(YoutubeIE, cls).suitable(url) return super(YoutubeIE, cls).suitable(url)
@ -1546,7 +1540,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if player_url.startswith('//'): if player_url.startswith('//'):
player_url = 'https:' + player_url player_url = 'https:' + player_url
elif not re.match(r'https?://', player_url): elif not re.match(r'https?://', player_url):
player_url = compat_urlparse.urljoin( player_url = compat_urllib_parse.urljoin(
'https://www.youtube.com', player_url) 'https://www.youtube.com', player_url)
return player_url return player_url
@ -1628,9 +1622,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _unthrottle_format_urls(self, video_id, player_url, formats): def _unthrottle_format_urls(self, video_id, player_url, formats):
for fmt in formats: for fmt in formats:
parsed_fmt_url = compat_urlparse.urlparse(fmt['url']) parsed_fmt_url = compat_urllib_parse.urlparse(fmt['url'])
qs = compat_urlparse.parse_qs(parsed_fmt_url.query) n_param = compat_parse_qs(parsed_fmt_url.query).get('n')
n_param = qs.get('n')
if not n_param: if not n_param:
continue continue
n_param = n_param[-1] n_param = n_param[-1]
@ -1638,9 +1631,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if n_response is None: if n_response is None:
# give up if descrambling failed # give up if descrambling failed
break break
qs['n'] = [n_response] fmt['url'] = update_url(
fmt['url'] = compat_urlparse.urlunparse( parsed_fmt_url, query_update={'n': [n_response]})
parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
# from yt-dlp, with tweaks # from yt-dlp, with tweaks
def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False):
@ -1669,20 +1661,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl'])) lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
if not playback_url: if not playback_url:
return return
parsed_playback_url = compat_urlparse.urlparse(playback_url)
qs = compat_urlparse.parse_qs(parsed_playback_url.query)
# cpn generation algorithm is reverse engineered from base.js. # cpn generation algorithm is reverse engineered from base.js.
# In fact it works even with dummy cpn. # In fact it works even with dummy cpn.
CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
qs.update({ playback_url = update_url(
'ver': ['2'], playback_url, query_update={
'cpn': [cpn], 'ver': ['2'],
}) 'cpn': [cpn],
playback_url = compat_urlparse.urlunparse( })
parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
self._download_webpage( self._download_webpage(
playback_url, video_id, 'Marking watched', playback_url, video_id, 'Marking watched',
@ -2075,9 +2064,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
thumbnails = [] thumbnails = []
for container in (video_details, microformat): for container in (video_details, microformat):
for thumbnail in (try_get( for thumbnail in try_get(
container, container,
lambda x: x['thumbnail']['thumbnails'], list) or []): lambda x: x['thumbnail']['thumbnails'], list) or []:
thumbnail_url = url_or_none(thumbnail.get('url')) thumbnail_url = url_or_none(thumbnail.get('url'))
if not thumbnail_url: if not thumbnail_url:
continue continue
@ -3287,11 +3276,7 @@ class YoutubePlaylistIE(InfoExtractor):
def suitable(cls, url): def suitable(cls, url):
if YoutubeTabIE.suitable(url): if YoutubeTabIE.suitable(url):
return False return False
# Hack for lazy extractors until more generic solution is implemented if parse_qs(url).get('v', [None])[0]:
# (see #28780)
from .youtube import parse_qs
qs = parse_qs(url)
if qs.get('v', [None])[0]:
return False return False
return super(YoutubePlaylistIE, cls).suitable(url) return super(YoutubePlaylistIE, cls).suitable(url)
@ -3430,9 +3415,9 @@ class YoutubeSearchURLIE(YoutubeBaseInfoExtractor):
}] }]
def _real_extract(self, url): def _real_extract(self, url):
qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) qs = parse_qs(url)
query = (qs.get('search_query') or qs.get('q'))[0] query = (qs.get('search_query') or qs.get('q'))[-1]
params = qs.get('sp', ('',))[0] params = qs.get('sp', ('',))[-1]
return self.playlist_result(self._search_results(query, params), query, query) return self.playlist_result(self._search_results(query, params), query, query)