From 5eca00a2e33a6ca26a7f52589e5d77bab7e5edf4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 9 Jan 2018 18:12:55 +0800 Subject: [PATCH] [weibo] Misc improvements --- youtube_dl/extractor/weibo.py | 125 ++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index cbe0c3228..3cb4d71a6 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -8,7 +8,8 @@ import random import re from ..compat import ( - compat_urlparse, + compat_parse_qs, + compat_str, ) from ..utils import ( js_to_json, @@ -31,70 +32,71 @@ class WeiboIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id, note="first visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id) visitor_url = urlh.geturl() - headers = { - 'Referer': visitor_url - } - fp = { - "os": "2", - "browser": "Gecko57,0,0,0", - "fonts": "undefined", - "screenInfo": "1440*900*24", - "plugins": "" - } - data = urlencode_postdata({ - "cb": "gen_callback", - "fp": json.dumps(fp), - }) + if 'passport.weibo.com' in visitor_url: + # first visit + visitor_data = self._download_json( + 'https://passport.weibo.com/visitor/genvisitor', video_id, + note='Generating first-visit data', + transform_source=strip_jsonp, + headers={'Referer': visitor_url}, + data=urlencode_postdata({ + 'cb': 'gen_callback', + 'fp': json.dumps({ + 'os': '2', + 'browser': 'Gecko57,0,0,0', + 'fonts': 'undefined', + 'screenInfo': '1440*900*24', + 'plugins': '', + }), + })) - genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' - webpage = self._download_webpage(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") + tid = visitor_data['data']['tid'] + cnfd = '%03d' % visitor_data['data']['confidence'] - p = strip_jsonp(webpage) - i1 = p.find('{') - i2 = p.rfind('}') - j = p[i1:i2 + 1] # get JSON object - d = json.loads(j) - tid = d["data"]["tid"] - cnfd = "%03d" % d["data"]["confidence"] + self._download_webpage( + 'https://passport.weibo.com/visitor/visitor', video_id, + note='Running first-visit callback', + query={ + 'a': 'incarnate', + 't': tid, + 'w': 2, + 'c': cnfd, + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': random.random(), + }) - query = { - 'a': 'incarnate', - 't': tid, - 'w': 2, - 'c': cnfd, - 'cb': 'cross_domain', - 'from': 'weibo', - '_rand': random.random() - } - gencallback_url = "https://passport.weibo.com/visitor/visitor" - self._download_webpage(gencallback_url, video_id, note="gen callback", query=query) + webpage = self._download_webpage( + url, video_id, note='Revisiting webpage') - webpage = self._download_webpage(url, video_id, note="retry to visit the page") + title = self._html_search_regex( + r'(.+?)', webpage, 'title') - title = self._html_search_regex(r'(.+?)', webpage, 'title') - - video_sources_text = self._search_regex(r'video-sources=\\\"(.+?)\"', webpage, 'video_sources') - - video_formats = compat_urlparse.parse_qs(video_sources_text) + video_formats = compat_parse_qs(self._search_regex( + r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) formats = [] - supported_resolutions = ('720', '480') + supported_resolutions = (480, 720) for res in supported_resolutions: - f = video_formats.get(res) - if isinstance(f, list): - if len(f) > 0: - vid_url = f[0] - formats.append({ - 'url': vid_url, - 'format': 'mp4', - 'height': int(res), - }) + vid_urls = video_formats.get(compat_str(res)) + if not vid_urls or not isinstance(vid_urls, list): + continue + + vid_url = vid_urls[0] + formats.append({ + 'url': vid_url, + 'height': res, + }) + self._sort_formats(formats) - uploader = self._og_search_property('nick-name', webpage, 'uploader', default=None) + + uploader = self._og_search_property( + 'nick-name', webpage, 'uploader', default=None) + return { 'id': video_id, 'title': title, @@ -118,12 +120,17 @@ class WeiboMobileIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage = self._download_webpage(url, video_id, note="visit the page") - js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) - weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) - page_info = weibo_info.get('status').get('page_info') - title = weibo_info.get('status').get('status_title') - uploader = weibo_info.get('status').get('user').get('screen_name') + webpage = self._download_webpage(url, video_id, note='visit the page') + + weibo_info = self._parse_json(self._search_regex( + r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', + webpage, 'js_code', flags=re.DOTALL), + video_id, transform_source=js_to_json) + + status_data = weibo_info.get('status', {}) + page_info = status_data.get('page_info') + title = status_data['status_title'] + uploader = status_data.get('user', {}).get('screen_name') return { 'id': video_id,