[ted] Fix extraction (closes #13535))
This commit is contained in:
parent
54faac2235
commit
4917478803
1 changed files with 34 additions and 15 deletions
|
@ -6,7 +6,10 @@ import re
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
|
|
||||||
from ..compat import compat_str
|
from ..compat import compat_str
|
||||||
from ..utils import int_or_none
|
from ..utils import (
|
||||||
|
int_or_none,
|
||||||
|
try_get,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TEDIE(InfoExtractor):
|
class TEDIE(InfoExtractor):
|
||||||
|
@ -113,7 +116,8 @@ class TEDIE(InfoExtractor):
|
||||||
}
|
}
|
||||||
|
|
||||||
def _extract_info(self, webpage):
|
def _extract_info(self, webpage):
|
||||||
info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
|
info_json = self._search_regex(
|
||||||
|
r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
|
||||||
webpage, 'info json')
|
webpage, 'info json')
|
||||||
return json.loads(info_json)
|
return json.loads(info_json)
|
||||||
|
|
||||||
|
@ -136,11 +140,16 @@ class TEDIE(InfoExtractor):
|
||||||
webpage = self._download_webpage(url, name,
|
webpage = self._download_webpage(url, name,
|
||||||
'Downloading playlist webpage')
|
'Downloading playlist webpage')
|
||||||
info = self._extract_info(webpage)
|
info = self._extract_info(webpage)
|
||||||
playlist_info = info['playlist']
|
|
||||||
|
playlist_info = try_get(
|
||||||
|
info, lambda x: x['__INITIAL_DATA__']['playlist'],
|
||||||
|
dict) or info['playlist']
|
||||||
|
|
||||||
playlist_entries = [
|
playlist_entries = [
|
||||||
self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
|
self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
|
||||||
for talk in info['talks']
|
for talk in try_get(
|
||||||
|
info, lambda x: x['__INITIAL_DATA__']['talks'],
|
||||||
|
dict) or info['talks']
|
||||||
]
|
]
|
||||||
return self.playlist_result(
|
return self.playlist_result(
|
||||||
playlist_entries,
|
playlist_entries,
|
||||||
|
@ -149,9 +158,14 @@ class TEDIE(InfoExtractor):
|
||||||
|
|
||||||
def _talk_info(self, url, video_name):
|
def _talk_info(self, url, video_name):
|
||||||
webpage = self._download_webpage(url, video_name)
|
webpage = self._download_webpage(url, video_name)
|
||||||
self.report_extraction(video_name)
|
|
||||||
|
|
||||||
talk_info = self._extract_info(webpage)['talks'][0]
|
info = self._extract_info(webpage)
|
||||||
|
|
||||||
|
talk_info = try_get(
|
||||||
|
info, lambda x: x['__INITIAL_DATA__']['talks'][0],
|
||||||
|
dict) or info['talks'][0]
|
||||||
|
|
||||||
|
title = talk_info['title'].strip()
|
||||||
|
|
||||||
external = talk_info.get('external')
|
external = talk_info.get('external')
|
||||||
if external:
|
if external:
|
||||||
|
@ -165,19 +179,27 @@ class TEDIE(InfoExtractor):
|
||||||
'url': ext_url or external['uri'],
|
'url': ext_url or external['uri'],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
native_downloads = try_get(
|
||||||
|
talk_info, lambda x: x['downloads']['nativeDownloads'],
|
||||||
|
dict) or talk_info['nativeDownloads']
|
||||||
|
|
||||||
formats = [{
|
formats = [{
|
||||||
'url': format_url,
|
'url': format_url,
|
||||||
'format_id': format_id,
|
'format_id': format_id,
|
||||||
'format': format_id,
|
'format': format_id,
|
||||||
} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
|
} for (format_id, format_url) in native_downloads.items() if format_url is not None]
|
||||||
if formats:
|
if formats:
|
||||||
for f in formats:
|
for f in formats:
|
||||||
finfo = self._NATIVE_FORMATS.get(f['format_id'])
|
finfo = self._NATIVE_FORMATS.get(f['format_id'])
|
||||||
if finfo:
|
if finfo:
|
||||||
f.update(finfo)
|
f.update(finfo)
|
||||||
|
|
||||||
|
player_talk = talk_info['player_talks'][0]
|
||||||
|
|
||||||
|
resources_ = player_talk.get('resources') or talk_info.get('resources')
|
||||||
|
|
||||||
http_url = None
|
http_url = None
|
||||||
for format_id, resources in talk_info['resources'].items():
|
for format_id, resources in resources_.items():
|
||||||
if format_id == 'h264':
|
if format_id == 'h264':
|
||||||
for resource in resources:
|
for resource in resources:
|
||||||
h264_url = resource.get('file')
|
h264_url = resource.get('file')
|
||||||
|
@ -237,14 +259,11 @@ class TEDIE(InfoExtractor):
|
||||||
|
|
||||||
video_id = compat_str(talk_info['id'])
|
video_id = compat_str(talk_info['id'])
|
||||||
|
|
||||||
thumbnail = talk_info['thumb']
|
|
||||||
if not thumbnail.startswith('http'):
|
|
||||||
thumbnail = 'http://' + thumbnail
|
|
||||||
return {
|
return {
|
||||||
'id': video_id,
|
'id': video_id,
|
||||||
'title': talk_info['title'].strip(),
|
'title': title,
|
||||||
'uploader': talk_info['speaker'],
|
'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
|
||||||
'thumbnail': thumbnail,
|
'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
|
||||||
'description': self._og_search_description(webpage),
|
'description': self._og_search_description(webpage),
|
||||||
'subtitles': self._get_subtitles(video_id, talk_info),
|
'subtitles': self._get_subtitles(video_id, talk_info),
|
||||||
'formats': formats,
|
'formats': formats,
|
||||||
|
|
Loading…
Reference in a new issue