[amara] improve extraction

This commit is contained in:
Remita Amine 2020-11-19 17:29:30 +01:00
parent cf1a8668e8
commit 2cf8003638

View file

@ -1,12 +1,20 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE
from .vimeo import VimeoIE
from ..utils import (
int_or_none,
parse_iso8601,
update_url_query,
)
class AmaraIE(InfoExtractor): class AmaraIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)' _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)'
_TESTS = [ _TESTS = [{
{ # Youtube
'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video',
'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae',
'info_dict': { 'info_dict': {
@ -18,10 +26,11 @@ class AmaraIE(InfoExtractor):
'subtitles': dict, 'subtitles': dict,
'upload_date': '20160813', 'upload_date': '20160813',
'uploader': 'PBS NewsHour', 'uploader': 'PBS NewsHour',
'uploader_id': 'PBSNewsHour' 'uploader_id': 'PBSNewsHour',
'timestamp': 1549639570,
} }
}, }, {
{ # Vimeo
'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011',
'md5': '99392c75fa05d432a8f11df03612195e', 'md5': '99392c75fa05d432a8f11df03612195e',
'info_dict': { 'info_dict': {
@ -31,46 +40,64 @@ class AmaraIE(InfoExtractor):
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'subtitles': dict, 'subtitles': dict,
'timestamp': 1294649110, 'timestamp': 1294763658,
'upload_date': '20110110', 'upload_date': '20110111',
'uploader': 'Sam Morrill', 'uploader': 'Sam Morrill',
'uploader_id': 'sammorrill' 'uploader_id': 'sammorrill'
} }
}, }, {
{ # Direct Link
'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/',
'md5': 'd3970f08512738ee60c5807311ff5d3f', 'md5': 'd3970f08512738ee60c5807311ff5d3f',
'info_dict': { 'info_dict': {
'id': 'ChimamandaAdichie_2009G-transcript', 'id': 's8KL7I3jLmh6',
'ext': 'mp4', 'ext': 'mp4',
'title': 'The danger of a single story', 'title': 'The danger of a single story',
'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'subtitles': dict, 'subtitles': dict,
'upload_date': '20131206' 'upload_date': '20091007',
'timestamp': 1254942511,
} }
} }]
]
def get_subtitles_for_language(self, language):
return [{
'ext': type,
'url': language['subtitles_uri'].replace('format=json', 'format=' + type)
} for type in ['vtt', 'srt', 'json']]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
meta = self._download_json('https://amara.org/api/videos/%s/' % video_id, video_id, query={'format': 'json'}) meta = self._download_json(
'https://amara.org/api/videos/%s/' % video_id,
video_id, query={'format': 'json'})
title = meta['title']
video_url = meta['all_urls'][0]
video_url = meta.get('all_urls')[0] subtitles = {}
subtitles = dict([(language['code'], self.get_subtitles_for_language(language)) for language in meta.get('languages', []) if language['published']]) for language in (meta.get('languages') or []):
subtitles_uri = language.get('subtitles_uri')
if not (subtitles_uri and language.get('published')):
continue
subtitle = subtitles.setdefault(language.get('code') or 'en', [])
for f in ('json', 'srt', 'vtt'):
subtitle.append({
'ext': f,
'url': update_url_query(subtitles_uri, {'format': f}),
})
return { info = {
'_type': 'url_transparent',
'url': video_url, 'url': video_url,
'id': video_id, 'id': video_id,
'subtitles': subtitles, 'subtitles': subtitles,
'title': meta['title'], 'title': title,
'description': meta.get('description'), 'description': meta.get('description'),
'thumbnail': meta.get('thumbnail') 'thumbnail': meta.get('thumbnail'),
'duration': int_or_none(meta.get('duration')),
'timestamp': parse_iso8601(meta.get('created')),
} }
for ie in (YoutubeIE, VimeoIE):
if ie.suitable(video_url):
info.update({
'_type': 'url_transparent',
'ie_key': ie.ie_key(),
})
break
return info