[mediasite] Improve extraction and code style, add support for DASH (closes #11185, closes #14343, refs #5428)

This commit is contained in:
Sergey M․ 2017-12-30 07:28:18 +07:00
parent 8056c8542d
commit 2ca7ed41fe
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
2 changed files with 100 additions and 57 deletions

View file

@ -100,6 +100,7 @@ from .megaphone import MegaphoneIE
from .vzaar import VzaarIE from .vzaar import VzaarIE
from .channel9 import Channel9IE from .channel9 import Channel9IE
from .vshare import VShareIE from .vshare import VShareIE
from .mediasite import MediasiteIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -1925,6 +1926,18 @@ class GenericIE(InfoExtractor):
'title': 'vl14062007715967', 'title': 'vl14062007715967',
'ext': 'mp4', 'ext': 'mp4',
} }
},
{
'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/',
'md5': 'aecd089f55b1cb5a59032cb049d3a356',
'info_dict': {
'id': '90227f51a80c4d8f86c345a7fa62bd9a1d',
'ext': 'mp4',
'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare',
'description': 'md5:5a51db84a62def7b7054df2ade403c6c',
'timestamp': 1474354800,
'upload_date': '20160920',
}
} }
# { # {
# # TODO: find another test # # TODO: find another test
@ -2884,14 +2897,14 @@ class GenericIE(InfoExtractor):
vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) vshare_urls, video_id, video_title, ie=VShareIE.ie_key())
# Look for Mediasite embeds # Look for Mediasite embeds
mobj = re.search(r'''(?xi) mediasite_urls = MediasiteIE._extract_urls(webpage)
<iframe[^>]+src="((?:https?://[a-z0-9\-\.:\[\]]+)? if mediasite_urls:
/Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)" entries = [
''', webpage) self.url_result(smuggle_url(
if mobj is not None: compat_urlparse.urljoin(url, mediasite_url),
return self.url_result(smuggle_url( {'UrlReferrer': url}), ie=MediasiteIE.ie_key())
compat_urlparse.urljoin(url, unescapeHTML(mobj.group(1))), for mediasite_url in mediasite_urls]
{ 'UrlReferrer': url }), 'Livestream') return self.playlist_result(entries, video_id, video_title)
def merge_dicts(dict1, dict2): def merge_dicts(dict1, dict2):
merged = {} merged = {}

View file

@ -5,21 +5,22 @@ import re
import json import json
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urlparse from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
unsmuggle_url,
mimetype2ext,
float_or_none, float_or_none,
mimetype2ext,
unescapeHTML,
unsmuggle_url,
urljoin,
) )
class MediasiteIE(InfoExtractor): class MediasiteIE(InfoExtractor):
_VALID_URL = r'''(?xi) _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/Play/(?P<id>[0-9a-f]{32,34})(?P<query>\?[^#]+|)'
https?://[a-z0-9\-\.:\[\]]+/Mediasite/Play/
(?P<id>[0-9a-f]{32,34})
(?P<QueryString>\?[^#]+|)
'''
_TESTS = [ _TESTS = [
{ {
'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d',
@ -87,67 +88,96 @@ class MediasiteIE(InfoExtractor):
# look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) # look in Mediasite.Core.js (Mediasite.ContentStreamType[*])
_STREAM_TYPES = { _STREAM_TYPES = {
0: 'video1', # the main video 0: 'video1', # the main video
2: 'slide', 2: 'slide',
3: 'presentation', 3: 'presentation',
4: 'video2', # screencast? 4: 'video2', # screencast?
5: 'video3', 5: 'video3',
} }
@staticmethod
def _extract_urls(webpage):
    """Collect Mediasite player URLs embedded in *webpage* via iframes.

    An embed is an ``<iframe>`` whose quoted ``src`` attribute points at
    ``/Mediasite/Play/<32-34 hex digits>``, optionally preceded by a
    scheme and/or host and optionally followed by a query string.
    Matching is case-insensitive.

    Returns a list with the HTML-unescaped ``src`` value of every match,
    in document order.  Values are returned as found, so they may be
    host-relative or scheme-relative.
    """
    embed_urls = []
    # The opening quote is captured so that the same quote character is
    # required to close the attribute value (back-reference \1).
    for match in re.finditer(
            r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)\1',
            webpage):
        embed_urls.append(unescapeHTML(match.group('url')))
    return embed_urls
def _real_extract(self, url): def _real_extract(self, url):
url, data = unsmuggle_url(url, {}) url, data = unsmuggle_url(url, {})
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
ResourceId = mobj.group('id') resource_id = mobj.group('id')
QueryString = mobj.group('QueryString') query = mobj.group('query')
webpage = self._download_webpage(url, ResourceId) # XXX: add UrlReferrer? webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer?
redirect_url = compat_str(urlh.geturl())
# XXX: might have also extracted UrlReferrer and QueryString from the html # XXX: might have also extracted UrlReferrer and QueryString from the html
ServicePath = compat_urlparse.urljoin(url, self._html_search_regex( service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex(
r'<div id="ServicePath">(.+?)</div>', webpage, ResourceId, r'<div[^>]+\bid=["\']ServicePath[^>]+>(.+?)</div>', webpage, resource_id,
default='/Mediasite/PlayerService/PlayerService.svc/json')) default='/Mediasite/PlayerService/PlayerService.svc/json'))
PlayerOptions = self._download_json( player_options = self._download_json(
'%s/GetPlayerOptions' % (ServicePath), ResourceId, '%s/GetPlayerOptions' % service_path, resource_id,
headers={ headers={
'Content-type': 'application/json; charset=utf-8', 'Content-type': 'application/json; charset=utf-8',
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
}, },
data=json.dumps({ data=json.dumps({
'getPlayerOptionsRequest': { 'getPlayerOptionsRequest': {
'ResourceId': ResourceId, 'ResourceId': resource_id,
'QueryString': QueryString, 'QueryString': query,
'UrlReferrer': data.get('UrlReferrer', ''), 'UrlReferrer': data.get('UrlReferrer', ''),
'UseScreenReader': False, 'UseScreenReader': False,
} }
}).encode('utf-8')) }).encode('utf-8'))['d']
Presentation = PlayerOptions['d']['Presentation']
if Presentation is None: presentation = player_options['Presentation']
raise ExtractorError('Mediasite says: %s' % title = presentation['Title']
(PlayerOptions['d']['PlayerPresentationStatusMessage'],),
if presentation is None:
raise ExtractorError(
'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'],
expected=True) expected=True)
thumbnails = [] thumbnails = []
formats = [] formats = []
for snum, Stream in enumerate(Presentation['Streams']): for snum, Stream in enumerate(presentation['Streams']):
stream_type = self._STREAM_TYPES.get( stream_type = Stream.get('StreamType')
Stream['StreamType'], 'type%u' % Stream['StreamType']) if stream_type is None:
continue
video_urls = Stream.get('VideoUrls')
if not isinstance(video_urls, list):
video_urls = []
stream_id = self._STREAM_TYPES.get(
stream_type, 'type%u' % stream_type)
stream_formats = [] stream_formats = []
for unum, VideoUrl in enumerate(Stream['VideoUrls']): for unum, VideoUrl in enumerate(video_urls):
url = VideoUrl['Location'] video_url = VideoUrl.get('Location')
if not video_url or not isinstance(video_url, compat_str):
continue
# XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS
if VideoUrl['MediaType'] == 'SS': media_type = VideoUrl.get('MediaType')
if media_type == 'SS':
stream_formats.extend(self._extract_ism_formats( stream_formats.extend(self._extract_ism_formats(
url, ResourceId, ism_id='%s-%u.%u' % (stream_type, snum, unum))) video_url, resource_id,
continue ism_id='%s-%u.%u' % (stream_id, snum, unum),
fatal=False))
stream_formats.append({ elif media_type == 'Dash':
'format_id': '%s-%u.%u' % (stream_type, snum, unum), stream_formats.extend(self._extract_mpd_formats(
'url': url, video_url, resource_id,
'ext': mimetype2ext(VideoUrl['MimeType']), mpd_id='%s-%u.%u' % (stream_id, snum, unum),
}) fatal=False))
else:
stream_formats.append({
'format_id': '%s-%u.%u' % (stream_id, snum, unum),
'url': video_url,
'ext': mimetype2ext(VideoUrl.get('MimeType')),
})
# TODO: if Stream['HasSlideContent']: # TODO: if Stream['HasSlideContent']:
# synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum) # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum)
@ -155,16 +185,16 @@ class MediasiteIE(InfoExtractor):
# this will require writing a custom downloader... # this will require writing a custom downloader...
# disprefer 'secondary' streams # disprefer 'secondary' streams
if Stream['StreamType'] != 0: if stream_type != 0:
for fmt in stream_formats: for fmt in stream_formats:
fmt['preference'] = -1 fmt['preference'] = -1
ThumbnailUrl = Stream.get('ThumbnailUrl') thumbnail_url = Stream.get('ThumbnailUrl')
if ThumbnailUrl: if thumbnail_url:
thumbnails.append({ thumbnails.append({
'id': '%s-%u' % (stream_type, snum), 'id': '%s-%u' % (stream_id, snum),
'url': compat_urlparse.urljoin(url, ThumbnailUrl), 'url': urljoin(redirect_url, thumbnail_url),
'preference': -1 if Stream['StreamType'] != 0 else 0, 'preference': -1 if stream_type != 0 else 0,
}) })
formats.extend(stream_formats) formats.extend(stream_formats)
@ -174,11 +204,11 @@ class MediasiteIE(InfoExtractor):
# XXX: Presentation['Transcript'] # XXX: Presentation['Transcript']
return { return {
'id': ResourceId, 'id': resource_id,
'title': Presentation['Title'], 'title': title,
'description': Presentation.get('Description'), 'description': presentation.get('Description'),
'duration': float_or_none(Presentation.get('Duration'), 1000), 'duration': float_or_none(presentation.get('Duration'), 1000),
'timestamp': float_or_none(Presentation.get('UnixTime'), 1000), 'timestamp': float_or_none(presentation.get('UnixTime'), 1000),
'formats': formats, 'formats': formats,
'thumbnails': thumbnails, 'thumbnails': thumbnails,
} }