[cinchcast] Add new extractor (Fixes #4428)
This commit is contained in:
parent
4e40de6e2a
commit
42bdd9d051
5 changed files with 88 additions and 6 deletions
|
@ -144,6 +144,9 @@ class TestUtil(unittest.TestCase):
|
||||||
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
|
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
|
||||||
self.assertEqual(unified_strdate('1968-12-10'), '19681210')
|
self.assertEqual(unified_strdate('1968-12-10'), '19681210')
|
||||||
self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
|
self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
|
||||||
|
self.assertEqual(
|
||||||
|
unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False),
|
||||||
|
'20141126')
|
||||||
|
|
||||||
def test_find_xpath_attr(self):
|
def test_find_xpath_attr(self):
|
||||||
testxml = '''<root>
|
testxml = '''<root>
|
||||||
|
|
|
@ -51,6 +51,7 @@ from .cbsnews import CBSNewsIE
|
||||||
from .ceskatelevize import CeskaTelevizeIE
|
from .ceskatelevize import CeskaTelevizeIE
|
||||||
from .channel9 import Channel9IE
|
from .channel9 import Channel9IE
|
||||||
from .chilloutzone import ChilloutzoneIE
|
from .chilloutzone import ChilloutzoneIE
|
||||||
|
from .cinchcast import CinchcastIE
|
||||||
from .clipfish import ClipfishIE
|
from .clipfish import ClipfishIE
|
||||||
from .cliphunter import CliphunterIE
|
from .cliphunter import CliphunterIE
|
||||||
from .clipsyndicate import ClipsyndicateIE
|
from .clipsyndicate import ClipsyndicateIE
|
||||||
|
|
53
youtube_dl/extractor/cinchcast.py
Normal file
53
youtube_dl/extractor/cinchcast.py
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .common import InfoExtractor
|
||||||
|
from ..utils import (
|
||||||
|
int_or_none,
|
||||||
|
unified_strdate,
|
||||||
|
xpath_text,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class CinchcastIE(InfoExtractor):
    """Extractor for assets served through the player.cinchcast.com player.

    The asset id is read from the ``assetId`` query parameter of the player
    URL; the metadata itself comes from an MRSS document requested from
    blogtalkradio.com.
    """

    _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
    _TEST = {
        # Actual test is run in generic, look for undergroundwellness
        'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703',
        'only_matching': True,
    }

    def _real_extract(self, url):
        """Return the info dict (id, title, upload_date, formats) for ``url``.

        The first ``item`` element of the downloaded MRSS document supplies
        every field.  A missing title is fatal (``xpath_text`` is called with
        ``fatal=True``); the date and the backup URL are optional.
        """
        asset_id = self._match_id(url)
        feed = self._download_xml(
            'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % asset_id,
            asset_id)

        entry = feed.find('.//item')
        title = xpath_text(entry, './title', fatal=True)

        # The feed's date is parsed month-first, hence day_first=False.
        raw_date = xpath_text(
            entry, './{http://developer.longtailvideo.com/trac/}date')
        upload_date = unified_strdate(raw_date, day_first=False)

        # duration is present but wrong

        # The primary stream URL is the "url" attribute of the media content
        # element; indexing .attrib raises if that attribute is absent.
        content_node = entry.find('./{http://search.yahoo.com/mrss/}content')
        formats = [{
            'format_id': 'main',
            'url': content_node.attrib['url'],
        }]

        backup_url = xpath_text(
            entry, './{http://developer.longtailvideo.com/trac/}backupContent')
        if backup_url:
            backup_format = {
                'preference': 2,  # seems to be more reliable
                'format_id': 'backup',
                'url': backup_url,
            }
            formats.append(backup_format)

        self._sort_formats(formats)

        info = {
            'id': asset_id,
            'title': title,
            'upload_date': upload_date,
            'formats': formats,
        }
        return info
|
|
@ -467,8 +467,17 @@ class GenericIE(InfoExtractor):
|
||||||
'expected_warnings': [
|
'expected_warnings': [
|
||||||
'URL could be a direct video link, returning it as such.'
|
'URL could be a direct video link, returning it as such.'
|
||||||
]
|
]
|
||||||
}
|
},
|
||||||
|
# Cinchcast embed
|
||||||
|
{
|
||||||
|
'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '7141703',
|
||||||
|
'ext': 'mp3',
|
||||||
|
'upload_date': '20141126',
|
||||||
|
'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
|
||||||
|
}
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
def report_following_redirect(self, new_url):
|
def report_following_redirect(self, new_url):
|
||||||
|
@ -962,6 +971,13 @@ class GenericIE(InfoExtractor):
|
||||||
if mobj is not None:
|
if mobj is not None:
|
||||||
return self.url_result(mobj.group('url'), 'SBS')
|
return self.url_result(mobj.group('url'), 'SBS')
|
||||||
|
|
||||||
|
# Look for embedded Cinchcast player
|
||||||
|
mobj = re.search(
|
||||||
|
r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
|
||||||
|
webpage)
|
||||||
|
if mobj is not None:
|
||||||
|
return self.url_result(mobj.group('url'), 'Cinchcast')
|
||||||
|
|
||||||
mobj = re.search(
|
mobj = re.search(
|
||||||
r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
|
r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
|
||||||
webpage)
|
webpage)
|
||||||
|
|
|
@ -166,7 +166,7 @@ def xpath_text(node, xpath, name=None, fatal=False):
|
||||||
xpath = xpath.encode('ascii')
|
xpath = xpath.encode('ascii')
|
||||||
|
|
||||||
n = node.find(xpath)
|
n = node.find(xpath)
|
||||||
if n is None:
|
if n is None or n.text is None:
|
||||||
if fatal:
|
if fatal:
|
||||||
name = xpath if name is None else name
|
name = xpath if name is None else name
|
||||||
raise ExtractorError('Could not find XML element %s' % name)
|
raise ExtractorError('Could not find XML element %s' % name)
|
||||||
|
@ -644,17 +644,19 @@ def parse_iso8601(date_str, delimiter='T'):
|
||||||
return calendar.timegm(dt.timetuple())
|
return calendar.timegm(dt.timetuple())
|
||||||
|
|
||||||
|
|
||||||
def unified_strdate(date_str):
|
def unified_strdate(date_str, day_first=True):
|
||||||
"""Return a string with the date in the format YYYYMMDD"""
|
"""Return a string with the date in the format YYYYMMDD"""
|
||||||
|
|
||||||
if date_str is None:
|
if date_str is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
upload_date = None
|
upload_date = None
|
||||||
# Replace commas
|
# Replace commas
|
||||||
date_str = date_str.replace(',', ' ')
|
date_str = date_str.replace(',', ' ')
|
||||||
# %z (UTC offset) is only supported in python>=3.2
|
# %z (UTC offset) is only supported in python>=3.2
|
||||||
date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
|
date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
|
||||||
|
# Remove AM/PM + timezone
|
||||||
|
date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
|
||||||
|
|
||||||
format_expressions = [
|
format_expressions = [
|
||||||
'%d %B %Y',
|
'%d %B %Y',
|
||||||
'%d %b %Y',
|
'%d %b %Y',
|
||||||
|
@ -669,7 +671,6 @@ def unified_strdate(date_str):
|
||||||
'%d/%m/%Y',
|
'%d/%m/%Y',
|
||||||
'%d/%m/%y',
|
'%d/%m/%y',
|
||||||
'%Y/%m/%d %H:%M:%S',
|
'%Y/%m/%d %H:%M:%S',
|
||||||
'%d/%m/%Y %H:%M:%S',
|
|
||||||
'%Y-%m-%d %H:%M:%S',
|
'%Y-%m-%d %H:%M:%S',
|
||||||
'%Y-%m-%d %H:%M:%S.%f',
|
'%Y-%m-%d %H:%M:%S.%f',
|
||||||
'%d.%m.%Y %H:%M',
|
'%d.%m.%Y %H:%M',
|
||||||
|
@ -681,6 +682,14 @@ def unified_strdate(date_str):
|
||||||
'%Y-%m-%dT%H:%M:%S.%f',
|
'%Y-%m-%dT%H:%M:%S.%f',
|
||||||
'%Y-%m-%dT%H:%M',
|
'%Y-%m-%dT%H:%M',
|
||||||
]
|
]
|
||||||
|
if day_first:
|
||||||
|
format_expressions.extend([
|
||||||
|
'%d/%m/%Y %H:%M:%S',
|
||||||
|
])
|
||||||
|
else:
|
||||||
|
format_expressions.extend([
|
||||||
|
'%m/%d/%Y %H:%M:%S',
|
||||||
|
])
|
||||||
for expression in format_expressions:
|
for expression in format_expressions:
|
||||||
try:
|
try:
|
||||||
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
|
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
|
||||||
|
|
Loading…
Reference in a new issue