[cinchcast] Add new extractor (Fixes #4428)

This commit is contained in:
Philipp Hagemeister 2014-12-12 02:57:36 +01:00
parent 4e40de6e2a
commit 42bdd9d051
5 changed files with 88 additions and 6 deletions

View file

@ -144,6 +144,9 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011') self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
self.assertEqual(unified_strdate('1968-12-10'), '19681210') self.assertEqual(unified_strdate('1968-12-10'), '19681210')
self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128') self.assertEqual(unified_strdate('28/01/2014 21:00:00 +0100'), '20140128')
self.assertEqual(
unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False),
'20141126')
def test_find_xpath_attr(self): def test_find_xpath_attr(self):
testxml = '''<root> testxml = '''<root>

View file

@ -51,6 +51,7 @@ from .cbsnews import CBSNewsIE
from .ceskatelevize import CeskaTelevizeIE from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE from .chilloutzone import ChilloutzoneIE
from .cinchcast import CinchcastIE
from .clipfish import ClipfishIE from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE from .clipsyndicate import ClipsyndicateIE

View file

@ -0,0 +1,53 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
unified_strdate,
xpath_text,
)
class CinchcastIE(InfoExtractor):
    """Extractor for Cinchcast player URLs that carry an ``assetId``.

    The asset id taken from the URL is used to download an MRSS document
    from blogtalkradio.com; title, date and media URLs are read from its
    first ``item`` element.
    """

    _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
    _TEST = {
        # Actual test is run in generic, look for undergroundwellness
        'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
        'only_matching': True,
    }

    def _real_extract(self, url):
        """Build the info dict (id, title, upload_date, formats) for *url*."""
        video_id = self._match_id(url)

        feed_url = 'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id
        feed = self._download_xml(feed_url, video_id)
        entry = feed.find('.//item')

        # The title is mandatory (fatal=True); the date is optional.
        title = xpath_text(entry, './title', fatal=True)
        raw_date = xpath_text(
            entry, './{http://developer.longtailvideo.com/trac/}date')
        # The date is parsed month-first (day_first=False).
        upload_date = unified_strdate(raw_date, day_first=False)

        # duration is present but wrong

        media_node = entry.find('./{http://search.yahoo.com/mrss/}content')
        formats = [{
            'url': media_node.attrib['url'],
            'format_id': 'main',
        }]

        backup_url = xpath_text(
            entry, './{http://developer.longtailvideo.com/trac/}backupContent')
        if backup_url:
            backup_format = {
                'url': backup_url,
                'format_id': 'backup',
                'preference': 2,  # seems to be more reliable
            }
            formats.append(backup_format)

        self._sort_formats(formats)

        info = {
            'id': video_id,
            'title': title,
            'upload_date': upload_date,
            'formats': formats,
        }
        return info

View file

@ -467,8 +467,17 @@ class GenericIE(InfoExtractor):
'expected_warnings': [ 'expected_warnings': [
'URL could be a direct video link, returning it as such.' 'URL could be a direct video link, returning it as such.'
] ]
} },
# Cinchcast embed
{
'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
'info_dict': {
'id': '7141703',
'ext': 'mp3',
'upload_date': '20141126',
'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
}
},
] ]
def report_following_redirect(self, new_url): def report_following_redirect(self, new_url):
@ -962,6 +971,13 @@ class GenericIE(InfoExtractor):
if mobj is not None: if mobj is not None:
return self.url_result(mobj.group('url'), 'SBS') return self.url_result(mobj.group('url'), 'SBS')
# Look for embedded Cinchcast player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'Cinchcast')
mobj = re.search( mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
webpage) webpage)

View file

@ -166,7 +166,7 @@ def xpath_text(node, xpath, name=None, fatal=False):
xpath = xpath.encode('ascii') xpath = xpath.encode('ascii')
n = node.find(xpath) n = node.find(xpath)
if n is None: if n is None or n.text is None:
if fatal: if fatal:
name = xpath if name is None else name name = xpath if name is None else name
raise ExtractorError('Could not find XML element %s' % name) raise ExtractorError('Could not find XML element %s' % name)
@ -644,17 +644,19 @@ def parse_iso8601(date_str, delimiter='T'):
return calendar.timegm(dt.timetuple()) return calendar.timegm(dt.timetuple())
def unified_strdate(date_str): def unified_strdate(date_str, day_first=True):
"""Return a string with the date in the format YYYYMMDD""" """Return a string with the date in the format YYYYMMDD"""
if date_str is None: if date_str is None:
return None return None
upload_date = None upload_date = None
# Replace commas # Replace commas
date_str = date_str.replace(',', ' ') date_str = date_str.replace(',', ' ')
# %z (UTC offset) is only supported in python>=3.2 # %z (UTC offset) is only supported in python>=3.2
date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
# Remove AM/PM + timezone
date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
format_expressions = [ format_expressions = [
'%d %B %Y', '%d %B %Y',
'%d %b %Y', '%d %b %Y',
@ -669,7 +671,6 @@ def unified_strdate(date_str):
'%d/%m/%Y', '%d/%m/%Y',
'%d/%m/%y', '%d/%m/%y',
'%Y/%m/%d %H:%M:%S', '%Y/%m/%d %H:%M:%S',
'%d/%m/%Y %H:%M:%S',
'%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S.%f',
'%d.%m.%Y %H:%M', '%d.%m.%Y %H:%M',
@ -681,6 +682,14 @@ def unified_strdate(date_str):
'%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M:%S.%f',
'%Y-%m-%dT%H:%M', '%Y-%m-%dT%H:%M',
] ]
if day_first:
format_expressions.extend([
'%d/%m/%Y %H:%M:%S',
])
else:
format_expressions.extend([
'%m/%d/%Y %H:%M:%S',
])
for expression in format_expressions: for expression in format_expressions:
try: try:
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')