You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

313 lines
13KB

  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import uuid
  4. import xml.etree.ElementTree as etree
  5. import json
  6. import re
  7. from .common import InfoExtractor
  8. from .brightcove import BrightcoveNewIE
  9. from ..compat import (
  10. compat_str,
  11. compat_etree_register_namespace,
  12. )
  13. from ..utils import (
  14. determine_ext,
  15. ExtractorError,
  16. extract_attributes,
  17. int_or_none,
  18. merge_dicts,
  19. parse_duration,
  20. smuggle_url,
  21. url_or_none,
  22. xpath_with_ns,
  23. xpath_element,
  24. xpath_text,
  25. )
  class ITVIE(InfoExtractor):
      """Extractor for ITV Hub episode pages (``itv.com/hub/<show>/<id>``).

      Two sources are queried and their results combined:

      * a SOAP ``GetPlaylist`` request posted to the page's
        ``data-playlist-url``, which yields RTMP formats and closed captions;
      * a JSON playlist request (the "iOS" playlist) posted to the page's
        ``data-video-playlist`` / ``data-video-id`` URL, which yields HLS or
        direct formats and subtitles.
      """
      _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
      _GEO_COUNTRIES = ['GB']
      _TESTS = [{
          'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053',
          'info_dict': {
              'id': '2a2936a0053',
              'ext': 'flv',
              'title': 'Home Movie',
          },
          'params': {
              # rtmp download
              'skip_download': True,
          },
      }, {
          # unavailable via data-playlist-url
          'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033',
          'only_matching': True,
      }, {
          # InvalidVodcrid
          'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034',
          'only_matching': True,
      }, {
          # ContentUnavailable
          'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024',
          'only_matching': True,
      }]

      def _real_extract(self, url):
          """Extract formats, subtitles and metadata for one ITV Hub episode.

          Returns the merged info dict (``id``, ``formats``, ``subtitles``
          plus whatever metadata the playlists, JSON-LD and page markup
          provide).

          Raises ExtractorError when the SOAP service reports a fault other
          than InvalidEntity / InvalidVodcrid / ContentUnavailable, and a
          geo-restriction error for InvalidGeoRegion.
          """
          video_id = self._match_id(url)
          webpage = self._download_webpage(url, video_id)
          # All player parameters live as data-* attributes on the element
          # with id="video".
          params = extract_attributes(self._search_regex(
              r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))

          ns_map = {
              'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/',
              'tem': 'http://tempuri.org/',
              'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types',
              'com': 'http://schemas.itv.com/2009/05/Common',
          }
          for ns, full_ns in ns_map.items():
              compat_etree_register_namespace(ns, full_ns)

          def _add_ns(name):
              return xpath_with_ns(name, ns_map)

          def _add_sub_element(element, name):
              return etree.SubElement(element, _add_ns(name))

          # Prefer the explicit autoplay id; otherwise derive the production
          # id from the episode id (or from the URL id with 'a' -> '/').
          production_id = (
              params.get('data-video-autoplay-id')
              or '%s#001' % (
                  params.get('data-video-episode-id')
                  or video_id.replace('a', '/')))

          # Build the SOAP GetPlaylist envelope. Elements created without
          # text are sent empty.
          req_env = etree.Element(_add_ns('soapenv:Envelope'))
          _add_sub_element(req_env, 'soapenv:Header')
          body = _add_sub_element(req_env, 'soapenv:Body')
          get_playlist = _add_sub_element(body, 'tem:GetPlaylist')

          request = _add_sub_element(get_playlist, 'tem:request')
          _add_sub_element(request, 'itv:ProductionId').text = production_id
          _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
          vodcrid = _add_sub_element(request, 'itv:Vodcrid')
          _add_sub_element(vodcrid, 'com:Id')
          _add_sub_element(request, 'itv:Partition')

          user_info = _add_sub_element(get_playlist, 'tem:userInfo')
          _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv'
          _add_sub_element(user_info, 'itv:DM')
          _add_sub_element(user_info, 'itv:RevenueScienceValue')
          _add_sub_element(user_info, 'itv:SessionId')
          _add_sub_element(user_info, 'itv:SsoToken')
          _add_sub_element(user_info, 'itv:UserToken')

          site_info = _add_sub_element(get_playlist, 'tem:siteInfo')
          _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None'
          _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV'
          _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any'
          _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO'
          _add_sub_element(site_info, 'itv:Category')
          _add_sub_element(site_info, 'itv:Platform').text = 'DotCom'
          _add_sub_element(site_info, 'itv:Site').text = 'ItvCom'

          device_info = _add_sub_element(get_playlist, 'tem:deviceInfo')
          _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big'

          player_info = _add_sub_element(get_playlist, 'tem:playerInfo')
          _add_sub_element(player_info, 'itv:Version').text = '2'

          headers = self.geo_verification_headers()
          headers.update({
              'Content-Type': 'text/xml; charset=utf-8',
              'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist',
          })

          # The JSON-LD result seeds both the playlist-derived info and the
          # page-derived fallback below, so parse it once and copy it.
          json_ld = self._search_json_ld(webpage, video_id, default={})
          info = dict(json_ld)
          formats = []
          subtitles = {}

          def extract_subtitle(sub_url):
              ext = determine_ext(sub_url, 'ttml')
              subtitles.setdefault('en', []).append({
                  'url': sub_url,
                  # Captions served with an .xml extension are labelled ttml.
                  'ext': 'ttml' if ext == 'xml' else ext,
              })

          # A page without data-playlist-url simply has no SOAP source; skip
          # the request instead of failing with a KeyError.
          playlist_url = params.get('data-playlist-url')
          resp_env = None
          if playlist_url:
              resp_env = self._download_xml(
                  playlist_url, video_id,
                  headers=headers, data=etree.tostring(req_env), fatal=False)

          # With fatal=False the download may yield a non-element result
          # (presumably False/None on failure -- confirm against
          # InfoExtractor._download_xml). An element without children is
          # skipped as well, matching the previous truth-value test without
          # relying on Element truthiness.
          if resp_env is not None and resp_env is not False and len(resp_env):
              playlist = xpath_element(resp_env, './/Playlist')
              if playlist is None:
                  fault_code = xpath_text(resp_env, './/faultcode')
                  fault_string = xpath_text(resp_env, './/faultstring')
                  if fault_code == 'InvalidGeoRegion':
                      self.raise_geo_restricted(
                          msg=fault_string, countries=self._GEO_COUNTRIES)
                  elif fault_code not in (
                          'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
                      raise ExtractorError(
                          '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
                  # Tolerated faults: fall back to page metadata and carry on
                  # to the JSON playlist below.
                  info.update({
                      'title': self._og_search_title(webpage),
                      'episode_title': params.get('data-video-episode'),
                      'series': params.get('data-video-title'),
                  })
              else:
                  title = xpath_text(playlist, 'EpisodeTitle', default=None)
                  info.update({
                      'title': title,
                      'episode_title': title,
                      'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
                      'series': xpath_text(playlist, 'ProgrammeTitle'),
                      'duration': parse_duration(xpath_text(playlist, 'Duration')),
                  })
                  video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
                  media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
                  rtmp_url = media_files.attrib['base']
                  # The app name and stripped base URL depend only on the
                  # base attribute, so compute them once for all media files.
                  app = self._search_regex(
                      r'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
                  format_url = rtmp_url.split('?', 1)[0] if app else rtmp_url
                  for media_file in media_files.findall('MediaFile'):
                      play_path = xpath_text(media_file, 'URL')
                      if not play_path:
                          continue
                      # presumably converts bps to kbps -- confirm the meaning
                      # of int_or_none's second positional argument
                      tbr = int_or_none(media_file.get('bitrate'), 1000)
                      f = {
                          'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
                          'url': format_url,
                          'play_path': play_path,
                          # Providing this swfVfy allows to avoid truncated downloads
                          'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
                          'page_url': url,
                          'tbr': tbr,
                          'ext': 'flv',
                      }
                      if app:
                          f['app'] = app
                      formats.append(f)

                  for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
                      if caption_url.text:
                          extract_subtitle(caption_url.text)

          ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
          hmac_value = params.get('data-video-hmac')
          # data-video-id is only usable here when it holds an absolute URL.
          if ios_playlist_url and hmac_value and re.match(r'https?://', ios_playlist_url):
              headers = self.geo_verification_headers()
              headers.update({
                  'Accept': 'application/vnd.itv.vod.playlist.v2+json',
                  'Content-Type': 'application/json',
                  'hmac': hmac_value.upper(),
              })
              ios_playlist = self._download_json(
                  ios_playlist_url, video_id, data=json.dumps({
                      'user': {
                          'itvUserId': '',
                          'entitlements': [],
                          'token': ''
                      },
                      'device': {
                          'manufacturer': 'Safari',
                          'model': '5',
                          'os': {
                              'name': 'Windows NT',
                              'version': '6.1',
                              'type': 'desktop'
                          }
                      },
                      'client': {
                          'version': '4.1',
                          'id': 'browser'
                      },
                      'variantAvailability': {
                          'featureset': {
                              'min': ['hls', 'aes', 'outband-webvtt'],
                              'max': ['hls', 'aes', 'outband-webvtt']
                          },
                          'platformTag': 'dotcom'
                      }
                  }).encode(), headers=headers, fatal=False)

              # Navigate defensively: any level may be missing, null or of
              # an unexpected type in the service response.
              playlist_data = (
                  ios_playlist.get('Playlist')
                  if isinstance(ios_playlist, dict) else None)
              video_data = (
                  playlist_data.get('Video')
                  if isinstance(playlist_data, dict) else None)
              if isinstance(video_data, dict):
                  ios_base_url = video_data.get('Base')
                  for media_file in video_data.get('MediaFiles') or []:
                      if not isinstance(media_file, dict):
                          continue
                      href = media_file.get('Href')
                      if not href:
                          continue
                      if ios_base_url:
                          href = ios_base_url + href
                      ext = determine_ext(href)
                      if ext == 'm3u8':
                          formats.extend(self._extract_m3u8_formats(
                              href, video_id, 'mp4', entry_protocol='m3u8_native',
                              m3u8_id='hls', fatal=False))
                      else:
                          formats.append({
                              'url': href,
                          })
                  subs = video_data.get('Subtitles')
                  if isinstance(subs, list):
                      for sub in subs:
                          if not isinstance(sub, dict):
                              continue
                          href = url_or_none(sub.get('Href'))
                          if href:
                              extract_subtitle(href)
                  # Only fill in the duration when the SOAP playlist did not.
                  if not info.get('duration'):
                      info['duration'] = parse_duration(video_data.get('Duration'))

          self._sort_formats(formats)

          info.update({
              'id': video_id,
              'formats': formats,
              'subtitles': subtitles,
          })

          # Page-derived metadata fills the gaps left by the playlists; the
          # title falls back through episode heading, og:title, twitter:title
          # and finally the JSON-LD episode name (if there is one).
          webpage_info = dict(json_ld)
          if not webpage_info.get('title'):
              webpage_info['title'] = self._html_search_regex(
                  r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
                  webpage, 'title', default=None) or self._og_search_title(
                  webpage, default=None) or self._html_search_meta(
                  'twitter:title', webpage, 'title',
                  default=None) or webpage_info.get('episode')

          return merge_dicts(info, webpage_info)
  class ITVBTCCIE(InfoExtractor):
      """Playlist extractor for ITV BTCC pages (``itv.com/btcc/...``).

      Each ``data-video-id`` found in the page becomes one playlist entry
      that is delegated to the Brightcove extractor.
      """
      _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
      _TEST = {
          'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
          'info_dict': {
              'id': 'btcc-2018-all-the-action-from-brands-hatch',
              'title': 'BTCC 2018: All the action from Brands Hatch',
          },
          'playlist_mincount': 9,
      }
      BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'

      def _real_extract(self, url):
          """Return a playlist of Brightcove entries for the videos on the page."""
          playlist_id = self._match_id(url)
          webpage = self._download_webpage(url, playlist_id)

          def _brightcove_entry(brightcove_id):
              # A fresh options dict is built per entry and smuggled into
              # the player URL for the Brightcove extractor to pick up.
              player_url = smuggle_url(
                  self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {
                      # ITV rejects some GB IP ranges; these are blocks
                      # it is known to accept.
                      'geo_ip_blocks': [
                          '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21'
                      ],
                      'referrer': url,
                  })
              return self.url_result(
                  player_url, ie=BrightcoveNewIE.ie_key(),
                  video_id=brightcove_id)

          entries = []
          for brightcove_id in re.findall(r'data-video-id=["\'](\d+)', webpage):
              entries.append(_brightcove_entry(brightcove_id))

          playlist_title = self._og_search_title(webpage, fatal=False)
          return self.playlist_result(entries, playlist_id, playlist_title)