# coding: utf-8
from __future__ import unicode_literals

import base64
import functools
import hashlib
import itertools
import json
import random
import re
import string

from .common import InfoExtractor
from ..compat import compat_struct_pack
from ..utils import (
    determine_ext,
    error_to_compat_str,
    ExtractorError,
    int_or_none,
    mimetype2ext,
    OnDemandPagedList,
    parse_iso8601,
    sanitized_Request,
    str_to_int,
    try_get,
    unescapeHTML,
    update_url_query,
    url_or_none,
    urlencode_postdata,
)


class DailymotionBaseInfoExtractor(InfoExtractor):
    @staticmethod
    def _build_request(url):
        """Build a request with the family filter disabled"""
        request = sanitized_Request(url)
        request.add_header('Cookie', 'family_filter=off; ff=off')
        return request

    def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
        request = self._build_request(url)
        return self._download_webpage_handle(request, *args, **kwargs)

    def _download_webpage_no_ff(self, url, *args, **kwargs):
        request = self._build_request(url)
        return self._download_webpage(request, *args, **kwargs)


class DailymotionIE(DailymotionBaseInfoExtractor):
    _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)'
    IE_NAME = 'dailymotion'

    _FORMATS = [
        ('stream_h264_ld_url', 'ld'),
        ('stream_h264_url', 'standard'),
        ('stream_h264_hq_url', 'hq'),
        ('stream_h264_hd_url', 'hd'),
        ('stream_h264_hd1080_url', 'hd180'),
    ]

    _TESTS = [{
        'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
        'md5': '074b95bdee76b9e3654137aee9c79dfe',
        'info_dict': {
            'id': 'x5kesuj',
            'ext': 'mp4',
            'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller',
            'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller',
            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
            'duration': 187,
            'timestamp': 1493651285,
            'upload_date': '20170501',
            'uploader': 'Deadline',
            'uploader_id': 'x1xm8ri',
            'age_limit': 0,
        },
    }, {
        'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
        'md5': '2137c41a8e78554bb09225b8eb322406',
        'info_dict': {
            'id': 'x2iuewm',
            'ext': 'mp4',
            'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
            'description': 'Several come bundled with the Steam Controller.',
            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
            'duration': 74,
            'timestamp': 1425657362,
            'upload_date': '20150306',
            'uploader': 'IGN',
            'uploader_id': 'xijv66',
            'age_limit': 0,
            'view_count': int,
        },
        'skip': 'video gone',
    }, {
        # Vevo video
        'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
        'info_dict': {
            'title': 'Roar (Official)',
            'id': 'USUV71301934',
            'ext': 'mp4',
            'uploader': 'Katy Perry',
            'upload_date': '20130905',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'VEVO is only available in some countries',
    }, {
        # age-restricted video
        'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
        'md5': '0d667a7b9cebecc3c89ee93099c4159d',
        'info_dict': {
            'id': 'xyh2zz',
            'ext': 'mp4',
            'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
            'uploader': 'HotWaves1012',
            'age_limit': 18,
        },
        'skip': 'video gone',
    }, {
        # geo-restricted, player v5
        'url': 'http://www.dailymotion.com/video/xhza0o',
        'only_matching': True,
    }, {
        # with subtitles
        'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
        'only_matching': True,
    }, {
        'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
        'only_matching': True,
    }, {
        'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        # Look for embedded Dailymotion player
        matches = re.findall(
            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
        return list(map(lambda m: unescapeHTML(m[1]), matches))

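    # Extraction order below: the "player v5" JSON config embedded in the watch
    # page, then a Vevo embed fallback, and finally the legacy embed-page player.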
    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage_no_ff(
            'https://www.dailymotion.com/video/%s' % video_id, video_id)

        age_limit = self._rta_search(webpage)

        description = self._og_search_description(
            webpage, default=None) or self._html_search_meta(
            'description', webpage, 'description')

        view_count_str = self._search_regex(
            (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',
             r'video_views_count[^>]+>\s+([\s\d\,.]+)'),
            webpage, 'view count', default=None)
        if view_count_str:
            view_count_str = re.sub(r'\s', '', view_count_str)
        view_count = str_to_int(view_count_str)
        comment_count = int_or_none(self._search_regex(
            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
            webpage, 'comment count', default=None))

        player_v5 = self._search_regex(
            [r'buildPlayer\(({.+?})\);\n',  # See https://github.com/ytdl-org/youtube-dl/issues/7826
             r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
             r'buildPlayer\(({.+?})\);',
             r'var\s+config\s*=\s*({.+?});',
             # New layout regex (see https://github.com/ytdl-org/youtube-dl/issues/13580)
             r'__PLAYER_CONFIG__\s*=\s*({.+?});'],
            webpage, 'player v5', default=None)
        if player_v5:
            player = self._parse_json(player_v5, video_id, fatal=False) or {}
            metadata = try_get(player, lambda x: x['metadata'], dict)
            if not metadata:
                metadata_url = url_or_none(try_get(
                    player, lambda x: x['context']['metadata_template_url1']))
                if metadata_url:
                    metadata_url = metadata_url.replace(':videoId', video_id)
                else:
                    metadata_url = update_url_query(
                        'https://www.dailymotion.com/player/metadata/video/%s'
                        % video_id, {
                            'embedder': url,
                            'integration': 'inline',
                            'GK_PV5_NEON': '1',
                        })
                metadata = self._download_json(
                    metadata_url, video_id, 'Downloading metadata JSON')

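            # Password-protected video: re-fetch the metadata from a URL whose
            # path packs an md5 of (password, numeric video id, random salt)
            # together with the salt and the packed id, as built below.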
            if try_get(metadata, lambda x: x['error']['type']) == 'password_protected':
                password = self._downloader.params.get('videopassword')
                if password:
                    r = int(metadata['id'][1:], 36)
                    us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=')
                    t = ''.join(random.choice(string.ascii_letters) for i in range(10))
                    n = us64e(compat_struct_pack('I', r))
                    i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest())
                    metadata = self._download_json(
                        'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id)

            self._check_error(metadata)

            formats = []
            for quality, media_list in metadata['qualities'].items():
                for media in media_list:
                    media_url = media.get('url')
                    if not media_url:
                        continue
                    type_ = media.get('type')
                    if type_ == 'application/vnd.lumberjack.manifest':
                        continue
                    ext = mimetype2ext(type_) or determine_ext(media_url)
                    if ext == 'm3u8':
                        m3u8_formats = self._extract_m3u8_formats(
                            media_url, video_id, 'mp4', preference=-1,
                            m3u8_id='hls', fatal=False)
                        for f in m3u8_formats:
                            f['url'] = f['url'].split('#')[0]
                            formats.append(f)
                    elif ext == 'f4m':
                        formats.extend(self._extract_f4m_formats(
                            media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
                    else:
                        f = {
                            'url': media_url,
                            'format_id': 'http-%s' % quality,
                            'ext': ext,
                        }
                        m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
                        if m:
                            f.update({
                                'width': int(m.group('width')),
                                'height': int(m.group('height')),
                            })
                        formats.append(f)
            self._sort_formats(formats)

            title = metadata['title']
            duration = int_or_none(metadata.get('duration'))
            timestamp = int_or_none(metadata.get('created_time'))
            thumbnail = metadata.get('poster_url')
            uploader = metadata.get('owner', {}).get('screenname')
            uploader_id = metadata.get('owner', {}).get('id')

            subtitles = {}
            subtitles_data = metadata.get('subtitles', {}).get('data', {})
            if subtitles_data and isinstance(subtitles_data, dict):
                for subtitle_lang, subtitle in subtitles_data.items():
                    subtitles[subtitle_lang] = [{
                        'ext': determine_ext(subtitle_url),
                        'url': subtitle_url,
                    } for subtitle_url in subtitle.get('urls', [])]

            return {
                'id': video_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'timestamp': timestamp,
                'uploader': uploader,
                'uploader_id': uploader_id,
                'age_limit': age_limit,
                'view_count': view_count,
                'comment_count': comment_count,
                'formats': formats,
                'subtitles': subtitles,
            }

        # vevo embed
        vevo_id = self._search_regex(
            r'<link rel="video_src" href="[^"]*?vevo\.com[^"]*?video=(?P<id>[\w]*)',
            webpage, 'vevo embed', default=None)
        if vevo_id:
            return self.url_result('vevo:%s' % vevo_id, 'Vevo')

        # fallback old player
        embed_page = self._download_webpage_no_ff(
            'https://www.dailymotion.com/embed/video/%s' % video_id,
            video_id, 'Downloading embed page')

        timestamp = parse_iso8601(self._html_search_meta(
            'video:release_date', webpage, 'upload date'))

        info = self._parse_json(
            self._search_regex(
                r'var info = ({.*?}),$', embed_page,
                'video info', flags=re.MULTILINE),
            video_id)

        self._check_error(info)

        formats = []
        for (key, format_id) in self._FORMATS:
            video_url = info.get(key)
            if video_url is not None:
                m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
                if m_size is not None:
                    width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
                else:
                    width, height = None, None
                formats.append({
                    'url': video_url,
                    'ext': 'mp4',
                    'format_id': format_id,
                    'width': width,
                    'height': height,
                })
        self._sort_formats(formats)

        # subtitles
        video_subtitles = self.extract_subtitles(video_id, webpage)

        title = self._og_search_title(webpage, default=None)
        if title is None:
            title = self._html_search_regex(
                r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
                'title')

        return {
            'id': video_id,
            'formats': formats,
            'uploader': info['owner.screenname'],
            'timestamp': timestamp,
            'title': title,
            'description': description,
            'subtitles': video_subtitles,
            'thumbnail': info['thumbnail_url'],
            'age_limit': age_limit,
            'view_count': view_count,
            'duration': info['duration']
        }

    def _check_error(self, info):
        error = info.get('error')
        if error:
            title = error.get('title') or error['message']
            # See https://developer.dailymotion.com/api#access-error
            if error.get('code') == 'DM007':
                self.raise_geo_restricted(msg=title)
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, title), expected=True)

    def _get_subtitles(self, video_id, webpage):
        try:
            sub_list = self._download_webpage(
                'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
            return {}
        info = json.loads(sub_list)
        if info['total'] > 0:
            sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list'])
            return sub_lang_list
        self._downloader.report_warning('video doesn\'t have subtitles')
        return {}


class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
    IE_NAME = 'dailymotion:playlist'
    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)'
    _TESTS = [{
        'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
        'info_dict': {
            'title': 'SPORT',
            'id': 'xv4bw',
        },
        'playlist_mincount': 20,
    }]
    _PAGE_SIZE = 100

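    # Playlist entries are fetched page by page from Dailymotion's GraphQL API;
    # OnDemandPagedList in _real_extract calls _fetch_page lazily as pages are needed.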
    def _fetch_page(self, playlist_id, authorization, page):
        page += 1
        videos = self._download_json(
            'https://graphql.api.dailymotion.com',
            playlist_id, 'Downloading page %d' % page,
            data=json.dumps({
                'query': '''{
  collection(xid: "%s") {
    videos(first: %d, page: %d) {
      pageInfo {
        hasNextPage
        nextPage
      }
      edges {
        node {
          xid
          url
        }
      }
    }
  }
}''' % (playlist_id, self._PAGE_SIZE, page)
            }).encode(), headers={
                'Authorization': authorization,
                'Origin': 'https://www.dailymotion.com',
            })['data']['collection']['videos']
        for edge in videos['edges']:
            node = edge['node']
            yield self.url_result(
                node['url'], DailymotionIE.ie_key(), node['xid'])

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        api = self._parse_json(self._search_regex(
            r'__PLAYER_CONFIG__\s*=\s*({.+?});',
            webpage, 'player config'), playlist_id)['context']['api']
        auth = self._download_json(
            api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'),
            playlist_id, data=urlencode_postdata({
                'client_id': api.get('client_id', 'f1a362d288c1b98099c7'),
                'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'),
                'grant_type': 'client_credentials',
            }))
        authorization = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token'])
        entries = OnDemandPagedList(functools.partial(
            self._fetch_page, playlist_id, authorization), self._PAGE_SIZE)
        return self.playlist_result(
            entries, playlist_id,
            self._og_search_title(webpage))


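# Unlike playlists, user pages are still scraped from the paginated HTML: pages
# are walked until one repeats or the "next" arrow indicator disappears.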
class DailymotionUserIE(DailymotionBaseInfoExtractor):
    IE_NAME = 'dailymotion:user'
    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
    _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
    _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
    _TESTS = [{
        'url': 'https://www.dailymotion.com/user/nqtv',
        'info_dict': {
            'id': 'nqtv',
            'title': 'Rémi Gaillard',
        },
        'playlist_mincount': 100,
    }, {
        'url': 'http://www.dailymotion.com/user/UnderProject',
        'info_dict': {
            'id': 'UnderProject',
            'title': 'UnderProject',
        },
        'playlist_mincount': 1800,
        'expected_warnings': [
            'Stopped at duplicated page',
        ],
        'skip': 'Takes too long',
    }]

    def _extract_entries(self, id):
        video_ids = set()
        processed_urls = set()
        for pagenum in itertools.count(1):
            page_url = self._PAGE_TEMPLATE % (id, pagenum)
            webpage, urlh = self._download_webpage_handle_no_ff(
                page_url, id, 'Downloading page %s' % pagenum)
            if urlh.geturl() in processed_urls:
                self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
                    page_url, urlh.geturl()), id)
                break
            processed_urls.add(urlh.geturl())
            for video_id in re.findall(r'data-xid="(.+?)"', webpage):
                if video_id not in video_ids:
                    yield self.url_result(
                        'http://www.dailymotion.com/video/%s' % video_id,
                        DailymotionIE.ie_key(), video_id)
                    video_ids.add(video_id)
            if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
                break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        user = mobj.group('user')
        webpage = self._download_webpage(
            'https://www.dailymotion.com/user/%s' % user, user)

        full_user = unescapeHTML(self._html_search_regex(
            r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
            webpage, 'user'))

        return {
            '_type': 'playlist',
            'id': user,
            'title': full_user,
            'entries': self._extract_entries(user),
        }