From fdf229a55d678c9c194e482e28a3b55a106e337c Mon Sep 17 00:00:00 2001
From: Joaopa
Date: Sat, 2 Dec 2023 08:00:32 -1000
Subject: [PATCH] [script.module.youtube.dl] 23.12.01

---
 script.module.youtube.dl/addon.xml | 2 +-
 .../lib/youtube_dl/YoutubeDL.py | 404 +++-
 .../lib/youtube_dl/aes.py | 39 +-
 .../lib/youtube_dl/cache.py | 36 +-
 .../lib/youtube_dl/casefold.py | 1667 +++++++++++++++++
 .../lib/youtube_dl/compat.py | 454 ++++-
 .../lib/youtube_dl/downloader/__init__.py | 28 +-
 .../lib/youtube_dl/downloader/common.py | 28 +-
 .../lib/youtube_dl/downloader/dash.py | 40 +-
 .../lib/youtube_dl/downloader/external.py | 198 +-
 .../lib/youtube_dl/downloader/fragment.py | 46 +-
 .../lib/youtube_dl/downloader/http.py | 18 +-
 .../lib/youtube_dl/downloader/niconico.py | 66 +
 .../lib/youtube_dl/downloader/rtmp.py | 10 +-
 .../lib/youtube_dl/extractor/adn.py | 57 +-
 .../lib/youtube_dl/extractor/aenetworks.py | 27 +-
 .../lib/youtube_dl/extractor/aliexpress.py | 2 +-
 .../lib/youtube_dl/extractor/alsace20tv.py | 89 +
 .../extractor/americastestkitchen.py | 115 +-
 .../lib/youtube_dl/extractor/appleconnect.py | 13 +-
 .../lib/youtube_dl/extractor/applepodcasts.py | 54 +-
 .../lib/youtube_dl/extractor/ard.py | 59 +-
 .../lib/youtube_dl/extractor/arnes.py | 101 +
 .../lib/youtube_dl/extractor/arte.py | 47 +
 .../lib/youtube_dl/extractor/audiomack.py | 42 +-
 .../lib/youtube_dl/extractor/bandcamp.py | 4 +-
 .../lib/youtube_dl/extractor/bbc.py | 293 ++-
 .../lib/youtube_dl/extractor/bigo.py | 59 +
 .../lib/youtube_dl/extractor/bilibili.py | 8 +-
 .../lib/youtube_dl/extractor/blerp.py | 173 ++
 .../lib/youtube_dl/extractor/bongacams.py | 18 +-
 .../lib/youtube_dl/extractor/callin.py | 74 +
 .../lib/youtube_dl/extractor/cammodels.py | 34 +-
 .../lib/youtube_dl/extractor/cbs.py | 5 +-
 .../lib/youtube_dl/extractor/cbsnews.py | 2 +-
 .../lib/youtube_dl/extractor/cbssports.py | 123 +-
 .../lib/youtube_dl/extractor/cda.py | 6 +-
 .../lib/youtube_dl/extractor/ceskatelevize.py | 170 +-
 .../lib/youtube_dl/extractor/clipchamp.py | 69 +
 .../lib/youtube_dl/extractor/common.py | 173 +-
 .../lib/youtube_dl/extractor/cpac.py | 148 ++
 .../youtube_dl/extractor/curiositystream.py | 108 +-
 .../lib/youtube_dl/extractor/dispeak.py | 46 +-
 .../lib/youtube_dl/extractor/dlf.py | 204 ++
 .../lib/youtube_dl/extractor/egghead.py | 16 +-
 .../lib/youtube_dl/extractor/eroprofile.py | 21 +-
 .../lib/youtube_dl/extractor/extractors.py | 108 +-
 .../lib/youtube_dl/extractor/facebook.py | 5 +-
 .../lib/youtube_dl/extractor/fifa.py | 101 +
 .../lib/youtube_dl/extractor/filemoon.py | 43 +
 .../lib/youtube_dl/extractor/formula1.py | 32 +-
 .../lib/youtube_dl/extractor/francetv.py | 7 +-
 .../lib/youtube_dl/extractor/fujitv.py | 2 +-
 .../lib/youtube_dl/extractor/funimation.py | 6 +-
 .../lib/youtube_dl/extractor/gdcvault.py | 34 +-
 .../lib/youtube_dl/extractor/generic.py | 268 ++-
 .../lib/youtube_dl/extractor/globalplayer.py | 273 +++
 .../lib/youtube_dl/extractor/go.py | 46 +-
 .../lib/youtube_dl/extractor/hrfernsehen.py | 101 +
 .../lib/youtube_dl/extractor/ign.py | 337 +++-
 .../lib/youtube_dl/extractor/infoq.py | 9 +-
 .../lib/youtube_dl/extractor/instagram.py | 29 +-
 .../lib/youtube_dl/extractor/itv.py | 380 +++-
 .../lib/youtube_dl/extractor/jamendo.py | 74 +-
 .../lib/youtube_dl/extractor/kaltura.py | 16 +-
 .../lib/youtube_dl/extractor/kommunetv.py | 35 +
 .../lib/youtube_dl/extractor/kth.py | 31 +
 .../lib/youtube_dl/extractor/lbry.py | 81 +-
 .../lib/youtube_dl/extractor/line.py | 142 +-
 .../lib/youtube_dl/extractor/manyvids.py | 123 +-
 .../lib/youtube_dl/extractor/maoritv.py | 31 +
 .../lib/youtube_dl/extractor/medaltv.py | 20 +-
 .../lib/youtube_dl/extractor/mediaset.py | 6 +-
 .../lib/youtube_dl/extractor/minds.py | 2 +-
 .../lib/youtube_dl/extractor/mlb.py | 189 +-
 .../lib/youtube_dl/extractor/motherless.py | 42 +-
 .../lib/youtube_dl/extractor/mtv.py | 9 +-
 .../lib/youtube_dl/extractor/myspass.py | 4 +-
 .../lib/youtube_dl/extractor/myvideoge.py | 87 +
 .../lib/youtube_dl/extractor/ndr.py | 101 +-
 .../lib/youtube_dl/extractor/neteasemusic.py | 159 +-
 .../lib/youtube_dl/extractor/nhk.py | 21 +-
 .../lib/youtube_dl/extractor/niconico.py | 686 +++++--
 .../lib/youtube_dl/extractor/nrk.py | 5 +-
 .../lib/youtube_dl/extractor/nuvid.py | 120 +-
 .../lib/youtube_dl/extractor/openload.py | 4 +-
 .../lib/youtube_dl/extractor/orf.py | 26 +-
 .../lib/youtube_dl/extractor/palcomp3.py | 148 ++
 .../lib/youtube_dl/extractor/peekvids.py | 193 ++
 .../lib/youtube_dl/extractor/peertube.py | 23 +-
 .../lib/youtube_dl/extractor/periscope.py | 8 +-
 .../lib/youtube_dl/extractor/phoenix.py | 51 +-
 .../lib/youtube_dl/extractor/picarto.py | 100 +-
 .../lib/youtube_dl/extractor/pinterest.py | 4 +-
 .../lib/youtube_dl/extractor/playstuff.py | 65 +
 .../lib/youtube_dl/extractor/pluralsight.py | 2 +-
 .../lib/youtube_dl/extractor/pornhub.py | 94 +-
 .../lib/youtube_dl/extractor/pr0gramm.py | 105 ++
 .../lib/youtube_dl/extractor/rai.py | 122 +-
 .../lib/youtube_dl/extractor/rbgtum.py | 97 +
 .../lib/youtube_dl/extractor/redbulltv.py | 6 +-
 .../lib/youtube_dl/extractor/rtve.py | 232 +--
 .../lib/youtube_dl/extractor/rutv.py | 5 +-
 .../lib/youtube_dl/extractor/s4c.py | 124 ++
 .../lib/youtube_dl/extractor/sbs.py | 5 +-
 .../youtube_dl/extractor/screencastomatic.py | 48 +-
 .../lib/youtube_dl/extractor/shahid.py | 28 +-
 .../lib/youtube_dl/extractor/shared.py | 9 +-
 .../lib/youtube_dl/extractor/southpark.py | 16 +-
 .../youtube_dl/extractor/sportdeutschland.py | 145 +-
 .../lib/youtube_dl/extractor/streamcz.py | 171 +-
 .../lib/youtube_dl/extractor/streamsb.py | 61 +
 .../lib/youtube_dl/extractor/svt.py | 23 +-
 .../lib/youtube_dl/extractor/ted.py | 24 +-
 .../lib/youtube_dl/extractor/tele5.py | 86 +-
 .../lib/youtube_dl/extractor/telegraaf.py | 4 +-
 .../lib/youtube_dl/extractor/telewebion.py | 47 +-
 .../lib/youtube_dl/extractor/thisvid.py | 218 +++
 .../lib/youtube_dl/extractor/tiktok.py | 7 +-
 .../lib/youtube_dl/extractor/trovo.py | 1 +
 .../lib/youtube_dl/extractor/tv2dk.py | 32 +-
 .../lib/youtube_dl/extractor/tver.py | 36 +-
 .../lib/youtube_dl/extractor/twitch.py | 30 +-
 .../lib/youtube_dl/extractor/twitter.py | 4 +
 .../lib/youtube_dl/extractor/uktvplay.py | 2 +-
 .../lib/youtube_dl/extractor/umg.py | 8 +-
 .../lib/youtube_dl/extractor/uol.py | 1 -
 .../lib/youtube_dl/extractor/urplay.py | 52 +-
 .../lib/youtube_dl/extractor/ustream.py | 2 +-
 .../lib/youtube_dl/extractor/vgtv.py | 6 +
 .../lib/youtube_dl/extractor/videa.py | 57 +-
 .../lib/youtube_dl/extractor/viki.py | 339 ++--
 .../lib/youtube_dl/extractor/vimeo.py | 359 ++--
 .../lib/youtube_dl/extractor/vk.py | 11 +
 .../lib/youtube_dl/extractor/vlive.py | 2 +-
 .../lib/youtube_dl/extractor/voxmedia.py | 26 +-
 .../lib/youtube_dl/extractor/vvvvid.py | 40 +-
 .../lib/youtube_dl/extractor/wat.py | 2 +-
 .../lib/youtube_dl/extractor/wdr.py | 60 +-
 .../lib/youtube_dl/extractor/whyp.py | 55 +
 .../lib/youtube_dl/extractor/xfileshare.py | 4 +
 .../lib/youtube_dl/extractor/xhamster.py | 25 +-
 .../lib/youtube_dl/extractor/xtube.py | 51 +-
 .../lib/youtube_dl/extractor/xvideos.py | 2 +-
 .../lib/youtube_dl/extractor/youku.py | 2 +-
 .../lib/youtube_dl/extractor/youporn.py | 97 +-
 .../lib/youtube_dl/extractor/youtube.py | 1423 ++++++++++----
 .../lib/youtube_dl/extractor/zdf.py | 203 +-
 .../lib/youtube_dl/extractor/zingmp3.py | 208 +-
 .../lib/youtube_dl/extractor/zoom.py | 68 +
 .../postprocessor/embedthumbnail.py | 7 +-
 .../lib/youtube_dl/postprocessor/ffmpeg.py | 18 +-
 .../postprocessor/metadatafromtitle.py | 2 +
 153 files changed, 11676 insertions(+), 3021 deletions(-)
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/casefold.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/downloader/niconico.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/alsace20tv.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/arnes.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/bigo.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/blerp.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/callin.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/clipchamp.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/cpac.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/dlf.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/fifa.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/filemoon.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/globalplayer.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/hrfernsehen.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/kommunetv.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/kth.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/maoritv.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/myvideoge.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/palcomp3.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/peekvids.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/playstuff.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/pr0gramm.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/rbgtum.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/s4c.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/streamsb.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/thisvid.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/whyp.py
 create mode 100644 script.module.youtube.dl/lib/youtube_dl/extractor/zoom.py

diff --git a/script.module.youtube.dl/addon.xml b/script.module.youtube.dl/addon.xml
index 8a87f1438..9f89e12e2 100644
--- a/script.module.youtube.dl/addon.xml
+++ b/script.module.youtube.dl/addon.xml
@@ -1,5 +1,5 @@
-
+
diff --git a/script.module.youtube.dl/lib/youtube_dl/YoutubeDL.py b/script.module.youtube.dl/lib/youtube_dl/YoutubeDL.py
index ecac31f7a..13a41928f 100644
--- a/script.module.youtube.dl/lib/youtube_dl/YoutubeDL.py
+++ b/script.module.youtube.dl/lib/youtube_dl/YoutubeDL.py
@@ -4,11 +4,9 @@ from __future__ import absolute_import, unicode_literals
 import collections
-import contextlib
 import copy
 import datetime
 import errno
-import fileinput
 import io
 import itertools
 import json
@@ -26,25 +24,38 @@
 import traceback
 import random
+try:
+    from ssl import OPENSSL_VERSION
+except ImportError:
+    # Must be Python 2.6, should be built against 1.0.2
+    OPENSSL_VERSION = 'OpenSSL 1.0.2(?)'
 from string import
ascii_letters from .compat import ( compat_basestring, - compat_cookiejar, + compat_collections_chain_map as ChainMap, + compat_filter as filter, compat_get_terminal_size, compat_http_client, + compat_http_cookiejar_Cookie, + compat_http_cookies_SimpleCookie, + compat_integer_types, compat_kwargs, + compat_map as map, compat_numeric_types, + compat_open as open, compat_os_name, compat_str, compat_tokenize_tokenize, compat_urllib_error, + compat_urllib_parse, compat_urllib_request, compat_urllib_request_DataHandler, ) from .utils import ( age_restricted, args_to_str, + bug_reports_message, ContentTooShortError, date_from_str, DateRange, @@ -62,7 +73,9 @@ GeoRestrictedError, int_or_none, ISO3166Utils, + join_nonempty, locked_file, + LazyList, make_HTTPS_handler, MaxDownloadsReached, orderedSet, @@ -73,6 +86,7 @@ PostProcessingError, preferredencoding, prepend_extension, + process_communicate_or_kill, register_socks_protocols, render_table, replace_extension, @@ -84,6 +98,7 @@ std_headers, str_or_none, subtitles_filename, + traverse_obj, UnavailableVideoError, url_basename, version_tuple, @@ -93,6 +108,7 @@ YoutubeDLCookieProcessor, YoutubeDLHandler, YoutubeDLRedirectHandler, + ytdl_is_updateable, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER @@ -362,6 +378,9 @@ def __init__(self, params=None, auto_init=True): self.params.update(params) self.cache = Cache(self) + self._header_cookies = [] + self._load_cookies_from_headers(self.params.get('http_headers')) + def check_deprecated(param, option, suggestion): if self.params.get(param) is not None: self.report_warning( @@ -568,7 +587,7 @@ def __exit__(self, *args): if self.params.get('cookiefile') is not None: self.cookiejar.save(ignore_discard=True, ignore_expires=True) - def trouble(self, message=None, tb=None): + def trouble(self, *args, **kwargs): """Determine action to take when a download problem appears. Depending on if the downloader has been configured to ignore @@ -577,6 +596,11 @@ def trouble(self, message=None, tb=None): tb, if given, is additional traceback information. 
""" + # message=None, tb=None, is_error=True + message = args[0] if len(args) > 0 else kwargs.get('message', None) + tb = args[1] if len(args) > 1 else kwargs.get('tb', None) + is_error = args[2] if len(args) > 2 else kwargs.get('is_error', True) + if message is not None: self.to_stderr(message) if self.params.get('verbose'): @@ -589,7 +613,10 @@ def trouble(self, message=None, tb=None): else: tb_data = traceback.format_list(traceback.extract_stack()) tb = ''.join(tb_data) - self.to_stderr(tb) + if tb: + self.to_stderr(tb) + if not is_error: + return if not self.params.get('ignoreerrors', False): if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: exc_info = sys.exc_info()[1].exc_info @@ -598,11 +625,18 @@ def trouble(self, message=None, tb=None): raise DownloadError(message, exc_info) self._download_retcode = 1 - def report_warning(self, message): + def report_warning(self, message, only_once=False, _cache={}): ''' Print the message to stderr, it will be prefixed with 'WARNING:' If stderr is a tty file the 'WARNING:' will be colored ''' + if only_once: + m_hash = hash((self, message)) + m_cnt = _cache.setdefault(m_hash, 0) + _cache[m_hash] = m_cnt + 1 + if m_cnt > 0: + return + if self.params.get('logger') is not None: self.params['logger'].warning(message) else: @@ -615,7 +649,7 @@ def report_warning(self, message): warning_message = '%s %s' % (_msg_header, message) self.to_stderr(warning_message) - def report_error(self, message, tb=None): + def report_error(self, message, *args, **kwargs): ''' Do the same as trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. @@ -624,8 +658,18 @@ def report_error(self, message, tb=None): _msg_header = '\033[0;31mERROR:\033[0m' else: _msg_header = 'ERROR:' - error_message = '%s %s' % (_msg_header, message) - self.trouble(error_message, tb) + kwargs['message'] = '%s %s' % (_msg_header, message) + self.trouble(*args, **kwargs) + + def report_unscoped_cookies(self, *args, **kwargs): + # message=None, tb=False, is_error=False + if len(args) <= 2: + kwargs.setdefault('is_error', False) + if len(args) <= 0: + kwargs.setdefault( + 'message', + 'Unscoped cookies are not allowed: please specify some sort of scoping') + self.report_error(*args, **kwargs) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" @@ -720,7 +764,7 @@ def prepare_filename(self, info_dict): filename = encodeFilename(filename, True).decode(preferredencoding()) return sanitize_path(filename) except ValueError as err: - self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') + self.report_error('Error in output template: ' + error_to_compat_str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None def _match_entry(self, info_dict, incomplete): @@ -773,11 +817,20 @@ def add_extra_info(info_dict, extra_info): def extract_info(self, url, download=True, ie_key=None, extra_info={}, process=True, force_generic_extractor=False): - ''' - Returns a list with a dictionary for each video we find. - If 'download', also downloads the videos. - extra_info is a dict containing the extra values to add to each result - ''' + """ + Return a list with a dictionary for each video extracted. 
+ + Arguments: + url -- URL to extract + + Keyword arguments: + download -- whether to download videos during extraction + ie_key -- extractor key hint + extra_info -- dictionary containing the extra values to add to each result + process -- whether to resolve all unresolved references (URLs, playlist items), + must be True for download to work. + force_generic_extractor -- force using the generic extractor + """ if not ie_key and force_generic_extractor: ie_key = 'Generic' @@ -812,7 +865,7 @@ def wrapper(self, *args, **kwargs): msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' self.report_error(msg) except ExtractorError as e: # An error we somewhat expected - self.report_error(compat_str(e), e.format_traceback()) + self.report_error(compat_str(e), tb=e.format_traceback()) except MaxDownloadsReached: raise except Exception as e: @@ -822,8 +875,83 @@ def wrapper(self, *args, **kwargs): raise return wrapper + def _remove_cookie_header(self, http_headers): + """Filters out `Cookie` header from an `http_headers` dict + The `Cookie` header is removed to prevent leaks as a result of unscoped cookies. + See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj + + @param http_headers An `http_headers` dict from which any `Cookie` header + should be removed, or None + """ + return dict(filter(lambda pair: pair[0].lower() != 'cookie', (http_headers or {}).items())) + + def _load_cookies(self, data, **kwargs): + """Loads cookies from a `Cookie` header + + This tries to work around the security vulnerability of passing cookies to every domain. + + @param data The Cookie header as a string to load the cookies from + @param autoscope If `False`, scope cookies using Set-Cookie syntax and error for cookie without domains + If `True`, save cookies for later to be stored in the jar with a limited scope + If a URL, save cookies in the jar with the domain of the URL + """ + # autoscope=True (kw-only) + autoscope = kwargs.get('autoscope', True) + + for cookie in compat_http_cookies_SimpleCookie(data).values() if data else []: + if autoscope and any(cookie.values()): + raise ValueError('Invalid syntax in Cookie Header') + + domain = cookie.get('domain') or '' + expiry = cookie.get('expires') + if expiry == '': # 0 is valid so we check for `''` explicitly + expiry = None + prepared_cookie = compat_http_cookiejar_Cookie( + cookie.get('version') or 0, cookie.key, cookie.value, None, False, + domain, True, True, cookie.get('path') or '', bool(cookie.get('path')), + bool(cookie.get('secure')), expiry, False, None, None, {}) + + if domain: + self.cookiejar.set_cookie(prepared_cookie) + elif autoscope is True: + self.report_warning( + 'Passing cookies as a header is a potential security risk; ' + 'they will be scoped to the domain of the downloaded urls. ' + 'Please consider loading cookies from a file or browser instead.', + only_once=True) + self._header_cookies.append(prepared_cookie) + elif autoscope: + self.report_warning( + 'The extractor result contains an unscoped cookie as an HTTP header. 
' + 'If you are specifying an input URL, ' + bug_reports_message(), + only_once=True) + self._apply_header_cookies(autoscope, [prepared_cookie]) + else: + self.report_unscoped_cookies() + + def _load_cookies_from_headers(self, headers): + self._load_cookies(traverse_obj(headers, 'cookie', casesense=False)) + + def _apply_header_cookies(self, url, cookies=None): + """This method applies stray header cookies to the provided url + + This loads header cookies and scopes them to the domain provided in `url`. + While this is not ideal, it helps reduce the risk of them being sent to + an unintended destination. + """ + parsed = compat_urllib_parse.urlparse(url) + if not parsed.hostname: + return + + for cookie in map(copy.copy, cookies or self._header_cookies): + cookie.domain = '.' + parsed.hostname + self.cookiejar.set_cookie(cookie) + @__handle_extraction_exceptions def __extract_info(self, url, ie, download, extra_info, process): + # Compat with passing cookies in http headers + self._apply_header_cookies(url) + ie_result = ie.extract(url) if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) return @@ -849,7 +977,7 @@ def add_default_extra_info(self, ie_result, ie, url): def process_ie_result(self, ie_result, download=True, extra_info={}): """ - Take the result of the ie(may be modified) and resolve all unresolved + Take the result of the ie (may be modified) and resolve all unresolved references (URLs, playlist items). It will also download the videos if 'download'. @@ -1376,17 +1504,16 @@ def _merge(formats_info): 'abr': formats_info[1].get('abr'), 'ext': output_ext, } - video_selector, audio_selector = map(_build_selector_function, selector.selector) def selector_function(ctx): - for pair in itertools.product( - video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))): + selector_fn = lambda x: _build_selector_function(x)(ctx) + for pair in itertools.product(*map(selector_fn, selector.selector)): yield _merge(pair) filters = [self._build_format_filter(f) for f in selector.filters] def final_selector(ctx): - ctx_copy = copy.deepcopy(ctx) + ctx_copy = dict(ctx) for _filter in filters: ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) return selector_function(ctx_copy) @@ -1421,23 +1548,45 @@ def restore_last_token(self): parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) return _build_selector_function(parsed_selector) - def _calc_headers(self, info_dict): - res = std_headers.copy() - - add_headers = info_dict.get('http_headers') - if add_headers: - res.update(add_headers) + def _calc_headers(self, info_dict, load_cookies=False): + if load_cookies: # For --load-info-json + # load cookies from http_headers in legacy info.json + self._load_cookies(traverse_obj(info_dict, ('http_headers', 'Cookie'), casesense=False), + autoscope=info_dict['url']) + # load scoped cookies from info.json + self._load_cookies(info_dict.get('cookies'), autoscope=False) - cookies = self._calc_cookies(info_dict) + cookies = self.cookiejar.get_cookies_for_url(info_dict['url']) if cookies: - res['Cookie'] = cookies + # Make a string like name1=val1; attr1=a_val1; ...name2=val2; ... 
+ # By convention a cookie name can't be a well-known attribute name + # so this syntax is unambiguous and can be parsed by (eg) SimpleCookie + encoder = compat_http_cookies_SimpleCookie() + values = [] + attributes = (('Domain', '='), ('Path', '='), ('Secure',), ('Expires', '='), ('Version', '=')) + attributes = tuple([x[0].lower()] + list(x) for x in attributes) + for cookie in cookies: + _, value = encoder.value_encode(cookie.value) + # Py 2 '' --> '', Py 3 '' --> '""' + if value == '': + value = '""' + values.append('='.join((cookie.name, value))) + for attr in attributes: + value = getattr(cookie, attr[0], None) + if value: + values.append('%s%s' % (''.join(attr[1:]), value if len(attr) == 3 else '')) + info_dict['cookies'] = '; '.join(values) + + res = std_headers.copy() + res.update(info_dict.get('http_headers') or {}) + res = self._remove_cookie_header(res) if 'X-Forwarded-For' not in res: x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip') if x_forwarded_for_ip: res['X-Forwarded-For'] = x_forwarded_for_ip - return res + return res or None def _calc_cookies(self, info_dict): pr = sanitized_Request(info_dict['url']) @@ -1511,14 +1660,18 @@ def sanitize_numeric_fields(info): if 'display_id' not in info_dict and 'id' in info_dict: info_dict['display_id'] = info_dict['id'] - if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around out-of-range timestamp values (e.g. negative ones on Windows, - # see http://bugs.python.org/issue1646728) - try: - upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass + for ts_key, date_key in ( + ('timestamp', 'upload_date'), + ('release_timestamp', 'release_date'), + ): + if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) + info_dict[date_key] = compat_str(upload_date.strftime('%Y%m%d')) + except (ValueError, OverflowError, OSError): + pass # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. 
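An illustrative aside on the `_calc_headers()` hunk above (commentary, not part of the patch): the comment relies on cookie names never colliding with the well-known attribute names (Domain, Path, Secure, Expires, Version), which is what makes the serialized `info_dict['cookies']` string unambiguous. A minimal sketch of that round-trip, assuming Python 3's `http.cookies` (the patch itself goes through `compat_http_cookies_SimpleCookie` for Python 2 support) and made-up cookie names and values:

    from http.cookies import SimpleCookie

    # One scoped cookie followed by an unscoped one, in the
    # "name=value; Attr=...; name2=value2" layout described above.
    cookie_str = 'id=a3fWa; Domain=.example.com; Path=/; Secure; theme=dark'
    jar = SimpleCookie()
    jar.load(cookie_str)

    # Attribute tokens bind to the morsel that precedes them, so 'id'
    # keeps its scope and 'theme' starts a fresh, attribute-less morsel.
    assert jar['id'].value == 'a3fWa'
    assert jar['id']['domain'] == '.example.com'
    assert jar['id']['path'] == '/'
    assert jar['id']['secure']  # flag attribute, truthy when present
    assert jar['theme'].value == 'dark'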
@@ -1556,9 +1709,6 @@ def sanitize_numeric_fields(info): else: formats = info_dict['formats'] - if not formats: - raise ExtractorError('No video formats found!') - def is_wellformed(f): url = f.get('url') if not url: @@ -1571,7 +1721,10 @@ def is_wellformed(f): return True # Filter out malformed formats for better extraction robustness - formats = list(filter(is_wellformed, formats)) + formats = list(filter(is_wellformed, formats or [])) + + if not formats: + raise ExtractorError('No video formats found!') formats_dict = {} @@ -1612,10 +1765,13 @@ def is_wellformed(f): format['protocol'] = determine_protocol(format) # Add HTTP headers, so that external programs can use them from the # json output - full_format_info = info_dict.copy() - full_format_info.update(format) - format['http_headers'] = self._calc_headers(full_format_info) - # Remove private housekeeping stuff + format['http_headers'] = self._calc_headers(ChainMap(format, info_dict), load_cookies=True) + + # Safeguard against old/insecure infojson when using --load-info-json + info_dict['http_headers'] = self._remove_cookie_header( + info_dict.get('http_headers') or {}) or None + + # Remove private housekeeping stuff (copied to http_headers in _calc_headers()) if '__x_forwarded_for_ip' in info_dict: del info_dict['__x_forwarded_for_ip'] @@ -1758,17 +1914,16 @@ def print_optional(field): self.to_stdout(formatSeconds(info_dict['duration'])) print_mandatory('format') if self.params.get('forcejson', False): - self.to_stdout(json.dumps(info_dict)) + self.to_stdout(json.dumps(self.sanitize_info(info_dict))) def process_info(self, info_dict): """Process a single resolved IE result.""" assert info_dict.get('_type', 'video') == 'video' - max_downloads = self.params.get('max_downloads') - if max_downloads is not None: - if self._num_downloads >= int(max_downloads): - raise MaxDownloadsReached() + max_downloads = int_or_none(self.params.get('max_downloads')) or float('inf') + if self._num_downloads >= max_downloads: + raise MaxDownloadsReached() # TODO: backward compatibility, to be removed info_dict['fulltitle'] = info_dict['title'] @@ -1819,7 +1974,7 @@ def ensure_dir_exists(path): else: try: self.to_screen('[info] Writing video description to: ' + descfn) - with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: descfile.write(info_dict['description']) except (OSError, IOError): self.report_error('Cannot write description file ' + descfn) @@ -1834,7 +1989,7 @@ def ensure_dir_exists(path): else: try: self.to_screen('[info] Writing video annotations to: ' + annofn) - with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: annofile.write(info_dict['annotations']) except (KeyError, TypeError): self.report_warning('There are no annotations to write.') @@ -1861,7 +2016,7 @@ def ensure_dir_exists(path): try: # Use newline='' to prevent conversion of newline characters # See https://github.com/ytdl-org/youtube-dl/issues/10268 - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: + with open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: subfile.write(sub_info['data']) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) @@ -1870,36 +2025,41 @@ def ensure_dir_exists(path): try: sub_data = ie._request_webpage( sub_info['url'], info_dict['id'], note=False).read() - with 
io.open(encodeFilename(sub_filename), 'wb') as subfile: + with open(encodeFilename(sub_filename), 'wb') as subfile: subfile.write(sub_data) except (ExtractorError, IOError, OSError, ValueError) as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err))) continue - if self.params.get('writeinfojson', False): - infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): - self.to_screen('[info] Video description metadata is already present') - else: - self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn) - try: - write_json_file(self.filter_requested_info(info_dict), infofn) - except (OSError, IOError): - self.report_error('Cannot write metadata to JSON file ' + infofn) - return + self._write_info_json( + 'video description', info_dict, + replace_extension(filename, 'info.json', info_dict.get('ext'))) self._write_thumbnails(info_dict, filename) if not self.params.get('skip_download', False): try: + def checked_get_suitable_downloader(info_dict, params): + ed_args = params.get('external_downloader_args') + dler = get_suitable_downloader(info_dict, params) + if ed_args and not params.get('external_downloader_args'): + # external_downloader_args was cleared because external_downloader was rejected + self.report_warning('Requested external downloader cannot be used: ' + 'ignoring --external-downloader-args.') + return dler + def dl(name, info): - fd = get_suitable_downloader(info, self.params)(self, self.params) + fd = checked_get_suitable_downloader(info, self.params)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): self.to_screen('[debug] Invoking downloader on %r' % info.get('url')) - return fd.download(name, info) + + new_info = dict((k, v) for k, v in info.items() if not k.startswith('__p')) + new_info['http_headers'] = self._calc_headers(new_info) + + return fd.download(name, new_info) if info_dict.get('requested_formats') is not None: downloaded = [] @@ -2036,9 +2196,12 @@ def compatible_formats(formats): try: self.post_process(filename, info_dict) except (PostProcessingError) as err: - self.report_error('postprocessing: %s' % str(err)) + self.report_error('postprocessing: %s' % error_to_compat_str(err)) return self.record_download_archive(info_dict) + # avoid possible nugatory search for further items (PR #26638) + if self._num_downloads >= max_downloads: + raise MaxDownloadsReached() def download(self, url_list): """Download a given list of URLs.""" @@ -2061,16 +2224,13 @@ def download(self, url_list): raise else: if self.params.get('dump_single_json', False): - self.to_stdout(json.dumps(res)) + self.to_stdout(json.dumps(self.sanitize_info(res))) return self._download_retcode def download_with_info_file(self, info_filename): - with contextlib.closing(fileinput.FileInput( - [info_filename], mode='r', - openhook=fileinput.hook_encoded('utf-8'))) as f: - # FileInput doesn't have a read method, we can't call json.load - info = self.filter_requested_info(json.loads('\n'.join(f))) + with open(info_filename, encoding='utf-8') as f: + info = self.filter_requested_info(json.load(f)) try: self.process_ie_result(info, download=True) except DownloadError: @@ -2083,10 +2243,36 @@ def download_with_info_file(self, info_filename): return self._download_retcode @staticmethod - def filter_requested_info(info_dict): - return dict( - (k, v) for k, v in info_dict.items() - 
if k not in ['requested_formats', 'requested_subtitles']) + def sanitize_info(info_dict, remove_private_keys=False): + ''' Sanitize the infodict for converting to json ''' + if info_dict is None: + return info_dict + + if remove_private_keys: + reject = lambda k, v: (v is None + or k.startswith('__') + or k in ('requested_formats', + 'requested_subtitles')) + else: + reject = lambda k, v: False + + def filter_fn(obj): + if isinstance(obj, dict): + return dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v)) + elif isinstance(obj, (list, tuple, set, LazyList)): + return list(map(filter_fn, obj)) + elif obj is None or any(isinstance(obj, c) + for c in (compat_integer_types, + (compat_str, float, bool))): + return obj + else: + return repr(obj) + + return filter_fn(info_dict) + + @classmethod + def filter_requested_info(cls, info_dict): + return cls.sanitize_info(info_dict, True) def post_process(self, filename, ie_info): """Run all the postprocessors on the given file.""" @@ -2293,18 +2479,21 @@ def print_debug_header(self): self.get_encoding())) write_string(encoding_str, encoding=None) - self._write_string('[debug] youtube-dl version ' + __version__ + '\n') + writeln_debug = lambda *s: self._write_string('[debug] %s\n' % (''.join(s), )) + writeln_debug('youtube-dl version ', __version__) if _LAZY_LOADER: - self._write_string('[debug] Lazy loading extractors enabled' + '\n') + writeln_debug('Lazy loading extractors enabled') + if ytdl_is_updateable(): + writeln_debug('Single file build') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate() + out, err = process_communicate_or_kill(sp) out = out.decode().strip() if re.match('[0-9a-f]+', out): - self._write_string('[debug] Git HEAD: ' + out + '\n') + writeln_debug('Git HEAD: ', out) except Exception: try: sys.exc_clear() @@ -2317,9 +2506,22 @@ def python_implementation(): return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] return impl_name - self._write_string('[debug] Python version %s (%s) - %s\n' % ( - platform.python_version(), python_implementation(), - platform_name())) + def libc_ver(): + try: + return platform.libc_ver() + except OSError: # We may not have access to the executable + return [] + + libc = join_nonempty(*libc_ver(), delim=' ') + writeln_debug('Python %s (%s %s %s) - %s - %s%s' % ( + platform.python_version(), + python_implementation(), + platform.machine(), + platform.architecture()[0], + platform_name(), + OPENSSL_VERSION, + (' - %s' % (libc, )) if libc else '' + )) exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() @@ -2331,17 +2533,17 @@ def python_implementation(): ) if not exe_str: exe_str = 'none' - self._write_string('[debug] exe versions: %s\n' % exe_str) + writeln_debug('exe versions: %s' % (exe_str, )) proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): proxy_map.update(handler.proxies) - self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') + writeln_debug('Proxy map: ', compat_str(proxy_map)) if self.params.get('call_home', False): ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') - self._write_string('[debug] Public IP address: %s\n' % ipaddr) + writeln_debug('Public IP address: %s' % (ipaddr, )) latest_version = self.urlopen( 'https://yt-dl.org/latest/version').read().decode('utf-8') if version_tuple(latest_version) > 
version_tuple(__version__): @@ -2358,7 +2560,7 @@ def _setup_opener(self): opts_proxy = self.params.get('proxy') if opts_cookiefile is None: - self.cookiejar = compat_cookiejar.CookieJar() + self.cookiejar = YoutubeDLCookieJar() else: opts_cookiefile = expand_path(opts_cookiefile) self.cookiejar = YoutubeDLCookieJar(opts_cookiefile) @@ -2419,6 +2621,28 @@ def get_encoding(self): encoding = preferredencoding() return encoding + def _write_info_json(self, label, info_dict, infofn, overwrite=None): + if not self.params.get('writeinfojson', False): + return False + + def msg(fmt, lbl): + return fmt % (lbl + ' metadata',) + + if overwrite is None: + overwrite = not self.params.get('nooverwrites', False) + + if not overwrite and os.path.exists(encodeFilename(infofn)): + self.to_screen(msg('[info] %s is already present', label.title())) + return 'exists' + else: + self.to_screen(msg('[info] Writing %s as JSON to: ' + infofn, label)) + try: + write_json_file(self.filter_requested_info(info_dict), infofn) + return True + except (OSError, IOError): + self.report_error(msg('Cannot write %s to JSON file ' + infofn, label)) + return + def _write_thumbnails(self, info_dict, filename): if self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') diff --git a/script.module.youtube.dl/lib/youtube_dl/aes.py b/script.module.youtube.dl/lib/youtube_dl/aes.py index 461bb6d41..a94a41079 100644 --- a/script.module.youtube.dl/lib/youtube_dl/aes.py +++ b/script.module.youtube.dl/lib/youtube_dl/aes.py @@ -8,6 +8,18 @@ BLOCK_SIZE_BYTES = 16 +def pkcs7_padding(data): + """ + PKCS#7 padding + + @param {int[]} data cleartext + @returns {int[]} padding data + """ + + remaining_length = BLOCK_SIZE_BYTES - len(data) % BLOCK_SIZE_BYTES + return data + [remaining_length] * remaining_length + + def aes_ctr_decrypt(data, key, counter): """ Decrypt with aes in counter mode @@ -76,8 +88,7 @@ def aes_cbc_encrypt(data, key, iv): previous_cipher_block = iv for i in range(block_count): block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - remaining_length = BLOCK_SIZE_BYTES - len(block) - block += [remaining_length] * remaining_length + block = pkcs7_padding(block) mixed_block = xor(block, previous_cipher_block) encrypted_block = aes_encrypt(mixed_block, expanded_key) @@ -88,6 +99,28 @@ def aes_cbc_encrypt(data, key, iv): return encrypted_data +def aes_ecb_encrypt(data, key): + """ + Encrypt with aes in ECB mode. 
Using PKCS#7 padding + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block = pkcs7_padding(block) + + encrypted_block = aes_encrypt(block, expanded_key) + encrypted_data += encrypted_block + + return encrypted_data + + def key_expansion(data): """ Generate key schedule @@ -303,7 +336,7 @@ def xor(data1, data2): def rijndael_mul(a, b): - if(a == 0 or b == 0): + if (a == 0 or b == 0): return 0 return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF] diff --git a/script.module.youtube.dl/lib/youtube_dl/cache.py b/script.module.youtube.dl/lib/youtube_dl/cache.py index 7bdade1bd..54123da0e 100644 --- a/script.module.youtube.dl/lib/youtube_dl/cache.py +++ b/script.module.youtube.dl/lib/youtube_dl/cache.py @@ -1,21 +1,32 @@ from __future__ import unicode_literals import errno -import io import json import os import re import shutil import traceback -from .compat import compat_getenv +from .compat import ( + compat_getenv, + compat_open as open, +) from .utils import ( + error_to_compat_str, expand_path, + is_outdated_version, + try_get, write_json_file, ) +from .version import __version__ class Cache(object): + + _YTDL_DIR = 'youtube-dl' + _VERSION_KEY = _YTDL_DIR + '_version' + _DEFAULT_VERSION = '2021.12.17' + def __init__(self, ydl): self._ydl = ydl @@ -23,7 +34,7 @@ def _get_root_dir(self): res = self._ydl.params.get('cachedir') if res is None: cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') - res = os.path.join(cache_root, 'youtube-dl') + res = os.path.join(cache_root, self._YTDL_DIR) return expand_path(res) def _get_cache_fn(self, section, key, dtype): @@ -50,13 +61,22 @@ def store(self, section, key, data, dtype='json'): except OSError as ose: if ose.errno != errno.EEXIST: raise - write_json_file(data, fn) + write_json_file({self._VERSION_KEY: __version__, 'data': data}, fn) except Exception: tb = traceback.format_exc() self._ydl.report_warning( 'Writing cache to %r failed: %s' % (fn, tb)) - def load(self, section, key, dtype='json', default=None): + def _validate(self, data, min_ver): + version = try_get(data, lambda x: x[self._VERSION_KEY]) + if not version: # Backward compatibility + data, version = {'data': data}, self._DEFAULT_VERSION + if not is_outdated_version(version, min_ver or '0', assume_new=False): + return data['data'] + self._ydl.to_screen( + 'Discarding old cache from version {version} (needs {min_ver})'.format(**locals())) + + def load(self, section, key, dtype='json', default=None, min_ver=None): assert dtype in ('json',) if not self.enabled: @@ -65,13 +85,13 @@ def load(self, section, key, dtype='json', default=None): cache_fn = self._get_cache_fn(section, key, dtype) try: try: - with io.open(cache_fn, 'r', encoding='utf-8') as cachef: - return json.load(cachef) + with open(cache_fn, 'r', encoding='utf-8') as cachef: + return self._validate(json.load(cachef), min_ver) except ValueError: try: file_size = os.path.getsize(cache_fn) except (OSError, IOError) as oe: - file_size = str(oe) + file_size = error_to_compat_str(oe) self._ydl.report_warning( 'Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) except IOError: diff --git a/script.module.youtube.dl/lib/youtube_dl/casefold.py b/script.module.youtube.dl/lib/youtube_dl/casefold.py new file 
mode 100644
index 000000000..ad9c66f8e
--- /dev/null
+++ b/script.module.youtube.dl/lib/youtube_dl/casefold.py
@@ -0,0 +1,1667 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .compat import (
+    compat_str,
+    compat_chr,
+)
+
+# Below is included the text of icu/CaseFolding.txt retrieved from
+# https://github.com/unicode-org/icu/blob/main/icu4c/source/data/unidata/CaseFolding.txt
+# In case newly foldable Unicode characters are defined, paste the new version
+# of the text inside the ''' marks.
+# The text is expected to have only blank lines and lines with 1st character #,
+# all ignored, and fold definitions like this:
+# `from_hex_code; space_separated_to_hex_code_list; comment`

+_map_str = '''
+# CaseFolding-15.0.0.txt
+# Date: 2022-02-02, 23:35:35 GMT
+# © 2022 Unicode®, Inc.
+# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
+# For terms of use, see https://www.unicode.org/terms_of_use.html
+#
+# Unicode Character Database
+# For documentation, see https://www.unicode.org/reports/tr44/
+#
+# Case Folding Properties
+#
+# This file is a supplement to the UnicodeData file.
+# It provides a case folding mapping generated from the Unicode Character Database.
+# If all characters are mapped according to the full mapping below, then
+# case differences (according to UnicodeData.txt and SpecialCasing.txt)
+# are eliminated.
+#
+# The data supports both implementations that require simple case foldings
+# (where string lengths don't change), and implementations that allow full case folding
+# (where string lengths may grow). Note that where they can be supported, the
+# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
+#
+# All code points not listed in this file map to themselves.
+#
+# NOTE: case folding does not preserve normalization formats!
+#
+# For information on case folding, including how to have case folding
+# preserve normalization formats, see Section 3.13 Default Case Algorithms in
+# The Unicode Standard.
+#
+# ================================================================================
+# Format
+# ================================================================================
+# The entries in this file are in the following machine-readable format:
+#
+# <code>; <status>; <mapping>; # <name>
+#
+# The status field is:
+# C: common case folding, common mappings shared by both simple and full mappings.
+# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
+# S: simple case folding, mappings to single characters where different from F.
+# T: special case for uppercase I and dotted uppercase I
+#    - For non-Turkic languages, this mapping is normally not used.
+#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
+#      Note that the Turkic mappings do not maintain canonical equivalence without additional processing.
+#      See the discussions of case mapping in the Unicode Standard for more information.
+#
+# Usage:
+#  A. To do a simple case folding, use the mappings with status C + S.
+#  B. To do a full case folding, use the mappings with status C + F.
+#
+# The mappings with status T can be used or omitted depending on the desired case-folding
+# behavior. (The default option is to exclude them.)
+# +# ================================================================= + +# Property: Case_Folding + +# All code points not explicitly listed for Case_Folding +# have the value C for the status field, and the code point itself for the mapping field. + +# ================================================================= +0041; C; 0061; # LATIN CAPITAL LETTER A +0042; C; 0062; # LATIN CAPITAL LETTER B +0043; C; 0063; # LATIN CAPITAL LETTER C +0044; C; 0064; # LATIN CAPITAL LETTER D +0045; C; 0065; # LATIN CAPITAL LETTER E +0046; C; 0066; # LATIN CAPITAL LETTER F +0047; C; 0067; # LATIN CAPITAL LETTER G +0048; C; 0068; # LATIN CAPITAL LETTER H +0049; C; 0069; # LATIN CAPITAL LETTER I +0049; T; 0131; # LATIN CAPITAL LETTER I +004A; C; 006A; # LATIN CAPITAL LETTER J +004B; C; 006B; # LATIN CAPITAL LETTER K +004C; C; 006C; # LATIN CAPITAL LETTER L +004D; C; 006D; # LATIN CAPITAL LETTER M +004E; C; 006E; # LATIN CAPITAL LETTER N +004F; C; 006F; # LATIN CAPITAL LETTER O +0050; C; 0070; # LATIN CAPITAL LETTER P +0051; C; 0071; # LATIN CAPITAL LETTER Q +0052; C; 0072; # LATIN CAPITAL LETTER R +0053; C; 0073; # LATIN CAPITAL LETTER S +0054; C; 0074; # LATIN CAPITAL LETTER T +0055; C; 0075; # LATIN CAPITAL LETTER U +0056; C; 0076; # LATIN CAPITAL LETTER V +0057; C; 0077; # LATIN CAPITAL LETTER W +0058; C; 0078; # LATIN CAPITAL LETTER X +0059; C; 0079; # LATIN CAPITAL LETTER Y +005A; C; 007A; # LATIN CAPITAL LETTER Z +00B5; C; 03BC; # MICRO SIGN +00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE +00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE +00C2; C; 00E2; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX +00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE +00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS +00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE +00C6; C; 00E6; # LATIN CAPITAL LETTER AE +00C7; C; 00E7; # LATIN CAPITAL LETTER C WITH CEDILLA +00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE +00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE +00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX +00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS +00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE +00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE +00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX +00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS +00D0; C; 00F0; # LATIN CAPITAL LETTER ETH +00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE +00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE +00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE +00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX +00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE +00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS +00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE +00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE +00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE +00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX +00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS +00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE +00DE; C; 00FE; # LATIN CAPITAL LETTER THORN +00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S +0100; C; 0101; # LATIN CAPITAL LETTER A WITH MACRON +0102; C; 0103; # LATIN CAPITAL LETTER A WITH BREVE +0104; C; 0105; # LATIN CAPITAL LETTER A WITH OGONEK +0106; C; 0107; # LATIN CAPITAL LETTER C WITH ACUTE +0108; C; 0109; # LATIN CAPITAL LETTER C WITH CIRCUMFLEX +010A; C; 010B; # LATIN CAPITAL LETTER C WITH DOT ABOVE +010C; C; 010D; # LATIN CAPITAL LETTER C WITH CARON +010E; C; 010F; # LATIN CAPITAL LETTER D WITH CARON +0110; C; 0111; # LATIN CAPITAL 
LETTER D WITH STROKE +0112; C; 0113; # LATIN CAPITAL LETTER E WITH MACRON +0114; C; 0115; # LATIN CAPITAL LETTER E WITH BREVE +0116; C; 0117; # LATIN CAPITAL LETTER E WITH DOT ABOVE +0118; C; 0119; # LATIN CAPITAL LETTER E WITH OGONEK +011A; C; 011B; # LATIN CAPITAL LETTER E WITH CARON +011C; C; 011D; # LATIN CAPITAL LETTER G WITH CIRCUMFLEX +011E; C; 011F; # LATIN CAPITAL LETTER G WITH BREVE +0120; C; 0121; # LATIN CAPITAL LETTER G WITH DOT ABOVE +0122; C; 0123; # LATIN CAPITAL LETTER G WITH CEDILLA +0124; C; 0125; # LATIN CAPITAL LETTER H WITH CIRCUMFLEX +0126; C; 0127; # LATIN CAPITAL LETTER H WITH STROKE +0128; C; 0129; # LATIN CAPITAL LETTER I WITH TILDE +012A; C; 012B; # LATIN CAPITAL LETTER I WITH MACRON +012C; C; 012D; # LATIN CAPITAL LETTER I WITH BREVE +012E; C; 012F; # LATIN CAPITAL LETTER I WITH OGONEK +0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE +0132; C; 0133; # LATIN CAPITAL LIGATURE IJ +0134; C; 0135; # LATIN CAPITAL LETTER J WITH CIRCUMFLEX +0136; C; 0137; # LATIN CAPITAL LETTER K WITH CEDILLA +0139; C; 013A; # LATIN CAPITAL LETTER L WITH ACUTE +013B; C; 013C; # LATIN CAPITAL LETTER L WITH CEDILLA +013D; C; 013E; # LATIN CAPITAL LETTER L WITH CARON +013F; C; 0140; # LATIN CAPITAL LETTER L WITH MIDDLE DOT +0141; C; 0142; # LATIN CAPITAL LETTER L WITH STROKE +0143; C; 0144; # LATIN CAPITAL LETTER N WITH ACUTE +0145; C; 0146; # LATIN CAPITAL LETTER N WITH CEDILLA +0147; C; 0148; # LATIN CAPITAL LETTER N WITH CARON +0149; F; 02BC 006E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE +014A; C; 014B; # LATIN CAPITAL LETTER ENG +014C; C; 014D; # LATIN CAPITAL LETTER O WITH MACRON +014E; C; 014F; # LATIN CAPITAL LETTER O WITH BREVE +0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE +0152; C; 0153; # LATIN CAPITAL LIGATURE OE +0154; C; 0155; # LATIN CAPITAL LETTER R WITH ACUTE +0156; C; 0157; # LATIN CAPITAL LETTER R WITH CEDILLA +0158; C; 0159; # LATIN CAPITAL LETTER R WITH CARON +015A; C; 015B; # LATIN CAPITAL LETTER S WITH ACUTE +015C; C; 015D; # LATIN CAPITAL LETTER S WITH CIRCUMFLEX +015E; C; 015F; # LATIN CAPITAL LETTER S WITH CEDILLA +0160; C; 0161; # LATIN CAPITAL LETTER S WITH CARON +0162; C; 0163; # LATIN CAPITAL LETTER T WITH CEDILLA +0164; C; 0165; # LATIN CAPITAL LETTER T WITH CARON +0166; C; 0167; # LATIN CAPITAL LETTER T WITH STROKE +0168; C; 0169; # LATIN CAPITAL LETTER U WITH TILDE +016A; C; 016B; # LATIN CAPITAL LETTER U WITH MACRON +016C; C; 016D; # LATIN CAPITAL LETTER U WITH BREVE +016E; C; 016F; # LATIN CAPITAL LETTER U WITH RING ABOVE +0170; C; 0171; # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE +0172; C; 0173; # LATIN CAPITAL LETTER U WITH OGONEK +0174; C; 0175; # LATIN CAPITAL LETTER W WITH CIRCUMFLEX +0176; C; 0177; # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX +0178; C; 00FF; # LATIN CAPITAL LETTER Y WITH DIAERESIS +0179; C; 017A; # LATIN CAPITAL LETTER Z WITH ACUTE +017B; C; 017C; # LATIN CAPITAL LETTER Z WITH DOT ABOVE +017D; C; 017E; # LATIN CAPITAL LETTER Z WITH CARON +017F; C; 0073; # LATIN SMALL LETTER LONG S +0181; C; 0253; # LATIN CAPITAL LETTER B WITH HOOK +0182; C; 0183; # LATIN CAPITAL LETTER B WITH TOPBAR +0184; C; 0185; # LATIN CAPITAL LETTER TONE SIX +0186; C; 0254; # LATIN CAPITAL LETTER OPEN O +0187; C; 0188; # LATIN CAPITAL LETTER C WITH HOOK +0189; C; 0256; # LATIN CAPITAL LETTER AFRICAN D +018A; C; 0257; # LATIN CAPITAL LETTER D WITH HOOK +018B; C; 018C; # LATIN CAPITAL LETTER D WITH TOPBAR +018E; C; 01DD; # LATIN CAPITAL LETTER REVERSED E +018F; C; 0259; # LATIN 
CAPITAL LETTER SCHWA
+0190; C; 025B; # LATIN CAPITAL LETTER OPEN E
+0191; C; 0192; # LATIN CAPITAL LETTER F WITH HOOK
+0193; C; 0260; # LATIN CAPITAL LETTER G WITH HOOK
+0194; C; 0263; # LATIN CAPITAL LETTER GAMMA
+0196; C; 0269; # LATIN CAPITAL LETTER IOTA
+0197; C; 0268; # LATIN CAPITAL LETTER I WITH STROKE
+0198; C; 0199; # LATIN CAPITAL LETTER K WITH HOOK
+019C; C; 026F; # LATIN CAPITAL LETTER TURNED M
+019D; C; 0272; # LATIN CAPITAL LETTER N WITH LEFT HOOK
+019F; C; 0275; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
+01A0; C; 01A1; # LATIN CAPITAL LETTER O WITH HORN
+01A2; C; 01A3; # LATIN CAPITAL LETTER OI
+01A4; C; 01A5; # LATIN CAPITAL LETTER P WITH HOOK
+01A6; C; 0280; # LATIN LETTER YR
+01A7; C; 01A8; # LATIN CAPITAL LETTER TONE TWO
+01A9; C; 0283; # LATIN CAPITAL LETTER ESH
+01AC; C; 01AD; # LATIN CAPITAL LETTER T WITH HOOK
+01AE; C; 0288; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
+01AF; C; 01B0; # LATIN CAPITAL LETTER U WITH HORN
+01B1; C; 028A; # LATIN CAPITAL LETTER UPSILON
+01B2; C; 028B; # LATIN CAPITAL LETTER V WITH HOOK
+01B3; C; 01B4; # LATIN CAPITAL LETTER Y WITH HOOK
+01B5; C; 01B6; # LATIN CAPITAL LETTER Z WITH STROKE
+01B7; C; 0292; # LATIN CAPITAL LETTER EZH
+01B8; C; 01B9; # LATIN CAPITAL LETTER EZH REVERSED
+01BC; C; 01BD; # LATIN CAPITAL LETTER TONE FIVE
+01C4; C; 01C6; # LATIN CAPITAL LETTER DZ WITH CARON
+01C5; C; 01C6; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
+01C7; C; 01C9; # LATIN CAPITAL LETTER LJ
+01C8; C; 01C9; # LATIN CAPITAL LETTER L WITH SMALL LETTER J
+01CA; C; 01CC; # LATIN CAPITAL LETTER NJ
+01CB; C; 01CC; # LATIN CAPITAL LETTER N WITH SMALL LETTER J
+01CD; C; 01CE; # LATIN CAPITAL LETTER A WITH CARON
+01CF; C; 01D0; # LATIN CAPITAL LETTER I WITH CARON
+01D1; C; 01D2; # LATIN CAPITAL LETTER O WITH CARON
+01D3; C; 01D4; # LATIN CAPITAL LETTER U WITH CARON
+01D5; C; 01D6; # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
+01D7; C; 01D8; # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE
+01D9; C; 01DA; # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON
+01DB; C; 01DC; # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE
+01DE; C; 01DF; # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON
+01E0; C; 01E1; # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON
+01E2; C; 01E3; # LATIN CAPITAL LETTER AE WITH MACRON
+01E4; C; 01E5; # LATIN CAPITAL LETTER G WITH STROKE
+01E6; C; 01E7; # LATIN CAPITAL LETTER G WITH CARON
+01E8; C; 01E9; # LATIN CAPITAL LETTER K WITH CARON
+01EA; C; 01EB; # LATIN CAPITAL LETTER O WITH OGONEK
+01EC; C; 01ED; # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON
+01EE; C; 01EF; # LATIN CAPITAL LETTER EZH WITH CARON
+01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
+01F1; C; 01F3; # LATIN CAPITAL LETTER DZ
+01F2; C; 01F3; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
+01F4; C; 01F5; # LATIN CAPITAL LETTER G WITH ACUTE
+01F6; C; 0195; # LATIN CAPITAL LETTER HWAIR
+01F7; C; 01BF; # LATIN CAPITAL LETTER WYNN
+01F8; C; 01F9; # LATIN CAPITAL LETTER N WITH GRAVE
+01FA; C; 01FB; # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE
+01FC; C; 01FD; # LATIN CAPITAL LETTER AE WITH ACUTE
+01FE; C; 01FF; # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
+0200; C; 0201; # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE
+0202; C; 0203; # LATIN CAPITAL LETTER A WITH INVERTED BREVE
+0204; C; 0205; # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE
+0206; C; 0207; # LATIN CAPITAL LETTER E WITH INVERTED BREVE
+0208; C; 0209; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE
+020A; C; 020B; # LATIN CAPITAL LETTER I WITH INVERTED BREVE
+020C; C; 020D; # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE
+020E; C; 020F; # LATIN CAPITAL LETTER O WITH INVERTED BREVE
+0210; C; 0211; # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE
+0212; C; 0213; # LATIN CAPITAL LETTER R WITH INVERTED BREVE
+0214; C; 0215; # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE
+0216; C; 0217; # LATIN CAPITAL LETTER U WITH INVERTED BREVE
+0218; C; 0219; # LATIN CAPITAL LETTER S WITH COMMA BELOW
+021A; C; 021B; # LATIN CAPITAL LETTER T WITH COMMA BELOW
+021C; C; 021D; # LATIN CAPITAL LETTER YOGH
+021E; C; 021F; # LATIN CAPITAL LETTER H WITH CARON
+0220; C; 019E; # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
+0222; C; 0223; # LATIN CAPITAL LETTER OU
+0224; C; 0225; # LATIN CAPITAL LETTER Z WITH HOOK
+0226; C; 0227; # LATIN CAPITAL LETTER A WITH DOT ABOVE
+0228; C; 0229; # LATIN CAPITAL LETTER E WITH CEDILLA
+022A; C; 022B; # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON
+022C; C; 022D; # LATIN CAPITAL LETTER O WITH TILDE AND MACRON
+022E; C; 022F; # LATIN CAPITAL LETTER O WITH DOT ABOVE
+0230; C; 0231; # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON
+0232; C; 0233; # LATIN CAPITAL LETTER Y WITH MACRON
+023A; C; 2C65; # LATIN CAPITAL LETTER A WITH STROKE
+023B; C; 023C; # LATIN CAPITAL LETTER C WITH STROKE
+023D; C; 019A; # LATIN CAPITAL LETTER L WITH BAR
+023E; C; 2C66; # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
+0241; C; 0242; # LATIN CAPITAL LETTER GLOTTAL STOP
+0243; C; 0180; # LATIN CAPITAL LETTER B WITH STROKE
+0244; C; 0289; # LATIN CAPITAL LETTER U BAR
+0245; C; 028C; # LATIN CAPITAL LETTER TURNED V
+0246; C; 0247; # LATIN CAPITAL LETTER E WITH STROKE
+0248; C; 0249; # LATIN CAPITAL LETTER J WITH STROKE
+024A; C; 024B; # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL
+024C; C; 024D; # LATIN CAPITAL LETTER R WITH STROKE
+024E; C; 024F; # LATIN CAPITAL LETTER Y WITH STROKE
+0345; C; 03B9; # COMBINING GREEK YPOGEGRAMMENI
+0370; C; 0371; # GREEK CAPITAL LETTER HETA
+0372; C; 0373; # GREEK CAPITAL LETTER ARCHAIC SAMPI
+0376; C; 0377; # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
+037F; C; 03F3; # GREEK CAPITAL LETTER YOT
+0386; C; 03AC; # GREEK CAPITAL LETTER ALPHA WITH TONOS
+0388; C; 03AD; # GREEK CAPITAL LETTER EPSILON WITH TONOS
+0389; C; 03AE; # GREEK CAPITAL LETTER ETA WITH TONOS
+038A; C; 03AF; # GREEK CAPITAL LETTER IOTA WITH TONOS
+038C; C; 03CC; # GREEK CAPITAL LETTER OMICRON WITH TONOS
+038E; C; 03CD; # GREEK CAPITAL LETTER UPSILON WITH TONOS
+038F; C; 03CE; # GREEK CAPITAL LETTER OMEGA WITH TONOS
+0390; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+0391; C; 03B1; # GREEK CAPITAL LETTER ALPHA
+0392; C; 03B2; # GREEK CAPITAL LETTER BETA
+0393; C; 03B3; # GREEK CAPITAL LETTER GAMMA
+0394; C; 03B4; # GREEK CAPITAL LETTER DELTA
+0395; C; 03B5; # GREEK CAPITAL LETTER EPSILON
+0396; C; 03B6; # GREEK CAPITAL LETTER ZETA
+0397; C; 03B7; # GREEK CAPITAL LETTER ETA
+0398; C; 03B8; # GREEK CAPITAL LETTER THETA
+0399; C; 03B9; # GREEK CAPITAL LETTER IOTA
+039A; C; 03BA; # GREEK CAPITAL LETTER KAPPA
+039B; C; 03BB; # GREEK CAPITAL LETTER LAMDA
+039C; C; 03BC; # GREEK CAPITAL LETTER MU
+039D; C; 03BD; # GREEK CAPITAL LETTER NU
+039E; C; 03BE; # GREEK CAPITAL LETTER XI
+039F; C; 03BF; # GREEK CAPITAL LETTER OMICRON
+03A0; C; 03C0; # GREEK CAPITAL LETTER PI
+03A1; C; 03C1; # GREEK CAPITAL LETTER RHO
+03A3; C; 03C3; # GREEK CAPITAL LETTER SIGMA
+03A4; C; 03C4; # GREEK CAPITAL LETTER TAU
+03A5; C; 03C5; # GREEK CAPITAL LETTER UPSILON
+03A6; C; 03C6; # GREEK CAPITAL LETTER PHI
+03A7; C; 03C7; # GREEK CAPITAL LETTER CHI
+03A8; C; 03C8; # GREEK CAPITAL LETTER PSI
+03A9; C; 03C9; # GREEK CAPITAL LETTER OMEGA
+03AA; C; 03CA; # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA
+03AB; C; 03CB; # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA
+03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+03C2; C; 03C3; # GREEK SMALL LETTER FINAL SIGMA
+03CF; C; 03D7; # GREEK CAPITAL KAI SYMBOL
+03D0; C; 03B2; # GREEK BETA SYMBOL
+03D1; C; 03B8; # GREEK THETA SYMBOL
+03D5; C; 03C6; # GREEK PHI SYMBOL
+03D6; C; 03C0; # GREEK PI SYMBOL
+03D8; C; 03D9; # GREEK LETTER ARCHAIC KOPPA
+03DA; C; 03DB; # GREEK LETTER STIGMA
+03DC; C; 03DD; # GREEK LETTER DIGAMMA
+03DE; C; 03DF; # GREEK LETTER KOPPA
+03E0; C; 03E1; # GREEK LETTER SAMPI
+03E2; C; 03E3; # COPTIC CAPITAL LETTER SHEI
+03E4; C; 03E5; # COPTIC CAPITAL LETTER FEI
+03E6; C; 03E7; # COPTIC CAPITAL LETTER KHEI
+03E8; C; 03E9; # COPTIC CAPITAL LETTER HORI
+03EA; C; 03EB; # COPTIC CAPITAL LETTER GANGIA
+03EC; C; 03ED; # COPTIC CAPITAL LETTER SHIMA
+03EE; C; 03EF; # COPTIC CAPITAL LETTER DEI
+03F0; C; 03BA; # GREEK KAPPA SYMBOL
+03F1; C; 03C1; # GREEK RHO SYMBOL
+03F4; C; 03B8; # GREEK CAPITAL THETA SYMBOL
+03F5; C; 03B5; # GREEK LUNATE EPSILON SYMBOL
+03F7; C; 03F8; # GREEK CAPITAL LETTER SHO
+03F9; C; 03F2; # GREEK CAPITAL LUNATE SIGMA SYMBOL
+03FA; C; 03FB; # GREEK CAPITAL LETTER SAN
+03FD; C; 037B; # GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL
+03FE; C; 037C; # GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL
+03FF; C; 037D; # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL
+0400; C; 0450; # CYRILLIC CAPITAL LETTER IE WITH GRAVE
+0401; C; 0451; # CYRILLIC CAPITAL LETTER IO
+0402; C; 0452; # CYRILLIC CAPITAL LETTER DJE
+0403; C; 0453; # CYRILLIC CAPITAL LETTER GJE
+0404; C; 0454; # CYRILLIC CAPITAL LETTER UKRAINIAN IE
+0405; C; 0455; # CYRILLIC CAPITAL LETTER DZE
+0406; C; 0456; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
+0407; C; 0457; # CYRILLIC CAPITAL LETTER YI
+0408; C; 0458; # CYRILLIC CAPITAL LETTER JE
+0409; C; 0459; # CYRILLIC CAPITAL LETTER LJE
+040A; C; 045A; # CYRILLIC CAPITAL LETTER NJE
+040B; C; 045B; # CYRILLIC CAPITAL LETTER TSHE
+040C; C; 045C; # CYRILLIC CAPITAL LETTER KJE
+040D; C; 045D; # CYRILLIC CAPITAL LETTER I WITH GRAVE
+040E; C; 045E; # CYRILLIC CAPITAL LETTER SHORT U
+040F; C; 045F; # CYRILLIC CAPITAL LETTER DZHE
+0410; C; 0430; # CYRILLIC CAPITAL LETTER A
+0411; C; 0431; # CYRILLIC CAPITAL LETTER BE
+0412; C; 0432; # CYRILLIC CAPITAL LETTER VE
+0413; C; 0433; # CYRILLIC CAPITAL LETTER GHE
+0414; C; 0434; # CYRILLIC CAPITAL LETTER DE
+0415; C; 0435; # CYRILLIC CAPITAL LETTER IE
+0416; C; 0436; # CYRILLIC CAPITAL LETTER ZHE
+0417; C; 0437; # CYRILLIC CAPITAL LETTER ZE
+0418; C; 0438; # CYRILLIC CAPITAL LETTER I
+0419; C; 0439; # CYRILLIC CAPITAL LETTER SHORT I
+041A; C; 043A; # CYRILLIC CAPITAL LETTER KA
+041B; C; 043B; # CYRILLIC CAPITAL LETTER EL
+041C; C; 043C; # CYRILLIC CAPITAL LETTER EM
+041D; C; 043D; # CYRILLIC CAPITAL LETTER EN
+041E; C; 043E; # CYRILLIC CAPITAL LETTER O
+041F; C; 043F; # CYRILLIC CAPITAL LETTER PE
+0420; C; 0440; # CYRILLIC CAPITAL LETTER ER
+0421; C; 0441; # CYRILLIC CAPITAL LETTER ES
+0422; C; 0442; # CYRILLIC CAPITAL LETTER TE
+0423; C; 0443; # CYRILLIC CAPITAL LETTER U
+0424; C; 0444; # CYRILLIC CAPITAL LETTER EF
+0425; C; 0445; # CYRILLIC CAPITAL LETTER HA
+0426; C; 0446; # CYRILLIC CAPITAL LETTER TSE
+0427; C; 0447; # CYRILLIC CAPITAL LETTER CHE
+0428; C; 0448; # CYRILLIC CAPITAL LETTER SHA
+0429; C; 0449; # CYRILLIC CAPITAL LETTER SHCHA
+042A; C; 044A; # CYRILLIC CAPITAL LETTER HARD SIGN
+042B; C; 044B; # CYRILLIC CAPITAL LETTER YERU
+042C; C; 044C; # CYRILLIC CAPITAL LETTER SOFT SIGN
+042D; C; 044D; # CYRILLIC CAPITAL LETTER E
+042E; C; 044E; # CYRILLIC CAPITAL LETTER YU
+042F; C; 044F; # CYRILLIC CAPITAL LETTER YA
+0460; C; 0461; # CYRILLIC CAPITAL LETTER OMEGA
+0462; C; 0463; # CYRILLIC CAPITAL LETTER YAT
+0464; C; 0465; # CYRILLIC CAPITAL LETTER IOTIFIED E
+0466; C; 0467; # CYRILLIC CAPITAL LETTER LITTLE YUS
+0468; C; 0469; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS
+046A; C; 046B; # CYRILLIC CAPITAL LETTER BIG YUS
+046C; C; 046D; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS
+046E; C; 046F; # CYRILLIC CAPITAL LETTER KSI
+0470; C; 0471; # CYRILLIC CAPITAL LETTER PSI
+0472; C; 0473; # CYRILLIC CAPITAL LETTER FITA
+0474; C; 0475; # CYRILLIC CAPITAL LETTER IZHITSA
+0476; C; 0477; # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT
+0478; C; 0479; # CYRILLIC CAPITAL LETTER UK
+047A; C; 047B; # CYRILLIC CAPITAL LETTER ROUND OMEGA
+047C; C; 047D; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO
+047E; C; 047F; # CYRILLIC CAPITAL LETTER OT
+0480; C; 0481; # CYRILLIC CAPITAL LETTER KOPPA
+048A; C; 048B; # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL
+048C; C; 048D; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN
+048E; C; 048F; # CYRILLIC CAPITAL LETTER ER WITH TICK
+0490; C; 0491; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN
+0492; C; 0493; # CYRILLIC CAPITAL LETTER GHE WITH STROKE
+0494; C; 0495; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK
+0496; C; 0497; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER
+0498; C; 0499; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER
+049A; C; 049B; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER
+049C; C; 049D; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE
+049E; C; 049F; # CYRILLIC CAPITAL LETTER KA WITH STROKE
+04A0; C; 04A1; # CYRILLIC CAPITAL LETTER BASHKIR KA
+04A2; C; 04A3; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER
+04A4; C; 04A5; # CYRILLIC CAPITAL LIGATURE EN GHE
+04A6; C; 04A7; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK
+04A8; C; 04A9; # CYRILLIC CAPITAL LETTER ABKHASIAN HA
+04AA; C; 04AB; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER
+04AC; C; 04AD; # CYRILLIC CAPITAL LETTER TE WITH DESCENDER
+04AE; C; 04AF; # CYRILLIC CAPITAL LETTER STRAIGHT U
+04B0; C; 04B1; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE
+04B2; C; 04B3; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER
+04B4; C; 04B5; # CYRILLIC CAPITAL LIGATURE TE TSE
+04B6; C; 04B7; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER
+04B8; C; 04B9; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE
+04BA; C; 04BB; # CYRILLIC CAPITAL LETTER SHHA
+04BC; C; 04BD; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE
+04BE; C; 04BF; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER
+04C0; C; 04CF; # CYRILLIC LETTER PALOCHKA
+04C1; C; 04C2; # CYRILLIC CAPITAL LETTER ZHE WITH BREVE
+04C3; C; 04C4; # CYRILLIC CAPITAL LETTER KA WITH HOOK
+04C5; C; 04C6; # CYRILLIC CAPITAL LETTER EL WITH TAIL
+04C7; C; 04C8; # CYRILLIC CAPITAL LETTER EN WITH HOOK
+04C9; C; 04CA; # CYRILLIC CAPITAL LETTER EN WITH TAIL
+04CB; C; 04CC; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE
+04CD; C; 04CE; # CYRILLIC CAPITAL LETTER EM WITH TAIL
+04D0; C; 04D1; # CYRILLIC CAPITAL LETTER A WITH BREVE
+04D2; C; 04D3; # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
+04D4; C; 04D5; # CYRILLIC CAPITAL LIGATURE A IE
+04D6; C; 04D7; # CYRILLIC CAPITAL LETTER IE WITH BREVE
+04D8; C; 04D9; # CYRILLIC CAPITAL LETTER SCHWA
+04DA; C; 04DB; # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS
+04DC; C; 04DD; # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS
+04DE; C; 04DF; # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS
+04E0; C; 04E1; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE
+04E2; C; 04E3; # CYRILLIC CAPITAL LETTER I WITH MACRON
+04E4; C; 04E5; # CYRILLIC CAPITAL LETTER I WITH DIAERESIS
+04E6; C; 04E7; # CYRILLIC CAPITAL LETTER O WITH DIAERESIS
+04E8; C; 04E9; # CYRILLIC CAPITAL LETTER BARRED O
+04EA; C; 04EB; # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS
+04EC; C; 04ED; # CYRILLIC CAPITAL LETTER E WITH DIAERESIS
+04EE; C; 04EF; # CYRILLIC CAPITAL LETTER U WITH MACRON
+04F0; C; 04F1; # CYRILLIC CAPITAL LETTER U WITH DIAERESIS
+04F2; C; 04F3; # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE
+04F4; C; 04F5; # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS
+04F6; C; 04F7; # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER
+04F8; C; 04F9; # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS
+04FA; C; 04FB; # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK
+04FC; C; 04FD; # CYRILLIC CAPITAL LETTER HA WITH HOOK
+04FE; C; 04FF; # CYRILLIC CAPITAL LETTER HA WITH STROKE
+0500; C; 0501; # CYRILLIC CAPITAL LETTER KOMI DE
+0502; C; 0503; # CYRILLIC CAPITAL LETTER KOMI DJE
+0504; C; 0505; # CYRILLIC CAPITAL LETTER KOMI ZJE
+0506; C; 0507; # CYRILLIC CAPITAL LETTER KOMI DZJE
+0508; C; 0509; # CYRILLIC CAPITAL LETTER KOMI LJE
+050A; C; 050B; # CYRILLIC CAPITAL LETTER KOMI NJE
+050C; C; 050D; # CYRILLIC CAPITAL LETTER KOMI SJE
+050E; C; 050F; # CYRILLIC CAPITAL LETTER KOMI TJE
+0510; C; 0511; # CYRILLIC CAPITAL LETTER REVERSED ZE
+0512; C; 0513; # CYRILLIC CAPITAL LETTER EL WITH HOOK
+0514; C; 0515; # CYRILLIC CAPITAL LETTER LHA
+0516; C; 0517; # CYRILLIC CAPITAL LETTER RHA
+0518; C; 0519; # CYRILLIC CAPITAL LETTER YAE
+051A; C; 051B; # CYRILLIC CAPITAL LETTER QA
+051C; C; 051D; # CYRILLIC CAPITAL LETTER WE
+051E; C; 051F; # CYRILLIC CAPITAL LETTER ALEUT KA
+0520; C; 0521; # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK
+0522; C; 0523; # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK
+0524; C; 0525; # CYRILLIC CAPITAL LETTER PE WITH DESCENDER
+0526; C; 0527; # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER
+0528; C; 0529; # CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK
+052A; C; 052B; # CYRILLIC CAPITAL LETTER DZZHE
+052C; C; 052D; # CYRILLIC CAPITAL LETTER DCHE
+052E; C; 052F; # CYRILLIC CAPITAL LETTER EL WITH DESCENDER
+0531; C; 0561; # ARMENIAN CAPITAL LETTER AYB
+0532; C; 0562; # ARMENIAN CAPITAL LETTER BEN
+0533; C; 0563; # ARMENIAN CAPITAL LETTER GIM
+0534; C; 0564; # ARMENIAN CAPITAL LETTER DA
+0535; C; 0565; # ARMENIAN CAPITAL LETTER ECH
+0536; C; 0566; # ARMENIAN CAPITAL LETTER ZA
+0537; C; 0567; # ARMENIAN CAPITAL LETTER EH
+0538; C; 0568; # ARMENIAN CAPITAL LETTER ET
+0539; C; 0569; # ARMENIAN CAPITAL LETTER TO
+053A; C; 056A; # ARMENIAN CAPITAL LETTER ZHE
+053B; C; 056B; # ARMENIAN CAPITAL LETTER INI
+053C; C; 056C; # ARMENIAN CAPITAL LETTER LIWN
+053D; C; 056D; # ARMENIAN CAPITAL LETTER XEH
+053E; C; 056E; # ARMENIAN CAPITAL LETTER CA
+053F; C; 056F; # ARMENIAN CAPITAL LETTER KEN
+0540; C; 0570; # ARMENIAN CAPITAL LETTER HO
+0541; C; 0571; # ARMENIAN CAPITAL LETTER JA
+0542; C; 0572; # ARMENIAN CAPITAL LETTER GHAD
+0543; C; 0573; # ARMENIAN CAPITAL LETTER CHEH
+0544; C; 0574; # ARMENIAN CAPITAL LETTER MEN
+0545; C; 0575; # ARMENIAN CAPITAL LETTER YI
+0546; C; 0576; # ARMENIAN CAPITAL LETTER NOW
+0547; C; 0577; # ARMENIAN CAPITAL LETTER SHA
+0548; C; 0578; # ARMENIAN CAPITAL LETTER VO
+0549; C; 0579; # ARMENIAN CAPITAL LETTER CHA
+054A; C; 057A; # ARMENIAN CAPITAL LETTER PEH
+054B; C; 057B; # ARMENIAN CAPITAL LETTER JHEH
+054C; C; 057C; # ARMENIAN CAPITAL LETTER RA
+054D; C; 057D; # ARMENIAN CAPITAL LETTER SEH
+054E; C; 057E; # ARMENIAN CAPITAL LETTER VEW
+054F; C; 057F; # ARMENIAN CAPITAL LETTER TIWN
+0550; C; 0580; # ARMENIAN CAPITAL LETTER REH
+0551; C; 0581; # ARMENIAN CAPITAL LETTER CO
+0552; C; 0582; # ARMENIAN CAPITAL LETTER YIWN
+0553; C; 0583; # ARMENIAN CAPITAL LETTER PIWR
+0554; C; 0584; # ARMENIAN CAPITAL LETTER KEH
+0555; C; 0585; # ARMENIAN CAPITAL LETTER OH
+0556; C; 0586; # ARMENIAN CAPITAL LETTER FEH
+0587; F; 0565 0582; # ARMENIAN SMALL LIGATURE ECH YIWN
+10A0; C; 2D00; # GEORGIAN CAPITAL LETTER AN
+10A1; C; 2D01; # GEORGIAN CAPITAL LETTER BAN
+10A2; C; 2D02; # GEORGIAN CAPITAL LETTER GAN
+10A3; C; 2D03; # GEORGIAN CAPITAL LETTER DON
+10A4; C; 2D04; # GEORGIAN CAPITAL LETTER EN
+10A5; C; 2D05; # GEORGIAN CAPITAL LETTER VIN
+10A6; C; 2D06; # GEORGIAN CAPITAL LETTER ZEN
+10A7; C; 2D07; # GEORGIAN CAPITAL LETTER TAN
+10A8; C; 2D08; # GEORGIAN CAPITAL LETTER IN
+10A9; C; 2D09; # GEORGIAN CAPITAL LETTER KAN
+10AA; C; 2D0A; # GEORGIAN CAPITAL LETTER LAS
+10AB; C; 2D0B; # GEORGIAN CAPITAL LETTER MAN
+10AC; C; 2D0C; # GEORGIAN CAPITAL LETTER NAR
+10AD; C; 2D0D; # GEORGIAN CAPITAL LETTER ON
+10AE; C; 2D0E; # GEORGIAN CAPITAL LETTER PAR
+10AF; C; 2D0F; # GEORGIAN CAPITAL LETTER ZHAR
+10B0; C; 2D10; # GEORGIAN CAPITAL LETTER RAE
+10B1; C; 2D11; # GEORGIAN CAPITAL LETTER SAN
+10B2; C; 2D12; # GEORGIAN CAPITAL LETTER TAR
+10B3; C; 2D13; # GEORGIAN CAPITAL LETTER UN
+10B4; C; 2D14; # GEORGIAN CAPITAL LETTER PHAR
+10B5; C; 2D15; # GEORGIAN CAPITAL LETTER KHAR
+10B6; C; 2D16; # GEORGIAN CAPITAL LETTER GHAN
+10B7; C; 2D17; # GEORGIAN CAPITAL LETTER QAR
+10B8; C; 2D18; # GEORGIAN CAPITAL LETTER SHIN
+10B9; C; 2D19; # GEORGIAN CAPITAL LETTER CHIN
+10BA; C; 2D1A; # GEORGIAN CAPITAL LETTER CAN
+10BB; C; 2D1B; # GEORGIAN CAPITAL LETTER JIL
+10BC; C; 2D1C; # GEORGIAN CAPITAL LETTER CIL
+10BD; C; 2D1D; # GEORGIAN CAPITAL LETTER CHAR
+10BE; C; 2D1E; # GEORGIAN CAPITAL LETTER XAN
+10BF; C; 2D1F; # GEORGIAN CAPITAL LETTER JHAN
+10C0; C; 2D20; # GEORGIAN CAPITAL LETTER HAE
+10C1; C; 2D21; # GEORGIAN CAPITAL LETTER HE
+10C2; C; 2D22; # GEORGIAN CAPITAL LETTER HIE
+10C3; C; 2D23; # GEORGIAN CAPITAL LETTER WE
+10C4; C; 2D24; # GEORGIAN CAPITAL LETTER HAR
+10C5; C; 2D25; # GEORGIAN CAPITAL LETTER HOE
+10C7; C; 2D27; # GEORGIAN CAPITAL LETTER YN
+10CD; C; 2D2D; # GEORGIAN CAPITAL LETTER AEN
+13F8; C; 13F0; # CHEROKEE SMALL LETTER YE
+13F9; C; 13F1; # CHEROKEE SMALL LETTER YI
+13FA; C; 13F2; # CHEROKEE SMALL LETTER YO
+13FB; C; 13F3; # CHEROKEE SMALL LETTER YU
+13FC; C; 13F4; # CHEROKEE SMALL LETTER YV
+13FD; C; 13F5; # CHEROKEE SMALL LETTER MV
+1C80; C; 0432; # CYRILLIC SMALL LETTER ROUNDED VE
+1C81; C; 0434; # CYRILLIC SMALL LETTER LONG-LEGGED DE
+1C82; C; 043E; # CYRILLIC SMALL LETTER NARROW O
+1C83; C; 0441; # CYRILLIC SMALL LETTER WIDE ES
+1C84; C; 0442; # CYRILLIC SMALL LETTER TALL TE
+1C85; C; 0442; # CYRILLIC SMALL LETTER THREE-LEGGED TE
+1C86; C; 044A; # CYRILLIC SMALL LETTER TALL HARD SIGN
+1C87; C; 0463; # CYRILLIC SMALL LETTER TALL YAT
+1C88; C; A64B; # CYRILLIC SMALL LETTER UNBLENDED UK
+1C90; C; 10D0; # GEORGIAN MTAVRULI CAPITAL LETTER AN
+1C91; C; 10D1; # GEORGIAN MTAVRULI CAPITAL LETTER BAN
+1C92; C; 10D2; # GEORGIAN MTAVRULI CAPITAL LETTER GAN
+1C93; C; 10D3; # GEORGIAN MTAVRULI CAPITAL LETTER DON
+1C94; C; 10D4; # GEORGIAN MTAVRULI CAPITAL LETTER EN
+1C95; C; 10D5; # GEORGIAN MTAVRULI CAPITAL LETTER VIN
+1C96; C; 10D6; # GEORGIAN MTAVRULI CAPITAL LETTER ZEN
+1C97; C; 10D7; # GEORGIAN MTAVRULI CAPITAL LETTER TAN
+1C98; C; 10D8; # GEORGIAN MTAVRULI CAPITAL LETTER IN
+1C99; C; 10D9; # GEORGIAN MTAVRULI CAPITAL LETTER KAN
+1C9A; C; 10DA; # GEORGIAN MTAVRULI CAPITAL LETTER LAS
+1C9B; C; 10DB; # GEORGIAN MTAVRULI CAPITAL LETTER MAN
+1C9C; C; 10DC; # GEORGIAN MTAVRULI CAPITAL LETTER NAR
+1C9D; C; 10DD; # GEORGIAN MTAVRULI CAPITAL LETTER ON
+1C9E; C; 10DE; # GEORGIAN MTAVRULI CAPITAL LETTER PAR
+1C9F; C; 10DF; # GEORGIAN MTAVRULI CAPITAL LETTER ZHAR
+1CA0; C; 10E0; # GEORGIAN MTAVRULI CAPITAL LETTER RAE
+1CA1; C; 10E1; # GEORGIAN MTAVRULI CAPITAL LETTER SAN
+1CA2; C; 10E2; # GEORGIAN MTAVRULI CAPITAL LETTER TAR
+1CA3; C; 10E3; # GEORGIAN MTAVRULI CAPITAL LETTER UN
+1CA4; C; 10E4; # GEORGIAN MTAVRULI CAPITAL LETTER PHAR
+1CA5; C; 10E5; # GEORGIAN MTAVRULI CAPITAL LETTER KHAR
+1CA6; C; 10E6; # GEORGIAN MTAVRULI CAPITAL LETTER GHAN
+1CA7; C; 10E7; # GEORGIAN MTAVRULI CAPITAL LETTER QAR
+1CA8; C; 10E8; # GEORGIAN MTAVRULI CAPITAL LETTER SHIN
+1CA9; C; 10E9; # GEORGIAN MTAVRULI CAPITAL LETTER CHIN
+1CAA; C; 10EA; # GEORGIAN MTAVRULI CAPITAL LETTER CAN
+1CAB; C; 10EB; # GEORGIAN MTAVRULI CAPITAL LETTER JIL
+1CAC; C; 10EC; # GEORGIAN MTAVRULI CAPITAL LETTER CIL
+1CAD; C; 10ED; # GEORGIAN MTAVRULI CAPITAL LETTER CHAR
+1CAE; C; 10EE; # GEORGIAN MTAVRULI CAPITAL LETTER XAN
+1CAF; C; 10EF; # GEORGIAN MTAVRULI CAPITAL LETTER JHAN
+1CB0; C; 10F0; # GEORGIAN MTAVRULI CAPITAL LETTER HAE
+1CB1; C; 10F1; # GEORGIAN MTAVRULI CAPITAL LETTER HE
+1CB2; C; 10F2; # GEORGIAN MTAVRULI CAPITAL LETTER HIE
+1CB3; C; 10F3; # GEORGIAN MTAVRULI CAPITAL LETTER WE
+1CB4; C; 10F4; # GEORGIAN MTAVRULI CAPITAL LETTER HAR
+1CB5; C; 10F5; # GEORGIAN MTAVRULI CAPITAL LETTER HOE
+1CB6; C; 10F6; # GEORGIAN MTAVRULI CAPITAL LETTER FI
+1CB7; C; 10F7; # GEORGIAN MTAVRULI CAPITAL LETTER YN
+1CB8; C; 10F8; # GEORGIAN MTAVRULI CAPITAL LETTER ELIFI
+1CB9; C; 10F9; # GEORGIAN MTAVRULI CAPITAL LETTER TURNED GAN
+1CBA; C; 10FA; # GEORGIAN MTAVRULI CAPITAL LETTER AIN
+1CBD; C; 10FD; # GEORGIAN MTAVRULI CAPITAL LETTER AEN
+1CBE; C; 10FE; # GEORGIAN MTAVRULI CAPITAL LETTER HARD SIGN
+1CBF; C; 10FF; # GEORGIAN MTAVRULI CAPITAL LETTER LABIAL SIGN
+1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW
+1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE
+1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW
+1E06; C; 1E07; # LATIN CAPITAL LETTER B WITH LINE BELOW
+1E08; C; 1E09; # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE
+1E0A; C; 1E0B; # LATIN CAPITAL LETTER D WITH DOT ABOVE
+1E0C; C; 1E0D; # LATIN CAPITAL LETTER D WITH DOT BELOW
+1E0E; C; 1E0F; # LATIN CAPITAL LETTER D WITH LINE BELOW
+1E10; C; 1E11; # LATIN CAPITAL LETTER D WITH CEDILLA
+1E12; C; 1E13; # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW
+1E14; C; 1E15; # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE
+1E16; C; 1E17; # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE
+1E18; C; 1E19; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW
+1E1A; C; 1E1B; # LATIN CAPITAL LETTER E WITH TILDE BELOW
+1E1C; C; 1E1D; # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE
+1E1E; C; 1E1F; # LATIN CAPITAL LETTER F WITH DOT ABOVE
+1E20; C; 1E21; # LATIN CAPITAL LETTER G WITH MACRON
+1E22; C; 1E23; # LATIN CAPITAL LETTER H WITH DOT ABOVE
+1E24; C; 1E25; # LATIN CAPITAL LETTER H WITH DOT BELOW
+1E26; C; 1E27; # LATIN CAPITAL LETTER H WITH DIAERESIS
+1E28; C; 1E29; # LATIN CAPITAL LETTER H WITH CEDILLA
+1E2A; C; 1E2B; # LATIN CAPITAL LETTER H WITH BREVE BELOW
+1E2C; C; 1E2D; # LATIN CAPITAL LETTER I WITH TILDE BELOW
+1E2E; C; 1E2F; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE
+1E30; C; 1E31; # LATIN CAPITAL LETTER K WITH ACUTE
+1E32; C; 1E33; # LATIN CAPITAL LETTER K WITH DOT BELOW
+1E34; C; 1E35; # LATIN CAPITAL LETTER K WITH LINE BELOW
+1E36; C; 1E37; # LATIN CAPITAL LETTER L WITH DOT BELOW
+1E38; C; 1E39; # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON
+1E3A; C; 1E3B; # LATIN CAPITAL LETTER L WITH LINE BELOW
+1E3C; C; 1E3D; # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW
+1E3E; C; 1E3F; # LATIN CAPITAL LETTER M WITH ACUTE
+1E40; C; 1E41; # LATIN CAPITAL LETTER M WITH DOT ABOVE
+1E42; C; 1E43; # LATIN CAPITAL LETTER M WITH DOT BELOW
+1E44; C; 1E45; # LATIN CAPITAL LETTER N WITH DOT ABOVE
+1E46; C; 1E47; # LATIN CAPITAL LETTER N WITH DOT BELOW
+1E48; C; 1E49; # LATIN CAPITAL LETTER N WITH LINE BELOW
+1E4A; C; 1E4B; # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW
+1E4C; C; 1E4D; # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE
+1E4E; C; 1E4F; # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS
+1E50; C; 1E51; # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE
+1E52; C; 1E53; # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE
+1E54; C; 1E55; # LATIN CAPITAL LETTER P WITH ACUTE
+1E56; C; 1E57; # LATIN CAPITAL LETTER P WITH DOT ABOVE
+1E58; C; 1E59; # LATIN CAPITAL LETTER R WITH DOT ABOVE
+1E5A; C; 1E5B; # LATIN CAPITAL LETTER R WITH DOT BELOW
+1E5C; C; 1E5D; # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON
+1E5E; C; 1E5F; # LATIN CAPITAL LETTER R WITH LINE BELOW
+1E60; C; 1E61; # LATIN CAPITAL LETTER S WITH DOT ABOVE
+1E62; C; 1E63; # LATIN CAPITAL LETTER S WITH DOT BELOW
+1E64; C; 1E65; # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE
+1E66; C; 1E67; # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE
+1E68; C; 1E69; # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE
+1E6A; C; 1E6B; # LATIN CAPITAL LETTER T WITH DOT ABOVE
+1E6C; C; 1E6D; # LATIN CAPITAL LETTER T WITH DOT BELOW
+1E6E; C; 1E6F; # LATIN CAPITAL LETTER T WITH LINE BELOW
+1E70; C; 1E71; # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW
+1E72; C; 1E73; # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW
+1E74; C; 1E75; # LATIN CAPITAL LETTER U WITH TILDE BELOW
+1E76; C; 1E77; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW
+1E78; C; 1E79; # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE
+1E7A; C; 1E7B; # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS
+1E7C; C; 1E7D; # LATIN CAPITAL LETTER V WITH TILDE
+1E7E; C; 1E7F; # LATIN CAPITAL LETTER V WITH DOT BELOW
+1E80; C; 1E81; # LATIN CAPITAL LETTER W WITH GRAVE
+1E82; C; 1E83; # LATIN CAPITAL LETTER W WITH ACUTE
+1E84; C; 1E85; # LATIN CAPITAL LETTER W WITH DIAERESIS
+1E86; C; 1E87; # LATIN CAPITAL LETTER W WITH DOT ABOVE
+1E88; C; 1E89; # LATIN CAPITAL LETTER W WITH DOT BELOW
+1E8A; C; 1E8B; # LATIN CAPITAL LETTER X WITH DOT ABOVE
+1E8C; C; 1E8D; # LATIN CAPITAL LETTER X WITH DIAERESIS
+1E8E; C; 1E8F; # LATIN CAPITAL LETTER Y WITH DOT ABOVE
+1E90; C; 1E91; # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX
+1E92; C; 1E93; # LATIN CAPITAL LETTER Z WITH DOT BELOW
+1E94; C; 1E95; # LATIN CAPITAL LETTER Z WITH LINE BELOW
+1E96; F; 0068 0331; # LATIN SMALL LETTER H WITH LINE BELOW
+1E97; F; 0074 0308; # LATIN SMALL LETTER T WITH DIAERESIS
+1E98; F; 0077 030A; # LATIN SMALL LETTER W WITH RING ABOVE
+1E99; F; 0079 030A; # LATIN SMALL LETTER Y WITH RING ABOVE
+1E9A; F; 0061 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING
+1E9B; C; 1E61; # LATIN SMALL LETTER LONG S WITH DOT ABOVE
+1E9E; F; 0073 0073; # LATIN CAPITAL LETTER SHARP S
+1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
+1EA0; C; 1EA1; # LATIN CAPITAL LETTER A WITH DOT BELOW
+1EA2; C; 1EA3; # LATIN CAPITAL LETTER A WITH HOOK ABOVE
+1EA4; C; 1EA5; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE
+1EA6; C; 1EA7; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE
+1EA8; C; 1EA9; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE
+1EAA; C; 1EAB; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE
+1EAC; C; 1EAD; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW
+1EAE; C; 1EAF; # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE
+1EB0; C; 1EB1; # LATIN CAPITAL LETTER A WITH BREVE AND GRAVE
+1EB2; C; 1EB3; # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE
+1EB4; C; 1EB5; # LATIN CAPITAL LETTER A WITH BREVE AND TILDE
+1EB6; C; 1EB7; # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW
+1EB8; C; 1EB9; # LATIN CAPITAL LETTER E WITH DOT BELOW
+1EBA; C; 1EBB; # LATIN CAPITAL LETTER E WITH HOOK ABOVE
+1EBC; C; 1EBD; # LATIN CAPITAL LETTER E WITH TILDE
+1EBE; C; 1EBF; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE
+1EC0; C; 1EC1; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE
+1EC2; C; 1EC3; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE
+1EC4; C; 1EC5; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE
+1EC6; C; 1EC7; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW
+1EC8; C; 1EC9; # LATIN CAPITAL LETTER I WITH HOOK ABOVE
+1ECA; C; 1ECB; # LATIN CAPITAL LETTER I WITH DOT BELOW
+1ECC; C; 1ECD; # LATIN CAPITAL LETTER O WITH DOT BELOW
+1ECE; C; 1ECF; # LATIN CAPITAL LETTER O WITH HOOK ABOVE
+1ED0; C; 1ED1; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE
+1ED2; C; 1ED3; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE
+1ED4; C; 1ED5; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE
+1ED6; C; 1ED7; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE
+1ED8; C; 1ED9; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW
+1EDA; C; 1EDB; # LATIN CAPITAL LETTER O WITH HORN AND ACUTE
+1EDC; C; 1EDD; # LATIN CAPITAL LETTER O WITH HORN AND GRAVE
+1EDE; C; 1EDF; # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE
+1EE0; C; 1EE1; # LATIN CAPITAL LETTER O WITH HORN AND TILDE
+1EE2; C; 1EE3; # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW
+1EE4; C; 1EE5; # LATIN CAPITAL LETTER U WITH DOT BELOW
+1EE6; C; 1EE7; # LATIN CAPITAL LETTER U WITH HOOK ABOVE
+1EE8; C; 1EE9; # LATIN CAPITAL LETTER U WITH HORN AND ACUTE
+1EEA; C; 1EEB; # LATIN CAPITAL LETTER U WITH HORN AND GRAVE
+1EEC; C; 1EED; # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE
+1EEE; C; 1EEF; # LATIN CAPITAL LETTER U WITH HORN AND TILDE
+1EF0; C; 1EF1; # LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW
+1EF2; C; 1EF3; # LATIN CAPITAL LETTER Y WITH GRAVE
+1EF4; C; 1EF5; # LATIN CAPITAL LETTER Y WITH DOT BELOW
+1EF6; C; 1EF7; # LATIN CAPITAL LETTER Y WITH HOOK ABOVE
+1EF8; C; 1EF9; # LATIN CAPITAL LETTER Y WITH TILDE
+1EFA; C; 1EFB; # LATIN CAPITAL LETTER MIDDLE-WELSH LL
+1EFC; C; 1EFD; # LATIN CAPITAL LETTER MIDDLE-WELSH V
+1EFE; C; 1EFF; # LATIN CAPITAL LETTER Y WITH LOOP
+1F08; C; 1F00; # GREEK CAPITAL LETTER ALPHA WITH PSILI
+1F09; C; 1F01; # GREEK CAPITAL LETTER ALPHA WITH DASIA
+1F0A; C; 1F02; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA
+1F0B; C; 1F03; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA
+1F0C; C; 1F04; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA
+1F0D; C; 1F05; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA
+1F0E; C; 1F06; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI
+1F0F; C; 1F07; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI
+1F18; C; 1F10; # GREEK CAPITAL LETTER EPSILON WITH PSILI
+1F19; C; 1F11; # GREEK CAPITAL LETTER EPSILON WITH DASIA
+1F1A; C; 1F12; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA
+1F1B; C; 1F13; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA
+1F1C; C; 1F14; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA
+1F1D; C; 1F15; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
+1F28; C; 1F20; # GREEK CAPITAL LETTER ETA WITH PSILI
+1F29; C; 1F21; # GREEK CAPITAL LETTER ETA WITH DASIA
+1F2A; C; 1F22; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA
+1F2B; C; 1F23; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA
+1F2C; C; 1F24; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA
+1F2D; C; 1F25; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA
+1F2E; C; 1F26; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI
+1F2F; C; 1F27; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI
+1F38; C; 1F30; # GREEK CAPITAL LETTER IOTA WITH PSILI
+1F39; C; 1F31; # GREEK CAPITAL LETTER IOTA WITH DASIA
+1F3A; C; 1F32; # GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA
+1F3B; C; 1F33; # GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA
+1F3C; C; 1F34; # GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA
+1F3D; C; 1F35; # GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA
+1F3E; C; 1F36; # GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI
+1F3F; C; 1F37; # GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI
+1F48; C; 1F40; # GREEK CAPITAL LETTER OMICRON WITH PSILI
+1F49; C; 1F41; # GREEK CAPITAL LETTER OMICRON WITH DASIA
+1F4A; C; 1F42; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA
+1F4B; C; 1F43; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA
+1F4C; C; 1F44; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA
+1F4D; C; 1F45; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA
+1F50; F; 03C5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI
+1F52; F; 03C5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
+1F54; F; 03C5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
+1F56; F; 03C5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
+1F59; C; 1F51; # GREEK CAPITAL LETTER UPSILON WITH DASIA
+1F5B; C; 1F53; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA
+1F5D; C; 1F55; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA
+1F5F; C; 1F57; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI
+1F68; C; 1F60; # GREEK CAPITAL LETTER OMEGA WITH PSILI
+1F69; C; 1F61; # GREEK CAPITAL LETTER OMEGA WITH DASIA
+1F6A; C; 1F62; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA
+1F6B; C; 1F63; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA
+1F6C; C; 1F64; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA
+1F6D; C; 1F65; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA
+1F6E; C; 1F66; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI
+1F6F; C; 1F67; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI
+1F80; F; 1F00 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
+1F81; F; 1F01 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
+1F82; F; 1F02 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1F83; F; 1F03 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1F84; F; 1F04 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1F85; F; 1F05 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1F86; F; 1F06 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1F87; F; 1F07 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1F88; F; 1F00 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
+1F88; S; 1F80; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
+1F89; F; 1F01 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
+1F89; S; 1F81; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
+1F8A; F; 1F02 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1F8A; S; 1F82; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1F8B; F; 1F03 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1F8B; S; 1F83; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1F8C; F; 1F04 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1F8C; S; 1F84; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1F8D; F; 1F05 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1F8D; S; 1F85; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1F8E; F; 1F06 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1F8E; S; 1F86; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1F8F; F; 1F07 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1F90; F; 1F20 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
+1F91; F; 1F21 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
+1F92; F; 1F22 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1F93; F; 1F23 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1F94; F; 1F24 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1F95; F; 1F25 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1F96; F; 1F26 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1F97; F; 1F27 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1F98; F; 1F20 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
+1F98; S; 1F90; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
+1F99; F; 1F21 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
+1F99; S; 1F91; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
+1F9A; F; 1F22 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1F9A; S; 1F92; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1F9B; F; 1F23 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1F9B; S; 1F93; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1F9C; F; 1F24 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1F9C; S; 1F94; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1F9D; F; 1F25 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1F9D; S; 1F95; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1F9E; F; 1F26 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1F9E; S; 1F96; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1F9F; F; 1F27 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1F9F; S; 1F97; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1FA0; F; 1F60 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
+1FA1; F; 1F61 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
+1FA2; F; 1F62 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
+1FA3; F; 1F63 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
+1FA4; F; 1F64 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
+1FA5; F; 1F65 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
+1FA6; F; 1F66 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
+1FA7; F; 1F67 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
+1FA8; F; 1F60 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
+1FA8; S; 1FA0; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
+1FA9; F; 1F61 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
+1FA9; S; 1FA1; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
+1FAA; F; 1F62 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1FAA; S; 1FA2; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
+1FAB; F; 1F63 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1FAB; S; 1FA3; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
+1FAC; F; 1F64 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1FAC; S; 1FA4; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
+1FAD; F; 1F65 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1FAD; S; 1FA5; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
+1FAE; F; 1F66 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1FAE; S; 1FA6; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
+1FAF; F; 1F67 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1FAF; S; 1FA7; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
+1FB2; F; 1F70 03B9; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
+1FB3; F; 03B1 03B9; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
+1FB4; F; 03AC 03B9; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
+1FB6; F; 03B1 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI
+1FB7; F; 03B1 0342 03B9; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
+1FB8; C; 1FB0; # GREEK CAPITAL LETTER ALPHA WITH VRACHY
+1FB9; C; 1FB1; # GREEK CAPITAL LETTER ALPHA WITH MACRON
+1FBA; C; 1F70; # GREEK CAPITAL LETTER ALPHA WITH VARIA
+1FBB; C; 1F71; # GREEK CAPITAL LETTER ALPHA WITH OXIA
+1FBC; F; 03B1 03B9; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+1FBC; S; 1FB3; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
+1FBE; C; 03B9; # GREEK PROSGEGRAMMENI
+1FC2; F; 1F74 03B9; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
+1FC3; F; 03B7 03B9; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
+1FC4; F; 03AE 03B9; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
+1FC6; F; 03B7 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI
+1FC7; F; 03B7 0342 03B9; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
+1FC8; C; 1F72; # GREEK CAPITAL LETTER EPSILON WITH VARIA
+1FC9; C; 1F73; # GREEK CAPITAL LETTER EPSILON WITH OXIA
+1FCA; C; 1F74; # GREEK CAPITAL LETTER ETA WITH VARIA
+1FCB; C; 1F75; # GREEK CAPITAL LETTER ETA WITH OXIA
+1FCC; F; 03B7 03B9; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
+1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
+1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI
+1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
+1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY
+1FD9; C; 1FD1; # GREEK CAPITAL LETTER IOTA WITH MACRON
+1FDA; C; 1F76; # GREEK CAPITAL LETTER IOTA WITH VARIA
+1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA
+1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
+1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI
+1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI
+1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
+1FE8; C; 1FE0; # GREEK CAPITAL LETTER UPSILON WITH VRACHY
+1FE9; C; 1FE1; # GREEK CAPITAL LETTER UPSILON WITH MACRON
+1FEA; C; 1F7A; # GREEK CAPITAL LETTER UPSILON WITH VARIA
+1FEB; C; 1F7B; # GREEK CAPITAL LETTER UPSILON WITH OXIA
+1FEC; C; 1FE5; # GREEK CAPITAL LETTER RHO WITH DASIA
+1FF2; F; 1F7C 03B9; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
+1FF3; F; 03C9 03B9; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
+1FF4; F; 03CE 03B9; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
+1FF6; F; 03C9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI
+1FF7; F; 03C9 0342 03B9; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
+1FF8; C; 1F78; # GREEK CAPITAL LETTER OMICRON WITH VARIA
+1FF9; C; 1F79; # GREEK CAPITAL LETTER OMICRON WITH OXIA
+1FFA; C; 1F7C; # GREEK CAPITAL LETTER OMEGA WITH VARIA
+1FFB; C; 1F7D; # GREEK CAPITAL LETTER OMEGA WITH OXIA
+1FFC; F; 03C9 03B9; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+1FFC; S; 1FF3; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
+2126; C; 03C9; # OHM SIGN
+212A; C; 006B; # KELVIN SIGN
+212B; C; 00E5; # ANGSTROM SIGN
+2132; C; 214E; # TURNED CAPITAL F
+2160; C; 2170; # ROMAN NUMERAL ONE
+2161; C; 2171; # ROMAN NUMERAL TWO
+2162; C; 2172; # ROMAN NUMERAL THREE
+2163; C; 2173; # ROMAN NUMERAL FOUR
+2164; C; 2174; # ROMAN NUMERAL FIVE
+2165; C; 2175; # ROMAN NUMERAL SIX
+2166; C; 2176; # ROMAN NUMERAL SEVEN
+2167; C; 2177; # ROMAN NUMERAL EIGHT
+2168; C; 2178; # ROMAN NUMERAL NINE
+2169; C; 2179; # ROMAN NUMERAL TEN
+216A; C; 217A; # ROMAN NUMERAL ELEVEN
+216B; C; 217B; # ROMAN NUMERAL TWELVE
+216C; C; 217C; # ROMAN NUMERAL FIFTY
+216D; C; 217D; # ROMAN NUMERAL ONE HUNDRED
+216E; C; 217E; # ROMAN NUMERAL FIVE HUNDRED
+216F; C; 217F; # ROMAN NUMERAL ONE THOUSAND
+2183; C; 2184; # ROMAN NUMERAL REVERSED ONE HUNDRED
+24B6; C; 24D0; # CIRCLED LATIN CAPITAL LETTER A
+24B7; C; 24D1; # CIRCLED LATIN CAPITAL LETTER B
+24B8; C; 24D2; # CIRCLED LATIN CAPITAL LETTER C
+24B9; C; 24D3; # CIRCLED LATIN CAPITAL LETTER D
+24BA; C; 24D4; # CIRCLED LATIN CAPITAL LETTER E
+24BB; C; 24D5; # CIRCLED LATIN CAPITAL LETTER F
+24BC; C; 24D6; # CIRCLED LATIN CAPITAL LETTER G
+24BD; C; 24D7; # CIRCLED LATIN CAPITAL LETTER H
+24BE; C; 24D8; # CIRCLED LATIN CAPITAL LETTER I
+24BF; C; 24D9; # CIRCLED LATIN CAPITAL LETTER J
+24C0; C; 24DA; # CIRCLED LATIN CAPITAL LETTER K
+24C1; C; 24DB; # CIRCLED LATIN CAPITAL LETTER L
+24C2; C; 24DC; # CIRCLED LATIN CAPITAL LETTER M
+24C3; C; 24DD; # CIRCLED LATIN CAPITAL LETTER N
+24C4; C; 24DE; # CIRCLED LATIN CAPITAL LETTER O
+24C5; C; 24DF; # CIRCLED LATIN CAPITAL LETTER P
+24C6; C; 24E0; # CIRCLED LATIN CAPITAL LETTER Q
+24C7; C; 24E1; # CIRCLED LATIN CAPITAL LETTER R
+24C8; C; 24E2; # CIRCLED LATIN CAPITAL LETTER S
+24C9; C; 24E3; # CIRCLED LATIN CAPITAL LETTER T
+24CA; C; 24E4; # CIRCLED LATIN CAPITAL LETTER U
+24CB; C; 24E5; # CIRCLED LATIN CAPITAL LETTER V
+24CC; C; 24E6; # CIRCLED LATIN CAPITAL LETTER W
+24CD; C; 24E7; # CIRCLED LATIN CAPITAL LETTER X
+24CE; C; 24E8; # CIRCLED LATIN CAPITAL LETTER Y
+24CF; C; 24E9; # CIRCLED LATIN CAPITAL LETTER Z
+2C00; C; 2C30; # GLAGOLITIC CAPITAL LETTER AZU
+2C01; C; 2C31; # GLAGOLITIC CAPITAL LETTER BUKY
+2C02; C; 2C32; # GLAGOLITIC CAPITAL LETTER VEDE
+2C03; C; 2C33; # GLAGOLITIC CAPITAL LETTER GLAGOLI
+2C04; C; 2C34; # GLAGOLITIC CAPITAL LETTER DOBRO
+2C05; C; 2C35; # GLAGOLITIC CAPITAL LETTER YESTU
+2C06; C; 2C36; # GLAGOLITIC CAPITAL LETTER ZHIVETE
+2C07; C; 2C37; # GLAGOLITIC CAPITAL LETTER DZELO
+2C08; C; 2C38; # GLAGOLITIC CAPITAL LETTER ZEMLJA
+2C09; C; 2C39; # GLAGOLITIC CAPITAL LETTER IZHE
+2C0A; C; 2C3A; # GLAGOLITIC CAPITAL LETTER INITIAL IZHE
+2C0B; C; 2C3B; # GLAGOLITIC CAPITAL LETTER I
+2C0C; C; 2C3C; # GLAGOLITIC CAPITAL LETTER DJERVI
+2C0D; C; 2C3D; # GLAGOLITIC CAPITAL LETTER KAKO
+2C0E; C; 2C3E; # GLAGOLITIC CAPITAL LETTER LJUDIJE
+2C0F; C; 2C3F; # GLAGOLITIC CAPITAL LETTER MYSLITE
+2C10; C; 2C40; # GLAGOLITIC CAPITAL LETTER NASHI
+2C11; C; 2C41; # GLAGOLITIC CAPITAL LETTER ONU
+2C12; C; 2C42; # GLAGOLITIC CAPITAL LETTER POKOJI
+2C13; C; 2C43; # GLAGOLITIC CAPITAL LETTER RITSI
+2C14; C; 2C44; # GLAGOLITIC CAPITAL LETTER SLOVO
+2C15; C; 2C45; # GLAGOLITIC CAPITAL LETTER TVRIDO
+2C16; C; 2C46; # GLAGOLITIC CAPITAL LETTER UKU
+2C17; C; 2C47; # GLAGOLITIC CAPITAL LETTER FRITU
+2C18; C; 2C48; # GLAGOLITIC CAPITAL LETTER HERU
+2C19; C; 2C49; # GLAGOLITIC CAPITAL LETTER OTU
+2C1A; C; 2C4A; # GLAGOLITIC CAPITAL LETTER PE
+2C1B; C; 2C4B; # GLAGOLITIC CAPITAL LETTER SHTA
+2C1C; C; 2C4C; # GLAGOLITIC CAPITAL LETTER TSI
+2C1D; C; 2C4D; # GLAGOLITIC CAPITAL LETTER CHRIVI
+2C1E; C; 2C4E; # GLAGOLITIC CAPITAL LETTER SHA
+2C1F; C; 2C4F; # GLAGOLITIC CAPITAL LETTER YERU
+2C20; C; 2C50; # GLAGOLITIC CAPITAL LETTER YERI
+2C21; C; 2C51; # GLAGOLITIC CAPITAL LETTER YATI
+2C22; C; 2C52; # GLAGOLITIC CAPITAL LETTER SPIDERY HA
+2C23; C; 2C53; # GLAGOLITIC CAPITAL LETTER YU
+2C24; C; 2C54; # GLAGOLITIC CAPITAL LETTER SMALL YUS
+2C25; C; 2C55; # GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL
+2C26; C; 2C56; # GLAGOLITIC CAPITAL LETTER YO
+2C27; C; 2C57; # GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS
+2C28; C; 2C58; # GLAGOLITIC CAPITAL LETTER BIG YUS
+2C29; C; 2C59; # GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS
+2C2A; C; 2C5A; # GLAGOLITIC CAPITAL LETTER FITA
+2C2B; C; 2C5B; # GLAGOLITIC CAPITAL LETTER IZHITSA
+2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC
+2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A
+2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
+2C2F; C; 2C5F; # GLAGOLITIC CAPITAL LETTER CAUDATE CHRIVI
+2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR
+2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE
+2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE
+2C64; C; 027D; # LATIN CAPITAL LETTER R WITH TAIL
+2C67; C; 2C68; # LATIN CAPITAL LETTER H WITH DESCENDER
+2C69; C; 2C6A; # LATIN CAPITAL LETTER K WITH DESCENDER
+2C6B; C; 2C6C; # LATIN CAPITAL LETTER Z WITH DESCENDER
+2C6D; C; 0251; # LATIN CAPITAL LETTER ALPHA
+2C6E; C; 0271; # LATIN CAPITAL LETTER M WITH HOOK
+2C6F; C; 0250; # LATIN CAPITAL LETTER TURNED A
+2C70; C; 0252; # LATIN CAPITAL LETTER TURNED ALPHA
+2C72; C; 2C73; # LATIN CAPITAL LETTER W WITH HOOK
+2C75; C; 2C76; # LATIN CAPITAL LETTER HALF H
+2C7E; C; 023F; # LATIN CAPITAL LETTER S WITH SWASH TAIL
+2C7F; C; 0240; # LATIN CAPITAL LETTER Z WITH SWASH TAIL
+2C80; C; 2C81; # COPTIC CAPITAL LETTER ALFA
+2C82; C; 2C83; # COPTIC CAPITAL LETTER VIDA
+2C84; C; 2C85; # COPTIC CAPITAL LETTER GAMMA
+2C86; C; 2C87; # COPTIC CAPITAL LETTER DALDA
+2C88; C; 2C89; # COPTIC CAPITAL LETTER EIE
+2C8A; C; 2C8B; # COPTIC CAPITAL LETTER SOU
+2C8C; C; 2C8D; # COPTIC CAPITAL LETTER ZATA
+2C8E; C; 2C8F; # COPTIC CAPITAL LETTER HATE
+2C90; C; 2C91; # COPTIC CAPITAL LETTER THETHE
+2C92; C; 2C93; # COPTIC CAPITAL LETTER IAUDA
+2C94; C; 2C95; # COPTIC CAPITAL LETTER KAPA
+2C96; C; 2C97; # COPTIC CAPITAL LETTER LAULA
+2C98; C; 2C99; # COPTIC CAPITAL LETTER MI
+2C9A; C; 2C9B; # COPTIC CAPITAL LETTER NI
+2C9C; C; 2C9D; # COPTIC CAPITAL LETTER KSI
+2C9E; C; 2C9F; # COPTIC CAPITAL LETTER O
+2CA0; C; 2CA1; # COPTIC CAPITAL LETTER PI
+2CA2; C; 2CA3; # COPTIC CAPITAL LETTER RO
+2CA4; C; 2CA5; # COPTIC CAPITAL LETTER SIMA
+2CA6; C; 2CA7; # COPTIC CAPITAL LETTER TAU
+2CA8; C; 2CA9; # COPTIC CAPITAL LETTER UA
+2CAA; C; 2CAB; # COPTIC CAPITAL LETTER FI
+2CAC; C; 2CAD; # COPTIC CAPITAL LETTER KHI
+2CAE; C; 2CAF; # COPTIC CAPITAL LETTER PSI
+2CB0; C; 2CB1; # COPTIC CAPITAL LETTER OOU
+2CB2; C; 2CB3; # COPTIC CAPITAL LETTER DIALECT-P ALEF
+2CB4; C; 2CB5; # COPTIC CAPITAL LETTER OLD COPTIC AIN
+2CB6; C; 2CB7; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE
+2CB8; C; 2CB9; # COPTIC CAPITAL LETTER DIALECT-P KAPA
+2CBA; C; 2CBB; # COPTIC CAPITAL LETTER DIALECT-P NI
+2CBC; C; 2CBD; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI
+2CBE; C; 2CBF; # COPTIC CAPITAL LETTER OLD COPTIC OOU
+2CC0; C; 2CC1; # COPTIC CAPITAL LETTER SAMPI
+2CC2; C; 2CC3; # COPTIC CAPITAL LETTER CROSSED SHEI
+2CC4; C; 2CC5; # COPTIC CAPITAL LETTER OLD COPTIC SHEI
+2CC6; C; 2CC7; # COPTIC CAPITAL LETTER OLD COPTIC ESH
+2CC8; C; 2CC9; # COPTIC CAPITAL LETTER AKHMIMIC KHEI
+2CCA; C; 2CCB; # COPTIC CAPITAL LETTER DIALECT-P HORI
+2CCC; C; 2CCD; # COPTIC CAPITAL LETTER OLD COPTIC HORI
+2CCE; C; 2CCF; # COPTIC CAPITAL LETTER OLD COPTIC HA
+2CD0; C; 2CD1; # COPTIC CAPITAL LETTER L-SHAPED HA
+2CD2; C; 2CD3; # COPTIC CAPITAL LETTER OLD COPTIC HEI
+2CD4; C; 2CD5; # COPTIC CAPITAL LETTER OLD COPTIC HAT
+2CD6; C; 2CD7; # COPTIC CAPITAL LETTER OLD COPTIC GANGIA
+2CD8; C; 2CD9; # COPTIC CAPITAL LETTER OLD COPTIC DJA
+2CDA; C; 2CDB; # COPTIC CAPITAL LETTER OLD COPTIC SHIMA
+2CDC; C; 2CDD; # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA
+2CDE; C; 2CDF; # COPTIC CAPITAL LETTER OLD NUBIAN NGI
+2CE0; C; 2CE1; # COPTIC CAPITAL LETTER OLD NUBIAN NYI
+2CE2; C; 2CE3; # COPTIC CAPITAL LETTER OLD NUBIAN WAU
+2CEB; C; 2CEC; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI
+2CED; C; 2CEE; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA
+2CF2; C; 2CF3; # COPTIC CAPITAL LETTER BOHAIRIC KHEI
+A640; C; A641; # CYRILLIC CAPITAL LETTER ZEMLYA
+A642; C; A643; # CYRILLIC CAPITAL LETTER DZELO
+A644; C; A645; # CYRILLIC CAPITAL LETTER REVERSED DZE
+A646; C; A647; # CYRILLIC CAPITAL LETTER IOTA
+A648; C; A649; # CYRILLIC CAPITAL LETTER DJERV
+A64A; C; A64B; # CYRILLIC CAPITAL LETTER MONOGRAPH UK
+A64C; C; A64D; # CYRILLIC CAPITAL LETTER BROAD OMEGA
+A64E; C; A64F; # CYRILLIC CAPITAL LETTER NEUTRAL YER
+A650; C; A651; # CYRILLIC CAPITAL LETTER YERU WITH BACK YER
+A652; C; A653; # CYRILLIC CAPITAL LETTER IOTIFIED YAT
+A654; C; A655; # CYRILLIC CAPITAL LETTER REVERSED YU
+A656; C; A657; # CYRILLIC CAPITAL LETTER IOTIFIED A
+A658; C; A659; # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS
+A65A; C; A65B; # CYRILLIC CAPITAL LETTER BLENDED YUS
+A65C; C; A65D; # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS
+A65E; C; A65F; # CYRILLIC CAPITAL LETTER YN
+A660; C; A661; # CYRILLIC CAPITAL LETTER REVERSED TSE
+A662; C; A663; # CYRILLIC CAPITAL LETTER SOFT DE
+A664; C; A665; # CYRILLIC CAPITAL LETTER SOFT EL
+A666; C; A667; # CYRILLIC CAPITAL LETTER SOFT EM
+A668; C; A669; # CYRILLIC CAPITAL LETTER MONOCULAR O
+A66A; C; A66B; # CYRILLIC CAPITAL LETTER BINOCULAR O
+A66C; C; A66D; # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O
+A680; C; A681; # CYRILLIC CAPITAL LETTER DWE
+A682; C; A683; # CYRILLIC CAPITAL LETTER DZWE
+A684; C; A685; # CYRILLIC CAPITAL LETTER ZHWE
+A686; C; A687; # CYRILLIC CAPITAL LETTER CCHE
+A688; C; A689; # CYRILLIC CAPITAL LETTER DZZE
+A68A; C; A68B; # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK
+A68C; C; A68D; # CYRILLIC CAPITAL LETTER TWE
+A68E; C; A68F; # CYRILLIC CAPITAL LETTER TSWE
+A690; C; A691; # CYRILLIC CAPITAL LETTER TSSE
+A692; C; A693; # CYRILLIC CAPITAL LETTER TCHE
+A694; C; A695; # CYRILLIC CAPITAL LETTER HWE
+A696; C; A697; # CYRILLIC CAPITAL LETTER SHWE
+A698; C; A699; # CYRILLIC CAPITAL LETTER DOUBLE O
+A69A; C; A69B; # CYRILLIC CAPITAL LETTER CROSSED O
+A722; C; A723; # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF
+A724; C; A725; # LATIN CAPITAL LETTER EGYPTOLOGICAL AIN
+A726; C; A727; # LATIN CAPITAL LETTER HENG
+A728; C; A729; # LATIN CAPITAL LETTER TZ
+A72A; C; A72B; # LATIN CAPITAL LETTER TRESILLO
+A72C; C; A72D; # LATIN CAPITAL LETTER CUATRILLO
+A72E; C; A72F; # LATIN CAPITAL LETTER CUATRILLO WITH COMMA
+A732; C; A733; # LATIN CAPITAL LETTER AA
+A734; C; A735; # LATIN CAPITAL LETTER AO
+A736; C; A737; # LATIN CAPITAL LETTER AU
+A738; C; A739; # LATIN CAPITAL LETTER AV
+A73A; C; A73B; # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR
+A73C; C; A73D; # LATIN CAPITAL LETTER AY
+A73E; C; A73F; # LATIN CAPITAL LETTER REVERSED C WITH DOT
+A740; C; A741; # LATIN CAPITAL LETTER K WITH STROKE
+A742; C; A743; # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE
+A744; C; A745; # LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE
+A746; C; A747; # LATIN CAPITAL LETTER BROKEN L
+A748; C; A749; # LATIN CAPITAL LETTER L WITH HIGH STROKE
+A74A; C; A74B; # LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY
+A74C; C; A74D; # LATIN CAPITAL LETTER O WITH LOOP
+A74E; C; A74F; # LATIN CAPITAL LETTER OO
+A750; C; A751; # LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER
+A752; C; A753; # LATIN CAPITAL LETTER P WITH FLOURISH
+A754; C; A755; # LATIN CAPITAL LETTER P WITH SQUIRREL TAIL
+A756; C; A757; # LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER
+A758; C; A759; # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE
+A75A; C; A75B; # LATIN CAPITAL LETTER R ROTUNDA
+A75C; C; A75D; # LATIN CAPITAL LETTER RUM ROTUNDA
+A75E; C; A75F; # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE
+A760; C; A761; # LATIN CAPITAL LETTER VY
+A762; C; A763; # LATIN CAPITAL LETTER VISIGOTHIC Z
+A764; C; A765; # LATIN CAPITAL LETTER THORN WITH STROKE
+A766; C; A767; # LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER
+A768; C; A769; # LATIN CAPITAL LETTER VEND
+A76A; C; A76B; # LATIN CAPITAL LETTER ET
+A76C; C; A76D; # LATIN CAPITAL LETTER IS
+A76E; C; A76F; # LATIN CAPITAL LETTER CON
+A779; C; A77A; # LATIN CAPITAL LETTER INSULAR D
+A77B; C; A77C; # LATIN CAPITAL LETTER INSULAR F
+A77D; C; 1D79; # LATIN CAPITAL LETTER INSULAR G
+A77E; C; A77F; # LATIN CAPITAL LETTER TURNED INSULAR G
+A780; C; A781; # LATIN CAPITAL LETTER TURNED L
+A782; C; A783; # LATIN CAPITAL LETTER INSULAR R
+A784; C; A785; # LATIN CAPITAL LETTER INSULAR S
+A786; C; A787; # LATIN CAPITAL LETTER INSULAR T
+A78B; C; A78C; # LATIN CAPITAL LETTER SALTILLO
+A78D; C; 0265; # LATIN CAPITAL LETTER TURNED H
+A790; C; A791; # LATIN CAPITAL LETTER N WITH DESCENDER
+A792; C; A793; # LATIN CAPITAL LETTER C WITH BAR
+A796; C; A797; # LATIN CAPITAL LETTER B WITH FLOURISH
+A798; C; A799; # LATIN CAPITAL LETTER F WITH STROKE
+A79A; C; A79B; # LATIN CAPITAL LETTER VOLAPUK AE
+A79C; C; A79D; # LATIN CAPITAL LETTER VOLAPUK OE
+A79E; C; A79F; # LATIN CAPITAL LETTER VOLAPUK UE
+A7A0; C; A7A1; # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE
+A7A2; C; A7A3; # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE
+A7A4; C; A7A5; # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE
+A7A6; C; A7A7; # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE
+A7A8; C; A7A9; # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE
+A7AA; C; 0266; # LATIN CAPITAL LETTER H WITH HOOK
+A7AB; C; 025C; # LATIN CAPITAL LETTER REVERSED OPEN E
+A7AC; C; 0261; # LATIN CAPITAL LETTER SCRIPT G
+A7AD; C; 026C; # LATIN CAPITAL LETTER L WITH BELT
+A7AE; C; 026A; # LATIN CAPITAL LETTER SMALL CAPITAL I
+A7B0; C; 029E; # LATIN CAPITAL LETTER TURNED K
+A7B1; C; 0287; # LATIN CAPITAL LETTER TURNED T
+A7B2; C; 029D; # LATIN CAPITAL LETTER J WITH CROSSED-TAIL
+A7B3; C; AB53; # LATIN CAPITAL LETTER CHI
+A7B4; C; A7B5; # LATIN CAPITAL LETTER BETA
+A7B6; C; A7B7; # LATIN CAPITAL LETTER OMEGA
+A7B8; C; A7B9; # LATIN CAPITAL LETTER U WITH STROKE
+A7BA; C; A7BB; # LATIN CAPITAL LETTER GLOTTAL A
+A7BC; C; A7BD; # LATIN CAPITAL LETTER GLOTTAL I
+A7BE; C; A7BF; # LATIN CAPITAL LETTER GLOTTAL U
+A7C0; C; A7C1; # LATIN CAPITAL LETTER OLD POLISH O
+A7C2; C; A7C3; # LATIN CAPITAL LETTER ANGLICANA W
+A7C4; C; A794; # LATIN CAPITAL LETTER C WITH PALATAL HOOK
+A7C5; C; 0282; # LATIN CAPITAL LETTER S WITH HOOK
+A7C6; C; 1D8E; # LATIN CAPITAL LETTER Z WITH PALATAL HOOK
+A7C7; C; A7C8; # LATIN CAPITAL LETTER D WITH SHORT STROKE OVERLAY
+A7C9; C; A7CA; # LATIN CAPITAL LETTER S WITH SHORT STROKE OVERLAY
+A7D0; C; A7D1; # LATIN CAPITAL LETTER CLOSED INSULAR G
+A7D6; C; A7D7; # LATIN CAPITAL LETTER MIDDLE SCOTS S
+A7D8; C; A7D9; # LATIN CAPITAL LETTER SIGMOID S
+A7F5; C; A7F6; # LATIN CAPITAL LETTER REVERSED HALF H
+AB70; C; 13A0; # CHEROKEE SMALL LETTER A
+AB71; C; 13A1; # CHEROKEE SMALL LETTER E
+AB72; C; 13A2; # CHEROKEE SMALL LETTER I
+AB73; C; 13A3; # CHEROKEE SMALL LETTER O
+AB74; C; 13A4; # CHEROKEE SMALL LETTER U
+AB75; C; 13A5; # CHEROKEE SMALL LETTER V
+AB76; C; 13A6; # CHEROKEE SMALL LETTER GA
+AB77; C; 13A7; # CHEROKEE SMALL LETTER KA
+AB78; C; 13A8; # CHEROKEE SMALL LETTER GE
+AB79; C; 13A9; # CHEROKEE SMALL LETTER GI
+AB7A; C; 13AA; # CHEROKEE SMALL LETTER GO
+AB7B; C; 13AB; # CHEROKEE SMALL LETTER GU
+AB7C; C; 13AC; # CHEROKEE SMALL LETTER GV
+AB7D; C; 13AD; # CHEROKEE SMALL LETTER HA
+AB7E; C; 13AE; # CHEROKEE SMALL LETTER HE
+AB7F; C; 13AF; # CHEROKEE SMALL LETTER HI
+AB80; C; 13B0; # CHEROKEE SMALL LETTER HO
+AB81; C; 13B1; # CHEROKEE SMALL LETTER HU
+AB82; C; 13B2; # CHEROKEE SMALL LETTER HV
+AB83; C; 13B3; # CHEROKEE SMALL LETTER LA
+AB84; C; 13B4; # CHEROKEE SMALL LETTER LE
+AB85; C; 13B5; # CHEROKEE SMALL LETTER LI
+AB86; C; 13B6; # CHEROKEE SMALL LETTER LO
+AB87; C; 13B7; # CHEROKEE SMALL LETTER LU
+AB88; C; 13B8; # CHEROKEE SMALL LETTER LV
+AB89; C; 13B9; # CHEROKEE SMALL LETTER MA
+AB8A; C; 13BA; # CHEROKEE SMALL LETTER ME
+AB8B; C; 13BB; # CHEROKEE SMALL LETTER MI
+AB8C; C; 13BC; # CHEROKEE SMALL LETTER MO
+AB8D; C; 13BD; # CHEROKEE SMALL LETTER MU
+AB8E; C; 13BE; # CHEROKEE SMALL LETTER NA
+AB8F; C; 13BF; # CHEROKEE SMALL LETTER HNA
+AB90; C; 13C0; # CHEROKEE SMALL LETTER NAH
+AB91; C; 13C1; # CHEROKEE SMALL LETTER NE
+AB92; C; 13C2; # CHEROKEE SMALL LETTER NI
+AB93; C; 13C3; # CHEROKEE SMALL LETTER NO
+AB94; C; 13C4; # CHEROKEE SMALL LETTER NU
+AB95; C; 13C5; # CHEROKEE SMALL LETTER NV
+AB96; C; 13C6; # CHEROKEE SMALL LETTER QUA
+AB97; C; 13C7; # CHEROKEE SMALL LETTER QUE
+AB98; C; 13C8; # CHEROKEE SMALL LETTER QUI
+AB99; C; 13C9; # CHEROKEE SMALL LETTER QUO
+AB9A; C; 13CA; # CHEROKEE SMALL LETTER QUU
+AB9B; C; 13CB; # CHEROKEE SMALL LETTER QUV
+AB9C; C; 13CC; # CHEROKEE SMALL LETTER SA
+AB9D; C; 13CD; # CHEROKEE SMALL LETTER S
+AB9E; C; 13CE; # CHEROKEE SMALL LETTER SE
+AB9F; C; 13CF; # CHEROKEE SMALL LETTER SI
+ABA0; C; 13D0; # CHEROKEE SMALL LETTER SO
+ABA1; C; 13D1; # CHEROKEE SMALL LETTER SU
+ABA2; C; 13D2; # CHEROKEE SMALL LETTER SV
+ABA3; C; 13D3; # CHEROKEE SMALL LETTER DA
+ABA4; C; 13D4; # CHEROKEE SMALL LETTER TA
+ABA5; C; 13D5; # CHEROKEE SMALL LETTER DE
+ABA6; C; 13D6; # CHEROKEE SMALL LETTER TE
+ABA7; C; 13D7; # CHEROKEE SMALL LETTER DI
+ABA8; C; 13D8; # CHEROKEE SMALL LETTER TI
+ABA9; C; 13D9; # CHEROKEE SMALL LETTER DO
+ABAA; C; 13DA; # CHEROKEE SMALL LETTER DU
+ABAB; C; 13DB; # CHEROKEE SMALL LETTER DV
+ABAC; C; 13DC; # CHEROKEE SMALL LETTER DLA
+ABAD; C; 13DD; # CHEROKEE SMALL LETTER TLA
+ABAE; C; 13DE; # CHEROKEE SMALL LETTER TLE
+ABAF; C; 13DF; # CHEROKEE SMALL LETTER TLI
+ABB0; C; 13E0; # CHEROKEE SMALL LETTER TLO
+ABB1; C; 13E1; # CHEROKEE SMALL LETTER TLU
+ABB2; C; 13E2; # CHEROKEE SMALL LETTER TLV
+ABB3; C; 13E3; # CHEROKEE SMALL LETTER TSA
+ABB4; C; 13E4; # CHEROKEE SMALL LETTER TSE
+ABB5; C; 13E5; # CHEROKEE SMALL LETTER TSI
+ABB6; C; 13E6; # CHEROKEE SMALL LETTER TSO
+ABB7; C; 13E7; # CHEROKEE SMALL LETTER TSU
+ABB8; C; 13E8; # CHEROKEE SMALL LETTER TSV
+ABB9; C; 13E9; # CHEROKEE SMALL LETTER WA
+ABBA; C; 13EA; # CHEROKEE SMALL LETTER WE
+ABBB; C; 13EB; # CHEROKEE SMALL LETTER WI
+ABBC; C; 13EC; # CHEROKEE SMALL LETTER WO
+ABBD; C; 13ED; # CHEROKEE SMALL LETTER WU
+ABBE; C; 13EE; # CHEROKEE SMALL LETTER WV
+ABBF; C; 13EF; # CHEROKEE SMALL LETTER YA
+FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF
+FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI
+FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL
+FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI
+FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL
+FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T
+FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST
+FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW
+FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH
+FB15; F; 0574 056B; # ARMENIAN SMALL LIGATURE MEN INI
+FB16; F; 057E 0576; # ARMENIAN SMALL LIGATURE VEW NOW
+FB17; F; 0574 056D; # ARMENIAN SMALL LIGATURE MEN XEH
+FF21; C; FF41; # FULLWIDTH LATIN CAPITAL LETTER A
+FF22; C; FF42; # FULLWIDTH LATIN CAPITAL LETTER B
+FF23; C; FF43; # FULLWIDTH LATIN CAPITAL LETTER C
+FF24; C; FF44; # FULLWIDTH LATIN CAPITAL LETTER D
+FF25; C; FF45; # FULLWIDTH LATIN CAPITAL LETTER E
+FF26; C; FF46; # FULLWIDTH LATIN CAPITAL LETTER F
+FF27; C; FF47; # FULLWIDTH LATIN CAPITAL LETTER G
+FF28; C; FF48; # FULLWIDTH LATIN CAPITAL LETTER H
+FF29; C; FF49; # FULLWIDTH LATIN CAPITAL LETTER I
+FF2A; C; FF4A; # FULLWIDTH LATIN CAPITAL LETTER J
+FF2B; C; FF4B; # FULLWIDTH LATIN CAPITAL LETTER K
+FF2C; C; FF4C; # FULLWIDTH LATIN CAPITAL LETTER L
+FF2D; C; FF4D; # FULLWIDTH LATIN CAPITAL LETTER M
+FF2E; C; FF4E; # FULLWIDTH LATIN CAPITAL LETTER N
+FF2F; C; FF4F; # FULLWIDTH LATIN CAPITAL LETTER O
+FF30; C; FF50; # FULLWIDTH LATIN CAPITAL LETTER P
+FF31; C; FF51; # FULLWIDTH LATIN CAPITAL LETTER Q
+FF32; C; FF52; # FULLWIDTH LATIN CAPITAL LETTER R
+FF33; C; FF53; # FULLWIDTH LATIN CAPITAL LETTER S
+FF34; C; FF54; # FULLWIDTH LATIN CAPITAL LETTER T
+FF35; C; FF55; # FULLWIDTH LATIN CAPITAL LETTER U
+FF36; C; FF56; # FULLWIDTH LATIN CAPITAL LETTER V
+FF37; C; FF57; # FULLWIDTH LATIN CAPITAL LETTER W
+FF38; C; FF58; # FULLWIDTH LATIN CAPITAL LETTER X
+FF39; C; FF59; # FULLWIDTH LATIN CAPITAL LETTER Y
+FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
+10400; C; 10428; # DESERET CAPITAL LETTER LONG I
+10401; C; 10429; # DESERET CAPITAL LETTER LONG E
+10402; C; 1042A; # DESERET CAPITAL LETTER LONG A
+10403; C; 1042B; # DESERET CAPITAL LETTER LONG AH
+10404; C; 1042C; # DESERET CAPITAL LETTER LONG O
+10405; C; 1042D; # DESERET CAPITAL LETTER LONG OO
+10406; C; 1042E; # DESERET CAPITAL LETTER SHORT I
+10407; C; 1042F; # DESERET CAPITAL LETTER SHORT E
+10408; C; 10430; # DESERET CAPITAL LETTER SHORT A
+10409; C; 10431; # DESERET CAPITAL LETTER SHORT AH
+1040A; C; 10432; # DESERET CAPITAL LETTER SHORT O
+1040B; C; 10433; # DESERET CAPITAL LETTER SHORT OO
+1040C; C; 10434; # DESERET CAPITAL LETTER AY
+1040D; C; 10435; # DESERET CAPITAL LETTER OW
+1040E; C; 10436; # DESERET CAPITAL LETTER WU
+1040F; C; 10437; # DESERET CAPITAL LETTER YEE
+10410; C; 10438; # DESERET CAPITAL LETTER H
+10411; C; 10439; # DESERET CAPITAL LETTER PEE
+10412; C; 1043A; # DESERET CAPITAL LETTER BEE
+10413; C; 1043B; # DESERET CAPITAL LETTER TEE
+10414; C; 1043C; # DESERET CAPITAL LETTER DEE
+10415; C; 1043D; # DESERET CAPITAL LETTER CHEE
+10416; C; 1043E; # DESERET CAPITAL LETTER JEE
+10417; C; 1043F; # DESERET CAPITAL LETTER KAY
+10418; C; 10440; # DESERET CAPITAL LETTER GAY
+10419; C; 10441; # DESERET CAPITAL LETTER EF
+1041A; C; 10442; # DESERET CAPITAL LETTER VEE
+1041B; C; 10443; # DESERET CAPITAL LETTER ETH
+1041C; C; 10444; # DESERET CAPITAL LETTER THEE
+1041D; C; 10445; # DESERET CAPITAL LETTER ES
+1041E; C; 10446; # DESERET CAPITAL LETTER ZEE
+1041F; C; 10447; # DESERET CAPITAL LETTER ESH
+10420; C; 10448; # DESERET CAPITAL LETTER ZHEE
+10421; C; 10449; # DESERET CAPITAL LETTER ER
+10422; C; 1044A; # DESERET CAPITAL LETTER EL
+10423; C; 1044B; # DESERET CAPITAL LETTER EM
+10424; C; 1044C; # DESERET CAPITAL LETTER EN
+10425; C; 1044D; # DESERET CAPITAL LETTER ENG
+10426; C; 1044E; # DESERET CAPITAL LETTER OI
+10427; C; 1044F; # DESERET CAPITAL LETTER EW
+104B0; C; 104D8; # OSAGE CAPITAL LETTER A
+104B1; C; 104D9; # OSAGE CAPITAL LETTER AI
+104B2; C; 104DA; # OSAGE CAPITAL LETTER AIN
+104B3; C; 104DB; # OSAGE CAPITAL LETTER AH
+104B4; C; 104DC; # OSAGE CAPITAL LETTER BRA
+104B5; C; 104DD; # OSAGE CAPITAL LETTER CHA
+104B6; C; 104DE; # OSAGE CAPITAL LETTER EHCHA
+104B7; C; 104DF; # OSAGE CAPITAL LETTER E
+104B8; C; 104E0; # OSAGE CAPITAL LETTER EIN
+104B9; C; 104E1; # OSAGE CAPITAL LETTER HA
+104BA; C; 104E2; # OSAGE CAPITAL LETTER HYA
+104BB; C; 104E3; # OSAGE CAPITAL LETTER I
+104BC; C; 104E4; # OSAGE CAPITAL LETTER KA
+104BD; C; 104E5; # OSAGE CAPITAL LETTER EHKA
+104BE; C; 104E6; # OSAGE CAPITAL LETTER KYA
+104BF; C; 104E7; # OSAGE CAPITAL LETTER LA
+104C0; C; 104E8; # OSAGE CAPITAL LETTER MA
+104C1; C; 104E9; # OSAGE CAPITAL LETTER NA
+104C2; C; 104EA; # OSAGE CAPITAL LETTER O
+104C3; C; 104EB; # OSAGE CAPITAL LETTER OIN
+104C4; C; 104EC; # OSAGE CAPITAL LETTER PA
+104C5; C; 104ED; # OSAGE CAPITAL LETTER EHPA
+104C6; C; 104EE; # OSAGE CAPITAL LETTER SA
+104C7; C; 104EF; # OSAGE CAPITAL LETTER SHA
+104C8; C; 104F0; # OSAGE CAPITAL LETTER TA
+104C9; C; 104F1; # OSAGE CAPITAL LETTER EHTA
+104CA; C; 104F2; # OSAGE CAPITAL LETTER TSA
+104CB; C; 104F3; # OSAGE CAPITAL LETTER EHTSA
+104CC; C; 104F4; # OSAGE CAPITAL LETTER TSHA
+104CD; C; 104F5; # OSAGE CAPITAL LETTER DHA
+104CE; C; 104F6; # OSAGE CAPITAL LETTER U
+104CF; C; 104F7; # OSAGE CAPITAL LETTER WA
+104D0; C; 104F8; # OSAGE CAPITAL LETTER KHA
+104D1; C; 104F9; # OSAGE CAPITAL LETTER GHA
+104D2; C; 104FA; # OSAGE CAPITAL LETTER ZA
+104D3; C; 104FB; # OSAGE CAPITAL LETTER ZHA
+10570; C; 10597; # VITHKUQI CAPITAL LETTER A
+10571; C; 10598; # VITHKUQI CAPITAL LETTER BBE
+10572; C; 10599; # VITHKUQI CAPITAL LETTER BE
+10573; C; 1059A; # VITHKUQI CAPITAL LETTER CE
+10574; C; 1059B; # VITHKUQI CAPITAL LETTER CHE
+10575; C; 1059C; # VITHKUQI CAPITAL LETTER DE
+10576; C; 1059D; # VITHKUQI CAPITAL LETTER DHE
+10577; C; 1059E; # VITHKUQI CAPITAL LETTER EI
+10578; C; 1059F; # VITHKUQI CAPITAL LETTER E
+10579; C; 105A0; # VITHKUQI CAPITAL LETTER FE
+1057A; C; 105A1; # VITHKUQI CAPITAL LETTER GA
+1057C; C; 105A3; # VITHKUQI CAPITAL LETTER HA
+1057D; C; 105A4; # VITHKUQI CAPITAL LETTER HHA
+1057E; C; 105A5; # VITHKUQI CAPITAL LETTER I
+1057F; C; 105A6; # VITHKUQI CAPITAL LETTER IJE
+10580; C; 105A7; # VITHKUQI CAPITAL LETTER JE
+10581; C; 105A8; # VITHKUQI CAPITAL LETTER KA
+10582; C; 105A9; # VITHKUQI CAPITAL LETTER LA
+10583; C; 105AA; # VITHKUQI CAPITAL LETTER LLA
+10584; C; 105AB; # VITHKUQI CAPITAL LETTER ME
+10585; C; 105AC; # VITHKUQI CAPITAL LETTER NE
+10586; C; 105AD; # VITHKUQI CAPITAL LETTER NJE
+10587; C; 105AE; # VITHKUQI CAPITAL LETTER O
+10588; C; 105AF; # VITHKUQI CAPITAL LETTER PE
+10589; C; 105B0; # VITHKUQI CAPITAL LETTER QA
+1058A; C; 105B1; # VITHKUQI CAPITAL LETTER RE
+1058C; C; 105B3; # VITHKUQI CAPITAL LETTER SE
+1058D; C; 105B4; # VITHKUQI CAPITAL LETTER SHE
+1058E; C; 105B5; # VITHKUQI CAPITAL LETTER TE
+1058F; C; 105B6; # VITHKUQI CAPITAL LETTER THE
+10590; C; 105B7; # VITHKUQI CAPITAL LETTER U
+10591; C; 105B8; # VITHKUQI CAPITAL LETTER VE
+10592; C; 105B9; # VITHKUQI CAPITAL LETTER XE
+10594; C; 105BB; # VITHKUQI CAPITAL LETTER Y
+10595; C; 105BC; # VITHKUQI CAPITAL LETTER ZE
+10C80; C; 10CC0; # OLD HUNGARIAN CAPITAL LETTER A
+10C81; C; 10CC1; # OLD HUNGARIAN CAPITAL LETTER AA
+10C82; C; 10CC2; # OLD HUNGARIAN CAPITAL LETTER EB
+10C83; C; 10CC3; # OLD HUNGARIAN CAPITAL LETTER AMB
+10C84; C; 10CC4; # OLD HUNGARIAN CAPITAL LETTER EC
+10C85; C; 10CC5; # OLD HUNGARIAN CAPITAL LETTER ENC
+10C86; C; 10CC6; # OLD HUNGARIAN CAPITAL LETTER ECS
+10C87; C; 10CC7; # OLD HUNGARIAN CAPITAL LETTER ED
+10C88; C; 10CC8; # OLD HUNGARIAN CAPITAL LETTER AND
+10C89; C; 10CC9; # OLD HUNGARIAN CAPITAL LETTER E
+10C8A; C; 10CCA; # OLD HUNGARIAN CAPITAL LETTER CLOSE E
+10C8B; C; 10CCB; # OLD HUNGARIAN CAPITAL LETTER EE
+10C8C; C; 10CCC; # OLD HUNGARIAN CAPITAL LETTER EF
+10C8D; C; 10CCD; # OLD HUNGARIAN CAPITAL LETTER EG
+10C8E; C; 10CCE; # OLD HUNGARIAN CAPITAL LETTER EGY
+10C8F; C; 10CCF; # OLD HUNGARIAN CAPITAL LETTER EH
+10C90; C; 10CD0; # OLD HUNGARIAN CAPITAL LETTER I
+10C91; C; 10CD1; # OLD HUNGARIAN CAPITAL LETTER II
+10C92; C; 10CD2; # OLD HUNGARIAN CAPITAL LETTER EJ
+10C93; C; 10CD3; # OLD HUNGARIAN CAPITAL LETTER EK
+10C94; C; 10CD4; # OLD HUNGARIAN CAPITAL LETTER AK
+10C95; C; 10CD5; # OLD HUNGARIAN CAPITAL LETTER UNK
+10C96; C; 10CD6; # OLD HUNGARIAN CAPITAL LETTER EL
+10C97; C; 10CD7; # OLD HUNGARIAN CAPITAL LETTER ELY
+10C98; C; 10CD8; # OLD HUNGARIAN CAPITAL LETTER EM
+10C99; C; 10CD9; # OLD HUNGARIAN CAPITAL LETTER EN
+10C9A; C; 10CDA; # OLD HUNGARIAN CAPITAL LETTER ENY
+10C9B; C; 10CDB; # OLD HUNGARIAN CAPITAL LETTER O
+10C9C; C; 10CDC; # OLD HUNGARIAN CAPITAL LETTER OO
+10C9D; C; 10CDD; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG OE
+10C9E; C; 10CDE; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA OE
+10C9F; C; 10CDF; # OLD HUNGARIAN CAPITAL LETTER OEE
+10CA0; C; 10CE0; # OLD HUNGARIAN CAPITAL LETTER EP
+10CA1; C; 10CE1; # OLD HUNGARIAN CAPITAL LETTER EMP
+10CA2; C; 10CE2; # OLD HUNGARIAN CAPITAL LETTER ER
+10CA3; C; 10CE3; # OLD HUNGARIAN CAPITAL LETTER SHORT ER
+10CA4; C; 10CE4; # OLD HUNGARIAN CAPITAL LETTER ES
+10CA5; C; 10CE5; #
OLD HUNGARIAN CAPITAL LETTER ESZ +10CA6; C; 10CE6; # OLD HUNGARIAN CAPITAL LETTER ET +10CA7; C; 10CE7; # OLD HUNGARIAN CAPITAL LETTER ENT +10CA8; C; 10CE8; # OLD HUNGARIAN CAPITAL LETTER ETY +10CA9; C; 10CE9; # OLD HUNGARIAN CAPITAL LETTER ECH +10CAA; C; 10CEA; # OLD HUNGARIAN CAPITAL LETTER U +10CAB; C; 10CEB; # OLD HUNGARIAN CAPITAL LETTER UU +10CAC; C; 10CEC; # OLD HUNGARIAN CAPITAL LETTER NIKOLSBURG UE +10CAD; C; 10CED; # OLD HUNGARIAN CAPITAL LETTER RUDIMENTA UE +10CAE; C; 10CEE; # OLD HUNGARIAN CAPITAL LETTER EV +10CAF; C; 10CEF; # OLD HUNGARIAN CAPITAL LETTER EZ +10CB0; C; 10CF0; # OLD HUNGARIAN CAPITAL LETTER EZS +10CB1; C; 10CF1; # OLD HUNGARIAN CAPITAL LETTER ENT-SHAPED SIGN +10CB2; C; 10CF2; # OLD HUNGARIAN CAPITAL LETTER US +118A0; C; 118C0; # WARANG CITI CAPITAL LETTER NGAA +118A1; C; 118C1; # WARANG CITI CAPITAL LETTER A +118A2; C; 118C2; # WARANG CITI CAPITAL LETTER WI +118A3; C; 118C3; # WARANG CITI CAPITAL LETTER YU +118A4; C; 118C4; # WARANG CITI CAPITAL LETTER YA +118A5; C; 118C5; # WARANG CITI CAPITAL LETTER YO +118A6; C; 118C6; # WARANG CITI CAPITAL LETTER II +118A7; C; 118C7; # WARANG CITI CAPITAL LETTER UU +118A8; C; 118C8; # WARANG CITI CAPITAL LETTER E +118A9; C; 118C9; # WARANG CITI CAPITAL LETTER O +118AA; C; 118CA; # WARANG CITI CAPITAL LETTER ANG +118AB; C; 118CB; # WARANG CITI CAPITAL LETTER GA +118AC; C; 118CC; # WARANG CITI CAPITAL LETTER KO +118AD; C; 118CD; # WARANG CITI CAPITAL LETTER ENY +118AE; C; 118CE; # WARANG CITI CAPITAL LETTER YUJ +118AF; C; 118CF; # WARANG CITI CAPITAL LETTER UC +118B0; C; 118D0; # WARANG CITI CAPITAL LETTER ENN +118B1; C; 118D1; # WARANG CITI CAPITAL LETTER ODD +118B2; C; 118D2; # WARANG CITI CAPITAL LETTER TTE +118B3; C; 118D3; # WARANG CITI CAPITAL LETTER NUNG +118B4; C; 118D4; # WARANG CITI CAPITAL LETTER DA +118B5; C; 118D5; # WARANG CITI CAPITAL LETTER AT +118B6; C; 118D6; # WARANG CITI CAPITAL LETTER AM +118B7; C; 118D7; # WARANG CITI CAPITAL LETTER BU +118B8; C; 118D8; # WARANG CITI CAPITAL LETTER PU +118B9; C; 118D9; # WARANG CITI CAPITAL LETTER HIYO +118BA; C; 118DA; # WARANG CITI CAPITAL LETTER HOLO +118BB; C; 118DB; # WARANG CITI CAPITAL LETTER HORR +118BC; C; 118DC; # WARANG CITI CAPITAL LETTER HAR +118BD; C; 118DD; # WARANG CITI CAPITAL LETTER SSUU +118BE; C; 118DE; # WARANG CITI CAPITAL LETTER SII +118BF; C; 118DF; # WARANG CITI CAPITAL LETTER VIYO +16E40; C; 16E60; # MEDEFAIDRIN CAPITAL LETTER M +16E41; C; 16E61; # MEDEFAIDRIN CAPITAL LETTER S +16E42; C; 16E62; # MEDEFAIDRIN CAPITAL LETTER V +16E43; C; 16E63; # MEDEFAIDRIN CAPITAL LETTER W +16E44; C; 16E64; # MEDEFAIDRIN CAPITAL LETTER ATIU +16E45; C; 16E65; # MEDEFAIDRIN CAPITAL LETTER Z +16E46; C; 16E66; # MEDEFAIDRIN CAPITAL LETTER KP +16E47; C; 16E67; # MEDEFAIDRIN CAPITAL LETTER P +16E48; C; 16E68; # MEDEFAIDRIN CAPITAL LETTER T +16E49; C; 16E69; # MEDEFAIDRIN CAPITAL LETTER G +16E4A; C; 16E6A; # MEDEFAIDRIN CAPITAL LETTER F +16E4B; C; 16E6B; # MEDEFAIDRIN CAPITAL LETTER I +16E4C; C; 16E6C; # MEDEFAIDRIN CAPITAL LETTER K +16E4D; C; 16E6D; # MEDEFAIDRIN CAPITAL LETTER A +16E4E; C; 16E6E; # MEDEFAIDRIN CAPITAL LETTER J +16E4F; C; 16E6F; # MEDEFAIDRIN CAPITAL LETTER E +16E50; C; 16E70; # MEDEFAIDRIN CAPITAL LETTER B +16E51; C; 16E71; # MEDEFAIDRIN CAPITAL LETTER C +16E52; C; 16E72; # MEDEFAIDRIN CAPITAL LETTER U +16E53; C; 16E73; # MEDEFAIDRIN CAPITAL LETTER YU +16E54; C; 16E74; # MEDEFAIDRIN CAPITAL LETTER L +16E55; C; 16E75; # MEDEFAIDRIN CAPITAL LETTER Q +16E56; C; 16E76; # MEDEFAIDRIN CAPITAL LETTER HP +16E57; C; 16E77; # MEDEFAIDRIN CAPITAL LETTER NY 
+16E58; C; 16E78; # MEDEFAIDRIN CAPITAL LETTER X +16E59; C; 16E79; # MEDEFAIDRIN CAPITAL LETTER D +16E5A; C; 16E7A; # MEDEFAIDRIN CAPITAL LETTER OE +16E5B; C; 16E7B; # MEDEFAIDRIN CAPITAL LETTER N +16E5C; C; 16E7C; # MEDEFAIDRIN CAPITAL LETTER R +16E5D; C; 16E7D; # MEDEFAIDRIN CAPITAL LETTER O +16E5E; C; 16E7E; # MEDEFAIDRIN CAPITAL LETTER AI +16E5F; C; 16E7F; # MEDEFAIDRIN CAPITAL LETTER Y +1E900; C; 1E922; # ADLAM CAPITAL LETTER ALIF +1E901; C; 1E923; # ADLAM CAPITAL LETTER DAALI +1E902; C; 1E924; # ADLAM CAPITAL LETTER LAAM +1E903; C; 1E925; # ADLAM CAPITAL LETTER MIIM +1E904; C; 1E926; # ADLAM CAPITAL LETTER BA +1E905; C; 1E927; # ADLAM CAPITAL LETTER SINNYIIYHE +1E906; C; 1E928; # ADLAM CAPITAL LETTER PE +1E907; C; 1E929; # ADLAM CAPITAL LETTER BHE +1E908; C; 1E92A; # ADLAM CAPITAL LETTER RA +1E909; C; 1E92B; # ADLAM CAPITAL LETTER E +1E90A; C; 1E92C; # ADLAM CAPITAL LETTER FA +1E90B; C; 1E92D; # ADLAM CAPITAL LETTER I +1E90C; C; 1E92E; # ADLAM CAPITAL LETTER O +1E90D; C; 1E92F; # ADLAM CAPITAL LETTER DHA +1E90E; C; 1E930; # ADLAM CAPITAL LETTER YHE +1E90F; C; 1E931; # ADLAM CAPITAL LETTER WAW +1E910; C; 1E932; # ADLAM CAPITAL LETTER NUN +1E911; C; 1E933; # ADLAM CAPITAL LETTER KAF +1E912; C; 1E934; # ADLAM CAPITAL LETTER YA +1E913; C; 1E935; # ADLAM CAPITAL LETTER U +1E914; C; 1E936; # ADLAM CAPITAL LETTER JIIM +1E915; C; 1E937; # ADLAM CAPITAL LETTER CHI +1E916; C; 1E938; # ADLAM CAPITAL LETTER HA +1E917; C; 1E939; # ADLAM CAPITAL LETTER QAAF +1E918; C; 1E93A; # ADLAM CAPITAL LETTER GA +1E919; C; 1E93B; # ADLAM CAPITAL LETTER NYA +1E91A; C; 1E93C; # ADLAM CAPITAL LETTER TU +1E91B; C; 1E93D; # ADLAM CAPITAL LETTER NHA +1E91C; C; 1E93E; # ADLAM CAPITAL LETTER VA +1E91D; C; 1E93F; # ADLAM CAPITAL LETTER KHA +1E91E; C; 1E940; # ADLAM CAPITAL LETTER GBE +1E91F; C; 1E941; # ADLAM CAPITAL LETTER ZAL +1E920; C; 1E942; # ADLAM CAPITAL LETTER KPO +1E921; C; 1E943; # ADLAM CAPITAL LETTER SHA +''' + + +def _parse_unichr(s): + s = int(s, 16) + try: + return compat_chr(s) + except ValueError: + # work around "unichr() arg not in range(0x10000) (narrow Python build)" + return ('\\U%08x' % s).decode('unicode-escape') + + +_map = dict( + (_parse_unichr(from_), ''.join(map(_parse_unichr, to_.split(' ')))) + for from_, type_, to_, _ in ( + l.split('; ', 3) for l in _map_str.splitlines() if l and not l[0] == '#') + if type_ in ('C', 'F')) +del _map_str + + +def casefold(s): + assert isinstance(s, compat_str) + return ''.join((_map.get(c, c) for c in s)) + + +__all__ = [ + 'casefold', +] diff --git a/script.module.youtube.dl/lib/youtube_dl/compat.py b/script.module.youtube.dl/lib/youtube_dl/compat.py index 6c3d49d45..3c526a78d 100644 --- a/script.module.youtube.dl/lib/youtube_dl/compat.py +++ b/script.module.youtube.dl/lib/youtube_dl/compat.py @@ -1,10 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +from __future__ import division import base64 import binascii import collections import ctypes +import datetime import email import getpass import io @@ -19,14 +21,57 @@ import struct import subprocess import sys +import types import xml.etree.ElementTree +# naming convention +# 'compat_' + Python3_name.replace('.', '_') +# other aliases exist for convenience and/or legacy + +# deal with critical unicode/str things first +try: + # Python 2 + compat_str, compat_basestring, compat_chr = ( + unicode, basestring, unichr + ) +except NameError: + compat_str, compat_basestring, compat_chr = ( + str, (str, bytes), chr + ) + +# casefold +try: + compat_str.casefold + compat_casefold = lambda s: 
s.casefold() +except AttributeError: + from .casefold import casefold as compat_casefold + +try: + import collections.abc as compat_collections_abc +except ImportError: + import collections as compat_collections_abc try: import urllib.request as compat_urllib_request except ImportError: # Python 2 import urllib2 as compat_urllib_request +# Also fix up lack of method arg in old Pythons +try: + _req = compat_urllib_request.Request + _req('http://127.0.0.1', method='GET') +except TypeError: + class _request(object): + def __new__(cls, url, *args, **kwargs): + method = kwargs.pop('method', None) + r = _req(url, *args, **kwargs) + if method: + r.get_method = types.MethodType(lambda _: method, r) + return r + + compat_urllib_request.Request = _request + + try: import urllib.error as compat_urllib_error except ImportError: # Python 2 @@ -36,26 +81,32 @@ import urllib.parse as compat_urllib_parse except ImportError: # Python 2 import urllib as compat_urllib_parse + import urlparse as _urlparse + for a in dir(_urlparse): + if not hasattr(compat_urllib_parse, a): + setattr(compat_urllib_parse, a, getattr(_urlparse, a)) + del _urlparse -try: - from urllib.parse import urlparse as compat_urllib_parse_urlparse -except ImportError: # Python 2 - from urlparse import urlparse as compat_urllib_parse_urlparse - -try: - import urllib.parse as compat_urlparse -except ImportError: # Python 2 - import urlparse as compat_urlparse +# unfavoured aliases +compat_urlparse = compat_urllib_parse +compat_urllib_parse_urlparse = compat_urllib_parse.urlparse try: import urllib.response as compat_urllib_response except ImportError: # Python 2 import urllib as compat_urllib_response +try: + compat_urllib_response.addinfourl.status +except AttributeError: + # .getcode() is deprecated in Py 3. 
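+ # addinfourl gained a native .status property only in Python 3.9, so emulate it for older versions: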
+ compat_urllib_response.addinfourl.status = property(lambda self: self.getcode()) + try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 import cookielib as compat_cookiejar +compat_http_cookiejar = compat_cookiejar if sys.version_info[0] == 2: class compat_cookiejar_Cookie(compat_cookiejar.Cookie): @@ -67,11 +118,35 @@ def __init__(self, version, name, value, *args, **kwargs): compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs) else: compat_cookiejar_Cookie = compat_cookiejar.Cookie +compat_http_cookiejar_Cookie = compat_cookiejar_Cookie try: import http.cookies as compat_cookies except ImportError: # Python 2 import Cookie as compat_cookies +compat_http_cookies = compat_cookies + +if sys.version_info[0] == 2 or sys.version_info < (3, 3): + class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie): + def load(self, rawdata): + must_have_value = 0 + if not isinstance(rawdata, dict): + if sys.version_info[:2] != (2, 7) or sys.platform.startswith('java'): + # attribute must have value for parsing + rawdata, must_have_value = re.subn( + r'(?i)(;\s*)(secure|httponly)(\s*(?:;|$))', r'\1\2=\2\3', rawdata) + if sys.version_info[0] == 2: + if isinstance(rawdata, compat_str): + rawdata = str(rawdata) + super(compat_cookies_SimpleCookie, self).load(rawdata) + if must_have_value > 0: + for morsel in self.values(): + for attr in ('secure', 'httponly'): + if morsel.get(attr): + morsel[attr] = True +else: + compat_cookies_SimpleCookie = compat_cookies.SimpleCookie +compat_http_cookies_SimpleCookie = compat_cookies_SimpleCookie try: import html.entities as compat_html_entities @@ -2320,21 +2395,29 @@ def __init__(self, version, name, value, *args, **kwargs): import http.client as compat_http_client except ImportError: # Python 2 import httplib as compat_http_client +try: + compat_http_client.HTTPResponse.getcode +except AttributeError: + # Py < 3.1 + compat_http_client.HTTPResponse.getcode = lambda self: self.status try: from urllib.error import HTTPError as compat_HTTPError except ImportError: # Python 2 from urllib2 import HTTPError as compat_HTTPError +compat_urllib_HTTPError = compat_HTTPError try: from urllib.request import urlretrieve as compat_urlretrieve except ImportError: # Python 2 from urllib import urlretrieve as compat_urlretrieve +compat_urllib_request_urlretrieve = compat_urlretrieve try: from html.parser import HTMLParser as compat_HTMLParser except ImportError: # Python 2 from HTMLParser import HTMLParser as compat_HTMLParser +compat_html_parser_HTMLParser = compat_HTMLParser try: # Python 2 from HTMLParser import HTMLParseError as compat_HTMLParseError @@ -2348,6 +2431,7 @@ def __init__(self, version, name, value, *args, **kwargs): # and uniform cross-version exception handling class compat_HTMLParseError(Exception): pass +compat_html_parser_HTMLParseError = compat_HTMLParseError try: from subprocess import DEVNULL @@ -2360,15 +2444,12 @@ class compat_HTMLParseError(Exception): except ImportError: import BaseHTTPServer as compat_http_server -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str - try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus + from urllib.parse import urlencode as compat_urllib_parse_urlencode + from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 _asciire = 
(compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') else re.compile(r'([\x00-\x7f]+)')) @@ -2435,9 +2516,6 @@ def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace') string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) -try: - from urllib.parse import urlencode as compat_urllib_parse_urlencode -except ImportError: # Python 2 # Python 2 will choke in urlencode on mixture of byte and unicode strings. # Possible solutions are to either port it from python 3 with all # the friends or manually ensure input query contains only byte strings. @@ -2459,7 +2537,62 @@ def encode_dict(d): def encode_list(l): return [encode_elem(e) for e in l] - return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq) + return compat_urllib_parse._urlencode(encode_elem(query), doseq=doseq) + + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + qs, _coerce_result = qs, compat_str + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError('bad query field: %r' % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = compat_urllib_parse_unquote( + name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = compat_urllib_parse_unquote( + value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + + def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + parsed_result = {} + pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result + + setattr(compat_urllib_parse, '_urlencode', + getattr(compat_urllib_parse, 'urlencode')) + for name, fix in ( + ('unquote_to_bytes', compat_urllib_parse_unquote_to_bytes), + ('parse_unquote', compat_urllib_parse_unquote), + ('unquote_plus', compat_urllib_parse_unquote_plus), + ('urlencode', compat_urllib_parse_urlencode), + ('parse_qs', compat_parse_qs)): + setattr(compat_urllib_parse, name, fix) + +compat_urllib_parse_parse_qs = compat_parse_qs try: from urllib.request import DataHandler as compat_urllib_request_DataHandler @@ -2495,21 +2628,11 @@ def data_open(self, req): return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) -try: - compat_basestring = basestring # Python 2 -except NameError: - compat_basestring = str - -try: - compat_chr = unichr # Python 2 -except NameError: - compat_chr = chr - try: from xml.etree.ElementTree import ParseError as compat_xml_parse_error except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error - +compat_xml_etree_ElementTree_ParseError = compat_xml_parse_error etree = xml.etree.ElementTree @@ -2523,10 +2646,11 @@ def doctype(self, name, pubid, system): # xml.etree.ElementTree.Element is a method in Python <=2.6 and # the 
following will crash with: # TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types - isinstance(None, xml.etree.ElementTree.Element) + isinstance(None, etree.Element) from xml.etree.ElementTree import Element as compat_etree_Element except TypeError: # Python <=2.6 from xml.etree.ElementTree import _ElementInterface as compat_etree_Element +compat_xml_etree_ElementTree_Element = compat_etree_Element if sys.version_info[0] >= 3: def compat_etree_fromstring(text): @@ -2582,6 +2706,7 @@ def compat_etree_register_namespace(prefix, uri): if k == uri or v == prefix: del etree._namespace_map[k] etree._namespace_map[uri] = prefix +compat_xml_etree_register_namespace = compat_etree_register_namespace if sys.version_info < (2, 7): # Here comes the crazy part: In 2.6, if the xpath is a unicode, @@ -2593,53 +2718,6 @@ def compat_xpath(xpath): else: compat_xpath = lambda xpath: xpath -try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, compat_str - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError('bad query field: %r' % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = compat_urllib_parse_unquote( - name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = compat_urllib_parse_unquote( - value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result - compat_os_name = os._name if os.name == 'java' else os.name @@ -2764,6 +2842,8 @@ def compat_expanduser(path): else: compat_expanduser = os.path.expanduser +compat_os_path_expanduser = compat_expanduser + if compat_os_name == 'nt' and sys.version_info < (3, 8): # os.path.realpath on Windows does not follow symbolic links @@ -2775,6 +2855,8 @@ def compat_realpath(path): else: compat_realpath = os.path.realpath +compat_os_path_realpath = compat_realpath + if sys.version_info < (3, 0): def compat_print(s): @@ -2795,11 +2877,15 @@ def compat_getpass(prompt, *args, **kwargs): else: compat_getpass = getpass.getpass +compat_getpass_getpass = compat_getpass + + try: compat_input = raw_input except NameError: # Python 3 compat_input = input + # Python < 2.6.5 require kwargs to be bytes try: def _testfunc(x): @@ -2877,6 +2963,7 @@ def _compat_add_option(self, *args, **kwargs): _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) def compat_get_terminal_size(fallback=(80, 24)): + from .utils import process_communicate_or_kill columns = compat_getenv('COLUMNS') if columns: columns 
= int(columns) @@ -2893,7 +2980,7 @@ def compat_get_terminal_size(fallback=(80, 24)): sp = subprocess.Popen( ['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = sp.communicate() + out, err = process_communicate_or_kill(sp) _lines, _columns = map(int, out.split()) except Exception: _columns, _lines = _terminal_size(*fallback) @@ -2904,15 +2991,16 @@ def compat_get_terminal_size(fallback=(80, 24)): lines = _lines return _terminal_size(columns, lines) + try: itertools.count(start=0, step=1) compat_itertools_count = itertools.count except TypeError: # Python 2.6 def compat_itertools_count(start=0, step=1): - n = start while True: - yield n - n += step + yield start + start += step + if sys.version_info >= (3, 0): from tokenize import tokenize as compat_tokenize_tokenize @@ -2953,6 +3041,24 @@ def unpack(self, string): compat_Struct = struct.Struct +# compat_map/filter() return an iterator, with the +# same version handling as for zip below +try: + from future_builtins import map as compat_map +except ImportError: + try: + from itertools import imap as compat_map + except ImportError: + compat_map = map + +try: + from future_builtins import filter as compat_filter +except ImportError: + try: + from itertools import ifilter as compat_filter + except ImportError: + compat_filter = filter + try: from future_builtins import zip as compat_zip except ImportError: # not 2.6+ or is 3.x @@ -2962,6 +3068,82 @@ def unpack(self, string): compat_zip = zip + +# method renamed between Py2/3 +try: + from itertools import zip_longest as compat_itertools_zip_longest +except ImportError: + from itertools import izip_longest as compat_itertools_zip_longest + + +# new class in collections +try: + from collections import ChainMap as compat_collections_chain_map + # Py3.3's ChainMap is deficient + if sys.version_info < (3, 4): + raise ImportError +except ImportError: + # Py <= 3.3 + class compat_collections_chain_map(compat_collections_abc.MutableMapping): + + maps = [{}] + + def __init__(self, *maps): + self.maps = list(maps) or [{}] + + def __getitem__(self, k): + for m in self.maps: + if k in m: + return m[k] + raise KeyError(k) + + def __setitem__(self, k, v): + self.maps[0].__setitem__(k, v) + return + + def __contains__(self, k): + return any((k in m) for m in self.maps) + + def __delitem(self, k): + if k in self.maps[0]: + del self.maps[0][k] + return + raise KeyError(k) + + def __delitem__(self, k): + self.__delitem(k) + + def __iter__(self): + return itertools.chain(*reversed(self.maps)) + + def __len__(self): + # len() of a bare iterator raises TypeError; count distinct keys, as Py3's ChainMap does + return len(set(iter(self))) + + # to match Py3, don't del directly + def pop(self, k, *args): + if self.__contains__(k): + off = self.__getitem__(k) + self.__delitem(k) + return off + elif len(args) > 0: + return args[0] + raise KeyError(k) + + def new_child(self, m=None, **kwargs): + m = m or {} + m.update(kwargs) + return compat_collections_chain_map(m, *self.maps) + + @property + def parents(self): + return compat_collections_chain_map(*(self.maps[1:])) + + +# Pythons disagree on the type of a pattern (RegexObject, _sre.SRE_Pattern, Pattern, ...?) 
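+# (re.Pattern and re.Match became public names only in Python 3.8, so the types are captured via type() below)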
+compat_re_Pattern = type(re.compile('')) +# and on the type of a match +compat_re_Match = type(re.match('a', 'a')) + + if sys.version_info < (3, 3): def compat_b64decode(s, *args, **kwargs): if isinstance(s, compat_str): @@ -2970,6 +3152,8 @@ def compat_b64decode(s, *args, **kwargs): else: compat_b64decode = base64.b64decode +compat_base64_b64decode = compat_b64decode + if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0): # PyPy2 prior to version 5.4.0 expects byte strings as Windows function @@ -2989,25 +3173,95 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs): return ctypes.WINFUNCTYPE(*args, **kwargs) -__all__ = [ +if sys.version_info < (3, 0): + # open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True); the Py3-only 'opener' argument is not supported + def compat_open(file_, *args, **kwargs): + if len(args) > 6 or 'opener' in kwargs: + raise ValueError('open: unsupported argument "opener"') + return io.open(file_, *args, **kwargs) +else: + compat_open = open + + +# compat_register_utf8 +def compat_register_utf8(): + if sys.platform == 'win32': + # https://github.com/ytdl-org/youtube-dl/issues/820 + from codecs import register, lookup + register( + lambda name: lookup('utf-8') if name == 'cp65001' else None) + + +# compat_datetime_timedelta_total_seconds +try: + compat_datetime_timedelta_total_seconds = datetime.timedelta.total_seconds +except AttributeError: + # Py 2.6 + def compat_datetime_timedelta_total_seconds(td): + return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6 + +# optional decompression packages +# PyPI brotli package implements 'br' Content-Encoding +try: + import brotli as compat_brotli +except ImportError: + compat_brotli = None +# PyPI ncompress package implements 'compress' Content-Encoding +try: + import ncompress as compat_ncompress +except ImportError: + compat_ncompress = None + + +legacy = [ 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', - 'compat_Struct', 'compat_b64decode', - 'compat_basestring', - 'compat_chr', 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', - 'compat_ctypes_WINFUNCTYPE', + 'compat_cookies_SimpleCookie', 'compat_etree_Element', - 'compat_etree_fromstring', 'compat_etree_register_namespace', 'compat_expanduser', + 'compat_getpass', + 'compat_parse_qs', + 'compat_realpath', + 'compat_urllib_parse_parse_qs', + 'compat_urllib_parse_unquote', + 'compat_urllib_parse_unquote_plus', + 'compat_urllib_parse_unquote_to_bytes', + 'compat_urllib_parse_urlencode', + 'compat_urllib_parse_urlparse', + 'compat_urlparse', + 'compat_urlretrieve', + 'compat_xml_parse_error', +] + + +__all__ = [ + 'compat_html_parser_HTMLParseError', + 'compat_html_parser_HTMLParser', + 'compat_Struct', + 'compat_base64_b64decode', + 'compat_basestring', + 'compat_brotli', + 'compat_casefold', + 'compat_chr', + 'compat_collections_abc', + 'compat_collections_chain_map', + 'compat_datetime_timedelta_total_seconds', + 'compat_http_cookiejar', + 'compat_http_cookiejar_Cookie', + 'compat_http_cookies', + 'compat_http_cookies_SimpleCookie', + 'compat_ctypes_WINFUNCTYPE', + 'compat_etree_fromstring', + 'compat_filter', 'compat_get_terminal_size', 'compat_getenv', - 'compat_getpass', + 'compat_getpass_getpass', 'compat_html_entities', 'compat_html_entities_html5', 'compat_http_client', @@ -3015,13 +3269,20 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs): 'compat_input', 'compat_integer_types', 'compat_itertools_count', + 'compat_itertools_zip_longest', 'compat_kwargs', + 'compat_map', + 
'compat_ncompress', 'compat_numeric_types', + 'compat_open', 'compat_ord', 'compat_os_name', - 'compat_parse_qs', + 'compat_os_path_expanduser', + 'compat_os_path_realpath', 'compat_print', - 'compat_realpath', + 'compat_re_Match', + 'compat_re_Pattern', + 'compat_register_utf8', 'compat_setenv', 'compat_shlex_quote', 'compat_shlex_split', @@ -3033,17 +3294,14 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs): 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', - 'compat_urllib_parse_unquote', - 'compat_urllib_parse_unquote_plus', - 'compat_urllib_parse_unquote_to_bytes', - 'compat_urllib_parse_urlencode', - 'compat_urllib_parse_urlparse', 'compat_urllib_request', 'compat_urllib_request_DataHandler', 'compat_urllib_response', - 'compat_urlparse', - 'compat_urlretrieve', - 'compat_xml_parse_error', + 'compat_urllib_request_urlretrieve', + 'compat_urllib_HTTPError', + 'compat_xml_etree_ElementTree_Element', + 'compat_xml_etree_ElementTree_ParseError', + 'compat_xml_etree_register_namespace', 'compat_xpath', 'compat_zip', 'workaround_optparse_bug9161', diff --git a/script.module.youtube.dl/lib/youtube_dl/downloader/__init__.py b/script.module.youtube.dl/lib/youtube_dl/downloader/__init__.py index 2e485df9d..d701d6292 100644 --- a/script.module.youtube.dl/lib/youtube_dl/downloader/__init__.py +++ b/script.module.youtube.dl/lib/youtube_dl/downloader/__init__.py @@ -1,22 +1,31 @@ from __future__ import unicode_literals +from ..utils import ( + determine_protocol, +) + + +def get_suitable_downloader(info_dict, params={}): + info_dict['protocol'] = determine_protocol(info_dict) + info_copy = info_dict.copy() + return _get_suitable_downloader(info_copy, params) + + +# Some of these require get_suitable_downloader from .common import FileDownloader +from .dash import DashSegmentsFD from .f4m import F4mFD from .hls import HlsFD from .http import HttpFD from .rtmp import RtmpFD -from .dash import DashSegmentsFD from .rtsp import RtspFD from .ism import IsmFD +from .niconico import NiconicoDmcFD from .external import ( get_external_downloader, FFmpegFD, ) -from ..utils import ( - determine_protocol, -) - PROTOCOL_MAP = { 'rtmp': RtmpFD, 'm3u8_native': HlsFD, @@ -26,13 +35,12 @@ 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, 'ism': IsmFD, + 'niconico_dmc': NiconicoDmcFD, } -def get_suitable_downloader(info_dict, params={}): +def _get_suitable_downloader(info_dict, params={}): """Get the downloader class that can handle the info dict.""" - protocol = determine_protocol(info_dict) - info_dict['protocol'] = protocol # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): # return FFmpegFD @@ -42,7 +50,11 @@ def get_suitable_downloader(info_dict, params={}): ed = get_external_downloader(external_downloader) if ed.can_download(info_dict): return ed + # Avoid using unwanted args since external_downloader was rejected + if params.get('external_downloader_args'): + params['external_downloader_args'] = None + protocol = info_dict['protocol'] if protocol.startswith('m3u8') and info_dict.get('is_live'): return FFmpegFD diff --git a/script.module.youtube.dl/lib/youtube_dl/downloader/common.py b/script.module.youtube.dl/lib/youtube_dl/downloader/common.py index 1cdba89cd..91e691776 100644 --- a/script.module.youtube.dl/lib/youtube_dl/downloader/common.py +++ b/script.module.youtube.dl/lib/youtube_dl/downloader/common.py @@ -88,17 +88,21 @@ def format_percent(percent): return '---.-%' return '%6s' % 
('%3.1f%%' % percent) - @staticmethod - def calc_eta(start, now, total, current): + @classmethod + def calc_eta(cls, start_or_rate, now_or_remaining, *args): + if len(args) < 2: + rate, remaining = (start_or_rate, now_or_remaining) + if None in (rate, remaining): + return None + return int(float(remaining) / rate) + start, now = (start_or_rate, now_or_remaining) + total, current = args[:2] if total is None: return None if now is None: now = time.time() - dif = now - start - if current == 0 or dif < 0.001: # One millisecond - return None - rate = float(current) / dif - return int((float(total) - float(current)) / rate) + rate = cls.calc_speed(start, now, current) + return rate and int((float(total) - float(current)) / rate) @staticmethod def format_eta(eta): @@ -123,6 +127,12 @@ def format_speed(speed): def format_retries(retries): return 'inf' if retries == float('inf') else '%.0f' % retries + @staticmethod + def filesize_or_none(unencoded_filename): + fn = encodeFilename(unencoded_filename) + if os.path.isfile(fn): + return os.path.getsize(fn) + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) @@ -329,6 +339,10 @@ def report_unable_to_resume(self): def download(self, filename, info_dict): """Download to a filename using the info from info_dict Return True on success and False otherwise + + This method filters the `Cookie` header from the info_dict to prevent leaks. + Downloaders have their own way of handling cookies. + See: https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-v8mc-9377-rwjj """ nooverwrites_and_exists = ( diff --git a/script.module.youtube.dl/lib/youtube_dl/downloader/dash.py b/script.module.youtube.dl/lib/youtube_dl/downloader/dash.py index c6d674bc6..2800d4260 100644 --- a/script.module.youtube.dl/lib/youtube_dl/downloader/dash.py +++ b/script.module.youtube.dl/lib/youtube_dl/downloader/dash.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import itertools + from .fragment import FragmentFD from ..compat import compat_urllib_error from ..utils import ( @@ -30,26 +32,23 @@ def real_download(self, filename, info_dict): fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - frag_index = 0 - for i, fragment in enumerate(fragments): - frag_index += 1 + for frag_index, fragment in enumerate(fragments, 1): if frag_index <= ctx['fragment_index']: continue # In DASH, the first segment contains necessary headers to # generate a valid MP4 file, so always abort for the first segment - fatal = i == 0 or not skip_unavailable_fragments - count = 0 - while count <= fragment_retries: + fatal = frag_index == 1 or not skip_unavailable_fragments + fragment_url = fragment.get('url') + if not fragment_url: + assert fragment_base_url + fragment_url = urljoin(fragment_base_url, fragment['path']) + success = False + for count in itertools.count(): try: - fragment_url = fragment.get('url') - if not fragment_url: - assert fragment_base_url - fragment_url = urljoin(fragment_base_url, fragment['path']) success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) if not success: return False self._append_fragment(ctx, frag_content) - break except compat_urllib_error.HTTPError as err: # YouTube may often return 404 HTTP error for a fragment causing the # whole download to fail. 
However if the same fragment is immediately @@ -57,22 +56,21 @@ def real_download(self, filename, info_dict): # is usually enough) thus allowing to download the whole file successfully. # To be future-proof we will retry all fragments that fail with any # HTTP error. - count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) + if count < fragment_retries: + self.report_retry_fragment(err, frag_index, count + 1, fragment_retries) + continue except DownloadError: # Don't retry fragment if error occurred during HTTP downloading - # itself since it has own retry settings - if not fatal: - self.report_skip_fragment(frag_index) - break - raise + # itself since it has its own retry settings + if fatal: + raise + break - if count > fragment_retries: + if not success: if not fatal: self.report_skip_fragment(frag_index) continue - self.report_error('giving up after %s fragment retries' % fragment_retries) + self.report_error('giving up after %s fragment retries' % count) return False self._finish_frag_download(ctx) diff --git a/script.module.youtube.dl/lib/youtube_dl/downloader/external.py b/script.module.youtube.dl/lib/youtube_dl/downloader/external.py index c31f8910a..7fc864e85 100644 --- a/script.module.youtube.dl/lib/youtube_dl/downloader/external.py +++ b/script.module.youtube.dl/lib/youtube_dl/downloader/external.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals -import os.path +import os import re import subprocess import sys +import tempfile import time from .common import FileDownloader @@ -22,6 +23,9 @@ handle_youtubedl_headers, check_executable, is_outdated_version, + process_communicate_or_kill, + T, + traverse_obj, ) @@ -29,6 +33,7 @@ class ExternalFD(FileDownloader): def real_download(self, filename, info_dict): self.report_destination(filename) tmpfilename = self.temp_name(filename) + self._cookies_tempfile = None try: started = time.time() @@ -41,6 +46,13 @@ def real_download(self, filename, info_dict): # should take place retval = 0 self.to_screen('[%s] Interrupted by user' % self.get_basename()) + finally: + if self._cookies_tempfile and os.path.isfile(self._cookies_tempfile): + try: + os.remove(self._cookies_tempfile) + except OSError: + self.report_warning( + 'Unable to delete temporary cookies file "{0}"'.format(self._cookies_tempfile)) if retval == 0: status = { @@ -96,6 +108,16 @@ def _valueless_option(self, command_option, param, expected_value=True): def _configuration_args(self, default=[]): return cli_configuration_args(self.params, 'external_downloader_args', default) + def _write_cookies(self): + if not self.ydl.cookiejar.filename: + tmp_cookies = tempfile.NamedTemporaryFile(suffix='.cookies', delete=False) + tmp_cookies.close() + self._cookies_tempfile = tmp_cookies.name + self.to_screen('[download] Writing temporary cookies file to "{0}"'.format(self._cookies_tempfile)) + # real_download resets _cookies_tempfile; if it's None, save() will write to cookiejar.filename + self.ydl.cookiejar.save(self._cookies_tempfile, ignore_discard=True, ignore_expires=True) + return self.ydl.cookiejar.filename or self._cookies_tempfile + def _call_downloader(self, tmpfilename, info_dict): """ Either overwrite this or implement _make_cmd """ cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] @@ -104,18 +126,26 @@ def _call_downloader(self, tmpfilename, info_dict): p = subprocess.Popen( cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate() + _, stderr = process_communicate_or_kill(p) if 
p.returncode != 0: self.to_stderr(stderr.decode('utf-8', 'replace')) return p.returncode + @staticmethod + def _header_items(info_dict): + return traverse_obj( + info_dict, ('http_headers', T(dict.items), Ellipsis)) + class CurlFD(ExternalFD): AVAILABLE_OPT = '-V' def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '--location', '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): + cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed'] + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += ['--cookie', cookie_header] + for key, val in self._header_items(info_dict): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') cmd += self._valueless_option('--silent', 'noprogress') @@ -141,7 +171,7 @@ def _call_downloader(self, tmpfilename, info_dict): # curl writes the progress to stderr so don't capture it. p = subprocess.Popen(cmd) - p.communicate() + process_communicate_or_kill(p) return p.returncode @@ -150,8 +180,11 @@ class AxelFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): + for key, val in self._header_items(info_dict): cmd += ['-H', '%s: %s' % (key, val)] + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += ['-H', 'Cookie: {0}'.format(cookie_header), '--max-redirect=0'] cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -161,8 +194,10 @@ class WgetFD(ExternalFD): AVAILABLE_OPT = '--version' def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] - for key, val in info_dict['http_headers'].items(): + cmd = [self.exe, '-O', tmpfilename, '-nv', '--compression=auto'] + if self.ydl.cookiejar.get_cookie_header(info_dict['url']): + cmd += ['--load-cookies', self._write_cookies()] + for key, val in self._header_items(info_dict): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._option('--limit-rate', 'ratelimit') retry = self._option('--tries', 'retries') @@ -181,24 +216,121 @@ def _make_cmd(self, tmpfilename, info_dict): class Aria2cFD(ExternalFD): AVAILABLE_OPT = '-v' + @staticmethod + def _aria2c_filename(fn): + return fn if os.path.isabs(fn) else os.path.join('.', fn) + def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-c'] - cmd += self._configuration_args([ - '--min-split-size', '1M', '--max-connection-per-server', '4']) - dn = os.path.dirname(tmpfilename) - if dn: - cmd += ['--dir', dn] - cmd += ['--out', os.path.basename(tmpfilename)] - for key, val in info_dict['http_headers'].items(): + cmd = [self.exe, '-c', + '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', + '--http-accept-gzip=true', '--file-allocation=none', '-x16', '-j16', '-s16'] + if 'fragments' in info_dict: + cmd += ['--allow-overwrite=true', '--allow-piece-length-change=true'] + else: + cmd += ['--min-split-size', '1M'] + + if self.ydl.cookiejar.get_cookie_header(info_dict['url']): + cmd += ['--load-cookies={0}'.format(self._write_cookies())] + for key, val in self._header_items(info_dict): cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._configuration_args(['--max-connection-per-server', '4']) + cmd += ['--out', os.path.basename(tmpfilename)] + cmd += self._option('--max-overall-download-limit', 'ratelimit') cmd += self._option('--interface', 'source_address') cmd += self._option('--all-proxy', 
'proxy') cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=') - cmd += ['--', info_dict['url']] + cmd += self._bool_option('--show-console-readout', 'noprogress', 'false', 'true', '=') + cmd += self._configuration_args() + + # aria2c strips out spaces from the beginning/end of filenames and paths. + # We work around this issue by adding a "./" to the beginning of the + # filename and relative path, and adding a "/" at the end of the path. + # See: https://github.com/yt-dlp/yt-dlp/issues/276 + # https://github.com/ytdl-org/youtube-dl/issues/20312 + # https://github.com/aria2/aria2/issues/1373 + dn = os.path.dirname(tmpfilename) + if dn: + cmd += ['--dir', self._aria2c_filename(dn) + os.path.sep] + if 'fragments' not in info_dict: + cmd += ['--out', self._aria2c_filename(os.path.basename(tmpfilename))] + cmd += ['--auto-file-renaming=false'] + if 'fragments' in info_dict: + cmd += ['--file-allocation=none', '--uri-selector=inorder'] + url_list_file = '%s.frag.urls' % (tmpfilename, ) + url_list = [] + for frag_index, fragment in enumerate(info_dict['fragments']): + fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index) + url_list.append('%s\n\tout=%s' % (fragment['url'], self._aria2c_filename(fragment_filename))) + stream, _ = self.sanitize_open(url_list_file, 'wb') + stream.write('\n'.join(url_list).encode()) + stream.close() + cmd += ['-i', self._aria2c_filename(url_list_file)] + else: + cmd += ['--', info_dict['url']] return cmd + +class Aria2pFD(ExternalFD): + ''' Aria2pFD class + This class supports using aria2p as the downloader. + (aria2p is a command-line tool and Python library for interacting with an aria2c daemon process + over JSON-RPC.) + It makes it easier to track download progress. + To use aria2p as the downloader, install both aria2c and aria2p; aria2p can be installed with pip. + Then run aria2c in the background with the --enable-rpc option. 
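+ A minimal setup sketch (illustrative only; 6800 is aria2c's default RPC port, matching the client settings below): + $ pip install aria2p + $ aria2c --enable-rpc --rpc-listen-port=6800 &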
+ ''' + try: + import aria2p + __avail = True + except ImportError: + __avail = False + + @classmethod + def available(cls): + return cls.__avail + + def _call_downloader(self, tmpfilename, info_dict): + aria2 = self.aria2p.API( + self.aria2p.Client( + host='http://localhost', + port=6800, + secret='' + ) + ) + + options = { + 'min-split-size': '1M', + 'max-connection-per-server': 4, + 'auto-file-renaming': 'false', + } + options['dir'] = os.path.dirname(tmpfilename) or os.path.abspath('.') + options['out'] = os.path.basename(tmpfilename) + if self.ydl.cookiejar.get_cookie_header(info_dict['url']): + options['load-cookies'] = self._write_cookies() + options['header'] = [] + for key, val in self._header_items(info_dict): + options['header'].append('{0}: {1}'.format(key, val)) + download = aria2.add_uris([info_dict['url']], options) + status = { + 'status': 'downloading', + 'tmpfilename': tmpfilename, + } + started = time.time() + while download.status in ['active', 'waiting']: + download = aria2.get_download(download.gid) + status.update({ + 'downloaded_bytes': download.completed_length, + 'total_bytes': download.total_length, + 'elapsed': time.time() - started, + 'eta': download.eta.total_seconds(), + 'speed': download.download_speed, + }) + self._hook_progress(status) + time.sleep(.5) + return download.status != 'complete' + + class HttpieFD(ExternalFD): @classmethod def available(cls): @@ -206,15 +338,23 @@ def available(cls): def _make_cmd(self, tmpfilename, info_dict): cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] - for key, val in info_dict['http_headers'].items(): + for key, val in self._header_items(info_dict): cmd += ['%s:%s' % (key, val)] + + # httpie 3.1.0+ removes the Cookie header on redirect, so this should be safe for now. [1] + # If we ever need cookie handling for redirects, we can export the cookiejar into a session. [2] + # 1: https://github.com/httpie/httpie/security/advisories/GHSA-9w4w-cpc8-h2fq + # 2: https://httpie.io/docs/cli/sessions + cookie_header = self.ydl.cookiejar.get_cookie_header(info_dict['url']) + if cookie_header: + cmd += ['Cookie:%s' % cookie_header] return cmd class FFmpegFD(ExternalFD): @classmethod def supports(cls, info_dict): - return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') + return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms', 'http_dash_segments') @classmethod def available(cls): @@ -253,7 +393,14 @@ def _call_downloader(self, tmpfilename, info_dict): # if end_time: # args += ['-t', compat_str(end_time - start_time)] - if info_dict['http_headers'] and re.match(r'^https?://', url): + cookies = self.ydl.cookiejar.get_cookies_for_url(url) + if cookies: + args.extend(['-cookies', ''.join( + '{0}={1}; path={2}; domain={3};\r\n'.format( + cookie.name, cookie.value, cookie.path, cookie.domain) + for cookie in cookies)]) + + if info_dict.get('http_headers') and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. 
headers = handle_youtubedl_headers(info_dict['http_headers']) @@ -336,14 +483,17 @@ def _call_downloader(self, tmpfilename, info_dict): proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) try: retval = proc.wait() - except KeyboardInterrupt: - # subprocces.run would send the SIGKILL signal to ffmpeg and the + except BaseException as e: + # subprocess.run would send the SIGKILL signal to ffmpeg and the # mp4 file couldn't be played, but if we ask ffmpeg to quit it # produces a file that is playable (this is mostly useful for live # streams). Note that Windows is not affected and produces playable # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). - if sys.platform != 'win32': - proc.communicate(b'q') + if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32': + process_communicate_or_kill(proc, b'q') + else: + proc.kill() + proc.wait() raise return retval diff --git a/script.module.youtube.dl/lib/youtube_dl/downloader/fragment.py b/script.module.youtube.dl/lib/youtube_dl/downloader/fragment.py index 35c76feba..913e91b64 100644 --- a/script.module.youtube.dl/lib/youtube_dl/downloader/fragment.py +++ b/script.module.youtube.dl/lib/youtube_dl/downloader/fragment.py @@ -71,7 +71,7 @@ def _prepare_and_start_frag_download(self, ctx): @staticmethod def __do_ytdl_file(ctx): - return not ctx['live'] and not ctx['tmpfilename'] == '-' + return ctx['live'] is not True and ctx['tmpfilename'] != '-' def _read_ytdl_file(self, ctx): assert 'ytdl_corrupt' not in ctx @@ -101,6 +101,13 @@ def _download_fragment(self, ctx, frag_url, info_dict, headers=None): 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), } + frag_resume_len = 0 + if ctx['dl'].params.get('continuedl', True): + frag_resume_len = self.filesize_or_none( + self.temp_name(fragment_filename)) + fragment_info_dict['frag_resume_len'] = frag_resume_len + ctx['frag_resume_len'] = frag_resume_len or 0 + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None @@ -124,9 +131,7 @@ def _append_fragment(self, ctx, frag_content): del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): - if 'live' not in ctx: - ctx['live'] = False - if not ctx['live']: + if not ctx.setdefault('live', False): total_frags_str = '%d' % ctx['total_frags'] ad_frags = ctx.get('ad_frags', 0) if ad_frags: @@ -136,10 +141,11 @@ def _prepare_frag_download(self, ctx): self.to_screen( '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) self.report_destination(ctx['filename']) + continuedl = self.params.get('continuedl', True) dl = HttpQuietDownloader( self.ydl, { - 'continuedl': True, + 'continuedl': continuedl, 'quiet': True, 'noprogress': True, 'ratelimit': self.params.get('ratelimit'), @@ -150,12 +156,11 @@ def _prepare_frag_download(self, ctx): ) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' - resume_len = 0 # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): + resume_len = self.filesize_or_none(tmpfilename) or 0 + if resume_len > 0: open_mode = 'ab' - resume_len = os.path.getsize(encodeFilename(tmpfilename)) # Should be initialized before ytdl file check ctx.update({ @@ -164,7 +169,8 @@ def _prepare_frag_download(self, ctx): }) if self.__do_ytdl_file(ctx): - if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): + ytdl_file_exists = os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))) + if continuedl and ytdl_file_exists: self._read_ytdl_file(ctx) is_corrupt = 
ctx.get('ytdl_corrupt') is True is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0 @@ -178,7 +184,12 @@ def _prepare_frag_download(self, ctx): if 'ytdl_corrupt' in ctx: del ctx['ytdl_corrupt'] self._write_ytdl_file(ctx) + else: + if not continuedl: + if ytdl_file_exists: + self._read_ytdl_file(ctx) + ctx['fragment_index'] = resume_len = 0 self._write_ytdl_file(ctx) assert ctx['fragment_index'] == 0 @@ -209,6 +220,7 @@ def _start_frag_download(self, ctx): start = time.time() ctx.update({ 'started': start, + 'fragment_started': start, # Amount of fragment's bytes downloaded by the time of the previous # frag progress hook invocation 'prev_frag_downloaded_bytes': 0, @@ -218,6 +230,9 @@ def frag_progress_hook(s): if s['status'] not in ('downloading', 'finished'): return + if not total_frags and ctx.get('fragment_count'): + state['fragment_count'] = ctx['fragment_count'] + time_now = time.time() state['elapsed'] = time_now - start frag_total_bytes = s.get('total_bytes') or 0 @@ -232,16 +247,17 @@ def frag_progress_hook(s): ctx['fragment_index'] = state['fragment_index'] state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_total_bytes) + ctx['fragment_started'] = time.time() ctx['prev_frag_downloaded_bytes'] = 0 else: frag_downloaded_bytes = s['downloaded_bytes'] state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx['frag_resume_len']) if not ctx['live']: - state['eta'] = self.calc_eta( - start, time_now, estimated_size - resume_len, - state['downloaded_bytes'] - resume_len) - state['speed'] = s.get('speed') or ctx.get('speed') - ctx['speed'] = state['speed'] + state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes']) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state) @@ -268,7 +284,7 @@ def _finish_frag_download(self, ctx): os.utime(ctx['filename'], (time.time(), filetime)) except Exception: pass - downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) + downloaded_bytes = self.filesize_or_none(ctx['filename']) or 0 self._hook_progress({ 'downloaded_bytes': downloaded_bytes, diff --git a/script.module.youtube.dl/lib/youtube_dl/downloader/http.py b/script.module.youtube.dl/lib/youtube_dl/downloader/http.py index d8ac41dcc..3cad87420 100644 --- a/script.module.youtube.dl/lib/youtube_dl/downloader/http.py +++ b/script.module.youtube.dl/lib/youtube_dl/downloader/http.py @@ -58,9 +58,9 @@ class DownloadContext(dict): if self.params.get('continuedl', True): # Establish possible resume length - if os.path.isfile(encodeFilename(ctx.tmpfilename)): - ctx.resume_len = os.path.getsize( - encodeFilename(ctx.tmpfilename)) + ctx.resume_len = info_dict.get('frag_resume_len') + if ctx.resume_len is None: + ctx.resume_len = self.filesize_or_none(ctx.tmpfilename) or 0 ctx.is_resume = ctx.resume_len > 0 @@ -115,9 +115,9 @@ def establish_connection(): raise RetryDownload(err) raise err # When trying to resume, Content-Range HTTP header of response has to be checked - # to match the value of requested Range HTTP header. This is due to a webservers + # to match the value of requested Range HTTP header. 
This is due to webservers # that don't support resuming and serve a whole file with no Content-Range - # set in response despite of requested Range (see + # set in response despite requested Range (see # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799) if has_range: content_range = ctx.data.headers.get('Content-Range') @@ -141,7 +141,8 @@ def establish_connection(): # Content-Range is either not present or invalid. Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload - self.report_unable_to_resume() + if range_start > 0: + self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) @@ -293,10 +294,7 @@ def retry(e): # Progress message speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) - if ctx.data_len is None: - eta = None - else: - eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) + eta = self.calc_eta(speed, ctx.data_len and (ctx.data_len - byte_counter)) self._hook_progress({ 'status': 'downloading', diff --git a/script.module.youtube.dl/lib/youtube_dl/downloader/niconico.py b/script.module.youtube.dl/lib/youtube_dl/downloader/niconico.py new file mode 100644 index 000000000..6392c9989 --- /dev/null +++ b/script.module.youtube.dl/lib/youtube_dl/downloader/niconico.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +try: + import threading +except ImportError: + threading = None + +from .common import FileDownloader +from ..downloader import get_suitable_downloader +from ..extractor.niconico import NiconicoIE +from ..utils import sanitized_Request + + +class NiconicoDmcFD(FileDownloader): + """ Downloading niconico douga from DMC with heartbeat """ + + FD_NAME = 'niconico_dmc' + + def real_download(self, filename, info_dict): + self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) + + ie = NiconicoIE(self.ydl) + info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) + + fd = get_suitable_downloader(info_dict, params=self.params)(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + + if not threading: + self.to_screen('[%s] Threading for Heartbeat not available' % self.FD_NAME) + return fd.real_download(filename, info_dict) + + success = download_complete = False + timer = [None] + heartbeat_lock = threading.Lock() + heartbeat_url = heartbeat_info_dict['url'] + heartbeat_data = heartbeat_info_dict['data'].encode() + heartbeat_interval = heartbeat_info_dict.get('interval', 30) + + request = sanitized_Request(heartbeat_url, heartbeat_data) + + def heartbeat(): + try: + self.ydl.urlopen(request).read() + except Exception: + self.to_screen('[%s] Heartbeat failed' % self.FD_NAME) + + with heartbeat_lock: + if not download_complete: + timer[0] = threading.Timer(heartbeat_interval, heartbeat) + timer[0].start() + + heartbeat_info_dict['ping']() + self.to_screen('[%s] Heartbeat with %d second interval ...' 
% (self.FD_NAME, heartbeat_interval)) + try: + heartbeat() + if type(fd).__name__ == 'HlsFD': + info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0]) + success = fd.real_download(filename, info_dict) + finally: + if heartbeat_lock: + with heartbeat_lock: + timer[0].cancel() + download_complete = True + return success diff --git a/script.module.youtube.dl/lib/youtube_dl/downloader/rtmp.py b/script.module.youtube.dl/lib/youtube_dl/downloader/rtmp.py index fbb7f51b0..8a25dbc8d 100644 --- a/script.module.youtube.dl/lib/youtube_dl/downloader/rtmp.py +++ b/script.module.youtube.dl/lib/youtube_dl/downloader/rtmp.py @@ -89,11 +89,13 @@ def run_rtmpdump(args): self.to_screen('') cursor_in_new_line = True self.to_screen('[rtmpdump] ' + line) - finally: + if not cursor_in_new_line: + self.to_screen('') + return proc.wait() + except BaseException: # Including KeyboardInterrupt + proc.kill() proc.wait() - if not cursor_in_new_line: - self.to_screen('') - return proc.returncode + raise url = info_dict['url'] player_url = info_dict.get('player_url') diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/adn.py b/script.module.youtube.dl/lib/youtube_dl/extractor/adn.py index a55ebbcbd..5ff419f19 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/adn.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/adn.py @@ -31,30 +31,34 @@ class ADNIE(InfoExtractor): - IE_DESC = 'Anime Digital Network' - _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P\d+)' - _TEST = { - 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'md5': '0319c99885ff5547565cacb4f3f9348d', + IE_DESC = 'Animation Digital Network' + _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.fr/video/[^/]+/(?P\d+)' + _TESTS = [{ + 'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir', + 'md5': '1c9ef066ceb302c86f80c2b371615261', 'info_dict': { - 'id': '7778', + 'id': '9841', 'ext': 'mp4', - 'title': 'Blue Exorcist - Kyôto Saga - Episode 1', - 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', - 'series': 'Blue Exorcist - Kyôto Saga', - 'duration': 1467, - 'release_date': '20170106', + 'title': 'Fruits Basket - Episode 1', + 'description': 'md5:14be2f72c3c96809b0ca424b0097d336', + 'series': 'Fruits Basket', + 'duration': 1437, + 'release_date': '20190405', 'comment_count': int, 'average_rating': float, - 'season_number': 2, - 'episode': 'Début des hostilités', + 'season_number': 1, + 'episode': 'À ce soir !', 'episode_number': 1, - } - } + }, + 'skip': 'Only available in region (FR, ...)', + }, { + 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', + 'only_matching': True, + }] - _NETRC_MACHINE = 'animedigitalnetwork' - _BASE_URL = 'http://animedigitalnetwork.fr' - _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' + _NETRC_MACHINE = 'animationdigitalnetwork' + _BASE = 'animationdigitalnetwork.fr' + _API_BASE_URL = 'https://gw.api.' 
+ _BASE + '/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' _HEADERS = {} _LOGIN_ERR_MESSAGE = 'Unable to log in' @@ -82,14 +86,14 @@ def _get_subtitles(self, sub_url, video_id): if subtitle_location: enc_subtitles = self._download_webpage( subtitle_location, video_id, 'Downloading subtitles data', - fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'}) + fatal=False, headers={'Origin': 'https://' + self._BASE}) if not enc_subtitles: return None - # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js + # http://animationdigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')), + bytes_to_intlist(binascii.unhexlify(self._K + '7fac1178830cfe0c')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -138,9 +142,9 @@ def _real_initialize(self): if not username: return try: + url = self._API_BASE_URL + 'authentication/login' access_token = (self._download_json( - self._API_BASE_URL + 'authentication/login', None, - 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, + url, None, 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, data=urlencode_postdata({ 'password': password, 'rememberMe': False, @@ -153,7 +157,8 @@ def _real_initialize(self): message = None if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: resp = self._parse_json( - e.cause.read().decode(), None, fatal=False) or {} + self._webpage_read_content(e.cause, url, username), + username, fatal=False) or {} message = resp.get('message') or resp.get('code') self.report_warning(message or self._LOGIN_ERR_MESSAGE) @@ -211,7 +216,9 @@ def _real_extract(self, url): # This usually goes away with a different random pkcs1pad, so retry continue - error = self._parse_json(e.cause.read(), video_id) + error = self._parse_json( + self._webpage_read_content(e.cause, links_url, video_id), + video_id, fatal=False) or {} message = error.get('message') if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/aenetworks.py b/script.module.youtube.dl/lib/youtube_dl/extractor/aenetworks.py index e55c03fd7..59fbe048a 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/aenetworks.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/aenetworks.py @@ -8,6 +8,8 @@ ExtractorError, GeoRestrictedError, int_or_none, + remove_start, + traverse_obj, update_url_query, urlencode_postdata, ) @@ -20,8 +22,8 @@ class AENetworksBaseIE(ThePlatformIE): (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| fyi\.tv )/''' - _THEPLATFORM_KEY = 'crazyjava' - _THEPLATFORM_SECRET = 's3cr3t' + _THEPLATFORM_KEY = '43jXaGRQud' + _THEPLATFORM_SECRET = 'S10BPXHMlb' _DOMAIN_MAP = { 'history.com': ('HISTORY', 'history'), 'aetv.com': ('AETV', 'aetv'), @@ -33,14 +35,17 @@ class AENetworksBaseIE(ThePlatformIE): } def _extract_aen_smil(self, smil_url, video_id, auth=None): - query = {'mbr': 'true'} + query = { + 'mbr': 'true', + 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', + } if auth: query['auth'] = auth TP_SMIL_QUERY = [{ 'assetTypes': 'high_video_ak', - 'switch': 'hls_high_ak' + 'switch': 'hls_high_ak', }, { - 'assetTypes': 'high_video_s3' + 'assetTypes': 'high_video_s3', }, { 'assetTypes': 'high_video_s3', 'switch': 'hls_high_fastly', @@ -75,7 +80,14 @@ def 
_extract_aetn_info(self, domain, filter_key, filter_value, url): requestor_id, brand = self._DOMAIN_MAP[domain] result = self._download_json( 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, - filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + filter_value, query={'filter[%s]' % filter_key: filter_value}) + result = traverse_obj( + result, ('results', + lambda k, v: k == 0 and v[filter_key] == filter_value), + get_all=False) + if not result: + raise ExtractorError('Show not found in A&E feed (too new?)', expected=True, + video_id=remove_start(filter_value, '/')) title = result['title'] video_id = result['id'] media_url = result['publicUrl'] @@ -126,7 +138,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - 'skip': 'This video is only available for users of participating TV providers.', + 'skip': 'Geo-restricted - This content is not available in your location.' }, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'info_dict': { @@ -143,6 +155,7 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/aliexpress.py b/script.module.youtube.dl/lib/youtube_dl/extractor/aliexpress.py index 6f241e683..9722fe9ac 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/aliexpress.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/aliexpress.py @@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor): 'id': '2800002704436634', 'ext': 'mp4', 'title': 'CASIMA7.22', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'uploader': 'CASIMA Official Store', 'timestamp': 1500717600, 'upload_date': '20170722', diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/alsace20tv.py b/script.module.youtube.dl/lib/youtube_dl/extractor/alsace20tv.py new file mode 100644 index 000000000..228cec3ec --- /dev/null +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/alsace20tv.py @@ -0,0 +1,89 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + get_element_by_class, + int_or_none, + unified_strdate, + url_or_none, +) + + +class Alsace20TVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'duration': 1073, + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _extract_video(self, video_id, url=None): + info = self._download_json( + 'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ), + video_id) or {} + title = info['titre'] + + formats = [] + for res, fmt_url in (info.get('files') or {}).items(): + formats.extend( + self._extract_smil_formats(fmt_url, video_id, fatal=False) + if '/smil:_' in fmt_url + else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False)) + self._sort_formats(formats) + + 
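# --- editor's illustrative aside (not part of the patch) ---
# traverse_obj with a callable path element, as used in the aenetworks hunk
# earlier in this patch, filters list items by (index, value); with
# get_all=False it returns the first hit or None. Standalone sketch (the
# data values are made up; the import mirrors the aenetworks hunk):
from youtube_dl.utils import traverse_obj

data = {'results': [{'canonical': '/shows/a'}, {'canonical': '/shows/b'}]}
hit = traverse_obj(
    data, ('results', lambda _, v: v['canonical'] == '/shows/b'),
    get_all=False)
assert hit == {'canonical': '/shows/b'}
# --- end of aside ---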
webpage = (url and self._download_webpage(url, video_id, fatal=False)) or '' + thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage)) + upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None) + upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': clean_html(get_element_by_class('wysiwyg', webpage)), + 'upload_date': upload_date, + 'thumbnail': thumbnail, + 'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None), + 'view_count': int_or_none(info.get('nb_vues')), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id, url) + + +class Alsace20TVEmbedIE(Alsace20TVIE): + _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id) diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/americastestkitchen.py b/script.module.youtube.dl/lib/youtube_dl/extractor/americastestkitchen.py index be960c0f9..08d3604e9 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/americastestkitchen.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/americastestkitchen.py @@ -15,7 +15,7 @@ class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?Pepisode|videos)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?:cooks(?:country|illustrated)/)?(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', @@ -23,15 +23,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5b400b9ee338f922cb06450c', 'title': 'Japanese Suppers', 'ext': 'mp4', + 'display_id': 'weeknight-japanese-suppers', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', - 'thumbnail': r're:^https?://', - 'timestamp': 1523318400, - 'upload_date': '20180410', - 'release_date': '20180410', + 'timestamp': 1523304000, + 'upload_date': '20180409', + 'release_date': '20180409', 'series': "America's Test Kitchen", + 'season': 'Season 18', 'season_number': 18, 'episode': 'Japanese Suppers', 'episode_number': 15, + 'duration': 1376, + 'thumbnail': r're:^https?://', + 'average_rating': 0, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -44,15 +49,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5fbe8c61bda2010001c6763b', 'title': 'Simple Chicken Dinner', 'ext': 'mp4', + 'display_id': 'atktv_2103_simple-chicken-dinner_full-episode_web-mp4', 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', - 'thumbnail': r're:^https?://', - 'timestamp': 1610755200, - 'upload_date': '20210116', - 'release_date': '20210116', + 'timestamp': 1610737200, + 'upload_date': '20210115', + 'release_date': '20210115', 'series': "America's Test Kitchen", + 'season': 'Season 21', 'season_number': 21, 'episode': 'Simple 
Chicken Dinner', 'episode_number': 3, + 'duration': 1397, + 'thumbnail': r're:^https?://', + 'view_count': int, + 'average_rating': 0, }, 'params': { 'skip_download': True, @@ -60,6 +70,12 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cookscountry/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', + 'only_matching': True, }, { 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', 'only_matching': True, @@ -94,7 +110,7 @@ def _real_extract(self, url): class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|cookscountry)\.com/episodes/browse/season_(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|(?Pcooks(?:country|illustrated)))\.com(?:(?:/(?Pcooks(?:country|illustrated)))?(?:/?$|(?\d+)))' _TESTS = [{ # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', @@ -105,48 +121,93 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): 'playlist_count': 13, }, { # Cooks Country Season - 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'url': 'https://www.americastestkitchen.com/cookscountry/episodes/browse/season_12', 'info_dict': { 'id': 'season_12', 'title': 'Season 12', }, 'playlist_count': 13, + }, { + # America's Test Kitchen Series + 'url': 'https://www.americastestkitchen.com/', + 'info_dict': { + 'id': 'americastestkitchen', + 'title': 'America\'s Test Kitchen', + }, + 'playlist_count': 558, + }, { + # Cooks Country Series + 'url': 'https://www.americastestkitchen.com/cookscountry', + 'info_dict': { + 'id': 'cookscountry', + 'title': 'Cook\'s Country', + }, + 'playlist_count': 199, + }, { + 'url': 'https://www.americastestkitchen.com/cookscountry/', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com', + 'only_matching': True, + }, { + 'url': 'https://www.americastestkitchen.com/cooksillustrated/', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com', + 'only_matching': True, }] def _real_extract(self, url): - show_name, season_number = re.match(self._VALID_URL, url).groups() - season_number = int(season_number) + match = re.match(self._VALID_URL, url).groupdict() + show = match.get('show2') + show_path = ('/' + show) if show else '' + show = show or match['show'] + season_number = int_or_none(match.get('season')) + + slug, title = { + 'americastestkitchen': ('atk', 'America\'s Test Kitchen'), + 'cookscountry': ('cco', 'Cook\'s Country'), + 'cooksillustrated': ('cio', 'Cook\'s Illustrated'), + }[show] - slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + facet_filters = [ + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ] - season = 'Season %d' % season_number + if season_number: + playlist_id = 'season_%d' % season_number + playlist_title = 'Season %d' % season_number + facet_filters.append('search_season_list:' + playlist_title) + else: + playlist_id = show + playlist_title = title season_search = self._download_json( 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, - season, headers={ - 'Origin': 'https://www.%s.com' % show_name, + playlist_id, headers={ + 'Origin': 
'https://www.americastestkitchen.com', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ - 'facetFilters': json.dumps([ - 'search_season_list:' + season, - 'search_document_klass:episode', - 'search_show_slug:' + slug, - ]), - 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'facetFilters': json.dumps(facet_filters), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title,search_atk_episode_season' % slug, 'attributesToHighlight': '', 'hitsPerPage': 1000, }) def entries(): for episode in (season_search.get('hits') or []): - search_url = episode.get('search_url') + search_url = episode.get('search_url') # always formatted like '/episode/123-title-of-episode' if not search_url: continue yield { '_type': 'url', - 'url': 'https://www.%s.com%s' % (show_name, search_url), - 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'url': 'https://www.americastestkitchen.com%s%s' % (show_path, search_url), + 'id': try_get(episode, lambda e: e['objectID'].rsplit('_', 1)[-1]), 'title': episode.get('title'), 'description': episode.get('description'), 'timestamp': unified_timestamp(episode.get('search_document_date')), @@ -156,4 +217,4 @@ def entries(): } return self.playlist_result( - entries(), 'season_%d' % season_number, season) + entries(), playlist_id, playlist_title) diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/appleconnect.py b/script.module.youtube.dl/lib/youtube_dl/extractor/appleconnect.py index a84b8b1eb..494f8330c 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/appleconnect.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/appleconnect.py @@ -9,10 +9,10 @@ class AppleConnectIE(InfoExtractor): - _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P[\w-]+)' - _TEST = { + _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/(?:id)?sa\.(?P[\w-]+)' + _TESTS = [{ 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', - 'md5': 'e7c38568a01ea45402570e6029206723', + 'md5': 'c1d41f72c8bcaf222e089434619316e4', 'info_dict': { 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', 'ext': 'm4v', @@ -22,7 +22,10 @@ class AppleConnectIE(InfoExtractor): 'upload_date': '20150710', 'timestamp': 1436545535, }, - } + }, { + 'url': 'https://itunes.apple.com/us/post/sa.0fe0229f-2457-11e5-9f40-1bb645f2d5d9', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -36,7 +39,7 @@ def _real_extract(self, url): video_data = self._parse_json(video_json, video_id) timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp')) - like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count')) + like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count', default=None)) return { 'id': video_id, diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/applepodcasts.py b/script.module.youtube.dl/lib/youtube_dl/extractor/applepodcasts.py index 95758fece..95e0f663c 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/applepodcasts.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/applepodcasts.py @@ -3,8 +3,11 @@ from .common import InfoExtractor from ..utils import ( + clean_html, clean_podcast_url, + get_element_by_class, int_or_none, + parse_codecs, parse_iso8601, try_get, ) @@ -14,16 +17,17 @@ class 
ApplePodcastsIE(InfoExtractor): _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P\d+)' _TESTS = [{ 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', - 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'md5': '41dc31cd650143e530d9423b6b5a344f', 'info_dict': { 'id': '1000482637777', 'ext': 'mp3', 'title': '207 - Whitney Webb Returns', - 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6', 'upload_date': '20200705', - 'timestamp': 1593921600, - 'duration': 6425, + 'timestamp': 1593932400, + 'duration': 6454, 'series': 'The Tim Dillon Show', + 'thumbnail': 're:.+[.](png|jpe?g|webp)', } }, { 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', @@ -39,18 +43,40 @@ class ApplePodcastsIE(InfoExtractor): def _real_extract(self, url): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) - ember_data = self._parse_json(self._search_regex( - r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', - webpage, 'ember data'), episode_id) - episode = ember_data['data']['attributes'] + episode_data = {} + ember_data = {} + # new page type 2021-11 + amp_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<', + webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {} + amp_data = try_get(amp_data, + lambda a: self._parse_json( + next(a[x] for x in iter(a) if episode_id in x), + episode_id), + dict) or {} + amp_data = amp_data.get('d') or [] + episode_data = try_get( + amp_data, + lambda a: next(x for x in a + if x['type'] == 'podcast-episodes' and x['id'] == episode_id), + dict) + if not episode_data: + # try pre 2021-11 page type: TODO: consider deleting if no longer used + ember_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) or {} + ember_data = ember_data.get(episode_id) or ember_data + episode_data = try_get(ember_data, lambda x: x['data'], dict) + episode = episode_data['attributes'] description = episode.get('description') or {} series = None - for inc in (ember_data.get('included') or []): + for inc in (amp_data or ember_data.get('included') or []): if inc.get('type') == 'media/podcast': series = try_get(inc, lambda x: x['attributes']['name']) + series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) - return { + info = [{ 'id': episode_id, 'title': episode['name'], 'url': clean_podcast_url(episode['assetUrl']), @@ -58,4 +84,10 @@ def _real_extract(self, url): 'timestamp': parse_iso8601(episode.get('releaseDateTime')), 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), 'series': series, - } + 'thumbnail': self._og_search_thumbnail(webpage), + }] + self._sort_formats(info) + info = info[0] + codecs = parse_codecs(info.get('ext', 'mp3')) + info.update(codecs) + return info diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/ard.py b/script.module.youtube.dl/lib/youtube_dl/extractor/ard.py index 143fc51e9..a5b1f54d5 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/ard.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/ard.py @@ -249,14 +249,14 @@ def _real_extract(self, url): class ARDIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?:video-?)?(?P[0-9]+))\.html' + _VALID_URL = 
r'(?Phttps?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P[^/?#&]+))\.html' _TESTS = [{ # available till 7.01.2022 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html', 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1', 'info_dict': { - 'display_id': 'maischberger-die-woche', - 'id': '100', + 'id': 'maischberger-die-woche-video100', + 'display_id': 'maischberger-die-woche-video100', 'ext': 'mp4', 'duration': 3687.0, 'title': 'maischberger. die woche vom 7. Januar 2021', @@ -264,16 +264,25 @@ class ARDIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html', + 'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html', 'only_matching': True, }, { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + display_id = mobj.group('id') player_url = mobj.group('mainurl') + '~playerXml.xml' doc = self._download_xml(player_url, display_id) @@ -323,9 +332,24 @@ def _real_extract(self, url): formats.append(f) self._sort_formats(formats) + _SUB_FORMATS = ( + ('./dataTimedText', 'ttml'), + ('./dataTimedTextNoOffset', 'ttml'), + ('./dataTimedTextVtt', 'vtt'), + ) + + subtitles = {} + for subsel, subext in _SUB_FORMATS: + for node in video_node.findall(subsel): + subtitles.setdefault('de', []).append({ + 'url': node.attrib['url'], + 'ext': subext, + }) + return { - 'id': mobj.group('id'), + 'id': xpath_text(video_node, './videoId', default=display_id), 'formats': formats, + 'subtitles': subtitles, 'display_id': display_id, 'title': video_node.find('./title').text, 'duration': parse_duration(video_node.find('./duration').text), @@ -335,7 +359,7 @@ def _real_extract(self, url): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?:[^/]+/)?(?:player|live|video)/(?:[^/]+/)*(?PY3JpZDovL[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -365,22 +389,22 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): }, { 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 
'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id + video_id = self._match_id(url) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', - display_id, data=json.dumps({ + video_id, data=json.dumps({ 'query': '''{ - playerPage(client:"%s", clipId: "%s") { + playerPage(client: "ard", clipId: "%s") { blockedByFsk broadcastedOn maturityContentRating @@ -410,7 +434,7 @@ def _real_extract(self, url): } } } -}''' % (mobj.group('client'), video_id), +}''' % video_id, }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] @@ -435,7 +459,6 @@ def _real_extract(self, url): r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) info.update({ 'age_limit': age_limit, - 'display_id': display_id, 'title': title, 'description': description, 'timestamp': unified_timestamp(player_page.get('broadcastedOn')), diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/arnes.py b/script.module.youtube.dl/lib/youtube_dl/extractor/arnes.py new file mode 100644 index 000000000..c0032fcab --- /dev/null +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/arnes.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + float_or_none, + int_or_none, + parse_iso8601, + remove_start, +) + + +class ArnesIE(InfoExtractor): + IE_NAME = 'video.arnes.si' + IE_DESC = 'Arnes Video' + _VALID_URL = r'https?://video\.arnes\.si/(?:[a-z]{2}/)?(?:watch|embed|api/(?:asset|public/video))/(?P[0-9a-zA-Z]{12})' + _TESTS = [{ + 'url': 'https://video.arnes.si/watch/a1qrWTOQfVoU?t=10', + 'md5': '4d0f4d0a03571b33e1efac25fd4a065d', + 'info_dict': { + 'id': 'a1qrWTOQfVoU', + 'ext': 'mp4', + 'title': 'Linearna neodvisnost, definicija', + 'description': 'Linearna neodvisnost, definicija', + 'license': 'PRIVATE', + 'creator': 'Polona Oblak', + 'timestamp': 1585063725, + 'upload_date': '20200324', + 'channel': 'Polona Oblak', + 'channel_id': 'q6pc04hw24cj', + 'channel_url': 'https://video.arnes.si/?channel=q6pc04hw24cj', + 'duration': 596.75, + 'view_count': int, + 'tags': ['linearna_algebra'], + 'start_time': 10, + } + }, { + 'url': 'https://video.arnes.si/api/asset/s1YjnV7hadlC/play.mp4', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/en/watch/s1YjnV7hadlC', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC?t=123&hideRelated=1', + 'only_matching': True, + }, { + 'url': 'https://video.arnes.si/api/public/video/s1YjnV7hadlC', + 'only_matching': True, + }] + _BASE_URL = 'https://video.arnes.si' + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + self._BASE_URL + '/api/public/video/' + video_id, video_id)['data'] + title = video['title'] + + formats = [] + for media in (video.get('media') or []): + media_url = media.get('url') + if not media_url: + continue + formats.append({ + 'url': self._BASE_URL + media_url, + 'format_id': 
remove_start(media.get('format'), 'FORMAT_'), + 'format_note': media.get('formatTranslation'), + 'width': int_or_none(media.get('width')), + 'height': int_or_none(media.get('height')), + }) + self._sort_formats(formats) + + channel = video.get('channel') or {} + channel_id = channel.get('url') + thumbnail = video.get('thumbnailUrl') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': self._BASE_URL + thumbnail, + 'description': video.get('description'), + 'license': video.get('license'), + 'creator': video.get('author'), + 'timestamp': parse_iso8601(video.get('creationTime')), + 'channel': channel.get('name'), + 'channel_id': channel_id, + 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None, + 'duration': float_or_none(video.get('duration'), 1000), + 'view_count': int_or_none(video.get('views')), + 'tags': video.get('hashtags'), + 'start_time': int_or_none(compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('t', [None])[0]), + } diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/arte.py b/script.module.youtube.dl/lib/youtube_dl/extractor/arte.py index 03abdbfaf..5bfe57b10 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/arte.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/arte.py @@ -12,6 +12,7 @@ ExtractorError, int_or_none, qualities, + strip_or_none, try_get, unified_strdate, url_or_none, @@ -252,3 +253,49 @@ def _real_extract(self, url): title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') return self.playlist_result(entries, playlist_id, title, description) + + +class ArteTVCategoryIE(ArteTVBaseIE): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P%s)/videos/(?P[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/politics-and-society/', + 'info_dict': { + 'id': 'politics-and-society', + 'title': 'Politics and society', + 'description': 'Investigative documentary series, geopolitical analysis, and international commentary', + }, + 'playlist_mincount': 13, + }, + ] + + @classmethod + def suitable(cls, url): + return ( + not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) + and super(ArteTVCategoryIE, cls).suitable(url)) + + def _real_extract(self, url): + lang, playlist_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, playlist_id) + + items = [] + for video in re.finditer( + r']*?href\s*=\s*(?P"|\'|\b)(?Phttps?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang, + webpage): + video = video.group('url') + if video == url: + continue + if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): + items.append(video) + + if items: + title = (self._og_search_title(webpage, default=None) + or self._html_search_regex(r']*>([^<]+)', default=None)) + title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + + result = self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title) + if result: + description = self._og_search_description(webpage, default=None) + if description: + result['description'] = description + return result diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/audiomack.py b/script.module.youtube.dl/lib/youtube_dl/extractor/audiomack.py index cc7771354..4d1fbad1f 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/audiomack.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/audiomack.py @@ -14,7 +14,7 @@ class 
AudiomackIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P[\w/-]+)' IE_NAME = 'audiomack' _TESTS = [ # hosted on audiomack @@ -29,25 +29,27 @@ class AudiomackIE(InfoExtractor): } }, # audiomack wrapper around soundcloud song + # Needs new test URL. { 'add_ie': ['Soundcloud'], 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', - 'info_dict': { - 'id': '258901379', - 'ext': 'mp3', - 'description': 'mamba day freestyle for the legend Kobe Bryant ', - 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', - 'uploader': 'ILOVEMAKONNEN', - 'upload_date': '20160414', - } + 'only_matching': True, + # 'info_dict': { + # 'id': '258901379', + # 'ext': 'mp3', + # 'description': 'mamba day freestyle for the legend Kobe Bryant ', + # 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', + # 'uploader': 'ILOVEMAKONNEN', + # 'upload_date': '20160414', + # } }, ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/song/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/song/', '/') # Request the extended version of the api for extra fields like artist and title api_response = self._download_json( @@ -73,13 +75,13 @@ def _real_extract(self, url): class AudiomackAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P[\w/-]+)' IE_NAME = 'audiomack:album' _TESTS = [ # Standard album playlist { 'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape', - 'playlist_count': 15, + 'playlist_count': 11, 'info_dict': { 'id': '812251', @@ -95,24 +97,24 @@ class AudiomackAlbumIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)', - 'id': '837577', + 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)', + 'id': '837580', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. G Herbo', } }], 'params': { - 'playliststart': 9, - 'playlistend': 9, + 'playliststart': 2, + 'playlistend': 2, } } ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/album/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. 
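# --- editor's illustrative aside (not part of the patch) ---
# How the widened _VALID_URL plus the .replace() below normalise both URL
# shapes to the same uploader/title tag for the API. The first URL is from
# the test above; the second shape is inferred from the (?=.+/album/)
# lookahead:
import re

ALBUM_RE = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P<id>[\w/-]+)'
for u in ('http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape',
          'http://www.audiomack.com/flytunezcom/album/tha-tour-part-2-mixtape'):
    tag = re.match(ALBUM_RE, u).group('id').replace('/album/', '/')
    assert tag == 'flytunezcom/tha-tour-part-2-mixtape'
# --- end of aside ---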
Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/album/', '/') result = {'_type': 'playlist', 'entries': []} # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata # Therefore we don't know how many songs the album has and must infi-loop until failure @@ -134,7 +136,7 @@ def _real_extract(self, url): # Pull out the album metadata and add to result (if it exists) for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: if apikey in api_response and resultkey not in result: - result[resultkey] = api_response[apikey] + result[resultkey] = compat_str(api_response[apikey]) song_id = url_basename(api_response['url']).rpartition('.')[0] result['entries'].append({ 'id': compat_str(api_response.get('id', song_id)), diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/bandcamp.py b/script.module.youtube.dl/lib/youtube_dl/extractor/bandcamp.py index 69e673a26..006aab3b4 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/bandcamp.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/bandcamp.py @@ -49,6 +49,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Ben Prunty', 'timestamp': 1396508491, 'upload_date': '20140403', + 'release_timestamp': 1396483200, 'release_date': '20140403', 'duration': 260.877, 'track': 'Lanius (Battle)', @@ -69,6 +70,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Mastodon', 'timestamp': 1322005399, 'upload_date': '20111122', + 'release_timestamp': 1076112000, 'release_date': '20040207', 'duration': 120.79, 'track': 'Hail to Fire', @@ -197,7 +199,7 @@ def _real_extract(self, url): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': unified_strdate(tralbum.get('album_release_date')), + 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/bbc.py b/script.module.youtube.dl/lib/youtube_dl/extractor/bbc.py index 92e6f1bea..378b52f4f 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/bbc.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/bbc.py @@ -1,17 +1,24 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import itertools +import json import re from .common import InfoExtractor from ..compat import ( compat_etree_Element, compat_HTTPError, + compat_parse_qs, + compat_str, + compat_urllib_error, + compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( ExtractorError, + OnDemandPagedList, clean_html, dict_get, float_or_none, @@ -20,8 +27,10 @@ js_to_json, parse_duration, parse_iso8601, + strip_or_none, try_get, unescapeHTML, + unified_timestamp, url_or_none, urlencode_postdata, urljoin, @@ -31,7 +40,7 @@ class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' + _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -387,9 +396,17 @@ def _process_media_selector(self, media_selection, programme_id): formats.extend(self._extract_mpd_formats( href, programme_id, mpd_id=format_id, fatal=False)) elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + # TODO: let expected_status be passed into _extract_xxx_formats() instead + try: + 
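# --- editor's note (comments only, not part of the patch) ---
# The except clause below inspects ExtractorError.exc_info to distinguish an
# HTTP 403/404 on the HLS manifest (e.g. geo-fenced or absent variant, in
# which case fmts is left empty so the remaining transfer_format branches
# still run) from any other failure, which is re-raised.
# --- end of note ---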
fmts = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + except ExtractorError as e: + if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + and e.exc_info[1].code in (403, 404)): + raise + fmts = [] + formats.extend(fmts) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) @@ -756,23 +773,44 @@ class BBCIE(BBCCoUkIE): 'only_matching': True, }, { # custom redirection to www.bbc.com + # also, video with window.__INITIAL_DATA__ 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', - 'only_matching': True, + 'info_dict': { + 'id': 'p02xzws1', + 'ext': 'mp4', + 'title': "Pluto may have 'nitrogen glaciers'", + 'description': 'md5:6a95b593f528d7a5f2605221bc56912f', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1437785037, + 'upload_date': '20150725', + }, + }, { + # video with window.__INITIAL_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/av/world-europe-59468682', + 'info_dict': { + 'id': 'p0b71qth', + 'ext': 'mp4', + 'title': 'Why France is making this woman a national hero', + 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1638230731, + 'upload_date': '20211130', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, }, { + # bbcthreeConfig 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', 'info_dict': { 'id': 'p06556y7', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + 'title': 'Things Not To Say to people that live on council estates', + 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. 
Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.", + 'duration': 360, + 'thumbnail': r're:https?://.+/.+\.jpg', }, - 'params': { - 'skip_download': True, - } }, { # window.__PRELOADED_STATE__ 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', @@ -811,7 +849,7 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE) return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) else super(BBCIE, cls).suitable(url)) @@ -1145,9 +1183,16 @@ def _real_extract(self, url): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) - initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, + 'quoted preload state', default=None) + if initial_data is None: + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, + 'preload state', default={}) + else: + initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) + initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: def parse_media(media): if not media: @@ -1159,19 +1204,39 @@ def parse_media(media): continue formats, subtitles = self._download_media_selector(item_id) self._sort_formats(formats) + item_desc = None + blocks = try_get(media, lambda x: x['summary']['blocks'], list) + if blocks: + summary = [] + for block in blocks: + text = try_get(block, lambda x: x['model']['text'], compat_str) + if text: + summary.append(text) + if summary: + item_desc = '\n\n'.join(summary) + item_time = None + for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: + if try_get(meta, lambda x: x['label']) == 'Published': + item_time = unified_timestamp(meta.get('timestamp')) + break entries.append({ 'id': item_id, 'title': item_title, 'thumbnail': item.get('holdingImageUrl'), 'formats': formats, 'subtitles': subtitles, + 'timestamp': item_time, + 'description': strip_or_none(item_desc), }) for resp in (initial_data.get('data') or {}).values(): name = resp.get('name') if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + for block in (try_get(resp, + (lambda x: x['data']['blocks'], + lambda x: x['data']['content']['model']['blocks'],), + list) or []): if block.get('type') != 'media': continue parse_media(block.get('model')) @@ -1338,21 +1403,149 @@ def _real_extract(self, url): playlist_id, title, description) -class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): - IE_NAME = 'bbc.co.uk:iplayer:playlist' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P%s)' % BBCCoUkIE._ID_REGEX - _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' - _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' +class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor): + _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P%s)' % BBCCoUkIE._ID_REGEX + + @staticmethod + def _get_default(episode, key, default_key='default'): + return try_get(episode, lambda x: x[key][default_key]) + + def 
_get_description(self, data): + synopsis = data.get(self._DESCRIPTION_KEY) or {} + return dict_get(synopsis, ('large', 'medium', 'small')) + + def _fetch_page(self, programme_id, per_page, series_id, page): + elements = self._get_elements(self._call_api( + programme_id, per_page, page + 1, series_id)) + for element in elements: + episode = self._get_episode(element) + episode_id = episode.get('id') + if not episode_id: + continue + thumbnail = None + image = self._get_episode_image(episode) + if image: + thumbnail = image.replace('{recipe}', 'raw') + category = self._get_default(episode, 'labels', 'category') + yield { + '_type': 'url', + 'id': episode_id, + 'title': self._get_episode_field(episode, 'subtitle'), + 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id, + 'thumbnail': thumbnail, + 'description': self._get_description(episode), + 'categories': [category] if category else None, + 'series': self._get_episode_field(episode, 'title'), + 'ie_key': BBCCoUkIE.ie_key(), + } + + def _real_extract(self, url): + pid = self._match_id(url) + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + series_id = qs.get('seriesId', [None])[0] + page = qs.get('page', [None])[0] + per_page = 36 if page else self._PAGE_SIZE + fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id) + entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE) + playlist_data = self._get_playlist_data(self._call_api(pid, 1)) + return self.playlist_result( + entries, pid, self._get_playlist_title(playlist_data), + self._get_description(playlist_data)) + + +class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:episodes' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes' _TESTS = [{ 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', 'info_dict': { 'id': 'b05rcz9v', 'title': 'The Disappearance', - 'description': 'French thriller serial about a missing teenager.', + 'description': 'md5:58eb101aee3116bad4da05f91179c0cb', }, - 'playlist_mincount': 6, - 'skip': 'This programme is not currently available on BBC iPlayer', + 'playlist_mincount': 8, }, { + # all seasons + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 10, + }, { + # explicit season + 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv', + 'info_dict': { + 'id': 'b094m5t9', + 'title': 'Doctor Foster', + 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6', + }, + 'playlist_mincount': 5, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 37, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2', + 'info_dict': { + 'id': 'm0004c4v', + 'title': 'Beechgrove', + 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.', + }, + 'playlist_mincount': 1, + }] + _PAGE_SIZE = 100 + _DESCRIPTION_KEY = 'synopsis' + + def _get_episode_image(self, episode): + return self._get_default(episode, 'image') + + def _get_episode_field(self, episode, field): + return self._get_default(episode, field) + + @staticmethod + def 
_get_elements(data): + return data['entities']['results'] + + @staticmethod + def _get_episode(element): + return element.get('episode') or {} + + def _call_api(self, pid, per_page, page=1, series_id=None): + variables = { + 'id': pid, + 'page': page, + 'perPage': per_page, + } + if series_id: + variables['sliceId'] = series_id + return self._download_json( + 'https://graph.ibl.api.bbc.co.uk/', pid, headers={ + 'Content-Type': 'application/json' + }, data=json.dumps({ + 'id': '5692d93d5aac8d796a0305e895e61551', + 'variables': variables, + }).encode('utf-8'))['data']['programme'] + + @staticmethod + def _get_playlist_data(data): + return data + + def _get_playlist_title(self, data): + return self._get_default(data, 'title') + + +class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:group' + _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group' + _TESTS = [{ # Available for over a year unlike 30 days for most other programmes 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', 'info_dict': { @@ -1361,14 +1554,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', }, 'playlist_mincount': 10, + }, { + # all pages + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 47, + }, { + # explicit page + 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2', + 'info_dict': { + 'id': 'p081d7j7', + 'title': 'Music in Scotland', + 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.', + }, + 'playlist_mincount': 11, }] + _PAGE_SIZE = 200 + _DESCRIPTION_KEY = 'synopses' - def _extract_title_and_description(self, webpage): - title = self._search_regex(r'
<h1>([^<]+)</h1>', webpage, 'title', fatal=False) - description = self._search_regex( - r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>
', - webpage, 'description', fatal=False, group='value') - return title, description + def _get_episode_image(self, episode): + return self._get_default(episode, 'images', 'standard') + + def _get_episode_field(self, episode, field): + return episode.get(field) + + @staticmethod + def _get_elements(data): + return data['elements'] + + @staticmethod + def _get_episode(element): + return element + + def _call_api(self, pid, per_page, page=1, series_id=None): + return self._download_json( + 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid, + pid, query={ + 'page': page, + 'per_page': per_page, + })['group_episodes'] + + @staticmethod + def _get_playlist_data(data): + return data['group'] + + def _get_playlist_title(self, data): + return data.get('title') class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/bigo.py b/script.module.youtube.dl/lib/youtube_dl/extractor/bigo.py new file mode 100644 index 000000000..ddf76ac55 --- /dev/null +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/bigo.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, urlencode_postdata + + +class BigoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P[^/]+)' + + _TESTS = [{ + 'url': 'https://www.bigo.tv/ja/221338632', + 'info_dict': { + 'id': '6576287577575737440', + 'title': '土よ〜💁‍♂️ 休憩室/REST room', + 'thumbnail': r're:https?://.+', + 'uploader': '✨Shin💫', + 'uploader_id': '221338632', + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://www.bigo.tv/th/Tarlerm1304', + 'only_matching': True, + }, { + 'url': 'https://bigo.tv/115976881', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + info_raw = self._download_json( + 'https://bigo.tv/studio/getInternalStudioInfo', + user_id, data=urlencode_postdata({'siteId': user_id})) + + if not isinstance(info_raw, dict): + raise ExtractorError('Received invalid JSON data') + if info_raw.get('code'): + raise ExtractorError( + 'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True) + info = info_raw.get('data') or {} + + if not info.get('alive'): + raise ExtractorError('This user is offline.', expected=True) + + return { + 'id': info.get('roomId') or user_id, + 'title': info.get('roomTopic') or info.get('nick_name') or user_id, + 'formats': [{ + 'url': info.get('hls_src'), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'thumbnail': info.get('snapshot'), + 'uploader': info.get('nick_name'), + 'uploader_id': user_id, + 'is_live': True, + } diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/bilibili.py b/script.module.youtube.dl/lib/youtube_dl/extractor/bilibili.py index 4dc597e16..d42f0e98a 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/bilibili.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/bilibili.py @@ -156,6 +156,7 @@ def _real_extract(self, url): cid = js['result']['cid'] headers = { + 'Accept': 'application/json', 'Referer': url } headers.update(self.geo_verification_headers()) @@ -232,7 +233,7 @@ def _real_extract(self, url): webpage) if uploader_mobj: info.update({ - 'uploader': uploader_mobj.group('name'), + 'uploader': uploader_mobj.group('name').strip(), 'uploader_id': uploader_mobj.group('id'), }) if not info.get('uploader'): @@ -368,6 +369,11 @@ def _real_extract(self, url): 'filesize': int_or_none(play_data.get('size')), }] + for 
a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + song = self._call_api('song/info', au_id) title = song['title'] statistic = song.get('statistic') or {} diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/blerp.py b/script.module.youtube.dl/lib/youtube_dl/extractor/blerp.py new file mode 100644 index 000000000..355daef6e --- /dev/null +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/blerp.py @@ -0,0 +1,173 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from ..utils import ( + strip_or_none, + traverse_obj, +) +from .common import InfoExtractor + + +class BlerpIE(InfoExtractor): + IE_NAME = 'blerp' + _VALID_URL = r'https?://(?:www\.)?blerp\.com/soundbites/(?P[0-9a-zA-Z]+)' + _TESTS = [{ + 'url': 'https://blerp.com/soundbites/6320fe8745636cb4dd677a5a', + 'info_dict': { + 'id': '6320fe8745636cb4dd677a5a', + 'title': 'Samsung Galaxy S8 Over the Horizon Ringtone 2016', + 'uploader': 'luminousaj', + 'uploader_id': '5fb81e51aa66ae000c395478', + 'ext': 'mp3', + 'tags': ['samsung', 'galaxy', 's8', 'over the horizon', '2016', 'ringtone'], + } + }, { + 'url': 'https://blerp.com/soundbites/5bc94ef4796001000498429f', + 'info_dict': { + 'id': '5bc94ef4796001000498429f', + 'title': 'Yee', + 'uploader': '179617322678353920', + 'uploader_id': '5ba99cf71386730004552c42', + 'ext': 'mp3', + 'tags': ['YEE', 'YEET', 'wo ha haah catchy tune yee', 'yee'] + } + }] + + _GRAPHQL_OPERATIONNAME = "webBitePageGetBite" + _GRAPHQL_QUERY = ( + '''query webBitePageGetBite($_id: MongoID!) { + web { + biteById(_id: $_id) { + ...bitePageFrag + __typename + } + __typename + } + } + + fragment bitePageFrag on Bite { + _id + title + userKeywords + keywords + color + visibility + isPremium + owned + price + extraReview + isAudioExists + image { + filename + original { + url + __typename + } + __typename + } + userReactions { + _id + reactions + createdAt + __typename + } + topReactions + totalSaveCount + saved + blerpLibraryType + license + licenseMetaData + playCount + totalShareCount + totalFavoriteCount + totalAddedToBoardCount + userCategory + userAudioQuality + audioCreationState + transcription + userTranscription + description + createdAt + updatedAt + author + listingType + ownerObject { + _id + username + profileImage { + filename + original { + url + __typename + } + __typename + } + __typename + } + transcription + favorited + visibility + isCurated + sourceUrl + audienceRating + strictAudienceRating + ownerId + reportObject { + reportedContentStatus + __typename + } + giphy { + mp4 + gif + __typename + } + audio { + filename + original { + url + __typename + } + mp3 { + url + __typename + } + __typename + } + __typename + } + + ''') + + def _real_extract(self, url): + audio_id = self._match_id(url) + + data = { + 'operationName': self._GRAPHQL_OPERATIONNAME, + 'query': self._GRAPHQL_QUERY, + 'variables': { + '_id': audio_id + } + } + + headers = { + 'Content-Type': 'application/json' + } + + json_result = self._download_json('https://api.blerp.com/graphql', + audio_id, data=json.dumps(data).encode('utf-8'), headers=headers) + + bite_json = json_result['data']['web']['biteById'] + + info_dict = { + 'id': bite_json['_id'], + 'url': bite_json['audio']['mp3']['url'], + 'title': bite_json['title'], + 'uploader': traverse_obj(bite_json, ('ownerObject', 'username'), expected_type=strip_or_none), + 'uploader_id': traverse_obj(bite_json, ('ownerObject', '_id'), expected_type=strip_or_none), + 'ext': 'mp3', + 'tags': 
list(filter(None, map(strip_or_none, (traverse_obj(bite_json, 'userKeywords', expected_type=list) or []))) or None)
+        }
+
+        return info_dict
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/bongacams.py b/script.module.youtube.dl/lib/youtube_dl/extractor/bongacams.py
index 180542fbc..016999d55 100644
--- a/script.module.youtube.dl/lib/youtube_dl/extractor/bongacams.py
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/bongacams.py
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import re
@@ -12,13 +13,28 @@
 
 
 class BongaCamsIE(InfoExtractor):
-    _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)'
+    _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.(?:com|net))/(?P<id>[^/?&#]+)'
     _TESTS = [{
         'url': 'https://de.bongacams.com/azumi-8',
         'only_matching': True,
     }, {
         'url': 'https://cn.bongacams.com/azumi-8',
         'only_matching': True,
+    }, {
+        'url': 'https://de.bongacams.net/claireashton',
+        'info_dict': {
+            'id': 'claireashton',
+            'ext': 'mp4',
+            'title': r're:ClaireAshton \d{4}-\d{2}-\d{2} \d{2}:\d{2}',
+            'age_limit': 18,
+            'uploader_id': 'ClaireAshton',
+            'uploader': 'ClaireAshton',
+            'like_count': int,
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/callin.py b/script.module.youtube.dl/lib/youtube_dl/extractor/callin.py
new file mode 100644
index 000000000..341be479f
--- /dev/null
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/callin.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    traverse_obj,
+    try_get,
+)
+
+
+class CallinIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?callin\.com/episode/(?:[^/#?-]+-)*(?P<id>[^/#?-]+)'
+    _TESTS = [{
+        'url': 'https://www.callin.com/episode/fcc-commissioner-brendan-carr-on-elons-PrumRdSQJW',
+        'md5': '14ede27ee2c957b7e4db93140fc0745c',
+        'info_dict': {
+            'id': 'PrumRdSQJW',
+            'ext': 'mp4',
+            'title': 'FCC Commissioner Brendan Carr on Elon’s Starlink',
+            'description': 'Or, why the government doesn’t like SpaceX',
+            'channel': 'The Pull Request',
+            'channel_url': 'https://callin.com/show/the-pull-request-ucnDJmEKAa',
+        }
+    }, {
+        'url': 'https://www.callin.com/episode/episode-81-elites-melt-down-over-student-debt-lzxMidUnjA',
+        'md5': '16f704ddbf82a27e3930533b12062f07',
+        'info_dict': {
+            'id': 'lzxMidUnjA',
+            'ext': 'mp4',
+            'title': 'Episode 81- Elites MELT DOWN over Student Debt Victory?
Rumble in NYC?', + 'description': 'Let’s talk todays episode about the primary election shake up in NYC and the elites melting down over student debt cancelation.', + 'channel': 'The DEBRIEF With Briahna Joy Gray', + 'channel_url': 'https://callin.com/show/the-debrief-with-briahna-joy-gray-siiFDzGegm', + } + }] + + def _search_nextjs_data(self, webpage, video_id, transform_source=None, fatal=True, **kw): + return self._parse_json( + self._search_regex( + r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', + webpage, 'next.js data', fatal=fatal, **kw), + video_id, transform_source=transform_source, fatal=fatal) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + next_data = self._search_nextjs_data(webpage, video_id) + episode = traverse_obj(next_data, ('props', 'pageProps', 'episode'), expected_type=dict) + if not episode: + raise ExtractorError('Failed to find episode data') + + title = episode.get('title') or self._og_search_title(webpage) + description = episode.get('description') or self._og_search_description(webpage) + + formats = [] + formats.extend(self._extract_m3u8_formats( + episode.get('m3u8'), video_id, 'mp4', + entry_protocol='m3u8_native', fatal=False)) + self._sort_formats(formats) + + channel = try_get(episode, lambda x: x['show']['title'], compat_str) + channel_url = try_get(episode, lambda x: x['show']['linkObj']['resourceUrl'], compat_str) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'channel': channel, + 'channel_url': channel_url, + } diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/cammodels.py b/script.module.youtube.dl/lib/youtube_dl/extractor/cammodels.py index 1eb81b75e..d2e860b24 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/cammodels.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/cammodels.py @@ -3,7 +3,6 @@ from .common import InfoExtractor from ..utils import ( - ExtractorError, int_or_none, url_or_none, ) @@ -20,32 +19,11 @@ class CamModelsIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage( - url, user_id, headers=self.geo_verification_headers()) - - manifest_root = self._html_search_regex( - r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) - - if not manifest_root: - ERRORS = ( - ("I'm offline, but let's stay connected", 'This user is currently offline'), - ('in a private show', 'This user is in a private show'), - ('is currently performing LIVE', 'This model is currently performing live'), - ) - for pattern, message in ERRORS: - if pattern in webpage: - error = message - expected = True - break - else: - error = 'Unable to find manifest URL root' - expected = False - raise ExtractorError(error, expected=expected) - manifest = self._download_json( - '%s%s.json' % (manifest_root, user_id), user_id) + 'https://manifest-server.naiadsystems.com/live/s:%s.json' % user_id, user_id) formats = [] + thumbnails = [] for format_id, format_dict in manifest['formats'].items(): if not isinstance(format_dict, dict): continue @@ -85,6 +63,13 @@ def _real_extract(self, url): 'preference': -1, }) else: + if format_id == 'jpeg': + thumbnails.append({ + 'url': f['url'], + 'width': f['width'], + 'height': f['height'], + 'format_id': f['format_id'], + }) continue formats.append(f) self._sort_formats(formats) @@ -92,6 +77,7 @@ def _real_extract(self, url): return { 'id': user_id, 'title': self._live_title(user_id), + 'thumbnails': 
thumbnails, 'is_live': True, 'formats': formats, 'age_limit': 18 diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/cbs.py b/script.module.youtube.dl/lib/youtube_dl/extractor/cbs.py index 4a19a73d2..c79e55a75 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/cbs.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/cbs.py @@ -27,7 +27,7 @@ def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs|paramountplus)\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -52,6 +52,9 @@ class CBSIE(CBSBaseIE): }, { 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, + }, { + 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/', + 'only_matching': True, }] def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/cbsnews.py b/script.module.youtube.dl/lib/youtube_dl/extractor/cbsnews.py index 345debcf0..1285ed65e 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/cbsnews.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/cbsnews.py @@ -26,7 +26,7 @@ class CBSNewsEmbedIE(CBSIE): def _real_extract(self, url): item = self._parse_json(zlib.decompress(compat_b64decode( compat_urllib_parse_unquote(self._match_id(url))), - -zlib.MAX_WBITS), None)['video']['items'][0] + -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0] return self._extract_video_info(item['mpxRefId'], 'cbsnews') diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/cbssports.py b/script.module.youtube.dl/lib/youtube_dl/extractor/cbssports.py index 83b764762..a891c9a55 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/cbssports.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/cbssports.py @@ -1,38 +1,113 @@ from __future__ import unicode_literals -from .cbs import CBSBaseIE +import re +# from .cbs import CBSBaseIE +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, +) -class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P[^/?#&]+)' +# class CBSSportsEmbedIE(CBSBaseIE): +class CBSSportsEmbedIE(InfoExtractor): + IE_NAME = 'cbssports:embed' + _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+? 
+ (?: + ids%3D(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})| + pcid%3D(?P\d+) + )''' _TESTS = [{ - 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/', - 'info_dict': { - 'id': '1214315075735', - 'ext': 'mp4', - 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder', - 'description': 'md5:df6f48622612c2d6bd2e295ddef58def', - 'timestamp': 1524111457, - 'upload_date': '20180419', - 'uploader': 'CBSI-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - } + 'url': 'https://www.cbssports.com/player/embed/?args=player_id%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26ids%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26resizable%3D1%26autoplay%3Dtrue%26domain%3Dcbssports.com%26comp_ads_enabled%3Dfalse%26watchAndRead%3D0%26startTime%3D0%26env%3Dprod', + 'only_matching': True, }, { - 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/', + 'url': 'https://embed.247sports.com/player/embed/?args=%3fplayer_id%3d1827823171591%26channel%3dcollege-football-recruiting%26pcid%3d1827823171591%26width%3d640%26height%3d360%26autoplay%3dTrue%26comp_ads_enabled%3dFalse%26uvpc%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_v4%2526partner%253d247%26uvpc_m%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_m_v4%2526partner_m%253d247_mobile%26utag%3d247sportssite%26resizable%3dTrue', 'only_matching': True, }] - def _extract_video_info(self, filter_query, video_id): - return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + # def _extract_video_info(self, filter_query, video_id): + # return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + def _real_extract(self, url): + uuid, pcid = re.match(self._VALID_URL, url).groups() + query = {'id': uuid} if uuid else {'pcid': pcid} + video = self._download_json( + 'https://www.cbssports.com/api/content/video/', + uuid or pcid, query=query)[0] + video_id = video['id'] + title = video['title'] + metadata = video.get('metaData') or {} + # return self._extract_video_info('byId=%d' % metadata['mpxOutletId'], video_id) + # return self._extract_video_info('byGuid=' + metadata['mpxRefId'], video_id) + + formats = self._extract_m3u8_formats( + metadata['files'][0]['url'], video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + + image = video.get('image') + thumbnails = None + if image: + image_path = image.get('path') + if image_path: + thumbnails = [{ + 'url': image_path, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + 'filesize': int_or_none(image.get('size')), + }] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video.get('description'), + 'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])), + 'duration': int_or_none(metadata.get('duration')), + } + + +class CBSSportsBaseIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'], - webpage, 'video id') - return self._extract_video_info('byId=%s' % video_id, video_id) + iframe_url = self._search_regex( + 
r']+(?:data-)?src="(https?://[^/]+/player/embed[^"]+)"', + webpage, 'embed url') + return self.url_result(iframe_url, CBSSportsEmbedIE.ie_key()) + + +class CBSSportsIE(CBSSportsBaseIE): + IE_NAME = 'cbssports' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cbssports.com/college-football/video/cover-3-stanford-spring-gleaning/', + 'info_dict': { + 'id': 'b56c03a6-231a-4bbe-9c55-af3c8a8e9636', + 'ext': 'mp4', + 'title': 'Cover 3: Stanford Spring Gleaning', + 'description': 'The Cover 3 crew break down everything you need to know about the Stanford Cardinal this spring.', + 'timestamp': 1617218398, + 'upload_date': '20210331', + 'duration': 502, + }, + }] + + +class TwentyFourSevenSportsIE(CBSSportsBaseIE): + IE_NAME = '247sports' + _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P\d+)' + _TESTS = [{ + 'url': 'https://247sports.com/Video/2021-QB-Jake-Garcia-senior-highlights-through-five-games-10084854/', + 'info_dict': { + 'id': '4f1265cb-c3b5-44a8-bb1d-1914119a0ccc', + 'ext': 'mp4', + 'title': '2021 QB Jake Garcia senior highlights through five games', + 'description': 'md5:8cb67ebed48e2e6adac1701e0ff6e45b', + 'timestamp': 1607114223, + 'upload_date': '20201204', + 'duration': 208, + }, + }] diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/cda.py b/script.module.youtube.dl/lib/youtube_dl/extractor/cda.py index 1b4362144..e1b391937 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/cda.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/cda.py @@ -133,6 +133,8 @@ def _real_extract(self, url): 'age_limit': 18 if need_confirm_age else 0, } + info = self._search_json_ld(webpage, video_id, default={}) + # Source: https://www.cda.pl/js/player.js?t=1606154898 def decrypt_file(a): for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): @@ -197,7 +199,7 @@ def extract_format(page, version): handler = self._download_webpage webpage = handler( - self._BASE_URL + href, video_id, + urljoin(self._BASE_URL, href), video_id, 'Downloading %s version information' % resolution, fatal=False) if not webpage: # Manually report warning because empty page is returned when @@ -209,6 +211,4 @@ def extract_format(page, version): self._sort_formats(formats) - info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(info_dict, info) diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/ceskatelevize.py b/script.module.youtube.dl/lib/youtube_dl/extractor/ceskatelevize.py index 7cb4efb74..fe677d8e8 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/ceskatelevize.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/ceskatelevize.py @@ -12,70 +12,136 @@ ExtractorError, float_or_none, sanitized_Request, - unescapeHTML, - update_url_query, + str_or_none, + traverse_obj, urlencode_postdata, USER_AGENTS, ) class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P[^/#?]+)' _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', + 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { - 'id': '61924494877246241', + 'id': '61924494877028507', 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Život v Grónsku', - 'description': 
'md5:3fec8f6bb497be5cdb0c9e8781076626', + 'title': 'Bonus 01 - En - Hyde Park Civilizace', + 'description': 'English Subtittles', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3350, + 'duration': 81.3, }, 'params': { # m3u8 download 'skip_download': True, }, }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', + # live stream + 'url': 'http://www.ceskatelevize.cz/zive/ct1/', 'info_dict': { - 'id': '61924494877028507', + 'id': '102', 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Bonus 01 - En', - 'description': 'English Subtittles', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 81.3, + 'title': r'ČT1 - živé vysílání online', + 'description': 'Sledujte živé vysílání kanálu ČT1 online. Vybírat si můžete i z dalších kanálů České televize na kterémkoli z vašich zařízení.', + 'is_live': True, }, 'params': { # m3u8 download 'skip_download': True, }, }, { - # live stream + # another 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'only_matching': True, 'info_dict': { 'id': 402, 'ext': 'mp4', 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, }, + # 'skip': 'Georestricted to Czech Republic', + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', + 'only_matching': True, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Bogotart - Queer', + 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494877311053', + 'ext': 'mp4', + 'title': 'Bogotart - Queer (Varování 18+)', + 'duration': 11.9, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Bogotart - Queer (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], 'params': { # m3u8 download 'skip_download': True, }, - 'skip': 'Georestricted to Czech Republic', }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', 'only_matching': True, }] + def _search_nextjs_data(self, webpage, video_id, **kw): + return self._parse_json( + self._search_regex( + r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', + webpage, 'next.js data', **kw), + video_id, **kw) + def _real_extract(self, url): playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) + webpage, urlh = self._download_webpage_handle(url, playlist_id) + parsed_url = compat_urllib_parse_urlparse(urlh.geturl()) + site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize') + playlist_title = self._og_search_title(webpage, default=None) + if site_name and playlist_title: + playlist_title = re.split(r'\s*[—|]\s*%s' % (site_name, ), playlist_title, 1)[0] + playlist_description = self._og_search_description(webpage, default=None) + if playlist_description: + playlist_description = playlist_description.replace('\xa0', ' ') + + type_ = 'IDEC' + if re.search(r'(^/porady|/zive)/', parsed_url.path): + next_data = self._search_nextjs_data(webpage, playlist_id) + if 
'/zive/' in parsed_url.path:
+                idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'idec'), get_all=False)
+            else:
+                idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False)
+                if not idec:
+                    idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'videobonusDetail', 'bonusId'), get_all=False)
+                    if idec:
+                        type_ = 'bonus'
+            if not idec:
+                raise ExtractorError('Failed to find IDEC id')
+            iframe_hash = self._download_webpage(
+                'https://www.ceskatelevize.cz/v-api/iframe-hash/',
+                playlist_id, note='Getting IFRAME hash')
+            query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec, }
+            webpage = self._download_webpage(
+                'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php',
+                playlist_id, note='Downloading player', query=query)
 
         NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
         if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
-            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
+            self.raise_geo_restricted(NOT_AVAILABLE_STRING)
+        if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )):
+            raise ExtractorError('no video with IDEC available', video_id=idec, expected=True)
 
         type_ = None
         episode_id = None
@@ -100,7 +166,7 @@ def _real_extract(self, url):
         data = {
             'playlist[0][type]': type_,
             'playlist[0][id]': episode_id,
-            'requestUrl': compat_urllib_parse_urlparse(url).path,
+            'requestUrl': parsed_url.path,
             'requestSource': 'iVysilani',
         }
 
@@ -108,7 +174,7 @@ def _real_extract(self, url):
 
         for user_agent in (None, USER_AGENTS['Safari']):
             req = sanitized_Request(
-                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
+                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/',
                 data=urlencode_postdata(data))
 
             req.add_header('Content-type', 'application/x-www-form-urlencoded')
@@ -130,9 +196,6 @@ def _real_extract(self, url):
         req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
         req.add_header('Referer', url)
 
-        playlist_title = self._og_search_title(webpage, default=None)
-        playlist_description = self._og_search_description(webpage, default=None)
-
         playlist = self._download_json(req, playlist_id, fatal=False)
         if not playlist:
             continue
@@ -167,7 +230,7 @@ def _real_extract(self, url):
                 entries[num]['formats'].extend(formats)
                 continue
 
-            item_id = item.get('id') or item['assetId']
+            item_id = str_or_none(item.get('id') or item['assetId'])
             title = item['title']
 
             duration = float_or_none(item.get('duration'))
@@ -181,8 +244,6 @@ def _real_extract(self, url):
 
             if playlist_len == 1:
                 final_title = playlist_title or title
-                if is_live:
-                    final_title = self._live_title(final_title)
             else:
                 final_title = '%s (%s)' % (playlist_title, title)
 
@@ -200,6 +261,8 @@ def _real_extract(self, url):
         for e in entries:
             self._sort_formats(e['formats'])
 
+        if len(entries) == 1:
+            return entries[0]
         return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
 
     def _get_subtitles(self, episode_id, subs):
@@ -236,54 +299,3 @@ def _fix_subtitle(subtitle):
                     yield line
 
         return '\r\n'.join(_fix_subtitle(subtitles))
-
-
-class CeskaTelevizePoradyIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
-    _TESTS = [{
-        # video with 18+ caution trailer
-        'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
-        'info_dict': {
-            'id': '215562210900007-bogotart',
-            'title': 'Queer: Bogotart',
-            'description': 'Alternativní průvodce současným queer světem',
-        },
-        'playlist': [{
-            'info_dict': {
-                'id': '61924494876844842',
-                'ext': 'mp4',
-                'title': 'Queer: Bogotart (Varování 18+)',
-                'duration': 10.2,
-            },
-        }, {
-            'info_dict': {
-                'id': '61924494877068022',
-                'ext': 'mp4',
-                'title': 'Queer: Bogotart (Queer)',
-                'thumbnail': r're:^https?://.*\.jpg',
-                'duration': 1558.3,
-            },
-        }],
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-    }, {
-        # iframe embed
-        'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        data_url = update_url_query(unescapeHTML(self._search_regex(
-            (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
-             r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'),
-            webpage, 'iframe
player url', group='url')), query={ - 'autoStart': 'true', - }) - - return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/clipchamp.py b/script.module.youtube.dl/lib/youtube_dl/extractor/clipchamp.py new file mode 100644 index 000000000..3b485eaab --- /dev/null +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/clipchamp.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + merge_dicts, + T, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class ClipchampIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clipchamp\.com/watch/(?P[\w-]+)' + _TESTS = [{ + 'url': 'https://clipchamp.com/watch/gRXZ4ZhdDaU', + 'info_dict': { + 'id': 'gRXZ4ZhdDaU', + 'ext': 'mp4', + 'title': 'Untitled video', + 'uploader': 'Alexander Schwartz', + 'timestamp': 1680805580, + 'upload_date': '20230406', + 'thumbnail': r're:^https?://.+\.jpg', + }, + 'params': { + 'skip_download': 'm3u8', + 'format': 'bestvideo', + }, + }] + + _STREAM_URL_TMPL = 'https://%s.cloudflarestream.com/%s/manifest/video.%s' + _STREAM_URL_QUERY = {'parentOrigin': 'https://clipchamp.com'} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video'] + + storage_location = data.get('storage_location') + if storage_location != 'cf_stream': + raise ExtractorError('Unsupported clip storage location "%s"' % (storage_location,)) + + path = data['download_url'] + iframe = self._download_webpage( + 'https://iframe.cloudflarestream.com/' + path, video_id, 'Downloading player iframe') + subdomain = self._search_regex( + r'''\bcustomer-domain-prefix\s*=\s*("|')(?P[\w-]+)\1''', iframe, + 'subdomain', group='sd', fatal=False) or 'customer-2ut9yn3y6fta1yxe' + + formats = self._extract_mpd_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'mpd'), video_id, + query=self._STREAM_URL_QUERY, fatal=False, mpd_id='dash') + formats.extend(self._extract_m3u8_formats( + self._STREAM_URL_TMPL % (subdomain, path, 'm3u8'), video_id, 'mp4', + query=self._STREAM_URL_QUERY, fatal=False, m3u8_id='hls')) + + return merge_dicts({ + 'id': video_id, + 'formats': formats, + 'uploader': ' '.join(traverse_obj(data, ('creator', ('first_name', 'last_name'), T(compat_str)))) or None, + }, traverse_obj(data, { + 'title': ('project', 'project_name', T(compat_str)), + 'timestamp': ('created_at', T(unified_timestamp)), + 'thumbnail': ('thumbnail_url', T(url_or_none)), + }), rev=True) diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/common.py b/script.module.youtube.dl/lib/youtube_dl/extractor/common.py index 8eb110f4e..0eca9f844 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/common.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/common.py @@ -3,6 +3,7 @@ import base64 import datetime +import functools import hashlib import json import netrc @@ -17,12 +18,14 @@ from ..compat import ( compat_cookiejar_Cookie, - compat_cookies, + compat_cookies_SimpleCookie, compat_etree_Element, compat_etree_fromstring, compat_getpass, compat_integer_types, compat_http_client, + compat_map as map, + compat_open as open, compat_os_name, compat_str, compat_urllib_error, @@ -31,6 +34,7 @@ compat_urllib_request, compat_urlparse, compat_xml_parse_error, + compat_zip as zip, ) from ..downloader.f4m import ( 
get_base_url, @@ -70,6 +74,8 @@ str_or_none, str_to_int, strip_or_none, + traverse_obj, + try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -78,6 +84,7 @@ urljoin, url_basename, url_or_none, + variadic, xpath_element, xpath_text, xpath_with_ns, @@ -230,8 +237,10 @@ class InfoExtractor(object): uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. + release_timestamp: UNIX timestamp of the moment the video was released. release_date: The date (YYYYMMDD) when the video was released. - timestamp: UNIX timestamp of the moment the video became available. + timestamp: UNIX timestamp of the moment the video became available + (uploaded). upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. @@ -364,9 +373,22 @@ class InfoExtractor(object): title, description etc. - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. - Probably, they should also be added to the list of extractors. + A subclass of InfoExtractor must be defined to handle each specific site (or + several sites). Such a concrete subclass should be added to the list of + extractors. It should also: + * define its _VALID_URL attribute as a regexp, or a Sequence of alternative + regexps (but see below) + * re-define the _real_extract() method + * optionally re-define the _real_initialize() method. + + An extractor subclass may also override suitable() if necessary, but the + function signature must be preserved and the function must import everything + it needs (except other extractors), so that lazy_extractors works correctly. + If the subclass's suitable() and _real_extract() functions avoid using + _VALID_URL, the subclass need not set that class attribute. + + An abstract subclass of InfoExtractor may be used to simplify implementation + within an extractor module; it should not be added to the list of extractors. _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. 
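To make the subclassing contract above concrete, a minimal sketch of such a concrete extractor (illustrative only -- the site, URL pattern and returned fields are invented, not part of this patch):

    from .common import InfoExtractor

    class ExampleIE(InfoExtractor):
        _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)'

        def _real_extract(self, url):
            # _match_id() applies _VALID_URL and returns its 'id' group
            video_id = self._match_id(url)
            webpage = self._download_webpage(url, video_id)
            return {
                'id': video_id,
                'title': self._og_search_title(webpage),
                'url': self._og_search_video_url(webpage),
            }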
@@ -402,21 +424,32 @@ def __init__(self, downloader=None): self.set_downloader(downloader) @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - + def __match_valid_url(cls, url): # This does not use has/getattr intentionally - we want to know whether - # we have cached the regexp for *this* class, whereas getattr would also - # match the superclass + # we have cached the regexp for cls, whereas getattr would also + # match its superclass if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - return cls._VALID_URL_RE.match(url) is not None + # _VALID_URL can now be a list/tuple of patterns + cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL))) + # 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in 2.7 + for p in cls._VALID_URL_RE: + p = p.match(url) + if p: + return p + + # The public alias can safely be overridden, as in some back-ports + _match_valid_url = __match_valid_url + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + # This function must import everything it needs (except other extractors), + # so that lazy_extractors works correctly + return cls.__match_valid_url(url) is not None @classmethod def _match_id(cls, url): - if '_VALID_URL_RE' not in cls.__dict__: - cls._VALID_URL_RE = re.compile(cls._VALID_URL) - m = cls._VALID_URL_RE.match(url) + m = cls.__match_valid_url(url) assert m return compat_str(m.group('id')) @@ -1002,6 +1035,8 @@ def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, f if group is None: # return the first matching group return next(g for g in mobj.groups() if g is not None) + elif isinstance(group, (list, tuple)): + return tuple(mobj.group(g) for g in group) else: return mobj.group(group) elif default is not NO_DEFAULT: @@ -1017,10 +1052,9 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr Like _search_regex, but strips HTML tags and unescapes entities. """ res = self._search_regex(pattern, string, name, default, fatal, flags, group) - if res: - return clean_html(res).strip() - else: - return res + if isinstance(res, tuple): + return tuple(map(clean_html, res)) + return clean_html(res) def _get_netrc_login_info(self, netrc_machine=None): username = None @@ -1084,7 +1118,7 @@ def _get_tfa_info(self, note='two-factor verification code'): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))' property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' @@ -1273,6 +1307,7 @@ def extract_interaction_statistic(e): def extract_video_object(e): assert e['@type'] == 'VideoObject' + author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), @@ -1280,7 +1315,11 @@ def extract_video_object(e): 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), - 'uploader': str_or_none(e.get('author')), + # author can be an instance of 'Organization' or 'Person' types. + # both types can have 'name' property(inherited from 'Thing' type). 
[1]
+            # however some websites are using 'Text' type instead.
+            # 1. https://schema.org/VideoObject
+            'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
             'filesize': float_or_none(e.get('contentSize')),
             'tbr': int_or_none(e.get('bitrate')),
             'width': int_or_none(e.get('width')),
@@ -1340,6 +1379,44 @@ def extract_video_object(e):
                     break
         return dict((k, v) for k, v in info.items() if v is not None)
 
+    def _search_nextjs_data(self, webpage, video_id, **kw):
+        nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
+        kw.pop('transform_source', None)
+        next_data = self._search_regex(
+            r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''',
+            webpage, 'next.js data', group='nd', **kw)
+        if not next_data:
+            return {}
+        return self._parse_json(next_data, video_id, **nkw)
+
+    def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
+        """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
+
+        # self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)
+        context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__')
+        fatal = kwargs.get('fatal', True)
+        traverse = kwargs.get('traverse', ('data', 0))
+
+        re_ctx = re.escape(context_name)
+
+        FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*'
+                       r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)')
+
+        js, arg_keys, arg_vals = self._search_regex(
+            (p.format(re_ctx, FUNCTION_RE) for p in
+                (r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>',
+                 r'{0}\s*\([\s\S]*?{1}')),
+            webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
+            default=NO_DEFAULT if fatal else (None, None, None))
+        if js is None:
+            return {}
+
+        args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
+            '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ())))
+
+        ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
+        return traverse_obj(ret, traverse) or {}
+
     @staticmethod
     def _hidden_inputs(html):
         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
@@ -2487,7 +2564,8 @@ def parse_content_type(content_type):
                 return f
             return {}
 
-        def _media_formats(src, cur_media_type, type_info={}):
+        def _media_formats(src, cur_media_type, type_info=None):
+            type_info = type_info or {}
             full_url = absolute_url(src)
             ext = type_info.get('ext') or determine_ext(full_url)
             if ext == 'm3u8':
@@ -2505,6 +2583,7 @@ def _media_formats(src, cur_media_type, type_info={}):
                 formats = [{
                     'url': full_url,
                     'vcodec': 'none' if cur_media_type == 'audio' else None,
+                    'ext': ext,
                 }]
             return is_plain_url, formats
 
@@ -2513,7 +2592,7 @@ def _media_formats(src, cur_media_type, type_info={}):
         # so we wll include them right here (see
         # https://www.ampproject.org/docs/reference/components/amp-video)
         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
-        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
+        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)'
         media_tags = [(media_tag, media_tag_name, media_type, '')
                       for media_tag, media_tag_name, media_type
                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
@@ -2531,7 +2610,8 @@ def _media_formats(src, cur_media_type, type_info={}):
             media_attributes = extract_attributes(media_tag)
             src = strip_or_none(media_attributes.get('src'))
             if src:
-                _, formats = _media_formats(src, media_type)
+                f = parse_content_type(media_attributes.get('type'))
+                _, formats = _media_formats(src, media_type, f)
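The _search_nextjs_data() helper added above boils down to grabbing the JSON that Next.js embeds in a <script id="__NEXT_DATA__"> element and handing it to the JSON parser; a self-contained sketch of the same idea (the sample page payload is invented for illustration):

    import json
    import re

    html = ('<script id="__NEXT_DATA__" type="application/json">'
            '{"props": {"pageProps": {"episode": {"id": "abc123"}}}}</script>')

    m = re.search(
        r'<script[^>]+\bid\s*=\s*("|\')__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>',
        html)
    next_data = json.loads(m.group('nd')) if m else {}
    print(next_data['props']['pageProps']['episode']['id'])  # -> abc123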
                media_info['formats'].extend(formats)
             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
             if media_content:
@@ -2706,7 +2786,7 @@ def manifest_url(manifest):
 
     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
         mobj = re.search(
-            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
+            r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
             webpage)
         if mobj:
             try:
@@ -2727,9 +2807,14 @@ def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
 
     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        flat_pl = try_get(jwplayer_data, lambda x: x.get('playlist') or True)
+        if flat_pl is None:
+            # not even a dict
+            return []
+
         # JWPlayer backward compatibility: flattened playlists
         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
-        if 'playlist' not in jwplayer_data:
+        if flat_pl is True:
             jwplayer_data = {'playlist': [jwplayer_data]}
 
         entries = []
@@ -2777,6 +2862,13 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                 'timestamp': int_or_none(video_data.get('pubdate')),
                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                 'subtitles': subtitles,
+                'alt_title': clean_html(video_data.get('subtitle')),  # attributes used e.g. by Tele5 ...
+                'genre': clean_html(video_data.get('genre')),
+                'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
+                'season_number': int_or_none(video_data.get('season')),
+                'episode_number': int_or_none(video_data.get('episode')),
+                'release_year': int_or_none(video_data.get('releasedate')),
+                'age_limit': int_or_none(video_data.get('age_restriction')),
             }
             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
@@ -2785,7 +2877,9 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                     'url': formats[0]['url'],
                 })
             else:
-                self._sort_formats(formats)
+                # avoid exception in case of only sttls
+                if formats:
+                    self._sort_formats(formats)
                 entry['formats'] = formats
             entries.append(entry)
         if len(entries) == 1:
@@ -2795,7 +2889,7 @@ def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
 
     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
-        urls = []
+        urls = set()
         formats = []
         for source in jwplayer_sources_data:
             if not isinstance(source, dict):
@@ -2804,14 +2898,14 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                 base_url, self._proto_relative_url(source.get('file')))
             if not source_url or source_url in urls:
                 continue
-            urls.append(source_url)
+            urls.add(source_url)
             source_type = source.get('type') or ''
             ext = mimetype2ext(source_type) or determine_ext(source_url)
-            if source_type == 'hls' or ext == 'm3u8':
+            if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
                 formats.extend(self._extract_m3u8_formats(
                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                     m3u8_id=m3u8_id, fatal=False))
-            elif source_type == 'dash' or ext == 'mpd':
+            elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
                 formats.extend(self._extract_mpd_formats(
                     source_url, video_id, mpd_id=mpd_id, fatal=False))
             elif ext == 'smil':
@@ -2826,20 +2920,23 @@ def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                     'ext': ext,
                 })
             else:
+                format_id = str_or_none(source.get('label'))
                 height = int_or_none(source.get('height'))
-                if height is None:
+                if height is None and format_id:
                     # Often no height is provided but there is a label in
                     # format like "1080p", "720p SD", or 1080.
-                    height = int_or_none(self._search_regex(
-                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
-                        'height', default=None))
+                    height = parse_resolution(format_id).get('height')
                 a_format = {
                     'url': source_url,
                     'width': int_or_none(source.get('width')),
                     'height': height,
-                    'tbr': int_or_none(source.get('bitrate')),
+                    'tbr': int_or_none(source.get('bitrate'), scale=1000),
+                    'filesize': int_or_none(source.get('filesize')),
                     'ext': ext,
                 }
+                if format_id:
+                    a_format['format_id'] = format_id
+
                 if source_url.startswith('rtmp'):
                     a_format['ext'] = 'flv'
                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
@@ -2894,10 +2991,10 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
         self._downloader.cookiejar.set_cookie(cookie)
 
     def _get_cookies(self, url):
-        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+        """ Return a compat_cookies_SimpleCookie with the cookies for the url """
         req = sanitized_Request(url)
         self._downloader.cookiejar.add_cookie_header(req)
-        return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+        return compat_cookies_SimpleCookie(req.get_header('Cookie'))
 
     def _apply_first_set_cookie_header(self, url_handle, cookie):
         """
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/cpac.py b/script.module.youtube.dl/lib/youtube_dl/extractor/cpac.py
new file mode 100644
index 000000000..22741152c
--- /dev/null
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/cpac.py
@@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    str_or_none,
+    try_get,
+    unified_timestamp,
+    update_url_query,
+    urljoin,
+)
+
+# compat_range
+try:
+    if callable(xrange):
+        range = xrange
+except (NameError, TypeError):
+    pass
+
+
+class CPACIE(InfoExtractor):
+    IE_NAME = 'cpac'
+    _VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?P<fr>l-)?episode\?id=(?P<id>[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})'
+    _TEST = {
+        # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909',
+        'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f',
+        'md5': 'e46ad699caafd7aa6024279f2614e8fa',
+        'info_dict': {
+            'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f',
+            'ext': 'mp4',
+            'upload_date': '20220215',
+            'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022',
+            'description': 'md5:466a206abd21f3a6f776cdef290c23fb',
+            'timestamp': 1644901200,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'hls_prefer_native': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        url_lang = 'fr' if '/l-episode?'
in url else 'en' + + content = self._download_json( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id, + video_id) + video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str) + formats = [] + if video_url: + content = content['page'] + title = str_or_none(content['details']['title_%s_t' % (url_lang, )]) + formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4') + for fmt in formats: + # prefer language to match URL + fmt_lang = fmt.get('language') + if fmt_lang == url_lang: + fmt['language_preference'] = 10 + elif not fmt_lang: + fmt['language_preference'] = -1 + else: + fmt['language_preference'] = -10 + + self._sort_formats(formats) + + category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) + + def is_live(v_type): + return (v_type == 'live') if v_type is not None else None + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))), + 'timestamp': unified_timestamp(content['details'].get('liveDateTime')), + 'category': [category] if category else None, + 'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))), + 'is_live': is_live(content['details'].get('type')), + } + + +class CPACPlaylistIE(InfoExtractor): + IE_NAME = 'cpac:playlist' + _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?Pemission|rechercher))\?(?:[^&]+&)*?(?P(?:id=\d+|programId=\d+|key=[^&]+))' + + _TESTS = [{ + 'url': 'https://www.cpac.ca/program?id=6', + 'info_dict': { + 'id': 'id=6', + 'title': 'Headline Politics', + 'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc', + 'info_dict': { + 'id': 'key=hudson', + 'title': 'hudson', + }, + 'playlist_count': 22, + }, { + 'url': 'https://www.cpac.ca/search?programId=50', + 'info_dict': { + 'id': 'programId=50', + 'title': '50', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.cpac.ca/emission?id=6', + 'only_matching': True, + }, { + 'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en' + pl_type, list_type = ('program', 'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult') + api_url = ( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s' + % (pl_type, video_id, )) + content = self._download_json(api_url, video_id) + entries = [] + total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1) + for page in range(1, total_pages + 1): + if page > 1: + api_url = update_url_query(api_url, {'page': '%d' % (page, ), }) + content = self._download_json( + api_url, video_id, + note='Downloading continuation - %d' % (page, ), + fatal=False) + + for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []: + episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )])) + if episode_url: + entries.append(episode_url) + + return self.playlist_result( + (self.url_result(entry) for entry in entries), + playlist_id=video_id, + 
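CPACPlaylistIE above pages through listings by re-requesting the same API URL with an incrementing page parameter until totalPages is exhausted. A generic sketch of that pagination pattern (endpoint shape and field names are placeholders, not CPAC's actual schema):

    import json
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib2 import urlopen  # Python 2, which youtube-dl still supports

    def fetch_all_items(api_url):
        # the first response reports how many pages exist; walk the rest
        first = json.loads(urlopen(api_url).read().decode('utf-8'))
        items = list(first.get('item') or [])
        for page in range(2, int(first.get('totalPages') or 1) + 1):
            data = json.loads(
                urlopen('%s&page=%d' % (api_url, page)).read().decode('utf-8'))
            items.extend(data.get('item') or [])
        return items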
playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1], + playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]), + ) diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/curiositystream.py b/script.module.youtube.dl/lib/youtube_dl/extractor/curiositystream.py index e4a7fca6c..48ff30432 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/curiositystream.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/curiositystream.py @@ -25,12 +25,12 @@ def _handle_errors(self, result): raise ExtractorError( '%s said: %s' % (self.IE_NAME, error), expected=True) - def _call_api(self, path, video_id): + def _call_api(self, path, video_id, query=None): headers = {} if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( - self._API_BASE_URL + path, video_id, headers=headers) + self._API_BASE_URL + path, video_id, headers=headers, query=query) self._handle_errors(result) return result['data'] @@ -52,62 +52,75 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P\d+)' _TEST = { 'url': 'https://app.curiositystream.com/video/2', - 'md5': '262bb2f257ff301115f1973540de8983', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', - } + }, + 'params': { + 'format': 'bestvideo', + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url) - media = self._call_api('media/' + video_id, video_id) - title = media['title'] formats = [] - for encoding in media.get('encodings', []): - m3u8_url = encoding.get('master_playlist_url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - encoding_url = encoding.get('url') - file_url = encoding.get('file_url') - if not encoding_url and not file_url: - continue - f = { - 'width': int_or_none(encoding.get('width')), - 'height': int_or_none(encoding.get('height')), - 'vbr': int_or_none(encoding.get('video_bitrate')), - 'abr': int_or_none(encoding.get('audio_bitrate')), - 'filesize': int_or_none(encoding.get('size_in_bytes')), - 'vcodec': encoding.get('video_codec'), - 'acodec': encoding.get('audio_codec'), - 'container': encoding.get('container_type'), - } - for f_url in (encoding_url, file_url): - if not f_url: + for encoding_format in ('m3u8', 'mpd'): + media = self._call_api('media/' + video_id, video_id, query={ + 'encodingsNew': 'true', + 'encodingsFormat': encoding_format, + }) + for encoding in media.get('encodings', []): + playlist_url = encoding.get('master_playlist_url') + if encoding_format == 'm3u8': + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol + formats.extend(self._extract_m3u8_formats( + playlist_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + elif encoding_format == 'mpd': + formats.extend(self._extract_mpd_formats( + playlist_url, video_id, mpd_id='dash', fatal=False)) + encoding_url = encoding.get('url') + file_url = encoding.get('file_url') + if not encoding_url and not file_url: continue - fmt = f.copy() - rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp[34]:.+)$', f_url) - if rtmp: - fmt.update({ - 'url': rtmp.group('url'), - 'play_path': rtmp.group('playpath'), - 'app': 
rtmp.group('app'), - 'ext': 'flv', - 'format_id': 'rtmp', - }) - else: - fmt.update({ - 'url': f_url, - 'format_id': 'http', - }) - formats.append(fmt) + f = { + 'width': int_or_none(encoding.get('width')), + 'height': int_or_none(encoding.get('height')), + 'vbr': int_or_none(encoding.get('video_bitrate')), + 'abr': int_or_none(encoding.get('audio_bitrate')), + 'filesize': int_or_none(encoding.get('size_in_bytes')), + 'vcodec': encoding.get('video_codec'), + 'acodec': encoding.get('audio_codec'), + 'container': encoding.get('container_type'), + } + for f_url in (encoding_url, file_url): + if not f_url: + continue + fmt = f.copy() + rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp[34]:.+)$', f_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': 'rtmp', + }) + else: + fmt.update({ + 'url': f_url, + 'format_id': 'http', + }) + formats.append(fmt) self._sort_formats(formats) + title = media['title'] + subtitles = {} for closed_caption in media.get('closed_captions', []): sub_url = closed_caption.get('file') @@ -132,7 +145,7 @@ def _real_extract(self, url): class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collection|series)/(?P\d+)' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P\d+)' _TESTS = [{ 'url': 'https://app.curiositystream.com/collection/2', 'info_dict': { @@ -140,10 +153,13 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): 'title': 'Curious Minds: The Internet', 'description': 'How is the internet shaping our lives in the 21st Century?', }, - 'playlist_mincount': 17, + 'playlist_mincount': 16, }, { 'url': 'https://curiositystream.com/series/2', 'only_matching': True, + }, { + 'url': 'https://curiositystream.com/collections/36', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/dispeak.py b/script.module.youtube.dl/lib/youtube_dl/extractor/dispeak.py index c345e0274..276fd4b09 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/dispeak.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/dispeak.py @@ -32,6 +32,18 @@ class DigitallySpeakingIE(InfoExtractor): # From http://www.gdcvault.com/play/1013700/Advanced-Material 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', 'only_matching': True, + }, { + # From https://gdcvault.com/play/1016624, empty speakerVideo + 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml', + 'info_dict': { + 'id': '201210-822101_1349794556671DDDD', + 'ext': 'flv', + 'title': 'Pre-launch - Preparing to Take the Plunge', + }, + }, { + # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo + 'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml', + 'only_matching': True, }] def _parse_mp4(self, metadata): @@ -84,26 +96,20 @@ def _parse_flv(self, metadata): 'vcodec': 'none', 'format_id': audio.get('code'), }) - slide_video_path = xpath_text(metadata, './slideVideo', fatal=True) - formats.append({ - 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, - 'play_path': remove_end(slide_video_path, '.flv'), - 'ext': 'flv', - 'format_note': 'slide deck video', - 'quality': -2, - 'preference': -2, - 'format_id': 'slides', - }) - speaker_video_path = xpath_text(metadata, 
'./speakerVideo', fatal=True)
-        formats.append({
-            'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
-            'play_path': remove_end(speaker_video_path, '.flv'),
-            'ext': 'flv',
-            'format_note': 'speaker video',
-            'quality': -1,
-            'preference': -1,
-            'format_id': 'speaker',
-        })
+        for video_key, format_id, preference in (
+                ('slide', 'slides', -2), ('speaker', 'speaker', -1)):
+            video_path = xpath_text(metadata, './%sVideo' % video_key)
+            if not video_path:
+                continue
+            formats.append({
+                'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+                'play_path': remove_end(video_path, '.flv'),
+                'ext': 'flv',
+                'format_note': '%s video' % video_key,
+                'quality': preference,
+                'preference': preference,
+                'format_id': format_id,
+            })
         return formats
 
     def _real_extract(self, url):
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/dlf.py b/script.module.youtube.dl/lib/youtube_dl/extractor/dlf.py
new file mode 100644
index 000000000..cc3de4582
--- /dev/null
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/dlf.py
@@ -0,0 +1,204 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+)
+from ..utils import (
+    determine_ext,
+    extract_attributes,
+    int_or_none,
+    merge_dicts,
+    traverse_obj,
+    url_or_none,
+    variadic,
+)
+
+
+class DLFBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?deutschlandfunk\.de/'
+    _BUTTON_REGEX = r'(<button[^>]+alt="Anhören"[^>]+data-audio-diraid[^>]*>)'
+
+    def _parse_button_attrs(self, button, audio_id=None):
+        attrs = extract_attributes(button)
+        audio_id = audio_id or attrs['data-audio-diraid']
+
+        url = traverse_obj(
+            attrs, 'data-audio-download-src', 'data-audio', 'data-audioreference',
+            'data-audio-src', expected_type=url_or_none)
+        ext = determine_ext(url)
+        formats = (self._extract_m3u8_formats(url, audio_id, fatal=False)
+                   if ext == 'm3u8' else [{'url': url, 'ext': ext, 'vcodec': 'none'}])
+        self._sort_formats(formats)
+
+        def traverse_attrs(path):
+            path = list(variadic(path))
+            t = path.pop() if callable(path[-1]) else None
+            return traverse_obj(attrs, path, expected_type=t, get_all=False)
+
+        def txt_or_none(v, default=None):
+            return default if v is None else (compat_str(v).strip() or default)
+
+        return merge_dicts(*reversed([{
+            'id': audio_id,
+            # 'extractor_key': DLFIE.ie_key(),
+            # 'extractor': DLFIE.IE_NAME,
+            'formats': formats,
+        }, dict((k, traverse_attrs(v)) for k, v in {
+            'title': (('data-audiotitle', 'data-audio-title', 'data-audio-download-tracking-title'), txt_or_none),
+            'duration': (('data-audioduration', 'data-audio-duration'), int_or_none),
+            'thumbnail': ('data-audioimage', url_or_none),
+            'uploader': 'data-audio-producer',
+            'series': 'data-audio-series',
+            'channel': 'data-audio-origin-site-name',
+            'webpage_url': ('data-audio-download-tracking-path', url_or_none),
+        }.items())]))
+
+
+class DLFIE(DLFBaseIE):
+    IE_NAME = 'dlf'
+    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'[\w-]+-dlf-(?P<id>[\da-f]{8})-100\.html'
+    _TESTS = [
+        # Audio as an HLS stream
+        {
+            'url': 'https://www.deutschlandfunk.de/tanz-der-saiteninstrumente-das-wild-strings-trio-aus-slowenien-dlf-03a3eb19-100.html',
+            'info_dict': {
+                'id': '03a3eb19',
+                'title': r're:Tanz der Saiteninstrumente [-/] Das Wild Strings Trio aus Slowenien',
+                'ext': 'm4a',
+                'duration': 3298,
+                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
+                'uploader': 'Deutschlandfunk',
+                'series': 'On Stage',
+                'channel': 'deutschlandfunk'
+            },
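DLFBaseIE._parse_button_attrs() above reads all episode metadata straight from the data-* attributes of the page's play button. A toy version of the same idea (the sample tag is hand-written; the attribute names follow the patch):

    import re

    button = ('<button alt="Anhören" data-audio-diraid="d9cc1856" '
              'data-audio-duration="291" '
              'data-audioreference="https://example.org/a.mp3">')

    attrs = dict(re.findall(r'([\w-]+)="([^"]*)"', button))
    print(attrs['data-audio-diraid'])         # -> d9cc1856
    print(int(attrs['data-audio-duration']))  # -> 291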
+            'params': {
+                'skip_download': 'm3u8'
+            },
+            'skip': 'This webpage no longer exists'
+        }, {
+            'url': 'https://www.deutschlandfunk.de/russische-athleten-kehren-zurueck-auf-die-sportbuehne-ein-gefaehrlicher-tueroeffner-dlf-d9cc1856-100.html',
+            'info_dict': {
+                'id': 'd9cc1856',
+                'title': 'Russische Athleten kehren zurück auf die Sportbühne: Ein gefährlicher Türöffner',
+                'ext': 'mp3',
+                'duration': 291,
+                'thumbnail': 'https://assets.deutschlandfunk.de/FALLBACK-IMAGE-AUDIO/512x512.png?t=1603714364673',
+                'uploader': 'Deutschlandfunk',
+                'series': 'Kommentare und Themen der Woche',
+                'channel': 'deutschlandfunk'
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+        webpage = self._download_webpage(url, audio_id)
+
+        return self._parse_button_attrs(
+            self._search_regex(self._BUTTON_REGEX, webpage, 'button'), audio_id)
+
+
+class DLFCorpusIE(DLFBaseIE):
+    IE_NAME = 'dlf:corpus'
+    IE_DESC = 'DLF Multi-feed Archives'
+    _VALID_URL = DLFBaseIE._VALID_URL_BASE + r'(?P<id>(?![\w-]+-dlf-[\da-f]{8})[\w-]+-\d+)\.html'
+    _TESTS = [
+        # Recorded news broadcast with referrals to related broadcasts
+        {
+            'url': 'https://www.deutschlandfunk.de/fechten-russland-belarus-ukraine-protest-100.html',
+            'info_dict': {
+                'id': 'fechten-russland-belarus-ukraine-protest-100',
+                'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
+                'description': 'md5:91340aab29c71aa7518ad5be13d1e8ad'
+            },
+            'playlist_mincount': 5,
+            'playlist': [{
+                'info_dict': {
+                    'id': '1fc5d64a',
+                    'title': r're:Wiederzulassung als neutrale Athleten [-/] Was die Rückkehr russischer und belarussischer Sportler beim Fechten bedeutet',
+                    'ext': 'mp3',
+                    'duration': 252,
+                    'thumbnail': 'https://assets.deutschlandfunk.de/aad16241-6b76-4a09-958b-96d0ee1d6f57/512x512.jpg?t=1679480020313',
+                    'uploader': 'Deutschlandfunk',
+                    'series': 'Sport',
+                    'channel': 'deutschlandfunk'
+                }
+            }, {
+                'info_dict': {
+                    'id': '2ada145f',
+                    'title': r're:(?:Sportpolitik / )?Fechtverband votiert für Rückkehr russischer Athleten',
+                    'ext': 'mp3',
+                    'duration': 336,
+                    'thumbnail': 'https://assets.deutschlandfunk.de/FILE_93982766f7317df30409b8a184ac044a/512x512.jpg?t=1678547581005',
+                    'uploader': 'Deutschlandfunk',
+                    'series': 'Deutschlandfunk Nova',
+                    'channel': 'deutschlandfunk-nova'
+                }
+            }, {
+                'info_dict': {
+                    'id': '5e55e8c9',
+                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
+                    'ext': 'mp3',
+                    'duration': 187,
+                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
+                    'uploader': 'Deutschlandfunk',
+                    'series': 'Sport am Samstag',
+                    'channel': 'deutschlandfunk'
+                }
+            }, {
+                'info_dict': {
+                    'id': '47e1a096',
+                    'title': r're:Rückkehr Russlands im Fechten [-/] "Fassungslos, dass es einfach so passiert ist"',
+                    'ext': 'mp3',
+                    'duration': 602,
+                    'thumbnail': 'https://assets.deutschlandfunk.de/da4c494a-21cc-48b4-9cc7-40e09fd442c2/512x512.jpg?t=1678562155770',
+                    'uploader': 'Deutschlandfunk',
+                    'series': 'Sport am Samstag',
+                    'channel': 'deutschlandfunk'
+                }
+            }, {
+                'info_dict': {
+                    'id': '5e55e8c9',
+                    'title': r're:Wiederzulassung von Russland und Belarus [-/] "Herumlavieren" des Fechter-Bundes sorgt für Unverständnis',
+                    'ext': 'mp3',
+                    'duration': 187,
+                    'thumbnail': 'https://assets.deutschlandfunk.de/a595989d-1ed1-4a2e-8370-b64d7f11d757/512x512.jpg?t=1679173825412',
+                    'uploader': 'Deutschlandfunk',
+                    'series': 'Sport am Samstag',
+                    'channel': 'deutschlandfunk'
+                }
+            }]
+        },
+        # Podcast feed with tag buttons, playlist count fluctuates
+        {
+            'url': 'https://www.deutschlandfunk.de/kommentare-und-themen-der-woche-100.html',
+            'info_dict': {
+                'id': 'kommentare-und-themen-der-woche-100',
+                'title': 'Meinung - Kommentare und Themen der Woche',
+                'description': 'md5:2901bbd65cd2d45e116d399a099ce5d5',
+            },
+            'playlist_mincount': 10,
+        },
+        # Podcast feed with no description
+        {
+            'url': 'https://www.deutschlandfunk.de/podcast-tolle-idee-100.html',
+            'info_dict': {
+                'id': 'podcast-tolle-idee-100',
+                'title': 'Wissenschaftspodcast - Tolle Idee! - Was wurde daraus?',
+            },
+            'playlist_mincount': 11,
+        },
+    ]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+
+        return self.playlist_result(
+            map(self._parse_button_attrs, re.findall(self._BUTTON_REGEX, webpage)),
+            playlist_id, self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None),
+            self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage, default=None))
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/egghead.py b/script.module.youtube.dl/lib/youtube_dl/extractor/egghead.py
index aff9b88c0..9bbd703e0 100644
--- a/script.module.youtube.dl/lib/youtube_dl/extractor/egghead.py
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/egghead.py
@@ -22,16 +22,19 @@ def _call_api(self, path, video_id, resource, fatal=True):
 class EggheadCourseIE(EggheadBaseIE):
     IE_DESC = 'egghead.io course'
     IE_NAME = 'egghead:course'
-    _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'
-    _TEST = {
+    _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P<id>[^/?#&]+)'
+    _TESTS = [{
         'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',
         'playlist_count': 29,
         'info_dict': {
-            'id': '72',
+            'id': '432655',
             'title': 'Professor Frisby Introduces Composable Functional JavaScript',
             'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$',
         },
-    }
+    }, {
+        'url': 'https://app.egghead.io/playlists/professor-frisby-introduces-composable-functional-javascript',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
@@ -65,7 +68,7 @@ def _real_extract(self, url):
 class EggheadLessonIE(EggheadBaseIE):
     IE_DESC = 'egghead.io lesson'
     IE_NAME = 'egghead:lesson'
-    _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
         'info_dict': {
@@ -88,6 +91,9 @@ class EggheadLessonIE(EggheadBaseIE):
     }, {
         'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application',
         'only_matching': True,
+    }, {
+        'url': 'https://app.egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
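The widened egghead patterns accept the app. subdomain and both /courses/ and /playlists/ paths. A minimal regex check against the two course-test URLs above:

# Pure-stdlib check of the _VALID_URL from the hunk above.
import re

_VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P<id>[^/?#&]+)'

for url in (
        'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',
        'https://app.egghead.io/playlists/professor-frisby-introduces-composable-functional-javascript'):
    print(re.match(_VALID_URL, url).group('id'))  # same display id both times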
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/eroprofile.py b/script.module.youtube.dl/lib/youtube_dl/extractor/eroprofile.py
index c08643a17..c460dc7f9 100644
--- a/script.module.youtube.dl/lib/youtube_dl/extractor/eroprofile.py
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/eroprofile.py
@@ -6,7 +6,7 @@
 from ..compat import compat_urllib_parse_urlencode
 from ..utils import (
     ExtractorError,
-    unescapeHTML
+    merge_dicts,
 )
 
 
@@ -24,7 +24,8 @@ class EroProfileIE(InfoExtractor):
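For the eroprofile rewrite below, the interesting bit is merge_dicts: fields scraped from the HTML5 video tag (url, formats, thumbnail) now come from _parse_html5_media_entries, and the explicit dict only pins the values the extractor knows better. As I read merge_dicts, earlier arguments win for non-empty values; a sketch with fabricated inputs:

# Assumes merge_dicts semantics from youtube_dl.utils: first non-empty
# value per key wins, later dicts fill in the gaps.
from youtube_dl.utils import merge_dicts

info = {'url': 'https://example.com/video.mp4', 'title': ''}  # stand-in for _parse_html5_media_entries(...)[0]
print(merge_dicts(info, {
    'id': '1234',
    'title': 'sexy babe softcore',
    'age_limit': 18,
}))
# url kept from info, the empty title replaced by the scraped one, id/age_limit added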
class EroProfileIE(InfoExtractor):
             'title': 'sexy babe softcore',
             'thumbnail': r're:https?://.*\.jpg',
             'age_limit': 18,
-        }
+        },
+        'skip': 'Video not found',
     }, {
         'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file',
         'md5': '1baa9602ede46ce904c431f5418d8916',
@@ -77,19 +78,15 @@ def _real_extract(self, url):
             [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
             webpage, 'video id', default=None)
 
-        video_url = unescapeHTML(self._search_regex(
-            r'<source src="([^"]+)', webpage, 'video url'))
-        title = self._html_search_regex(
-            r'Title:</th><td>([^<]+)</td>', webpage, 'title')
-        thumbnail = self._search_regex(
-            r'onclick="showVideoPlayer\(\)"><img src="([^"]+)',
-            webpage, 'thumbnail', fatal=False)
+        title = self._html_search_regex(
+            (r'Title:</th><td>([^<]+)</td>',
+             r'<h1[^>]*>(.+?)</h1>'),
+            webpage, 'title')
+
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
 
-        return {
+        return merge_dicts(info, {
             'id': video_id,
             'display_id': display_id,
-            'url': video_url,
             'title': title,
-            'thumbnail': thumbnail,
             'age_limit': 18,
-        }
+        })
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/extractors.py b/script.module.youtube.dl/lib/youtube_dl/extractor/extractors.py
index 07a8af055..d9289e5bf 100644
--- a/script.module.youtube.dl/lib/youtube_dl/extractor/extractors.py
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/extractors.py
@@ -51,6 +51,10 @@
 from .aol import AolIE
 from .allocine import AllocineIE
 from .aliexpress import AliExpressLiveIE
+from .alsace20tv import (
+    Alsace20TVIE,
+    Alsace20TVEmbedIE,
+)
 from .apa import APAIE
 from .aparat import AparatIE
 from .appleconnect import AppleConnectIE
@@ -71,7 +75,9 @@
     ArteTVIE,
     ArteTVEmbedIE,
     ArteTVPlaylistIE,
+    ArteTVCategoryIE,
 )
+from .arnes import ArnesIE
 from .asiancrush import (
     AsianCrushIE,
     AsianCrushPlaylistIE,
@@ -95,7 +101,8 @@
 from .bbc import (
     BBCCoUkIE,
     BBCCoUkArticleIE,
-    BBCCoUkIPlayerPlaylistIE,
+    BBCCoUkIPlayerEpisodesIE,
+    BBCCoUkIPlayerGroupIE,
     BBCCoUkPlaylistIE,
     BBCIE,
 )
@@ -112,6 +119,7 @@
 )
 from .bibeltv import BibelTVIE
 from .bigflix import BigflixIE
+from .bigo import BigoIE
 from .bild import BildIE
 from .bilibili import (
     BiliBiliIE,
@@ -130,7 +138,7 @@
     BleacherReportIE,
     BleacherReportCMSIE,
 )
-from .blinkx import BlinkxIE
+from .blerp import BlerpIE
 from .bloomberg import BloombergIE
 from .bokecc import BokeCCIE
 from .bongacams import BongaCamsIE
@@ -151,6 +159,7 @@
 from .buzzfeed import BuzzFeedIE
 from .byutv import BYUtvIE
 from .c56 import C56IE
+from .callin import CallinIE
 from .camdemy import (
     CamdemyIE,
     CamdemyFolderIE
@@ -189,7 +198,11 @@
     CBSNewsIE,
     CBSNewsLiveVideoIE,
 )
-from .cbssports import CBSSportsIE
+from .cbssports import (
+    CBSSportsEmbedIE,
+    CBSSportsIE,
+    TwentyFourSevenSportsIE,
+)
 from .ccc import (
     CCCIE,
     CCCPlaylistIE,
@@ -197,10 +210,7 @@
 from .ccma import CCMAIE
 from .cctv import CCTVIE
 from .cda import CDAIE
-from .ceskatelevize import (
-    CeskaTelevizeIE,
-    CeskaTelevizePoradyIE,
-)
+from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
 from .charlierose import CharlieRoseIE
 from .chaturbate import ChaturbateIE
@@ -216,6 +226,7 @@
     CiscoLiveSearchIE,
 )
 from .cjsw import CJSWIE
+from .clipchamp import ClipchampIE
 from .cliphunter import CliphunterIE
 from .clippit import ClippitIE
 from .cliprs import ClipRsIE
@@ -248,6 +259,10 @@
 from .condenast import CondeNastIE
 from .contv import CONtvIE
 from .corus import CorusIE
+from .cpac import (
+    CPACIE,
+    CPACPlaylistIE,
+)
 from .cracked import CrackedIE
 from .crackle import CrackleIE
 from .crooksandliars import CrooksAndLiarsIE
@@ -281,6 +296,10 @@
 from .dctp import DctpTvIE
 from .deezer import DeezerPlaylistIE
 from .democracynow import DemocracynowIE
+from .dlf import (
+    DLFCorpusIE,
+    DLFIE,
+)
 from .dfb import DFBIE
 from .dhm import DHMIE
 from .digg import DiggIE
@@ -362,6 +381,8 @@
     FC2EmbedIE,
 )
 from .fczenit import FczenitIE
+from .filemoon import FileMoonIE
+from .fifa import FifaIE
 from .filmon import (
     FilmOnIE,
     FilmOnChannelIE,
@@ -428,6 +449,13 @@
 from .giantbomb import GiantBombIE
 from .giga import GigaIE
 from .glide import GlideIE
+from .globalplayer import (
+    GlobalPlayerLiveIE,
+    GlobalPlayerLivePlaylistIE,
+    GlobalPlayerAudioIE,
+    GlobalPlayerAudioEpisodeIE,
+    GlobalPlayerVideoIE
+)
 from .globo import (
     GloboIE,
     GloboArticleIE,
@@ -464,6 +492,7 @@
 )
 from .howcast import HowcastIE
 from .howstuffworks import HowStuffWorksIE
+from .hrfernsehen import HRFernsehenIE
 from .hrti import (
     HRTiIE,
     HRTiPlaylistIE,
@@ -540,8 +569,10 @@
 from .kickstarter import KickStarterIE
 from .kinja import KinjaEmbedIE
 from .kinopoisk import KinoPoiskIE
+from .kommunetv import KommunetvIE
 from .konserthusetplay import KonserthusetPlayIE
 from .krasview import KrasViewIE
+from .kth import KTHIE
 from .ku6 import Ku6IE
 from .kusi import KUSIIE
 from .kuwo import (
@@ -593,7 +624,11 @@
     LimelightChannelIE,
     LimelightChannelListIE,
 )
-from .line import LineTVIE
+from .line import (
+    LineTVIE,
+    LineLiveIE,
+    LineLiveChannelIE,
+)
 from .linkedin import (
     LinkedInLearningIE,
     LinkedInLearningCourseIE,
@@ -601,10 +636,6 @@
 from .linuxacademy import LinuxAcademyIE
 from .litv import LiTVIE
 from .livejournal import LiveJournalIE
-from .liveleak import (
-    LiveLeakIE,
-    LiveLeakEmbedIE,
-)
 from .livestream import (
     LivestreamIE,
     LivestreamOriginalIE,
@@ -630,6 +661,7 @@
     MangomoloLiveIE,
 )
 from .manyvids import ManyVidsIE
+from .maoritv import MaoriTVIE
 from .markiza import (
     MarkizaIE,
     MarkizaPageIE,
@@ -673,7 +705,10 @@
     MixcloudUserIE,
     MixcloudPlaylistIE,
 )
-from .mlb import MLBIE
+from .mlb import (
+    MLBIE,
+    MLBVideoIE,
+)
 from .mnet import MnetIE
 from .moevideo import MoeVideoIE
 from .mofosex import (
@@ -707,6 +742,7 @@
     MyviIE,
     MyviEmbedIE,
 )
+from .myvideoge import MyVideoGeIE
 from .myvidster import MyVidsterIE
 from .nationalgeographic import (
     NationalGeographicVideoIE,
@@ -780,7 +816,14 @@
     NickNightIE,
     NickRuIE,
 )
-from .niconico import NiconicoIE, NiconicoPlaylistIE
+from .niconico import (
+    NiconicoIE,
+    NiconicoPlaylistIE,
+    NiconicoUserIE,
+    NicovideoSearchIE,
+    NicovideoSearchDateIE,
+    NicovideoSearchURLIE,
+)
 from .ninecninemedia import NineCNineMediaIE
 from .ninegag import NineGagIE
 from .ninenow import NineNowIE
@@ -874,11 +917,20 @@
     PacktPubIE,
     PacktPubCourseIE,
 )
+from .palcomp3 import (
+    PalcoMP3IE,
+    PalcoMP3ArtistIE,
+    PalcoMP3VideoIE,
+)
 from .pandoratv import PandoraTVIE
 from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
 from .pbs import PBSIE
 from .pearvideo import PearVideoIE
+from .peekvids import (
+    PeekVidsIE,
+    PlayVidsIE,
+)
 from .peertube import PeerTubeIE
 from .people import PeopleIE
 from .performgroup import PerformGroupIE
@@ -907,6 +959,7 @@
 from .playfm import PlayFMIE
 from .playplustv import PlayPlusTVIE
 from .plays import PlaysTVIE
+from .playstuff import PlayStuffIE
 from .playtvak import PlaytvakIE
 from .playvid import PlayvidIE
 from .playwire import PlaywireIE
@@ -934,6 +987,10 @@
 from .pornotube import PornotubeIE
 from .pornovoisines import PornoVoisinesIE
 from .pornoxo import PornoXOIE
+from .pr0gramm import (
+    Pr0grammIE,
+    Pr0grammStaticIE,
+)
 from .puhutv import (
     PuhuTVIE,
     PuhuTVSerieIE,
@@ -971,6 +1028,10 @@
     RayWenderlichIE,
     RayWenderlichCourseIE,
 )
+from .rbgtum import (
+    RbgTumIE,
+    RbgTumCourseIE,
+)
 from .rbmaradio import RBMARadioIE
 from .rds import RDSIE
 from .redbulltv import (
@@ -1026,6 +1087,10 @@
 from .rutv import RUTVIE
 from .ruutu import RuutuIE
 from .ruv import RuvIE
+from .s4c import (
+    S4CIE,
+    S4CSeriesIE,
+)
 from .safari import (
     SafariIE,
     SafariApiIE,
@@ -1161,6 +1226,7 @@
 from .streamable import StreamableIE
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
+from .streamsb import StreamsbIE
 from .streetvoice import StreetVoiceIE
 from .stretchinternet import StretchInternetIE
 from .stv import STVPlayerIE
@@ -1230,6 +1296,11 @@
 from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
+from .thisvid import (
+    ThisVidIE,
+    ThisVidMemberIE,
+    ThisVidPlaylistIE,
+)
 from .threeqsdn import ThreeQSDNIE
 from .tiktok import (
     TikTokIE,
@@ -1514,6 +1585,7 @@
     WeiboMobileIE
 )
 from .weiqitv import WeiqiTVIE
+from .whyp import WhypIE
 from .wistia import (
     WistiaIE,
     WistiaPlaylistIE,
@@ -1591,7 +1663,7 @@
     YoutubeRecommendedIE,
     YoutubeSearchDateIE,
     YoutubeSearchIE,
-    #YoutubeSearchURLIE,
+    YoutubeSearchURLIE,
     YoutubeSubscriptionsIE,
     YoutubeTruncatedIDIE,
     YoutubeTruncatedURLIE,
@@ -1621,5 +1693,9 @@
 )
 from .zdf import ZDFIE, ZDFChannelIE
 from .zhihu import ZhihuIE
-from .zingmp3 import ZingMp3IE
+from .zingmp3 import (
+    ZingMp3IE,
+    ZingMp3AlbumIE,
+)
+from .zoom import ZoomIE
 from .zype import ZypeIE
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/facebook.py b/script.module.youtube.dl/lib/youtube_dl/extractor/facebook.py
index cb34c59f5..04650af39 100644
--- a/script.module.youtube.dl/lib/youtube_dl/extractor/facebook.py
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/facebook.py
@@ -521,7 +521,10 @@ def parse_attachment(attachment, key='media'):
             raise ExtractorError(
                 'The video is not available, Facebook said: "%s"' % m_msg.group(1),
                 expected=True)
-        elif '>You must log in to continue' in webpage:
+        elif any(p in webpage for p in (
+                '>You must log in to continue',
+                'id="login_form"',
+                'id="loginbutton"')):
             self.raise_login_required()
 
         if not video_data and '/watchparty/' in url:
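The facebook login wall is now detected by any of three markers instead of a single string. Trivial to exercise in isolation; the page body here is fabricated:

webpage = '<html><form id="login_form" action="/login/"></form></html>'  # fabricated

print(any(p in webpage for p in (
    '>You must log in to continue',
    'id="login_form"',
    'id="loginbutton"')))  # True -> the extractor calls raise_login_required()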
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/fifa.py b/script.module.youtube.dl/lib/youtube_dl/extractor/fifa.py
new file mode 100644
index 000000000..15157774e
--- /dev/null
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/fifa.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+    int_or_none,
+    traverse_obj,
+    unified_timestamp,
+)
+
+if not callable(getattr(InfoExtractor, '_match_valid_url', None)):
+
+    BaseInfoExtractor = InfoExtractor
+
+    import re
+
+    class InfoExtractor(BaseInfoExtractor):
+
+        @classmethod
+        def _match_valid_url(cls, url):
+            return re.match(cls._VALID_URL, url)
+
+
+class FifaIE(InfoExtractor):
+    _VALID_URL = r'https?://www.fifa.com/fifaplus/(?P<locale>\w{2})/watch/([^#?]+/)?(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y',
+        'info_dict': {
+            'id': '7on10qPcnyLajDDU3ntg6y',
+            'title': 'Italy v France | Final | 2006 FIFA World Cup Germany™ | Full Match Replay',
+            'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b',
+            'ext': 'mp4',
+            'categories': ['FIFA Tournaments'],
+            'thumbnail': 'https://digitalhub.fifa.com/transform/135e2656-3a51-407b-8810-6c34bec5b59b/FMR_2006_Italy_France_Final_Hero',
+            'duration': 8165,
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.fifa.com/fifaplus/pt/watch/1cg5r5Qt6Qt12ilkDgb1sV',
+        'info_dict': {
+            'id': '1cg5r5Qt6Qt12ilkDgb1sV',
+            'title': 'Brazil v Germany | Semi-finals | 2014 FIFA World Cup Brazil™ | Extended Highlights',
+            'description': 'md5:d908c74ee66322b804ae2e521b02a855',
+            'ext': 'mp4',
+            'categories': ['FIFA Tournaments', 'Highlights'],
+            'thumbnail': 'https://digitalhub.fifa.com/transform/d8fe6f61-276d-4a73-a7fe-6878a35fd082/FIFAPLS_100EXTHL_2014BRAvGER_TMB',
+            'duration': 902,
+            'release_timestamp': 1404777600,
+            'release_date': '20140708',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'url': 'https://www.fifa.com/fifaplus/fr/watch/3C6gQH9C2DLwzNx7BMRQdp',
+        'info_dict': {
+            'id': '3C6gQH9C2DLwzNx7BMRQdp',
+            'title': 'Josimar goal against Northern Ireland | Classic Goals',
+            'description': 'md5:cbe7e7bb52f603c9f1fe9a4780fe983b',
+            'ext': 'mp4',
+            'categories': ['FIFA Tournaments', 'Goal'],
+            'duration': 28,
+            'thumbnail': 'https://digitalhub.fifa.com/transform/f9301391-f8d9-48b5-823e-c093ac5e3e11/CG_MEN_1986_JOSIMAR',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    def _real_extract(self, url):
+        video_id, locale = self._match_valid_url(url).group('id', 'locale')
+        webpage = self._download_webpage(url, video_id)
+
+        preconnect_link = self._search_regex(
+            r'<link[^>]+\brel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link')
+
+        video_details = self._download_json(
+            '{preconnect_link}/sections/videoDetails/{video_id}'.format(**locals()), video_id, 'Downloading Video Details', fatal=False)
+
+        preplay_parameters = self._download_json(
+            '{preconnect_link}/videoPlayerData/{video_id}'.format(**locals()), video_id, 'Downloading Preplay Parameters')['preplayParameters']
+
+        content_data = self._download_json(
+            # 1. query string is expected to be sent as-is
+            # 2. `sig` must be appended
+            # 3. if absent, the call appears to work but the manifest is bad (404)
+            'https://content.uplynk.com/preplay/{contentId}/multiple.json?{queryStr}&sig={signature}'.format(**preplay_parameters),
+            video_id, 'Downloading Content Data')
+
+        # formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id)
+        formats, subtitles = self._extract_m3u8_formats(content_data['playURL'], video_id, ext='mp4', entry_protocol='m3u8_native'), None
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_details['title'],
+            'description': video_details.get('description'),
+            'duration': int_or_none(video_details.get('duration')),
+            'release_timestamp': unified_timestamp(video_details.get('dateOfRelease')),
+            'categories': traverse_obj(video_details, (('videoCategory', 'videoSubcategory'),)),
+            'thumbnail': traverse_obj(video_details, ('backgroundImage', 'src')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
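The shim at the top of fifa.py backports _match_valid_url for cores whose InfoExtractor predates it. Worth noting how the call site uses it: re.Match.group() with several names returns a tuple, so _real_extract unpacks both ids in one step. Pure-stdlib illustration:

import re

_VALID_URL = r'https?://www.fifa.com/fifaplus/(?P<locale>\w{2})/watch/([^#?]+/)?(?P<id>\w+)'

m = re.match(_VALID_URL, 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y')
video_id, locale = m.group('id', 'locale')
print(video_id, locale)  # 7on10qPcnyLajDDU3ntg6y en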
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/filemoon.py b/script.module.youtube.dl/lib/youtube_dl/extractor/filemoon.py
new file mode 100644
index 000000000..654df9b69
--- /dev/null
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/filemoon.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    decode_packed_codes,
+    js_to_json,
+)
+
+
+class FileMoonIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?filemoon\.sx/./(?P<id>\w+)'
+    _TEST = {
+        'url': 'https://filemoon.sx/e/dw40rxrzruqz',
+        'md5': '5a713742f57ac4aef29b74733e8dda01',
+        'info_dict': {
+            'id': 'dw40rxrzruqz',
+            'title': 'dw40rxrzruqz',
+            'ext': 'mp4'
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        matches = re.findall(r'(?s)(eval.*?)</script>', webpage)
+        packed = matches[-1]
+        unpacked = decode_packed_codes(packed)
+        jwplayer_sources = self._parse_json(
+            self._search_regex(
+                r'(?s)player\s*\.\s*setup\s*\(\s*\{\s*sources\s*:\s*(.*?])', unpacked, 'jwplayer sources'),
+            video_id, transform_source=js_to_json)
+
+        formats = self._parse_jwplayer_formats(jwplayer_sources, video_id)
+
+        return {
+            'id': video_id,
+            'title': self._generic_title(url) or video_id,
+            'formats': formats
+        }
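The jwplayer setup() argument that decode_packed_codes recovers is JavaScript, not JSON (single quotes, bare keys), hence the js_to_json transform. A sketch that skips the packed stage and fabricates the unpacked string:

import json
import re

from youtube_dl.utils import js_to_json

unpacked = "player.setup({sources:[{file:'https://example.com/hls/master.m3u8'}]});"  # fabricated
sources = re.search(
    r'(?s)player\s*\.\s*setup\s*\(\s*\{\s*sources\s*:\s*(.*?])', unpacked).group(1)
print(json.loads(js_to_json(sources)))  # [{'file': 'https://example.com/hls/master.m3u8'}]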
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/formula1.py b/script.module.youtube.dl/lib/youtube_dl/extractor/formula1.py
index fecfc28ae..67662e6de 100644
--- a/script.module.youtube.dl/lib/youtube_dl/extractor/formula1.py
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/formula1.py
@@ -5,29 +5,23 @@
 
 
 class Formula1IE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html'
-    _TESTS = [{
-        'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html',
-        'md5': '8c79e54be72078b26b89e0e111c0502b',
+    _VALID_URL = r'https?://(?:www\.)?formula1\.com/en/latest/video\.[^.]+\.(?P<id>\d+)\.html'
+    _TEST = {
+        'url': 'https://www.formula1.com/en/latest/video.race-highlights-spain-2016.6060988138001.html',
+        'md5': 'be7d3a8c2f804eb2ab2aa5d941c359f8',
         'info_dict': {
-            'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV',
+            'id': '6060988138001',
             'ext': 'mp4',
             'title': 'Race highlights - Spain 2016',
+            'timestamp': 1463332814,
+            'upload_date': '20160515',
+            'uploader_id': '6057949432001',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'add_ie': ['Ooyala'],
-    }, {
-        'url': 'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html',
-        'only_matching': True,
-    }]
+        'add_ie': ['BrightcoveNew'],
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/6057949432001/S1WMrhjlh_default/index.html?videoId=%s'
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        ooyala_embed_code = self._search_regex(
-            r'data-videoid="([^"]+)"', webpage, 'ooyala embed code')
+        bc_id = self._match_id(url)
         return self.url_result(
-            'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code)
+            self.BRIGHTCOVE_URL_TEMPLATE % bc_id, 'BrightcoveNew', bc_id)
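With Ooyala gone, the new-style Formula 1 URL carries the Brightcove video id directly, so extraction reduces to building a player URL and delegating to BrightcoveNew:

import re

BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/6057949432001/S1WMrhjlh_default/index.html?videoId=%s'
_VALID_URL = r'https?://(?:www\.)?formula1\.com/en/latest/video\.[^.]+\.(?P<id>\d+)\.html'

bc_id = re.match(
    _VALID_URL,
    'https://www.formula1.com/en/latest/video.race-highlights-spain-2016.6060988138001.html').group('id')
print(BRIGHTCOVE_URL_TEMPLATE % bc_id)  # the URL handed to url_result()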
diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/francetv.py b/script.module.youtube.dl/lib/youtube_dl/extractor/francetv.py
index 3ca415077..e4ec2e200 100644
--- a/script.module.youtube.dl/lib/youtube_dl/extractor/francetv.py
+++ b/script.module.youtube.dl/lib/youtube_dl/extractor/francetv.py
@@ -383,6 +383,10 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
     }, {
         'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin',
         'only_matching': True,
+    }, {
+        # "<figure id=" pattern (#28792)
+        'url': 'https://www.francetvinfo.fr/culture/patrimoine/incendie-de-notre-dame-de-paris/notre-dame-de-paris-de-l-incendie-de-la-cathedrale-a-sa-reconstruction_4372291.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -399,7 +403,8 @@ def _real_extract(self, url):
         video_id = self._search_regex(
             (r'player\.load[^;]+src:\s*["\']([^"\']+)',
              r'id-video=([^@]+@[^"]+)',
-             r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'),
+             r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"',
+             r'(?:data-id|<figure[^<]+\bdata-id)=["\']([\w-]+)'),
             webpage, 'video id')
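A check of the added fallback pattern (as reconstructed above) against invented markup samples; both the bare data-id attribute and the figure ... data-id form yield the id:

import re

pattern = r'(?:data-id|<figure[^<]+\bdata-id)=["\']([\w-]+)'

for snippet in (
        '<figure class="video" data-id="NI_1234567"></figure>',  # fabricated
        "<div data-id='NI_1234567'></div>"):                     # fabricated
    print(re.search(pattern, snippet).group(1))  # NI_1234567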
]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'(?:data-id|[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:[^/]+/)?shows/[^/]+/(?P[^/?#&]+)' _NETRC_MACHINE = 'funimation' _TOKEN = None @@ -51,6 +51,10 @@ class FunimationIE(InfoExtractor): }, { 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', 'only_matching': True, + }, { + # with lang code + 'url': 'https://www.funimation.com/en/shows/hacksign/role-play/', + 'only_matching': True, }] def _login(self): diff --git a/script.module.youtube.dl/lib/youtube_dl/extractor/gdcvault.py b/script.module.youtube.dl/lib/youtube_dl/extractor/gdcvault.py index 2f555c1d4..acc6478b8 100644 --- a/script.module.youtube.dl/lib/youtube_dl/extractor/gdcvault.py +++ b/script.module.youtube.dl/lib/youtube_dl/extractor/gdcvault.py @@ -6,6 +6,7 @@ from .kaltura import KalturaIE from ..utils import ( HEADRequest, + remove_start, sanitized_Request, smuggle_url, urlencode_postdata, @@ -102,6 +103,26 @@ class GDCVaultIE(InfoExtractor): 'format': 'mp4-408', }, }, + { + # Kaltura embed, whitespace between quote and embedded URL in iframe's src + 'url': 'https://www.gdcvault.com/play/1025699', + 'info_dict': { + 'id': '0_zagynv0a', + 'ext': 'mp4', + 'title': 'Tech Toolbox', + 'upload_date': '20190408', + 'uploader_id': 'joe@blazestreaming.com', + 'timestamp': 1554764629, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # HTML5 video + 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru', + 'only_matching': True, + }, ] def _login(self, webpage_url, display_id): @@ -175,7 +196,18 @@ def _real_extract(self, url): xml_name = self._html_search_regex( r'