From 261c29546e63efe672ca37236f2b18505e5a74f9 Mon Sep 17 00:00:00 2001 From: Omikhleia Date: Fri, 2 Dec 2022 01:44:46 +0100 Subject: [PATCH] feat(core): Accept and resolve BCP47 language names (WIP) --- core/font.lua | 38 +++++++++---- core/languages.lua | 132 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 144 insertions(+), 26 deletions(-) diff --git a/core/font.lua b/core/font.lua index 9ece09996..38b1e22af 100644 --- a/core/font.lua +++ b/core/font.lua @@ -1,4 +1,7 @@ -local icu = require("justenoughicu") +-- BEGIN OMIKHLEIA HACKLANG +-- No longer required here, see further below +-- local icu = require("justenoughicu") +-- END OMIKHLEIA HACKLANG local lastshaper @@ -20,19 +23,36 @@ SILE.registerCommand("font", function (options, content) if options.features then SILE.settings:set("font.features", options.features) end if options.direction then SILE.settings:set("font.direction", options.direction) end if options.language then - if options.language ~= "und" and icu and icu.canonicalize_language then - local newlang = icu.canonicalize_language(options.language) - -- if newlang ~= options.language then - -- SU.warn("Language '"..options.language.."' not canonical, '"..newlang.."' will be used instead.") - -- end - options.language = newlang - end + -- BEGIN OMIKHLEIA HACKLANG + -- Commented out. Too soon and or at discrepancy with the "language" command anyway. + -- So let document.language is what the user wants, and loadLanguage take care of it + -- + -- if options.language ~= "und" and icu and icu.canonicalize_language then + -- local newlang = icu.canonicalize_language(options.language) + -- -- if newlang ~= options.language then + -- -- SU.warn("Language '"..options.language.."' not canonical, '"..newlang.."' will be used instead.") + -- -- end + -- options.language = newlang + -- end + -- END OMIKHLEIA HACKLANG SILE.settings:set("document.language", options.language) - fluent:set_locale(options.language) + -- BEGIN OMIKHLEIA HACKLANG + -- Commented out. This is BAD design: + -- Fluent maintains a global state (locale) but we could be in a temporary environment + -- if the font command has contents, and we'd need this to remain scoped. + -- So we should act here! + -- fluent:set_locale(options.language) + -- END OMIKHLEIA HACKLANG SILE.languageSupport.loadLanguage(options.language) end if options.script then SILE.settings:set("font.script", options.script) elseif SILE.settings:get("document.language") then + -- BEGIN OMIKHLEIA HACKLANG + -- BAD STUFF WARNING: This SILE.languageSupport.languages[] is broken, (nearly always returning nil), + -- see comment in languages.lua + -- ON THE OTHER HAND, setting font.script doesn't seem to be used in any sensible way... + -- Possible code smell here. + -- END OMIKHLEIA HACKLANG local lang = SILE.languageSupport.languages[SILE.settings:get("document.language")] if lang and lang.defaultScript then SILE.settings:set("font.script", lang.defaultScript) diff --git a/core/languages.lua b/core/languages.lua index fd084fdd1..50f5030e7 100644 --- a/core/languages.lua +++ b/core/languages.lua @@ -1,5 +1,9 @@ local loadkit = require("loadkit") -local cldr = require("cldr") + +-- BEGIN OMIKHLEIA HACKLANG +-- Disabled for now, see further below. +-- local cldr = require("cldr") +-- END OMIKHLEIA HACKLANG loadkit.register("ftl", function (file) local contents = assert(file:read("*a")) @@ -9,30 +13,124 @@ end) local loadonce = {} +-- BEGIN OMIKHLEIA HACKLANG +local icu = require("justenoughicu") + +-- This small utility could be moved to utilities as SU.forLanguage()... +-- The input is expected to be a valid BCP47 canonical language. +-- The idea is to find the "closest" language accepted by the callback: +-- Loop removing a language specifier until the callback returns a non-nil value +-- Returns that value and the matched language in that case, or nil no callback matched. +-- E.g. "xx-Xxxx-XX" will be matched against "xx-Xxxx--XX", "xx-Xxxx", "xx" until one of +-- these are satisfied. +-- Returns +-- nil if not callback could process the language +-- language, res if a callback could (returning the matched language pattern and the +-- result of the callback) +local function forLanguage(langbcp47, callback) + while langbcp47 do + local res = callback(langbcp47) + if res then + return res, langbcp47 + end + langbcp47 = langbcp47:match("^(.+)-.*$") -- split at dash (-) and remove last part. + end + return nil +end +-- END OMIKHLEIA HACKLANG + SILE.languageSupport = { - languages = {}, + languages = {}, -- BEGIN OMIKHLEIA HACKLANG + -- BAD CODE SMELL WARNING!!!! + -- In earlier versions this was used where we now have a "loadonce" table of boolean. + -- The change occurred here https://github.com/sile-typesetter/sile/commit/0c5e7f97f3c73ab1b6cd7aee0afca4a59c447cd9 + -- So this table is not handled as it was, and shouldn't be used! But: + -- - font.lua uses it at one point... + -- - so does languages/kn.lua + -- - and also packages/complex-spaces + -- END OMIKHLEIA HACKLANG loadLanguage = function (language) language = language or SILE.settings:get("document.language") - language = cldr.locales[language] and language or "und" + -- BEGIN OMIKHLEIA HACKLANG + -- Either done too soon or plain wrong (a BCP47 language can e.g. contain a script for instance, sr-Latn + -- and I don't see that in https://github.com/alerque/cldr-lua/blob/master/cldr/data/locales.lua + -- I don't really know why we would want that CLDR filtering here BTW... + -- language = cldr.locales[language] and language or "und" + + -- The user may have set document.language to anything, let's ensure a canonical + -- BCP47 language... + if language ~= "und" then + language = icu.canonicalize_language(language) + end + -- END OMIKHLEIA HACKLANG + if loadonce[language] then return end loadonce[language] = true - local langresource = string.format("languages.%s", language) - local gotlang, lang = pcall(require, langresource) - if not gotlang then - SU.warn(("Unable to load language feature support (e.g. hyphenation rules) for %s: %s") - :format(language, lang:gsub(":.*", ""))) + + -- BEGIN OMIKHLEIA HACKLANG + -- We need to find language resources for this BCP47 identifier, from the less specific + -- to the more general. + local langresource, matchedlang = forLanguage(language, function (lang) + local resource = string.format("languages.%s", lang) + local gotres, res = pcall(require, resource) + return gotres and res + end) + if not langresource then + SU.warn(("Unable to load language feature support (e.g. hyphenation rules) for %s") + :format(language)) + else + print(("Loaded language feature support for %s: matched %s") -- HACK We'll need a mere SU.debug when OK... + :format(language, matchedlang)) + if language ~= matchedlang then + -- Now that's so UGLY. Say the input language was "en-GB". + -- It matched "en" eventually (as we don't have yet an "languages.en-GB" resources) + -- PROBLEM: Our languages.xxx files (almost) all work by side effects, putting various things, + -- in the case of our example, in SILE.nodeMarkers.en, SILE.hyphenator.languages.en + -- and SU.formatNumber.en... While we now expect the language to be "en-GB"... + -- It's a HACK, but copy the stuff into our language. + SILE.nodeMakers[language] = SILE.nodeMakers[matchedlang] + SU.formatNumber[language] = SU.formatNumber[matchedlang] + SILE.hyphenator.languages[language] = SILE.hyphenator.languages[matchedlang] + end end - local ftlresource = string.format("i18n.%s", language) - SU.debug("fluent", "Loading FTL resource", ftlresource, "into locale", language) - fluent:set_locale(language) - local gotftl, ftl = pcall(require, ftlresource) - if not gotftl then - SU.warn(("Unable to load localized strings (e.g. table of contents header text) for %s: %s") - :format(language, ftl:gsub(":.*", ""))) + + -- We need to find fluent reources for this BCP47 identifier, from the less specific + -- to the more general. + local ftlresource, matchedi18n = forLanguage(language, function (lang) + local resource = string.format("i18n.%s", lang) + SU.debug("fluent", "Loading FTL resource", resource, "into locale", lang) + fluent:set_locale(lang) + local gotftl, ftl = pcall(require, resource) + return gotftl and ftl + end) + if not ftlresource then + SU.warn(("Unable to load localized strings (e.g. table of contents header text) for %s") + :format(language)) + else + print(("Load localized strings for %s: matched %s") -- HACK We'll need a mere SU.debug when OK... + :format(language, matchedi18n)) + if language ~= matchedi18n then + -- Now that's even more UGLY. Say the input language was "en-GB". + -- It matched "en" eventually (as we don't have yet an "i18n.en-GB" resources) + -- PROBLEM: the fluent locale must be set to the target language before loading + -- a ftl file. APIs that aren't stateless are messy :( + -- in the case of our example, they had to be read into "en"... + -- HACK HACK, all we can do is reloaad it fully, but under the target "en-GB" name... + local loaded = string.format("i18n.%s", matchedi18n) + package.loaded[loaded] = nil -- HACK force reload!!! + fluent:set_locale(language) + require(string.format("i18n.%s", matchedi18n)) + end end - if type(lang) == "table" and lang.init then - lang.init() + + -- Most language resource files act by side effects, directly tweaking + -- SILE.nodeMarkers.xx, SILE.hyphenator.languages.xx, SU.formatNumber.xx (etc.? Doh) + -- BUT some don't do that exactly AND return a table with an init method... + -- Unclear API discrepancy, heh. + if type(langresource) == "table" and langresource.init then + langresource.init() end + -- END OMIKHLEIA HACKLANG end }