From be42c275b83b0cd5c26a8dd36bc57f7964e55557 Mon Sep 17 00:00:00 2001 From: Caleb Hearon Date: Thu, 26 Dec 2024 15:26:35 -0500 Subject: [PATCH] add C++ parser for the font shorthand This is the first part needed for the new font stack, which will look like the FontFace-based API the browsers have. FontFace uses a parser for font-family, font-size, etc., so I will need deeper control over the parser. FontFace will be implemented in C++ and I didn't want to carry over the awkward (and slow) switching between JS and C++. So here it is. I used Claude to generate initial classes and busy work, but it's been heavily examined and heavily modified. Caching aside, this is 3x faster in the benchmarks, which use random names to bypass the cache, and still a full 2x as fast when the JS version has a cached value. Those results were a bit inconsistent, so I'm not sure how much I trust them, but I expect this parser to have a stable performance profile nonetheless, so I'm not going to add any caching. It's also far more correct than what we had! --- CHANGELOG.md | 2 + binding.gyp | 3 +- index.js | 3 - lib/parse-font.js | 110 ------ src/Canvas.cc | 39 +- src/Canvas.h | 1 + src/CanvasRenderingContext2d.cc | 40 +-- src/CharData.h | 231 ++++++++++++ src/FontParser.cc | 605 ++++++++++++++++++++++++++++++++ src/FontParser.h | 115 ++++++ test/canvas.test.js | 73 ---- test/fontParser.test.js | 118 +++++++ 12 files changed, 1130 insertions(+), 210 deletions(-) delete mode 100644 lib/parse-font.js create mode 100644 src/CharData.h create mode 100644 src/FontParser.cc create mode 100644 src/FontParser.h create mode 100644 test/fontParser.test.js diff --git a/CHANGELOG.md b/CHANGELOG.md index c835d3739..aa6d4df10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ project adheres to [Semantic Versioning](http://semver.org/). (Unreleased) ================== ### Changed +* `ctx.font` has a new C++ parser and is 2x-400x faster. Please file an issue if you experience different results, as caching has been removed. + ### Added ### Fixed diff --git a/binding.gyp b/binding.gyp index 166842641..bf647f7d1 100644 --- a/binding.gyp +++ b/binding.gyp @@ -75,7 +75,8 @@ 'src/Image.cc', 'src/ImageData.cc', 'src/init.cc', - 'src/register_font.cc' + 'src/register_font.cc', + 'src/FontParser.cc' ], 'conditions': [ ['OS=="win"', { diff --git a/index.js b/index.js index 89f2daabc..adde4da12 100644 --- a/index.js +++ b/index.js @@ -2,7 +2,6 @@ const Canvas = require('./lib/canvas') const Image = require('./lib/image') const CanvasRenderingContext2D = require('./lib/context2d') const CanvasPattern = require('./lib/pattern') -const parseFont = require('./lib/parse-font') const packageJson = require('./package.json') const bindings = require('./lib/bindings') const fs = require('fs') @@ -12,7 +11,6 @@ const JPEGStream = require('./lib/jpegstream') const { DOMPoint, DOMMatrix } = require('./lib/DOMMatrix') bindings.setDOMMatrix(DOMMatrix) -bindings.setParseFont(parseFont) function createCanvas (width, height, type) { return new Canvas(width, height, type) @@ -73,7 +71,6 @@ exports.DOMPoint = DOMPoint exports.registerFont = registerFont exports.deregisterAllFonts = deregisterAllFonts -exports.parseFont = parseFont exports.createCanvas = createCanvas exports.createImageData = createImageData diff --git a/lib/parse-font.js b/lib/parse-font.js deleted file mode 100644 index a18f05e51..000000000 --- a/lib/parse-font.js +++ /dev/null @@ -1,110 +0,0 @@ -'use strict' - -/** - * Font RegExp helpers. - */ - -const weights = 'bold|bolder|lighter|[1-9]00' -const styles = 'italic|oblique' -const variants = 'small-caps' -const stretches = 'ultra-condensed|extra-condensed|condensed|semi-condensed|semi-expanded|expanded|extra-expanded|ultra-expanded' -const units = 'px|pt|pc|in|cm|mm|%|em|ex|ch|rem|q' -const string = /'((\\'|[^'])+)'|"((\\"|[^"])+)"|[\w\s-]+/.source - -// [ [ <‘font-style’> || || <‘font-weight’> || <‘font-stretch’> ]? -// <‘font-size’> [ / <‘line-height’> ]? <‘font-family’> ] -// https://drafts.csswg.org/css-fonts-3/#font-prop -const weightRe = new RegExp(`(${weights}) +`, 'i') -const styleRe = new RegExp(`(${styles}) +`, 'i') -const variantRe = new RegExp(`(${variants}) +`, 'i') -const stretchRe = new RegExp(`(${stretches}) +`, 'i') -const familyRe = new RegExp(string, 'g') -const unquoteRe = /^['"](.*)['"]$/ -const unescapeRe = /\\(['"])/g -const sizeFamilyRe = new RegExp( - `([\\d\\.]+)(${units}) *((?:${string})( *, *(?:${string}))*)`) - -/** - * Cache font parsing. - */ - -const cache = {} - -const defaultHeight = 16 // pt, common browser default - -/** - * Parse font `str`. - * - * @param {String} str - * @return {Object} Parsed font. `size` is in device units. `unit` is the unit - * appearing in the input string. - * @api private - */ - -module.exports = str => { - // Cached - if (cache[str]) return cache[str] - - // Try for required properties first. - const sizeFamily = sizeFamilyRe.exec(str) - if (!sizeFamily) return // invalid - - const names = sizeFamily[3] - .match(familyRe) - // remove actual bounding quotes, if any, unescape any remaining quotes inside - .map(s => s.trim().replace(unquoteRe, '$1').replace(unescapeRe, '$1')) - .filter(s => !!s) - - // Default values and required properties - const font = { - weight: 'normal', - style: 'normal', - stretch: 'normal', - variant: 'normal', - size: parseFloat(sizeFamily[1]), - unit: sizeFamily[2], - family: names.join(',') - } - - // Optional, unordered properties. - let weight, style, variant, stretch - // Stop search at `sizeFamily.index` - const substr = str.substring(0, sizeFamily.index) - if ((weight = weightRe.exec(substr))) font.weight = weight[1] - if ((style = styleRe.exec(substr))) font.style = style[1] - if ((variant = variantRe.exec(substr))) font.variant = variant[1] - if ((stretch = stretchRe.exec(substr))) font.stretch = stretch[1] - - // Convert to device units. (`font.unit` is the original unit) - // TODO: ch, ex - switch (font.unit) { - case 'pt': - font.size /= 0.75 - break - case 'pc': - font.size *= 16 - break - case 'in': - font.size *= 96 - break - case 'cm': - font.size *= 96.0 / 2.54 - break - case 'mm': - font.size *= 96.0 / 25.4 - break - case '%': - // TODO disabled because existing unit tests assume 100 - // font.size *= defaultHeight / 100 / 0.75 - break - case 'em': - case 'rem': - font.size *= defaultHeight / 0.75 - break - case 'q': - font.size *= 96 / 25.4 / 4 - break - } - - return (cache[str] = font) -} diff --git a/src/Canvas.cc b/src/Canvas.cc index 6ba312008..7b208bec2 100644 --- a/src/Canvas.cc +++ b/src/Canvas.cc @@ -21,6 +21,7 @@ #include "Util.h" #include #include "node_buffer.h" +#include "FontParser.h" #ifdef HAVE_JPEG #include "JPEGStream.h" @@ -68,7 +69,8 @@ Canvas::Initialize(Napi::Env& env, Napi::Object& exports) { StaticValue("PNG_FILTER_PAETH", Napi::Number::New(env, PNG_FILTER_PAETH), napi_default_jsproperty), StaticValue("PNG_ALL_FILTERS", Napi::Number::New(env, PNG_ALL_FILTERS), napi_default_jsproperty), StaticMethod<&Canvas::RegisterFont>("_registerFont", napi_default_method), - StaticMethod<&Canvas::DeregisterAllFonts>("_deregisterAllFonts", napi_default_method) + StaticMethod<&Canvas::DeregisterAllFonts>("_deregisterAllFonts", napi_default_method), + StaticMethod<&Canvas::ParseFont>("parseFont", napi_default_method) }); data->CanvasCtor = Napi::Persistent(ctor); @@ -694,6 +696,7 @@ Canvas::RegisterFont(const Napi::CallbackInfo& info) { // now check the attrs, there are many ways to be wrong Napi::Object js_user_desc = info[1].As(); + // TODO: use FontParser on these values just like the FontFace API works char *family = str_value(js_user_desc.Get("family"), NULL, false); char *weight = str_value(js_user_desc.Get("weight"), "normal", true); char *style = str_value(js_user_desc.Get("style"), "normal", false); @@ -749,6 +752,40 @@ Canvas::DeregisterAllFonts(const Napi::CallbackInfo& info) { if (!success) Napi::Error::New(env, "Could not deregister one or more fonts").ThrowAsJavaScriptException(); } +/* + * Do not use! This is only exported for testing + */ +Napi::Value +Canvas::ParseFont(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + + if (info.Length() != 1) return env.Undefined(); + + Napi::String str; + if (!info[0].ToString().UnwrapTo(&str)) return env.Undefined(); + + bool ok; + auto props = FontParser::parse(str, &ok); + if (!ok) return env.Undefined(); + + Napi::Object obj = Napi::Object::New(env); + obj.Set("size", Napi::Number::New(env, props.fontSize)); + Napi::Array families = Napi::Array::New(env); + obj.Set("families", families); + + unsigned int index = 0; + + for (auto& family : props.fontFamily) { + families[index++] = Napi::String::New(env, family); + } + + obj.Set("weight", Napi::Number::New(env, props.fontWeight)); + obj.Set("variant", Napi::Number::New(env, static_cast(props.fontVariant))); + obj.Set("style", Napi::Number::New(env, static_cast(props.fontStyle))); + + return obj; +} + /* * Get a PangoStyle from a CSS string (like "italic") */ diff --git a/src/Canvas.h b/src/Canvas.h index 5f35b356b..5b039539a 100644 --- a/src/Canvas.h +++ b/src/Canvas.h @@ -68,6 +68,7 @@ class Canvas : public Napi::ObjectWrap { void StreamJPEGSync(const Napi::CallbackInfo& info); static void RegisterFont(const Napi::CallbackInfo& info); static void DeregisterAllFonts(const Napi::CallbackInfo& info); + static Napi::Value ParseFont(const Napi::CallbackInfo& info); Napi::Error CairoError(cairo_status_t status); static void ToPngBufferAsync(Closure* closure); static void ToJpegBufferAsync(Closure* closure); diff --git a/src/CanvasRenderingContext2d.cc b/src/CanvasRenderingContext2d.cc index d0966e299..1597d089a 100644 --- a/src/CanvasRenderingContext2d.cc +++ b/src/CanvasRenderingContext2d.cc @@ -9,6 +9,7 @@ #include "CanvasGradient.h" #include "CanvasPattern.h" #include "InstanceData.h" +#include "FontParser.h" #include #include #include "Image.h" @@ -2575,34 +2576,29 @@ Context2d::GetFont(const Napi::CallbackInfo& info) { void Context2d::SetFont(const Napi::CallbackInfo& info, const Napi::Value& value) { - InstanceData* data = env.GetInstanceData(); - if (!value.IsString()) return; - if (!value.As().Utf8Value().length()) return; - - Napi::Value mparsed; + std::string str = value.As().Utf8Value(); + if (!str.length()) return; - // parseFont returns undefined for invalid CSS font strings - if (!data->parseFont.Call({ value }).UnwrapTo(&mparsed) || mparsed.IsUndefined()) return; - - Napi::Object font = mparsed.As(); - - Napi::String empty = Napi::String::New(env, ""); - Napi::Number zero = Napi::Number::New(env, 0); - - std::string weight = font.Get("weight").UnwrapOr(empty).ToString().UnwrapOr(empty).Utf8Value(); - std::string style = font.Get("style").UnwrapOr(empty).ToString().UnwrapOr(empty).Utf8Value(); - double size = font.Get("size").UnwrapOr(zero).ToNumber().UnwrapOr(zero).DoubleValue(); - std::string unit = font.Get("unit").UnwrapOr(empty).ToString().UnwrapOr(empty).Utf8Value(); - std::string family = font.Get("family").UnwrapOr(empty).ToString().UnwrapOr(empty).Utf8Value(); + bool success; + auto props = FontParser::parse(str, &success); + if (!success) return; PangoFontDescription *desc = pango_font_description_copy(state->fontDescription); pango_font_description_free(state->fontDescription); - pango_font_description_set_style(desc, Canvas::GetStyleFromCSSString(style.c_str())); - pango_font_description_set_weight(desc, Canvas::GetWeightFromCSSString(weight.c_str())); + PangoStyle style = props.fontStyle == FontStyle::Italic ? PANGO_STYLE_ITALIC + : props.fontStyle == FontStyle::Oblique ? PANGO_STYLE_OBLIQUE + : PANGO_STYLE_NORMAL; + pango_font_description_set_style(desc, style); + pango_font_description_set_weight(desc, static_cast(props.fontWeight)); + + std::string family = props.fontFamily.empty() ? "" : props.fontFamily[0]; + for (size_t i = 1; i < props.fontFamily.size(); i++) { + family += "," + props.fontFamily[i]; + } if (family.length() > 0) { // See #1643 - Pango understands "sans" whereas CSS uses "sans-serif" std::string s1(family); @@ -2617,12 +2613,12 @@ Context2d::SetFont(const Napi::CallbackInfo& info, const Napi::Value& value) { PangoFontDescription *sys_desc = Canvas::ResolveFontDescription(desc); pango_font_description_free(desc); - if (size > 0) pango_font_description_set_absolute_size(sys_desc, size * PANGO_SCALE); + if (props.fontSize > 0) pango_font_description_set_absolute_size(sys_desc, props.fontSize * PANGO_SCALE); state->fontDescription = sys_desc; pango_layout_set_font_description(_layout, sys_desc); - state->font = value.As().Utf8Value().c_str(); + state->font = str; } /* diff --git a/src/CharData.h b/src/CharData.h new file mode 100644 index 000000000..ebc2dd5e1 --- /dev/null +++ b/src/CharData.h @@ -0,0 +1,231 @@ +// This is used for classifying characters according to the definition of tokens +// in the CSS standards, but could be extended for any other future uses + +#pragma once + +namespace CharData { + static constexpr uint8_t Whitespace = 0x1; + static constexpr uint8_t Newline = 0x2; + static constexpr uint8_t Hex = 0x4; + static constexpr uint8_t Nmstart = 0x8; + static constexpr uint8_t Nmchar = 0x10; + static constexpr uint8_t Sign = 0x20; + static constexpr uint8_t Digit = 0x40; + static constexpr uint8_t NumStart = 0x80; +}; + +using namespace CharData; + +constexpr const uint8_t charData[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0-8 + Whitespace, // 9 (HT) + Whitespace | Newline, // 10 (LF) + 0, // 11 (VT) + Whitespace | Newline, // 12 (FF) + Whitespace | Newline, // 13 (CR) + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 14-31 + Whitespace, // 32 (Space) + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 33-42 + Sign | NumStart, // 43 (+) + 0, // 44 + Nmchar | Sign | NumStart, // 45 (-) + 0, 0, // 46-47 + Nmchar | Digit | NumStart | Hex, // 48 (0) + Nmchar | Digit | NumStart | Hex, // 49 (1) + Nmchar | Digit | NumStart | Hex, // 50 (2) + Nmchar | Digit | NumStart | Hex, // 51 (3) + Nmchar | Digit | NumStart | Hex, // 52 (4) + Nmchar | Digit | NumStart | Hex, // 53 (5) + Nmchar | Digit | NumStart | Hex, // 54 (6) + Nmchar | Digit | NumStart | Hex, // 55 (7) + Nmchar | Digit | NumStart | Hex, // 56 (8) + Nmchar | Digit | NumStart | Hex, // 57 (9) + 0, 0, 0, 0, 0, 0, 0, // 58-64 + Nmstart | Nmchar | Hex, // 65 (A) + Nmstart | Nmchar | Hex, // 66 (B) + Nmstart | Nmchar | Hex, // 67 (C) + Nmstart | Nmchar | Hex, // 68 (D) + Nmstart | Nmchar | Hex, // 69 (E) + Nmstart | Nmchar | Hex, // 70 (F) + Nmstart | Nmchar, // 71 (G) + Nmstart | Nmchar, // 72 (H) + Nmstart | Nmchar, // 73 (I) + Nmstart | Nmchar, // 74 (J) + Nmstart | Nmchar, // 75 (K) + Nmstart | Nmchar, // 76 (L) + Nmstart | Nmchar, // 77 (M) + Nmstart | Nmchar, // 78 (N) + Nmstart | Nmchar, // 79 (O) + Nmstart | Nmchar, // 80 (P) + Nmstart | Nmchar, // 81 (Q) + Nmstart | Nmchar, // 82 (R) + Nmstart | Nmchar, // 83 (S) + Nmstart | Nmchar, // 84 (T) + Nmstart | Nmchar, // 85 (U) + Nmstart | Nmchar, // 86 (V) + Nmstart | Nmchar, // 87 (W) + Nmstart | Nmchar, // 88 (X) + Nmstart | Nmchar, // 89 (Y) + Nmstart | Nmchar, // 90 (Z) + 0, // 91 + Nmstart, // 92 (\) + 0, 0, // 93-94 + Nmstart | Nmchar, // 95 (_) + 0, // 96 + Nmstart | Nmchar | Hex, // 97 (a) + Nmstart | Nmchar | Hex, // 98 (b) + Nmstart | Nmchar | Hex, // 99 (c) + Nmstart | Nmchar | Hex, // 100 (d) + Nmstart | Nmchar | Hex, // 101 (e) + Nmstart | Nmchar | Hex, // 102 (f) + Nmstart | Nmchar, // 103 (g) + Nmstart | Nmchar, // 104 (h) + Nmstart | Nmchar, // 105 (i) + Nmstart | Nmchar, // 106 (j) + Nmstart | Nmchar, // 107 (k) + Nmstart | Nmchar, // 108 (l) + Nmstart | Nmchar, // 109 (m) + Nmstart | Nmchar, // 110 (n) + Nmstart | Nmchar, // 111 (o) + Nmstart | Nmchar, // 112 (p) + Nmstart | Nmchar, // 113 (q) + Nmstart | Nmchar, // 114 (r) + Nmstart | Nmchar, // 115 (s) + Nmstart | Nmchar, // 116 (t) + Nmstart | Nmchar, // 117 (u) + Nmstart | Nmchar, // 118 (v) + Nmstart | Nmchar, // 119 (w) + Nmstart | Nmchar, // 120 (x) + Nmstart | Nmchar, // 121 (y) + Nmstart | Nmchar, // 122 (z) + 0, 0, 0, 0, 0, // 123-127 + // Non-ASCII + Nmstart | Nmchar, // 128 + Nmstart | Nmchar, // 129 + Nmstart | Nmchar, // 130 + Nmstart | Nmchar, // 131 + Nmstart | Nmchar, // 132 + Nmstart | Nmchar, // 133 + Nmstart | Nmchar, // 134 + Nmstart | Nmchar, // 135 + Nmstart | Nmchar, // 136 + Nmstart | Nmchar, // 137 + Nmstart | Nmchar, // 138 + Nmstart | Nmchar, // 139 + Nmstart | Nmchar, // 140 + Nmstart | Nmchar, // 141 + Nmstart | Nmchar, // 142 + Nmstart | Nmchar, // 143 + Nmstart | Nmchar, // 144 + Nmstart | Nmchar, // 145 + Nmstart | Nmchar, // 146 + Nmstart | Nmchar, // 147 + Nmstart | Nmchar, // 148 + Nmstart | Nmchar, // 149 + Nmstart | Nmchar, // 150 + Nmstart | Nmchar, // 151 + Nmstart | Nmchar, // 152 + Nmstart | Nmchar, // 153 + Nmstart | Nmchar, // 154 + Nmstart | Nmchar, // 155 + Nmstart | Nmchar, // 156 + Nmstart | Nmchar, // 157 + Nmstart | Nmchar, // 158 + Nmstart | Nmchar, // 159 + Nmstart | Nmchar, // 160 + Nmstart | Nmchar, // 161 + Nmstart | Nmchar, // 162 + Nmstart | Nmchar, // 163 + Nmstart | Nmchar, // 164 + Nmstart | Nmchar, // 165 + Nmstart | Nmchar, // 166 + Nmstart | Nmchar, // 167 + Nmstart | Nmchar, // 168 + Nmstart | Nmchar, // 169 + Nmstart | Nmchar, // 170 + Nmstart | Nmchar, // 171 + Nmstart | Nmchar, // 172 + Nmstart | Nmchar, // 173 + Nmstart | Nmchar, // 174 + Nmstart | Nmchar, // 175 + Nmstart | Nmchar, // 176 + Nmstart | Nmchar, // 177 + Nmstart | Nmchar, // 178 + Nmstart | Nmchar, // 179 + Nmstart | Nmchar, // 180 + Nmstart | Nmchar, // 181 + Nmstart | Nmchar, // 182 + Nmstart | Nmchar, // 183 + Nmstart | Nmchar, // 184 + Nmstart | Nmchar, // 185 + Nmstart | Nmchar, // 186 + Nmstart | Nmchar, // 187 + Nmstart | Nmchar, // 188 + Nmstart | Nmchar, // 189 + Nmstart | Nmchar, // 190 + Nmstart | Nmchar, // 191 + Nmstart | Nmchar, // 192 + Nmstart | Nmchar, // 193 + Nmstart | Nmchar, // 194 + Nmstart | Nmchar, // 195 + Nmstart | Nmchar, // 196 + Nmstart | Nmchar, // 197 + Nmstart | Nmchar, // 198 + Nmstart | Nmchar, // 199 + Nmstart | Nmchar, // 200 + Nmstart | Nmchar, // 201 + Nmstart | Nmchar, // 202 + Nmstart | Nmchar, // 203 + Nmstart | Nmchar, // 204 + Nmstart | Nmchar, // 205 + Nmstart | Nmchar, // 206 + Nmstart | Nmchar, // 207 + Nmstart | Nmchar, // 208 + Nmstart | Nmchar, // 209 + Nmstart | Nmchar, // 210 + Nmstart | Nmchar, // 211 + Nmstart | Nmchar, // 212 + Nmstart | Nmchar, // 213 + Nmstart | Nmchar, // 214 + Nmstart | Nmchar, // 215 + Nmstart | Nmchar, // 216 + Nmstart | Nmchar, // 217 + Nmstart | Nmchar, // 218 + Nmstart | Nmchar, // 219 + Nmstart | Nmchar, // 220 + Nmstart | Nmchar, // 221 + Nmstart | Nmchar, // 222 + Nmstart | Nmchar, // 223 + Nmstart | Nmchar, // 224 + Nmstart | Nmchar, // 225 + Nmstart | Nmchar, // 226 + Nmstart | Nmchar, // 227 + Nmstart | Nmchar, // 228 + Nmstart | Nmchar, // 229 + Nmstart | Nmchar, // 230 + Nmstart | Nmchar, // 231 + Nmstart | Nmchar, // 232 + Nmstart | Nmchar, // 233 + Nmstart | Nmchar, // 234 + Nmstart | Nmchar, // 235 + Nmstart | Nmchar, // 236 + Nmstart | Nmchar, // 237 + Nmstart | Nmchar, // 238 + Nmstart | Nmchar, // 239 + Nmstart | Nmchar, // 240 + Nmstart | Nmchar, // 241 + Nmstart | Nmchar, // 242 + Nmstart | Nmchar, // 243 + Nmstart | Nmchar, // 244 + Nmstart | Nmchar, // 245 + Nmstart | Nmchar, // 246 + Nmstart | Nmchar, // 247 + Nmstart | Nmchar, // 248 + Nmstart | Nmchar, // 249 + Nmstart | Nmchar, // 250 + Nmstart | Nmchar, // 251 + Nmstart | Nmchar, // 252 + Nmstart | Nmchar, // 253 + Nmstart | Nmchar, // 254 + Nmstart | Nmchar // 255 +}; diff --git a/src/FontParser.cc b/src/FontParser.cc new file mode 100644 index 000000000..773502cb3 --- /dev/null +++ b/src/FontParser.cc @@ -0,0 +1,605 @@ +// This is written to exactly parse the `font` shorthand in CSS2: +// https://www.w3.org/TR/CSS22/fonts.html#font-shorthand +// https://www.w3.org/TR/CSS22/syndata.html#tokenization +// +// We may want to update it for CSS 3 (e.g. font-stretch, or updated +// tokenization) but I've only ever seen one or two issues filed in node-canvas +// due to parsing in my 8 years on the project + +#include "FontParser.h" +#include "CharData.h" +#include +#include + +Token::Token(Type type, std::string value) : type_(type), value_(std::move(value)) {} + +Token::Token(Type type, double value) : type_(type), value_(value) {} + +Token::Token(Type type) : type_(type), value_(std::string{}) {} + +const std::string& +Token::getString() const { + static const std::string empty; + auto* str = std::get_if(&value_); + return str ? *str : empty; +} + +double +Token::getNumber() const { + auto* num = std::get_if(&value_); + return num ? *num : 0.0f; +} + +Tokenizer::Tokenizer(std::string_view input) : input_(input) {} + +std::string +Tokenizer::utf8Encode(uint32_t codepoint) { + std::string result; + + if (codepoint < 0x80) { + result += static_cast(codepoint); + } else if (codepoint < 0x800) { + result += static_cast((codepoint >> 6) | 0xc0); + result += static_cast((codepoint & 0x3f) | 0x80); + } else if (codepoint < 0x10000) { + result += static_cast((codepoint >> 12) | 0xe0); + result += static_cast(((codepoint >> 6) & 0x3f) | 0x80); + result += static_cast((codepoint & 0x3f) | 0x80); + } else { + result += static_cast((codepoint >> 18) | 0xf0); + result += static_cast(((codepoint >> 12) & 0x3f) | 0x80); + result += static_cast(((codepoint >> 6) & 0x3f) | 0x80); + result += static_cast((codepoint & 0x3f) | 0x80); + } + + return result; +} + +char +Tokenizer::peek() const { + return position_ < input_.length() ? input_[position_] : '\0'; +} + +char +Tokenizer::advance() { + return position_ < input_.length() ? input_[position_++] : '\0'; +} + +Token +Tokenizer::parseNumber() { + enum class State { + Start, + AfterSign, + Digits, + AfterDecimal, + AfterE, + AfterESign, + ExponentDigits + }; + + size_t start = position_; + size_t ePosition = 0; + State state = State::Start; + bool valid = false; + + while (position_ < input_.length()) { + char c = peek(); + uint8_t flags = charData[static_cast(c)]; + + switch (state) { + case State::Start: + if (flags & CharData::Sign) { + position_++; + state = State::AfterSign; + } else if (flags & CharData::Digit) { + position_++; + state = State::Digits; + valid = true; + } else if (c == '.') { + position_++; + state = State::AfterDecimal; + } else { + goto done; + } + break; + + case State::AfterSign: + if (flags & CharData::Digit) { + position_++; + state = State::Digits; + valid = true; + } else if (c == '.') { + position_++; + state = State::AfterDecimal; + } else { + goto done; + } + break; + + case State::Digits: + if (flags & CharData::Digit) { + position_++; + } else if (c == '.') { + position_++; + state = State::AfterDecimal; + } else if (c == 'e' || c == 'E') { + ePosition = position_; + position_++; + state = State::AfterE; + valid = false; + } else { + goto done; + } + break; + + case State::AfterDecimal: + if (flags & CharData::Digit) { + position_++; + valid = true; + state = State::Digits; + } else { + goto done; + } + break; + + case State::AfterE: + if (flags & CharData::Sign) { + position_++; + state = State::AfterESign; + } else if (flags & CharData::Digit) { + position_++; + valid = true; + state = State::ExponentDigits; + } else { + position_ = ePosition; + valid = true; + goto done; + } + break; + + case State::AfterESign: + if (flags & CharData::Digit) { + position_++; + valid = true; + state = State::ExponentDigits; + } else { + position_ = ePosition; + valid = true; + goto done; + } + break; + + case State::ExponentDigits: + if (flags & CharData::Digit) { + position_++; + } else { + goto done; + } + break; + } + } + +done: + if (!valid) { + position_ = start; + return Token(Token::Type::Invalid); + } + + std::string number_str(input_.substr(start, position_ - start)); + double value = std::stod(number_str); + return Token(Token::Type::Number, value); +} + +// Note that identifiers are always lower-case. This helps us make easier/more +// efficient comparisons, but means that font-families specified as identifiers +// will be lower-cased. Since font selection isn't case sensitive, this +// shouldn't ever be a problem. +Token +Tokenizer::parseIdentifier() { + std::string identifier; + auto flags = CharData::Nmstart; + auto start = position_; + + while (position_ < input_.length()) { + char c = peek(); + + if (c == '\\') { + advance(); + if (!parseEscape(identifier)) { + position_ = start; + return Token(Token::Type::Invalid); + } + flags = CharData::Nmchar; + } else if (charData[static_cast(c)] & flags) { + identifier += advance() + (c >= 'A' && c <= 'Z' ? 32 : 0); + flags = CharData::Nmchar; + } else { + break; + } + } + + return Token(Token::Type::Identifier, identifier); +} + +uint32_t +Tokenizer::parseUnicode() { + uint32_t value = 0; + size_t count = 0; + + while (position_ < input_.length() && count < 6) { + char c = peek(); + uint32_t digit; + + if (c >= '0' && c <= '9') { + digit = c - '0'; + } else if (c >= 'a' && c <= 'f') { + digit = c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + digit = c - 'A' + 10; + } else { + break; + } + + value = value * 16 + digit; + advance(); + count++; + } + + // Optional whitespace after hex escape + char c = peek(); + if (c == '\r') { + advance(); + if (peek() == '\n') advance(); + } else if (isWhitespace(c)) { + advance(); + } + + return value; +} + +bool +Tokenizer::parseEscape(std::string& str) { + char c = peek(); + auto flags = charData[static_cast(c)]; + + if (flags & CharData::Hex) { + str += utf8Encode(parseUnicode()); + return true; + } else if (!(flags & CharData::Newline) && !(flags & CharData::Hex)) { + str += advance(); + return true; + } + + return false; +} + +Token +Tokenizer::parseString(char quote) { + advance(); + std::string value; + auto start = position_; + + while (position_ < input_.length()) { + char c = peek(); + + if (c == quote) { + advance(); + return Token(Token::Type::QuotedString, value); + } else if (c == '\\') { + advance(); + c = peek(); + if (c == '\r') { + advance(); + if (peek() == '\n') advance(); + } else if (isNewline(c)) { + advance(); + } else { + if (!parseEscape(value)) { + position_ = start; + return Token(Token::Type::Invalid); + } + } + } else { + value += advance(); + } + } + + position_ = start; + return Token(Token::Type::Invalid); +} + +Token +Tokenizer::nextToken() { + if (position_ >= input_.length()) { + return Token(Token::Type::EndOfInput); + } + + char c = peek(); + auto flags = charData[static_cast(c)]; + + if (isWhitespace(c)) { + std::string whitespace; + while (position_ < input_.length() && isWhitespace(peek())) { + whitespace += advance(); + } + return Token(Token::Type::Whitespace, whitespace); + } + + if (flags & CharData::NumStart) { + Token token = parseNumber(); + if (token.type() != Token::Type::Invalid) return token; + } + + if (flags & CharData::Nmstart) { + Token token = parseIdentifier(); + if (token.type() != Token::Type::Invalid) return token; + } + + if (c == '"') { + Token token = parseString('"'); + if (token.type() != Token::Type::Invalid) return token; + } + + if (c == '\'') { + Token token = parseString('\''); + if (token.type() != Token::Type::Invalid) return token; + } + + switch (advance()) { + case '/': return Token(Token::Type::Slash); + case ',': return Token(Token::Type::Comma); + case '%': return Token(Token::Type::Percent); + default: return Token(Token::Type::Invalid); + } +} + +FontParser::FontParser(std::string_view input) + : tokenizer_(input) + , currentToken_(tokenizer_.nextToken()) + , nextToken_(tokenizer_.nextToken()) {} + +const std::unordered_map FontParser::weightMap = { + {"normal", 400}, + {"bold", 700}, + {"lighter", 100}, + {"bolder", 700} +}; + +const std::unordered_map FontParser::unitMap = { + {"cm", 37.8f}, + {"mm", 3.78f}, + {"in", 96.0f}, + {"pt", 96.0f / 72.0f}, + {"pc", 96.0f / 6.0f}, + {"em", 16.0f}, + {"px", 1.0f} +}; + +void +FontParser::advance() { + currentToken_ = nextToken_; + nextToken_ = tokenizer_.nextToken(); +} + +void +FontParser::skipWs() { + while (currentToken_.type() == Token::Type::Whitespace) advance(); +} + +bool +FontParser::check(Token::Type type) const { + return currentToken_.type() == type; +} + +bool +FontParser::checkWs() const { + return nextToken_.type() == Token::Type::Whitespace + || nextToken_.type() == Token::Type::EndOfInput; +} + +bool +FontParser::parseFontStyle(FontProperties& props) { + if (check(Token::Type::Identifier)) { + const auto& value = currentToken_.getString(); + if (value == "italic") { + props.fontStyle = FontStyle::Italic; + advance(); + return true; + } else if (value == "oblique") { + props.fontStyle = FontStyle::Oblique; + advance(); + return true; + } else if (value == "normal") { + props.fontStyle = FontStyle::Normal; + advance(); + return true; + } + } + + return false; +} + +bool +FontParser::parseFontVariant(FontProperties& props) { + if (check(Token::Type::Identifier)) { + const auto& value = currentToken_.getString(); + if (value == "small-caps") { + props.fontVariant = FontVariant::SmallCaps; + advance(); + return true; + } else if (value == "normal") { + props.fontVariant = FontVariant::Normal; + advance(); + return true; + } + } + + return false; +} + +bool +FontParser::parseFontWeight(FontProperties& props) { + if (check(Token::Type::Number)) { + double weightFloat = currentToken_.getNumber(); + int weight = static_cast(weightFloat); + if (weight < 1 || weight > 1000) return false; + props.fontWeight = static_cast(weight); + advance(); + return true; + } else if (check(Token::Type::Identifier)) { + const auto& value = currentToken_.getString(); + + if (auto it = weightMap.find(value); it != weightMap.end()) { + props.fontWeight = it->second; + advance(); + return true; + } + } + + return false; +} + +bool +FontParser::parseFontSize(FontProperties& props) { + if (!check(Token::Type::Number)) return false; + + props.fontSize = currentToken_.getNumber(); + advance(); + + double multiplier = 1.0f; + if (check(Token::Type::Identifier)) { + const auto& unit = currentToken_.getString(); + + if (auto it = unitMap.find(unit); it != unitMap.end()) { + multiplier = it->second; + advance(); + } else { + return false; + } + } else if (check(Token::Type::Percent)) { + multiplier = 16.0f / 100.0f; + advance(); + } else { + return false; + } + + // Technically if we consumed some tokens but couldn't parse the font-size, + // we should rewind the tokenizer, but I don't think the grammar allows for + // any valid alternates in this specific case + + props.fontSize *= multiplier; + return true; +} + +// line-height is not used by canvas ever, but should still parse +bool +FontParser::parseLineHeight(FontProperties& props) { + if (check(Token::Type::Slash)) { + advance(); + skipWs(); + if (check(Token::Type::Number)) { + advance(); + if (check(Token::Type::Percent)) { + advance(); + } else if (check(Token::Type::Identifier)) { + auto identifier = currentToken_.getString(); + if (auto it = unitMap.find(identifier); it != unitMap.end()) { + advance(); + } else { + return false; + } + } else { + return false; + } + } else if (check(Token::Type::Identifier) && currentToken_.getString() == "normal") { + advance(); + } else { + return false; + } + } + + return true; +} + +bool +FontParser::parseFontFamily(FontProperties& props) { + while (!check(Token::Type::EndOfInput)) { + std::string family = ""; + std::string trailingWs = ""; + bool found = false; + + while ( + check(Token::Type::QuotedString) || + check(Token::Type::Identifier) || + check(Token::Type::Whitespace) + ) { + if (check(Token::Type::Whitespace)) { + if (found) trailingWs += currentToken_.getString(); + } else { // Identifier, QuotedString + if (found) { + family += trailingWs; + trailingWs.clear(); + } + + family += currentToken_.getString(); + found = true; + } + + advance(); + } + + if (!found) return false; // only whitespace or non-id/string found + + props.fontFamily.push_back(family); + + if (check(Token::Type::Comma)) advance(); + } + + return true; +} + +FontProperties +FontParser::parse(const std::string& fontString, bool* success) { + FontParser parser(fontString); + auto result = parser.parseFont(); + if (success) *success = !parser.hasError_; + return result; +} + +FontProperties +FontParser::parseFont() { + FontProperties props; + uint8_t state = 0b111; + + skipWs(); + + for (size_t i = 0; i < 3 && checkWs(); i++) { + if ((state & 0b001) && parseFontStyle(props)) { + state &= 0b110; + goto match; + } + + if ((state & 0b010) && parseFontVariant(props)) { + state &= 0b101; + goto match; + } + + if ((state & 0b100) && parseFontWeight(props)) { + state &= 0b011; + goto match; + } + + break; // all attempts exhausted + match: skipWs(); // success: move to the next non-ws token + } + + if (parseFontSize(props)) { + skipWs(); + if (parseLineHeight(props) && parseFontFamily(props)) { + return props; + } + } + + hasError_ = true; + return props; +} diff --git a/src/FontParser.h b/src/FontParser.h new file mode 100644 index 000000000..c88802109 --- /dev/null +++ b/src/FontParser.h @@ -0,0 +1,115 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "CharData.h" + +enum class FontStyle { + Normal, + Italic, + Oblique +}; + +enum class FontVariant { + Normal, + SmallCaps +}; + +struct FontProperties { + double fontSize{16.0f}; + std::vector fontFamily; + uint16_t fontWeight{400}; + FontVariant fontVariant{FontVariant::Normal}; + FontStyle fontStyle{FontStyle::Normal}; +}; + +class Token { + public: + enum class Type { + Invalid, + Number, + Percent, + Identifier, + Slash, + Comma, + QuotedString, + Whitespace, + EndOfInput + }; + + Token(Type type, std::string value); + Token(Type type, double value); + Token(Type type); + + Type type() const { return type_; } + + const std::string& getString() const; + double getNumber() const; + + private: + Type type_; + std::variant value_; +}; + +class Tokenizer { + public: + Tokenizer(std::string_view input); + Token nextToken(); + + private: + std::string_view input_; + size_t position_{0}; + + // Util + std::string utf8Encode(uint32_t codepoint); + inline bool isWhitespace(char c) const { + return charData[static_cast(c)] & CharData::Whitespace; + } + inline bool isNewline(char c) const { + return charData[static_cast(c)] & CharData::Newline; + } + + // Moving through the string + char peek() const; + char advance(); + + // Tokenize + Token parseNumber(); + Token parseIdentifier(); + uint32_t parseUnicode(); + bool parseEscape(std::string& str); + Token parseString(char quote); +}; + +class FontParser { + public: + static FontProperties parse(const std::string& fontString, bool* success = nullptr); + + private: + static const std::unordered_map weightMap; + static const std::unordered_map unitMap; + + FontParser(std::string_view input); + + void advance(); + void skipWs(); + bool check(Token::Type type) const; + bool checkWs() const; + + bool parseFontStyle(FontProperties& props); + bool parseFontVariant(FontProperties& props); + bool parseFontWeight(FontProperties& props); + bool parseFontSize(FontProperties& props); + bool parseLineHeight(FontProperties& props); + bool parseFontFamily(FontProperties& props); + FontProperties parseFont(); + + Tokenizer tokenizer_; + Token currentToken_; + Token nextToken_; + bool hasError_{false}; +}; diff --git a/test/canvas.test.js b/test/canvas.test.js index 1a75ac031..75f15ed5a 100644 --- a/test/canvas.test.js +++ b/test/canvas.test.js @@ -14,7 +14,6 @@ const { createCanvas, createImageData, loadImage, - parseFont, registerFont, Canvas, deregisterAllFonts @@ -37,78 +36,6 @@ describe('Canvas', function () { assert('width' in Canvas.prototype) }) - it('.parseFont()', function () { - const tests = [ - '20px Arial', - { size: 20, unit: 'px', family: 'Arial' }, - '20pt Arial', - { size: 26.666666666666668, unit: 'pt', family: 'Arial' }, - '20.5pt Arial', - { size: 27.333333333333332, unit: 'pt', family: 'Arial' }, - '20% Arial', - { size: 20, unit: '%', family: 'Arial' }, // TODO I think this is a bad assertion - ZB 23-Jul-2017 - '20mm Arial', - { size: 75.59055118110237, unit: 'mm', family: 'Arial' }, - '20px serif', - { size: 20, unit: 'px', family: 'serif' }, - '20px sans-serif', - { size: 20, unit: 'px', family: 'sans-serif' }, - '20px monospace', - { size: 20, unit: 'px', family: 'monospace' }, - '50px Arial, sans-serif', - { size: 50, unit: 'px', family: 'Arial,sans-serif' }, - 'bold italic 50px Arial, sans-serif', - { style: 'italic', weight: 'bold', size: 50, unit: 'px', family: 'Arial,sans-serif' }, - '50px Helvetica , Arial, sans-serif', - { size: 50, unit: 'px', family: 'Helvetica,Arial,sans-serif' }, - '50px "Helvetica Neue", sans-serif', - { size: 50, unit: 'px', family: 'Helvetica Neue,sans-serif' }, - '50px "Helvetica Neue", "foo bar baz" , sans-serif', - { size: 50, unit: 'px', family: 'Helvetica Neue,foo bar baz,sans-serif' }, - "50px 'Helvetica Neue'", - { size: 50, unit: 'px', family: 'Helvetica Neue' }, - 'italic 20px Arial', - { size: 20, unit: 'px', style: 'italic', family: 'Arial' }, - 'oblique 20px Arial', - { size: 20, unit: 'px', style: 'oblique', family: 'Arial' }, - 'normal 20px Arial', - { size: 20, unit: 'px', style: 'normal', family: 'Arial' }, - '300 20px Arial', - { size: 20, unit: 'px', weight: '300', family: 'Arial' }, - '800 20px Arial', - { size: 20, unit: 'px', weight: '800', family: 'Arial' }, - 'bolder 20px Arial', - { size: 20, unit: 'px', weight: 'bolder', family: 'Arial' }, - 'lighter 20px Arial', - { size: 20, unit: 'px', weight: 'lighter', family: 'Arial' }, - 'normal normal normal 16px Impact', - { size: 16, unit: 'px', weight: 'normal', family: 'Impact', style: 'normal', variant: 'normal' }, - 'italic small-caps bolder 16px cursive', - { size: 16, unit: 'px', style: 'italic', variant: 'small-caps', weight: 'bolder', family: 'cursive' }, - '20px "new century schoolbook", serif', - { size: 20, unit: 'px', family: 'new century schoolbook,serif' }, - '20px "Arial bold 300"', // synthetic case with weight keyword inside family - { size: 20, unit: 'px', family: 'Arial bold 300', variant: 'normal' }, - `50px "Helvetica 'Neue'", "foo \\"bar\\" baz" , "Someone's weird \\'edge\\' case", sans-serif`, - { size: 50, unit: 'px', family: `Helvetica 'Neue',foo "bar" baz,Someone's weird 'edge' case,sans-serif` } - ] - - for (let i = 0, len = tests.length; i < len; ++i) { - const str = tests[i++] - const expected = tests[i] - const actual = parseFont(str) - - if (!expected.style) expected.style = 'normal' - if (!expected.weight) expected.weight = 'normal' - if (!expected.stretch) expected.stretch = 'normal' - if (!expected.variant) expected.variant = 'normal' - - assert.deepEqual(actual, expected, 'Failed to parse: ' + str) - } - - assert.strictEqual(parseFont('Helvetica, sans'), undefined) - }) - it('registerFont', function () { // Minimal test to make sure nothing is thrown registerFont('./examples/pfennigFont/Pfennig.ttf', { family: 'Pfennig' }) diff --git a/test/fontParser.test.js b/test/fontParser.test.js new file mode 100644 index 000000000..0302466b9 --- /dev/null +++ b/test/fontParser.test.js @@ -0,0 +1,118 @@ +/* eslint-env mocha */ + +'use strict' + +/** + * Module dependencies. + */ +const assert = require('assert') +const {Canvas} = require('..'); + +const tests = [ + '20px Arial', + { size: 20, families: ['arial'] }, + '20pt Arial', + { size: 26.666667461395264, families: ['arial'] }, + '20.5pt Arial', + { size: 27.333334147930145, families: ['arial'] }, + '20% Arial', + { size: 3.1999999284744263, families: ['arial'] }, + '20mm Arial', + { size: 75.59999942779541, families: ['arial'] }, + '20px serif', + { size: 20, families: ['serif'] }, + '20px sans-serif', + { size: 20, families: ['sans-serif'] }, + '20px monospace', + { size: 20, families: ['monospace'] }, + '50px Arial, sans-serif', + { size: 50, families: ['arial', 'sans-serif'] }, + 'bold italic 50px Arial, sans-serif', + { style: 1, weight: 700, size: 50, families: ['arial', 'sans-serif'] }, + '50px Helvetica , Arial, sans-serif', + { size: 50, families: ['helvetica', 'arial', 'sans-serif'] }, + '50px "Helvetica Neue", sans-serif', + { size: 50, families: ['Helvetica Neue', 'sans-serif'] }, + '50px "Helvetica Neue", "foo bar baz" , sans-serif', + { size: 50, families: ['Helvetica Neue', 'foo bar baz', 'sans-serif'] }, + "50px 'Helvetica Neue'", + { size: 50, families: ['Helvetica Neue'] }, + 'italic 20px Arial', + { size: 20, style: 1, families: ['arial'] }, + 'oblique 20px Arial', + { size: 20, style: 2, families: ['arial'] }, + 'normal 20px Arial', + { size: 20, families: ['arial'] }, + '300 20px Arial', + { size: 20, weight: 300, families: ['arial'] }, + '800 20px Arial', + { size: 20, weight: 800, families: ['arial'] }, + 'bolder 20px Arial', + { size: 20, weight: 700, families: ['arial'] }, + 'lighter 20px Arial', + { size: 20, weight: 100, families: ['arial'] }, + 'normal normal normal 16px Impact', + { size: 16, families: ['impact'] }, + 'italic small-caps bolder 16px cursive', + { size: 16, style: 1, variant: 1, weight: 700, families: ['cursive'] }, + '20px "new century schoolbook", serif', + { size: 20, families: ['new century schoolbook', 'serif'] }, + '20px "Arial bold 300"', // synthetic case with weight keyword inside family + { size: 20, families: ['Arial bold 300'] }, + `50px "Helvetica 'Neue'", "foo \\"bar\\" baz" , "Someone's weird \\'edge\\' case", sans-serif`, + { size: 50, families: [`Helvetica 'Neue'`, 'foo "bar" baz', `Someone's weird 'edge' case`, 'sans-serif'] }, + 'Helvetica, sans', + undefined, + '123px thefont/123abc', + undefined, + '123px /\tnormal thefont', + {size: 123, families: ['thefont']}, + '12px/1.2whoops arial', + undefined, + 'bold bold 12px thefont', + undefined, + 'italic italic 12px Arial', + undefined, + 'small-caps bold italic small-caps 12px Arial', + undefined, + 'small-caps bold oblique 12px \'A\'ri\\61l', + {size: 12, style: 2, weight: 700, variant: 1, families: ['Arial']}, + '12px/34% "The\\\n Word"', + {size: 12, families: ['The Word']}, + '', + undefined, + 'normal normal normal 1%/normal a , \'b\'', + {size: 0.1599999964237213, families: ['a', 'b']}, + 'normalnormalnormal 1px/normal a', + undefined, + '12px _the_font', + {size: 12, families: ['_the_font']}, + '9px 7 birds', + undefined, + '2em "Courier', + undefined, + `2em \\'Courier\\"`, + {size: 32, families: ['\'courier"']}, + '1px \\10abcde', + {size: 1, families: [String.fromCodePoint(parseInt('10abcd', 16)) + 'e']}, + '3E+2 1e-1px yay', + {weight: 300, size: 0.1, families: ['yay']} +]; + +describe('Font parser', function () { + for (let i = 0; i < tests.length; i++) { + const str = tests[i++] + it(str, function () { + const expected = tests[i] + const actual = Canvas.parseFont(str) + + if (expected) { + if (expected.style == null) expected.style = 0 + if (expected.weight == null) expected.weight = 400 + if (expected.variant == null) expected.variant = 0 + } + + assert.deepEqual(actual, expected) + }) + } +})