From 24d0403dc80dab48e99933aa661c73ca19fc95a9 Mon Sep 17 00:00:00 2001 From: Caleb Hearon Date: Thu, 26 Dec 2024 15:26:35 -0500 Subject: [PATCH] temp --- CHANGELOG.md | 2 + binding.gyp | 3 +- index.js | 3 - src/Canvas.cc | 38 ++- src/Canvas.h | 1 + src/CanvasRenderingContext2d.cc | 63 ++-- src/CharData.h | 231 +++++++++++++ src/FontParser.cc | 589 ++++++++++++++++++++++++++++++++ src/FontParser.h | 117 +++++++ test/canvas.test.js | 64 ++-- 10 files changed, 1042 insertions(+), 69 deletions(-) create mode 100644 src/CharData.h create mode 100644 src/FontParser.cc create mode 100644 src/FontParser.h diff --git a/CHANGELOG.md b/CHANGELOG.md index c835d3739..96b6d4e36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ project adheres to [Semantic Versioning](http://semver.org/). (Unreleased) ================== ### Changed +* `ctx.font` is now parsed natively, speeding up performance by over 500x + ### Added ### Fixed diff --git a/binding.gyp b/binding.gyp index 166842641..bf647f7d1 100644 --- a/binding.gyp +++ b/binding.gyp @@ -75,7 +75,8 @@ 'src/Image.cc', 'src/ImageData.cc', 'src/init.cc', - 'src/register_font.cc' + 'src/register_font.cc', + 'src/FontParser.cc' ], 'conditions': [ ['OS=="win"', { diff --git a/index.js b/index.js index 89f2daabc..adde4da12 100644 --- a/index.js +++ b/index.js @@ -2,7 +2,6 @@ const Canvas = require('./lib/canvas') const Image = require('./lib/image') const CanvasRenderingContext2D = require('./lib/context2d') const CanvasPattern = require('./lib/pattern') -const parseFont = require('./lib/parse-font') const packageJson = require('./package.json') const bindings = require('./lib/bindings') const fs = require('fs') @@ -12,7 +11,6 @@ const JPEGStream = require('./lib/jpegstream') const { DOMPoint, DOMMatrix } = require('./lib/DOMMatrix') bindings.setDOMMatrix(DOMMatrix) -bindings.setParseFont(parseFont) function createCanvas (width, height, type) { return new Canvas(width, height, type) @@ -73,7 +71,6 @@ exports.DOMPoint = 
DOMPoint exports.registerFont = registerFont exports.deregisterAllFonts = deregisterAllFonts -exports.parseFont = parseFont exports.createCanvas = createCanvas exports.createImageData = createImageData diff --git a/src/Canvas.cc b/src/Canvas.cc index 6ba312008..ed9afea3a 100644 --- a/src/Canvas.cc +++ b/src/Canvas.cc @@ -21,6 +21,7 @@ #include "Util.h" #include #include "node_buffer.h" +#include "FontParser.h" #ifdef HAVE_JPEG #include "JPEGStream.h" @@ -68,7 +69,8 @@ Canvas::Initialize(Napi::Env& env, Napi::Object& exports) { StaticValue("PNG_FILTER_PAETH", Napi::Number::New(env, PNG_FILTER_PAETH), napi_default_jsproperty), StaticValue("PNG_ALL_FILTERS", Napi::Number::New(env, PNG_ALL_FILTERS), napi_default_jsproperty), StaticMethod<&Canvas::RegisterFont>("_registerFont", napi_default_method), - StaticMethod<&Canvas::DeregisterAllFonts>("_deregisterAllFonts", napi_default_method) + StaticMethod<&Canvas::DeregisterAllFonts>("_deregisterAllFonts", napi_default_method), + StaticMethod<&Canvas::ParseFont>("parseFont", napi_default_method) }); data->CanvasCtor = Napi::Persistent(ctor); @@ -749,6 +751,40 @@ Canvas::DeregisterAllFonts(const Napi::CallbackInfo& info) { if (!success) Napi::Error::New(env, "Could not deregister one or more fonts").ThrowAsJavaScriptException(); } +/* + * This is only exported for testing + */ +Napi::Value +Canvas::ParseFont(const Napi::CallbackInfo& info) { + Napi::Env env = info.Env(); + + if (info.Length() != 1) return env.Undefined(); + + Napi::String str; + if (!info[0].ToString().UnwrapTo(&str)) return env.Undefined(); + + bool ok; + auto props = FontParser::parse(str, &ok); + if (!ok) return env.Undefined(); + + Napi::Object obj = Napi::Object::New(env); + obj.Set("size", Napi::Number::New(env, props.fontSize)); + Napi::Array families = Napi::Array::New(env); + obj.Set("families", families); + + unsigned int index = 0; + + for (auto& family : props.fontFamily) { + families[index++] = Napi::String::New(env, family); + } + + 
obj.Set("weight", Napi::Number::New(env, props.fontWeight)); + obj.Set("variant", Napi::Number::New(env, static_cast(props.fontVariant))); + obj.Set("style", Napi::Number::New(env, static_cast(props.fontStyle))); + + return obj; +} + /* * Get a PangoStyle from a CSS string (like "italic") */ diff --git a/src/Canvas.h b/src/Canvas.h index 5f35b356b..5b039539a 100644 --- a/src/Canvas.h +++ b/src/Canvas.h @@ -68,6 +68,7 @@ class Canvas : public Napi::ObjectWrap { void StreamJPEGSync(const Napi::CallbackInfo& info); static void RegisterFont(const Napi::CallbackInfo& info); static void DeregisterAllFonts(const Napi::CallbackInfo& info); + static Napi::Value ParseFont(const Napi::CallbackInfo& info); Napi::Error CairoError(cairo_status_t status); static void ToPngBufferAsync(Closure* closure); static void ToJpegBufferAsync(Closure* closure); diff --git a/src/CanvasRenderingContext2d.cc b/src/CanvasRenderingContext2d.cc index d0966e299..19d56bcc4 100644 --- a/src/CanvasRenderingContext2d.cc +++ b/src/CanvasRenderingContext2d.cc @@ -9,6 +9,7 @@ #include "CanvasGradient.h" #include "CanvasPattern.h" #include "InstanceData.h" +#include "FontParser.h" #include #include #include "Image.h" @@ -2575,54 +2576,54 @@ Context2d::GetFont(const Napi::CallbackInfo& info) { void Context2d::SetFont(const Napi::CallbackInfo& info, const Napi::Value& value) { - InstanceData* data = env.GetInstanceData(); - if (!value.IsString()) return; - if (!value.As().Utf8Value().length()) return; - - Napi::Value mparsed; - - // parseFont returns undefined for invalid CSS font strings - if (!data->parseFont.Call({ value }).UnwrapTo(&mparsed) || mparsed.IsUndefined()) return; + std::string str = value.As().Utf8Value(); + if (!str.length()) return; - Napi::Object font = mparsed.As(); - - Napi::String empty = Napi::String::New(env, ""); - Napi::Number zero = Napi::Number::New(env, 0); - - std::string weight = font.Get("weight").UnwrapOr(empty).ToString().UnwrapOr(empty).Utf8Value(); - std::string style = 
font.Get("style").UnwrapOr(empty).ToString().UnwrapOr(empty).Utf8Value(); - double size = font.Get("size").UnwrapOr(zero).ToNumber().UnwrapOr(zero).DoubleValue(); - std::string unit = font.Get("unit").UnwrapOr(empty).ToString().UnwrapOr(empty).Utf8Value(); - std::string family = font.Get("family").UnwrapOr(empty).ToString().UnwrapOr(empty).Utf8Value(); + bool success; + auto props = FontParser::parse(str, &success); + if (!success) return; PangoFontDescription *desc = pango_font_description_copy(state->fontDescription); pango_font_description_free(state->fontDescription); - pango_font_description_set_style(desc, Canvas::GetStyleFromCSSString(style.c_str())); - pango_font_description_set_weight(desc, Canvas::GetWeightFromCSSString(weight.c_str())); + // Map our FontStyle enum to Pango style + PangoStyle style = PANGO_STYLE_NORMAL; + switch (props.fontStyle) { + case FontStyle::Italic: + style = PANGO_STYLE_ITALIC; + break; + case FontStyle::Oblique: + style = PANGO_STYLE_OBLIQUE; + break; + default: + break; + } + pango_font_description_set_style(desc, style); + + // Weight is already in the correct range (1-1000) + pango_font_description_set_weight(desc, static_cast(props.fontWeight)); - if (family.length() > 0) { - // See #1643 - Pango understands "sans" whereas CSS uses "sans-serif" - std::string s1(family); - std::string s2("sans-serif"); - if (streq_casein(s1, s2)) { - pango_font_description_set_family(desc, "sans"); - } else { - pango_font_description_set_family(desc, family.c_str()); - } + // Join font families with commas + std::string family = props.fontFamily.empty() ? 
"" : props.fontFamily[0]; + for (size_t i = 1; i < props.fontFamily.size(); i++) { + family += "," + props.fontFamily[i]; } + pango_font_description_set_family(desc, family.c_str()); + + // Convert font size to Pango units (points * PANGO_SCALE) + pango_font_description_set_size(desc, props.fontSize * PANGO_SCALE); PangoFontDescription *sys_desc = Canvas::ResolveFontDescription(desc); pango_font_description_free(desc); - if (size > 0) pango_font_description_set_absolute_size(sys_desc, size * PANGO_SCALE); + pango_font_description_set_absolute_size(sys_desc, props.fontSize * PANGO_SCALE); state->fontDescription = sys_desc; pango_layout_set_font_description(_layout, sys_desc); - state->font = value.As().Utf8Value().c_str(); + state->font = str; } /* diff --git a/src/CharData.h b/src/CharData.h new file mode 100644 index 000000000..ebc2dd5e1 --- /dev/null +++ b/src/CharData.h @@ -0,0 +1,231 @@ +// This is used for classifying characters according to the definition of tokens +// in the CSS standards, but could be extended for any other future uses + +#pragma once + +namespace CharData { + static constexpr uint8_t Whitespace = 0x1; + static constexpr uint8_t Newline = 0x2; + static constexpr uint8_t Hex = 0x4; + static constexpr uint8_t Nmstart = 0x8; + static constexpr uint8_t Nmchar = 0x10; + static constexpr uint8_t Sign = 0x20; + static constexpr uint8_t Digit = 0x40; + static constexpr uint8_t NumStart = 0x80; +}; + +using namespace CharData; + +constexpr const uint8_t charData[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0-8 + Whitespace, // 9 (HT) + Whitespace | Newline, // 10 (LF) + 0, // 11 (VT) + Whitespace | Newline, // 12 (FF) + Whitespace | Newline, // 13 (CR) + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 14-31 + Whitespace, // 32 (Space) + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 33-42 + Sign | NumStart, // 43 (+) + 0, // 44 + Nmchar | Sign | NumStart, // 45 (-) + 0, 0, // 46-47 + Nmchar | Digit | NumStart | Hex, // 48 (0) + Nmchar | Digit | NumStart | 
Hex, // 49 (1) + Nmchar | Digit | NumStart | Hex, // 50 (2) + Nmchar | Digit | NumStart | Hex, // 51 (3) + Nmchar | Digit | NumStart | Hex, // 52 (4) + Nmchar | Digit | NumStart | Hex, // 53 (5) + Nmchar | Digit | NumStart | Hex, // 54 (6) + Nmchar | Digit | NumStart | Hex, // 55 (7) + Nmchar | Digit | NumStart | Hex, // 56 (8) + Nmchar | Digit | NumStart | Hex, // 57 (9) + 0, 0, 0, 0, 0, 0, 0, // 58-64 + Nmstart | Nmchar | Hex, // 65 (A) + Nmstart | Nmchar | Hex, // 66 (B) + Nmstart | Nmchar | Hex, // 67 (C) + Nmstart | Nmchar | Hex, // 68 (D) + Nmstart | Nmchar | Hex, // 69 (E) + Nmstart | Nmchar | Hex, // 70 (F) + Nmstart | Nmchar, // 71 (G) + Nmstart | Nmchar, // 72 (H) + Nmstart | Nmchar, // 73 (I) + Nmstart | Nmchar, // 74 (J) + Nmstart | Nmchar, // 75 (K) + Nmstart | Nmchar, // 76 (L) + Nmstart | Nmchar, // 77 (M) + Nmstart | Nmchar, // 78 (N) + Nmstart | Nmchar, // 79 (O) + Nmstart | Nmchar, // 80 (P) + Nmstart | Nmchar, // 81 (Q) + Nmstart | Nmchar, // 82 (R) + Nmstart | Nmchar, // 83 (S) + Nmstart | Nmchar, // 84 (T) + Nmstart | Nmchar, // 85 (U) + Nmstart | Nmchar, // 86 (V) + Nmstart | Nmchar, // 87 (W) + Nmstart | Nmchar, // 88 (X) + Nmstart | Nmchar, // 89 (Y) + Nmstart | Nmchar, // 90 (Z) + 0, // 91 + Nmstart, // 92 (\) + 0, 0, // 93-94 + Nmstart | Nmchar, // 95 (_) + 0, // 96 + Nmstart | Nmchar | Hex, // 97 (a) + Nmstart | Nmchar | Hex, // 98 (b) + Nmstart | Nmchar | Hex, // 99 (c) + Nmstart | Nmchar | Hex, // 100 (d) + Nmstart | Nmchar | Hex, // 101 (e) + Nmstart | Nmchar | Hex, // 102 (f) + Nmstart | Nmchar, // 103 (g) + Nmstart | Nmchar, // 104 (h) + Nmstart | Nmchar, // 105 (i) + Nmstart | Nmchar, // 106 (j) + Nmstart | Nmchar, // 107 (k) + Nmstart | Nmchar, // 108 (l) + Nmstart | Nmchar, // 109 (m) + Nmstart | Nmchar, // 110 (n) + Nmstart | Nmchar, // 111 (o) + Nmstart | Nmchar, // 112 (p) + Nmstart | Nmchar, // 113 (q) + Nmstart | Nmchar, // 114 (r) + Nmstart | Nmchar, // 115 (s) + Nmstart | Nmchar, // 116 (t) + Nmstart | Nmchar, // 117 (u) + 
Nmstart | Nmchar, // 118 (v) + Nmstart | Nmchar, // 119 (w) + Nmstart | Nmchar, // 120 (x) + Nmstart | Nmchar, // 121 (y) + Nmstart | Nmchar, // 122 (z) + 0, 0, 0, 0, 0, // 123-127 + // Non-ASCII + Nmstart | Nmchar, // 128 + Nmstart | Nmchar, // 129 + Nmstart | Nmchar, // 130 + Nmstart | Nmchar, // 131 + Nmstart | Nmchar, // 132 + Nmstart | Nmchar, // 133 + Nmstart | Nmchar, // 134 + Nmstart | Nmchar, // 135 + Nmstart | Nmchar, // 136 + Nmstart | Nmchar, // 137 + Nmstart | Nmchar, // 138 + Nmstart | Nmchar, // 139 + Nmstart | Nmchar, // 140 + Nmstart | Nmchar, // 141 + Nmstart | Nmchar, // 142 + Nmstart | Nmchar, // 143 + Nmstart | Nmchar, // 144 + Nmstart | Nmchar, // 145 + Nmstart | Nmchar, // 146 + Nmstart | Nmchar, // 147 + Nmstart | Nmchar, // 148 + Nmstart | Nmchar, // 149 + Nmstart | Nmchar, // 150 + Nmstart | Nmchar, // 151 + Nmstart | Nmchar, // 152 + Nmstart | Nmchar, // 153 + Nmstart | Nmchar, // 154 + Nmstart | Nmchar, // 155 + Nmstart | Nmchar, // 156 + Nmstart | Nmchar, // 157 + Nmstart | Nmchar, // 158 + Nmstart | Nmchar, // 159 + Nmstart | Nmchar, // 160 + Nmstart | Nmchar, // 161 + Nmstart | Nmchar, // 162 + Nmstart | Nmchar, // 163 + Nmstart | Nmchar, // 164 + Nmstart | Nmchar, // 165 + Nmstart | Nmchar, // 166 + Nmstart | Nmchar, // 167 + Nmstart | Nmchar, // 168 + Nmstart | Nmchar, // 169 + Nmstart | Nmchar, // 170 + Nmstart | Nmchar, // 171 + Nmstart | Nmchar, // 172 + Nmstart | Nmchar, // 173 + Nmstart | Nmchar, // 174 + Nmstart | Nmchar, // 175 + Nmstart | Nmchar, // 176 + Nmstart | Nmchar, // 177 + Nmstart | Nmchar, // 178 + Nmstart | Nmchar, // 179 + Nmstart | Nmchar, // 180 + Nmstart | Nmchar, // 181 + Nmstart | Nmchar, // 182 + Nmstart | Nmchar, // 183 + Nmstart | Nmchar, // 184 + Nmstart | Nmchar, // 185 + Nmstart | Nmchar, // 186 + Nmstart | Nmchar, // 187 + Nmstart | Nmchar, // 188 + Nmstart | Nmchar, // 189 + Nmstart | Nmchar, // 190 + Nmstart | Nmchar, // 191 + Nmstart | Nmchar, // 192 + Nmstart | Nmchar, // 193 + Nmstart | Nmchar, 
// 194 + Nmstart | Nmchar, // 195 + Nmstart | Nmchar, // 196 + Nmstart | Nmchar, // 197 + Nmstart | Nmchar, // 198 + Nmstart | Nmchar, // 199 + Nmstart | Nmchar, // 200 + Nmstart | Nmchar, // 201 + Nmstart | Nmchar, // 202 + Nmstart | Nmchar, // 203 + Nmstart | Nmchar, // 204 + Nmstart | Nmchar, // 205 + Nmstart | Nmchar, // 206 + Nmstart | Nmchar, // 207 + Nmstart | Nmchar, // 208 + Nmstart | Nmchar, // 209 + Nmstart | Nmchar, // 210 + Nmstart | Nmchar, // 211 + Nmstart | Nmchar, // 212 + Nmstart | Nmchar, // 213 + Nmstart | Nmchar, // 214 + Nmstart | Nmchar, // 215 + Nmstart | Nmchar, // 216 + Nmstart | Nmchar, // 217 + Nmstart | Nmchar, // 218 + Nmstart | Nmchar, // 219 + Nmstart | Nmchar, // 220 + Nmstart | Nmchar, // 221 + Nmstart | Nmchar, // 222 + Nmstart | Nmchar, // 223 + Nmstart | Nmchar, // 224 + Nmstart | Nmchar, // 225 + Nmstart | Nmchar, // 226 + Nmstart | Nmchar, // 227 + Nmstart | Nmchar, // 228 + Nmstart | Nmchar, // 229 + Nmstart | Nmchar, // 230 + Nmstart | Nmchar, // 231 + Nmstart | Nmchar, // 232 + Nmstart | Nmchar, // 233 + Nmstart | Nmchar, // 234 + Nmstart | Nmchar, // 235 + Nmstart | Nmchar, // 236 + Nmstart | Nmchar, // 237 + Nmstart | Nmchar, // 238 + Nmstart | Nmchar, // 239 + Nmstart | Nmchar, // 240 + Nmstart | Nmchar, // 241 + Nmstart | Nmchar, // 242 + Nmstart | Nmchar, // 243 + Nmstart | Nmchar, // 244 + Nmstart | Nmchar, // 245 + Nmstart | Nmchar, // 246 + Nmstart | Nmchar, // 247 + Nmstart | Nmchar, // 248 + Nmstart | Nmchar, // 249 + Nmstart | Nmchar, // 250 + Nmstart | Nmchar, // 251 + Nmstart | Nmchar, // 252 + Nmstart | Nmchar, // 253 + Nmstart | Nmchar, // 254 + Nmstart | Nmchar // 255 +}; diff --git a/src/FontParser.cc b/src/FontParser.cc new file mode 100644 index 000000000..3865ee648 --- /dev/null +++ b/src/FontParser.cc @@ -0,0 +1,589 @@ +// This is written to exactly parse the `font` shorthand in CSS2: +// https://www.w3.org/TR/CSS22/fonts.html#font-shorthand +// https://www.w3.org/TR/CSS22/syndata.html#tokenization +// 
+// We may want to update it for CSS 3 (e.g. font-stretch, or updated +// tokenization) but I've only ever seen one or two issues filed in node-canvas +// due to parsing in my 8 years on the project +// +// Claude 3.5 was used for the initial classes and repetitive work, but its +// output was thoroughly checked and heavily changed. + +#include "FontParser.h" +#include "CharData.h" +#include +#include + +Token::Token(Type type, std::string value) : type_(type), value_(std::move(value)) {} + +Token::Token(Type type, double value) : type_(type), value_(value) {} + +Token::Token(Type type) : type_(type), value_(std::string{}) {} + +const std::string& +Token::getString() const { + static const std::string empty; + auto* str = std::get_if(&value_); + return str ? *str : empty; +} + +double +Token::getNumber() const { + auto* num = std::get_if(&value_); + return num ? *num : 0.0f; +} + +Tokenizer::Tokenizer(std::string_view input) : input_(input) {} + +std::string +Tokenizer::utf8Encode(uint32_t codepoint) { + std::string result; + + if (codepoint < 0x80) { + result += static_cast(codepoint); + } else if (codepoint < 0x800) { + result += static_cast((codepoint >> 6) | 0xc0); + result += static_cast((codepoint & 0x3f) | 0x80); + } else if (codepoint < 0x10000) { + result += static_cast((codepoint >> 12) | 0xe0); + result += static_cast(((codepoint >> 6) & 0x3f) | 0x80); + result += static_cast((codepoint & 0x3f) | 0x80); + } else { + result += static_cast((codepoint >> 18) | 0xf0); + result += static_cast(((codepoint >> 12) & 0x3f) | 0x80); + result += static_cast(((codepoint >> 6) & 0x3f) | 0x80); + result += static_cast((codepoint & 0x3f) | 0x80); + } + + return result; +} + +char +Tokenizer::peek() const { + return position_ < input_.length() ? input_[position_] : '\0'; +} + +char +Tokenizer::advance() { + return position_ < input_.length() ? 
input_[position_++] : '\0'; +} + +Token +Tokenizer::parseNumber() { + enum class State { + Start, + AfterSign, + Digits, + AfterDecimal, + AfterE, + AfterESign, + ExponentDigits + }; + + size_t start = position_; + State state = State::Start; + bool valid = false; + + while (position_ < input_.length()) { + char c = peek(); + uint8_t flags = charData[static_cast(c)]; + + switch (state) { + case State::Start: + if (flags & CharData::Sign) { + position_++; + state = State::AfterSign; + } else if (flags & CharData::Digit) { + position_++; + state = State::Digits; + valid = true; + } else if (c == '.') { + position_++; + state = State::AfterDecimal; + } else { + goto done; + } + break; + + case State::AfterSign: + if (flags & CharData::Digit) { + position_++; + state = State::Digits; + valid = true; + } else if (c == '.') { + position_++; + state = State::AfterDecimal; + } else { + goto done; + } + break; + + case State::Digits: + if (flags & CharData::Digit) { + position_++; + } else if (c == '.') { + position_++; + state = State::AfterDecimal; + } else if (c == 'e' || c == 'E') { + position_++; + state = State::AfterE; + valid = false; // Need exponent digits + } else { + goto done; + } + break; + + case State::AfterDecimal: + if (flags & CharData::Digit) { + position_++; + valid = true; + state = State::Digits; + } else { + goto done; + } + break; + + case State::AfterE: + if (flags & CharData::Sign) { + position_++; + state = State::AfterESign; + } else if (flags & CharData::Digit) { + position_++; + valid = true; + state = State::ExponentDigits; + } else { + goto done; + } + break; + + case State::AfterESign: + if (flags & CharData::Digit) { + position_++; + valid = true; + state = State::ExponentDigits; + } else { + goto done; + } + break; + + case State::ExponentDigits: + if (flags & CharData::Digit) { + position_++; + } else { + goto done; + } + break; + } + } + +done: + if (!valid) { + position_ = start; + return Token(Token::Type::Invalid); + } + + 
std::string number_str(input_.substr(start, position_ - start)); + double value = std::stod(number_str); + return Token(Token::Type::Number, value); +} + +// Note that identifiers are always lower-case. This helps us make easier/more +// efficient comparisons, but means that font-families specified as identifiers +// will be lower-cased. Since font selection isn't case sensitive, this +// shouldn't ever be a problem. +Token +Tokenizer::parseIdentifier() { + std::string identifier; + auto flags = CharData::Nmstart; + auto start = position_; + + while (position_ < input_.length()) { + char c = peek(); + + if (c == '\\') { + advance(); + if (!parseEscape(identifier)) { + position_ = start; + return Token(Token::Type::Invalid); + } + flags = CharData::Nmchar; + continue; + } + + if (charData[static_cast(c)] & flags) { + identifier += advance() + (c >= 'A' && c <= 'Z' ? 32 : 0); + flags = CharData::Nmchar; + } else { + break; + } + } + + return Token(Token::Type::Identifier, identifier); +} + +uint32_t +Tokenizer::parseUnicode() { + uint32_t value = 0; + size_t count = 0; + + while (position_ < input_.length() && count < 6) { + char c = peek(); + uint32_t digit; + + if (c >= '0' && c <= '9') { + digit = c - '0'; + } else if (c >= 'a' && c <= 'f') { + digit = c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + digit = c - 'A' + 10; + } else { + break; + } + + value = value * 16 + digit; + advance(); + count++; + } + + // Optional whitespace after hex escape + char c = peek(); + if (c == '\r') { + advance(); + if (peek() == '\n') advance(); + } else if (isWhitespace(c)) { + advance(); + } + + return value; +} + +bool +Tokenizer::parseEscape(std::string& str) { + char c = peek(); + auto flags = charData[static_cast(c)]; + + if (flags & CharData::Hex) { + str += utf8Encode(parseUnicode()); + return true; + } else if (!(flags & CharData::Newline) && !(flags & CharData::Hex)) { + str += advance(); + return true; + } + + return false; +} + +Token +Tokenizer::parseString(char 
quote) { + advance(); // consume opening quote + std::string value; + auto start = position_; + + while (position_ < input_.length()) { + char c = peek(); + + if (c == quote) { + advance(); // consume closing quote + break; + } else if (c == '\\') { + advance(); + c = peek(); + if (c == '\r') { + advance(); + if (peek() == '\n') advance(); + } else if (isNewline(c)) { + advance(); + } else { + if (!parseEscape(value)) { + position_ = start; + return Token(Token::Type::Invalid); + } + } + } else { + value += advance(); + } + } + + return Token(Token::Type::QuotedString, value); +} + +Token +Tokenizer::nextToken() { + if (position_ >= input_.length()) { + return Token(Token::Type::EndOfInput); + } + + char c = peek(); + auto flags = charData[static_cast(c)]; + + // Add handling for whitespace tokens + if (isWhitespace(c)) { + std::string whitespace; + while (position_ < input_.length() && isWhitespace(peek())) { + whitespace += advance(); + } + return Token(Token::Type::Whitespace, whitespace); + } + + if (flags & CharData::NumStart) { + Token token = parseNumber(); + if (token.type() != Token::Type::Invalid) return token; + } + if (flags & CharData::Nmstart || flags & 0x80) return parseIdentifier(); + if (c == '"') return parseString('"'); + if (c == '\'') return parseString('\''); + + advance(); // consume character + switch (c) { + case '/': return Token(Token::Type::Slash); + case ',': return Token(Token::Type::Comma); + case '%': return Token(Token::Type::Percent); + default: return Token(Token::Type::Invalid); + } +} + +FontParser::FontParser(std::string_view input) : tokenizer_(input) { + currentToken_ = tokenizer_.nextToken(); + nextToken_ = tokenizer_.nextToken(); +} + +const std::unordered_map FontParser::weightMap = { + {"normal", 400}, + {"bold", 700}, + {"lighter", 100}, + {"bolder", 700} +}; + +const std::unordered_map FontParser::unitMap = { + {"cm", 37.8f}, + {"mm", 3.78f}, + {"in", 96.0f}, + {"pt", 96.0f / 72.0f}, + {"pc", 96.0f / 6.0f}, + {"em", 
16.0f}, + {"px", 1.0f} +}; + +void +FontParser::advance() { + currentToken_ = nextToken_; + nextToken_ = tokenizer_.nextToken(); +} + +void +FontParser::skipWs() { + while (currentToken_.type() == Token::Type::Whitespace) advance(); +} + +bool +FontParser::check(Token::Type type) const { + return currentToken_.type() == type; +} + +bool +FontParser::checkWs() const { + return nextToken_.type() == Token::Type::Whitespace + || nextToken_.type() == Token::Type::EndOfInput; +} + +bool +FontParser::parseFontStyle(FontProperties& props) { + if (check(Token::Type::Identifier)) { + const auto& value = currentToken_.getString(); + if (value == "italic") { + props.fontStyle = FontStyle::Italic; + advance(); + return true; + } else if (value == "oblique") { + props.fontStyle = FontStyle::Oblique; + advance(); + return true; + } else if (value == "normal") { + props.fontStyle = FontStyle::Normal; + advance(); + return true; + } + } + + return false; +} + +bool +FontParser::parseFontVariant(FontProperties& props) { + if (check(Token::Type::Identifier)) { + const auto& value = currentToken_.getString(); + if (value == "small-caps") { + props.fontVariant = FontVariant::SmallCaps; + advance(); + return true; + } else if (value == "normal") { + props.fontVariant = FontVariant::Normal; + advance(); + return true; + } + } + + return false; +} + +bool +FontParser::parseFontWeight(FontProperties& props) { + if (check(Token::Type::Number)) { + double weightFloat = currentToken_.getNumber(); + int weight = static_cast(weightFloat); + if (weight < 1 || weight > 1000) return false; + props.fontWeight = static_cast(weight); + advance(); + return true; + } else if (check(Token::Type::Identifier)) { + const auto& value = currentToken_.getString(); + + if (auto it = weightMap.find(value); it != weightMap.end()) { + props.fontWeight = it->second; + advance(); + return true; + } + } + + return false; +} + +bool +FontParser::parseFontSize(FontProperties& props) { + if 
(!check(Token::Type::Number)) return false;
+
+  props.fontSize = currentToken_.getNumber();
+  advance();
+
+  double multiplier = 1.0f;
+  if (check(Token::Type::Identifier)) {
+    const auto& unit = currentToken_.getString();
+
+    if (auto it = unitMap.find(unit); it != unitMap.end()) {
+      multiplier = it->second;
+      advance();
+    } else {
+      return false;
+    }
+  } else if (check(Token::Type::Percent)) {
+    multiplier = 16.0f / 100.0f;
+    advance();
+  } else {
+    return false;
+  }
+
+  // Technically if we consumed some tokens but couldn't parse the font-size,
+  // we should rewind the tokenizer, but I don't think the grammar allows for
+  // any valid alternates in this specific case
+
+  props.fontSize *= multiplier;
+  return true;
+}
+
+// line-height is not used by canvas ever, but should still parse. Accepted
+// forms: `/ normal`, `/ <number>`, `/ <number>%` and `/ <number><unit>`.
+bool
+FontParser::parseLineHeight(FontProperties& props) {
+  skipWs();
+  if (!check(Token::Type::Slash)) return true; // line-height is optional
+
+  advance();
+  skipWs();
+
+  if (check(Token::Type::Number)) {
+    advance();
+    // As in parseFontSize, the unit is its own token directly after the
+    // number; consume it so that it can't leak into the first font family
+    if (check(Token::Type::Percent)) {
+      advance();
+    } else if (check(Token::Type::Identifier)) {
+      if (unitMap.find(currentToken_.getString()) == unitMap.end()) return false;
+      advance();
+    }
+  } else if (check(Token::Type::Identifier) && currentToken_.getString() == "normal") {
+    advance();
+  } else {
+    return false;
+  }
+
+  skipWs();
+  return true;
+}
+
+// Parses the comma-separated family list that ends the shorthand. At least
+// one family is required, and only the end of input may follow the list.
+bool
+FontParser::parseFontFamily(FontProperties& props) {
+  for (;;) {
+    std::string family;
+    std::string trailingWs;
+    bool found = false;
+
+    while (
+      check(Token::Type::QuotedString) ||
+      check(Token::Type::Identifier) ||
+      check(Token::Type::Whitespace)
+    ) {
+      if (check(Token::Type::Whitespace)) {
+        // held back: only whitespace between two words ends up in the name
+        if (found) trailingWs += currentToken_.getString();
+      } else { // Identifier, QuotedString
+        family += trailingWs; // empty unless a word was already seen
+        trailingWs.clear();
+        family += currentToken_.getString();
+        found = true;
+      }
+      advance();
+    }
+
+    if (!found) return false; // only whitespace or non-id/string found
+    props.fontFamily.push_back(family);
+
+    // a comma promises another family, so go around again to require one
+    if (!check(Token::Type::Comma)) break;
+    advance();
+  }
+
+  // leftovers (an invalid token, a stray number, ...) make the font invalid
+  return check(Token::Type::EndOfInput);
+}
+
+// Parses a CSS font shorthand string. When `success` is non-null it receives
+// whether parsing succeeded; on failure the returned properties may be partial.
+FontProperties
+FontParser::parse(const std::string& fontString, bool* success) {
+  FontParser parser(fontString);
+  auto result = parser.parseFont();
+  if (success) *success = !parser.hasError_;
+  return result;
+}
+
+// Grammar driver: up to three leading style/variant/weight values in any
+// order (each at most once), then the size, line-height and family list.
+FontProperties
+FontParser::parseFont() {
+  FontProperties props;
+  uint8_t state = 0b111; // one bit per leading property not yet matched
+
+  skipWs();
+
+  for (size_t i = 0; i < 3 && checkWs(); i++) {
+    if ((state & 0b001) && parseFontStyle(props)) {
+      state &= 0b110;
+    } else if ((state & 0b010) && parseFontVariant(props)) {
+      state &= 0b101;
+    } else if ((state & 0b100) && parseFontWeight(props)) {
+      state &= 0b011;
+    } else {
+      break; // all attempts exhausted
+    }
+
+    skipWs(); // success: move to the next non-ws token
+  }
+
+  if (!parseFontSize(props) || !parseLineHeight(props) || !parseFontFamily(props)) {
+    hasError_ = true;
+  }
+
+  return props;
+}
diff --git a/src/FontParser.h b/src/FontParser.h
new file mode 100644
index 000000000..62e9f9cea
--- /dev/null
+++ b/src/FontParser.h
@@ -0,0 +1,123 @@
+#pragma once
+
+// NOTE(review): the <...> header names and template arguments in this file
+// were missing from this copy and are restored from usage; confirm upstream.
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <variant>
+#include <vector>
+#include "CharData.h"
+
+enum class FontStyle {
+  Normal,
+  Italic,
+  Oblique
+};
+
+enum class FontVariant {
+  Normal,
+  SmallCaps
+};
+
+// Result of parsing a font shorthand; members keep these defaults unless set
+struct FontProperties {
+  double fontSize{16.0f};
+  std::vector<std::string> fontFamily;
+  uint16_t fontWeight{400};
+  FontVariant fontVariant{FontVariant::Normal};
+  FontStyle fontStyle{FontStyle::Normal};
+};
+
+// One lexical token: its type plus an optional string or number payload
+class Token {
+  public:
+    enum class Type {
+      Invalid,
+      Number,
+      Percent,
+      Identifier,
+      Slash,
+      Comma,
+      QuotedString,
+      Whitespace,
+      EndOfInput
+    };
+
+    Token(Type type, std::string value);
+    Token(Type type, double value);
+    Token(Type type); // For tokens without values like Slash, Comma, EndOfInput
+    Token(): Token(Type::Identifier) {}
+
+    Type type() const { return type_; }
+
+    // Value accessors
+    const std::string& getString() const; // Returns empty string if not string
+    double getNumber() const; // Returns 0.0f if not number
+
+  private:
+    Type type_;
+    std::variant<std::string, double> value_;
+};
+
+// Lexer over `input_`; nextToken() hands out the Tokens one call at a time
+class Tokenizer {
+  public:
+    Tokenizer(std::string_view input);
+    Token nextToken();
+
+  private:
+    std::string_view input_;
+    size_t position_{0};
+
+    // Util
+    std::string utf8Encode(uint32_t codepoint);
+    bool isWhitespace(char c) const {
+      return charData[static_cast<uint8_t>(c)] & CharData::Whitespace;
+    }
+    bool isNewline(char c) const {
+      return charData[static_cast<uint8_t>(c)] & CharData::Newline;
+    }
+
+    // Moving through the string
+    char peek() const;
+    char advance();
+
+    // Tokenize
+    Token parseNumber();
+    Token parseIdentifier();
+    uint32_t parseUnicode();
+    bool parseEscape(std::string& str);
+    Token parseString(char quote);
+};
+
+// Parser for the CSS font shorthand; the static parse() is the only public entry
+class FontParser {
+  public:
+    static FontProperties parse(const std::string& fontString, bool* success = nullptr);
+
+  private:
+    static const std::unordered_map<std::string, uint16_t> weightMap;
+    static const std::unordered_map<std::string, double> unitMap;
+
+    FontParser(std::string_view input);
+
+    void advance();
+    void skipWs();
+    bool check(Token::Type type) const;
+    bool checkWs() const;
+
+    bool parseFontStyle(FontProperties& props);
+    bool parseFontVariant(FontProperties& props);
+    bool parseFontWeight(FontProperties& props);
+    bool parseFontSize(FontProperties& props);
+    bool parseLineHeight(FontProperties& props);
+    bool parseFontFamily(FontProperties& props);
+    FontProperties parseFont();
+
+    Token currentToken_;
+    Token nextToken_;
+    Tokenizer tokenizer_;
+    bool hasError_{false};
+};
diff --git a/test/canvas.test.js b/test/canvas.test.js
index 1a75ac031..de0023798 100644
--- a/test/canvas.test.js
+++ b/test/canvas.test.js
@@ -14,7 +14,6 @@ const {
   createCanvas,
   createImageData,
   loadImage,
-  parseFont,
   registerFont,
   Canvas,
   deregisterAllFonts
@@ -40,73 +39,78 @@ describe('Canvas', function () {
   it('.parseFont()', function () {
     const tests = [
       '20px Arial',
-      { size: 20, unit: 'px', family: 'Arial' },
+      { size: 20, families: ['arial'] },
       '20pt Arial',
-      { size: 26.666666666666668, unit: 'pt', family: 'Arial' },
+      { size: 26.666667461395264, families: ['arial'] },
       '20.5pt Arial',
-      { size: 27.333333333333332, unit: 'pt', family: 'Arial' },
+      { size: 27.333334147930145, families: ['arial'] },
       '20% Arial',
-      { size: 20, unit: '%', family: 'Arial' }, // TODO I think this is a bad assertion - ZB 23-Jul-2017
+      { size: 3.1999999284744263, families: ['arial'] },
       '20mm Arial',
-      { size: 75.59055118110237, unit: 'mm', family: 'Arial' },
+      { size: 75.59999942779541, families: ['arial'] },
       '20px serif',
-      { size: 20, unit: 'px', family: 'serif' },
+      { size: 20, families: ['serif'] },
       '20px sans-serif',
-      { size: 20, unit: 'px', family: 'sans-serif' },
+      { size: 20, families: ['sans-serif'] },
       '20px monospace',
-      { size: 20, unit: 'px', family: 'monospace' },
+      { size: 20, families: ['monospace'] },
       '50px Arial, sans-serif',
-      { size: 50, unit: 'px', family: 'Arial,sans-serif' },
+      { size: 50, families: ['arial', 'sans-serif'] },
       'bold italic 50px Arial, sans-serif',
-      { style: 'italic', weight: 'bold', size: 50, unit: 'px', family: 'Arial,sans-serif' },
+      { style: 1, weight: 700, size: 50, families: ['arial', 'sans-serif'] },
       '50px Helvetica , Arial, sans-serif',
-      { size: 50, unit: 'px', family: 'Helvetica,Arial,sans-serif' },
+      { size: 50, families: ['helvetica', 'arial', 'sans-serif'] },
       '50px "Helvetica Neue", sans-serif',
-      { size: 50, unit: 'px', family: 'Helvetica Neue,sans-serif' },
+      { size: 50, families: ['Helvetica Neue', 'sans-serif'] },
       '50px "Helvetica Neue", "foo bar baz" , sans-serif',
-      { size: 50, unit: 'px', family: 'Helvetica Neue,foo bar baz,sans-serif' },
+      { size: 50, families: ['Helvetica Neue', 'foo bar baz', 'sans-serif'] },
       "50px 'Helvetica Neue'",
-      { size: 50, unit: 'px', family: 'Helvetica Neue' },
+      { size: 50, families: ['Helvetica Neue'] },
       'italic 20px Arial',
-      { size: 20, unit: 'px', style: 'italic', family: 'Arial' },
+      { size: 20, style: 1, families: ['arial'] },
       'oblique 20px Arial',
-      { size: 20, unit: 'px', style: 'oblique', family: 'Arial' },
+      { size: 20, style: 2, families: ['arial'] },
       'normal 20px Arial',
-      { size: 20, unit: 'px', style: 'normal', family: 'Arial' },
+      { size: 20, families: ['arial'] },
       '300 20px Arial',
-      { size: 20, unit: 'px', weight: '300', family: 'Arial' },
+      { size: 20, weight: 300, families: ['arial'] },
       '800 20px Arial',
-      { size: 20, unit: 'px', weight: '800', family: 'Arial' },
+      { size: 20, weight: 800, families: ['arial'] },
       'bolder 20px Arial',
-      { size: 20, unit: 'px', weight: 'bolder', family: 'Arial' },
+      { size: 20, weight: 700, families: ['arial'] },
       'lighter 20px Arial',
-      { size: 20, unit: 'px', weight: 'lighter', family: 'Arial' },
+      { size: 20, weight: 100, families: ['arial'] },
       'normal normal normal 16px Impact',
-      { size: 16, unit: 'px', weight: 'normal', family: 'Impact', style: 'normal', variant: 'normal' },
+      { size: 16, families: ['impact'] },
       'italic small-caps bolder 16px cursive',
-      { size: 16, unit: 'px', style: 'italic', variant: 'small-caps', weight: 'bolder', family: 'cursive' },
+      { size: 16, style: 1, variant: 1, weight: 700, families: ['cursive'] },
       '20px "new century schoolbook", serif',
-      { size: 20, unit: 'px', family: 'new century schoolbook,serif' },
+      { size: 20, families: ['new century schoolbook', 'serif'] },
       '20px "Arial bold 300"', // synthetic case with weight keyword inside family
-      { size: 20, unit: 'px', family: 'Arial bold 300', variant: 'normal' },
+      { size: 20, families: ['Arial bold 300'] },
       `50px "Helvetica 'Neue'", "foo \\"bar\\" baz" , "Someone's weird \\'edge\\' case", sans-serif`,
-      { size: 50, unit: 'px', family: `Helvetica 'Neue',foo "bar" baz,Someone's weird 'edge' case,sans-serif` }
+      { size: 50, families: [`Helvetica 'Neue'`, 'foo "bar" baz', `Someone's weird 'edge' case`, 'sans-serif'] },
+      '20px/24px Arial', // line-height unit must not leak into the family
+      { size: 20, families: ['arial'] },
+      '20px/normal Arial',
+      { size: 20, families: ['arial'] }
     ]
 
     for (let i = 0, len = tests.length; i < len; ++i) {
       const str = tests[i++]
       const expected = tests[i]
-      const actual = parseFont(str)
+      const actual = Canvas.parseFont(str)
 
-      if (!expected.style) expected.style = 'normal'
-      if (!expected.weight) expected.weight = 'normal'
-      if (!expected.stretch) expected.stretch = 'normal'
-      if (!expected.variant) expected.variant = 'normal'
+      if (expected.style == null) expected.style = 0
+      if (expected.weight == null) expected.weight = 400
+      if (expected.variant == null) expected.variant = 0
 
       assert.deepEqual(actual, expected, 'Failed to parse: ' + str)
     }
 
-    assert.strictEqual(parseFont('Helvetica, sans'), undefined)
+    assert.strictEqual(Canvas.parseFont('Helvetica, sans'), undefined)
+    assert.strictEqual(Canvas.parseFont('20px'), undefined) // a family is required
+    assert.strictEqual(Canvas.parseFont('20px Arial,'), undefined) // dangling comma
   })
 
   it('registerFont', function () {