Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Encodings #2435

Merged
merged 16 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lib/LaTeXML/Core/Definition.pm
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ sub isRegister {
# Default predicate for generic definitions: returns the empty string (false).
# Per the inline note, only the FontDef class is expected to answer otherwise.
sub isFontDef { # ONLY FontDef handles this!
return ''; }

# Default predicate for generic definitions: returns the empty string (false).
# Per the inline note, only the CharDef class is expected to answer otherwise.
sub isCharDef { # ONLY CharDef handles this!
return ''; }

# Default predicate: returns 0 (false), i.e. this definition is not a prefix.
# NOTE(review): presumably overridden by definitions that act as TeX prefixes — confirm.
sub isPrefix {
return 0; }

Expand Down
21 changes: 15 additions & 6 deletions lib/LaTeXML/Core/Definition/CharDef.pm
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,29 @@ use LaTeXML::Common::Error;
use LaTeXML::Core::Token;
use LaTeXML::Core::Tokens;
use LaTeXML::Core::Box;
use LaTeXML::Util::Unicode;
use base qw(LaTeXML::Core::Definition::Register);

# A CharDef is a specialized register;
# You can't assign it; when you invoke the control sequence, it returns
# the result of evaluating the character (more like a regular primitive).
# When $mode is 'math', interprets $value as a (3-part) mathcode, otherwise just index into current font.
# When $mathglyph is provided, it is the unicode corresponding to the \mathchar of $value
# Optionally provide the encoding, otherwise use current encoding when digested.
sub new {
my ($class, $cs, $mode, $value) = @_;
my ($class, $cs, $mode, $value, $encoding) = @_;
return bless { cs => $cs, parameters => undef,
mode => $mode, value => $value,
mode => $mode, value => $value, encoding => $encoding,
registerType => 'Number', readonly => 1,
locator => $STATE->getStomach->getGullet->getMouth->getLocator }, $class; }

# Accessor: returns whatever is stored in this object's `value` slot
# (the $value argument recorded by the constructor).
sub valueOf {
my ($self) = @_;
return $$self{value}; }

# Predicate: always returns 1 (true), identifying this definition as a CharDef.
sub isCharDef {
return 1; }

sub setValue {
my ($self, $value, $scope) = @_;
Error('unexpected', $self, undef, "Can't assign to chardef " . $self->getCSName);
Expand All @@ -50,11 +55,15 @@ sub invoke {
my $src = $$self{locator} && $$self{locator}->toString;
my $local = $src && $src !~ /\.(?:sty|ltxml|ltxmlc)/; # Dumps currently have undefined src!
if ($$self{mode} eq 'text') { # text; but note defered font/encoding till digestion!
# Decode the codepoint using current font & encoding
my ($glyph, $adjfont) = LaTeXML::Package::FontDecode($nvalue);
## Decode the codepoint using requested encoding ELSE current font & encoding
my ($glyph, $adjfont) = LaTeXML::Package::FontDecode($nvalue, $$self{encoding});
my %props = ();
if ($STATE->lookupValue('IN_MATH')) { # Add math properties if IN math (even for text \chardef)
my $charinfo = unicode_math_properties($glyph);
%props = %$charinfo if $charinfo; }
return Box($glyph, $adjfont, undef,
($local ? Tokens(T_CS('\char'), $value->revert, T_CS('\relax')) : $$self{cs})); }
else { # Else math mode, mathDecode!
($local ? Tokens(T_CS('\char'), $value->revert, T_CS('\relax')) : $$self{cs}), %props); }
else { # Else math mode, mathDecode!
my ($glyph, $f, $rev, %props) = LaTeXML::Package::decodeMathChar($nvalue);
if (!defined $props{name}) { # Synthesize name attribute from CS, if needed (Clarify purpose of name!)
my $n = $self->getCSName;
Expand Down
5 changes: 1 addition & 4 deletions lib/LaTeXML/Engine/LaTeX.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -2765,17 +2765,14 @@ DefMacro('\ProvideTextCommandDefault DefToken', '\ProvideTextCommand{#1}{?}');

DefPrimitive('\DeclareTextSymbol DefToken {}{Number}', sub {
my ($stomach, $cs, $encoding, $code) = @_;
$code = $code->valueOf;
my $css = ToString($cs);
$encoding = ToString(Expand($encoding));
if (isDefinable($cs)) { # If not already defined...
DefMacroI($cs, undef,
'\expandafter\ifx\csname\cf@encoding\string' . $css . '\endcsname\relax\csname?\string' . $css . '\endcsname'
. '\else\csname\cf@encoding\string' . $css . '\endcsname\fi'); }
my $ecs = T_CS('\\' . $encoding . $css);
DefPrimitiveI($ecs, undef, sub {
my ($glyph, $adjfont) = FontDecode($code, $encoding);
Box($glyph, $adjfont, undef, $cs); });
$STATE->installDefinition(LaTeXML::Core::Definition::CharDef->new($ecs, 'text', $code, $encoding));
return; });

DefPrimitive('\DeclareTextSymbolDefault DefToken {}', sub {
Expand Down
53 changes: 43 additions & 10 deletions lib/LaTeXML/Engine/TeX_Character.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,14 @@ sub applyAccent {
my $locator = $box->getLocator;
my $font = $box->getFont;
my $string = $box->toString;
$string =~ tr/\x{0131}\x{0237}/ij/;
$string =~ s/\s/ /g;
# In Unicode (but not always (La)TeX), overaccents generally mask the dots of i,j.
# So we replace dotless so NFC can normalize better.
if (my $entry = unicode_accent($standalonechar)) {
if (($$entry{role} || '') eq 'OVERACCENT') {
$string =~ tr/\x{0131}\x{0237}/ij/; } } # Replace dotless i,j with dotted version
if (($string =~ /[ij]/) && ($combiningchar eq "\x{0307}")) { # a dot on i,j Not needed
$combiningchar = ''; }
my @letters = split(//, $string);
return Box(($string =~ /^\s*$/
? $standalonechar
Expand All @@ -81,19 +87,46 @@ sub DefAccent {
protected => 1);
return; }

# This will fail if there really are "assignments" after the number! (See TeX Book)
# We're given a number pointing into the font; the FontMap presumably has the standalone char.
# If there's no letter to be accented, just use the standalone.
# \accent <number> <optional assignments><character>; See TeX Book p.287
# <assignments>: (<prefix>) simple assignment or macro assignment
# <character> : letter, other, \char, \chardef token, \noboundary
# Eventually, we're given a number pointing into the font;
# the FontMap presumably has the standalone char, to use if there is no base letter
# Otherwise, use the Util::Unicode module to find the appropriate combining character
DefPrimitive('\accent Number {}', sub {
my ($stomach, $num, $letter) = @_;
DefPrimitive('\accent Number', sub {
my ($stomach, $num) = @_;
my $gullet = $stomach->getGullet;
# Decode & Fetch the accent BEFORE processing any "assignments"
my $n = $num->valueOf;
my ($glyph, $adjfont) = FontDecode($n);
my @assignments = ();
## Check for (& accumulate) various kinds of <assignments>
my ($token, $cc, $defn);
while (($token = $gullet->readXNonSpace)
&& ($defn = $STATE->lookupDefinition($token))
&& ($defn->isPrefix
|| $defn->isFontDef
|| ($defn->isRegister && !$defn->isCharDef)
|| ($token->getString =~ /^\\(?:def|edef|gdef|xdef)$/))) {
push(@assignments, $stomach->invokeToken($token)); }
## Check for various kinds of <character>
my $letter = Tokens();
if (!$token) { }
elsif ((($cc = $token->getCatcode) == CC_LETTER) || ($cc == CC_OTHER)
|| ($defn && $defn->isCharDef)) {
$letter = $token; }
elsif ($token->equals(T_CS('\char'))) {
$letter = Tokens(Invocation($token, $gullet->readNumber)); }
elsif ($token->equals(T_CS('\noboundary'))) { } # Treat as empty
else {
$gullet->unread($token); } # No appropriate <character> ? Treat as empty
my $result;
if (my $entry = unicode_accent($glyph)) {
applyAccent($stomach, $letter, $$entry{combiner}, $$entry{standalone},
$result = applyAccent($stomach, $letter, $$entry{combiner}, $$entry{standalone},
Invocation(T_CS('\accent'), $num, $letter)); }
else { # Unknown accent ? Attempt to OVERLAY the accent on top of $letter
Digest(Tokens(T_CS('\lx@overlay'), T_BEGIN, $letter, T_END, T_BEGIN, T_OTHER($glyph), T_END)); } });
$result = Digest(Tokens(T_CS('\lx@overlay'), T_BEGIN, $letter, T_END, T_BEGIN, T_OTHER($glyph), T_END)); }
return (@assignments, $result); });

#======================================================================
# \chardef iq provides an alternate way to define a control sequence that returns a character.
Expand All @@ -118,12 +151,12 @@ DefPrimitive('\chardef Token SkipSpaces SkipMatch:=', sub {
sub ucToken {
my ($token) = @_;
my $code = $STATE->lookupUCcode($token->getString);
return ((defined $code) && ($code != 0) ? Token(chr($code), $token->getCatcode) : $token); }
return ((defined $code) && ($code != 0) ? Token(pack('U', $code), $token->getCatcode) : $token); }

sub lcToken {
my ($token) = @_;
my $code = $STATE->lookupLCcode($token->getString);
return ((defined $code) && ($code != 0) ? Token(chr($code), $token->getCatcode) : $token); }
return ((defined $code) && ($code != 0) ? Token(pack('U', $code), $token->getCatcode) : $token); }

# Note that these are NOT expandable, even though they "return" tokens!
DefPrimitive('\uppercase GeneralText', sub {
Expand Down
2 changes: 1 addition & 1 deletion lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ DeclareFontMap('OML',
# p q r s t u v w
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
# x y z dotless i dotless j weier-p arrow acc. inv.breve
'x', 'y', 'z', "\x{0131}", "j", "\x{2118}", "\x{2192}", UTF(0xA0) . "\x{0311}"]);
'x', 'y', 'z', "\x{0131}", "j", "\x{2118}", "\x{2192}", UTF(0xA0) . "\x{0361}"]);
DeclareFontMap('OMS',
[ #minus dot times ast divide diamond plus-minus minus-plus
"-", "\x{22C5}", UTF(0xD7), "\x{2217}", UTF(0xF7), "\x{22C4}", UTF(0xB1), "\x{2213}",
Expand Down
14 changes: 10 additions & 4 deletions lib/LaTeXML/Package.pm
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ use Unicode::Normalize;
use LaTeXML::Util::Unicode;
use Text::Balanced;
use Text::Unidecode;
use Encode;
use base qw(Exporter);
our @EXPORT = (qw(&DefAutoload &DefExpandable
&DefMacro &DefMacroI
Expand Down Expand Up @@ -2794,16 +2795,21 @@ sub FontDecodeString {
my ($string, $encoding, $implicit) = @_;
return if !defined $string;
my ($map, $font);
my $map_max = 256; # Up to 256 chars in FontMap
my $input_enc = $STATE->lookupValue('INPUT_ENCODING');
# BUT, if input was in utf8, we'll assume the upper half 128-255 is ALREADY unicode!
if ($input_enc && ($input_enc eq 'utf8')) {
$map_max = 128; }
if (!$encoding) {
$font = LookupValue('font');
$encoding = $font->getEncoding; }
if ($encoding && ($map = LoadFontMap($encoding))) { # OK got some map.
if ($encoding && ($map = LoadFontMap($encoding))) { # OK got some map.
my ($family, $fmap);
if ($font && ($family = $font->getFamily) && ($fmap = LookupValue($encoding . '_' . $family . '_fontmap'))) {
$map = $fmap; } } # Use the family specific map, if any.

$map = $fmap; } } # Use the family specific map, if any.
$map_max = 128 if $map && !defined($$map[128]); # ALSO for short font maps
return join('', grep { defined $_ }
map { ($implicit ? (($map && ($_ < 128)) ? $$map[$_] : pack('U', $_))
map { ($implicit ? (($map && ($_ < $map_max)) ? $$map[$_] : pack('U', $_))
: ($map ? $$map[$_] : undef)) }
map { ord($_) } split(//, $string)); }

Expand Down
2 changes: 2 additions & 0 deletions lib/LaTeXML/Package/french.ldf.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ DefMacro('\nombre{}', '\@ifpackageloaded{numprint}{\numprint{#1}}{\ltx@orig@nomb
# Register a callback that installs a few French-specific command names.
# NOTE(review): presumably run at \begin{document}, as the name suggests — confirm in LaTeXML::Package.
AtBeginDocument(sub {
# \degre becomes an alias of \textdegree.
Let('\degre', '\textdegree');
# \degres typesets \degre inside an \hbox that is 0.3em wide.
DefMacro('\degres', '\hbox to 0.3em{\degre}');
# \tild and \circonflexe become aliases of the text tilde and circumflex commands.
Let('\tild', '\textasciitilde');
Let('\circonflexe', '\textasciicircum');
});

1;
Expand Down
62 changes: 31 additions & 31 deletions lib/LaTeXML/Package/ly1.fontmap.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -16,38 +16,38 @@ use warnings;
use LaTeXML::Package;

DeclareFontMap('LY1', [
undef, undef, undef, undef, "\x{2044}", "\x{02D9}", "\x{02DD}", "\x{02DB}",
"\x{FB02}", undef, undef, undef, "\x{FB01}", undef, undef, undef,
"\x{0131}", undef, UTF(0x60), UTF(0xB4), "\x{02C7}", "\x{02D8}", UTF(0xAF), "\x{02DA}",
UTF(0xB8), UTF(0xDF), UTF(0xE6), "\x{0153}", UTF(0xF8), UTF(0xC6), "\x{0152}", UTF(0xD8),
" ", "!", "\"", "#", "\$", "%", "&", "\x{2019}",
"(", ")", "*", "+", ",", "-", ".", "/",
"0", "1", "2", "3", "4", "5", "6", "7",
"8", "9", ":", ";", "<", "=", ">", "?",
"\@", "A", "B", "C", "D", "E", "F", "G",
"H", "I", "J", "K", "L", "M", "N", "O",
"P", "Q", "R", "S", "T", "U", "V", "W",
"X", "Y", "Z", "[", "\\", "]", "^", UTF(0x5F),
"\x{2018}", "a", "b", "c", "d", "e", "f", "g",
"h", "i", "j", "k", "l", "m", "n", "o",
"p", "q", "r", "s", "t", "u", "v", "w",
"x", "y", "z", "{", "|", "}", "~", "\x{2010}",
"\x{0141}", "'", "\x{201A}", "\x{0192}", "\x{201E}", "\x{2026}", "\x{2020}", "\x{2021}",
"^", "\x{2030}", "\x{0160}", "\x{2039}", "\x{0152}", "\x{017D}", UTF(0x5E), "-",
undef, undef, undef, undef, "\x{2044}", "\x{02D9}", "\x{02DD}", "\x{02DB}",
"\x{FB02}", undef, undef, undef, "\x{FB01}", undef, undef, undef,
"\x{0131}", undef, UTF(0x60), UTF(0xB4), "\x{02C7}", "\x{02D8}", UTF(0xAF), "\x{02DA}",
UTF(0xB8), UTF(0xDF), UTF(0xE6), "\x{0153}", UTF(0xF8), UTF(0xC6), "\x{0152}", UTF(0xD8),
" ", "!", "\"", "#", "\$", "%", "&", "\x{2019}",
"(", ")", "*", "+", ",", "-", ".", "/",
"0", "1", "2", "3", "4", "5", "6", "7",
"8", "9", ":", ";", "<", "=", ">", "?",
"\@", "A", "B", "C", "D", "E", "F", "G",
"H", "I", "J", "K", "L", "M", "N", "O",
"P", "Q", "R", "S", "T", "U", "V", "W",
"X", "Y", "Z", "[", "\\", "]", "\x{02C6}", UTF(0x5F),
"\x{2018}", "a", "b", "c", "d", "e", "f", "g",
"h", "i", "j", "k", "l", "m", "n", "o",
"p", "q", "r", "s", "t", "u", "v", "w",
"x", "y", "z", "{", "|", "}", "\x{02DC}", "\x{2010}",
"\x{0141}", "'", "\x{201A}", "\x{0192}", "\x{201E}", "\x{2026}", "\x{2020}", "\x{2021}",
"^", "\x{2030}", "\x{0160}", "\x{2039}", "\x{0152}", "\x{017D}", UTF(0x5E), "-",
"\x{0142}", "\x{2018}", "\x{2019}", "\x{201C}", "\x{201D}", "\x{2022}", "\x{2013}", "\x{2014}",
"~", "\x{2122}", "\x{0161}", "\x{203A}", "\x{0153}", "\x{017E}", UTF(0x7E), "\x{0178}",
undef, UTF(0xA1), UTF(0xA2), UTF(0xA3), UTF(0xA4), UTF(0xA5), UTF(0xA6), UTF(0xA7),
UTF(0xA8), UTF(0xA9), UTF(0xAA), UTF(0xAB), UTF(0xAC), undef, UTF(0xAE), UTF(0xAF),
UTF(0xB0), UTF(0xB1), UTF(0xB2), UTF(0xB3), UTF(0xB4), UTF(0xB5), UTF(0xB6), UTF(0xB7),
UTF(0xB8), UTF(0xB9), UTF(0xBA), UTF(0xBB), UTF(0xBC), UTF(0xBD), UTF(0xBE), UTF(0xBF),
UTF(0xC0), UTF(0xC1), UTF(0xC2), UTF(0xC3), UTF(0xC4), UTF(0xC5), UTF(0xC6), UTF(0xC7),
UTF(0xC8), UTF(0xC9), UTF(0xCA), UTF(0xCB), UTF(0xCC), UTF(0xCD), UTF(0xCE), UTF(0xCF),
UTF(0xD0), UTF(0xD1), UTF(0xD2), UTF(0xD3), UTF(0xD4), UTF(0xD5), UTF(0xD6), UTF(0xD7),
UTF(0xD8), UTF(0xD9), UTF(0xDA), UTF(0xDB), UTF(0xDC), UTF(0xDD), UTF(0xDE), UTF(0xDF),
UTF(0xE0), UTF(0xE1), UTF(0xE2), UTF(0xE3), UTF(0xE4), UTF(0xE5), UTF(0xE6), UTF(0xE7),
UTF(0xE8), UTF(0xE9), UTF(0xEA), UTF(0xEB), UTF(0xEC), UTF(0xED), UTF(0xEE), UTF(0xEF),
UTF(0xF0), UTF(0xF1), UTF(0xF2), UTF(0xF3), UTF(0xF4), UTF(0xF5), UTF(0xF6), UTF(0xF7),
UTF(0xF8), UTF(0xF9), UTF(0xFA), UTF(0xFB), UTF(0xFC), UTF(0xFD), UTF(0xFE), UTF(0xFF),
"\x{02DC}", "\x{2122}", "\x{0161}", "\x{203A}", "\x{0153}", "\x{017E}", UTF(0x7E), "\x{0178}",
undef, UTF(0xA1), UTF(0xA2), UTF(0xA3), UTF(0xA4), UTF(0xA5), UTF(0xA6), UTF(0xA7),
UTF(0xA8), UTF(0xA9), UTF(0xAA), UTF(0xAB), UTF(0xAC), undef, UTF(0xAE), UTF(0xAF),
UTF(0xB0), UTF(0xB1), UTF(0xB2), UTF(0xB3), UTF(0xB4), UTF(0xB5), UTF(0xB6), UTF(0xB7),
UTF(0xB8), UTF(0xB9), UTF(0xBA), UTF(0xBB), UTF(0xBC), UTF(0xBD), UTF(0xBE), UTF(0xBF),
UTF(0xC0), UTF(0xC1), UTF(0xC2), UTF(0xC3), UTF(0xC4), UTF(0xC5), UTF(0xC6), UTF(0xC7),
UTF(0xC8), UTF(0xC9), UTF(0xCA), UTF(0xCB), UTF(0xCC), UTF(0xCD), UTF(0xCE), UTF(0xCF),
UTF(0xD0), UTF(0xD1), UTF(0xD2), UTF(0xD3), UTF(0xD4), UTF(0xD5), UTF(0xD6), UTF(0xD7),
UTF(0xD8), UTF(0xD9), UTF(0xDA), UTF(0xDB), UTF(0xDC), UTF(0xDD), UTF(0xDE), UTF(0xDF),
UTF(0xE0), UTF(0xE1), UTF(0xE2), UTF(0xE3), UTF(0xE4), UTF(0xE5), UTF(0xE6), UTF(0xE7),
UTF(0xE8), UTF(0xE9), UTF(0xEA), UTF(0xEB), UTF(0xEC), UTF(0xED), UTF(0xEE), UTF(0xEF),
UTF(0xF0), UTF(0xF1), UTF(0xF2), UTF(0xF3), UTF(0xF4), UTF(0xF5), UTF(0xF6), UTF(0xF7),
UTF(0xF8), UTF(0xF9), UTF(0xFA), UTF(0xFB), UTF(0xFC), UTF(0xFD), UTF(0xFE), UTF(0xFF),
]);

1;
Expand Down
1 change: 1 addition & 0 deletions lib/LaTeXML/Package/siunitx.sty.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ sub six_apply_mathligatures {
my $repl;
if (@tokens && ($repl = $six_mathligatures{ $t->getCSName }{ $tokens[0]->getCSName })) {
shift(@tokens); push(@r, $repl); }
elsif ($t->getCatcode == CC_COMMENT) { }
else {
push(@r, $t); } }
return @r; }
Expand Down
Loading
Loading