Skip to content

Commit

Permalink
Unicode math properties (#2425)
Browse files Browse the repository at this point in the history
* New unicode_math_properties(char) gets the math properties (role, meaning, ...) associated with a unicode char

* New unicode_math_properties(char) gets the math properties (role, meaning, ...) associated with a unicode char

* Revise API for decodeMathChar to return the math properties associated with the glyph, after decoding and font adjustment

* Revise CharDef->new API to take only the code, deferring decoding until digestion; use new decodeMathChar API

* Updates for improved decodeMathChar and CharDef->new API

* Update fontmap for some bigops

* Have \mathop and friends intelligently merge the requested TeX math class with any already assigned grammatical role

* Update tests for more 'semantic' when low-level glyphs are accessed

* Refinements & clarifications suggested by D.Ginev

* Add guard
  • Loading branch information
brucemiller authored Oct 10, 2024
1 parent aba805b commit 9f07a49
Show file tree
Hide file tree
Showing 14 changed files with 430 additions and 129 deletions.
9 changes: 9 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
0.8.9 2024-Winter
- This release addresses a large variety of usability, fidelity, robustness,
portability and output-quality issues.
-
Incompatible changes: The APIs of several low-level internal functions have changed.
In the unlikely event you have used these in your own bindings, they will need to be
updated.
- LaTeXML::Core::Definition::CharDef->new($cs,$mode,$value)
- decodeMathChar($mathcode,$reversion) returns ($glyph,$font,$reversion,%mathproperties)
0.8.8 2024-02-29
- This release addresses a large variety of usability, fidelity, robustness,
portability and output-quality issues.
Expand Down
36 changes: 20 additions & 16 deletions lib/LaTeXML/Core/Definition/CharDef.pm
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ use base qw(LaTeXML::Core::Definition::Register);
# A CharDef is a specialized register;
# You can't assign it; when you invoke the control sequence, it returns
# the result of evaluating the character (more like a regular primitive).
# When $mode is 'math', interprets $value as a (3-part) mathcode, otherwise just index into current font.
# When $mathglyph is provided, it is the unicode corresponding to the \mathchar of $value
sub new {
my ($class, $cs, $value, $mathglyph, %traits) = @_;
my ($class, $cs, $mode, $value) = @_;
return bless { cs => $cs, parameters => undef,
value => $value, mathglyph => $mathglyph,
registerType => 'Number', readonly => 1,
locator => $STATE->getStomach->getGullet->getMouth->getLocator,
%traits }, $class; }
mode => $mode, value => $value,
registerType => 'Number', readonly => 1,
locator => $STATE->getStomach->getGullet->getMouth->getLocator }, $class; }

sub valueOf {
my ($self) = @_;
Expand All @@ -43,22 +43,26 @@ sub setValue {

sub invoke {
my ($self, $stomach) = @_;
my $value = $$self{value};
my $mathglyph = $$self{mathglyph};
my $value = $$self{value};
my $nvalue = $value->valueOf;
# A dilemma: If the \chardef were in a style file, you'd prefer to revert to the $cs
# but if defined in the document source, better to use \char ###\relax, so it still "works"
my $src = $$self{locator} && $$self{locator}->toString;
my $local = $src && $src !~ /\.(?:sty|ltxml|ltxmlc)/; # Dumps currently have undefined src!
if (defined $mathglyph) { # Must be a math char
return Box($mathglyph, undef, undef,
($local ? Tokens(T_CS('\mathchar'), $value->revert, T_CS('\relax')) : $$self{cs}),
role => $$self{role}); }
else { # else text; but note defered font/encoding till digestion!
# Decode the codepoint using current font & encoding
my ($glyph, $adjfont) = LaTeXML::Package::FontDecode($value->valueOf);
if ($$self{mode} eq 'text') { # text; but note defered font/encoding till digestion!
# Decode the codepoint using current font & encoding
my ($glyph, $adjfont) = LaTeXML::Package::FontDecode($nvalue);
return Box($glyph, $adjfont, undef,
($local ? Tokens(T_CS('\char'), $value->revert, T_CS('\relax')) : $$self{cs}),
); } }
($local ? Tokens(T_CS('\char'), $value->revert, T_CS('\relax')) : $$self{cs})); }
else { # Else math mode, mathDecode!
my ($glyph, $f, $rev, %props) = LaTeXML::Package::decodeMathChar($nvalue);
if (!defined $props{name}) { # Synthesize name attribute from CS, if needed (Clarify purpose of name!)
my $n = $self->getCSName;
$n =~ s/^\\//;
$props{name} = $n if !$props{meaning} || ($n ne $props{meaning}); }
return Box($glyph, undef, undef,
($local ? Tokens(T_CS('\mathchar'), $value->revert, T_CS('\relax')) : $$self{cs}),
%props); } }

sub equals {
my ($self, $other) = @_;
Expand Down
4 changes: 2 additions & 2 deletions lib/LaTeXML/Core/Stomach.pm
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,8 @@ sub invokeToken_simple {
$STATE->clearPrefixes; # prefixes shouldn't apply here.
if (my $mathcode = $STATE->lookupValue('IN_MATH')
&& $STATE->lookupMathcode($meaning->toString)) {
my ($role, $glyph, $f, $reversion) = LaTeXML::Package::decodeMathChar($mathcode, $meaning);
return Box($glyph, $f, undef, $reversion, role => $role); }
my ($glyph, $f, $reversion, %props) = LaTeXML::Package::decodeMathChar($mathcode, $meaning);
return Box($glyph, $f, undef, $reversion, %props); }
else {
return Box(LaTeXML::Package::FontDecodeString($meaning->toString, undef, 1),
undef, undef, $meaning); } } }
Expand Down
4 changes: 2 additions & 2 deletions lib/LaTeXML/Engine/LaTeX.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -6293,8 +6293,8 @@ Let('\mathalpha', '\relax');
DefPrimitive('\mathhexbox {}{}{}', sub {
my ($stomach, $a, $b, $c) = @_;
my $n = ToString($a) * 256 + ToString($b) * 16 + ToString($c);
my ($role, $glyph) = decodeMathChar($n);
return Box($glyph, LookupValue('font')->specialize($glyph)); });
my ($glyph, $f, $rev, %props) = decodeMathChar($n);
return Box($glyph, $f, undef, undef, %props); });

DefMacroI('\nocorrlist', undef, ',.');
Let('\nocorr', '\relax');
Expand Down
2 changes: 1 addition & 1 deletion lib/LaTeXML/Engine/TeX_Character.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ DefPrimitive('\chardef Token SkipSpaces SkipMatch:=', sub {
my ($stomach, $newcs) = @_;
$STATE->assignMeaning($newcs, $STATE->lookupMeaning(T_CS('\relax'))); # Let w/o AfterAssignment
my $value = $stomach->getGullet->readNumber();
$STATE->installDefinition(LaTeXML::Core::Definition::CharDef->new($newcs, $value));
$STATE->installDefinition(LaTeXML::Core::Definition::CharDef->new($newcs, 'text', $value));
AfterAssignment();
return; });

Expand Down
6 changes: 3 additions & 3 deletions lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,9 @@ DeclareFontMap('OMX',
"\x{23A7}", "\x{23AB}", "\x{23A9}", "\x{23AD}", "\x{23A8}", "\x{23AC}", "\x{23AA}", "\x{23D0}",
# l.bot.paren r.bot.paren l.paren.ext r.paren.ext
"\x{239D}", "\x{23A0}", "\x{239C}", "\x{239F}", "\x{27E8}", "\x{27E9}", "\x{2294}", "\x{2294}",
"\x{222E}", "\x{222E}", "\x{2299}", "\x{2299}", "\x{2295}", "\x{2295}", "\x{2297}", "\x{2297}",
"\x{2211}", "\x{220F}", "\x{222B}", "\x{22C3}", "\x{22C2}", "\x{228C}", "\x{2227}", "\x{2228}",
"\x{2211}", "\x{220F}", "\x{222B}", "\x{22C3}", "\x{22C2}", "\x{228C}", "\x{2227}", "\x{2228}",
"\x{222E}", "\x{222E}", "\x{2A00}", "\x{2A00}", "\x{2A01}", "\x{2A01}", "\x{2A02}", "\x{2A02}",
"\x{2211}", "\x{220F}", "\x{222B}", "\x{22C3}", "\x{22C2}", "\x{2A04}", "\x{22C0}", "\x{22C1}",
"\x{2211}", "\x{220F}", "\x{222B}", "\x{22C3}", "\x{22C2}", "\x{2A04}", "\x{22C0}", "\x{22C1}",
"\x{2210}", "\x{2210}", UTF(0x5E), UTF(0x5E), UTF(0x5E), UTF(0x7E), UTF(0x7E), UTF(0x7E),
"[", "]", "\x{230A}", "\x{230B}", "\x{2308}", "\x{2309}", "{", "}",
# [missing rad frags] double arrow ext.
Expand Down
96 changes: 61 additions & 35 deletions lib/LaTeXML/Engine/TeX_Math.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -569,48 +569,49 @@ DefRegister('\everydisplay', Tokens());

DefPrimitive('\mathchar Number', sub {
my ($stomach, $code) = @_;
my ($role, $glyph, $font, $reversion) = decodeMathChar($code,
my ($glyph, $font, $reversion, %props) = decodeMathChar($code,
Tokens(T_CS('\mathchar'), $_[1]->revert, T_CS('\relax')));
return Box($glyph, $font, undef, $reversion, role => $role); });
return Box($glyph, $font, undef, $reversion, %props); });

DefConstructor('\delimiter Number',
"?#glyph(?#isMath(<ltx:XMTok role='#role'>#glyph</ltx:XMTok>)(#glyph))",
"?#glyph(?#isMath(<ltx:XMTok role='#role' name='#name' stretchy='#stretchy'>#glyph</ltx:XMTok>)(#glyph))",
sizer => '#glyph',
afterDigest => sub {
my ($stomach, $whatsit) = @_;
my $n = $whatsit->getArg(1)->valueOf;
$n = $n >> 12; # Ignore 3 rightmost digits and treat as \mathchar
my ($role, $glyph) = decodeMathChar($n);
$whatsit->setProperty(glyph => $glyph) if $glyph;
$whatsit->setProperty(role => $role) if defined $role;
$whatsit->setProperty(font => LookupValue('font')->specialize($glyph)) if $glyph;
my ($glyph, $f, $rev, %props) = decodeMathChar($n);
$whatsit->setProperty(glyph => $glyph) if $glyph;
$whatsit->setProperties(%props) if %props;
$whatsit->setProperty(font => $f) if $glyph;
return; });

# Almost like a register, but different...
DefPrimitive('\mathchardef Token SkipSpaces SkipMatch:=', sub {
my ($stomach, $newcs) = @_;
$STATE->assignMeaning($newcs, $STATE->lookupMeaning(T_CS('\relax'))); # Let w/o AfterAssignment
my $value = $stomach->getGullet->readNumber();
my ($role, $glyph) = decodeMathChar($value->valueOf);
$STATE->installDefinition(LaTeXML::Core::Definition::CharDef->new($newcs, $value,
$glyph, role => $role));
$STATE->installDefinition(LaTeXML::Core::Definition::CharDef->new($newcs, 'math', $value));
AfterAssignment();
return; });

DefConstructor('\mathaccent Number Digested',
"<ltx:XMApp><ltx:XMTok role='#accrole'>#glyph</ltx:XMTok><ltx:XMArg>#2</ltx:XMArg></ltx:XMApp>",
"<ltx:XMApp><ltx:XMTok role='#accrole' name='#name' stretchy='#stretchy'>#glyph</ltx:XMTok><ltx:XMArg>#2</ltx:XMArg></ltx:XMApp>",
sizer => '#2', # Close enough?
afterDigest => sub {
my ($stomach, $whatsit) = @_;
my $n = $whatsit->getArg(1)->valueOf;
my ($role, $glyph) = decodeMathChar($n);
my $accrole = 'OVERACCENT';
if (my $entry = unicode_accent($glyph)) {
$glyph = $$entry{unwrapped};
$accrole = $$entry{role}; }
$whatsit->setProperty(glyph => $glyph) if $glyph;
$whatsit->setProperty(font => LookupValue('font')->specialize($glyph)) if $glyph;
$whatsit->setProperty(accrole => $accrole) if $glyph;
my ($glyph, $f, $rev, %props) = decodeMathChar($n);
my $name;
my $acc_props = unicode_accent($glyph) || {};
$glyph = $$acc_props{unwrapped} if $$acc_props{unwrapped};
my $accrole = $$acc_props{role} || 'OVERACCENT';
$name = $$acc_props{name};
$whatsit->setProperty(glyph => $glyph) if $glyph;
$whatsit->setProperty(font => LookupValue('font')->specialize($glyph)) if $glyph;
$whatsit->setProperty(accrole => $accrole) if $glyph;
$whatsit->setProperty(name => $name) if $name;
$whatsit->setProperty(stretchy => $$acc_props{stretchy} || 'false'); # stretchy ?
return; });

# # Only used for active math characters, so far
Expand Down Expand Up @@ -649,23 +650,48 @@ DefRegister('\fam' => Number(-1),
# \mathpunct c assigns class 6 (punctuation) to following character or subformula.
# \mathrel c assigns class 3 (relation) to following character or subformula.

# Add an XMWrap, adjusting the math role unless it's already a sub-class of the requested coarse TeX math classes
# Is XMWrap the right thing to wrap with (instead of XMArg)?
# We can't really assume that the stuff inside is sensible math.
# NOTE that \mathord and \mathbin aren't really right here.
# We need a finer granularity than TeX does: an ORD could be several things,
# a BIN could be a MULOP or ADDOP.
# AND, rarely, they're empty.... Is it wrong to drop them?
DefConstructor('\mathord Digested', "?#1(<ltx:XMWrap role='ID' >#1</ltx:XMWrap>)()", bounded => 1);
# Parameter Should be Digested, but that throws off doScriptPos's position depth !?!?!
DefConstructor('\mathop {}', "?#1(<ltx:XMWrap role='BIGOP' scriptpos='#scriptpos'>#1</ltx:XMWrap>)()",
bounded => 1, properties => { scriptpos => \&doScriptpos });

DefConstructor('\mathbin Digested', "?#1(<ltx:XMWrap role='BINOP'>#1</ltx:XMWrap>)()", bounded => 1);
DefConstructor('\mathrel Digested', "?#1(<ltx:XMWrap role='RELOP'>#1</ltx:XMWrap>)()", bounded => 1);
DefConstructor('\mathopen Digested', "?#1(<ltx:XMWrap role='OPEN' >#1</ltx:XMWrap>)()", bounded => 1);
DefConstructor('\mathclose Digested', "?#1(<ltx:XMWrap role='CLOSE'>#1</ltx:XMWrap>)()", bounded => 1);
DefConstructor('\mathpunct Digested', "?#1(<ltx:XMWrap role='PUNCT'>#1</ltx:XMWrap>)()", bounded => 1);
DefConstructor('\mathinner Digested', "?#1(<ltx:XMWrap role='ATOM'>#1</ltx:XMWrap>)()", bounded => 1);
# Table consulted by adjustMathRole: for each coarse TeX math class role (the outer keys),
# the inner hash lists the finer-grained roles that are accepted as already being a
# sub-class of it. When the single wrapped node already carries one of the listed roles,
# adjustMathRole does not put the coarse role on the wrapper.
# An empty inner hash means no finer role is listed for that class.
our %mathclass_subclass = (
BIGOP => { ARROW => 1, SUMOP => 1, INTOP => 1, DIFFOP => 1 },
BINOP => { ADDOP => 1, MULOP => 1 },
RELOP => {},
OPEN => {},
CLOSE => {},
PUNCT => { PERIOD => 1 },
ID => { NUMBER => 1 },
ATOM => {}, # really any role (adjustMathRole special-cases ATOM rather than listing roles here)
);

# adjustMathRole($role, $document, $node, %props)
# Constructor helper shared by \mathord, \mathop, \mathbin, etc.
#   $role     : the coarse TeX math class role requested (a key of %mathclass_subclass)
#   $document : the document being constructed
#   $node     : the content to be wrapped; if false, nothing at all is emitted
#   %props    : properties; only scriptpos and mathstyle are consulted here
# Absorbs $node inside a new ltx:XMWrap.  The requested $role is put on the wrapper
# UNLESS the wrapper ends up holding exactly one node (ltx:XMHint's not counted)
# which already has a role that is acceptable for $role: namely any role at all
# when $role is ATOM, otherwise one listed in $mathclass_subclass{$role}.
# Always returns nothing.
sub adjustMathRole {
  my ($role, $document, $node, %props) = @_;
  return unless $node;    # Nothing? do nothing!
  my $wrapper = $document->openElement('ltx:XMWrap');
  $document->absorb($node);
  $document->closeElement('ltx:XMWrap');
  # Hints should not affect whether we consider the content to be a "single node"
  # (presumably element_nodes gives the wrapper's child elements -- TODO confirm)
  my @nodes = grep { $document->getNodeQName($_) ne 'ltx:XMHint'; } element_nodes($wrapper);
  # Only a lone node can supply a role; getAttribute is not called otherwise.
  my $gotrole    = (scalar(@nodes) == 1) && $nodes[0]->getAttribute('role');
  my $acceptable = $gotrole
    && (($role eq 'ATOM') || $mathclass_subclass{$role}{$gotrole});
  # Keep the (finer) existing role if acceptable; Else, assign the requested role
  $wrapper->setAttribute(role => $role) unless $acceptable;
  $wrapper->setAttribute(scriptpos => $props{scriptpos}) if defined $props{scriptpos};
  $wrapper->setAttribute(mathstyle => $props{mathstyle}) if defined $props{mathstyle};
  return; }
# The TeX math class commands: each digests its argument and hands construction
# over to adjustMathRole along with the coarse role for that class
# (ID, BIGOP, BINOP, RELOP, OPEN, CLOSE, PUNCT or ATOM).
# The constructor's own arguments (@_) are passed straight through after the role;
# presumably these are the document, the digested argument and the properties,
# matching adjustMathRole's ($role, $document, $node, %props) -- TODO confirm.
DefConstructor('\mathord Digested', sub { adjustMathRole('ID', @_); });
# \mathop additionally declares a scriptpos property, computed by doScriptpos
DefConstructor('\mathop Digested', sub { adjustMathRole('BIGOP', @_); },
properties => { scriptpos => \&doScriptpos });
DefConstructor('\mathbin Digested', sub { adjustMathRole('BINOP', @_); });
DefConstructor('\mathrel Digested', sub { adjustMathRole('RELOP', @_); });
DefConstructor('\mathopen Digested', sub { adjustMathRole('OPEN', @_); });
DefConstructor('\mathclose Digested', sub { adjustMathRole('CLOSE', @_); });
DefConstructor('\mathpunct Digested', sub { adjustMathRole('PUNCT', @_); });
DefConstructor('\mathinner Digested', sub { adjustMathRole('ATOM', @_); });

#======================================================================
# Delimiters
Expand Down
47 changes: 36 additions & 11 deletions lib/LaTeXML/Package.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1789,7 +1789,7 @@ sub defmath_cons {
? $cs : $presentation->unlist); }; }
$STATE->installDefinition(LaTeXML::Core::Definition::Constructor->new($defcs, $paramlist,
($nargs == 0
# If trivial presentation, allow it in Text
# If trivial presentation, allow it in Text
? ($presentation !~ /(?:\(|\)|\\)/
? "?#isMath(<ltx:XMTok role='#role' scriptpos='#scriptpos' stretchy='#stretchy'"
. " font='#font' $cons_attr$end_tok)"
Expand Down Expand Up @@ -2466,7 +2466,7 @@ sub AddToMacro {
else {
local $LaTeXML::Core::State::UNLOCKED = 1; # ALLOW redefinitions that only adding to the macro
DefMacroI($cs, undef, Tokens(map { $_->unlist }
map { (blessed $_ ? $_ : TokenizeInternal($_)) } ($defn->getExpansion, @tokens)),
map { (blessed $_ ? $_ : TokenizeInternal($_)) } ($defn->getExpansion, @tokens)),
nopackParameters => 1, scope => 'global', locked => $$defn{locked}); }
return; }

Expand Down Expand Up @@ -2833,7 +2833,8 @@ sub decodeMathChar {
my $curfam = $STATE->lookupValue('fontfamily') // -1;
my $initfont = $STATE->lookupValue('initial_math_font') || $curfont;
my ($fontdef, $fontinfo);
my ($oclass, $ofam) = ($class, $fam);
my ($oclass, $ofam) = ($class, $fam);
my $downsize = 0;
# Special case: class 7 means use the \fam as the family code, if 0<=f<=15;
if ($class == 7) {
$fam = $curfam if (defined $curfam) && (0 <= $curfam) && ($curfam <= 15); }
Expand All @@ -2846,24 +2847,48 @@ sub decodeMathChar {
$fontdef = T_CS('\font'); # Assume specified by \mathrm or something similar!
$fontinfo = $STATE->lookupValue('font')->asFontinfo; }
else {
$fontdef = LookupValue('textfont_' . $fam);
my $style = $curfont->getMathstyle;
$style = 'text' unless $style && ($style =~ /^(:?scriptscript|script|text)$/);
my $basefontdef = LookupValue('textfont_0');
my $basefontdefn = $STATE->lookupDefinition($basefontdef);
my $basefontinfo = $basefontdefn && $basefontdefn->isFontDef;
if ($style eq 'text') { # Lookup the requested font according to script level, but with adjusted fallbacks
$fontdef = LookupValue('textfont_' . $fam); }
elsif ($style eq 'script') {
if ($fontdef = LookupValue('scriptfont_' . $fam)) { }
elsif ($fontdef = LookupValue('textfont_' . $fam)) { $downsize = 1; } }
elsif ($style eq 'scriptscript') {
if ($fontdef = LookupValue('scriptscriptfont_' . $fam)) { }
elsif ($fontdef = LookupValue('scriptfont_' . $fam)) { $downsize = 1; }
elsif ($fontdef = LookupValue('textfont_' . $fam)) { $downsize = 2; } }
my $defn = $STATE->lookupDefinition($fontdef);
$fontinfo = $defn && $defn->isFontDef; }
my $font = $curfont->merge(%$fontinfo);
$fontinfo = $defn && $defn->isFontDef;
if ($fontinfo && ($$basefontinfo{size} != $curfont->getSize)) { # If we've gotten an explicit font SIZE change; Adjust!
$fontinfo = {%$fontinfo}; $$fontinfo{size} = $curfont->getSize; } }
my $font = $curfont->merge(%$fontinfo);
if ($downsize > 0) { $font = $curfont->merge(scripted => 1); }
if ($downsize > 1) { $font = $curfont->merge(scripted => 1); }

my $encoding = $fontinfo && $$fontinfo{encoding} || '';
my ($glyph, $f) = ($encoding ? FontDecode($n, $encoding, $font) : ($char, $font));
# If no specific class, Lookup properties from a DefMath? [Eventually: Unicode data!]
my $charinfo = (defined $glyph ? LookupValue('math_token_attributes_' . $glyph) : ());
my $charinfo = unicode_math_properties($glyph);
my $role = ($charinfo && $$charinfo{role}) || $mathclassrole[$class];
my $size = $curfont->getSize;
$f = $f->merge(size => $size);
my %props = ();
%props = %$charinfo if $charinfo;
$props{role} = $role if $role && !$props{role};
my $in_display = $curfont->getMathstyle eq 'display';
if ($props{need_scriptpos}) {
$props{scriptpos} = ($in_display ? 'mid' : 'post'); }
if ($props{need_mathstyle}) {
$props{mathstyle} = ($in_display ? 'display' : 'text'); }
my %d = $f->relativeTo($curfont);
if ($reversion) {
%d = () if LookupValue('LaTeX.pool.ltxml_loaded');
my $rev = ($maybe_rev && %d ? Tokens(T_BEGIN, $fontdef, $reversion, T_END) : $reversion);
return ($role, $glyph, $f, $rev); }
return ($glyph, $f, $rev, %props); }
else {
return ($role, $glyph, $f); } }
return ($glyph, $f, undef, %props); } }

#======================================================================
# Color
Expand Down
Loading

0 comments on commit 9f07a49

Please sign in to comment.