Skip to content

Commit

Permalink
ICU-22707 UTC-180? Give up on 16.0β rules, amend LB10 and LB21a instead.
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Jul 5, 2024
1 parent 1973f6a commit b47defb
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 84 deletions.
46 changes: 3 additions & 43 deletions icu4c/source/data/brkitr/rules/line.txt
Original file line number Diff line number Diff line change
Expand Up @@ -284,59 +284,19 @@ $LB18Breaks = [$LB8Breaks $SP];
$LB18NonBreaks $CM* $QU;
^$CM+ $QU;

# OP and GL are subtracted because of LB14 and LB12 (there is no break after them).
# BA is subtracted because of LB21a:
# We must not poke a hole into HL U+3000 × [\p{Pi} & QU] [\p{ea=F}\p{ea=W}\p{ea=H}],
# where U+3000 is lb=BA and ea=W.
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];

$OP $CM* $SP+ [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];

$QU $CM* .;
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];

$OP $CM* $SP+ [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];


^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];

$OP $CM* $SP+ [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];

$SP? $IS $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];

# LB 20
# <break> $CB
# $CB <break>
#
$LB20NonBreaks = [$LB18NonBreaks - $CB];

[$LB20NonBreaks - [$HL $IS $RI]] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
[$LB20NonBreaks - [$HL $IS $RI]] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
$CB $CM* $ZWJ [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
($RI $CM*)? $RI $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];

# LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen.
# Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151),
# and then to default UAX #14 behavior (UTC-179-C32).
Expand Down Expand Up @@ -369,7 +329,7 @@ $BB $CM* $LB20NonBreaks;
# LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
# HL (HY | [BA - [\p{ea=F}\p{ea=W}\p{ea=H}]]) x [^HL]
#
$HL $CM* ($HY | $BA) $CM* [^$CB $HL]?;
$HL $CM* ($HY | [ $BA - [\p{ea=F}\p{ea=W}\p{ea=H}] ] ) $CM* [^$CB $HL]?;

# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
Expand Down
24 changes: 6 additions & 18 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2892,23 +2892,9 @@ void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos
// LB 9 Treat X CM* as if it were x.
// No explicit action required.

// LB 10 Treat any remaining combining mark as AL, but preserve its East
// Asian Width.
// LB 10 Treat any remaining combining mark as lb=AL, ea=Na.
if (fCM->contains(*posChar)) {
switch (u_getIntPropertyValue(*posChar, UCHAR_EAST_ASIAN_WIDTH)) {
case U_EA_WIDE:
*posChar = u'';
break;
case U_EA_NEUTRAL:
*posChar = u'';
break;
case U_EA_AMBIGUOUS:
*posChar = u'';
break;
default:
puts("Unexpected ea value for lb=CM");
std::terminate();
}
*posChar = u'A';
}

// Push the updated nextPos and nextChar back to our caller.
Expand Down Expand Up @@ -3281,7 +3267,8 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
}
breakObliviousPrevPosX2 = beforeCM;
}
if (!feaFWH->contains(fText->char32At(breakObliviousPrevPosX2))) {
if (!feaFWH->contains(fText->char32At(breakObliviousPrevPosX2)) ||
fCM->contains(fText->char32At(breakObliviousPrevPosX2))) {
setAppliedRule(pos, "LB 19a [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] QU ×");
continue;
}
Expand Down Expand Up @@ -3332,7 +3319,8 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
continue;
}

if (fHL->contains(prevCharX2) && (fHY->contains(prevChar) || fBA->contains(prevChar)) &&
if (fHL->contains(prevCharX2) &&
(fHY->contains(prevChar) || (fBA->contains(prevChar) && !feaFWH->contains(prevChar))) &&
!fHL->contains(thisChar)) {
setAppliedRule(pos, "LB 21a HL (HY | BA) x [^HL]");
continue;
Expand Down
12 changes: 2 additions & 10 deletions icu4c/source/test/testdata/break_rules/line.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,9 @@ CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];

eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}];
eaFWHminusOPGL = [ eaFWH - [OP GL] ];
eaFWHandCM = [ CMS & eaFWH ];
eaFWHminusCM = [ eaFWH - CMS ];
eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ];

# An annoying special case, \p{lb=BA} & [\p{ea=F}\p{ea=W}\p{ea=H}].
ideographicSpace = [\u3000];
BAminuseaFWH = [BA - eaFWH ];

PiQU = [\p{Pi}&QU];
PfQU = [\p{Pf}&QU];
Expand Down Expand Up @@ -129,12 +126,8 @@ LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .;
# points.
LB19a.1: eaFWHminusOPGL ÷ PiQU CM* eaFWHminusCM;
LB19a.2: eaFWHminusOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.3: ^eaFWHandCM ÷ PiQU CM* eaFWHminusCM;
LB19a.4: ^eaFWHandCM CM* CMS ÷ PiQU CM* eaFWHminusCM;
LB19a.5: eaFWH CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.6: eaFWH CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;
LB19a.7: ^eaFWHandCM CM* PfQU ÷ eaFWHBreakableAtLB19;
LB19a.8: ^eaFWHandCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19;

# Moved up, before LB7, because they can match a longer sequence that would also match LB7.
# For example, the sequence "OP CM SP AL" matches LB14
Expand Down Expand Up @@ -216,8 +209,7 @@ LB20a.1: ^(HY | HH) CM* AL;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
# Chains over two characters with the LB19a break rule.
LB21a.1: HL CM* ideographicSpace CM* PfQU (CM* CMS)? ÷ eaFWHBreakableAtLB19;
LB21a: HL CM* (HY | BA) CM* [^CM CB HL];
LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL];

LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
Expand Down
13 changes: 0 additions & 13 deletions icu4c/source/test/testdata/rbbitst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2223,16 +2223,3 @@ Bangkok)•</data>
<data>•« Complex »« chaining » •</data>
<data>•« .618 »•</data> # Interaction with the ICU tailoring to break before such numbers.

# Non-breaking lb=SP (from LB14 and LB15a) followed by a lb=CM-as-AL that is
# ea=W, in a position that would match initial context for LB19a if it were not
# ea=W.
# See https://github.com/unicode-org/icu/pull/3028#issuecomment-2200259320.
<data>•︷ \U00016FF1•⸠ᅛᆅ•</data>
<data>•︷ « \U00016FF1•⸠ᅛᆅ•</data>
<data>•A »« \U00016FF1•⸠ᅛᆅ•</data>
<data>•︷ \U00016FF1\u302B•⸠ᅛᆅ•</data>
<data>•︷ « \U00016FF1\u302B•⸠ᅛᆅ•</data>
<data>•A »« \U00016FF1\u302B•⸠ᅛᆅ•</data>
<data>•❲ \u3035⸍•굼•</data>
<data>•❲ « \u3035⸍•굼•</data>
<data>•A »« \u3035⸍•굼•</data>

0 comments on commit b47defb

Please sign in to comment.