From 209e550cf19d369bbacedd9bbb962b6ecfe2b20f Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Fri, 5 Jul 2024 00:55:05 +0200 Subject: [PATCH] ICU-22707 give up --- icu4c/source/data/brkitr/rules/line.txt | 46 ++----------------- icu4c/source/test/intltest/rbbitst.cpp | 24 +++------- .../source/test/testdata/break_rules/line.txt | 12 +---- icu4c/source/test/testdata/rbbitst.txt | 13 ------ 4 files changed, 11 insertions(+), 84 deletions(-) diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index 64f489352be1..1720d4d5da27 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -284,47 +284,12 @@ $LB18Breaks = [$LB8Breaks $SP]; $LB18NonBreaks $CM* $QU; ^$CM+ $QU; -# OP and GL are subtracted because of LB14 and LB12 (there is no break after them). -# BA is subtracted because of LB21a: -# We must not poke a hole into HL U+3000 × [\p{Pi} & QU] [\p{ea=F}\p{ea=W}\p{ea=H}], -# where U+3000 is lb=BA and ea=W. -[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; - -$OP $CM* $SP+ [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; +[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; $QU $CM* .; [$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; [$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; -^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; -^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; - -$OP $CM* $SP+ [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; -($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; -^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; -$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; -$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; -^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] ($CM* $CMX)? / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]]; - - -^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; - -$OP $CM* $SP+ [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -($OP $CM* $SP+ | [$OP [$QU-\p{Pi}] $GL] $CM*) ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -^([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -$LB8NonBreaks [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -$CAN_CM $CM* [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -^$CM+ [\p{Pf} & $QU] $CM* ([\p{Pi} & $QU] $CM* $SP*)+ $SP [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; - -$SP? $IS $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; # LB 20 # $CB @@ -332,11 +297,6 @@ $SP? $IS $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $ # $LB20NonBreaks = [$LB18NonBreaks - $CB]; -[$LB20NonBreaks - [$HL $IS $RI]] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -[$LB20NonBreaks - [$HL $IS $RI]] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -$CB $CM* $ZWJ [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; -($RI $CM*)? $RI $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] ($CM* $CMX)? / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM]; - # LB 20a Don't break between Hyphens and Letters when there is a break preceding the hyphen. # Originally added as a Finnish tailoring, promoted to default ICU behavior (ICU-8151), # and then to default UAX #14 behaviour (UTC-179-C32). @@ -369,7 +329,7 @@ $BB $CM* $LB20NonBreaks; # LB 21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew # HL (HY | BA) x [^HL] # -$HL $CM* ($HY | $BA) $CM* [^$CB $HL]?; +$HL $CM* ($HY | [ $BA - [\p{ea=F}\p{ea=W}\p{ea=H}] ] ) $CM* [^$CB $HL]?; # LB 21b (forward) Don't break between SY and HL # (break between HL and SY already disallowed by LB 13 above) diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 205700a27be3..e361c8648148 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -2876,23 +2876,9 @@ void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos // LB 9 Treat X CM* as if it were x. // No explicit action required. - // LB 10 Treat any remaining combining mark as AL, but preserve its East - // Asian Width. + // LB 10 Treat any remaining combining mark as lb=AL, ea=Na. if (fCM->contains(*posChar)) { - switch (u_getIntPropertyValue(*posChar, UCHAR_EAST_ASIAN_WIDTH)) { - case U_EA_WIDE: - *posChar = u'♈'; - break; - case U_EA_NEUTRAL: - *posChar = u'ᴬ'; - break; - case U_EA_AMBIGUOUS: - *posChar = u'Ⓐ'; - break; - default: - puts("Unexpected ea value for lb=CM"); - std::terminate(); - } + *posChar = u'A'; } // Push the updated nextPos and nextChar back to our caller. @@ -3265,7 +3251,8 @@ int32_t RBBILineMonkey::next(int32_t startPos) { } breakObliviousPrevPosX2 = beforeCM; } - if (!feaFWH->contains(fText->char32At(breakObliviousPrevPosX2))) { + if (!feaFWH->contains(fText->char32At(breakObliviousPrevPosX2)) || + fCM->contains(fText->char32At(breakObliviousPrevPosX2))) { setAppliedRule(pos, "LB 19a [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] QU ×"); continue; } @@ -3316,7 +3303,8 @@ int32_t RBBILineMonkey::next(int32_t startPos) { continue; } - if (fHL->contains(prevCharX2) && (fHY->contains(prevChar) || fBA->contains(prevChar)) && + if (fHL->contains(prevCharX2) && + (fHY->contains(prevChar) || (fBA->contains(prevChar) && !feaFWH->contains(prevChar))) && !fHL->contains(thisChar)) { setAppliedRule(pos, "LB 21a HL (HY | BA) x [^HL]"); continue; diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt index 287ef0e75230..63b647520573 100644 --- a/icu4c/source/test/testdata/break_rules/line.txt +++ b/icu4c/source/test/testdata/break_rules/line.txt @@ -78,12 +78,9 @@ CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; eaFWH = [\p{ea=F}\p{ea=W}\p{ea=H}]; eaFWHminusOPGL = [ eaFWH - [OP GL] ]; -eaFWHandCM = [ CMS & eaFWH ]; eaFWHminusCM = [ eaFWH - CMS ]; eaFWHBreakableAtLB19 = [ eaFWH - [NS BA EX CL IN IS GL CMS] ]; - -# An annoying special case, \p{lb=BA} & [\p{ea=F}\p{ea=W}\p{ea=H}]. -ideographicSpace = [\u3000]; +BAminuseaFWH = [BA - eaFWH ]; PiQU = [\p{Pi}&QU]; PfQU = [\p{Pf}&QU]; @@ -129,12 +126,8 @@ LB15a.1: ( OP CM* SP* | QU CM* | GL CM* ) (PiQU CM* SP*)+ .; # points. LB19a.1: eaFWHminusOPGL ÷ PiQU CM* eaFWHminusCM; LB19a.2: eaFWHminusOPGL CM* CMS ÷ PiQU CM* eaFWHminusCM; -LB19a.3: ^eaFWHandCM ÷ PiQU CM* eaFWHminusCM; -LB19a.4: ^eaFWHandCM CM* CMS ÷ PiQU CM* eaFWHminusCM; LB19a.5: eaFWH CM* PfQU ÷ eaFWHBreakableAtLB19; LB19a.6: eaFWH CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19; -LB19a.7: ^eaFWHandCM CM* PfQU ÷ eaFWHBreakableAtLB19; -LB19a.8: ^eaFWHandCM CM* PfQU CM* CMS ÷ eaFWHBreakableAtLB19; # Moved up, before LB7, because they can match a longer sequence that would also match LB7. # For example, the sequence "OP CM SP AL" matches LB14 @@ -216,8 +209,7 @@ LB20a.1: ^(HY | HH) CM* AL; # Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then # not picking up the continuing match after the BA from 21a. # Chains over two characters with the LB19a break rule. -LB21a.1: HL CM* ideographicSpace CM* PfQU (CM* CMS)? ÷ eaFWHBreakableAtLB19; -LB21a: HL CM* (HY | BA) CM* [^CM CB HL]; +LB21a: HL CM* (HY | BAminuseaFWH) CM* [^CM CB HL]; LB21.1: . CM* [BA HY NS]; LB21.2: BB CM* [^CM CB]; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index c848d369acc2..fcebf97b915e 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -2223,16 +2223,3 @@ Bangkok)• •« Complex »« chaining » • •« .618 »• # Interaction with the ICU tailoring to break before such numbers. -# Non-breaking lb=SP (from LB14 and LB15a) followed by a lb=CM-as-AL that is -# ea=W, in a position that would match initial context for LB19a if it were not -# ea=W. -# See https://github.com/unicode-org/icu/pull/3028#issuecomment-2200259320. -•︷ \U00016FF1•⸠ᅛᆅ• -•︷ « \U00016FF1•⸠ᅛᆅ• -•A »« \U00016FF1•⸠ᅛᆅ• -•︷ \U00016FF1\u302B•⸠ᅛᆅ• -•︷ « \U00016FF1\u302B•⸠ᅛᆅ• -•A »« \U00016FF1\u302B•⸠ᅛᆅ• -•❲ \u3035⸍•굼• -•❲ « \u3035⸍•굼• -•A »« \u3035⸍•굼•