From 9054d0cc35d2937a2fc47cfb90e9cae0f2785975 Mon Sep 17 00:00:00 2001 From: Tim Chevalier Date: Tue, 8 Oct 2024 14:41:59 -0700 Subject: [PATCH] ICU-22940 MF2 ICU4C: Update for bidi support Per https://github.com/unicode-org/message-format-wg/pull/884 --- .../source/i18n/messageformat2_formatter.cpp | 3 +- icu4c/source/i18n/messageformat2_parser.cpp | 421 ++++++++++++++---- icu4c/source/i18n/messageformat2_parser.h | 84 +++- icu4c/source/i18n/ucln_in.h | 1 + .../test/intltest/messageformat2test.cpp | 10 - .../intltest/messageformat2test_read_json.cpp | 3 + .../test/intltest/messageformat2test_utils.h | 11 +- testdata/message2/bidi.json | 160 +++++++ 8 files changed, 579 insertions(+), 114 deletions(-) create mode 100644 testdata/message2/bidi.json diff --git a/icu4c/source/i18n/messageformat2_formatter.cpp b/icu4c/source/i18n/messageformat2_formatter.cpp index 8d17ae49b99a..e25adda46533 100644 --- a/icu4c/source/i18n/messageformat2_formatter.cpp +++ b/icu4c/source/i18n/messageformat2_formatter.cpp @@ -43,7 +43,8 @@ namespace message2 { // Parse the pattern MFDataModel::Builder tree(errorCode); - Parser(pat, tree, *errors, normalizedInput).parse(parseError, errorCode); + Parser(pat, tree, *errors, normalizedInput, errorCode) + .parse(parseError, errorCode); // Fail on syntax errors if (errors->hasSyntaxError()) { diff --git a/icu4c/source/i18n/messageformat2_parser.cpp b/icu4c/source/i18n/messageformat2_parser.cpp index b4768756c5ea..0b26ee164b37 100644 --- a/icu4c/source/i18n/messageformat2_parser.cpp +++ b/icu4c/source/i18n/messageformat2_parser.cpp @@ -7,9 +7,12 @@ #if !UCONFIG_NO_MF2 +#include "unicode/uniset.h" #include "messageformat2_errors.h" #include "messageformat2_macros.h" #include "messageformat2_parser.h" +#include "ucln_in.h" +#include "umutex.h" #include "uvector.h" // U_ASSERT U_NAMESPACE_BEGIN @@ -91,14 +94,235 @@ static void copyContext(const UChar in[U_PARSE_CONTEXT_LEN], UChar out[U_PARSE_C } // ------------------------------------- -// Predicates +// Initialization of UnicodeSets + +namespace unisets { + +UnicodeSet* gUnicodeSets[unisets::UNISETS_KEY_COUNT] = {}; + +inline UnicodeSet* getImpl(Key key) { + return gUnicodeSets[key]; +} + +icu::UInitOnce gMF2ParseUniSetsInitOnce {}; +} + +UnicodeSet* initContentChars(UErrorCode& status) { + if (U_FAILURE(status)) { + return {}; + } + + UnicodeSet* result = new UnicodeSet(0x0001, 0x0008); // Omit NULL, HTAB and LF + if (result == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + result->add(0x000B, 0x000C); // Omit CR + result->add(0x000E, 0x001F); // Omit SP + result->add(0x0021, 0x002D); // Omit '.' + result->add(0x002F, 0x003F); // Omit '@' + result->add(0x0041, 0x005B); // Omit '\' + result->add(0x005D, 0x007A); // Omit { | } + result->add(0x007E, 0xD7FF); // Omit surrogates + result->add(0xE000, 0x10FFFF); + result->freeze(); + return result; +} + +UnicodeSet* initWhitespace(UErrorCode& status) { + if (U_FAILURE(status)) { + return {}; + } + + UnicodeSet* result = new UnicodeSet(); + if (result == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + result->add(SPACE); + result->add(HTAB); + result->add(CR); + result->add(LF); + result->add(IDEOGRAPHIC_SPACE); + result->freeze(); + return result; +} + +UnicodeSet* initBidiControls(UErrorCode& status) { + UnicodeSet* result = new UnicodeSet(UnicodeString("[\\u061C]"), status); + if (U_FAILURE(status)) { + return {}; + } + result->add(0x200E, 0x200F); + result->add(0x2066, 0x2069); + result->freeze(); + return result; +} + +UnicodeSet* initAlpha(UErrorCode& status) { + UnicodeSet* result = new UnicodeSet(UnicodeString("[:letter:]"), status); + if (U_FAILURE(status)) { + return {}; + } + result->freeze(); + return result; +} + +UnicodeSet* initDigits(UErrorCode& status) { + UnicodeSet* result = new UnicodeSet(UnicodeString("[:number:]"), status); + if (U_FAILURE(status)) { + return {}; + } + result->freeze(); + return result; +} + +UnicodeSet* initNameStartChars(UErrorCode& status) { + if (U_FAILURE(status)) { + return {}; + } -// Returns true if `c` is in the interval [`first`, `last`] -static bool inRange(UChar32 c, UChar32 first, UChar32 last) { - U_ASSERT(first < last); - return c >= first && c <= last; + UnicodeSet* result = new UnicodeSet(*unisets::getImpl(unisets::ALPHA)); + if (result == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + }; + result->add(UNDERSCORE); + result->add(0x00C0, 0x00D6); + result->add(0x00D8, 0x00F6); + result->add(0x00F8, 0x02FF); + result->add(0x0370, 0x037D); + result->add(0x037F, 0x061B); + result->add(0x061D, 0x1FFF); + result->add(0x200C, 0x200D); + result->add(0x2070, 0x218F); + result->add(0x2C00, 0x2FEF); + result->add(0x3001, 0xD7FF); + result->add(0xF900, 0xFDCF); + result->add(0xFDF0, 0xFFFD); + result->add(0x100000, 0xEFFFF); + result->freeze(); + return result; } +UnicodeSet* initNameChars(UErrorCode& status) { + if (U_FAILURE(status)) { + return {}; + } + + UnicodeSet* result = new UnicodeSet(); + if (result == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + }; + result->addAll(*unisets::getImpl(unisets::NAME_START)); + result->addAll(*unisets::getImpl(unisets::DIGIT)); + result->add(HYPHEN); + result->add(PERIOD); + result->add(0x00B7); + result->add(0x0300, 0x036F); + result->add(0x203F, 0x2040); + result->freeze(); + return result; +} + +UnicodeSet* initTextChars(UErrorCode& status) { + if (U_FAILURE(status)) { + return {}; + } + + UnicodeSet* result = new UnicodeSet(); + if (result == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + }; + result->addAll(*unisets::getImpl(unisets::CONTENT)); + result->addAll(*unisets::getImpl(unisets::WHITESPACE)); + result->add(PERIOD); + result->add(AT); + result->add(PIPE); + result->freeze(); + return result; +} + +UnicodeSet* initQuotedChars(UErrorCode& status) { + if (U_FAILURE(status)) { + return {}; + } + + UnicodeSet* result = new UnicodeSet(); + if (result == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + }; + result->addAll(*unisets::getImpl(unisets::CONTENT)); + result->addAll(*unisets::getImpl(unisets::WHITESPACE)); + result->add(PERIOD); + result->add(AT); + result->add(LEFT_CURLY_BRACE); + result->add(RIGHT_CURLY_BRACE); + result->freeze(); + return result; +} + +UnicodeSet* initEscapableChars(UErrorCode& status) { + if (U_FAILURE(status)) { + return {}; + } + + UnicodeSet* result = new UnicodeSet(); + if (result == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + result->add(PIPE); + result->add(BACKSLASH); + result->add(LEFT_CURLY_BRACE); + result->add(RIGHT_CURLY_BRACE); + result->freeze(); + return result; +} + +namespace unisets { + +UBool U_CALLCONV cleanupMF2ParseUniSets() { + for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) { + delete gUnicodeSets[i]; + gUnicodeSets[i] = nullptr; + } + gMF2ParseUniSetsInitOnce.reset(); + return true; +} + +void U_CALLCONV initMF2ParseUniSets(UErrorCode& status) { + ucln_i18n_registerCleanup(UCLN_I18N_MF2_UNISETS, cleanupMF2ParseUniSets); + + gUnicodeSets[unisets::CONTENT] = initContentChars(status); + gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status); + gUnicodeSets[unisets::BIDI] = initBidiControls(status); + gUnicodeSets[unisets::ALPHA] = initAlpha(status); + gUnicodeSets[unisets::DIGIT] = initDigits(status); + gUnicodeSets[unisets::NAME_START] = initNameStartChars(status); + gUnicodeSets[unisets::NAME_CHAR] = initNameChars(status); + gUnicodeSets[unisets::TEXT] = initTextChars(status); + gUnicodeSets[unisets::QUOTED] = initQuotedChars(status); + gUnicodeSets[unisets::ESCAPABLE] = initEscapableChars(status); +} + +const UnicodeSet* get(Key key) { + UErrorCode localStatus = U_ZERO_ERROR; + umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, localStatus); + if (U_FAILURE(localStatus)) { + return nullptr; + } + return getImpl(key); +} + +} + +// ------------------------------------- +// Predicates + /* The following helper predicates should exactly match nonterminals in the MessageFormat 2 grammar: @@ -113,76 +337,50 @@ static bool inRange(UChar32 c, UChar32 first, UChar32 last) { `isWhitespace()` : `s` */ -static bool isContentChar(UChar32 c) { - return inRange(c, 0x0001, 0x0008) // Omit NULL, HTAB and LF - || inRange(c, 0x000B, 0x000C) // Omit CR - || inRange(c, 0x000E, 0x001F) // Omit SP - || inRange(c, 0x0021, 0x002D) // Omit '.' - || inRange(c, 0x002F, 0x003F) // Omit '@' - || inRange(c, 0x0041, 0x005B) // Omit '\' - || inRange(c, 0x005D, 0x007A) // Omit { | } - || inRange(c, 0x007E, 0xD7FF) // Omit surrogates - || inRange(c, 0xE000, 0x10FFFF); +bool Parser::isContentChar(UChar32 c) const { + return contentChars->contains(c); } -// See `s` in the MessageFormat 2 grammar -inline bool isWhitespace(UChar32 c) { - switch (c) { - case SPACE: - case HTAB: - case CR: - case LF: - case IDEOGRAPHIC_SPACE: - return true; - default: - return false; - } +// See `bidi` in the MF2 grammar +bool Parser::isBidiControl(UChar32 c) const { + return bidiControlChars->contains(c); } -static bool isTextChar(UChar32 c) { - return isContentChar(c) - || isWhitespace(c) - || c == PERIOD - || c == AT - || c == PIPE; +// See `ws` in the MessageFormat 2 grammar +bool Parser::isWhitespace(UChar32 c) const { + return whitespaceChars->contains(c); } -static bool isAlpha(UChar32 c) { return inRange(c, 0x0041, 0x005A) || inRange(c, 0x0061, 0x007A); } +bool Parser::isTextChar(UChar32 c) const { + return textChars->contains(c); +} + +bool Parser::isAlpha(UChar32 c) const { + return alphaChars->contains(c); +} -static bool isDigit(UChar32 c) { return inRange(c, 0x0030, 0x0039); } +bool Parser::isDigit(UChar32 c) const { + return digitChars->contains(c); +} -static bool isNameStart(UChar32 c) { - return isAlpha(c) || c == UNDERSCORE || inRange(c, 0x00C0, 0x00D6) || inRange(c, 0x00D8, 0x00F6) || - inRange(c, 0x00F8, 0x02FF) || inRange(c, 0x0370, 0x037D) || inRange(c, 0x037F, 0x1FFF) || - inRange(c, 0x200C, 0x200D) || inRange(c, 0x2070, 0x218F) || inRange(c, 0x2C00, 0x2FEF) || - inRange(c, 0x3001, 0xD7FF) || inRange(c, 0xF900, 0xFDCF) || inRange(c, 0xFDF0, 0xFFFD) || - inRange(c, 0x10000, 0xEFFFF); +bool Parser::isNameStart(UChar32 c) const { + return nameStartChars->contains(c); } -static bool isNameChar(UChar32 c) { - return isNameStart(c) || isDigit(c) || c == HYPHEN || c == PERIOD || c == 0x00B7 || - inRange(c, 0x0300, 0x036F) || inRange(c, 0x203F, 0x2040); +bool Parser::isNameChar(UChar32 c) const { + return nameChars->contains(c); } -static bool isUnquotedStart(UChar32 c) { - return isNameStart(c) || isDigit(c) || c == HYPHEN || c == PERIOD || c == 0x00B7 || - inRange(c, 0x0300, 0x036F) || inRange(c, 0x203F, 0x2040); +bool Parser::isUnquotedStart(UChar32 c) const { + return isNameChar(c); } -static bool isQuotedChar(UChar32 c) { - return isContentChar(c) - || isWhitespace(c) - || c == PERIOD - || c == AT - || c == LEFT_CURLY_BRACE - || c == RIGHT_CURLY_BRACE; +bool Parser::isQuotedChar(UChar32 c) const { + return quotedChars->contains(c); } -static bool isEscapableChar(UChar32 c) { - return c == PIPE - || c == BACKSLASH - || c == LEFT_CURLY_BRACE - || c == RIGHT_CURLY_BRACE; +bool Parser::isEscapableChar(UChar32 c) const { + return escapableChars->contains(c); } // Returns true iff `c` can begin a `function` nonterminal @@ -203,12 +401,12 @@ static bool isAnnotationStart(UChar32 c) { } // Returns true iff `c` can begin a `literal` nonterminal -static bool isLiteralStart(UChar32 c) { +bool Parser::isLiteralStart(UChar32 c) const { return (c == PIPE || isNameStart(c) || c == HYPHEN || isDigit(c)); } // Returns true iff `c` can begin a `key` nonterminal -static bool isKeyStart(UChar32 c) { +bool Parser::isKeyStart(UChar32 c) const { return (c == ASTERISK || isLiteralStart(c)); } @@ -347,7 +545,7 @@ option, or the optional space before an attribute. No pre, no post. A message may end with whitespace, so `index` may equal `len()` on exit. */ -void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) { +void Parser::parseRequiredWS(UErrorCode& errorCode) { bool sawWhitespace = false; // The loop exits either when we consume all the input, @@ -358,7 +556,7 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) // If whitespace isn't required -- or if we saw it already -- // then the caller is responsible for checking this case and // setting an error if necessary. - if (!required || sawWhitespace) { + if (sawWhitespace) { // Not an error. return; } @@ -380,24 +578,51 @@ void Parser::parseWhitespaceMaybeRequired(bool required, UErrorCode& errorCode) } } - if (!sawWhitespace && required) { + if (!sawWhitespace) { ERROR(errorCode); } } +void Parser::parseOptionalBidi() { + while (true) { + if (!inBounds()) { + return; + } + if (isBidiControl(peek())) { + next(); + } else { + break; + } + } +} + /* - No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`. + No pre, no post, because a message may end with whitespace + Matches `s` in the MF2 grammar */ void Parser::parseRequiredWhitespace(UErrorCode& errorCode) { - parseWhitespaceMaybeRequired(true, errorCode); + parseOptionalBidi(); + parseRequiredWS(errorCode); + parseOptionalWhitespace(); normalizedInput += SPACE; } /* No pre, no post, for the same reason as `parseWhitespaceMaybeRequired()`. */ -void Parser::parseOptionalWhitespace(UErrorCode& errorCode) { - parseWhitespaceMaybeRequired(false, errorCode); +void Parser::parseOptionalWhitespace() { + while (true) { + if (!inBounds()) { + return; + } + auto cp = peek(); + if (isWhitespace(cp) || isBidiControl(cp)) { + maybeAdvanceLine(); + next(); + } else { + break; + } + } } // Consumes a single character, signaling an error if `peek()` != `c` @@ -442,11 +667,11 @@ void Parser::parseToken(const std::u16string_view& token, UErrorCode& errorCode) */ void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCode& errorCode) { // No need for error check or bounds check before parseOptionalWhitespace - parseOptionalWhitespace(errorCode); + parseOptionalWhitespace(); // Establish precondition CHECK_BOUNDS(errorCode); parseToken(token, errorCode); - parseOptionalWhitespace(errorCode); + parseOptionalWhitespace(); // Guarantee postcondition CHECK_BOUNDS(errorCode); } @@ -458,12 +683,12 @@ void Parser::parseTokenWithWhitespace(const std::u16string_view& token, UErrorCo then consumes optional whitespace again */ void Parser::parseTokenWithWhitespace(UChar32 c, UErrorCode& errorCode) { - // No need for error check or bounds check before parseOptionalWhitespace(errorCode) - parseOptionalWhitespace(errorCode); + // No need for error check or bounds check before parseOptionalWhitespace() + parseOptionalWhitespace(); // Establish precondition CHECK_BOUNDS(errorCode); parseToken(c, errorCode); - parseOptionalWhitespace(errorCode); + parseOptionalWhitespace(); // Guarantee postcondition CHECK_BOUNDS(errorCode); } @@ -482,11 +707,17 @@ UnicodeString Parser::parseName(UErrorCode& errorCode) { U_ASSERT(inBounds()); - if (!isNameStart(peek())) { + if (!(isNameStart(peek()) || isBidiControl(peek()))) { ERROR(errorCode); return name; } + // name = [bidi] name-start *name-char [bidi] + + // [bidi] + parseOptionalBidi(); + + // name-start *name-char while (isNameChar(peek())) { UChar32 c = peek(); name += c; @@ -497,6 +728,10 @@ UnicodeString Parser::parseName(UErrorCode& errorCode) { break; } } + + // [bidi] + parseOptionalBidi(); + return name; } @@ -853,7 +1088,7 @@ void Parser::parseAttribute(AttributeAdder& attrAdder, UErrorCode& errorCode) // about whether whitespace precedes another // attribute, or the '=' sign int32_t savedIndex = index; - parseOptionalWhitespace(errorCode); + parseOptionalWhitespace(); Operand rand; if (peek() == EQUALS) { @@ -1149,7 +1384,7 @@ the comment in `parseOptions()` for details. // (the character is either the required space before an annotation, or optional // trailing space after the literal or variable). It's still ambiguous which // one does apply. - parseOptionalWhitespace(status); + parseOptionalWhitespace(); // Restore precondition CHECK_BOUNDS(status); @@ -1220,7 +1455,7 @@ Expression Parser::parseExpression(UErrorCode& status) { // Parse opening brace parseToken(LEFT_CURLY_BRACE, status); // Optional whitespace after opening brace - parseOptionalWhitespace(status); + parseOptionalWhitespace(); Expression::Builder exprBuilder(status); // Restore precondition @@ -1263,7 +1498,7 @@ Expression Parser::parseExpression(UErrorCode& status) { // Parse optional space // (the last [s] in e.g. "{" [s] literal [s annotation] *(s attribute) [s] "}") - parseOptionalWhitespace(status); + parseOptionalWhitespace(); // Either an operand or operator (or both) must have been set already, // so there can't be an error @@ -1339,7 +1574,7 @@ void Parser::parseInputDeclaration(UErrorCode& status) { CHECK_BOUNDS(status); parseToken(ID_INPUT, status); - parseOptionalWhitespace(status); + parseOptionalWhitespace(); // Restore precondition before calling parseExpression() CHECK_BOUNDS(status); @@ -1400,7 +1635,7 @@ void Parser::parseDeclarations(UErrorCode& status) { // Avoid looping infinitely CHECK_ERROR(status); - parseOptionalWhitespace(status); + parseOptionalWhitespace(); // Restore precondition CHECK_BOUNDS(status); } @@ -1510,8 +1745,8 @@ This is addressed using "backtracking" (similarly to `parseOptions()`). // We've seen at least one whitespace-key pair, so now we can parse // *(s key) [s] - while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek())) { // Try to recover from errors - bool wasWhitespace = isWhitespace(peek()); + while (peek() != LEFT_CURLY_BRACE || isWhitespace(peek()) || isBidiControl(peek())) { + bool wasWhitespace = isWhitespace(peek()) || isBidiControl(peek()); parseRequiredWhitespace(status); if (!wasWhitespace) { // Avoid infinite loop when parsing something like: @@ -1569,7 +1804,7 @@ Markup Parser::parseMarkup(UErrorCode& status) { // Consume the '{' next(); normalizedInput += LEFT_CURLY_BRACE; - parseOptionalWhitespace(status); + parseOptionalWhitespace(); bool closing = false; switch (peek()) { case NUMBER_SIGN: { @@ -1596,19 +1831,19 @@ Markup Parser::parseMarkup(UErrorCode& status) { // Parse the options, which must begin with a ' ' // if present - if (inBounds() && isWhitespace(peek())) { + if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) { OptionAdder optionAdder(builder); parseOptions(optionAdder, status); } // Parse the attributes, which also must begin // with a ' ' - if (inBounds() && isWhitespace(peek())) { + if (inBounds() && (isWhitespace(peek()) || isBidiControl(peek()))) { AttributeAdder attrAdder(builder); parseAttributes(attrAdder, status); } - parseOptionalWhitespace(status); + parseOptionalWhitespace(); bool standalone = false; // Check if this is a standalone or not @@ -1656,7 +1891,7 @@ std::variant Parser::parsePlaceholder(UErrorCode& status) { isMarkup = true; break; } - if (!isWhitespace(c)){ + if (!(isWhitespace(c) || isBidiControl(c))) { break; } tempIndex++; @@ -1740,7 +1975,7 @@ void Parser::parseSelectors(UErrorCode& status) { // "Backtracking" is required here. It's not clear if whitespace is // (`[s]` selector) or (`[s]` variant) while (isWhitespace(peek()) || peek() == LEFT_CURLY_BRACE) { - parseOptionalWhitespace(status); + parseOptionalWhitespace(); // Restore precondition CHECK_BOUNDS(status); if (peek() != LEFT_CURLY_BRACE) { @@ -1770,9 +2005,9 @@ void Parser::parseSelectors(UErrorCode& status) { } \ // Parse variants - while (isWhitespace(peek()) || isKeyStart(peek())) { + while (isWhitespace(peek()) || isBidiControl(peek()) || isKeyStart(peek())) { // Trailing whitespace is allowed - parseOptionalWhitespace(status); + parseOptionalWhitespace(); if (!inBounds()) { return; } @@ -1871,7 +2106,7 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) { bool complex = false; // First, "look ahead" to determine if this is a simple or complex // message. To do that, check the first non-whitespace character. - while (inBounds(index) && isWhitespace(peek())) { + while (inBounds(index) && (isWhitespace(peek()) || isBidiControl(peek()))) { next(); } @@ -1891,10 +2126,10 @@ void Parser::parse(UParseError &parseErrorResult, UErrorCode& status) { // Message can be empty, so we need to only look ahead // if we know it's non-empty if (complex) { - parseOptionalWhitespace(status); + parseOptionalWhitespace(); parseDeclarations(status); parseBody(status); - parseOptionalWhitespace(status); + parseOptionalWhitespace(); } else { // Simple message // For normalization, quote the pattern diff --git a/icu4c/source/i18n/messageformat2_parser.h b/icu4c/source/i18n/messageformat2_parser.h index b62cbe9200b9..170447731345 100644 --- a/icu4c/source/i18n/messageformat2_parser.h +++ b/icu4c/source/i18n/messageformat2_parser.h @@ -10,6 +10,7 @@ #include "unicode/messageformat2_data_model.h" #include "unicode/parseerr.h" +#include "unicode/uniset.h" #include "messageformat2_allocation.h" #include "messageformat2_errors.h" @@ -54,6 +55,26 @@ namespace message2 { } }; + + // Initialization of UnicodeSets + namespace unisets { + enum Key { + CONTENT, + WHITESPACE, + BIDI, + ALPHA, + DIGIT, + NAME_START, + NAME_CHAR, + TEXT, + QUOTED, + ESCAPABLE, + UNISETS_KEY_COUNT + }; + + U_I18N_API const UnicodeSet* get(Key key); + } + // Parser class (private) class Parser : public UMemory { public: @@ -82,8 +103,23 @@ namespace message2 { UChar postContext[U_PARSE_CONTEXT_LEN]; } MessageParseError; - Parser(const UnicodeString &input, MFDataModel::Builder& dataModelBuilder, StaticErrors& e, UnicodeString& normalizedInputRef) - : source(input), index(0), errors(e), normalizedInput(normalizedInputRef), dataModel(dataModelBuilder) { + Parser(const UnicodeString &input, + MFDataModel::Builder& dataModelBuilder, + StaticErrors& e, + UnicodeString& normalizedInputRef, + UErrorCode& status) + : contentChars(unisets::get(unisets::CONTENT)), + whitespaceChars(unisets::get(unisets::WHITESPACE)), + bidiControlChars(unisets::get(unisets::BIDI)), + alphaChars(unisets::get(unisets::ALPHA)), + digitChars(unisets::get(unisets::DIGIT)), + nameStartChars(unisets::get(unisets::NAME_START)), + nameChars(unisets::get(unisets::NAME_CHAR)), + textChars(unisets::get(unisets::TEXT)), + quotedChars(unisets::get(unisets::QUOTED)), + escapableChars(unisets::get(unisets::ESCAPABLE)), + source(input), index(0), errors(e), normalizedInput(normalizedInputRef), dataModel(dataModelBuilder) { + (void) status; parseError.line = 0; parseError.offset = 0; parseError.lengthBeforeCurrentLine = 0; @@ -91,6 +127,31 @@ namespace message2 { parseError.postContext[0] = '\0'; } + UnicodeSet initContentChars(UErrorCode& status); + UnicodeSet initWhitespace(UErrorCode& status); + UnicodeSet initBidiControls(UErrorCode& status); + UnicodeSet initAlpha(UErrorCode& status); + UnicodeSet initDigits(UErrorCode& status); + UnicodeSet initNameStartChars(UErrorCode& status); + UnicodeSet initNameChars(UErrorCode& status); + UnicodeSet initTextChars(UErrorCode& status); + UnicodeSet initQuotedChars(UErrorCode& status); + UnicodeSet initEscapableChars(UErrorCode& status); + + bool isContentChar(UChar32) const; + bool isBidiControl(UChar32) const; + bool isWhitespace(UChar32) const; + bool isTextChar(UChar32) const; + bool isQuotedChar(UChar32) const; + bool isEscapableChar(UChar32) const; + bool isAlpha(UChar32) const; + bool isDigit(UChar32) const; + bool isNameStart(UChar32) const; + bool isNameChar(UChar32) const; + bool isUnquotedStart(UChar32) const; + bool isLiteralStart(UChar32) const; + bool isKeyStart(UChar32) const; + static void translateParseError(const MessageParseError&, UParseError&); static void setParseError(MessageParseError&, uint32_t); void maybeAdvanceLine(); @@ -102,9 +163,10 @@ namespace message2 { void parseInputDeclaration(UErrorCode&); void parseSelectors(UErrorCode&); - void parseWhitespaceMaybeRequired(bool, UErrorCode&); + void parseRequiredWS(UErrorCode&); void parseRequiredWhitespace(UErrorCode&); - void parseOptionalWhitespace(UErrorCode&); + void parseOptionalBidi(); + void parseOptionalWhitespace(); void parseToken(UChar32, UErrorCode&); void parseTokenWithWhitespace(UChar32, UErrorCode&); void parseToken(const std::u16string_view&, UErrorCode&); @@ -149,6 +211,18 @@ namespace message2 { bool inBounds(uint32_t i) const { return source.moveIndex32(index, i) < source.length(); } bool allConsumed() const { return (int32_t) index == source.length(); } + // UnicodeSets for checking character ranges + const UnicodeSet* contentChars; + const UnicodeSet* whitespaceChars; + const UnicodeSet* bidiControlChars; + const UnicodeSet* alphaChars; + const UnicodeSet* digitChars; + const UnicodeSet* nameStartChars; + const UnicodeSet* nameChars; + const UnicodeSet* textChars; + const UnicodeSet* quotedChars; + const UnicodeSet* escapableChars; + // The input string const UnicodeString &source; // The current position within the input string -- counting in UChar32 @@ -165,8 +239,8 @@ namespace message2 { // The parent builder MFDataModel::Builder &dataModel; - }; // class Parser + }; // class Parser } // namespace message2 U_NAMESPACE_END diff --git a/icu4c/source/i18n/ucln_in.h b/icu4c/source/i18n/ucln_in.h index 765cdd559fb4..ea4201911694 100644 --- a/icu4c/source/i18n/ucln_in.h +++ b/icu4c/source/i18n/ucln_in.h @@ -62,6 +62,7 @@ typedef enum ECleanupI18NType { UCLN_I18N_REGION, UCLN_I18N_LIST_FORMATTER, UCLN_I18N_NUMSYS, + UCLN_I18N_MF2_UNISETS, UCLN_I18N_COUNT /* This must be last */ } ECleanupI18NType; diff --git a/icu4c/source/test/intltest/messageformat2test.cpp b/icu4c/source/test/intltest/messageformat2test.cpp index 353082ef5c91..0f17a28fcc52 100644 --- a/icu4c/source/test/intltest/messageformat2test.cpp +++ b/icu4c/source/test/intltest/messageformat2test.cpp @@ -11,16 +11,6 @@ using namespace icu::message2; -/* - TODO: Tests need to be unified in a single format that - both ICU4C and ICU4J can use, rather than being embedded in code. - - Tests are included in their current state to give a sense of - how much test coverage has been achieved. Most of the testing is - of the parser/serializer; the formatter needs to be tested more - thoroughly. -*/ - void TestMessageFormat2::runIndexedTest(int32_t index, UBool exec, const char* &name, char* /*par*/) { diff --git a/icu4c/source/test/intltest/messageformat2test_read_json.cpp b/icu4c/source/test/intltest/messageformat2test_read_json.cpp index ddf93da632ce..96906f618f8a 100644 --- a/icu4c/source/test/intltest/messageformat2test_read_json.cpp +++ b/icu4c/source/test/intltest/messageformat2test_read_json.cpp @@ -309,6 +309,9 @@ void TestMessageFormat2::jsonTestsFromFiles(IcuTestErrorCode& errorCode) { runTestsFromJsonFile(*this, "spec/functions/time.json", errorCode); // Other tests (non-spec) + // TODO: https://github.com/unicode-org/message-format-wg/pull/902 will + // move the bidi tests into the spec + runTestsFromJsonFile(*this, "bidi.json", errorCode); runTestsFromJsonFile(*this, "more-functions.json", errorCode); runTestsFromJsonFile(*this, "valid-tests.json", errorCode); runTestsFromJsonFile(*this, "resolution-errors.json", errorCode); diff --git a/icu4c/source/test/intltest/messageformat2test_utils.h b/icu4c/source/test/intltest/messageformat2test_utils.h index c4ad251c7f48..3abfda1f8f73 100644 --- a/icu4c/source/test/intltest/messageformat2test_utils.h +++ b/icu4c/source/test/intltest/messageformat2test_utils.h @@ -252,7 +252,8 @@ class TestUtils { if (!roundTrip(in, mf.getDataModel(), out) // For now, don't round-trip messages with these errors, // since duplicate options are dropped - && testCase.expectedErrorCode() != U_MF_DUPLICATE_OPTION_NAME_ERROR) { + && (testCase.expectSuccess() || + testCase.expectedErrorCode() != U_MF_DUPLICATE_OPTION_NAME_ERROR)) { failRoundTrip(tmsg, testCase, in, out); } @@ -291,10 +292,10 @@ class TestUtils { } // Re-run the formatter result = mf.formatToString(MessageArguments(testCase.getArguments(), errorCode), errorCode); - if (!testCase.outputMatches(result)) { - failWrongOutput(tmsg, testCase, result); - return; - } + } + if (!testCase.outputMatches(result)) { + failWrongOutput(tmsg, testCase, result); + return; } errorCode.reset(); } diff --git a/testdata/message2/bidi.json b/testdata/message2/bidi.json new file mode 100644 index 000000000000..5979f385f154 --- /dev/null +++ b/testdata/message2/bidi.json @@ -0,0 +1,160 @@ +{ + "scenario": "Bidi support", + "description": "Tests for correct parsing of messages with bidirectional marks and isolates", + "defaultTestProperties": { + "locale": "en-US" + }, + "tests": [ + { + "comment": "simple-message = o [simple-start pattern]", + "src": " \u061C Hello world!", + "exp": " \u061C Hello world!" + }, + { + "comment": "complex-message = o *(declaration o) complex-body o", + "src": "\u200E .local $x = {1} {{ {$x}}}", + "exp": " 1" + }, + { + "comment": "complex-message = o *(declaration o) complex-body o", + "src": ".local $x = {1} \u200F {{ {$x}}}", + "exp": " 1" + }, + { + "comment": "complex-message = o *(declaration o) complex-body o", + "src": ".local $x = {1} {{ {$x}}} \u2066", + "exp": " 1" + }, + { + "comment": "input-declaration = input o variable-expression", + "src": ".input \u2067 {$x :number} {{hello}}", + "params": [{"name": "x", "value": "1"}], + "exp": "hello" + }, + { + "comment": "local s variable o \"=\" o expression", + "src": ".local $x \u2068 = \u2069 {1} {{hello}}", + "exp": "hello" + }, + { + "comment": "local s variable o \"=\" o expression", + "src": ".local \u2067 $x = {1} {{hello}}", + "exp": "hello" + }, + { + "comment": "local s variable o \"=\" o expression", + "src": ".local\u2067 $x = {1} {{hello}}", + "exp": "hello" + }, + { + "comment": "o \"{{\" pattern \"}}\"", + "src": "\u2067 {{hello}}", + "exp": "hello" + }, + { + "comment": "match-statement s variant *(o variant)", + "src": [".local $x = {1 :number}\n", + ".match {$x}\n", + "1 {{one}}\n", + "\u061C * {{other}}"], + "exp": "one" + }, + { + "comment": "match-statement s variant *(o variant)", + "src": [".local $x = {1 :number}", + ".match {$x} \u061c", + "1 {{one}}", + "* {{other}}"], + "exp": "one" + }, + { + "comment": "match-statement s variant *(o variant)", + "src": [".local $x = {1 :number}", + ".match {$x}\u061c", + "1 {{one}}", + "* {{other}}"], + "exp": "one" + }, + { + "comment": "variant = key *(s key) quoted-pattern", + "src": [".local $x = {1 :number} .local $y = {$x :number}", + ".match {$x} {$y}", + "1 \u200E 1 {{one}}", + "* * {{other}}"], + "exp": "one" + }, + { + "comment": "variant = key *(s key) quoted-pattern", + "src": [".local $x = {1 :number} .local $y = {$x :number}", + ".match {$x} {$y}", + "1\u200E 1 {{one}}", + "* * {{other}}"], + "exp": "one" + }, + { + "comment": "literal-expression = \"{\" o literal [s function] *(s attribute) o \"}\"", + "src": "{\u200E hello \u200F}", + "exp": "hello" + }, + { + "comment": "variable-expression = \"{\" o variable [s function] *(s attribute) o \"}\"", + "src": ".local $x = {1} {{ {\u200E $x \u200F} }}", + "exp": " 1 " + }, + { + "comment": "function-expression = \"{\" o function *(s attribute) o \"}\"", + "src": "{1 \u200E :number \u200F}", + "exp": "1" + }, + { + "comment": "markup = \"{\" o \"#\" identifier *(s option) *(s attribute) o [\"/\"] \"}\"", + "src": "{\u200F #b \u200E }", + "exp": "" + }, + { + "comment": "markup = \"{\" o \"/\" identifier *(s option) *(s attribute) o \"}\"", + "src": "{\u200F /b \u200E }", + "exp": "" + }, + { + "comment": "option = identifier o \"=\" o (literal / variable)", + "src": "{1 :number minimumFractionDigits\u200F=\u200E1 }", + "exp": "1.0" + }, + { + "comment": "attribute = \"@\" identifier [o \"=\" o (literal / variable)]", + "src": "{1 :number @locale\u200F=\u200Een }", + "exp": "1" + }, + { + "comment": " name... excludes U+FFFD and U+061C -- this pases as name -> [bidi] name-start *name-char", + "src": ".local $\u061Cfoo = {1} {{ {$\u061Cfoo} }}", + "exp": " 1 " + }, + { + "comment": " name matches https://www.w3.org/TR/REC-xml-names/#NT-NCName but excludes U+FFFD and U+061C", + "src": ".local $foo\u061Cbar = {2} {{ }}", + "expErrors": [{"type": "syntax-error"}] + }, + { + "comment": "name = [bidi] name-start *name-char [bidi]", + "src": ".local $\u200Efoo\u200F = {3} {{{$\u200Efoo\u200F}}}", + "exp": "3" + }, + { + "comment": "name = [bidi] name-start *name-char [bidi]", + "src": ".local $foo = {4} {{{$\u200Efoo\u200F}}}", + "exp": "4" + }, + { + "comment": "name = [bidi] name-start *name-char [bidi]", + "src": ".local $\u200Efoo\u200F = {5} {{{$foo}}}", + "exp": "5" + }, + { + "comment": "name = [bidi] name-start *name-char [bidi]", + "src": ".local $foo\u200Ebar = {6} {{{$foo\u200Ebar}}}", + "expErrors": [{"type": "syntax-error"}] + } + ] +}