From f8aa68b0c1c9584633e7a61157185f1a2c275f58 Mon Sep 17 00:00:00 2001 From: Tim Chevalier Date: Fri, 13 Dec 2024 15:46:54 -0800 Subject: [PATCH] ICU-22940 MF2 ICU4C: Error checking improvements in parser Improve checking for OOM errors when allocating UnicodeSets, per post-merge comments on https://github.com/unicode-org/icu/pull/3236 --- icu4c/source/i18n/messageformat2_parser.cpp | 103 ++++++++++++++------ icu4c/source/i18n/messageformat2_parser.h | 33 +++---- 2 files changed, 86 insertions(+), 50 deletions(-) diff --git a/icu4c/source/i18n/messageformat2_parser.cpp b/icu4c/source/i18n/messageformat2_parser.cpp index f61596e4f4ac..9a9f8e78df03 100644 --- a/icu4c/source/i18n/messageformat2_parser.cpp +++ b/icu4c/source/i18n/messageformat2_parser.cpp @@ -111,7 +111,7 @@ icu::UInitOnce gMF2ParseUniSetsInitOnce {}; UnicodeSet* initContentChars(UErrorCode& status) { if (U_FAILURE(status)) { - return {}; + return nullptr; } UnicodeSet* result = new UnicodeSet(0x0001, 0x0008); // Omit NULL, HTAB and LF @@ -133,7 +133,7 @@ UnicodeSet* initContentChars(UErrorCode& status) { UnicodeSet* initWhitespace(UErrorCode& status) { if (U_FAILURE(status)) { - return {}; + return nullptr; } UnicodeSet* result = new UnicodeSet(); @@ -153,7 +153,7 @@ UnicodeSet* initWhitespace(UErrorCode& status) { UnicodeSet* initBidiControls(UErrorCode& status) { UnicodeSet* result = new UnicodeSet(UnicodeString("[\\u061C]"), status); if (U_FAILURE(status)) { - return {}; + return nullptr; } result->add(0x200E, 0x200F); result->add(0x2066, 0x2069); @@ -164,7 +164,7 @@ UnicodeSet* initBidiControls(UErrorCode& status) { UnicodeSet* initAlpha(UErrorCode& status) { UnicodeSet* result = new UnicodeSet(UnicodeString("[:letter:]"), status); if (U_FAILURE(status)) { - return {}; + return nullptr; } result->freeze(); return result; @@ -173,7 +173,7 @@ UnicodeSet* initAlpha(UErrorCode& status) { UnicodeSet* initDigits(UErrorCode& status) { UnicodeSet* result = new UnicodeSet(UnicodeString("[:number:]"), status); if (U_FAILURE(status)) { - return {}; + return nullptr; } result->freeze(); return result; @@ -181,10 +181,14 @@ UnicodeSet* initDigits(UErrorCode& status) { UnicodeSet* initNameStartChars(UErrorCode& status) { if (U_FAILURE(status)) { - return {}; + return nullptr; } - UnicodeSet* result = new UnicodeSet(*unisets::getImpl(unisets::ALPHA)); + UnicodeSet* isAlpha = unisets::gUnicodeSets[unisets::ALPHA] = initAlpha(status); + if (U_FAILURE(status)) { + return nullptr; + } + UnicodeSet* result = new UnicodeSet(*isAlpha); if (result == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; @@ -209,16 +213,21 @@ UnicodeSet* initNameStartChars(UErrorCode& status) { UnicodeSet* initNameChars(UErrorCode& status) { if (U_FAILURE(status)) { - return {}; + return nullptr; } + UnicodeSet* nameStart = unisets::gUnicodeSets[unisets::NAME_START] = initNameStartChars(status); + UnicodeSet* digit = unisets::gUnicodeSets[unisets::DIGIT] = initDigits(status); + if (U_FAILURE(status)) { + return nullptr; + } UnicodeSet* result = new UnicodeSet(); if (result == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; }; - result->addAll(*unisets::getImpl(unisets::NAME_START)); - result->addAll(*unisets::getImpl(unisets::DIGIT)); + result->addAll(*nameStart); + result->addAll(*digit); result->add(HYPHEN); result->add(PERIOD); result->add(0x00B7); @@ -230,16 +239,21 @@ UnicodeSet* initNameChars(UErrorCode& status) { UnicodeSet* initTextChars(UErrorCode& status) { if (U_FAILURE(status)) { - return {}; + return nullptr; } + UnicodeSet* content = unisets::gUnicodeSets[unisets::CONTENT] = initContentChars(status); + UnicodeSet* whitespace = unisets::gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status); + if (U_FAILURE(status)) { + return nullptr; + } UnicodeSet* result = new UnicodeSet(); if (result == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; }; - result->addAll(*unisets::getImpl(unisets::CONTENT)); - result->addAll(*unisets::getImpl(unisets::WHITESPACE)); + result->addAll(*content); + result->addAll(*whitespace); result->add(PERIOD); result->add(AT); result->add(PIPE); @@ -249,16 +263,31 @@ UnicodeSet* initTextChars(UErrorCode& status) { UnicodeSet* initQuotedChars(UErrorCode& status) { if (U_FAILURE(status)) { - return {}; + return nullptr; } + unisets::gUnicodeSets[unisets::TEXT] = initTextChars(status); + if (U_FAILURE(status)) { + return nullptr; + } UnicodeSet* result = new UnicodeSet(); if (result == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; }; - result->addAll(*unisets::getImpl(unisets::CONTENT)); - result->addAll(*unisets::getImpl(unisets::WHITESPACE)); + // content and whitespace were initialized by `initTextChars()` + UnicodeSet* content = unisets::getImpl(unisets::CONTENT); + if (content == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + result->addAll(*content); + UnicodeSet* whitespace = unisets::getImpl(unisets::WHITESPACE); + if (whitespace == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + result->addAll(*whitespace); result->add(PERIOD); result->add(AT); result->add(LEFT_CURLY_BRACE); @@ -269,7 +298,7 @@ UnicodeSet* initQuotedChars(UErrorCode& status) { UnicodeSet* initEscapableChars(UErrorCode& status) { if (U_FAILURE(status)) { - return {}; + return nullptr; } UnicodeSet* result = new UnicodeSet(); @@ -298,26 +327,44 @@ UBool U_CALLCONV cleanupMF2ParseUniSets() { void U_CALLCONV initMF2ParseUniSets(UErrorCode& status) { ucln_i18n_registerCleanup(UCLN_I18N_MF2_UNISETS, cleanupMF2ParseUniSets); + /* + Each of the init functions initializes the UnicodeSets + that it depends on. - gUnicodeSets[unisets::CONTENT] = initContentChars(status); - gUnicodeSets[unisets::WHITESPACE] = initWhitespace(status); + initBidiControls (no dependencies) + + initEscapableChars (no dependencies) + + initNameChars depends on + initDigits + initNameStartChars depends on + initAlpha + + initQuotedChars depends on + initTextChars depends on + initContentChars + initWhitespace + */ gUnicodeSets[unisets::BIDI] = initBidiControls(status); - gUnicodeSets[unisets::ALPHA] = initAlpha(status); - gUnicodeSets[unisets::DIGIT] = initDigits(status); - gUnicodeSets[unisets::NAME_START] = initNameStartChars(status); gUnicodeSets[unisets::NAME_CHAR] = initNameChars(status); - gUnicodeSets[unisets::TEXT] = initTextChars(status); gUnicodeSets[unisets::QUOTED] = initQuotedChars(status); gUnicodeSets[unisets::ESCAPABLE] = initEscapableChars(status); + + if (U_FAILURE(status)) { + cleanupMF2ParseUniSets(); + } } -const UnicodeSet* get(Key key) { - UErrorCode localStatus = U_ZERO_ERROR; - umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, localStatus); - if (U_FAILURE(localStatus)) { +const UnicodeSet* get(Key key, UErrorCode& status) { + umtx_initOnce(gMF2ParseUniSetsInitOnce, &initMF2ParseUniSets, status); + if (U_FAILURE(status)) { return nullptr; } - return getImpl(key); + UnicodeSet* result = getImpl(key); + if (result == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + } + return result; } } diff --git a/icu4c/source/i18n/messageformat2_parser.h b/icu4c/source/i18n/messageformat2_parser.h index 7b6dfa3b3824..62a52d8f680a 100644 --- a/icu4c/source/i18n/messageformat2_parser.h +++ b/icu4c/source/i18n/messageformat2_parser.h @@ -74,7 +74,7 @@ namespace message2 { UNISETS_KEY_COUNT }; - U_I18N_API const UnicodeSet* get(Key key); + U_I18N_API const UnicodeSet* get(Key key, UErrorCode& status); } // Parser class (private) @@ -110,16 +110,16 @@ namespace message2 { StaticErrors& e, UnicodeString& normalizedInputRef, UErrorCode& status) - : contentChars(unisets::get(unisets::CONTENT)), - whitespaceChars(unisets::get(unisets::WHITESPACE)), - bidiControlChars(unisets::get(unisets::BIDI)), - alphaChars(unisets::get(unisets::ALPHA)), - digitChars(unisets::get(unisets::DIGIT)), - nameStartChars(unisets::get(unisets::NAME_START)), - nameChars(unisets::get(unisets::NAME_CHAR)), - textChars(unisets::get(unisets::TEXT)), - quotedChars(unisets::get(unisets::QUOTED)), - escapableChars(unisets::get(unisets::ESCAPABLE)), + : contentChars(unisets::get(unisets::CONTENT, status)), + whitespaceChars(unisets::get(unisets::WHITESPACE, status)), + bidiControlChars(unisets::get(unisets::BIDI, status)), + alphaChars(unisets::get(unisets::ALPHA, status)), + digitChars(unisets::get(unisets::DIGIT, status)), + nameStartChars(unisets::get(unisets::NAME_START, status)), + nameChars(unisets::get(unisets::NAME_CHAR, status)), + textChars(unisets::get(unisets::TEXT, status)), + quotedChars(unisets::get(unisets::QUOTED, status)), + escapableChars(unisets::get(unisets::ESCAPABLE, status)), source(input), index(0), errors(e), normalizedInput(normalizedInputRef), dataModel(dataModelBuilder) { (void) status; parseError.line = 0; @@ -129,17 +129,6 @@ namespace message2 { parseError.postContext[0] = '\0'; } - UnicodeSet initContentChars(UErrorCode& status); - UnicodeSet initWhitespace(UErrorCode& status); - UnicodeSet initBidiControls(UErrorCode& status); - UnicodeSet initAlpha(UErrorCode& status); - UnicodeSet initDigits(UErrorCode& status); - UnicodeSet initNameStartChars(UErrorCode& status); - UnicodeSet initNameChars(UErrorCode& status); - UnicodeSet initTextChars(UErrorCode& status); - UnicodeSet initQuotedChars(UErrorCode& status); - UnicodeSet initEscapableChars(UErrorCode& status); - bool isContentChar(UChar32) const; bool isBidiControl(UChar32) const; bool isWhitespace(UChar32) const;