From 503df5703e028dad8accf4ed1ce7368d4c3d23a9 Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Tue, 21 Nov 2023 14:37:24 -0800 Subject: [PATCH] ICU-13219 Address Mark's comment --- icu4c/source/common/rbbi.cpp | 4 ++++ .../com/ibm/icu/text/RuleBasedBreakIterator.java | 14 +++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 1e10e2ff795b..c8708a8f2b45 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -109,6 +109,10 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseB // "thai-arab" to "[[:scx=thai:][:scx=arab:]]" UnicodeString udxs(u'['); for (size_t i = 0; i < items; i++) { + if (i > 0 && dxs[i*5-1] != u'-') { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } // Special handling of zyyy // "The code Zyyy (Common) can be specified to exclude all scripts" if (uprv_strncmp(dxs+i*5, "zyyy", 4) == 0) { diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java index 4406d6061574..43b6da72260d 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -102,13 +102,11 @@ public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is * @param phraseBreaking a flag indicating if phrase breaking is required. * @param dxValues Dictionary break script exclusions. * @throws IOException if there is an error while reading the rules from the buffer. - * IllegalArgumentException if the dxValues is not null nor a String in the supported - * format. 
* @see #compileRules(String, OutputStream) * @internal */ /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules( - ByteBuffer bytes, boolean phraseBreaking, String dxValues) throws IOException, IllegalArgumentException { + ByteBuffer bytes, boolean phraseBreaking, String dxValues) throws IOException { RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes); instance.fPhraseBreaking = phraseBreaking; instance.fDX = makeExcludedDictionaryBreakUnicodeSet(dxValues); @@ -118,11 +116,10 @@ public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is /** * Crate a UnicodeSet for the Dictionary Break Script Exclusions. * @param dxValues Dictionary break script exclusions, a string of Script code joined by "-". - * @throws IOException if there is an error while constr the rules from the buffer. * @internal */ private static UnicodeSet makeExcludedDictionaryBreakUnicodeSet( - String dxs) throws IllegalArgumentException { + String dxs) { if (dxs == null) { return null; } @@ -133,6 +130,9 @@ private static UnicodeSet makeExcludedDictionaryBreakUnicodeSet( StringBuilder builder = new StringBuilder("["); int items = 1 + (dxs.length() / 5); for (int i = 0; i < items; i++) { + if (i > 0 && dxs.charAt(i*5-1) != '-') { + throw new IllegalArgumentException("Incorrect value for dx key: " + dxs); + } String script = dxs.substring(i*5, i*5+4); // Special handling of zyyy // "The code Zyyy (Common) can be specified to exclude all scripts" @@ -142,6 +142,10 @@ private static UnicodeSet makeExcludedDictionaryBreakUnicodeSet( builder.append("[:scx=").append(script).append(":]"); } builder.append("]"); + // The UnicodeSet constructor will catch malformed dx values below. + // For example, if the locale is "en-u-dx-abc-defgh", dxs is "abc-defgh" + // and builder.toString() returns "[[:scx=abc-:][:scx=efgh:]]" and causes + // the UnicodeSet constructor to throw IllegalArgumentException. return new UnicodeSet(builder.toString()); }