Skip to content

Commit

Permalink
ICU-13219 Address Mark's comment
Browse files Browse the repository at this point in the history
  • Loading branch information
FrankYFTang committed Nov 21, 2023
1 parent 9cf2b52 commit 503df57
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
4 changes: 4 additions & 0 deletions icu4c/source/common/rbbi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseB
// "thai-arab" to "[[:scx=thai:][:scx=arab:]]"
UnicodeString udxs(u'[');
for (size_t i = 0; i < items; i++) {
if (i > 0 && dxs[i*5-1] != u'-') {
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
// Special handling of zyyy
// "The code Zyyy (Common) can be specified to exclude all scripts"
if (uprv_strncmp(dxs+i*5, "zyyy", 4) == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,11 @@ public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is
* @param phraseBreaking a flag indicating if phrase breaking is required.
* @param dxValues Dictionary break script exclusions.
* @throws IOException if there is an error while reading the rules from the buffer.
* IllegalArgumentException if the dxValues is not null nor a String in the supported
* format.
* @see #compileRules(String, OutputStream)
* @internal
*/
/* package-private */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
ByteBuffer bytes, boolean phraseBreaking, String dxValues) throws IOException, IllegalArgumentException {
ByteBuffer bytes, boolean phraseBreaking, String dxValues) throws IOException {
RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
instance.fPhraseBreaking = phraseBreaking;
instance.fDX = makeExcludedDictionaryBreakUnicodeSet(dxValues);
Expand All @@ -118,11 +116,10 @@ public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is
/**
 * Create a UnicodeSet for the Dictionary Break Script Exclusions.
 * @param dxValues Dictionary break script exclusions, a string of script codes joined by "-".
* @throws IOException if there is an error while constr the rules from the buffer.
* @internal
*/
private static UnicodeSet makeExcludedDictionaryBreakUnicodeSet(
String dxs) throws IllegalArgumentException {
String dxs) {
if (dxs == null) {
return null;
}
Expand All @@ -133,6 +130,9 @@ private static UnicodeSet makeExcludedDictionaryBreakUnicodeSet(
StringBuilder builder = new StringBuilder("[");
int items = 1 + (dxs.length() / 5);
for (int i = 0; i < items; i++) {
if (i > 0 && dxs.charAt(i*5-1) != '-') {
throw new IllegalArgumentException("Incorrect value for dx key: " + dxs);
}
String script = dxs.substring(i*5, i*5+4);
// Special handling of zyyy
// "The code Zyyy (Common) can be specified to exclude all scripts"
Expand All @@ -142,6 +142,10 @@ private static UnicodeSet makeExcludedDictionaryBreakUnicodeSet(
builder.append("[:scx=").append(script).append(":]");
}
builder.append("]");
// The UnicodeSet constructor will catch malformed dx values below.
// For example, if the locale is "en-u-dx-abc-defgh", dxs is "abc-defgh"
// and builder.toString() returns "[[:scx=abc-:][:scx=efgh:]]", which causes
// the UnicodeSet constructor to throw an IllegalArgumentException.
return new UnicodeSet(builder.toString());
}

Expand Down

0 comments on commit 503df57

Please sign in to comment.