Skip to content

Commit

Permalink
ICU-22503 add property Indic_Conjunct_Break
Browse files Browse the repository at this point in the history
  • Loading branch information
echeran committed Jul 26, 2024
1 parent 23d9628 commit f02235a
Show file tree
Hide file tree
Showing 22 changed files with 3,400 additions and 3,270 deletions.
2 changes: 1 addition & 1 deletion .bazeliskrc
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# for running Bazel commands while ensuring, through configuration, that only a
# specific version of Bazel is executed.

USE_BAZEL_VERSION=7.1.1
USE_BAZEL_VERSION=7.2.1
2,047 changes: 1,026 additions & 1,021 deletions icu4c/source/common/propname_data.h

Large diffs are not rendered by default.

4,465 changes: 2,234 additions & 2,231 deletions icu4c/source/common/uchar_props_data.h

Large diffs are not rendered by default.

33 changes: 32 additions & 1 deletion icu4c/source/common/unicode/uchar.h
Original file line number Diff line number Diff line change
Expand Up @@ -677,13 +677,19 @@ typedef enum UProperty {
* @draft ICU 75
*/
UCHAR_IDENTIFIER_STATUS=0x1019,
/**
* Enumerated property Indic_Conjunct_Break.
* Used in the grapheme cluster break algorithm in UAX #29.
* @draft ICU 76
*/
UCHAR_INDIC_CONJUNCT_BREAK=0x101A,
#endif // U_HIDE_DRAFT_API
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the last constant for enumerated/integer Unicode properties.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UCHAR_INT_LIMIT=0x101A,
UCHAR_INT_LIMIT=0x101B,
#endif // U_HIDE_DEPRECATED_API

/** Bitmask property General_Category_Mask.
Expand Down Expand Up @@ -2729,6 +2735,31 @@ typedef enum UIndicSyllabicCategory {
U_INSC_REORDERING_KILLER,
} UIndicSyllabicCategory;

#ifndef U_HIDE_DRAFT_API
/**
* Indic Conjunct Break constants.
* These are the values returned for the enumerated property
* UCHAR_INDIC_CONJUNCT_BREAK; each constant is named after the
* corresponding Unicode Indic_Conjunct_Break value (None, Consonant, Extend, Linker).
*
* @see UCHAR_INDIC_CONJUNCT_BREAK
* @draft ICU 76
*/
typedef enum UIndicConjunctBreak {
/*
* Note: UIndicConjunctBreak constants are parsed by preparseucd.py.
* It matches lines like
* U_INCB_<Unicode Indic_Conjunct_Break value name>
*
* Keep one constant per line so that this line-based matching keeps working.
* No explicit values are assigned, so the constants number from 0 in
* declaration order. The property is stored in a 2-bit field
* (UPROPS_INCB_MASK / UPROPS_INCB_SHIFT in uprops.h), which the four
* constants below fill completely.
*/

/** Indic_Conjunct_Break=None. @draft ICU 76 */
U_INCB_NONE,
/** Indic_Conjunct_Break=Consonant. @draft ICU 76 */
U_INCB_CONSONANT,
/** Indic_Conjunct_Break=Extend. @draft ICU 76 */
U_INCB_EXTEND,
/** Indic_Conjunct_Break=Linker. @draft ICU 76 */
U_INCB_LINKER,
} UIndicConjunctBreak;
#endif // U_HIDE_DRAFT_API

/**
* Vertical Orientation constants.
*
Expand Down
1 change: 1 addition & 0 deletions icu4c/source/common/uprops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,7 @@ static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
{ UPROPS_SRC_INSC, 0, 0, getInSC, layoutGetMaxValue },
{ UPROPS_SRC_VO, 0, 0, getVo, layoutGetMaxValue },
{ UPROPS_SRC_PROPSVEC, 0, static_cast<int32_t>(U_ID_STATUS_ALLOWED), getIDStatusValue, getMaxValueFromShift },
{ 0, UPROPS_INCB_MASK, UPROPS_INCB_SHIFT,defaultGetValue, defaultGetMaxValue },
};

U_CAPI int32_t U_EXPORT2
Expand Down
6 changes: 5 additions & 1 deletion icu4c/source/common/uprops.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ namespace {
// Bits
// 31..26 Age major version (major=0..63)
// 25..24 Age minor version (minor=0..3)
// 23..15 reserved
// 23..17 reserved
// 16..15 Indic Conjunct Break
// 14..12 East Asian Width
// 11..10 3..1: Bits 9..0 = Script_Extensions index
// 3: Script value from Script_Extensions
Expand Down Expand Up @@ -158,6 +159,9 @@ inline constexpr uint8_t UPROPS_AGE_MINOR_MAX = 3;
inline constexpr uint32_t UPROPS_EA_MASK = 0x00007000;
inline constexpr int32_t UPROPS_EA_SHIFT = 12;

inline constexpr uint32_t UPROPS_INCB_MASK = 0x00018000;
inline constexpr int32_t UPROPS_INCB_SHIFT = 15;

/** Script_Extensions: mask includes Script */
inline constexpr uint32_t UPROPS_SCRIPT_X_MASK = 0x00000fff;

Expand Down
Binary file modified icu4c/source/data/in/pnames.icu
Binary file not shown.
Binary file modified icu4c/source/data/in/uprops.icu
Binary file not shown.
5 changes: 3 additions & 2 deletions icu4c/source/data/unidata/changes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ export UNICODE_TOOLS=~/oss/unicodetools/mine/src
so that the makefiles see the new version number.
cd $ICU_OUT/icu4c
ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data CXXFLAGS="-DU_USING_ICU_NAMESPACE=0 -Wimplicit-fallthrough" CPPFLAGS="-DU_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -fsanitize=bounds" LDFLAGS=-fsanitize=bounds ../../src/icu4c/source/runConfigureICU --enable-debug --disable-release Linux/clang --prefix=/usr/local/google/home/mscherer/icu/mine/inst/icu4c > config.out 2>&1 ; tail config.out
+ Elango's version (different default C++ compiler & in-source build paths):
cd $ICU_OUT/icu4c/source
ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data CXXFLAGS="-DU_USING_ICU_NAMESPACE=0 -Wimplicit-fallthrough" CPPFLAGS="-DU_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -fsanitize=bounds" LDFLAGS=-fsanitize=bounds ./runConfigureICU --enable-debug --disable-release Linux/gcc --prefix=/usr/local/google/home/elango/oss/icu/icu4c > config.out 2>&1 ; tail config.out

*** data files & enums & parser code

Expand Down Expand Up @@ -360,8 +363,6 @@ copying that version number into the $ICU_SRC/.bazeliskrc config file.

* run & fix ICU4J tests

TODO

*** API additions
- send notice to icu-design about new born-@stable API (enum constants etc.)

Expand Down
8 changes: 8 additions & 0 deletions icu4c/source/test/cintltst/cucdtst.c
Original file line number Diff line number Diff line change
Expand Up @@ -2802,6 +2802,14 @@ TestAdditionalProperties(void) {
{ 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, false },
{ 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, true },

/* Indic_Conjunct_Break values */
{ 0x094D, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_LINKER },
{ 0x09B9, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_CONSONANT },
{ 0x05BE, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
{ 0x05BF, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_EXTEND },
{ 0x05C0, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
{ 0xD800, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },

/* undefined UProperty values */
{ 0x61, 0x4a7, 0 },
{ 0x234bc, 0x15ed, 0 }
Expand Down
9 changes: 8 additions & 1 deletion icu4c/source/test/intltest/ucdtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
#include "testutil.h"
#include "uparse.h"
#include "ucdtest.h"
#include "usettest.h"

#include <iostream>

static const char *ignorePropNames[]={
"FC_NFKC",
Expand Down Expand Up @@ -1092,6 +1095,10 @@ void UnicodeTest::TestPropertiesUsingPpucd() {
{ UCHAR_NFC_QUICK_CHECK, UNORM_MAYBE },
{ UCHAR_NFKC_QUICK_CHECK, UNORM_MAYBE },
#endif // !UCONFIG_NO_NORMALIZATION
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_CONSONANT },
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_EXTEND },
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_LINKER },
};

// Iterate through PPUCD file, accumulating each line's data into each UnicodeSet per property
Expand Down Expand Up @@ -1133,7 +1140,7 @@ void UnicodeTest::TestPropertiesUsingPpucd() {
if (!tp.isBinary()) {
msg = msg + "=" + u_getPropertyValueName(tp.prop, tp.value, U_LONG_PROPERTY_NAME);
}
assertTrue(msg.c_str(), tp.set == icuPropSet);
UnicodeSetTest::checkEqual(*this, tp.set, icuPropSet, msg.c_str());
}
}

Expand Down
22 changes: 14 additions & 8 deletions icu4c/source/test/intltest/usettest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2114,20 +2114,26 @@ void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool
}

/**
 * Compares two sets and reports the outcome through this test object.
 *
 * This member overload only forwards to the static
 * checkEqual(IntlTest&, const UnicodeSet&, const UnicodeSet&, const char*)
 * overload, passing *this as the reporting test. The range-count and size
 * assertions live in that overload, so they are not repeated here; repeating
 * them would report every mismatch twice.
 *
 * @param s       the first set (logged as "source")
 * @param t       the second set (logged as "result")
 * @param message text included in the assertion and log messages
 * @return the result of the static overload: false if s != t, otherwise true
 */
UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
    return checkEqual(*this, s, t, message);
}

UBool UnicodeSetTest::checkEqual(
IntlTest& intlTest,
const UnicodeSet& s, const UnicodeSet& t, const char* message) {
intlTest.assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
intlTest.assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
UnicodeString source; s.toPattern(source, true);
UnicodeString result; t.toPattern(result, true);
if (s != t) {
errln(UnicodeString("FAIL: ") + message
+ "; source = " + source
+ "; result = " + result
intlTest.errln((UnicodeString)"FAIL: " + message
+ "\nsource = " + source
+ "\nresult = " + result
);
return false;
} else {
logln(UnicodeString("Ok: ") + message
+ "; source = " + source
+ "; result = " + result
intlTest.logln((UnicodeString)"Ok: " + message
+ "\nsource = " + source
+ "\nresult = " + result
);
}
return true;
Expand Down
2 changes: 2 additions & 0 deletions icu4c/source/test/intltest/usettest.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ class UnicodeSetTest: public IntlTest {
UnicodeSetTest();
~UnicodeSetTest();

static UBool checkEqual(IntlTest& intlTest, const UnicodeSet& s, const UnicodeSet& t, const char* message);

private:
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=nullptr) override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,7 @@ int getMaxValue(int which) {
return IdentifierStatus.ALLOWED.ordinal();
}
},
new IntProperty(0, INCB_MASK, INCB_SHIFT), // INDIC_CONJUNCT_BREAK
};

public int getIntPropertyValue(int c, int which) {
Expand Down Expand Up @@ -1378,7 +1379,8 @@ private static final int ntvGetType(int ntv) {
// Bits
// 31..26 Age major version (major=0..63)
// 25..24 Age minor version (minor=0..3)
// 23..15 reserved
// 23..17 reserved
// 16..15 Indic Conjunct Break
// 14..12 East Asian Width
// 11..10 3..1: Bits 9..0 = Script_Extensions index
// 3: Script value from Script_Extensions
Expand All @@ -1390,6 +1392,9 @@ private static final int ntvGetType(int ntv) {
private static final int EAST_ASIAN_MASK_ = 0x00007000;
private static final int EAST_ASIAN_SHIFT_ = 12;

private static final int INCB_MASK = 0x00018000;
private static final int INCB_SHIFT = 15;

/** Script_Extensions: mask includes Script */
public static final int SCRIPT_X_MASK = 0x00000fff;

Expand Down
18 changes: 18 additions & 0 deletions icu4j/main/core/src/main/java/com/ibm/icu/lang/UCharacter.java
Original file line number Diff line number Diff line change
Expand Up @@ -4124,6 +4124,24 @@ public static interface IndicSyllabicCategory {
public static final int REORDERING_KILLER = 36;
}

/**
* Indic Conjunct Break constants.
* These are the values of the enumerated property Indic_Conjunct_Break.
* See https://unicode.org/reports/tr44/#Indic_Conjunct_Break
*
* @see UProperty#INDIC_CONJUNCT_BREAK
* @draft ICU 76
*/
public enum IndicConjunctBreak {
// Declaration order is significant: the unit tests compare the integer value
// of UProperty.INDIC_CONJUNCT_BREAK with ordinal() of these constants,
// so new constants must not be inserted before or between existing ones.
/** Indic_Conjunct_Break=None. @draft ICU 76 */
NONE,
/** Indic_Conjunct_Break=Consonant. @draft ICU 76 */
CONSONANT,
/** Indic_Conjunct_Break=Extend. @draft ICU 76 */
EXTEND,
/** Indic_Conjunct_Break=Linker. @draft ICU 76 */
LINKER,
}

/**
* Vertical Orientation constants.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -859,12 +859,19 @@ public interface UProperty
*/
public static final int IDENTIFIER_STATUS = 0x1019;

/**
* Enumerated property Indic_Conjunct_Break.
* Used in the grapheme cluster break algorithm in UAX #29.
* @draft ICU 76
*/
public static final int INDIC_CONJUNCT_BREAK = 0x101A;

/**
* One more than the last constant for enumerated/integer Unicode properties.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
@Deprecated
public static final int INT_LIMIT = 0x101A;
public static final int INT_LIMIT = 0x101B;

/**
* Bitmask property General_Category_Mask.
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -2212,6 +2212,14 @@ public void TestAdditionalProperties()
{ 0x0606, UProperty.PREPENDED_CONCATENATION_MARK, FALSE },
{ 0x110BD, UProperty.PREPENDED_CONCATENATION_MARK, TRUE },

/* Indic_Conjunct_Break values */
{ 0x094D, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.LINKER.ordinal() },
{ 0x09B9, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.CONSONANT.ordinal() },
{ 0x05BE, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },
{ 0x05BF, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.EXTEND.ordinal() },
{ 0x05C0, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },
{ 0xD800, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },

/* undefined UProperty values */
{ 0x61, 0x4a7, 0 },
{ 0x234bc, 0x15ed, 0 }
Expand Down
5 changes: 5 additions & 0 deletions tools/unicode/c/genprops/corepropsbuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,10 @@ although the trie can hold 16-bit values.
Props vector 0 bits shuffled so that script and script extensions bits are contiguous.
Used 2 bits from props vector 0 to add Indic_Conjunct_Break. The bits used were freed up
by the preceding move of the Block property out of props vector 0 and the bit shuffling
("defragmentation") of Script and Script_Extensions.
----------------------------------------------------------------------------- */

U_NAMESPACE_USE
Expand Down Expand Up @@ -712,6 +716,7 @@ struct PropToEnum {
const PropToEnum
propToEnums[]={
{ UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
{ UCHAR_INDIC_CONJUNCT_BREAK, 0, UPROPS_INCB_SHIFT, UPROPS_INCB_MASK },
{ UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
{ UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
{ UCHAR_WORD_BREAK, 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
Expand Down
10 changes: 9 additions & 1 deletion tools/unicode/c/genprops/pnames_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -1186,6 +1186,13 @@ static const Value VALUES_ID_Status[2] = {
Value(U_ID_STATUS_ALLOWED, "Allowed Allowed"),
};

// Value names for the Indic_Conjunct_Break (InCB) property, one entry per
// UIndicConjunctBreak constant.
// Each string appears to hold the value's aliases separated by a space
// (here the same name twice) -- TODO confirm against the Value constructor.
// The element count (4) must stay equal to the count passed alongside
// VALUES_InCB in the UCHAR_INDIC_CONJUNCT_BREAK entry of PROPERTIES.
static const Value VALUES_InCB[4] = {
Value(U_INCB_NONE, "None None"),
Value(U_INCB_CONSONANT, "Consonant Consonant"),
Value(U_INCB_EXTEND, "Extend Extend"),
Value(U_INCB_LINKER, "Linker Linker"),
};

static const Value VALUES_gcm[38] = {
Value((int32_t)U_GC_C_MASK, "C Other"),
Value((int32_t)U_GC_CC_MASK, "Cc Control cntrl"),
Expand Down Expand Up @@ -1242,7 +1249,7 @@ static const Value VALUES_ID_Type[12] = {
Value(U_ID_TYPE_RECOMMENDED, "Recommended Recommended"),
};

static const Property PROPERTIES[119] = {
static const Property PROPERTIES[120] = {
Property(UCHAR_ALPHABETIC, "Alpha Alphabetic"),
Property(UCHAR_ASCII_HEX_DIGIT, "AHex ASCII_Hex_Digit"),
Property(UCHAR_BIDI_CONTROL, "Bidi_C Bidi_Control"),
Expand Down Expand Up @@ -1344,6 +1351,7 @@ static const Property PROPERTIES[119] = {
Property(UCHAR_INDIC_SYLLABIC_CATEGORY, "InSC Indic_Syllabic_Category", VALUES_InSC, 37),
Property(UCHAR_VERTICAL_ORIENTATION, "vo Vertical_Orientation", VALUES_vo, 4),
Property(UCHAR_IDENTIFIER_STATUS, "ID_Status Identifier_Status", VALUES_ID_Status, 2),
Property(UCHAR_INDIC_CONJUNCT_BREAK, "InCB Indic_Conjunct_Break", VALUES_InCB, 4),
Property(UCHAR_GENERAL_CATEGORY_MASK, "gcm General_Category_Mask", VALUES_gcm, 38),
Property(UCHAR_NUMERIC_VALUE, "nv Numeric_Value"),
Property(UCHAR_AGE, "age Age"),
Expand Down
13 changes: 12 additions & 1 deletion tools/unicode/py/preparseucd.py
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,17 @@ def ParsePropertyAliases(in_file):
AddEnumeratedValue(prop, "Limited_Use")
AddEnumeratedValue(prop, "Inclusion")
AddEnumeratedValue(prop, "Recommended")
# Indic_Conjunct_Break. See UAX #29 and
# https://www.unicode.org/reports/tr44/tr44-33.html#Indic_Conjunct_Break
name = "InCB"
prop = ("Enumerated", ["InCB", "Indic_Conjunct_Break"], set(), {})
_properties[name] = prop
_properties[NormPropName(name)] = prop
SetDefaultValue(name, "None")
AddEnumeratedValue(prop, "None")
AddEnumeratedValue(prop, "Consonant")
AddEnumeratedValue(prop, "Extend")
AddEnumeratedValue(prop, "Linker")


def ParsePropertyValueAliases(in_file):
Expand Down Expand Up @@ -2012,7 +2023,7 @@ def PrintNameStats():
# Matches one enum-constant line of a C header.
# Sample line to match:
#    U_EA_AMBIGUOUS,
# group(1) is the whole constant name, group(2) the property prefix
# (for example EA), and group(3) the value part (for example AMBIGUOUS).
# INCB is listed so that the UIndicConjunctBreak constants (U_INCB_*) are matched.
_prop_and_value_re = re.compile(
    " *(U_(BPT|DT|EA|GCB|HST|ID_STATUS|ID_TYPE|INCB|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")

# Sample line to match if it has matched _prop_and_value_re
# (we want to exclude aliases):
Expand Down

0 comments on commit f02235a

Please sign in to comment.