Skip to content

Commit

Permalink
ICU-22854 Implement subdivision validation
Browse files Browse the repository at this point in the history
  • Loading branch information
FrankYFTang committed Aug 22, 2024
1 parent aabadf7 commit 5e22f00
Show file tree
Hide file tree
Showing 11 changed files with 368 additions and 30 deletions.
83 changes: 73 additions & 10 deletions icu4c/source/common/loclikely.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,19 +391,30 @@ namespace {
icu::CharString
GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) {
icu::CharString result;

// First check for keyword value
icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status);
int32_t len = kw.length();
if (U_SUCCESS(status) && len >= 3 && len <= 7) {
// chop off the subdivision code (which will generally be "zzzz" anyway)
const char* const data = kw.data();
if (uprv_isASCIILetter(data[0])) {
result.append(uprv_toupper(data[0]), status);
result.append(uprv_toupper(data[1]), status);
} else {
// assume three-digit region code
result.append(data, 3, status);
// In UTS35
// type = alphanum{3,8} (sep alphanum{3,8})* ;
// so we know the subdivision must fit the type already.
//
// unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ;
// unicode_region_subtag = (alpha{2} | digit{3}) ;
// unicode_subdivision_suffix = alphanum{1,4} ;
// But we also know that no id starts with digit{3} in
// https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml
// Therefore we can simplify as
// unicode_subdivision_id = alpha{2} alphanum{1,4}
//
// and only need to accept/reject the code based on the alpha{2} and the length.
if (U_SUCCESS(status) && len >= 3 && len <= 6 &&
uprv_isASCIILetter(kw[0]) && uprv_isASCIILetter(kw[1])) {
// Additional Check
static icu::RegionValidateMap valid;
const char region[] = {kw[0], kw[1], '\0'};
if (valid.isSet(region)) {
result.append(uprv_toupper(kw[0]), status);
result.append(uprv_toupper(kw[1]), status);
}
}
return result;
Expand Down Expand Up @@ -436,3 +447,55 @@ ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,

return rgBuf;
}

namespace {

// Bitmap of the two-letter region codes accepted by RegionValidateMap.
// A code "XY" (either case) has index (X - 'A') * 26 + (Y - 'A'); the code
// is valid when bit (index % 32) of word (index / 32) is set.
//
// The table is generated, not hand-written: when it disagrees with the
// idValidity/subdivision/unknown entries of supplementalData, the unit test
// RegionTest::TestGetRegionForSupplementalDataMatch in
// test/intltest/regiontst.cpp fails and prints a replacement table in this
// same format.
const uint32_t gValidRegionMap[] = {
0xeedf597c, 0xdeddbdef, 0x15943f3f, 0x0e00d580,
0xb0095c00, 0x0015fb9f, 0x781c068d, 0x0340400f,
0xf42b1d00, 0xfd4f8141, 0x25d7fffc, 0x0100084b,
0x538f3c40, 0x40000001, 0xfdf15100, 0x9fbb7ae7,
0x0410419a, 0x00408557, 0x00004002, 0x00100001,
0x00400408, 0x00000001,
};

} // namespace
//
U_NAMESPACE_BEGIN
// Initializes the bitmap from the generated table gValidRegionMap.
RegionValidateMap::RegionValidateMap() {
    // The copy below is sized by the destination, so a regenerated table
    // with a different number of words would be over-read or only partly
    // copied. Reject such a mismatch at compile time.
    static_assert(sizeof(gValidRegionMap) == sizeof(map),
                  "gValidRegionMap must have exactly as many words as RegionValidateMap::map");
    uprv_memcpy(map, gValidRegionMap, sizeof(map));
}

// Nothing to release: the bitmap is stored inline in the object.
RegionValidateMap::~RegionValidateMap() = default;

// Reports whether `region` is marked as valid in the bitmap.
//
// @param region NUL-terminated region code; only two ASCII letters
//               (either case) can match.
// @return true if the corresponding bit is set; false if it is clear or if
//         `region` is not a two-letter code (value() returned -1).
bool RegionValidateMap::isSet(const char* region) const {
    const int32_t index = value(region);
    if (index < 0) {
        return false;
    }
    // Build the mask in the bitmap's own unsigned 32-bit type. A signed
    // `1L` would be shifted into the sign bit for bit 31 on platforms where
    // long is 32 bits wide.
    const uint32_t mask = static_cast<uint32_t>(1) << (index % 32);
    return (map[index / 32] & mask) != 0;
}

// Two maps are equal exactly when their bitmaps are byte-for-byte identical.
bool RegionValidateMap::equals(const RegionValidateMap& that) const {
    const auto difference = uprv_memcmp(map, that.map, sizeof(map));
    return difference == 0;
}

// Converts a two-letter region code into its bit index in the bitmap.
//
// The code must be exactly two ASCII letters (either case) followed by the
// terminating NUL. The result is (first - 'A') * 26 + (second - 'A') after
// upper-casing, i.e. a value from 0 to 675 (26 x 26 = 676 combinations).
// Any other input yields -1.
//
// Since 32 x 21 < 676 <= 32 x 22, the 676 bits are stored in an array of
// 22 uint32_t words.
int32_t RegionValidateMap::value(const char* region) const {
    const bool isTwoLetterCode = uprv_isASCIILetter(region[0]) &&
                                 uprv_isASCIILetter(region[1]) &&
                                 region[2] == '\0';
    if (!isTwoLetterCode) {
        return -1;
    }
    const int32_t first = uprv_toupper(region[0]) - 'A';
    const int32_t second = uprv_toupper(region[1]) - 'A';
    return first * 26 + second;
}

U_NAMESPACE_END
15 changes: 15 additions & 0 deletions icu4c/source/common/ulocimp.h
Original file line number Diff line number Diff line change
Expand Up @@ -425,4 +425,19 @@ ulocimp_getKnownCanonicalizedLocaleForTest(int32_t& length);
U_EXPORT bool
ulocimp_isCanonicalizedLocaleForTest(const char* localeName);

#ifdef __cplusplus
U_NAMESPACE_BEGIN
// Bitmap of two-letter region codes, one bit per letter pair (26 x 26 = 676
// bits). Used to accept or reject the region part of a subdivision value.
class U_COMMON_API RegionValidateMap : public UObject {
public:
// Fills the bitmap from the table compiled into loclikely.cpp.
RegionValidateMap();
virtual ~RegionValidateMap();
// Returns true if `region` is a two-letter ASCII code (either case) whose
// bit is set in the bitmap; returns false for any other input.
bool isSet(const char* region) const;
// Returns true if both objects hold identical bitmaps.
bool equals(const RegionValidateMap& that) const;
protected:
// Maps a two-letter ASCII code to its bit index (0..675), or returns -1 if
// `region` is not exactly two ASCII letters. Protected so that subclasses
// can address bits in `map` directly.
int32_t value(const char* region) const;
// 676 bits rounded up to whole 32-bit words: ceil(26 x 26 / 32) = 22.
uint32_t map[22]; // 26x26/32 = 22;
};
U_NAMESPACE_END
#endif /* __cplusplus */

#endif
69 changes: 65 additions & 4 deletions icu4c/source/test/cintltst/ccaltst.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ void TestUcalOpenBufferRead(void);
void TestGetTimeZoneOffsetFromLocal(void);

void TestFWWithISO8601(void);
void TestFWwithRGSD(void);

void addCalTest(TestNode** root);

Expand All @@ -71,6 +72,7 @@ void addCalTest(TestNode** root)
addTest(root, &TestUcalOpenBufferRead, "tsformat/ccaltst/TestUcalOpenBufferRead");
addTest(root, &TestGetTimeZoneOffsetFromLocal, "tsformat/ccaltst/TestGetTimeZoneOffsetFromLocal");
addTest(root, &TestFWWithISO8601, "tsformat/ccaltst/TestFWWithISO8601");
addTest(root, &TestFWwithRGSD, "tsformat/ccaltst/TestFWwithRGSD");
addTest(root, &TestGetIanaTimeZoneID, "tstformat/ccaltst/TestGetIanaTimeZoneID");
}

Expand Down Expand Up @@ -1616,7 +1618,7 @@ void TestGregorianChange(void) {
}

static void TestGetKeywordValuesForLocale(void) {
#define PREFERRED_SIZE 26
#define PREFERRED_SIZE 25
#define MAX_NUMBER_OF_KEYWORDS 5
const char *PREFERRED[PREFERRED_SIZE][MAX_NUMBER_OF_KEYWORDS+1] = {
{ "root", "gregorian", NULL, NULL, NULL, NULL },
Expand Down Expand Up @@ -1646,9 +1648,8 @@ static void TestGetKeywordValuesForLocale(void) {
{ "zh_TW@rg=IT53", "gregorian", NULL, NULL, NULL, NULL }, // two-digit subdivision code
{ "zh_TW@rg=AUnsw", "gregorian", NULL, NULL, NULL, NULL }, // three-letter subdivision code
{ "zh_TW@rg=EE130", "gregorian", NULL, NULL, NULL, NULL }, // three-digit subdivision code
{ "zh_TW@rg=417zzzz", "gregorian", NULL, NULL, NULL, NULL }, // three-digit region code
};
const int32_t EXPECTED_SIZE[PREFERRED_SIZE] = { 1, 1, 1, 1, 2, 2, 2, 5, 5, 2, 2, 2, 1, 3, 5, 4, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1 };
const int32_t EXPECTED_SIZE[PREFERRED_SIZE] = { 1, 1, 1, 1, 2, 2, 2, 5, 5, 2, 2, 2, 1, 3, 5, 4, 2, 3, 3, 1, 1, 1, 1, 1, 1 };
UErrorCode status = U_ZERO_ERROR;
int32_t i, size, j;
UEnumeration *all, *pref;
Expand Down Expand Up @@ -1688,7 +1689,7 @@ static void TestGetKeywordValuesForLocale(void) {
}

if (!matchPref) {
log_err("FAIL: Preferred values for locale \"%s\" does not match expected.\n", loc);
log_err("FAIL: Preferred values for locale (%d) \"%s\" does not match expected.\n", i, loc);
break;
}
uenum_close(pref);
Expand Down Expand Up @@ -2842,6 +2843,66 @@ TestFWWithISO8601(void) {
}
}

/*
 * For a set of locales with and without an "rg" (region override) keyword,
 * opens a default calendar and checks its first day of week and its minimal
 * days in the first week against the expected values listed below.
 */
void
TestFWwithRGSD(void) {
    typedef struct {
        const char* locale;
        int32_t firstDayOfWeek;
        int32_t minimalDays;
    } TestData;
    /* The table ends with an entry whose locale is NULL. */
    const TestData TESTDATA[] = {
        // No region subtag: the region comes from likely subtags.
        {"en", UCAL_SUNDAY, 1},

        // Region subtag "US" given explicitly.
        {"en-US", UCAL_SUNDAY, 1},

        // Region subtag "DE" given explicitly.
        {"en-DE", UCAL_MONDAY, 4},

        // Region subtag "DE" plus a valid region override selecting "US".
        {"en-DE-u-rg-uszzzz", UCAL_SUNDAY, 1},

        // Region subtag "DE"; the override must be ignored because "AA"
        // is not a valid region.
        {"en-DE-u-rg-aazzzz", UCAL_MONDAY, 4},

        // Region subtag "DE"; the override must be ignored because "001"
        // is a macroregion.
        {"en-DE-u-rg-001zzz", UCAL_MONDAY, 4},

        // No region subtag; the override must be ignored because "AA" is
        // not a valid region.
        {"en-u-rg-aazzzz", UCAL_SUNDAY, 1},

        // No region subtag; the override must be ignored because "001" is
        // a macroregion.
        {"en-u-rg-001zzz", UCAL_SUNDAY, 1},

        {NULL, 0, 0},
    };
    for (const TestData* entry = TESTDATA; entry->locale != NULL; entry++) {
        const int32_t index = (int32_t)(entry - TESTDATA);
        UErrorCode status = U_ZERO_ERROR;
        UCalendar* cal = ucal_open(NULL, 0, entry->locale, UCAL_DEFAULT, &status);
        if (U_FAILURE(status)) {
            /* Report the failure and move on to the next locale. */
            log_err("ucal_open failed: TESTDATA[%d].locale = '%s'\n", index, entry->locale);
        } else {
            int32_t actualFirstDay = ucal_getAttribute(cal, UCAL_FIRST_DAY_OF_WEEK);
            if (actualFirstDay != entry->firstDayOfWeek) {
                log_err("First day of week of '%s' is %d but expected to be %d\n", entry->locale,
                        actualFirstDay, entry->firstDayOfWeek);
            }
            int32_t actualMinimalDays = ucal_getAttribute(cal, UCAL_MINIMAL_DAYS_IN_FIRST_WEEK);
            if (actualMinimalDays != entry->minimalDays) {
                log_err("Minimal days of a week of '%s' is %d but expected to be %d\n", entry->locale,
                        actualMinimalDays, entry->minimalDays);
            }
            ucal_close(cal);
        }
    }
}

void
TestGetIanaTimeZoneID(void) {
const UChar* UNKNOWN = u"Etc/Unknown";
Expand Down
6 changes: 3 additions & 3 deletions icu4c/source/test/intltest/numbertest_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2988,15 +2988,15 @@ void NumberFormatterApiTest::unitLocaleTags() {
"fahrenheit", 0, "default", "fahrenheit", 0.0, u"0 degrees Fahrenheit"},

// Test the behaviour of the `rg` tag
{u"Test the locale with rg = UK and without usage", "en-US-u-rg-ukzzzz", "fahrenheit", 0,
{u"Test the locale with rg = GB and without usage", "en-US-u-rg-gbzzzz", "fahrenheit", 0,
nullptr, "fahrenheit", 0.0, u"0 degrees Fahrenheit"},
{u"Test the locale with rg = UK and with usage", "en-US-u-rg-ukzzzz", "fahrenheit", 0, "default",
{u"Test the locale with rg = GB and with usage", "en-US-u-rg-gbzzzz", "fahrenheit", 0, "default",
"celsius", -18, u"-18 degrees Celsius"},
{"Test the locale with mu = fahrenheit and without usage", "en-US-u-mu-fahrenheit", "celsius", 0,
nullptr, "celsius", 0.0, "0 degrees Celsius"},
{"Test the locale with mu = fahrenheit and with usage", "en-US-u-mu-fahrenheit", "celsius", 0,
"default", "fahrenheit", 32.0, "32 degrees Fahrenheit"},
{u"Test the locale with rg = UKOI and with usage", "en-US-u-rg-ukoi", "fahrenheit", 0,
{u"Test the locale with rg = GBOXF and with usage", "en-US-u-rg-gboxf", "fahrenheit", 0,
"default", "celsius", -18.0, u"-18 degrees Celsius"},

// Test the priorities
Expand Down
58 changes: 58 additions & 0 deletions icu4c/source/test/intltest/regiontst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
#if !UCONFIG_NO_FORMATTING

#include "unicode/region.h"
#include "unicode/ures.h"
#include "regiontst.h"
#include "ulocimp.h"

typedef struct KnownRegion {
const char *code;
Expand Down Expand Up @@ -359,6 +361,7 @@ RegionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char*
TESTCASE_AUTO(TestAvailableTerritories);
TESTCASE_AUTO(TestNoContainedRegions);
TESTCASE_AUTO(TestGroupingChildren);
TESTCASE_AUTO(TestGetRegionForSupplementalDataMatch);
TESTCASE_AUTO_END;
}

Expand Down Expand Up @@ -783,6 +786,61 @@ void RegionTest::TestGroupingChildren() {
}
}

// Test-only variant of RegionValidateMap that starts out empty and can have
// regions added one at a time, so a bitmap can be rebuilt from resource data
// and compared with the built-in one.
class MutableRegionValidateMap : public RegionValidateMap {
public:
    // The base constructor loads the built-in table; clear it so that only
    // regions passed to add() end up set.
    MutableRegionValidateMap() {
        uprv_memset(map, 0, sizeof(map));
    }
    virtual ~MutableRegionValidateMap() {}

    // Marks `region` as valid. Input that value() rejects (anything other
    // than two ASCII letters) is ignored.
    void add(const char* region) {
        const int32_t index = value(region);
        if (index >= 0) {
            // Use an unsigned 32-bit mask matching the element type of `map`
            // rather than a signed long, which would be shifted into its
            // sign bit for bit 31 where long is 32 bits wide.
            map[index / 32] |= static_cast<uint32_t>(1) << (index % 32);
        }
    }

    // Returns the raw bitmap words.
    // @param length if not null, receives the number of words in the bitmap.
    const uint32_t* data(int32_t* length) const {
        if (length != nullptr) {
            *length = static_cast<int32_t>(sizeof(map) / sizeof(map[0]));
        }
        return map;
    }
};

// Checks that the region bitmap compiled into RegionValidateMap matches the
// one derived from supplementalData (idValidity/subdivision/unknown). On a
// mismatch the test fails and prints a replacement gValidRegionMap table.
void RegionTest::TestGetRegionForSupplementalDataMatch(void) {
    RegionValidateMap builtin;
    MutableRegionValidateMap prefab;

    UErrorCode status = U_ZERO_ERROR;
    LocalUResourceBundlePointer supplementalData(ures_openDirect(nullptr,"supplementalData",&status));

    LocalUResourceBundlePointer idValidity(ures_getByKey(supplementalData.getAlias(),"idValidity",nullptr,&status));
    LocalUResourceBundlePointer subdivisions(ures_getByKey(idValidity.getAlias(),"subdivision",nullptr,&status));
    LocalUResourceBundlePointer unknown(ures_getByKey(subdivisions.getAlias(),"unknown",nullptr,&status));

    while (U_SUCCESS(status) && ures_hasNext(unknown.getAlias())) {
        UnicodeString subdivision = ures_getNextUnicodeString(unknown.getAlias(),nullptr,&status);
        if (U_SUCCESS(status)) {
            std::string str;
            subdivision.toUTF8String<std::string>(str);
            // Keep only the leading two characters, i.e. the region part of
            // the subdivision id.
            str.resize(2);
            prefab.add(str.c_str());
        }
    }
    if (!U_SUCCESS(status)) {
        // The resource data could not be read (completely), so `prefab` is
        // empty or partial. Comparing it would only produce a misleading
        // "differs" failure and a bogus replacement table.
        errln("Unable to read idValidity/subdivision/unknown from supplementalData (UErrorCode %d)",
              static_cast<int>(status));
        return;
    }
    if (!prefab.equals(builtin)) {
        // Print the table to paste into loclikely.cpp, four words per line.
        int32_t length;
        const uint32_t* data = prefab.data(&length);
        printf("const uint32_t gValidRegionMap[] = {");
        for (int32_t i = 0; i < length; i++) {
            if (i % 4 == 0) {
                printf("\n ");
            }
            printf("0x%08x, ", data[i]);
        }
        printf("\n};\n");
        errln("ulocimp_getRegionForSupplementalData() differs from supplementalData");
    }
}

#endif /* #if !UCONFIG_NO_FORMATTING */

//eof
1 change: 1 addition & 0 deletions icu4c/source/test/intltest/regiontst.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class RegionTest: public IntlTest {
void TestAvailableTerritories();
void TestNoContainedRegions();
void TestGroupingChildren();
void TestGetRegionForSupplementalDataMatch();

private:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -511,9 +511,9 @@ public void unitWithLocaleTags() {
{"Test the locale with ms = Matric (wrong spelling) and with usage", "en-US-u-ms-Matric", "fahrenheit", "0", "default", "fahrenheit", "0.0", "0 degrees Fahrenheit"},

// Test the behaviour of the `rg` tag
{"Test the locale with rg = UK and without usage", "en-US-u-rg-ukzzzz", "fahrenheit", "0", null, "fahrenheit", "0.0", "0 degrees Fahrenheit"},
{"Test the locale with rg = UK and with usage", "en-US-u-rg-ukzzzz", "fahrenheit", "0", "default", "celsius", "-18", "-18 degrees Celsius"},
{"Test the locale with rg = UKOI and with usage", "en-US-u-rg-ukoi", "fahrenheit", "0", "default", "celsius", "-18" , "-18 degrees Celsius"},
{"Test the locale with rg = GB and without usage", "en-US-u-rg-gbzzzz", "fahrenheit", "0", null, "fahrenheit", "0.0", "0 degrees Fahrenheit"},
{"Test the locale with rg = GB and with usage", "en-US-u-rg-gbzzzz", "fahrenheit", "0", "default", "celsius", "-18", "-18 degrees Celsius"},
{"Test the locale with rg = GBOXF and with usage", "en-US-u-rg-gboxf", "fahrenheit", "0", "default", "celsius", "-18" , "-18 degrees Celsius"},

// Test the priorities
{"Test the locale with mu,ms,rg --> mu tag wins", "en-US-u-mu-celsius-ms-ussystem-rg-uszzzz", "celsius", "0", "default", "celsius", "0.0", "0 degrees Celsius"},
Expand Down
Loading

0 comments on commit 5e22f00

Please sign in to comment.