Skip to content

Commit

Permalink
WIP gennorm2 fv5 with maybeNo data
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Apr 3, 2024
1 parent 8d513e2 commit 86e9ba4
Show file tree
Hide file tree
Showing 10 changed files with 897 additions and 793 deletions.
2 changes: 1 addition & 1 deletion icu4c/source/common/loadednormalizer2impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ LoadedNormalizer2Impl::isAcceptable(void * /*context*/,
pInfo->dataFormat[1]==0x72 &&
pInfo->dataFormat[2]==0x6d &&
pInfo->dataFormat[3]==0x32 &&
pInfo->formatVersion[0]==4
pInfo->formatVersion[0]==5
) {
// Normalizer2Impl *me=(Normalizer2Impl *)context;
// uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
Expand Down
1,484 changes: 742 additions & 742 deletions icu4c/source/common/norm2_nfc_data.h

Large diffs are not rendered by default.

14 changes: 9 additions & 5 deletions icu4c/source/common/normalizer2impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -440,14 +440,16 @@ Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie,
minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]);
limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]);
minMaybeNo = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_NO]);
minMaybeNoCombinesFwd = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_NO_COMBINES_FWD]);
minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]);
U_ASSERT((minMaybeYes & 7) == 0); // 8-aligned for noNoDelta bit fields
centerNoNoDelta = (minMaybeYes >> DELTA_SHIFT) - MAX_DELTA - 1;
U_ASSERT((minMaybeNo & 7) == 0); // 8-aligned for noNoDelta bit fields
centerNoNoDelta = (minMaybeNo >> DELTA_SHIFT) - MAX_DELTA - 1;

normTrie=inTrie;

maybeYesCompositions=inExtraData;
extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeNo)>>OFFSET_SHIFT);

smallFCD=inSmallFCD;
}
Expand Down Expand Up @@ -2728,7 +2730,7 @@ unorm2_swap(const UDataSwapper *ds,
pInfo->dataFormat[1]==0x72 &&
pInfo->dataFormat[2]==0x6d &&
pInfo->dataFormat[3]==0x32 &&
(1<=formatVersion0 && formatVersion0<=4)
(1<=formatVersion0 && formatVersion0<=5)
)) {
udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
Expand All @@ -2747,8 +2749,10 @@ unorm2_swap(const UDataSwapper *ds,
minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1;
} else if(formatVersion0==2) {
minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1;
} else {
} else if(formatVersion0<=4) {
minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1;
} else {
minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD+1;
}

if(length>=0) {
Expand Down
32 changes: 26 additions & 6 deletions icu4c/source/common/normalizer2impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,13 @@ class U_COMMON_API Normalizer2Impl : public UObject {

IX_MIN_LCCC_CP,
IX_RESERVED19,
IX_COUNT

/** Two-way mappings; each starts with a character that combines backward. */
IX_MIN_MAYBE_NO, // 20
/** Two-way mappings & compositions. */
IX_MIN_MAYBE_NO_COMBINES_FWD,

IX_COUNT // 22
};

enum {
Expand Down Expand Up @@ -695,8 +701,10 @@ class U_COMMON_API Normalizer2Impl : public UObject {
(norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : *getMapping(norm16) <= 0x1ff);
}

const char16_t *findPreviousCompBoundary(const char16_t *start, const char16_t *p, UBool onlyContiguous) const;
const char16_t *findNextCompBoundary(const char16_t *p, const char16_t *limit, UBool onlyContiguous) const;
const char16_t *findPreviousCompBoundary(const char16_t *start, const char16_t *p,
UBool onlyContiguous) const;
const char16_t *findNextCompBoundary(const char16_t *p, const char16_t *limit,
UBool onlyContiguous) const;

const char16_t *findPreviousFCDBoundary(const char16_t *start, const char16_t *p) const;
const char16_t *findNextFCDBoundary(const char16_t *p, const char16_t *limit) const;
Expand All @@ -723,10 +731,12 @@ class U_COMMON_API Normalizer2Impl : public UObject {
uint16_t minNoNoEmpty;
uint16_t limitNoNo;
uint16_t centerNoNoDelta;
uint16_t minMaybeNo;
uint16_t minMaybeNoCombinesFwd;
uint16_t minMaybeYes;

const UCPTrie *normTrie;
const uint16_t *maybeYesCompositions;
const uint16_t *maybeYesCompositions; // TODO: maybeData
const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0

Expand Down Expand Up @@ -785,7 +795,7 @@ unorm_getFCD16(UChar32 c);

/**
* Format of Normalizer2 .nrm data files.
* Format version 4.0.
* Format version 5.0.
*
* Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
* ICU ships with data files for standard Unicode Normalization Forms
Expand Down Expand Up @@ -829,14 +839,18 @@ unorm_getFCD16(UChar32 c);
*
* The next eight indexes are thresholds of 16-bit trie values for ranges of
* values indicating multiple normalization properties.
* They are listed here in threshold order, not in the order they are stored in the indexes.
* Format version 5 adds the two minMaybeNo* threshold indexes.
* The thresholds are listed here in threshold order,
* not in the order they are stored in the indexes.
* minYesNo=indexes[IX_MIN_YES_NO];
* minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
* minNoNo=indexes[IX_MIN_NO_NO];
* minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
* minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
* minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];
* limitNoNo=indexes[IX_LIMIT_NO_NO];
* minMaybeNo=indexes[IX_MIN_MAYBE_NO];
* minMaybeNoCombinesFwd=indexes[IX_MIN_MAYBE_NO_COMBINES_FWD];
* minMaybeYes=indexes[IX_MIN_MAYBE_YES];
* See the normTrie description below and the design doc for details.
*
Expand Down Expand Up @@ -870,13 +884,15 @@ unorm_getFCD16(UChar32 c);
* When the lead surrogate unit's value exceeds the quick check minimum during processing,
* the properties for the full supplementary code point need to be looked up.
*
* TODO
* uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes];
* uint16_t extraData[];
*
* There is only one byte offset for the end of these two arrays.
* The split between them is given by the constant and variable mentioned above.
* In version 3, the difference must be shifted right by OFFSET_SHIFT.
*
* TODO
* The maybeYesCompositions array contains compositions lists for characters that
* combine both forward (as starters in composition pairs)
* and backward (as trailing characters in composition pairs).
Expand Down Expand Up @@ -981,6 +997,10 @@ unorm_getFCD16(UChar32 c);
* gennorm2 now has to reject mappings for surrogate code points.
* UTS #46 maps unpaired surrogates to U+FFFD in code rather than via its
* custom normalization data file.
*
* Changes from format version 4 to format version 5 (ICU 76) ------------------
*
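* Format version 5 adds two threshold indexes for characters with two-way mappings:
* indexes[IX_MIN_MAYBE_NO] (minMaybeNo) for two-way mappings,
* each of which starts with a character that combines backward, and
* indexes[IX_MIN_MAYBE_NO_COMBINES_FWD] (minMaybeNoCombinesFwd)
* for two-way mappings & compositions.
* The noNoDelta centering and the split between maybeYesCompositions and extraData
* are now computed from minMaybeNo rather than from minMaybeYes.
* unorm2_swap() requires at least IX_MIN_MAYBE_NO_COMBINES_FWD+1 indexes for version 5 data.
*
* TODO: Document the layout of the new maybeNo mappings and compositions data.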
*/

#endif /* !UCONFIG_NO_NORMALIZATION */
Expand Down
13 changes: 12 additions & 1 deletion icu4c/source/tools/gennorm2/extradata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &da
const CompositionPair &pair=pairs[i];
// 22 bits for the composite character and whether it combines forward.
UChar32 compositeAndFwd=pair.composite<<1;
if(norms.getNormRef(pair.composite).compositions!=nullptr) {
if(norms.getNormRef(pair.composite).combinesFwd()) {
compositeAndFwd|=1; // The composite character also combines-forward.
}
// Encode most pairs in two units and some in three.
Expand Down Expand Up @@ -231,6 +231,17 @@ void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
// if they have different raw mappings.
norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty);
break;
case Norm::MAYBE_NO_MAPPING_ONLY:
printf("*** U+%04lX MAYBE_NO_MAPPING_ONLY\n", (long)c); // TODO
norm.offset=maybeNoMappingsOnly.length()+
writeMapping(c, norm, maybeNoMappingsOnly);
break;
case Norm::MAYBE_NO_COMBINES_FWD:
printf("*** U+%04lX MAYBE_NO_COMBINES_FWD\n", (long)c); // TODO
norm.offset=maybeNoMappingsAndCompositions.length()+
writeMapping(c, norm, maybeNoMappingsAndCompositions);
writeCompositions(c, norm, maybeNoMappingsAndCompositions);
break;
case Norm::MAYBE_YES_COMBINES_FWD:
norm.offset=maybeYesCompositions.length();
writeCompositions(c, norm, maybeYesCompositions);
Expand Down
8 changes: 6 additions & 2 deletions icu4c/source/tools/gennorm2/extradata.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class ExtraData : public Norms::Enumerator {

void rangeHandler(UChar32 start, UChar32 end, Norm &norm) override;

UnicodeString maybeNoMappingsOnly;
UnicodeString maybeNoMappingsAndCompositions;
UnicodeString maybeYesCompositions;
UnicodeString yesYesCompositions;
UnicodeString yesNoMappingsAndCompositions;
Expand All @@ -44,15 +46,17 @@ class ExtraData : public Norms::Enumerator {
private:
/**
* Requires norm.hasMapping().
* Returns the offset of the "first unit" from the beginning of the extraData for c.
* Returns the offset of the "first unit" from the beginning of the extraData for c,
* not from the beginning of the dataString.
* That is the same as the length of the optional data
* for the raw mapping and the ccc/lccc word.
*/
int32_t writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString);
/** Returns the full offset into the dataString of the "first unit" for c. */
int32_t writeNoNoMapping(UChar32 c, const Norm &norm,
UnicodeString &dataString, Hashtable &previousMappings);
UBool setNoNoDelta(UChar32 c, Norm &norm) const;
/** Requires norm.compositions!=nullptr. */
/** Requires norm.combinesFwd(). */
void writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString);
void writeExtraData(UChar32 c, Norm &norm);

Expand Down
Loading

0 comments on commit 86e9ba4

Please sign in to comment.