Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate old monkeys #979

Merged
merged 3 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public abstract class GenerateBreakTest implements UCD_Types {
Normalizer nfd;
Normalizer nfkd;

Segmenter segmenter;
UnicodeMap<String> partition;
UnicodeProperty prop;

Expand Down Expand Up @@ -322,6 +323,7 @@ public void run() throws IOException {

boolean forCLDR = seg.target == Segmenter.Target.FOR_CLDR;
String path = "UCD/" + ucd.getVersion() + '/' + (forCLDR ? "cldr/" : "auxiliary/");
String extraPath = "UCD/" + ucd.getVersion() + "/extra/";
String outFilename = fileName + "BreakTest";
if (forCLDR) {
outFilename = outFilename + "-cldr";
Expand Down Expand Up @@ -477,6 +479,37 @@ value, new ParsePosition(0), IUP.getXSymbolTable()))) {
fc.close();

generateTest(false, path, outFilename, propertyName);
generateCppOldMonkeys(extraPath, outFilename);
}

/**
 * Writes a {@code .cpp} file containing the partition and rule definitions of {@link
 * #segmenter} as C++ statements, preceded by instructions for pasting them into the matching
 * monkey test in ICU4C's rbbitst.cpp.
 *
 * @param path directory passed to {@code UnicodeDataFile.openAndWriteHeader}
 * @param outFilename base name of the output file; {@code ".cpp"} is appended. Its first four
 *     characters select the monkey class named in the instructions.
 * @throws IOException if the output file cannot be opened or written
 */
private void generateCppOldMonkeys(String path, String outFilename) throws IOException {
    final UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(path, outFilename + ".cpp");
    try {
        final PrintWriter out = fc.out;
        // The monkey classes are named after a four-letter prefix of the test name (Word,
        // Sent, Line), except for grapheme clusters, whose monkey is RBBICharMonkey. The
        // prefix is only four characters long, so the text to substitute is "Grap": a search
        // for "Graph" could never match. Both names are four letters, which keeps the
        // instruction line the same width as "Meow".
        final String monkeyName =
                outFilename
                        .substring(0, Math.min(4, outFilename.length()))
                        .replace("Grap", "Char");
        out.println();
        out.println("####### Instructions ##################################");
        out.println("# Copy the following lines into rbbitst.cpp in ICU4C, #");
        out.println(
                "# in the constructor of RBBIMeowMonkey, replacing the #"
                        .replace("Meow", monkeyName));
        out.println("# existing block of generated code. #");
        out.println("#######################################################");
        out.println();
        out.println(" // --- NOLI ME TANGERE ---");
        out.println(" // Generated by GenerateBreakTest.java in the Unicode tools.");
        // One emplace_back per partition element: its name and its UnicodeSet definition.
        for (Segmenter.Builder.NamedRefinedSet part : segmenter.getPartitionDefinition()) {
            out.println(
                    " partition.emplace_back(\""
                            + part.getName()
                            + "\", UnicodeSet(uR\"("
                            + part.getDefinition()
                            + ")\", status));");
        }
        out.println();
        // One push_back per segmentation rule, in the order given by the segmenter.
        for (Segmenter.SegmentationRule rule : segmenter.getRules()) {
            out.println(" rules.push_back(" + rule.toCppOldMonkeyString() + ");");
        }
        out.println(" // --- End of generated code. ---");
    } finally {
        // Close the file even if one of the writes above fails.
        fc.close();
    }
}

private void generateTest(
Expand Down Expand Up @@ -1091,6 +1124,7 @@ public XGenerateBreakTest(
}
variables = segBuilder.getVariables();
collectingRules = false;
segmenter = seg;
partition = seg.getSamples();
fileName = filename;
propertyName = (filename.equals("Grapheme") ? "Grapheme_Cluster" : fileName) + "_Break";
Expand Down
147 changes: 88 additions & 59 deletions unicodetools/src/main/java/org/unicode/tools/Segmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.text.UnicodeSet.XSymbolTable;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import java.text.ParsePosition;
Expand All @@ -36,7 +35,9 @@
import java.util.stream.Collectors;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.TransliteratorUtilities;
import org.unicode.props.IndexUnicodeProperties;
import org.unicode.props.UnicodeProperty;
import org.unicode.tools.Segmenter.Builder.NamedRefinedSet;
import org.unicode.tools.Segmenter.SegmentationRule.Breaks;

/** Ordered list of rules, with variables resolved before building. Use Builder to make. */
Expand Down Expand Up @@ -68,6 +69,7 @@ public enum Target {
public final Target target;

private UnicodeMap<String> samples = new UnicodeMap<String>();
private List<NamedRefinedSet> partitionDefinition = new ArrayList<>();

private Segmenter(Target target) {
this.target = target;
Expand Down Expand Up @@ -279,13 +281,16 @@ public abstract Breaks applyAt(
public String toString() {
return toString(false);
}

public abstract String toCppOldMonkeyString();
}

/** A « treat as » rule. */
public static class RemapRule extends SegmentationRule {

public RemapRule(String leftHandSide, String replacement, String line) {
pattern = Pattern.compile(leftHandSide, REGEX_FLAGS);
patternDefinition = leftHandSide;
pattern = Pattern.compile(Builder.expandUnicodeSets(leftHandSide), REGEX_FLAGS);
this.replacement = replacement;
name = line;
}
Expand Down Expand Up @@ -352,6 +357,7 @@ public void apply(
remap.accept(result);
}

private String patternDefinition;
private Pattern pattern;
private String replacement;
private String name;
Expand All @@ -373,6 +379,17 @@ public Breaks applyAt(
protected String toString(boolean showResolved) {
return name;
}

/**
 * Renders this remap rule as a C++ {@code std::make_unique<RemapRule>(...)} expression whose
 * arguments are raw UTF-16 string literals: the rule name, the unexpanded pattern definition,
 * and the replacement.
 */
@Override
public String toCppOldMonkeyString() {
    // Every '&' and '-' in the definition is doubled.
    // NOTE(review): presumably the C++ side expects the doubled operator spelling — confirm.
    final String escapedPattern = patternDefinition.replace("&", "&&").replace("-", "--");
    final StringBuilder cpp = new StringBuilder("std::make_unique<RemapRule>(uR\"(");
    cpp.append(name);
    cpp.append(")\", uR\"(");
    cpp.append(escapedPattern);
    cpp.append(")\", uR\"(");
    cpp.append(replacement);
    cpp.append(")\")");
    return cpp.toString();
}
}

/** A rule that determines the status of an offset. */
Expand All @@ -384,6 +401,10 @@ public static class RegexRule extends SegmentationRule {
* @param line
*/
public RegexRule(String before, Breaks result, String after, String line) {
beforeDefinition = before;
afterDefinition = after;
before = Builder.expandUnicodeSets(before);
after = Builder.expandUnicodeSets(after);
breaks = result;
before = ".*(" + before + ")";
String parsing = null;
Expand Down Expand Up @@ -453,12 +474,27 @@ public String toString(boolean showResolved) {
return result;
}

/**
 * Renders this rule as a C++ {@code std::make_unique<RegexRule>(...)} expression: the rule
 * name, the unexpanded "before" definition, the break operator as a UTF-16 character literal
 * (÷ for {@code Breaks.BREAK}, × otherwise), and the unexpanded "after" definition.
 */
@Override
public String toCppOldMonkeyString() {
    // Every '&' and '-' in the definitions is doubled.
    // NOTE(review): presumably the C++ side expects the doubled operator spelling — confirm.
    final String escapedBefore = beforeDefinition.replace("&", "&&").replace("-", "--");
    final char operator;
    if (breaks == Breaks.BREAK) {
        operator = '÷';
    } else {
        operator = '×';
    }
    final String escapedAfter = afterDefinition.replace("&", "&&").replace("-", "--");
    final StringBuilder cpp = new StringBuilder("std::make_unique<RegexRule>(uR\"(");
    cpp.append(name);
    cpp.append(")\", uR\"(");
    cpp.append(escapedBefore);
    cpp.append(")\", u'");
    cpp.append(operator);
    cpp.append("', uR\"(");
    cpp.append(escapedAfter);
    cpp.append(")\")");
    return cpp.toString();
}

// ============== Internals ================
// We cannot use a single regex of the form "(?<= before) after" because
// (RI RI)* RI × RI would require unbounded lookbehind.
private Pattern before;
private Pattern after;
private String name;
private String beforeDefinition;
private String afterDefinition;

private String resolved;
private Breaks breaks;
Expand All @@ -474,31 +510,36 @@ public String toString(boolean showResolved) {
public static class Builder {
private final UnicodeProperty.Factory propFactory;
private final Target target;
private XSymbolTable symbolTable;
private List<String> rawVariables = new ArrayList<String>();
private Map<Double, String> xmlRules = new TreeMap<Double, String>();
private Map<Double, String> htmlRules = new TreeMap<Double, String>();
private List<String> lastComments = new ArrayList<String>();

class NamedSet {
NamedSet(String name, UnicodeSet set) {
NamedSet(String name, String definition, UnicodeSet set) {
this.name = name;
this.definition = definition;
this.set = set;
}

String name;
String definition;
UnicodeSet set;
}

class NamedRefinedSet {
public class NamedRefinedSet {
public NamedRefinedSet clone() {
NamedRefinedSet result = new NamedRefinedSet();
for (var term : intersectionTerms) {
result.intersectionTerms.add(new NamedSet(term.name, term.set.cloneAsThawed()));
result.intersectionTerms.add(
new NamedSet(term.name, term.definition, term.set.cloneAsThawed()));
}
for (var subtrahend : subtrahends) {
result.subtrahends.add(
new NamedSet(subtrahend.name, subtrahend.set.cloneAsThawed()));
new NamedSet(
subtrahend.name,
subtrahend.definition,
subtrahend.set.cloneAsThawed()));
}
result.set = this.set.cloneAsThawed();
return result;
Expand Down Expand Up @@ -547,6 +588,19 @@ public String getName() {
.collect(Collectors.joining());
}

/**
 * Returns a UnicodeSet pattern built from the definitions of the sets that make up this
 * refinement: the intersection terms joined by {@code &}, followed by each subtrahend
 * prefixed with {@code -}, all wrapped in square brackets. When there are no intersection
 * terms the result is {@code [^[]]}.
 */
public String getDefinition() {
    if (intersectionTerms.isEmpty()) {
        return "[^[]]";
    }
    final StringBuilder pattern = new StringBuilder("[");
    pattern.append(
            intersectionTerms.stream()
                    .map(term -> term.definition)
                    .collect(Collectors.joining("&")));
    for (var subtrahend : subtrahends) {
        pattern.append("-").append(subtrahend.definition);
    }
    pattern.append("]");
    return pattern.toString();
}

private UnicodeSet getIntersection() {
UnicodeSet result = UnicodeSet.ALL_CODE_POINTS.cloneAsThawed();
for (var term : intersectionTerms) {
Expand All @@ -565,54 +619,11 @@ private UnicodeSet getIntersection() {
public Builder(UnicodeProperty.Factory factory, Target target) {
propFactory = factory;
this.target = target;
symbolTable = new MyXSymbolTable(); // propFactory.getXSymbolTable();
htmlRules.put(new Double(BREAK_SOT), "sot \u00F7");
htmlRules.put(new Double(BREAK_EOT), "\u00F7 eot");
htmlRules.put(new Double(BREAK_ANY), "\u00F7 Any");
}

// copied to make independent of ICU4J internals
/**
 * Symbol table that resolves property queries in UnicodeSet patterns through the enclosing
 * builder's {@code propFactory}, and throws rather than declining a property it does not
 * know.
 */
private class MyXSymbolTable extends UnicodeSet.XSymbolTable {
/**
 * Fills {@code result} with the set for the given property query.
 *
 * @param propertyName property name; if no such property exists and {@code propertyValue}
 *     is empty, it is tried as a value of the Script property instead
 * @param propertyValue property value; an empty value on a binary property is treated as
 *     "Yes"
 * @param result cleared, then passed to {@code UnicodeProperty.getSet} to receive the set
 * @return {@code true} whenever the method returns normally, even if the set is empty
 * @throws IllegalArgumentException if the property is unknown and the Script fallback does
 *     not apply or yields an empty set
 */
public boolean applyPropertyAlias(
String propertyName, String propertyValue, UnicodeSet result) {
UnicodeProperty prop = propFactory.getProperty(propertyName);
if (prop == null) {
// A query with a name but no value: try the name as a Script value.
if (propertyValue.isEmpty()) {
prop = propFactory.getProperty("Script");
result.clear();
UnicodeSet x = prop.getSet(propertyName, result);
if (!x.isEmpty()) {
return true;
}
}
// If we cannot handle the property name, then we need to really fail.
// If we were to just print something and return false, then the UnicodeSet code
// would just evaluate this itself, and may succeed but give wrong results.
// For example, as long as we require "gc=Cn" and don't handle "Cn" here,
// falling back to built-in ICU data means that we get gc=Cn ranges from ICU
// rather than from the current Unicode beta.
throw new IllegalArgumentException(
"Segmenter.MyXSymbolTable: Unknown property " + propertyName);
}
// Binary properties:
// \p{Extended_Pictographic} is equivalent with \p{Extended_Pictographic=Yes}
if (propertyValue.isEmpty() && prop.isType(UnicodeProperty.BINARY_MASK)) {
propertyValue = "Yes";
}
result.clear();
UnicodeSet x = prop.getSet(propertyValue, result);
if (x.isEmpty()) {
// didn't find anything
// An empty set is only reported on standard output; it is not treated as an error.
System.out.println(
"Segmenter.MyXSymbolTable: !Empty! "
+ propertyName
+ "="
+ propertyValue);
}
return true; // mark that we handled it even if there are no results.
}
}

public String toString(String testName, String indent) {

StringBuffer result = new StringBuffer();
Expand Down Expand Up @@ -728,10 +739,15 @@ Builder addVariable(String name, String value) {
+ TransliteratorUtilities.toXML.transliterate(value)
+ "</variable>");
value = replaceVariables(value, variables);
;
if (!name.endsWith("_")) {
try {
parsePosition.setIndex(0);
UnicodeSet valueSet = new UnicodeSet(value, parsePosition, symbolTable);
UnicodeSet valueSet =
new UnicodeSet(
value,
parsePosition,
IndexUnicodeProperties.make().getXSymbolTable());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this symbol table worth caching, rather than fetching it each time?
Or is it already cached?

Copy link
Member Author

@eggrobin eggrobin Dec 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The XSymbolTable is a pretty lightweight object. The IndexUnicodeProperties created by make is cached, and the hard work is done lazily when doing a query for a set on an actual UnicodeProperty (and cached in the IndexUnicodeProperties).

if (parsePosition.getIndex() != value.length()) {
if (SHOW_SAMPLES)
System.out.println(
Expand All @@ -748,7 +764,7 @@ Builder addVariable(String name, String value) {
} else {
String name2 = name;
if (name2.startsWith("$")) name2 = name2.substring(1);
refinePartition(new NamedSet(name2, valueSet));
refinePartition(new NamedSet(name2, value, valueSet));
if (SHOW_SAMPLES) {
System.out.println("Samples for: " + name + " = " + value);
System.out.println("\t" + valueSet);
Expand Down Expand Up @@ -827,8 +843,7 @@ Builder addRemapRule(Double order, String before, String after, String line) {
+ " </rule>");
rules.put(
order,
new Segmenter.RemapRule(
replaceVariables(before, expandedVariables), after, line));
new Segmenter.RemapRule(replaceVariables(before, variables), after, line));
return this;
}

Expand Down Expand Up @@ -889,9 +904,9 @@ Builder addRegexRule(
rules.put(
order,
new Segmenter.RegexRule(
replaceVariables(before, expandedVariables),
replaceVariables(before, variables),
breaks,
replaceVariables(after, expandedVariables),
replaceVariables(after, variables),
line));
return this;
}
Expand All @@ -906,6 +921,7 @@ public Segmenter make() {
for (Double key : rules.keySet()) {
result.add(key.doubleValue(), rules.get(key));
}
result.partitionDefinition = partition;
for (var part : partition) {
if (part.getName() == null) {
throw new IllegalArgumentException("Unclassified characters: " + part.getSet());
Expand Down Expand Up @@ -952,14 +968,19 @@ private static String replaceVariables(String input, Map<String, String> variabl
}

/** Replaces Unicode Sets with literals. */
public String expandUnicodeSets(String input) {
public static String expandUnicodeSets(String input) {
String result = input;
var parsePosition = new ParsePosition(0);
// replace properties
// TODO really dumb parse for now, fix later
for (int i = 0; i < result.length(); ++i) {
if (UnicodeSet.resemblesPattern(result, i)) {
parsePosition.setIndex(i);
UnicodeSet temp = new UnicodeSet(result, parsePosition, symbolTable);
UnicodeSet temp =
new UnicodeSet(
result,
parsePosition,
IndexUnicodeProperties.make().getXSymbolTable());
String insert = getInsertablePattern(temp);
result =
result.substring(0, i)
Expand All @@ -981,7 +1002,7 @@ public String expandUnicodeSets(String input) {
* @param temp
* @return
*/
private String getInsertablePattern(UnicodeSet temp) {
private static String getInsertablePattern(UnicodeSet temp) {
temp.complement().complement();
if (DEBUG_REDUCE_SET_SIZE != null) {
UnicodeSet temp2 = new UnicodeSet(temp);
Expand Down Expand Up @@ -1053,6 +1074,14 @@ public List<String> getRules() {
}
}

/**
 * Returns the list of named refined sets stored in this segmenter's {@code
 * partitionDefinition} field. The list itself is returned, not a copy.
 */
public List<NamedRefinedSet> getPartitionDefinition() {
    final List<NamedRefinedSet> definition = this.partitionDefinition;
    return definition;
}

/**
 * Returns the list of segmentation rules held in this segmenter's {@code rules} field. The
 * list itself is returned, not a copy.
 */
public List<SegmentationRule> getRules() {
    final List<SegmentationRule> ruleList = this.rules;
    return ruleList;
}

// ============== Internals ================

private List<SegmentationRule> rules = new ArrayList<SegmentationRule>(1);
Expand Down
Loading