unicode-org · eggrobin · Dec 4, 2024 · Nov 28, 2024 · Nov 28, 2024 · Nov 29, 2024
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java
@@ -60,6 +60,7 @@ public abstract class GenerateBreakTest implements UCD_Types {
     Normalizer nfd;
     Normalizer nfkd;
 
+    Segmenter segmenter;
     UnicodeMap<String> partition;
     UnicodeProperty prop;
 
@@ -322,6 +323,7 @@ public void run() throws IOException {
 
         boolean forCLDR = seg.target == Segmenter.Target.FOR_CLDR;
         String path = "UCD/" + ucd.getVersion() + '/' + (forCLDR ? "cldr/" : "auxiliary/");
+        String extraPath = "UCD/" + ucd.getVersion() + "/extra/";
         String outFilename = fileName + "BreakTest";
         if (forCLDR) {
             outFilename = outFilename + "-cldr";
@@ -477,6 +479,37 @@ value, new ParsePosition(0), IUP.getXSymbolTable()))) {
         fc.close();
 
         generateTest(false, path, outFilename, propertyName);
+        generateCppOldMonkeys(extraPath, outFilename);
+    }
+
+    /**
+     * Writes the partition and rules of {@link #segmenter} as C++ statements to the file
+     * {@code outFilename + ".cpp"} under {@code path}, preceded by instructions for pasting them
+     * into rbbitst.cpp in ICU4C.
+     *
+     * @param path output directory handed to {@code UnicodeDataFile.openAndWriteHeader}
+     * @param outFilename base name of the output file; its first four characters pick the monkey
+     *     class named in the instructions
+     * @throws IOException declared for the {@code UnicodeDataFile} calls made here
+     */
+    private void generateCppOldMonkeys(String path, String outFilename) throws IOException {
+        final UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(path, outFilename + ".cpp");
+        final PrintWriter out = fc.out;
+        out.println();
+        out.println("####### Instructions ##################################");
+        out.println("# Copy the following lines into rbbitst.cpp in ICU4C, #");
+        // outFilename is fileName + "BreakTest", so the grapheme test yields the prefix "Grap",
+        // for which the instructions must name the Char monkey. The text to replace has to be
+        // that four-character prefix: the longer "Graph" can never occur in it.
+        final String monkey = outFilename.substring(0, 4).replace("Grap", "Char");
+        out.println(
+                // "Meow" is a four-character placeholder, so the box border stays aligned.
+                "# in the constructor of RBBIMeowMonkey, replacing the #"
+                        .replace("Meow", monkey));
+        out.println("# existing block of generated code.                   #");
+        out.println("#######################################################");
+        out.println();
+        out.println("    // --- NOLI ME TANGERE ---");
+        out.println("    // Generated by GenerateBreakTest.java in the Unicode tools.");
+        // One emplace_back per part of the partition: its name and its set definition.
+        for (Segmenter.Builder.NamedRefinedSet part : segmenter.getPartitionDefinition()) {
+            out.println(
+                    "    partition.emplace_back(\""
+                            + part.getName()
+                            + "\", UnicodeSet(uR\"("
+                            + part.getDefinition()
+                            + ")\", status));");
+        }
+        out.println();
+        // One push_back per segmentation rule, each rendered by the rule itself.
+        for (Segmenter.SegmentationRule rule : segmenter.getRules()) {
+            out.println("    rules.push_back(" + rule.toCppOldMonkeyString() + ");");
+        }
+        out.println("    // --- End of generated code. ---");
+        fc.close();
+    }
 
     private void generateTest(
@@ -1091,6 +1124,7 @@ public XGenerateBreakTest(
             }
             variables = segBuilder.getVariables();
             collectingRules = false;
+            segmenter = seg;
             partition = seg.getSamples();
             fileName = filename;
             propertyName = (filename.equals("Grapheme") ? "Grapheme_Cluster" : fileName) + "_Break";

diff --git a/unicodetools/src/main/java/org/unicode/tools/Segmenter.java b/unicodetools/src/main/java/org/unicode/tools/Segmenter.java
@@ -17,7 +17,6 @@
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.UnicodeSet;
 import com.ibm.icu.text.UnicodeSet.SpanCondition;
-import com.ibm.icu.text.UnicodeSet.XSymbolTable;
 import com.ibm.icu.text.UnicodeSetIterator;
 import com.ibm.icu.util.ULocale;
 import java.text.ParsePosition;
@@ -36,7 +35,9 @@
 import java.util.stream.Collectors;
 import org.unicode.cldr.draft.FileUtilities;
 import org.unicode.cldr.util.TransliteratorUtilities;
+import org.unicode.props.IndexUnicodeProperties;
 import org.unicode.props.UnicodeProperty;
+import org.unicode.tools.Segmenter.Builder.NamedRefinedSet;
 import org.unicode.tools.Segmenter.SegmentationRule.Breaks;
 
 /** Ordered list of rules, with variables resolved before building. Use Builder to make. */
@@ -68,6 +69,7 @@ public enum Target {
     public final Target target;
 
     private UnicodeMap<String> samples = new UnicodeMap<String>();
+    private List<NamedRefinedSet> partitionDefinition = new ArrayList<>();
 
     private Segmenter(Target target) {
         this.target = target;
@@ -279,13 +281,16 @@ public abstract Breaks applyAt(
         public String toString() {
             return toString(false);
         }
+
+        public abstract String toCppOldMonkeyString();
     }
 
     /** A « treat as » rule. */
     public static class RemapRule extends SegmentationRule {
 
         public RemapRule(String leftHandSide, String replacement, String line) {
-            pattern = Pattern.compile(leftHandSide, REGEX_FLAGS);
+            patternDefinition = leftHandSide;
+            pattern = Pattern.compile(Builder.expandUnicodeSets(leftHandSide), REGEX_FLAGS);
             this.replacement = replacement;
             name = line;
         }
@@ -352,6 +357,7 @@ public void apply(
             remap.accept(result);
         }
 
+        private String patternDefinition;
         private Pattern pattern;
         private String replacement;
         private String name;
@@ -373,6 +379,17 @@ public Breaks applyAt(
         protected String toString(boolean showResolved) {
             return name;
         }
+
+        /**
+         * Renders this rule as a C++ {@code std::make_unique<RemapRule>(...)} expression.
+         *
+         * <p>The three arguments are raw string literals holding, in order, the rule name, the
+         * left-hand side as it was before Unicode sets were expanded (with every {@code &} and
+         * {@code -} doubled), and the replacement.
+         */
+        @Override
+        public String toCppOldMonkeyString() {
+            final String escapedPattern = patternDefinition.replace("&", "&&").replace("-", "--");
+            return String.join(
+                    "",
+                    "std::make_unique<RemapRule>(uR\"(",
+                    name,
+                    ")\", uR\"(",
+                    escapedPattern,
+                    ")\", uR\"(",
+                    replacement,
+                    ")\")");
+        }
     }
 
     /** A rule that determines the status of an offset. */
@@ -384,6 +401,10 @@ public static class RegexRule extends SegmentationRule {
          * @param line
          */
         public RegexRule(String before, Breaks result, String after, String line) {
+            beforeDefinition = before;
+            afterDefinition = after;
+            before = Builder.expandUnicodeSets(before);
+            after = Builder.expandUnicodeSets(after);
             breaks = result;
             before = ".*(" + before + ")";
             String parsing = null;
@@ -453,12 +474,27 @@ public String toString(boolean showResolved) {
             return result;
         }
 
+        /**
+         * Renders this rule as a C++ {@code std::make_unique<RegexRule>(...)} expression.
+         *
+         * <p>The arguments are, in order: the rule name as a raw string literal, the "before"
+         * side as it was before Unicode sets were expanded, a character literal ({@code ÷} when
+         * this rule's result is {@code Breaks.BREAK}, {@code ×} otherwise), and the unexpanded
+         * "after" side. In both sides every {@code &} and {@code -} is doubled.
+         */
+        @Override
+        public String toCppOldMonkeyString() {
+            final String escapedBefore = beforeDefinition.replace("&", "&&").replace("-", "--");
+            final char mark = breaks == Breaks.BREAK ? '÷' : '×';
+            final String escapedAfter = afterDefinition.replace("&", "&&").replace("-", "--");
+            return new StringBuilder("std::make_unique<RegexRule>(uR\"(")
+                    .append(name)
+                    .append(")\", uR\"(")
+                    .append(escapedBefore)
+                    .append(")\", u'")
+                    .append(mark)
+                    .append("', uR\"(")
+                    .append(escapedAfter)
+                    .append(")\")")
+                    .toString();
+        }
+
+
         // ============== Internals ================
         // We cannot use a single regex of the form "(?<= before) after" because
         // (RI RI)* RI × RI would require unbounded lookbehind.
         private Pattern before;
         private Pattern after;
         private String name;
+        private String beforeDefinition;
+        private String afterDefinition;
 
         private String resolved;
         private Breaks breaks;
@@ -474,31 +510,36 @@ public String toString(boolean showResolved) {
     public static class Builder {
         private final UnicodeProperty.Factory propFactory;
         private final Target target;
-        private XSymbolTable symbolTable;
         private List<String> rawVariables = new ArrayList<String>();
         private Map<Double, String> xmlRules = new TreeMap<Double, String>();
         private Map<Double, String> htmlRules = new TreeMap<Double, String>();
         private List<String> lastComments = new ArrayList<String>();
 
         class NamedSet {
-            NamedSet(String name, UnicodeSet set) {
+            NamedSet(String name, String definition, UnicodeSet set) {
                 this.name = name;
+                this.definition = definition;
                 this.set = set;
             }
 
             String name;
+            String definition;
             UnicodeSet set;
         }
 
-        class NamedRefinedSet {
+        public class NamedRefinedSet {
             public NamedRefinedSet clone() {
                 NamedRefinedSet result = new NamedRefinedSet();
                 for (var term : intersectionTerms) {
-                    result.intersectionTerms.add(new NamedSet(term.name, term.set.cloneAsThawed()));
+                    result.intersectionTerms.add(
+                            new NamedSet(term.name, term.definition, term.set.cloneAsThawed()));
                 }
                 for (var subtrahend : subtrahends) {
                     result.subtrahends.add(
-                            new NamedSet(subtrahend.name, subtrahend.set.cloneAsThawed()));
+                            new NamedSet(
+                                    subtrahend.name,
+                                    subtrahend.definition,
+                                    subtrahend.set.cloneAsThawed()));
                 }
                 result.set = this.set.cloneAsThawed();
                 return result;
@@ -547,6 +588,19 @@ public String getName() {
                                         .collect(Collectors.joining());
             }
 
+            /**
+             * Builds a bracketed set expression for this part from the definitions of its terms.
+             *
+             * <p>The intersection terms' definitions are joined with {@code &}, and each
+             * subtrahend's definition follows, prefixed with {@code -}. If there are no
+             * intersection terms the result is {@code [^[]]} (the complement of an empty set) and
+             * the subtrahends are not consulted.
+             */
+            public String getDefinition() {
+                if (intersectionTerms.isEmpty()) {
+                    return "[^[]]";
+                }
+                final String intersected =
+                        intersectionTerms.stream()
+                                .map(term -> term.definition)
+                                .collect(Collectors.joining("&"));
+                final String subtracted =
+                        subtrahends.stream()
+                                .map(term -> "-" + term.definition)
+                                .collect(Collectors.joining());
+                return "[" + intersected + subtracted + "]";
+            }
+
             private UnicodeSet getIntersection() {
                 UnicodeSet result = UnicodeSet.ALL_CODE_POINTS.cloneAsThawed();
                 for (var term : intersectionTerms) {
@@ -565,54 +619,11 @@ private UnicodeSet getIntersection() {
         public Builder(UnicodeProperty.Factory factory, Target target) {
             propFactory = factory;
             this.target = target;
-            symbolTable = new MyXSymbolTable(); // propFactory.getXSymbolTable();
             htmlRules.put(new Double(BREAK_SOT), "sot \u00F7");
             htmlRules.put(new Double(BREAK_EOT), "\u00F7 eot");
             htmlRules.put(new Double(BREAK_ANY), "\u00F7 Any");
         }
 
-        // copied to make independent of ICU4J internals
-        private class MyXSymbolTable extends UnicodeSet.XSymbolTable {
-            public boolean applyPropertyAlias(
-                    String propertyName, String propertyValue, UnicodeSet result) {
-                UnicodeProperty prop = propFactory.getProperty(propertyName);
-                if (prop == null) {
-                    if (propertyValue.isEmpty()) {
-                        prop = propFactory.getProperty("Script");
-                        result.clear();
-                        UnicodeSet x = prop.getSet(propertyName, result);
-                        if (!x.isEmpty()) {
-                            return true;
-                        }
-                    }
-                    // If we cannot handle the property name, then we need to really fail.
-                    // If we were to just print something and return false, then the UnicodeSet code
-                    // would just evaluate this itself, and may succeed but give wrong results.
-                    // For example, as long as we require "gc=Cn" and don't handle "Cn" here,
-                    // falling back to built-in ICU data means that we get gc=Cn ranges from ICU
-                    // rather than from the current Unicode beta.
-                    throw new IllegalArgumentException(
-                            "Segmenter.MyXSymbolTable: Unknown property " + propertyName);
-                }
-                // Binary properties:
-                // \p{Extended_Pictographic} is equivalent with \p{Extended_Pictographic=Yes}
-                if (propertyValue.isEmpty() && prop.isType(UnicodeProperty.BINARY_MASK)) {
-                    propertyValue = "Yes";
-                }
-                result.clear();
-                UnicodeSet x = prop.getSet(propertyValue, result);
-                if (x.isEmpty()) {
-                    // didn't find anything
-                    System.out.println(
-                            "Segmenter.MyXSymbolTable: !Empty! "
-                                    + propertyName
-                                    + "="
-                                    + propertyValue);
-                }
-                return true; // mark that we handled it even if there are no results.
-            }
-        }
-
         public String toString(String testName, String indent) {
 
             StringBuffer result = new StringBuffer();
@@ -728,10 +739,15 @@ Builder addVariable(String name, String value) {
                             + TransliteratorUtilities.toXML.transliterate(value)
                             + "</variable>");
             value = replaceVariables(value, variables);
+            ;
             if (!name.endsWith("_")) {
                 try {
                     parsePosition.setIndex(0);
-                    UnicodeSet valueSet = new UnicodeSet(value, parsePosition, symbolTable);
+                    UnicodeSet valueSet =
+                            new UnicodeSet(
+                                    value,
+                                    parsePosition,
+                                    IndexUnicodeProperties.make().getXSymbolTable());
                     if (parsePosition.getIndex() != value.length()) {
                         if (SHOW_SAMPLES)
                             System.out.println(
@@ -748,7 +764,7 @@ Builder addVariable(String name, String value) {
                     } else {
                         String name2 = name;
                         if (name2.startsWith("$")) name2 = name2.substring(1);
-                        refinePartition(new NamedSet(name2, valueSet));
+                        refinePartition(new NamedSet(name2, value, valueSet));
                         if (SHOW_SAMPLES) {
                             System.out.println("Samples for: " + name + " = " + value);
                             System.out.println("\t" + valueSet);
@@ -827,8 +843,7 @@ Builder addRemapRule(Double order, String before, String after, String line) {
                             + " </rule>");
             rules.put(
                     order,
-                    new Segmenter.RemapRule(
-                            replaceVariables(before, expandedVariables), after, line));
+                    new Segmenter.RemapRule(replaceVariables(before, variables), after, line));
             return this;
         }
 
@@ -889,9 +904,9 @@ Builder addRegexRule(
             rules.put(
                     order,
                     new Segmenter.RegexRule(
-                            replaceVariables(before, expandedVariables),
+                            replaceVariables(before, variables),
                             breaks,
-                            replaceVariables(after, expandedVariables),
+                            replaceVariables(after, variables),
                             line));
             return this;
         }
@@ -906,6 +921,7 @@ public Segmenter make() {
             for (Double key : rules.keySet()) {
                 result.add(key.doubleValue(), rules.get(key));
             }
+            result.partitionDefinition = partition;
             for (var part : partition) {
                 if (part.getName() == null) {
                     throw new IllegalArgumentException("Unclassified characters: " + part.getSet());
@@ -952,14 +968,19 @@ private static String replaceVariables(String input, Map<String, String> variabl
         }
 
         /** Replaces Unicode Sets with literals. */
-        public String expandUnicodeSets(String input) {
+        public static String expandUnicodeSets(String input) {
             String result = input;
+            var parsePosition = new ParsePosition(0);
             // replace properties
             // TODO really dumb parse for now, fix later
             for (int i = 0; i < result.length(); ++i) {
                 if (UnicodeSet.resemblesPattern(result, i)) {
                     parsePosition.setIndex(i);
-                    UnicodeSet temp = new UnicodeSet(result, parsePosition, symbolTable);
+                    UnicodeSet temp =
+                            new UnicodeSet(
+                                    result,
+                                    parsePosition,
+                                    IndexUnicodeProperties.make().getXSymbolTable());
                     String insert = getInsertablePattern(temp);
                     result =
                             result.substring(0, i)
@@ -981,7 +1002,7 @@ public String expandUnicodeSets(String input) {
          * @param temp
          * @return
          */
-        private String getInsertablePattern(UnicodeSet temp) {
+        private static String getInsertablePattern(UnicodeSet temp) {
             temp.complement().complement();
             if (DEBUG_REDUCE_SET_SIZE != null) {
                 UnicodeSet temp2 = new UnicodeSet(temp);
@@ -1053,6 +1074,14 @@ public List<String> getRules() {
         }
     }
 
+    /**
+     * Returns the named parts of the partition that {@link Builder#make()} stored in this
+     * segmenter.
+     *
+     * @return a read-only view; the backing list is the very list held by the builder that made
+     *     this segmenter, so it is not handed out for modification
+     */
+    public List<NamedRefinedSet> getPartitionDefinition() {
+        return java.util.Collections.unmodifiableList(partitionDefinition);
+    }
+
+    /**
+     * Returns this segmenter's rules.
+     *
+     * @return a read-only view of the internal rule list, so callers cannot add or remove rules
+     */
+    public List<SegmentationRule> getRules() {
+        return java.util.Collections.unmodifiableList(rules);
+    }
+
     // ============== Internals ================
 
     private List<SegmentationRule> rules = new ArrayList<SegmentationRule>(1);