Skip to content

Commit

Permalink
Add Pattern.programSize() and Matcher.programSize() (#180)
Browse files Browse the repository at this point in the history
* Add `Pattern.programSize()` and `Matcher.programSize()`

This PR exposes `Pattern.programSize()` and `Matcher.programSize()`
as public API.

The program size represents a very approximate measure of a
regexp's "cost". Larger numbers are more expensive than smaller
numbers.

Similar to the canonical C++ implementation, re2j will return the
program size as the number of instructions of the regex program
without making any promises or claims except "larger is more
expensive".

Context: The need for this change arose from cross-language projects,
such as gRPC and CEL. gRPC needs to configure the maximum size of
regex programs in CEL to the same number across all languages. While
it's possible in CEL-Cpp and gRPC-Cpp, CEL-Java doesn't provide the
same configuration option simply because the program size is not
available in re2j. In Go, the number of instructions is available via
the length of https://pkg.go.dev/regexp/syntax#Prog.Inst.

* googleJavaFormat
  • Loading branch information
sergiitk authored Jan 5, 2025
1 parent 97df44e commit 2757238
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 1 deletion.
18 changes: 18 additions & 0 deletions java/com/google/re2j/Matcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ public final class Matcher {
// The number of submatches (groups) in the pattern.
private final int groupCount;

// The number of instructions in the pattern.
private final int numberOfInstructions;

private MatcherInput matcherInput;

// The input length in UTF16 codes.
Expand Down Expand Up @@ -77,6 +80,7 @@ private Matcher(Pattern pattern) {
groupCount = re2.numberOfCapturingGroups();
groups = new int[2 + 2 * groupCount];
namedGroups = re2.namedGroups;
numberOfInstructions = re2.numberOfInstructions();
}

/** Creates a new {@code Matcher} with the given pattern and input. */
Expand Down Expand Up @@ -209,6 +213,20 @@ public int end(String group) {
return end(g);
}

/**
 * Reports the program size of the pattern this matcher was created from.
 *
 * <p>The value is the instruction count that was read from the compiled regular expression when
 * this matcher was constructed. As in the C++ implementation, it is only a very approximate
 * measure of a regexp's "cost": larger numbers are more expensive than smaller numbers.
 *
 * @return the program size of this matcher's pattern
 */
public int programSize() {
  return this.numberOfInstructions;
}

/**
* Returns the most recent match.
*
Expand Down
14 changes: 14 additions & 0 deletions java/com/google/re2j/Pattern.java
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,20 @@ public String toString() {
return pattern;
}

/**
 * Reports the program size of this pattern.
 *
 * <p>The value is the number of instructions reported by the underlying compiled regular
 * expression. As in the C++ implementation, it is only a very approximate measure of a regexp's
 * "cost": larger numbers are more expensive than smaller numbers.
 *
 * @return the program size of this pattern
 */
public int programSize() {
  return this.re2.numberOfInstructions();
}

/**
* Returns the number of capturing groups in this matcher's pattern. Group zero denotes the entire
* pattern and is excluded from this count.
Expand Down
7 changes: 7 additions & 0 deletions java/com/google/re2j/RE2.java
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,13 @@ int numberOfCapturingGroups() {
return numSubexp;
}

/**
 * Reports how many instructions the compiled program of this regular expression contains, as
 * given by the program's own instruction count.
 */
int numberOfInstructions() {
  return this.prog.numInst();
}

// get() returns a machine to use for matching |this|. It uses |this|'s
// machine cache if possible, to avoid unnecessary allocation.
Machine get() {
Expand Down
21 changes: 21 additions & 0 deletions javatests/com/google/re2j/ApiTestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,27 @@ public static void testGroupCount(String pattern, int count) {
assertEquals(count, mj.groupCount());
}

// Tests that an RE2 Pattern and the Matchers created from it (for both String and byte[]
// inputs) all report the expected program size.
public static void testProgramSize(String pattern, int expectedSize) {
  final Pattern compiled = Pattern.compile(pattern);

  // The input text is arbitrary: only the program size of the pattern is checked.
  final String text = "foo";
  final byte[] textAsBytes = getUtf8Bytes(text);
  final Matcher stringMatcher = compiled.matcher(text);
  final Matcher bytesMatcher = compiled.matcher(textAsBytes);

  // The pattern itself.
  Truth.assertWithMessage("Pattern(\"%s\") program size", compiled)
      .that(compiled.programSize())
      .isEqualTo(expectedSize);

  // A matcher over String input.
  Truth.assertWithMessage(
          "Matcher(\"%s\", \"%s\") program size", stringMatcher.pattern(), text)
      .that(stringMatcher.programSize())
      .isEqualTo(expectedSize);

  // A matcher over byte[] input.
  Truth.assertWithMessage(
          "Matcher(\"%s\", %s) program size",
          bytesMatcher.pattern(),
          Arrays.toString(textAsBytes))
      .that(bytesMatcher.programSize())
      .isEqualTo(expectedSize);
}

public static void testGroup(String text, String regexp, String[] output) {
// RE2
Pattern p = Pattern.compile(regexp);
Expand Down
17 changes: 17 additions & 0 deletions javatests/com/google/re2j/MatcherTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,25 @@ public void testReplaceFirst() {
ApiTestUtils.testReplaceFirst("aab", "a*?", "<$0>", "<>aab");
}

@Test
public void testProgramSize() {
  // Matcher#programSize is a simple delegation, but it is still checked here.
  // PatternTest#testProgramSize covers more patterns.
  Pattern compiled = Pattern.compile("go+d");
  int expected = compiled.programSize();
  Truth.assertWithMessage("Pattern program size").that(expected).isGreaterThan(1);

  // The program size must not depend on whether the input matches.
  Matcher matching = compiled.matcher("good");
  Truth.assertWithMessage("Positive matcher program size")
      .that(matching.programSize())
      .isEqualTo(expected);

  Matcher nonMatching = compiled.matcher("bad");
  Truth.assertWithMessage("Negative matcher program size")
      .that(nonMatching.programSize())
      .isEqualTo(expected);
}

@Test
public void testGroupCount() {
  // Matcher#groupCount is a simple delegation, but it is still checked here.
  // PatternTest#testGroupCount covers more patterns.
  String regexp = "(a)(b(c))d?(e)";
  int expectedGroups = 4;
  ApiTestUtils.testGroupCount(regexp, expectedGroups);
}

Expand Down
14 changes: 13 additions & 1 deletion javatests/com/google/re2j/PatternTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,21 @@ public void testSplit() {
ApiTestUtils.testSplit(":", ":a::b", new String[] {"", "a", "", "b"});
}

@Test
public void testProgramSize() {
  // Parallel tables: patterns[i] is expected to compile to a program of sizes[i] instructions.
  String[] patterns = {"", "a", "^", "^$", "a+b", "a+b?", "(a+b)", "a+b.*", "(a+b?)"};
  int[] sizes = {3, 3, 3, 4, 5, 6, 7, 7, 8};

  for (int i = 0; i < patterns.length; i++) {
    ApiTestUtils.testProgramSize(patterns[i], sizes[i]);
  }
}

@Test
public void testGroupCount() {
// It is a simple delegation, but still test it.
ApiTestUtils.testGroupCount("(.*)ab(.*)a", 2);
ApiTestUtils.testGroupCount("(.*)(ab)(.*)a", 3);
ApiTestUtils.testGroupCount("(.*)((a)b)(.*)a", 4);
Expand Down

0 comments on commit 2757238

Please sign in to comment.