From 05beda4fb5d685be2d886739a616ed0cd1fbd70a Mon Sep 17 00:00:00 2001 From: Suho Date: Fri, 22 Nov 2024 00:09:25 -0800 Subject: [PATCH] Resolve all groups when performing match RE2J does not resolve groups when `Matcher.find()` is called, but rater it only resolves groups when `Matcher.group(*)` is called. This results in input being matched twice reducing performance. This fix enables RE2J to resolve the groups when `Matcher.find()` is called, so that when `Matcher.group(*)` is called the results can be served from the cache. --- .../benchmark/BenchmarkSubMultiMatch.java | 72 +++++++++++++++++++ .../re2j/benchmark/Implementations.java | 18 +++++ java/com/google/re2j/Matcher.java | 13 +++- java/com/google/re2j/Pattern.java | 23 +++++- java/com/google/re2j/RE2.java | 13 ++-- javatests/com/google/re2j/ExecTest.java | 2 +- javatests/com/google/re2j/MatcherTest.java | 53 ++++++++++++++ 7 files changed, 183 insertions(+), 11 deletions(-) create mode 100644 benchmarks/src/main/java/com/google/re2j/benchmark/BenchmarkSubMultiMatch.java diff --git a/benchmarks/src/main/java/com/google/re2j/benchmark/BenchmarkSubMultiMatch.java b/benchmarks/src/main/java/com/google/re2j/benchmark/BenchmarkSubMultiMatch.java new file mode 100644 index 00000000..914ff478 --- /dev/null +++ b/benchmarks/src/main/java/com/google/re2j/benchmark/BenchmarkSubMultiMatch.java @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2020 The Go Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style + * license that can be found in the LICENSE file. + */ +package com.google.re2j.benchmark; + +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; + +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeUnit; + +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@State(Scope.Benchmark) +public class BenchmarkSubMultiMatch { + + @Param({"JDK", "RE2J"}) + private Implementations impl; + + @Param({"true", "false"}) + private boolean binary; + + @Param({"true", "false"}) + private boolean successMatch; + + @Param({"true", "false"}) + private boolean resolveGroups; + + byte[] bytes = BenchmarkUtils.readResourceFile("google-maps-contact-info.html"); + private String html = new String(bytes, StandardCharsets.UTF_8); + + private String sucessPatternUrlString = + "(https?:\\/\\/(www\\.)?([-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,4})\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*))"; + private String failurePatternUrlString = + "(https?:\\/\\/(www\\.)?([-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{1})\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*))"; + private Implementations.Pattern successPattern; + private Implementations.Pattern failurePattern; + private Implementations.Pattern successPatternResolveGroups; + private Implementations.Pattern failurePatternResolveGroups; + + @Setup + public void setup() { + successPattern = Implementations.Pattern.compile(impl, sucessPatternUrlString); + successPatternResolveGroups = + Implementations.Pattern.compile( + impl, sucessPatternUrlString, Implementations.Pattern.FLAG_RESOLVE_GROUPS_MATCH); + failurePattern = Implementations.Pattern.compile(impl, failurePatternUrlString); + failurePatternResolveGroups = + Implementations.Pattern.compile( + impl, failurePatternUrlString, Implementations.Pattern.FLAG_RESOLVE_GROUPS_MATCH); + } + + @Benchmark + public void findDomains(Blackhole bh) { + Implementations.Pattern pattern = + successMatch + ? (resolveGroups ? successPatternResolveGroups : successPattern) + : (resolveGroups ? failurePatternResolveGroups : failurePattern); + Implementations.Matcher matcher = binary ? pattern.matcher(bytes) : pattern.matcher(html); + int count = 0; + while (matcher.find()) { + bh.consume(matcher.group(3)); + count++; + } + int expectedMatchers = successMatch ? 178 : 0; + if (count != expectedMatchers) { + throw new AssertionError("Expected " + expectedMatchers + " matches."); + } + } +} diff --git a/benchmarks/src/main/java/com/google/re2j/benchmark/Implementations.java b/benchmarks/src/main/java/com/google/re2j/benchmark/Implementations.java index 8fd21441..b9fb2675 100644 --- a/benchmarks/src/main/java/com/google/re2j/benchmark/Implementations.java +++ b/benchmarks/src/main/java/com/google/re2j/benchmark/Implementations.java @@ -18,6 +18,8 @@ public abstract static class Matcher { public abstract String group(); + public abstract String group(int group); + public static class Re2Matcher extends Matcher { private final com.google.re2j.Matcher matcher; @@ -39,6 +41,11 @@ public boolean matches() { public String group() { return matcher.group(); } + + @Override + public String group(int group) { + return matcher.group(group); + } } public static class JdkMatcher extends Matcher { @@ -62,6 +69,11 @@ public boolean matches() { public String group() { return matcher.group(); } + + @Override + public String group(int group) { + return matcher.group(group); + } } } @@ -71,6 +83,9 @@ public abstract static class Pattern { // indicating that a pattern should be case-insensitive. public static final int FLAG_CASE_INSENSITIVE = 1; + // FLAG_RESOLVE_GROUPS_MATCH enable RE2J to resolve all groups during match operation. + public static final int FLAG_RESOLVE_GROUPS_MATCH = 32; + public static Pattern compile(Implementations impl, String pattern) { return compile(impl, pattern, 0); } @@ -136,6 +151,9 @@ public Re2Pattern(String pattern, int flags) { if ((flags & FLAG_CASE_INSENSITIVE) > 0) { re2PatternFlags |= com.google.re2j.Pattern.CASE_INSENSITIVE; } + if ((flags & FLAG_RESOLVE_GROUPS_MATCH) > 0) { + re2PatternFlags |= com.google.re2j.Pattern.RESOLVE_GROUPS_MATCH; + } this.pattern = com.google.re2j.Pattern.compile(pattern, re2PatternFlags); } diff --git a/java/com/google/re2j/Matcher.java b/java/com/google/re2j/Matcher.java index 874955c7..453615a0 100644 --- a/java/com/google/re2j/Matcher.java +++ b/java/com/google/re2j/Matcher.java @@ -349,12 +349,21 @@ public boolean find(int start) { private boolean genMatch(int startByte, int anchor) { // TODO(rsc): Is matches/lookingAt supposed to reset the append or input positions? // From the JDK docs, looks like no. - boolean ok = pattern.re2().match(matcherInput, startByte, inputLength, anchor, groups, 1); + boolean ok = + pattern + .re2() + .match( + matcherInput, + startByte, + inputLength, + anchor, + groups, + pattern.re2().resolveAllGroups ? 1 + groupCount : 1); if (!ok) { return false; } hasMatch = true; - hasGroups = false; + hasGroups = pattern.re2().resolveAllGroups; anchorFlag = anchor; return true; diff --git a/java/com/google/re2j/Pattern.java b/java/com/google/re2j/Pattern.java index f5883f53..bec7fa5f 100644 --- a/java/com/google/re2j/Pattern.java +++ b/java/com/google/re2j/Pattern.java @@ -50,6 +50,11 @@ public final class Pattern implements Serializable { */ public static final int LONGEST_MATCH = 16; + /** + * Flag: match and find operations resolves all groups. + */ + public static final int RESOLVE_GROUPS_MATCH = 32; + // The pattern string at construction time. private final String pattern; @@ -130,11 +135,17 @@ public static Pattern compile(String regex, int flags) { if ((flags & MULTILINE) != 0) { flregex = "(?m)" + flregex; } - if ((flags & ~(MULTILINE | DOTALL | CASE_INSENSITIVE | DISABLE_UNICODE_GROUPS | LONGEST_MATCH)) + if ((flags + & ~(MULTILINE + | DOTALL + | CASE_INSENSITIVE + | DISABLE_UNICODE_GROUPS + | LONGEST_MATCH + | RESOLVE_GROUPS_MATCH)) != 0) { throw new IllegalArgumentException( "Flags should only be a combination " - + "of MULTILINE, DOTALL, CASE_INSENSITIVE, DISABLE_UNICODE_GROUPS, LONGEST_MATCH"); + + "of MULTILINE, DOTALL, CASE_INSENSITIVE, DISABLE_UNICODE_GROUPS, LONGEST_MATCH, RESOLVE_GROUPS_MATCH"); } return compile(flregex, regex, flags); } @@ -148,7 +159,13 @@ private static Pattern compile(String flregex, String regex, int flags) { re2Flags &= ~RE2.UNICODE_GROUPS; } return new Pattern( - regex, flags, RE2.compileImpl(flregex, re2Flags, (flags & LONGEST_MATCH) != 0)); + regex, + flags, + RE2.compileImpl( + flregex, + re2Flags, + ((flags & LONGEST_MATCH) != 0), + (flags & RESOLVE_GROUPS_MATCH) != 0)); } /** diff --git a/java/com/google/re2j/RE2.java b/java/com/google/re2j/RE2.java index c05fd3c3..40bc23d2 100644 --- a/java/com/google/re2j/RE2.java +++ b/java/com/google/re2j/RE2.java @@ -109,6 +109,7 @@ class RE2 { // required at start of match final int numSubexp; boolean longest; + boolean resolveAllGroups; String prefix; // required UTF-16 prefix in unanchored matches byte[] prefixUTF8; // required UTF-8 prefix in unanchored matches @@ -135,12 +136,13 @@ class RE2 { this.prefixRune = re2.prefixRune; } - private RE2(String expr, Prog prog, int numSubexp, boolean longest) { + private RE2(String expr, Prog prog, int numSubexp, boolean longest, boolean resolveAllGroups) { this.expr = expr; this.prog = prog; this.numSubexp = numSubexp; this.cond = prog.startCond(); this.longest = longest; + this.resolveAllGroups = resolveAllGroups; } /** @@ -155,7 +157,7 @@ private RE2(String expr, Prog prog, int numSubexp, boolean longest) { * backtracking. For POSIX leftmost-longest matching, see {@link #compilePOSIX}. */ static RE2 compile(String expr) throws PatternSyntaxException { - return compileImpl(expr, PERL, /*longest=*/ false); + return compileImpl(expr, PERL, /*longest=*/ false, false); } /** @@ -177,16 +179,17 @@ static RE2 compile(String expr) throws PatternSyntaxException { * even well-defined. See http://swtch.com/~rsc/regexp/regexp2.html#posix */ static RE2 compilePOSIX(String expr) throws PatternSyntaxException { - return compileImpl(expr, POSIX, /*longest=*/ true); + return compileImpl(expr, POSIX, /*longest=*/ true, false); } // Exposed to ExecTests. - static RE2 compileImpl(String expr, int mode, boolean longest) throws PatternSyntaxException { + static RE2 compileImpl(String expr, int mode, boolean longest, boolean resolveAllGroups) + throws PatternSyntaxException { Regexp re = Parser.parse(expr, mode); int maxCap = re.maxCap(); // (may shrink during simplify) re = Simplify.simplify(re); Prog prog = Compiler.compileRegexp(re); - RE2 re2 = new RE2(expr, prog, maxCap, longest); + RE2 re2 = new RE2(expr, prog, maxCap, longest, resolveAllGroups); StringBuilder prefixBuilder = new StringBuilder(); re2.prefixComplete = prog.prefix(prefixBuilder); re2.prefix = prefixBuilder.toString(); diff --git a/javatests/com/google/re2j/ExecTest.java b/javatests/com/google/re2j/ExecTest.java index 1c7b0911..1cfa3b0b 100644 --- a/javatests/com/google/re2j/ExecTest.java +++ b/javatests/com/google/re2j/ExecTest.java @@ -567,7 +567,7 @@ private void testFowler(String file) throws IOException { RE2 re = null; try { - re = RE2.compileImpl(pattern, flags, true); + re = RE2.compileImpl(pattern, flags, true, false); } catch (PatternSyntaxException e) { if (shouldCompileMatch[0]) { System.err.format("%s:%d: %s did not compile\n", file, lineno, pattern); diff --git a/javatests/com/google/re2j/MatcherTest.java b/javatests/com/google/re2j/MatcherTest.java index 3c2c00ef..9c873af2 100644 --- a/javatests/com/google/re2j/MatcherTest.java +++ b/javatests/com/google/re2j/MatcherTest.java @@ -427,6 +427,27 @@ public void testDocumentedExample() { assertFalse(m.find()); } + @Test + public void testDocumentedExampleWithResolveGroups() { + Pattern p = Pattern.compile("b(an)*(.)", Pattern.RESOLVE_GROUPS_MATCH); + Matcher m = p.matcher("by, band, banana"); + assertTrue(m.lookingAt()); + m.reset(); + assertTrue(m.find()); + assertEquals("by", m.group(0)); + assertNull(m.group(1)); + assertEquals("y", m.group(2)); + assertTrue(m.find()); + assertEquals("band", m.group(0)); + assertEquals("an", m.group(1)); + assertEquals("d", m.group(2)); + assertTrue(m.find()); + assertEquals("banana", m.group(0)); + assertEquals("an", m.group(1)); + assertEquals("a", m.group(2)); + assertFalse(m.find()); + } + @Test public void testMutableCharSequence() { Pattern p = Pattern.compile("b(an)*(.)"); @@ -469,6 +490,38 @@ public void testNamedGroups() { } } + @Test + public void testNamedGroupsWithResolveGroups() { + Pattern p = + Pattern.compile( + "(?Pf(?Pb*a(?Pr+)){0,10})" + "(?Pbag)?(?Pzzz)?", + Pattern.RESOLVE_GROUPS_MATCH); + Matcher m = p.matcher("fbbarrrrrbag"); + assertTrue(m.matches()); + assertEquals("fbbarrrrr", m.group("baz")); + assertEquals("bbarrrrr", m.group("foo")); + assertEquals("rrrrr", m.group("another")); + assertEquals(0, m.start("baz")); + assertEquals(1, m.start("foo")); + assertEquals(4, m.start("another")); + assertEquals(9, m.end("baz")); + assertEquals(9, m.end("foo")); + assertEquals("bag", m.group("bag")); + assertEquals(9, m.start("bag")); + assertEquals(12, m.end("bag")); + assertNull(m.group("nomatch")); + assertEquals(-1, m.start("nomatch")); + assertEquals(-1, m.end("nomatch")); + assertEquals("whatbbarrrrreverbag", appendReplacement(m, "what$2ever${bag}")); + + try { + m.group("nonexistent"); + fail("Should have thrown IllegalArgumentException"); + } catch (IllegalArgumentException expected) { + // Expected + } + } + private String appendReplacement(Matcher m, String replacement) { StringBuilder b = new StringBuilder(); m.appendReplacement(b, replacement);