Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve all groups when performing match #178

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright (c) 2020 The Go Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style
* license that can be found in the LICENSE file.
*/
package com.google.re2j.benchmark;

import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;

import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;

/**
 * JMH benchmark that scans an HTML fixture with {@code find()} and reads capture group 3 of every
 * match.
 *
 * <p>The run is parameterized over the regex implementation, the input form (raw UTF-8 bytes or a
 * decoded {@link String}), whether the pattern is expected to match at all, and whether the
 * pattern is compiled with {@code FLAG_RESOLVE_GROUPS_MATCH}.
 */
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
public class BenchmarkSubMultiMatch {

  /** URL pattern the benchmark expects to match {@link #EXPECTED_SUCCESS_MATCHES} times. */
  private static final String SUCCESS_PATTERN_URL_STRING =
      "(https?:\\/\\/(www\\.)?([-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,4})\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*))";

  /** Variant of the URL pattern the benchmark expects to never match in the fixture. */
  private static final String FAILURE_PATTERN_URL_STRING =
      "(https?:\\/\\/(www\\.)?([-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{1})\\b([-a-zA-Z0-9@:%_\\+.~#?&//=]*))";

  /** Number of matches the success pattern must produce in the fixture document. */
  private static final int EXPECTED_SUCCESS_MATCHES = 178;

  // @Param fields are populated by JMH and therefore must stay non-final.
  @Param({"JDK", "RE2J"})
  private Implementations impl;

  @Param({"true", "false"})
  private boolean binary;

  @Param({"true", "false"})
  private boolean successMatch;

  @Param({"true", "false"})
  private boolean resolveGroups;

  // Raw fixture bytes (used when binary == true) and the same content decoded as UTF-8.
  byte[] bytes = BenchmarkUtils.readResourceFile("google-maps-contact-info.html");
  private final String html = new String(bytes, StandardCharsets.UTF_8);

  private Implementations.Pattern successPattern;
  private Implementations.Pattern failurePattern;
  private Implementations.Pattern successPatternResolveGroups;
  private Implementations.Pattern failurePatternResolveGroups;

  /** Compiles all four pattern variants up front so compilation is not part of the measurement. */
  @Setup
  public void setup() {
    successPattern = Implementations.Pattern.compile(impl, SUCCESS_PATTERN_URL_STRING);
    successPatternResolveGroups =
        Implementations.Pattern.compile(
            impl, SUCCESS_PATTERN_URL_STRING, Implementations.Pattern.FLAG_RESOLVE_GROUPS_MATCH);
    failurePattern = Implementations.Pattern.compile(impl, FAILURE_PATTERN_URL_STRING);
    failurePatternResolveGroups =
        Implementations.Pattern.compile(
            impl, FAILURE_PATTERN_URL_STRING, Implementations.Pattern.FLAG_RESOLVE_GROUPS_MATCH);
  }

  /**
   * Iterates over every match in the fixture, feeding group 3 of each match to the blackhole.
   *
   * @param bh JMH sink that keeps the extracted groups from being optimized away
   * @throws AssertionError if the number of matches differs from the expected count
   */
  @Benchmark
  public void findDomains(Blackhole bh) {
    Implementations.Pattern pattern = selectPattern();
    Implementations.Matcher matcher = binary ? pattern.matcher(bytes) : pattern.matcher(html);
    int count = 0;
    while (matcher.find()) {
      bh.consume(matcher.group(3));
      count++;
    }
    int expectedMatches = successMatch ? EXPECTED_SUCCESS_MATCHES : 0;
    if (count != expectedMatches) {
      // Report the actual count as well; without it a failing run cannot be diagnosed.
      throw new AssertionError(
          "Expected " + expectedMatches + " matches but found " + count + ".");
    }
  }

  /** Picks the precompiled pattern matching the current successMatch/resolveGroups parameters. */
  private Implementations.Pattern selectPattern() {
    if (successMatch) {
      return resolveGroups ? successPatternResolveGroups : successPattern;
    }
    return resolveGroups ? failurePatternResolveGroups : failurePattern;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ public abstract static class Matcher {

public abstract String group();

// Returns the text of the given group for the current match. Each engine-specific wrapper
// implements this by forwarding to its own underlying matcher.
public abstract String group(int group);

public static class Re2Matcher extends Matcher {
private final com.google.re2j.Matcher matcher;

Expand All @@ -39,6 +41,11 @@ public boolean matches() {
public String group() {
return matcher.group();
}

// Forwards to group(int) of the wrapped com.google.re2j.Matcher and returns its result unchanged.
@Override
public String group(int group) {
return matcher.group(group);
}
}

public static class JdkMatcher extends Matcher {
Expand All @@ -62,6 +69,11 @@ public boolean matches() {
public String group() {
return matcher.group();
}

// Forwards to group(int) of this wrapper's underlying matcher and returns its result unchanged.
// NOTE(review): the matcher field is presumably a java.util.regex.Matcher; its declaration is
// not visible here, so confirm.
@Override
public String group(int group) {
return matcher.group(group);
}
}
}

Expand All @@ -71,6 +83,9 @@ public abstract static class Pattern {
// indicating that a pattern should be case-insensitive.
public static final int FLAG_CASE_INSENSITIVE = 1;

// FLAG_RESOLVE_GROUPS_MATCH enables RE2J to resolve all groups during match operations.
public static final int FLAG_RESOLVE_GROUPS_MATCH = 32;

// Compiles the pattern for the given implementation with no flags set (flags == 0) by
// delegating to the three-argument compile overload.
public static Pattern compile(Implementations impl, String pattern) {
return compile(impl, pattern, 0);
}
Expand Down Expand Up @@ -136,6 +151,9 @@ public Re2Pattern(String pattern, int flags) {
if ((flags & FLAG_CASE_INSENSITIVE) > 0) {
re2PatternFlags |= com.google.re2j.Pattern.CASE_INSENSITIVE;
}
if ((flags & FLAG_RESOLVE_GROUPS_MATCH) > 0) {
re2PatternFlags |= com.google.re2j.Pattern.RESOLVE_GROUPS_MATCH;
}
this.pattern = com.google.re2j.Pattern.compile(pattern, re2PatternFlags);
}

Expand Down
13 changes: 11 additions & 2 deletions java/com/google/re2j/Matcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -367,12 +367,21 @@ public boolean find(int start) {
private boolean genMatch(int startByte, int anchor) {
// TODO(rsc): Is matches/lookingAt supposed to reset the append or input positions?
// From the JDK docs, looks like no.
boolean ok = pattern.re2().match(matcherInput, startByte, inputLength, anchor, groups, 1);
boolean ok =
pattern
.re2()
.match(
matcherInput,
startByte,
inputLength,
anchor,
groups,
pattern.re2().resolveAllGroups ? 1 + groupCount : 1);
if (!ok) {
return false;
}
hasMatch = true;
hasGroups = false;
hasGroups = pattern.re2().resolveAllGroups;
anchorFlag = anchor;

return true;
Expand Down
23 changes: 20 additions & 3 deletions java/com/google/re2j/Pattern.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ public final class Pattern implements Serializable {
*/
public static final int LONGEST_MATCH = 16;

/**
* Flag: match and find operations resolve all groups.
*/
public static final int RESOLVE_GROUPS_MATCH = 32;

// The pattern string at construction time.
private final String pattern;

Expand Down Expand Up @@ -130,11 +135,17 @@ public static Pattern compile(String regex, int flags) {
if ((flags & MULTILINE) != 0) {
flregex = "(?m)" + flregex;
}
if ((flags & ~(MULTILINE | DOTALL | CASE_INSENSITIVE | DISABLE_UNICODE_GROUPS | LONGEST_MATCH))
if ((flags
& ~(MULTILINE
| DOTALL
| CASE_INSENSITIVE
| DISABLE_UNICODE_GROUPS
| LONGEST_MATCH
| RESOLVE_GROUPS_MATCH))
!= 0) {
throw new IllegalArgumentException(
"Flags should only be a combination "
+ "of MULTILINE, DOTALL, CASE_INSENSITIVE, DISABLE_UNICODE_GROUPS, LONGEST_MATCH");
+ "of MULTILINE, DOTALL, CASE_INSENSITIVE, DISABLE_UNICODE_GROUPS, LONGEST_MATCH, RESOLVE_GROUPS_MATCH");
}
return compile(flregex, regex, flags);
}
Expand All @@ -148,7 +159,13 @@ private static Pattern compile(String flregex, String regex, int flags) {
re2Flags &= ~RE2.UNICODE_GROUPS;
}
return new Pattern(
regex, flags, RE2.compileImpl(flregex, re2Flags, (flags & LONGEST_MATCH) != 0));
regex,
flags,
RE2.compileImpl(
flregex,
re2Flags,
((flags & LONGEST_MATCH) != 0),
(flags & RESOLVE_GROUPS_MATCH) != 0));
}

/**
Expand Down
13 changes: 8 additions & 5 deletions java/com/google/re2j/RE2.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ class RE2 {
// required at start of match
final int numSubexp;
boolean longest;
boolean resolveAllGroups;

String prefix; // required UTF-16 prefix in unanchored matches
byte[] prefixUTF8; // required UTF-8 prefix in unanchored matches
Expand All @@ -135,12 +136,13 @@ class RE2 {
this.prefixRune = re2.prefixRune;
}

private RE2(String expr, Prog prog, int numSubexp, boolean longest) {
// Builds an RE2 from an already-compiled program.
//   expr             - the expression string, stored as-is
//   prog             - the compiled program; also supplies the start condition via startCond()
//   numSubexp        - stored as-is in the numSubexp field
//   longest          - stored as-is in the longest field
//   resolveAllGroups - stored as-is in the resolveAllGroups field
// The prefix-related fields are not assigned here.
private RE2(String expr, Prog prog, int numSubexp, boolean longest, boolean resolveAllGroups) {
this.expr = expr;
this.prog = prog;
this.numSubexp = numSubexp;
this.cond = prog.startCond();
this.longest = longest;
this.resolveAllGroups = resolveAllGroups;
}

/**
Expand All @@ -155,7 +157,7 @@ private RE2(String expr, Prog prog, int numSubexp, boolean longest) {
* backtracking. For POSIX leftmost-longest matching, see {@link #compilePOSIX}.
*/
static RE2 compile(String expr) throws PatternSyntaxException {
return compileImpl(expr, PERL, /*longest=*/ false);
return compileImpl(expr, PERL, /*longest=*/ false, false);
}

/**
Expand All @@ -177,16 +179,17 @@ static RE2 compile(String expr) throws PatternSyntaxException {
* even well-defined. See http://swtch.com/~rsc/regexp/regexp2.html#posix
*/
static RE2 compilePOSIX(String expr) throws PatternSyntaxException {
return compileImpl(expr, POSIX, /*longest=*/ true);
return compileImpl(expr, POSIX, /*longest=*/ true, false);
}

// Exposed to ExecTests.
static RE2 compileImpl(String expr, int mode, boolean longest) throws PatternSyntaxException {
static RE2 compileImpl(String expr, int mode, boolean longest, boolean resolveAllGroups)
throws PatternSyntaxException {
Regexp re = Parser.parse(expr, mode);
int maxCap = re.maxCap(); // (may shrink during simplify)
re = Simplify.simplify(re);
Prog prog = Compiler.compileRegexp(re);
RE2 re2 = new RE2(expr, prog, maxCap, longest);
RE2 re2 = new RE2(expr, prog, maxCap, longest, resolveAllGroups);
StringBuilder prefixBuilder = new StringBuilder();
re2.prefixComplete = prog.prefix(prefixBuilder);
re2.prefix = prefixBuilder.toString();
Expand Down
2 changes: 1 addition & 1 deletion javatests/com/google/re2j/ExecTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,7 @@ private void testFowler(String file) throws IOException {

RE2 re = null;
try {
re = RE2.compileImpl(pattern, flags, true);
re = RE2.compileImpl(pattern, flags, true, false);
} catch (PatternSyntaxException e) {
if (shouldCompileMatch[0]) {
System.err.format("%s:%d: %s did not compile\n", file, lineno, pattern);
Expand Down
53 changes: 53 additions & 0 deletions javatests/com/google/re2j/MatcherTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,27 @@ public void testDocumentedExample() {
assertFalse(m.find());
}

@Test
public void testDocumentedExampleWithResolveGroups() {
  // Walks "by, band, banana" with a pattern compiled under RESOLVE_GROUPS_MATCH and checks every
  // group of each successive match.
  Pattern pattern = Pattern.compile("b(an)*(.)", Pattern.RESOLVE_GROUPS_MATCH);
  Matcher matcher = pattern.matcher("by, band, banana");

  // Rewind after lookingAt() so the find() calls below start from the beginning of the input.
  assertTrue(matcher.lookingAt());
  matcher.reset();

  // First match: "by" -- the repeated group did not participate.
  assertTrue(matcher.find());
  assertEquals("by", matcher.group(0));
  assertNull(matcher.group(1));
  assertEquals("y", matcher.group(2));

  // Second match: "band".
  assertTrue(matcher.find());
  assertEquals("band", matcher.group(0));
  assertEquals("an", matcher.group(1));
  assertEquals("d", matcher.group(2));

  // Third match: "banana".
  assertTrue(matcher.find());
  assertEquals("banana", matcher.group(0));
  assertEquals("an", matcher.group(1));
  assertEquals("a", matcher.group(2));

  // Input exhausted.
  assertFalse(matcher.find());
}

@Test
public void testMutableCharSequence() {
Pattern p = Pattern.compile("b(an)*(.)");
Expand Down Expand Up @@ -486,6 +507,38 @@ public void testNamedGroups() {
}
}

@Test
public void testNamedGroupsWithResolveGroups() {
  // Named-group lookups (text, start, end) on a pattern compiled under RESOLVE_GROUPS_MATCH.
  String regex =
      "(?P<baz>f(?P<foo>b*a(?P<another>r+)){0,10})" + "(?P<bag>bag)?(?P<nomatch>zzz)?";
  Matcher matcher = Pattern.compile(regex, Pattern.RESOLVE_GROUPS_MATCH).matcher("fbbarrrrrbag");
  assertTrue(matcher.matches());

  // Captured text of the nested groups.
  assertEquals("fbbarrrrr", matcher.group("baz"));
  assertEquals("bbarrrrr", matcher.group("foo"));
  assertEquals("rrrrr", matcher.group("another"));

  // Offsets of the nested groups.
  assertEquals(0, matcher.start("baz"));
  assertEquals(1, matcher.start("foo"));
  assertEquals(4, matcher.start("another"));
  assertEquals(9, matcher.end("baz"));
  assertEquals(9, matcher.end("foo"));

  // Optional group that participated in the match.
  assertEquals("bag", matcher.group("bag"));
  assertEquals(9, matcher.start("bag"));
  assertEquals(12, matcher.end("bag"));

  // Optional group that did not participate: null text and -1 offsets.
  assertNull(matcher.group("nomatch"));
  assertEquals(-1, matcher.start("nomatch"));
  assertEquals(-1, matcher.end("nomatch"));

  // Numbered and named references both work in a replacement string.
  assertEquals("whatbbarrrrreverbag", appendReplacement(matcher, "what$2ever${bag}"));

  // A group name that is not in the pattern must be rejected.
  try {
    matcher.group("nonexistent");
    fail("Should have thrown IllegalArgumentException");
  } catch (IllegalArgumentException expected) {
    // Expected
  }
}

private String appendReplacement(Matcher m, String replacement) {
StringBuilder b = new StringBuilder();
m.appendReplacement(b, replacement);
Expand Down