diff --git a/doc/syntax.html b/doc/syntax.html
index eed4fd2a7..6cbda140e 100644
--- a/doc/syntax.html
+++ b/doc/syntax.html
@@ -62,7 +62,7 @@
RE2 regular expression syntax reference
Grouping: |
(re) | numbered capturing group (submatch) |
(?P<name>re) | named & numbered capturing group (submatch) |
-(?<name>re) | named & numbered capturing group (submatch) |
+(?<name>re) | named & numbered capturing group (submatch) |
(?'name're) | named & numbered capturing group (submatch) |
(?:re) | non-capturing group |
(?flags) | set flags within current group; non-capturing |
diff --git a/doc/syntax.txt b/doc/syntax.txt
index 5bb2067f9..6070efd96 100644
--- a/doc/syntax.txt
+++ b/doc/syntax.txt
@@ -51,7 +51,7 @@ x{n}+ exactly «n» «x», possessive NOT SUPPORTED
Grouping:
(re) numbered capturing group (submatch)
(?P<name>re) named & numbered capturing group (submatch)
-(?<name>re) named & numbered capturing group (submatch) NOT SUPPORTED
+(?<name>re) named & numbered capturing group (submatch)
(?'name're) named & numbered capturing group (submatch) NOT SUPPORTED
(?:re) non-capturing group
(?flags) set flags within current group; non-capturing
diff --git a/re2/parse.cc b/re2/parse.cc
index 67a485791..7b1510dda 100644
--- a/re2/parse.cc
+++ b/re2/parse.cc
@@ -2059,8 +2059,6 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
return false;
}
- t.remove_prefix(2); // "(?"
-
// Check for named captures, first introduced in Python's regexp library.
// As usual, there are three slightly different syntaxes:
//
@@ -2074,22 +2072,23 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
// support all three as well. EcmaScript 4 uses only the Python form.
//
// In both the open source world (via Code Search) and the
- // Google source tree, (?Pname) is the dominant form,
- // so that's the one we implement. One is enough.
- if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
+ // Google source tree, (?P<name>expr) and (?<name>expr) are the
+ // dominant forms of named captures and both are supported.
+ if ((t.size() > 4 && t[2] == 'P' && t[3] == '<') ||
+ (t.size() > 3 && t[2] == '<')) {
// Pull out name.
- size_t end = t.find('>', 2);
+ size_t begin = t[2] == 'P' ? 4 : 3;
+ size_t end = t.find('>', begin);
if (end == absl::string_view::npos) {
- if (!IsValidUTF8(*s, status_))
+ if (!IsValidUTF8(t, status_))
return false;
status_->set_code(kRegexpBadNamedCapture);
- status_->set_error_arg(*s);
+ status_->set_error_arg(t);
return false;
}
- // t is "P...", t[end] == '>'
- absl::string_view capture(t.data()-2, end+3); // "(?P"
- absl::string_view name(t.data()+2, end-2); // "name"
+ absl::string_view capture(t.data(), end+1);
+ absl::string_view name(t.data()+begin, end-begin);
if (!IsValidUTF8(name, status_))
return false;
if (!IsValidCaptureName(name)) {
@@ -2103,11 +2102,12 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
return false;
}
- s->remove_prefix(
- static_cast(capture.data() + capture.size() - s->data()));
+ s->remove_prefix(capture.size());
return true;
}
+ t.remove_prefix(2); // "(?"
+
bool negated = false;
bool sawflags = false;
int nflags = flags_;
diff --git a/re2/regexp.cc b/re2/regexp.cc
index 1614bb0fe..4ea81cfcd 100644
--- a/re2/regexp.cc
+++ b/re2/regexp.cc
@@ -400,7 +400,13 @@ static bool TopEqual(Regexp* a, Regexp* b) {
a->max() == b->max();
case kRegexpCapture:
- return a->cap() == b->cap() && a->name() == b->name();
+ if (a->name() == NULL || b->name() == NULL) {
+ // One pointer is null, so the other pointer should also be null.
+ return a->cap() == b->cap() && a->name() == b->name();
+ } else {
+ // Neither pointer is null, so compare the pointees for equality.
+ return a->cap() == b->cap() && *a->name() == *b->name();
+ }
case kRegexpHaveMatch:
return a->match_id() == b->match_id();
diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc
index 9d3954424..0ee5561e9 100644
--- a/re2/testing/parse_test.cc
+++ b/re2/testing/parse_test.cc
@@ -166,6 +166,8 @@ static Test tests[] = {
// Test named captures
{ "(?Pa)", "cap{name:lit{a}}" },
{ "(?P<中文>a)", "cap{中文:lit{a}}" },
+ { "(?a)", "cap{name:lit{a}}" },
+ { "(?<中文>a)", "cap{中文:lit{a}}" },
// Case-folded literals
{ "[Aa]", "litfold{a}" },
@@ -396,6 +398,11 @@ const char* badtests[] = {
"(?Pa)",
"(?P<>a)",
+ "(?a",
+ "(?",
+ "(?a)",
+ "(?<>a)",
"[a-Z]",
"(?i)[a-Z]",
"a{100000}",
@@ -416,6 +423,7 @@ const char* only_perl[] = {
"\\Q\\\\\\\\\\E",
"(?:a)",
"(?Pa)",
+ "(?a)",
};
// Valid in POSIX, bad in Perl.
@@ -505,6 +513,16 @@ TEST(NamedCaptures, ErrorArgs) {
EXPECT_TRUE(re == NULL);
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
EXPECT_EQ(status.error_arg(), "(?P");
+
+ re = Regexp::Parse("test(?z)", Regexp::LikePerl, &status);
+ EXPECT_TRUE(re == NULL);
+ EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
+ EXPECT_EQ(status.error_arg(), "(?");
}
} // namespace re2