Skip to content

Commit

Permalink
Fix more ancient bugs around Latin-1 handling.
Browse files Browse the repository at this point in the history
It turned out that case folding assumed UTF-8 mode, so
we would fold, say, 0xD1 to 0xF1 even in Latin-1 mode.

Fixes #477.

Change-Id: I73aa5c8e33ee0c6041c54e3a7268635915960f64
Reviewed-on: https://code-review.googlesource.com/c/re2/+/62714
Reviewed-by: Alex Chernyakhovsky <[email protected]>
Reviewed-by: Paul Wankadia <[email protected]>
  • Loading branch information
junyer committed Feb 19, 2024
1 parent 0ff0fab commit f9550c3
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 23 deletions.
60 changes: 44 additions & 16 deletions re2/parse.cc
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,20 @@ Rune CycleFoldRune(Rune r) {
return ApplyFold(f, r);
}

// Add lo-hi to the class, along with their fold-equivalent characters.
static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) {
while (lo <= hi) {
cc->AddRange(lo, lo);
if ('A' <= lo && lo <= 'Z') {
cc->AddRange(lo - 'A' + 'a', lo - 'A' + 'a');
}
if ('a' <= lo && lo <= 'z') {
cc->AddRange(lo - 'a' + 'A', lo - 'a' + 'A');
}
lo++;
}
}

// Add lo-hi to the class, along with their fold-equivalent characters.
// If lo-hi is already in the class, assume that the fold-equivalent
// chars are there too, so there's no work to do.
Expand Down Expand Up @@ -394,17 +408,26 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
// Pushes the literal rune r onto the stack.
bool Regexp::ParseState::PushLiteral(Rune r) {
// Do case folding if needed.
if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {
Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
re->ccb_ = new CharClassBuilder;
Rune r1 = r;
do {
if (!(flags_ & NeverNL) || r != '\n') {
re->ccb_->AddRange(r, r);
}
r = CycleFoldRune(r);
} while (r != r1);
return PushRegexp(re);
if (flags_ & FoldCase) {
if (flags_ & Latin1 && (('A' <= r && r <= 'Z') ||
('a' <= r && r <= 'z'))) {
Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
re->ccb_ = new CharClassBuilder;
AddFoldedRangeLatin1(re->ccb_, r, r);
return PushRegexp(re);
}
if (!(flags_ & Latin1) && CycleFoldRune(r) != r) {
Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
re->ccb_ = new CharClassBuilder;
Rune r1 = r;
do {
if (!(flags_ & NeverNL) || r != '\n') {
re->ccb_->AddRange(r, r);
}
r = CycleFoldRune(r);
} while (r != r1);
return PushRegexp(re);
}
}

// Exclude newline if applicable.
Expand Down Expand Up @@ -1176,7 +1199,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
if (re->op() == kRegexpCharClass) {
CharClass* cc = re->cc();
for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
ccb.AddRange(it->lo, it->hi);
ccb.AddRangeFlags(it->lo, it->hi, re->parse_flags());
} else if (re->op() == kRegexpLiteral) {
if (re->parse_flags() & Regexp::FoldCase) {
// AddFoldedRange() can terminate prematurely if the character class
Expand All @@ -1195,7 +1218,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
}
re->Decref();
}
Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags);
Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags & ~Regexp::FoldCase);
splices->emplace_back(re, sub + start, i - start);
}

Expand Down Expand Up @@ -1623,10 +1646,15 @@ void CharClassBuilder::AddRangeFlags(
}

// If folding case, add fold-equivalent characters too.
if (parse_flags & Regexp::FoldCase)
AddFoldedRange(this, lo, hi, 0);
else
if (parse_flags & Regexp::FoldCase) {
if (parse_flags & Regexp::Latin1) {
AddFoldedRangeLatin1(this, lo, hi);
} else {
AddFoldedRange(this, lo, hi, 0);
}
} else {
AddRange(lo, hi);
}
}

// Look for a group with the given name.
Expand Down
20 changes: 14 additions & 6 deletions re2/testing/dump.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,25 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) {
break;
case kRegexpLiteral: {
Rune r = re->rune();
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
s->append(buf);
if (re->parse_flags() & Regexp::Latin1) {
s->push_back(r);
} else {
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
s->append(buf);
}
break;
}
case kRegexpLiteralString:
for (int i = 0; i < re->nrunes(); i++) {
Rune r = re->runes()[i];
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
s->append(buf);
if (re->parse_flags() & Regexp::Latin1) {
s->push_back(r);
} else {
char buf[UTFmax+1];
buf[runetochar(buf, &r)] = 0;
s->append(buf);
}
}
break;
case kRegexpConcat:
Expand Down
25 changes: 24 additions & 1 deletion re2/testing/parse_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,29 @@ static Test tests[] = {
// Bug in Regexp::ToString() that emitted [^], which
// would (obviously) fail to parse when fed back in.
{ "[\\s\\S]", "cc{0-0x10ffff}" },

// As per https://github.com/google/re2/issues/477,
// there were long-standing bugs involving Latin-1.
// Here, we exercise it WITHOUT case folding...
{ "\xa5\x64\xd1", "str{\xa5""d\xd1}", Regexp::Latin1 },
{ "\xa5\xd1\x64", "str{\xa5\xd1""d}", Regexp::Latin1 },
{ "\xa5\x64[\xd1\xd2]", "cat{str{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 },
{ "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}lit{d}}", Regexp::Latin1 },
{ "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
{ "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
{ "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
{ "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
// Here, we exercise it WITH case folding...
// 0x64 should fold to 0x44, but neither 0xD1 nor 0xD2
// should fold to 0xF1 and 0xF2, respectively.
{ "\xa5\x64\xd1", "strfold{\xa5""d\xd1}", Regexp::Latin1 | Regexp::FoldCase },
{ "\xa5\xd1\x64", "strfold{\xa5\xd1""d}", Regexp::Latin1 | Regexp::FoldCase },
{ "\xa5\x64[\xd1\xd2]", "cat{strfold{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
{ "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}litfold{d}}", Regexp::Latin1 | Regexp::FoldCase },
{ "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
{ "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
{ "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
{ "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
};

bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
Expand Down Expand Up @@ -492,7 +515,7 @@ TEST(TestToString, EquivalentParse) {
// << " t=" << t << " regexp=" << tests[i].regexp;

// Test that if we parse the new regexp we get the same structure.
Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
Regexp* nre = Regexp::Parse(t, f, &status);
ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
std::string ss = nre->Dump();
std::string tt = nre->ToString();
Expand Down

0 comments on commit f9550c3

Please sign in to comment.