Skip to content

Commit

Permalink
regmatch: Use new utf8_to_uv; not utf8_to_uvchr_buf
Browse files Browse the repository at this point in the history
This is a subtle bug fix for when the input is malformed UTF-8.  We say
we don't support malformed input, but this commit is a step towards
better protecting against that eventuality.

Prior to this commit, some patterns would exhibit different matching
behavior on malformed input depending on whether utf8 warnings were
enabled.

This is because utf8_to_uvchr_buf() returns NUL if utf8 warnings are on,
and the REPLACEMENT CHARACTER if they are off.  If the match criteria
accept one but not the other, the behavior would differ.

Now, when a malformed input character is found, matching stops
immediately and the attempt is not considered a match.
  • Loading branch information
khwilliamson committed Dec 5, 2024
1 parent 00f06a3 commit 702e074
Showing 1 changed file with 27 additions and 36 deletions.
63 changes: 27 additions & 36 deletions regexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -6577,6 +6577,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)

switch (state_num) {
SV * anyofh_list;
UV cp; /* scratch */

case SBOL: /* /^../ and /\A../ */
if (locinput == reginfo->strbeg)
Expand Down Expand Up @@ -6947,10 +6948,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)

while (chars) {
if (utf8_target) {
/* XXX This assumes the length is well-formed, as
* does the UTF8SKIP below */
uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
uniflags);
/* XXX This assumes the length is well-formed */
(void) utf8_to_uv_flags((U8*)uc, uc + UTF8_MAXLEN,
&uvc, &len, uniflags);
uc += len;
}
else {
Expand All @@ -6962,8 +6962,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
while (foldlen) {
if (!--chars)
break;
uvc = utf8n_to_uvchr(uscan, foldlen, &len,
uniflags);
(void) utf8_to_uv_flags(uscan, uscan + foldlen,
&uvc, &len, uniflags);
uscan += len;
foldlen -= len;
}
Expand Down Expand Up @@ -7593,10 +7593,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
|| NEXTCHR_IS_EOS
|| ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
|| ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
|| ! _invlist_contains_cp(anyofh_list,
utf8_to_uvchr_buf((U8 *) locinput,
(U8 *) loceol,
NULL)))
|| ! utf8_to_uv((U8 *) locinput, (U8 *) loceol, &cp, NULL)
|| ! _invlist_contains_cp(anyofh_list, cp))
{
sayNO;
}
Expand All @@ -7608,10 +7606,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
|| NEXTCHR_IS_EOS
|| ANYOF_FLAGS(scan) != (U8) *locinput
|| ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
|| ! _invlist_contains_cp(anyofh_list,
utf8_to_uvchr_buf((U8 *) locinput,
(U8 *) loceol,
NULL)))
|| ! utf8_to_uv((U8 *) locinput, (U8 *) loceol, &cp, NULL)
|| ! _invlist_contains_cp(anyofh_list, cp))
{
sayNO;
}
Expand All @@ -7638,10 +7634,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
|| ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
|| ! _invlist_contains_cp(anyofh_list,
utf8_to_uvchr_buf((U8 *) locinput,
(U8 *) loceol,
NULL)))
|| ! utf8_to_uv((U8 *) locinput, (U8 *) loceol, &cp, NULL)
|| ! _invlist_contains_cp(anyofh_list, cp))
{
sayNO;
}
Expand All @@ -7654,10 +7648,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
|| loceol - locinput < FLAGS(scan)
|| memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
|| ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
|| ! _invlist_contains_cp(anyofh_list,
utf8_to_uvchr_buf((U8 *) locinput,
(U8 *) loceol,
NULL)))
|| ! utf8_to_uv((U8 *) locinput, (U8 *) loceol, &cp, NULL)
|| ! _invlist_contains_cp(anyofh_list, cp))
{
sayNO;
}
Expand All @@ -7671,10 +7663,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)

if (utf8_target) {
if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
|| ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
(U8 *) reginfo->strend,
NULL),
ANYOFRbase(scan), ANYOFRdelta(scan)))
|| ! utf8_to_uv((U8 *) locinput, (U8 *) reginfo->strend,
&cp, NULL)
|| ! withinCOUNT(cp, ANYOFRbase(scan), ANYOFRdelta(scan)))
{
sayNO;
}
Expand All @@ -7696,10 +7687,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)

if (utf8_target) {
if ( ANYOF_FLAGS(scan) != (U8) *locinput
|| ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
(U8 *) reginfo->strend,
NULL),
ANYOFRbase(scan), ANYOFRdelta(scan)))
|| ! utf8_to_uv((U8 *) locinput, (U8 *) reginfo->strend,
&cp, NULL)
|| ! withinCOUNT(cp, ANYOFRbase(scan), ANYOFRdelta(scan)))
{
sayNO;
}
Expand Down Expand Up @@ -7840,12 +7830,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
classnum = (char_class_number_) FLAGS(scan);
switch (classnum) {
default:
if (! (to_complement
^ cBOOL(_invlist_contains_cp(
PL_XPosix_ptrs[classnum],
utf8_to_uvchr_buf((U8 *) locinput,
(U8 *) reginfo->strend,
NULL)))))
if ( ! utf8_to_uv((U8 *) locinput,
(U8 *) reginfo->strend,
&cp, NULL)
|| ! (to_complement
^ cBOOL(_invlist_contains_cp(
PL_XPosix_ptrs[classnum],
cp))))
{
sayNO;
}
Expand Down

0 comments on commit 702e074

Please sign in to comment.