Skip to content

Commit

Permalink
find_by_class: Use new utf8_to_uv; not utf8_to_uvchr_buf
Browse files Browse the repository at this point in the history
find_by_class() is used in pattern matching.

This is a subtle bug fix for when the input is malformed UTF-8.  We say we
don't support malformed input, but this commit is a step towards better
protecting against that eventuality.

Prior to this commit, some patterns that use find_by_class() would exhibit
different matching behavior on malformed input depending on whether utf8
warnings were enabled or not.

This is because utf8_to_uvchr_buf() returns NUL if utf8 warnings are on,
and the REPLACEMENT CHARACTER if they are off.  If the match criteria
accept one but not the other, the behavior would differ.

Now, malformed input never matches a class.
  • Loading branch information
khwilliamson committed Dec 5, 2024
1 parent 00f06a3 commit cf8cb33
Showing 1 changed file with 18 additions and 29 deletions.
47 changes: 18 additions & 29 deletions regexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -2303,6 +2303,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
* ('p8' and 'pb'. */
switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
SV * anyofh_list;
UV cp; /* scratch */

case ANYOFPOSIXL_t8_pb:
case ANYOFPOSIXL_t8_p8:
Expand Down Expand Up @@ -2398,10 +2399,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
anyofh_list = GET_ANYOFH_INVLIST(prog, c);
REXEC_FBC_UTF8_CLASS_SCAN(
( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
&& _invlist_contains_cp(anyofh_list,
utf8_to_uvchr_buf((U8 *) s,
(U8 *) strend,
NULL))));
&& utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
&& _invlist_contains_cp(anyofh_list, cp)));
break;

case ANYOFHb_t8_pb:
Expand All @@ -2412,10 +2411,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,

anyofh_list = GET_ANYOFH_INVLIST(prog, c);
REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
_invlist_contains_cp(anyofh_list,
utf8_to_uvchr_buf((U8 *) s,
(U8 *) strend,
NULL)));
( utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
&& _invlist_contains_cp(anyofh_list, cp)));
}
break;

Expand All @@ -2440,10 +2437,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
( inRANGE(NATIVE_UTF8_TO_I8(*s),
LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
&& _invlist_contains_cp(anyofh_list,
utf8_to_uvchr_buf((U8 *) s,
(U8 *) strend,
NULL))));
&& utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
&& _invlist_contains_cp(anyofh_list, cp)));
break;

case ANYOFHs_t8_pb:
Expand All @@ -2453,10 +2448,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
((struct regnode_anyofhs *) c)->string,
/* Note FLAGS is the string length in this regnode */
((struct regnode_anyofhs *) c)->string + FLAGS(c),
_invlist_contains_cp(anyofh_list,
utf8_to_uvchr_buf((U8 *) s,
(U8 *) strend,
NULL)));
( utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
&& _invlist_contains_cp(anyofh_list, cp)));
break;

case ANYOFR_tb_pb:
Expand All @@ -2469,10 +2462,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
case ANYOFR_t8_p8:
REXEC_FBC_UTF8_CLASS_SCAN(
( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
&& withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
(U8 *) strend,
NULL),
ANYOFRbase(c), ANYOFRdelta(c))));
&& utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
&& withinCOUNT(cp, ANYOFRbase(c), ANYOFRdelta(c))));
break;

case ANYOFRb_tb_pb:
Expand All @@ -2487,10 +2478,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
U8 first_byte = FLAGS(c);

REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
(U8 *) strend,
NULL),
ANYOFRbase(c), ANYOFRdelta(c)));
( utf8_to_uv((U8 *) s, (U8 *) strend, &cp, NULL)
&& withinCOUNT(cp, ANYOFRbase(c), ANYOFRdelta(c))));
}
break;

Expand Down Expand Up @@ -3201,11 +3190,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
switch (classnum) {
default:
REXEC_FBC_UTF8_CLASS_SCAN(
to_complement ^ cBOOL(_invlist_contains_cp(
PL_XPosix_ptrs[classnum],
utf8_to_uvchr_buf((U8 *) s,
(U8 *) strend,
NULL))));
to_complement ^ cBOOL(
utf8_to_uv((U8 *) s, (U8 *) strend,
&cp, NULL)
&& _invlist_contains_cp(
PL_XPosix_ptrs[classnum], cp)));
break;

case CC_ENUM_SPACE_:
Expand Down

0 comments on commit cf8cb33

Please sign in to comment.