diff --git a/embed.fnc b/embed.fnc index 7792a28e7a3c..76278cae7d23 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1241,7 +1241,7 @@ p |void |force_locale_unlock Cp |void |force_out_malformed_utf8_message_ \ |NN const U8 * const p \ |NN const U8 * const e \ - |const U32 flags \ + |U32 flags \ |const bool die_here Adfpv |char * |form |NN const char *pat \ |... @@ -3757,19 +3757,19 @@ ATdmp |bool |utf8_to_uv_errors \ |NN const U8 * const e \ |NN UV *cp_p \ |NULLOK Size_t *advance_p \ - |const U32 flags \ + |U32 flags \ |NULLOK U32 *errors ATdmp |bool |utf8_to_uv_flags \ |NN const U8 * const s \ |NN const U8 * const e \ |NN UV *cp_p \ |NULLOK Size_t *advance_p \ - |const U32 flag + |U32 flags ATdip |bool |utf8_to_uv_msgs|NN const U8 * const s0 \ |NN const U8 *e \ |NN UV *cp_p \ |NULLOK Size_t *advance_p \ - |const U32 flags \ + |U32 flags \ |NULLOK U32 *errors \ |NULLOK AV **msgs CTp |bool |utf8_to_uv_msgs_helper_ \ @@ -3777,9 +3777,13 @@ CTp |bool |utf8_to_uv_msgs_helper_ \ |NN const U8 * const e \ |NN UV *cp_p \ |NULLOK Size_t *advance_p \ - |const U32 flags \ + |U32 flags \ |NULLOK U32 *errors \ |NULLOK AV **msgs +ATdip |UV |utf8_to_uv_or_die \ + |NN const U8 * const s \ + |NN const U8 *e \ + |NULLOK Size_t *advance_p CDbdp |UV |utf8_to_uvuni |NN const U8 *s \ |NULLOK STRLEN *retlen : Used in perly.y diff --git a/embed.h b/embed.h index dfcc4f4881e6..c018648047dd 100644 --- a/embed.h +++ b/embed.h @@ -870,6 +870,7 @@ # define utf8_to_uv_flags Perl_utf8_to_uv_flags # define utf8_to_uv_msgs Perl_utf8_to_uv_msgs # define utf8_to_uv_msgs_helper_ Perl_utf8_to_uv_msgs_helper_ +# define utf8_to_uv_or_die Perl_utf8_to_uv_or_die # define utf8n_to_uvchr Perl_utf8n_to_uvchr # define utf8n_to_uvchr_error Perl_utf8n_to_uvchr_error # define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs diff --git a/inline.h b/inline.h index 3a2618431702..5c2856060870 100644 --- a/inline.h +++ b/inline.h @@ -3053,7 +3053,7 @@ Perl_utf8_to_uv_msgs(const U8 * const s0, const U8 * const e, UV * cp_p, Size_t *advance_p, - const U32 flags, + U32 flags, U32 * errors, AV ** msgs) { @@ -3138,11 +3138,21 @@ Perl_utf8_to_uv_msgs(const U8 * const s0, return utf8_to_uv_msgs_helper_(s0, e, cp_p, advance_p, flags, errors, msgs); } +PERL_STATIC_INLINE UV +Perl_utf8_to_uv_or_die(const U8 *s, const U8 *e, STRLEN *advance_p) +{ + PERL_ARGS_ASSERT_UTF8_TO_UV_OR_DIE; + + UV cp; + (void) utf8_to_uv_flags(s, e, &cp, advance_p, UTF8_DIE_IF_MALFORMED); + return cp; +} + PERL_STATIC_INLINE UV Perl_utf8n_to_uvchr_msgs(const U8 * const s0, STRLEN curlen, STRLEN *retlen, - const U32 flags, + U32 flags, U32 * errors, AV ** msgs) { diff --git a/pod/perldelta.pod b/pod/perldelta.pod index d53738408312..f6835e32c945 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -436,9 +436,10 @@ New API functions are introduced to convert strings encoded in UTF-8 to their ordinal code point equivalent. These are safe to use by default, and generally more convenient to use than the existing ones. -L> replaces L> (which is -retained for backwards compatibility), but you should convert to use the -new form, as likely you aren't using the old one safely. +L> and L> replace +L> (which is retained for backwards +compatibility), but you should convert to use the new forms, as likely +you aren't using the old one safely. To convert in the opposite direction, you can now use L>. This is not a new function, but a new synonym diff --git a/proto.h b/proto.h index 32e8d48f4fa7..fe1becffba3b 100644 --- a/proto.h +++ b/proto.h @@ -1187,7 +1187,7 @@ Perl_force_locale_unlock(pTHX) #define PERL_ARGS_ASSERT_FORCE_LOCALE_UNLOCK PERL_CALLCONV void -Perl_force_out_malformed_utf8_message_(pTHX_ const U8 * const p, const U8 * const e, const U32 flags, const bool die_here); +Perl_force_out_malformed_utf8_message_(pTHX_ const U8 * const p, const U8 * const e, U32 flags, const bool die_here); #define PERL_ARGS_ASSERT_FORCE_OUT_MALFORMED_UTF8_MESSAGE_ \ assert(p); assert(e) @@ -5372,13 +5372,13 @@ Perl_utf8_to_utf16_base(pTHX_ U8 *s, U8 *d, Size_t bytelen, Size_t *newlen, cons Perl_utf8_to_uv(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p); */ /* PERL_CALLCONV bool -Perl_utf8_to_uv_errors(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p, const U32 flags, U32 *errors); */ +Perl_utf8_to_uv_errors(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p, U32 flags, U32 *errors); */ /* PERL_CALLCONV bool -Perl_utf8_to_uv_flags(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p, const U32 flag); */ +Perl_utf8_to_uv_flags(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p, U32 flags); */ PERL_CALLCONV bool -Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, const U8 * const e, UV *cp_p, Size_t *advance_p, const U32 flags, U32 *errors, AV **msgs); +Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, const U8 * const e, UV *cp_p, Size_t *advance_p, U32 flags, U32 *errors, AV **msgs); #define PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_ \ assert(s0); assert(e); assert(cp_p) @@ -10030,10 +10030,15 @@ Perl_utf8_hop_overshoot(const U8 *s, SSize_t off, const U8 * const start, const assert(s); assert(start); assert(end) PERL_STATIC_INLINE bool -Perl_utf8_to_uv_msgs(const U8 * const s0, const U8 *e, UV *cp_p, Size_t *advance_p, const U32 flags, U32 *errors, AV **msgs); +Perl_utf8_to_uv_msgs(const U8 * const s0, const U8 *e, UV *cp_p, Size_t *advance_p, U32 flags, U32 *errors, AV **msgs); # define PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS \ assert(s0); assert(e); assert(cp_p) +PERL_STATIC_INLINE UV +Perl_utf8_to_uv_or_die(const U8 * const s, const U8 *e, Size_t *advance_p); +# define PERL_ARGS_ASSERT_UTF8_TO_UV_OR_DIE \ + assert(s); assert(e) + PERL_STATIC_INLINE UV Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen); # define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF \ diff --git a/utf8.c b/utf8.c index 1da50e1bc39d..de2149fc70ad 100644 --- a/utf8.c +++ b/utf8.c @@ -51,7 +51,7 @@ Perl_force_out_malformed_utf8_message_(pTHX_ const U8 *const p, /* First byte in UTF-8 sequence */ const U8 * const e, /* Final byte in sequence (may include multiple chars */ - const U32 flags, /* Flags to pass to utf8_to_uv(), + U32 flags, /* Flags to pass to utf8_to_uv(), usually 0, or some DISALLOW flags */ const bool die_here) /* If TRUE, this function does not return */ { @@ -70,29 +70,17 @@ Perl_force_out_malformed_utf8_message_(pTHX_ * flexibility is here to return to the caller so they can finish up and * die themselves */ U32 errors; + UV dummy; - ENTER; - SAVEI8(PL_dowarn); - SAVESPTR(PL_curcop); - - PL_dowarn = G_WARN_ALL_ON|G_WARN_ON; - if (PL_curcop) { - SAVECURCOPWARNINGS(); - PL_curcop->cop_warnings = pWARN_ALL; - } - - (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors); - - LEAVE; + flags &= ~UTF8_CHECK_ONLY; + flags |= (die_here) ? UTF8_DIE_IF_MALFORMED + : UTF8_FORCE_WARN_IF_MALFORMED; + (void) utf8_to_uv_errors(p, e, &dummy, NULL, flags, &errors); if (! errors) { Perl_croak(aTHX_ "panic: force_out_malformed_utf8_message_ should" " be called only when there are errors found"); } - - if (die_here) { - Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); - } } STATIC HV * @@ -1015,6 +1003,7 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, =for apidoc_item extended_utf8_to_uv =for apidoc_item strict_utf8_to_uv =for apidoc_item c9strict_utf8_to_uv +=for apidoc_item utf8_to_uv_or_die =for apidoc_item utf8_to_uvchr_buf =for apidoc_item utf8_to_uvchr @@ -1045,13 +1034,15 @@ these. Private use characters and those code points yet to be assigned to a particular character are never considered problematic. Additionally, most of the functions accept non-Unicode code points, those starting at 0x110000. +There are two sets of these functions: + =over 4 =item C forms Almost all code should use only C, C, C, or C. The other functions are -either the problematic old form, or are for highly specialized uses. +either the problematic old form, or are for specialized uses. These four functions each return C if the sequence of bytes starting at C form a complete, legal UTF-8 (or UTF-EBCDIC) sequence for a code point. @@ -1087,16 +1078,17 @@ instead uses the exchangeable definition given by Unicode's Corregendum #9, which accepts non-character code points while still rejecting surrogates. It does not raise a warning when rejecting. -=item * C +=item * C accepts all syntactically valid UTF-8, as extended by Perl to allow 64-bit code points to be encoded. -=back +C is merely a synonym for C. Use this form +to draw attention to the fact that it accepts any code point. But since +Perl programs traditionally do this by default, plain C is the form +most often used. -C is merely a synonym for C, whose name -explicitly indicates that it accepts Perl-extended UTF-8. Perl programs -traditionally handle this by default. +=back Whenever syntactically invalid input is rejected, an explanatory warning message is raised, unless C warnings (or the appropriate subcategory) are @@ -1108,6 +1100,11 @@ sequence. You can use that function or C> to exert more control over the input that is considered acceptable, and the warnings that are raised. +C has a simpler interface, for use when any errors are +fatal. It returns the code point instead of using an output parameter, and +throws an exception with any errors found where the other functions here would +have returned false. + Often, C is an arbitrarily long string containing the UTF-8 representations of many code points in a row, and these functions are called in the course of parsing C to find all those code points. @@ -1116,8 +1113,8 @@ If your code doesn't know how to deal with illegal input, as would be typical of a low level routine, the loop could look like: while (s < e) { - UV cp; Size_t advance; + UV cp; (void) utf8_to_uv(s, e, &cp, &advance); s += advance; @@ -1127,11 +1124,24 @@ A REPLACEMENT CHARACTER will be inserted everywhere that malformed input occurs. Obviously, we aren't expecting such outcomes, but your code will be protected from attacks and many harmful effects that could otherwise occur. +If the situation is such that it would be a bug for the input to be invalid, a +somewhat simpler loop suffices: + + while (s < e) { + Size_t advance; + UV cp = utf8_to_uv_or_die(s, e, &advance); + + s += advance; + } + +This will throw an exception on invalid input, so your code doesn't have to +concern itself with that possibility. + If you do have a plan for handling malformed input, you could instead write: while (s < e) { - UV cp; Size_t advance; + UV cp; if (UNLIKELY(! utf8_to_uv(s, e, &cp, &advance)) { @@ -1151,9 +1161,10 @@ attacks against such code; and it is extra work always, as the functions have already done the equivalent work and return the correct value in C, regardless of whether the input is well-formed or not. -You must always pass a non-NULL pointer into which to store the (first) code -point C represents. If you don't care about this value, you should be using -one of the C> functions instead. +Except with C, you must always pass a non-NULL pointer into +which to store the (first) code point C represents. If you don't care about +this value, you should be using one of the C> functions +instead. =item C forms @@ -1234,18 +1245,20 @@ unlikely to be needed except for specialized purposes. C is more like an extension of C, but with fewer quirks, and a different method of specifying the bytes in C it is allowed to examine. It has a C parameter instead of an C parameter, -so the furthest byte in C it can look at is S>. Its return -value is, like C, ambiguous with respect to the NUL and -REPLACEMENT characters, but the value of C<*retlen> can be relied on (except -with the C flag described below) to know where the next -possible character along C starts, removing that quirk. Hence, you always -should use C<*retlen> to determine where the next character in C starts. +so the furthest byte in C it can look at is S>. Its +return value is, like C, ambiguous with respect to the NUL +and REPLACEMENT characters, but the value of C<*retlen> can be relied on +(except with the C flag described below) to know where the +next possible character along C starts, removing that quirk. Hence, you +always should use C<*retlen> to determine where the next character in C +starts. These functions have an additional parameter, C, besides the ones in C and C, which can be used to broaden or restrict what is acceptable UTF-8. C has the same meaning and behavior in both functions. When C is 0, these functions accept any -syntactically valid Perl-extended-UTF-8 sequence. +syntactically valid Perl-extended-UTF-8 sequence that doesn't overflow the +platform's word size. There are flags that apply to accepting particular sequences, and flags that apply to raising warnings about encountering sequences. Each type is @@ -1254,39 +1267,51 @@ or both reject and warn. Rejecting means that the sequence gets translated into the Unicode REPLACEMENT CHARACTER instead of what it was meant to represent. -Even if a flag is passed that indicates warnings are desired; no warning will be -raised if C<'utf8'> warnings (or the appropriate subcategory) are disabled at -the point of the call. +Unless otherwise stated below, warnings are subject to the C warnings +category being on. =over 4 =item C -This also suppresses any warnings. And it changes what is stored into +This suppresses any warnings. And it changes what is stored into C<*retlen> with the C family of functions (for the worse). It is not likely to be of use to you. You can use C (described below) to also turn off warnings, and that flag doesn't adversely affect C<*retlen>. +=item C + +Normally, no warnings are generated if warnings are turned off lexically or +globally, regardless of any flags to the contrary. But this flag effectively +turns on warnings temporarily for the duration of this function's execution. + +Do not use it lightly. + +This flag is ignored if C is also set. + =item C =item C -These disallow and/or warn about UTF-8 sequences that represent surrogate -characters. +These reject and/or warn about UTF-8 sequences that represent surrogate +characters. The warning categories C and C control if +warnings are actually raised. =item C =item C -These disallow and/or warn about UTF-8 sequences that represent non-character -code points. +These reject and/or warn about UTF-8 sequences that represent non-character +code points. The warning categories C and C control if warnings +are actually raised. =item C =item C -These disallow and/or warn about UTF-8 sequences that represent code points -above 0x10FFFF. +These reject and/or warn about UTF-8 sequences that represent code points +above 0x10FFFF. The warning categories C and C control if +warnings are actually raised. =item C @@ -1315,10 +1340,12 @@ L. =item C -These disallow and/or warn on encountering sequences that require Perl's +These reject and/or warn on encountering sequences that require Perl's extension to UTF-8 to represent them. These are all for code points above 0x10FFFF, so these sequences are a subset of the ones controlled by SUPER or -either of the illegal interchange sets of flags. +either of the illegal interchange sets of flags. The warning categories +C, C, and C control if warnings are actually +raised. Perl predates Unicode, and earlier standards allowed for code points up through 0x7FFF_FFFF (2**31 - 1). Perl, of course, would like you to be able to @@ -1354,8 +1381,16 @@ regardless of any of the flags. The only such flag that you would ever have any reason to use is C which applies to any of the syntactic malformations and -overflow, except for empty input. The other flags are shown in the C<_GOT_> -bits list in C>. +overflow, except for empty input. The other flags are analogous to ones in +the C<_GOT_> bits list in C>. + +=item C + +If the function would otherwise return C, it instead croaks. The +C flag is effectively turned on so that the cause +of the croak is displayed. + +This flag is ignored if C is also set. =back @@ -1508,6 +1543,8 @@ function creates a new AV to store information, described below, about all the malformations that were encountered. If the flag C is passed, this parameter is ignored. +Otherwise, when this parameter is set, the flags C and +C are ignored. What is considered a malformation is affected by C, the same as described in C>. No array element is generated for @@ -1584,7 +1621,7 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, const U8 * const e, UV *cp_p, Size_t *advance_p, - const U32 flags, + U32 flags, U32 * errors, AV ** msgs) { @@ -1619,6 +1656,9 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, } if (UNLIKELY(msgs)) { *msgs = NULL; + + /* The msgs parameter has higher priority than these flags */ + flags &= ~(UTF8_DIE_IF_MALFORMED|UTF8_FORCE_WARN_IF_MALFORMED); } /* Each of the affected Hanguls starts with \xED */ @@ -1983,18 +2023,19 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, bool disallowed = FALSE; const U32 orig_problems = possible_problems; U32 error_flags_return = 0; + AV * msgs_return = NULL; /* The following macro returns 0 if no message needs to be generated * for this problem even if everything else says to. Otherwise returns * the warning category to use for the message.. * * No message need be generated if the UTF8_CHECK_ONLY flag has been - * set by the caller. Otherwise, a message should be generated if - * either: + * set by the caller. Otherwise, a message should be generated if: * 1) the caller has furnished a structure into which messages should * be returned to it (so it itself can decide what to do); or * 2) warnings are enabled for either of the category parameters to - * the macro. + * the macro; or + * 3) the special MALFORMED flags have been passed * * The 'warning' parameter is the higher priority warning category to * check. The macro calls ckWARN_d(warning), so warnings for it are @@ -2010,11 +2051,13 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, * * When called without a second category, the macro outputs a bunch of * zeroes that the compiler should fold to nothing */ -#define NEED_MESSAGE(warning, extra_ckWARN, extra_category) \ - ((flags & UTF8_CHECK_ONLY) ? 0 : \ - ((ckWARN_d(warning)) ? warning : \ - ((extra_ckWARN(extra_category +0)) ? extra_category +0 : \ - ((msgs) ? warning : 0)))) +#define NEED_MESSAGE(warning, extra_ckWARN, extra_category) \ + ((flags & UTF8_CHECK_ONLY) ? 0 : \ + ((ckWARN_d(warning)) ? warning : \ + ((extra_ckWARN(extra_category +0)) ? extra_category +0 : \ + ((flags & ( UTF8_DIE_IF_MALFORMED \ + |UTF8_FORCE_WARN_IF_MALFORMED)) ? warning : \ + ((msgs) ? warning : 0))))) while (possible_problems) { /* Handle each possible problem */ char * message = NULL; @@ -2463,23 +2506,47 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, } /* End of switch() on the possible problems */ - /* Display the message (if any) for the problem being handled in - * this iteration of the loop */ + /* Display or save the message (if any) for the problem being + * handled in this iteration of the loop */ if (message) { if (msgs) { - if (*msgs == NULL) { - *msgs = newAV(); + if (msgs_return == NULL) { + msgs_return = newAV(); } - av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message, - pack_warn, - this_flag_bit))); + av_push(msgs_return, + newRV_noinc((SV*) new_msg_hv(message, pack_warn, + this_flag_bit))); + } + else if (! (flags & UTF8_CHECK_ONLY)) { + if (UNLIKELY(flags & ( UTF8_DIE_IF_MALFORMED + |UTF8_FORCE_WARN_IF_MALFORMED))) + { + ENTER; + SAVEI8(PL_dowarn); + SAVESPTR(PL_curcop); + + PL_dowarn = G_WARN_ALL_ON|G_WARN_ON; + if (PL_curcop) { + SAVECURCOPWARNINGS(); + PL_curcop->cop_warnings = pWARN_ALL; + } + } + + if (PL_op) { + Perl_warner(aTHX_ pack_warn, "%s in %s", message, + OP_DESC(PL_op)); + } + else { + Perl_warner(aTHX_ pack_warn, "%s", message); + } + + if (UNLIKELY(flags & ( UTF8_DIE_IF_MALFORMED + |UTF8_FORCE_WARN_IF_MALFORMED))) + { + LEAVE; + } } - else if (PL_op) - Perl_warner(aTHX_ pack_warn, "%s in %s", message, - OP_DESC(PL_op)); - else - Perl_warner(aTHX_ pack_warn, "%s", message); } } /* End of 'while (possible_problems)' */ @@ -2490,11 +2557,19 @@ Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, *advance_p = curlen; } + if (msgs_return) { + *msgs = msgs_return; + } + if (errors) { *errors = error_flags_return; } if (disallowed) { + if ((flags & ~UTF8_CHECK_ONLY) & UTF8_DIE_IF_MALFORMED) { + Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + } + success = false; uv = UNICODE_REPLACEMENT; } diff --git a/utf8.h b/utf8.h index d62a06f742af..6ed7c3304e4d 100644 --- a/utf8.h +++ b/utf8.h @@ -1214,6 +1214,8 @@ point's representation. #define UTF8_CHECK_ONLY 0x8000 #define UTF8_NO_CONFIDENCE_IN_CURLEN_ 0x10000 /* Internal core use only */ +#define UTF8_DIE_IF_MALFORMED 0x20000 +#define UTF8_FORCE_WARN_IF_MALFORMED 0x40000 /* For backwards source compatibility. They do nothing, as the default now * includes what they used to mean. The first one's meaning was to allow the