diff --git a/inline.h b/inline.h index ed65babdd9755..8e818f8e09b91 100644 --- a/inline.h +++ b/inline.h @@ -2381,41 +2381,97 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) /* -=for apidoc isUTF8_CHAR - -Evaluates to non-zero if the first few bytes of the string starting at C<s> and -looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl, -that represents some code point; otherwise it evaluates to 0. If non-zero, the -value gives how many bytes starting at C<s> comprise the code point's -representation. Any bytes remaining before C<e>, but beyond the ones needed to -form the first code point in C<s>, are not examined. - -The code point can be any that will fit in an IV on this machine, using Perl's -extension to official UTF-8 to represent those higher than the Unicode maximum -of 0x10FFFF. That means that this macro is used to efficiently decide if the -next few bytes in C<s> is legal UTF-8 for a single character. +=for apidoc isUTF8_CHAR +=for apidoc_item isSTRICT_UTF8_CHAR +=for apidoc_item isC9_STRICT_UTF8_CHAR +=for apidoc_item isUTF8_CHAR_flags +=for apidoc_item is_utf8_char_buf + +These each evaluate to non-zero if the first few bytes of the string starting +at C<s> and looking no further than S<C<e - 1>> are well-formed UTF-8 that +represents some code point, for varying degrees of strictness. Otherwise they +evaluate to 0. If non-zero, the value gives how many bytes starting at C<s> +comprise the code point's representation. Any bytes remaining before C<e>, but +beyond the ones needed to form the first code point in C<s>, are not examined. + +These are used to efficiently decide if the next few bytes in C<s> are +legal UTF-8 for a single character. + +With C<isUTF8_CHAR>, the code point can be any that will fit in an IV on this +machine, using Perl's extension to official UTF-8 to represent those higher +than the Unicode maximum of 0x10FFFF. That means that this will consider valid +bytes that are unrecognized or considered illegal by non-Perl applications. 
+ +With C<L</isSTRICT_UTF8_CHAR>>, acceptable code points are restricted to those +defined by Unicode to be fully interchangeable across applications. +This means code points above the Unicode range (max legal is 0x10FFFF), +surrogates, and non-character code points are rejected. + +With C<L</isC9_STRICT_UTF8_CHAR>>, acceptable code points are restricted to +those defined by Unicode to be fully interchangeable within an application. +This means code points above the Unicode range and surrogates are rejected, but +non-character code points are accepted. See L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>. + +Use C<L</isUTF8_CHAR_flags>> to customize what code points are acceptable. +If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>; +if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results +as C<L</isSTRICT_UTF8_CHAR>>; +and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives +the same results as C<L</isC9_STRICT_UTF8_CHAR>>. +Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags +understood by C<L</utf8n_to_uvchr>>, with the same meanings. -Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those -defined by Unicode to be fully interchangeable across applications; -C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable -code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition. +The three alternative macros are for the most commonly needed validations; they +are likely to run somewhat faster than this more general one, as they can be +inlined into your code. -Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and -C<L</is_utf8_string_loclen>> to check entire strings. +Use one of the C<L</is_utf8_string>> forms to check entire strings. Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC machines) is a valid UTF-8 character. +C<is_utf8_char_buf> is the old name for C<isUTF8_CHAR>. Do not use it in new +code. + =cut -This uses an adaptation of the table and algorithm given in -https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive -documentation of the original version. A copyright notice for the original -version is given at the beginning of this file. The Perl adaptation is -documented at the definition of PL_extended_utf8_dfa_tab[]. 
+All the functions (except isUTF8_CHAR_flags) use adaptations of the table and +algorithm given in https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which +provides comprehensive documentation of the original version. A copyright +notice for the original version is given at the beginning of this file. + +The Perl adaptation for isUTF8_CHAR is documented at the definition of +PL_extended_utf8_dfa_tab[]. + +The Perl adaptation for isSTRICT_UTF8_CHAR is documented at the definition of +PL_strict_utf8_dfa_tab[]. + +The Perl adaptation for isC9_STRICT_UTF8_CHAR is documented at the definition +of PL_c9_utf8_dfa_tab[]. + */ +PERL_STATIC_INLINE Size_t +Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; + + PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab, + DFA_RETURN_SUCCESS_, + goto check_hanguls, + DFA_RETURN_FAILURE_); + check_hanguls: + + /* Here, we didn't return success, but dropped out of the loop. In the + * case of PL_strict_utf8_dfa_tab, this means the input is either + * malformed, or was for certain Hanguls; handle them specially */ + + /* The dfa above drops out for incomplete or illegal inputs, and certain + * legal Hanguls; check and return accordingly */ + return is_HANGUL_ED_utf8_safe(s0, e); +} + PERL_STATIC_INLINE Size_t Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e) { @@ -2449,98 +2505,6 @@ Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e) } -/* - -=for apidoc isSTRICT_UTF8_CHAR - -Evaluates to non-zero if the first few bytes of the string starting at C<s> and -looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some -Unicode code point completely acceptable for open interchange between all -applications; otherwise it evaluates to 0. If non-zero, the value gives how -many bytes starting at C<s> comprise the code point's representation. Any -bytes remaining before C<e>, but beyond the ones needed to form the first code -point in C<s>, are not examined. 
- -The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not -be a surrogate nor a non-character code point. Thus this excludes any code -point from Perl's extended UTF-8. - -This is used to efficiently decide if the next few bytes in C is -legal Unicode-acceptable UTF-8 for a single character. - -Use C> to use the L definition of allowable -code points; C> to check for Perl's extended UTF-8; -and C> for a more customized definition. - -Use C>, C>, and -C> to check entire strings. - -=cut - -This uses an adaptation of the tables and algorithm given in -https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive -documentation of the original version. A copyright notice for the original -version is given at the beginning of this file. The Perl adaptation is -documented at the definition of strict_extended_utf8_dfa_tab[]. - -*/ - -PERL_STATIC_INLINE Size_t -Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) -{ - PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; - - PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab, - DFA_RETURN_SUCCESS_, - goto check_hanguls, - DFA_RETURN_FAILURE_); - check_hanguls: - - /* Here, we didn't return success, but dropped out of the loop. In the - * case of PL_strict_utf8_dfa_tab, this means the input is either - * malformed, or was for certain Hanguls; handle them specially */ - - /* The dfa above drops out for incomplete or illegal inputs, and certain - * legal Hanguls; check and return accordingly */ - return is_HANGUL_ED_utf8_safe(s0, e); -} - -/* - -=for apidoc isC9_STRICT_UTF8_CHAR - -Evaluates to non-zero if the first few bytes of the string starting at C and -looking no further than S> are well-formed UTF-8 that represents some -Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, -the value gives how many bytes starting at C comprise the code point's -representation. 
Any bytes remaining before C, but beyond the ones needed to -form the first code point in C, are not examined. - -The largest acceptable code point is the Unicode maximum 0x10FFFF. This -differs from C> only in that it accepts non-character -code points. This corresponds to -L. -which said that non-character code points are merely discouraged rather than -completely forbidden in open interchange. See -L. - -Use C> to check for Perl's extended UTF-8; and -C> for a more customized definition. - -Use C>, C>, and -C> to check entire strings. - -=cut - -This uses an adaptation of the tables and algorithm given in -https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive -documentation of the original version. A copyright notice for the original -version is given at the beginning of this file. The Perl adaptation is -documented at the definition of PL_c9_utf8_dfa_tab[]. - -*/ - PERL_STATIC_INLINE Size_t Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) { @@ -3021,36 +2985,6 @@ Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) } } -/* - -=for apidoc isUTF8_CHAR_flags - -Evaluates to non-zero if the first few bytes of the string starting at C and -looking no further than S> are well-formed UTF-8, as extended by Perl, -that represents some code point, subject to the restrictions given by C; -otherwise it evaluates to 0. If non-zero, the value gives how many bytes -starting at C comprise the code point's representation. Any bytes remaining -before C, but beyond the ones needed to form the first code point in C, -are not examined. - -If C is 0, this gives the same results as C>; -if C is C, this gives the same results -as C>; -and if C is C, this gives -the same results as C>. -Otherwise C may be any combination of the C> flags -understood by C>, with the same meanings. 
- -The three alternative macros are for the most commonly needed validations; they -are likely to run somewhat faster than this more general one, as they can be -inlined into your code. - -Use L, L, and -L to check entire strings. - -=cut -*/ - PERL_STATIC_INLINE STRLEN Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags) { diff --git a/mathoms.c b/mathoms.c index dba2459b1ef33..7e81e2c76e003 100644 --- a/mathoms.c +++ b/mathoms.c @@ -804,14 +804,6 @@ Perl_sv_copypv(pTHX_ SV *const dsv, SV *const ssv) sv_copypv_flags(dsv, ssv, SV_GMAGIC); } -/* -=for apidoc_section $unicode -=for apidoc is_utf8_char_buf - -This is identical to the macro L. - -=cut */ - STRLEN Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) {