From 7e75adb021ccb5fd021169e67ba60d4adb291ed0 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 10 Jun 2024 01:17:56 -0600 Subject: [PATCH] perlapi: Combine all isUTF8_CHAR variants into one entry This is more compact and makes it easier for the reader to compare and contrast the possible functions in order to choose which is best for their application. --- inline.h | 228 +++++++++++++++++++----------------------------------- mathoms.c | 8 -- 2 files changed, 81 insertions(+), 155 deletions(-) diff --git a/inline.h b/inline.h index 0aff0a9e0c21..7c47a2e1193c 100644 --- a/inline.h +++ b/inline.h @@ -2365,41 +2365,97 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) /* -=for apidoc isUTF8_CHAR - -Evaluates to non-zero if the first few bytes of the string starting at C and -looking no further than S> are well-formed UTF-8, as extended by Perl, -that represents some code point; otherwise it evaluates to 0. If non-zero, the -value gives how many bytes starting at C comprise the code point's -representation. Any bytes remaining before C, but beyond the ones needed to -form the first code point in C, are not examined. - -The code point can be any that will fit in an IV on this machine, using Perl's -extension to official UTF-8 to represent those higher than the Unicode maximum -of 0x10FFFF. That means that this macro is used to efficiently decide if the -next few bytes in C is legal UTF-8 for a single character. +=for apidoc isUTF8_CHAR +=for apidoc_item isSTRICT_UTF8_CHAR +=for apidoc_item isC9_STRICT_UTF8_CHAR +=for apidoc_item isUTF8_CHAR_flags +=for apidoc_item is_utf8_char_buf + +These each evaluate to non-zero if the first few bytes of the string starting +at C and looking no further than S> are well-formed UTF-8 that +represents some code point, for varying degrees of strictness. Otherwise they +evaluate to 0. If non-zero, the value gives how many bytes starting at C +comprise the code point's representation. 
Any bytes remaining before C, but +beyond the ones needed to form the first code point in C, are not examined. + +These are used to efficiently decide if the next few bytes in C are +legal UTF-8 for a single character. + +With C, the code point can be any that will fit in an IV on this +machine, using Perl's extension to official UTF-8 to represent those higher +than the Unicode maximum of 0x10FFFF. That means that this will consider valid +bytes that are unrecognized or considered illegal by non-Perl applications. + +With C>, acceptable code points are restricted to those +defined by Unicode to be fully interchangeable across applications. +This means code points above the Unicode range (max legal is 0x10FFFF), +surrogates, and non-character code points are rejected. + +With C>, acceptable code points are restricted to +those defined by Unicode to be fully interchangeable within an application. +This means code points above the Unicode range and surrogates are rejected, but +non-character code points are accepted. See L. + +Use C> to customize what code points are acceptable. +If C is 0, this gives the same results as C>; +if C is C, this gives the same results +as C>; +and if C is C, this gives +the same results as C>. +Otherwise C may be any combination of the C> flags +understood by C>, with the same meanings. -Use C> to restrict the acceptable code points to those -defined by Unicode to be fully interchangeable across applications; -C> to use the L definition of allowable -code points; and C> for a more customized definition. +The three alternative macros are for the most commonly needed validations; they +are likely to run somewhat faster than this more general one, as they can be +inlined into your code. -Use C>, C>, and -C> to check entire strings. +Use one of the C> forms to check entire strings. Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC machines) is a valid UTF-8 character. +C is the old name for C. Do not use it in new +code. 
+ =cut -This uses an adaptation of the table and algorithm given in -https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive -documentation of the original version. A copyright notice for the original -version is given at the beginning of this file. The Perl adaptation is -documented at the definition of PL_extended_utf8_dfa_tab[]. +All the functions (except isUTF8_CHAR_flags) use adaptations of the table and +algorithm given in https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which +provides comprehensive documentation of the original version. A copyright +notice for the original version is given at the beginning of this file. + +The Perl adaptation for isUTF8_CHAR is documented at the definition of +PL_extended_utf8_dfa_tab[]. + +The Perl adaptation for isSTRICT_UTF8_CHAR is documented at the definition of +PL_strict_utf8_dfa_tab[]. + +The Perl adaptation for isC9_STRICT_UTF8_CHAR is documented at the definition +of PL_c9_utf8_dfa_tab[]. + */ +PERL_STATIC_INLINE Size_t +Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; + + PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab, + DFA_RETURN_SUCCESS_, + goto check_hanguls, + DFA_RETURN_FAILURE_); + check_hanguls: + + /* Here, we didn't return success, but dropped out of the loop. 
In the + * case of PL_strict_utf8_dfa_tab, this means the input is either + * malformed, or was for certain Hanguls; handle them specially */ + + /* The dfa above drops out for incomplete or illegal inputs, and certain + * legal Hanguls; check and return accordingly */ + return is_HANGUL_ED_utf8_safe(s0, e); +} + PERL_STATIC_INLINE Size_t Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e) { @@ -2433,98 +2489,6 @@ Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e) } -/* - -=for apidoc isSTRICT_UTF8_CHAR - -Evaluates to non-zero if the first few bytes of the string starting at C and -looking no further than S> are well-formed UTF-8 that represents some -Unicode code point completely acceptable for open interchange between all -applications; otherwise it evaluates to 0. If non-zero, the value gives how -many bytes starting at C comprise the code point's representation. Any -bytes remaining before C, but beyond the ones needed to form the first code -point in C, are not examined. - -The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not -be a surrogate nor a non-character code point. Thus this excludes any code -point from Perl's extended UTF-8. - -This is used to efficiently decide if the next few bytes in C is -legal Unicode-acceptable UTF-8 for a single character. - -Use C> to use the L definition of allowable -code points; C> to check for Perl's extended UTF-8; -and C> for a more customized definition. - -Use C>, C>, and -C> to check entire strings. - -=cut - -This uses an adaptation of the tables and algorithm given in -https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive -documentation of the original version. A copyright notice for the original -version is given at the beginning of this file. The Perl adaptation is -documented at the definition of strict_extended_utf8_dfa_tab[]. 
- -*/ - -PERL_STATIC_INLINE Size_t -Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) -{ - PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; - - PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab, - DFA_RETURN_SUCCESS_, - goto check_hanguls, - DFA_RETURN_FAILURE_); - check_hanguls: - - /* Here, we didn't return success, but dropped out of the loop. In the - * case of PL_strict_utf8_dfa_tab, this means the input is either - * malformed, or was for certain Hanguls; handle them specially */ - - /* The dfa above drops out for incomplete or illegal inputs, and certain - * legal Hanguls; check and return accordingly */ - return is_HANGUL_ED_utf8_safe(s0, e); -} - -/* - -=for apidoc isC9_STRICT_UTF8_CHAR - -Evaluates to non-zero if the first few bytes of the string starting at C and -looking no further than S> are well-formed UTF-8 that represents some -Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, -the value gives how many bytes starting at C comprise the code point's -representation. Any bytes remaining before C, but beyond the ones needed to -form the first code point in C, are not examined. - -The largest acceptable code point is the Unicode maximum 0x10FFFF. This -differs from C> only in that it accepts non-character -code points. This corresponds to -L. -which said that non-character code points are merely discouraged rather than -completely forbidden in open interchange. See -L. - -Use C> to check for Perl's extended UTF-8; and -C> for a more customized definition. - -Use C>, C>, and -C> to check entire strings. - -=cut - -This uses an adaptation of the tables and algorithm given in -https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive -documentation of the original version. A copyright notice for the original -version is given at the beginning of this file. The Perl adaptation is -documented at the definition of PL_c9_utf8_dfa_tab[]. 
- -*/ - PERL_STATIC_INLINE Size_t Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) { @@ -3005,36 +2969,6 @@ Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end) } } -/* - -=for apidoc isUTF8_CHAR_flags - -Evaluates to non-zero if the first few bytes of the string starting at C and -looking no further than S> are well-formed UTF-8, as extended by Perl, -that represents some code point, subject to the restrictions given by C; -otherwise it evaluates to 0. If non-zero, the value gives how many bytes -starting at C comprise the code point's representation. Any bytes remaining -before C, but beyond the ones needed to form the first code point in C, -are not examined. - -If C is 0, this gives the same results as C>; -if C is C, this gives the same results -as C>; -and if C is C, this gives -the same results as C>. -Otherwise C may be any combination of the C> flags -understood by C>, with the same meanings. - -The three alternative macros are for the most commonly needed validations; they -are likely to run somewhat faster than this more general one, as they can be -inlined into your code. - -Use L, L, and -L to check entire strings. - -=cut -*/ - PERL_STATIC_INLINE STRLEN Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags) { diff --git a/mathoms.c b/mathoms.c index dba2459b1ef3..7e81e2c76e00 100644 --- a/mathoms.c +++ b/mathoms.c @@ -804,14 +804,6 @@ Perl_sv_copypv(pTHX_ SV *const dsv, SV *const ssv) sv_copypv_flags(dsv, ssv, SV_GMAGIC); } -/* -=for apidoc_section $unicode -=for apidoc is_utf8_char_buf - -This is identical to the macro L. - -=cut */ - STRLEN Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end) {