From f0a6b06bd295a056aafb250e830f191723a1f5eb Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Fri, 21 Jun 2024 11:42:33 -0600 Subject: [PATCH] perlapi: Combine all forms of is_utf8_invariant_string() --- inline.h | 64 +++++++++++++++++++++++++------------------------------- utf8.h | 15 ++----------- 2 files changed, 31 insertions(+), 48 deletions(-) diff --git a/inline.h b/inline.h index 0603f4b0681b..630e0fa039e0 100644 --- a/inline.h +++ b/inline.h @@ -1276,55 +1276,49 @@ Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen) } /* -=for apidoc is_utf8_invariant_string - -Returns TRUE if the first C<len> bytes of the string C<s> are the same -regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on -EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they -are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only -the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range -characters are invariant, but so also are the C1 controls. +=for apidoc is_utf8_invariant_string +=for apidoc_item is_utf8_invariant_string_loc +=for apidoc_item is_ascii_string +=for apidoc_item is_invariant_string + +These each return TRUE if the first C<len> bytes of the string C<s> are the +same regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on +EBCDIC machines); otherwise they return FALSE. That is, they return TRUE if +they are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and +only the ASCII characters fit this definition. On EBCDIC machines, the +ASCII-range characters are invariant, but so also are the C1 controls. If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you use this option, that C<s> can't have embedded C<NUL> characters and has to have a terminating C<NUL> byte). -See also -C>, -C>, -C>, -C>, -C>, -C>, -C>, -C>, -C>, -C>, -C>, -C>, -C>, -C>, -and -C>. - -=cut +All forms except C<is_utf8_invariant_string_loc> have identical behavior.
The +only difference with it is that it has an extra pointer parameter, C<ep>, into +which, if it isn't NULL, the location of the first UTF-8 variant character in +the C<ep> pointer will be stored upon failure. If all characters are UTF-8 +invariant, this function does not change the contents of C<*ep>. -*/ +C<is_invariant_string> is somewhat misleadingly named. +C<is_utf8_invariant_string> is preferred, as it indicates under what conditions +the string is invariant. -#define is_utf8_invariant_string(s, len) \ - is_utf8_invariant_string_loc(s, len, NULL) +C<is_ascii_string> is misleadingly-named. On ASCII-ish platforms, the name +isn't misleading: the ASCII-range characters are exactly the UTF-8 invariants. +But EBCDIC machines have more UTF-8 invariants than just the ASCII characters, +so the name C<is_utf8_invariant_string> is preferred. -/* -=for apidoc is_utf8_invariant_string_loc +See also +C> and C>. -Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of -the first UTF-8 variant character in the C<ep> pointer; if all characters are -UTF-8 invariant, this function does not change the contents of C<*ep>. +=for apidoc_defn is_utf8_invariant_string bool|NN const U8 * const s|STRLEN len =cut */ +#define is_utf8_invariant_string(s, len) \ + is_utf8_invariant_string_loc(s, len, NULL) + PERL_STATIC_INLINE bool Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) { diff --git a/utf8.h b/utf8.h index 0aa0fd5349e7..5df7a02043f8 100644 --- a/utf8.h +++ b/utf8.h @@ -127,19 +127,8 @@ typedef enum { #define FOLD_FLAGS_NOMIX_ASCII 0x4 /* -=for apidoc is_ascii_string - -This is a misleadingly-named synonym for L</is_utf8_invariant_string>. -On ASCII-ish platforms, the name isn't misleading: the ASCII-range characters -are exactly the UTF-8 invariants. But EBCDIC machines have more invariants -than just the ASCII characters, so C<is_utf8_invariant_string> is preferred. - -=for apidoc is_invariant_string - -This is a somewhat misleadingly-named synonym for L</is_utf8_invariant_string>. -C<is_utf8_invariant_string> is preferred, as it indicates under what conditions -the string is invariant.
- +=for apidoc_defn is_ascii_string bool|NN const U8 * const s|STRLEN len +=for apidoc_defn is_invariant_string bool|NN const U8 * const s|STRLEN len =cut */ #define is_ascii_string(s, len) is_utf8_invariant_string(s, len)