Add new function bytes_to_utf8_free_me

This is like bytes_to_utf8, but if the representation of the input string is the same in UTF-8 as it is in native format, the allocation of new memory is skipped. This presents optimization possibilities.
Perl · Dec 16, 2024 · 5d31895 · 5d31895
1 parent df4d5e8
commit 5d31895
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 22 deletions.
diff --git a/embed.fnc b/embed.fnc
@@ -794,8 +794,12 @@ Adp	|int	|bytes_cmp_utf8 |NN const U8 *b 			\
 Adp	|U8 *	|bytes_from_utf8|NN const U8 *s 			\
 				|NN STRLEN *lenp			\
 				|NN bool *is_utf8p
-Adp	|U8 *	|bytes_to_utf8	|NN const U8 *s 			\
+Admp	|U8 *	|bytes_to_utf8	|NN const U8 *s 			\
 				|NN STRLEN *lenp
+Adp	|U8 *	|bytes_to_utf8_free_me					\
+				|NN const U8 *s 			\
+				|NN STRLEN *lenp			\
+				|NULLOK const U8 **free_me
 AOdp	|SSize_t|call_argv	|NN const char *sub_name		\
 				|I32 flags				\
 				|NN char **argv

diff --git a/embed.h b/embed.h
@@ -155,7 +155,8 @@
 # define block_start(a)                         Perl_block_start(aTHX_ a)
 # define bytes_cmp_utf8(a,b,c,d)                Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
 # define bytes_from_utf8(a,b,c)                 Perl_bytes_from_utf8(aTHX_ a,b,c)
-# define bytes_to_utf8(a,b)                     Perl_bytes_to_utf8(aTHX_ a,b)
+# define bytes_to_utf8(a,b)                     Perl_bytes_to_utf8(aTHX,a,b)
+# define bytes_to_utf8_free_me(a,b,c)           Perl_bytes_to_utf8_free_me(aTHX_ a,b,c)
 # define c9strict_utf8_to_uv                    Perl_c9strict_utf8_to_uv
 # define call_argv(a,b,c)                       Perl_call_argv(aTHX_ a,b,c)
 # define call_atexit(a,b)                       Perl_call_atexit(aTHX_ a,b)

diff --git a/proto.h b/proto.h
diff --git a/utf8.c b/utf8.c
@@ -3182,20 +3182,45 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
 }
 
 /*
-=for apidoc bytes_to_utf8
-
-Converts a string C<s> of length C<*lenp> bytes from the native encoding into
-UTF-8.
-Returns a pointer to the newly-created string, and sets C<*lenp> to
-reflect the new length in bytes.  The caller is responsible for arranging for
-the memory used by this string to get freed.
+=for apidoc      bytes_to_utf8
+=for apidoc_item bytes_to_utf8_free_me
+
+These each convert a string C<s> of length C<*lenp> bytes from the native
+encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to
+the UTF-8 string, and setting C<*lenp> to its length in bytes, while making
+sure that the string is terminated by a C<NUL> character.
+
+They differ in that C<bytes_to_utf8_free_me> takes an extra parameter
+C<free_me>.  If that parameter is NULL, this function behaves identically to
+C<bytes_to_utf8>.
+
+But if not NULL, the function skips allocating new memory if the input string
+already is C<NUL>-terminated, and its UTF-8 representation is the same as its
+native representation.  In other words it returns the input string if
+converting the string would be a no-op.  It sets C<*free_me> to NULL in that
+case.  Otherwise C<*free_me> is set to the address of the newly allocated
+memory.  Note that in both cases, you can pass that result to C<L</Safefree>>
+and it will do the right thing.  
+
+Note that when new memory is allocated, the caller is responsible for arranging
+for that memory to get freed.  (This is transparent to the caller if
+C<Safefree> is called with C<free_me>.)
+
+The two forms have subtle differences in trailing C<NUL> handling.
+C<bytes_to_utf8> does not look for any trailing C<NUL>.  Instead it
+overallocates space for the copy by 1 byte and adds a C<NUL> to the end of it.
+If the input C<*lenp> included a C<NUL>, there will be two trailing C<NUL>
+characters; otherwise just 1.  There is code that depends on this behavior.
+C<bytes_to_utf8_free_me> has to look for a trailing C<NUL> in order to be able
+to guarantee that the result has one if no copy is made.  If the input C<*lenp>
+doesn't include any C<NUL> character at the end, this form has to assume that
+one doesn't exist, and will create a copy.  Only if C<*lenp> does include the
+C<NUL>, does this form check to see if a copy can be avoided.
 
 Upon return, the number of variants in the string can be computed by
 having saved the value of C<*lenp> before the call, and subtracting it from the
 after-call value of C<*lenp>.
 
-A C<NUL> character will be written after the end of the string.
-
 If you want to convert to UTF-8 from encodings other than
 the native (Latin1 or EBCDIC),
 see L</sv_recode_to_utf8>().
@@ -3204,26 +3229,45 @@ see L</sv_recode_to_utf8>().
 */
 
 U8*
-Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
+Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp,
+                                 const U8 ** free_me_ptr)
 {
-    const U8 * const send = s + (*lenp);
+    PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME;
+    PERL_UNUSED_CONTEXT;
+
+    const Size_t len = *lenp;
+    const U8 * const send = s + len;
+    const Size_t variant_count = variant_under_utf8_count(s, send);
+    const bool has_trailing_NUL = free_me_ptr && len > 0 && *(send - 1) == '\0';
+
+    /* Return the input unchanged if the flag indicates to do so, and there
+     * are no characters that differ when represented in UTF-8, and the
+     * original is NUL-terminated */
+    if (free_me_ptr != NULL && variant_count == 0 && has_trailing_NUL) {
+        *free_me_ptr = NULL;
+        return (U8 *) s;
+    }
+
     U8 *d;
     U8 *dst;
 
-    PERL_ARGS_ASSERT_BYTES_TO_UTF8;
-    PERL_UNUSED_CONTEXT;
-
-    /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
-    Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
+    /*   1 for each byte except any trailing NUL
+     * + 1 for each byte that expands to two
+     * + 1 for the trailing NUL */
+    Newx(d, len - has_trailing_NUL + variant_count + 1, U8);
     dst = d;
 
-    while (s < send) {
+    while (s < send - has_trailing_NUL) {
         append_utf8_from_native_byte(*s, &d);
         s++;
     }
 
     *d = '\0';
-    *lenp = d-dst;
+    *lenp = d - dst;
+
+    if (free_me_ptr != NULL) {
+        *free_me_ptr = dst;
+    }
 
     return dst;
 }

diff --git a/utf8.h b/utf8.h
@@ -1328,6 +1328,7 @@ point's representation.
 
 #define Perl_is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
 
+#define Perl_bytes_to_utf8(mTHX, s, lenp)  Perl_bytes_to_utf8_free_me(aTHX_ s, lenp, NULL)
 typedef enum {
     PL_utf8_to_bytes_overwrite = 0,
     PL_utf8_to_bytes_new_memory,