Perl · khwilliamson · Dec 5, 2024 · Dec 18, 2024
diff --git a/embed.fnc b/embed.fnc
@@ -794,8 +794,12 @@ Adp	|int	|bytes_cmp_utf8 |NN const U8 *b 			\
 Adp	|U8 *	|bytes_from_utf8|NN const U8 *s 			\
 				|NN STRLEN *lenp			\
 				|NN bool *is_utf8p
-Adp	|U8 *	|bytes_to_utf8	|NN const U8 *s 			\
+Admp	|U8 *	|bytes_to_utf8	|NN const U8 *s 			\
 				|NN STRLEN *lenp
+Adp	|U8 *	|bytes_to_utf8_free_me					\
+				|NN const U8 *s 			\
+				|NN STRLEN *lenp			\
+				|NULLOK const U8 **free_me
 AOdp	|SSize_t|call_argv	|NN const char *sub_name		\
 				|I32 flags				\
 				|NN char **argv

diff --git a/embed.h b/embed.h
@@ -155,7 +155,8 @@
 # define block_start(a)                         Perl_block_start(aTHX_ a)
 # define bytes_cmp_utf8(a,b,c,d)                Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
 # define bytes_from_utf8(a,b,c)                 Perl_bytes_from_utf8(aTHX_ a,b,c)
-# define bytes_to_utf8(a,b)                     Perl_bytes_to_utf8(aTHX_ a,b)
+# define bytes_to_utf8(a,b)                     Perl_bytes_to_utf8(aTHX,a,b)
+# define bytes_to_utf8_free_me(a,b,c)           Perl_bytes_to_utf8_free_me(aTHX_ a,b,c)
 # define c9strict_utf8_to_uv                    Perl_c9strict_utf8_to_uv
 # define call_argv(a,b,c)                       Perl_call_argv(aTHX_ a,b,c)
 # define call_atexit(a,b)                       Perl_call_atexit(aTHX_ a,b)

diff --git a/proto.h b/proto.h
diff --git a/utf8.c b/utf8.c
@@ -3257,20 +3257,35 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
 }
 
 /*
-=for apidoc bytes_to_utf8
+=for apidoc      bytes_to_utf8
+=for apidoc_item bytes_to_utf8_free_me
 
-Converts a string C<s> of length C<*lenp> bytes from the native encoding into
-UTF-8.
-Returns a pointer to the newly-created string, and sets C<*lenp> to
-reflect the new length in bytes.  The caller is responsible for arranging for
-the memory used by this string to get freed.
+These each convert a string C<s> of length C<*lenp> bytes from the native
+encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to
+the UTF-8 string, and setting C<*lenp> to its length in bytes.  A result in
+newly allocated memory is always terminated by a C<NUL> character.
+
+They differ in that C<bytes_to_utf8_free_me> takes an extra parameter
+C<free_me>.  If that parameter is NULL, this function behaves identically to
+C<bytes_to_utf8>.
+
+But if not NULL, the function skips allocating new memory if the UTF-8
+representation of the input string is the same as its native representation.
+In other words it returns the input string if converting the string would be a
+no-op.  It sets C<*free_me> to NULL in that case.  Otherwise C<*free_me> is set
+to the address of the newly allocated memory.  Note that in both cases, you can
+pass C<*free_me> to C<L</Safefree>> and it will do the right thing.
+
+If new memory is allocated, it will be C<NUL>-terminated.
+
+Note that when new memory is allocated, the caller is responsible for arranging
+for that memory to get freed.  (This is transparent to the caller if
+C<Safefree> is called with C<*free_me>.)
 
 Upon return, the number of variants in the string can be computed by
 having saved the value of C<*lenp> before the call, and subtracting it from the
 after-call value of C<*lenp>.
 
-A C<NUL> character will be written after the end of the string.
-
 If you want to convert to UTF-8 from encodings other than
 the native (Latin1 or EBCDIC),
 see L</sv_recode_to_utf8>().
@@ -3279,17 +3294,28 @@ see L</sv_recode_to_utf8>().
 */
 
 U8*
-Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
+Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp,
+                                 const U8 ** free_me_ptr)
 {
+    PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME;
+    PERL_UNUSED_CONTEXT;
+
     const U8 * const send = s + (*lenp);
+    const Size_t variant_count = variant_under_utf8_count(s, send);
+
+    /* Return the input unchanged if the caller permits it (by passing a
+     * non-NULL 'free_me_ptr') and no characters differ when represented in
+     * UTF-8.  NUL-termination of the original is not checked here. */
+    if (free_me_ptr != NULL && variant_count == 0) {
+        *free_me_ptr = NULL;
+        return (U8 *) s;
+    }
+
     U8 *d;
     U8 *dst;
 
-    PERL_ARGS_ASSERT_BYTES_TO_UTF8;
-    PERL_UNUSED_CONTEXT;
-
     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
-    Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
+    Newx(d, (*lenp) + variant_count + 1, U8);
     dst = d;
 
     while (s < send) {
@@ -3298,7 +3324,11 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
     }
 
     *d = '\0';
-    *lenp = d-dst;
+    *lenp = d - dst;
+
+    if (free_me_ptr != NULL) {
+        *free_me_ptr = dst;
+    }
 
     return dst;
 }

diff --git a/utf8.h b/utf8.h
@@ -1330,6 +1330,7 @@ point's representation.
 
 #define Perl_is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
 
+#define Perl_bytes_to_utf8(mTHX, s, lenp)  Perl_bytes_to_utf8_free_me(aTHX_ s, lenp, NULL)
 typedef enum {
     PL_utf8_to_bytes_overwrite = 0,
     PL_utf8_to_bytes_new_memory,