diff --git a/utf8.c b/utf8.c index 2b89fae4314e..5dbee8d4047d 100644 --- a/utf8.c +++ b/utf8.c @@ -3269,29 +3269,19 @@ They differ in that C<bytes_to_utf8_free_me> takes an extra parameter C<free_me>. If that parameter is NULL, this function behaves identically to C<bytes_to_utf8>. -But if not NULL, the function skips allocating new memory if the input string -already is C<NUL>-terminated, and its UTF-8 representation is the same as its -native representation. In other words it returns the input string if -converting the string would be a no-op. It sets C<*free_me> to NULL in that -case. Otherwise C<*free_me> is set to the address of the newly allocated -memory. Note that in both cases, you can pass that result to C<L</Safefree>> -and it will do the right thing. +But if not NULL, the function skips allocating new memory if the UTF-8 +representation of the input string is the same as its native representation. +In other words it returns the input string if converting the string would be a +no-op. It sets C<*free_me> to NULL in that case. Otherwise C<*free_me> is set +to the address of the newly allocated memory. Note that in both cases, you can +pass that result to C<L</Safefree>> and it will do the right thing. + +If new memory is allocated, it will be C<NUL>-terminated. Note that when new memory is allocated, the caller is responsible for arranging for that memory to get freed. (This is transparent to the caller if C is called with C.) -The two forms have subtle differences in trailing C<NUL> handling. -C<bytes_to_utf8> does not look for any trailing C<NUL>. Instead it -overallocates space for the copy by 1 byte and adds a C<NUL> to the end of it. -If the input C<*lenp> included a C<NUL>, there will be two trailing C<NUL> -characters; otherwise just 1. There is code that depends on this behavior. -C<bytes_to_utf8_free_me> has to look for a trailing C<NUL> in order to be able -to guarantee that the result has one if no copy is made. If the input C<*lenp> -doesn't include any C<NUL> character at the end, this form has to assume that -one doesn't exist, and will create a copy. 
Only if C<*lenp> does include the -C<NUL>, does this form check to see if a copy can be avoided. - Upon return, the number of variants in the string can be computed by having saved the value of C<*lenp> before the call, and subtracting it from the after-call value of C<*lenp>. @@ -3310,15 +3300,13 @@ Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp, PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME; PERL_UNUSED_CONTEXT; - const Size_t len = *lenp; - const U8 * const send = s + len; + const U8 * const send = s + (*lenp); const Size_t variant_count = variant_under_utf8_count(s, send); - const bool has_trailing_NUL = free_me_ptr && len > 0 && *(send - 1) == '\0'; /* Return the input unchanged if the flag indicates to do so, and there * are no characters that differ when represented in UTF-8, and the * original is NUL-terminated */ - if (free_me_ptr != NULL && variant_count == 0 && has_trailing_NUL) { + if (free_me_ptr != NULL && variant_count == 0) { *free_me_ptr = NULL; return (U8 *) s; } @@ -3326,13 +3314,11 @@ Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp, U8 *d; U8 *dst; - /* 1 for each byte except any trailing NUL - * + 1 for each byte that expands to two - * + 1 for the trailing NUL */ - Newx(d, len - has_trailing_NUL + variant_count + 1, U8); + /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ + Newx(d, (*lenp) + variant_count + 1, U8); dst = d; - while (s < send - has_trailing_NUL) { + while (s < send) { append_utf8_from_native_byte(*s, &d); s++; }