Skip to content

Commit

Permalink
Add new function bytes_to_utf8_free_me
Browse files Browse the repository at this point in the history
This is like bytes_to_utf8, but if the representation of the input
string is the same in UTF-8 as it is in native format, the allocation of
new memory is skipped.

This presents optimization possibilities.
  • Loading branch information
khwilliamson committed Dec 16, 2024
1 parent df4d5e8 commit 5d31895
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 22 deletions.
6 changes: 5 additions & 1 deletion embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -794,8 +794,12 @@ Adp |int |bytes_cmp_utf8 |NN const U8 *b \
Adp |U8 * |bytes_from_utf8|NN const U8 *s \
|NN STRLEN *lenp \
|NN bool *is_utf8p
Adp |U8 * |bytes_to_utf8 |NN const U8 *s \
Admp |U8 * |bytes_to_utf8 |NN const U8 *s \
|NN STRLEN *lenp
Adp |U8 * |bytes_to_utf8_free_me \
|NN const U8 *s \
|NN STRLEN *lenp \
|NULLOK const U8 **free_me
AOdp |SSize_t|call_argv |NN const char *sub_name \
|I32 flags \
|NN char **argv
Expand Down
3 changes: 2 additions & 1 deletion embed.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@
# define block_start(a) Perl_block_start(aTHX_ a)
# define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
# define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c)
# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX,a,b)
# define bytes_to_utf8_free_me(a,b,c) Perl_bytes_to_utf8_free_me(aTHX_ a,b,c)
# define c9strict_utf8_to_uv Perl_c9strict_utf8_to_uv
# define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c)
# define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b)
Expand Down
7 changes: 5 additions & 2 deletions proto.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

80 changes: 62 additions & 18 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -3182,20 +3182,45 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
}

/*
=for apidoc bytes_to_utf8
Converts a string C<s> of length C<*lenp> bytes from the native encoding into
UTF-8.
Returns a pointer to the newly-created string, and sets C<*lenp> to
reflect the new length in bytes. The caller is responsible for arranging for
the memory used by this string to get freed.
=for apidoc bytes_to_utf8
=for apidoc_item bytes_to_utf8_free_me
These each convert a string C<s> of length C<*lenp> bytes from the native
encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to
the UTF-8 string, and setting C<*lenp> to its length in bytes, while making
sure that the string is terminated by a C<NUL> character.
They differ in that C<bytes_to_utf8_free_me> takes an extra parameter
C<free_me>. If that parameter is NULL, this function behaves identically to
C<bytes_to_utf8>.
But if not NULL, the function skips allocating new memory if the input string
already is C<NUL>-terminated, and its UTF-8 representation is the same as its
native representation. In other words, it returns the input string if
converting the string would be a no-op. It sets C<*free_me> to NULL in that
case. Otherwise C<*free_me> is set to the address of the newly allocated
memory. Note that in both cases, you can pass C<*free_me> (not the function's
return value, which may be the input string itself) to C<L</Safefree>> and it
will do the right thing.
Note that when new memory is allocated, the caller is responsible for arranging
for that memory to get freed. (The caller does not need to track whether an
allocation happened if it simply calls C<Safefree> on C<*free_me>.)
The two forms have subtle differences in trailing C<NUL> handling.
C<bytes_to_utf8> does not look for any trailing C<NUL>. Instead it
overallocates space for the copy by 1 byte and adds a C<NUL> to the end of it.
If the C<*lenp> input bytes already ended with a C<NUL>, there will be two
trailing C<NUL> characters; otherwise just 1. There is code that depends on
this behavior.
C<bytes_to_utf8_free_me> has to look for a trailing C<NUL> in order to be able
to guarantee that the result has one if no copy is made. If the last of the
C<*lenp> input bytes is not a C<NUL>, this form has to assume that no
terminating C<NUL> exists, and will create a copy. Only if the last of the
C<*lenp> input bytes is a C<NUL> does this form check to see if a copy can be
avoided.
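Upon return, the number of variants in the string can be computed by
saving the value of C<*lenp> before the call, and subtracting it from the
after-call value of C<*lenp>. (With C<bytes_to_utf8_free_me> and a non-NULL
C<free_me>, if the input C<*lenp> counted a trailing C<NUL> and a copy was
made, the returned C<*lenp> does not count that C<NUL>, so the difference is
one less than the number of variants.)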
A C<NUL> character will be written after the end of the string.
If you want to convert to UTF-8 from encodings other than
the native (Latin1 or EBCDIC),
see L</sv_recode_to_utf8>().
Expand All @@ -3204,26 +3229,45 @@ see L</sv_recode_to_utf8>().
*/

/* NOTE(review): this span appears to be a rendered diff in which the removed
 * lines of the old Perl_bytes_to_utf8() and the added lines of
 * Perl_bytes_to_utf8_free_me() are interleaved (two signatures, two 'send'
 * declarations, two Newx() calls, two 'while' headers, two '*lenp'
 * assignments).  The comments added below describe the
 * Perl_bytes_to_utf8_free_me() lines. */
U8*
Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp,
const U8 ** free_me_ptr)
{
const U8 * const send = s + (*lenp);
PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME;
PERL_UNUSED_CONTEXT;

/* Snapshot the input length; *lenp is overwritten with the output length
 * further down */
const Size_t len = *lenp;
const U8 * const send = s + len;
/* Computed once and used both for the no-copy test and for sizing the
 * allocation */
const Size_t variant_count = variant_under_utf8_count(s, send);
/* This is false whenever free_me_ptr is NULL, so in that case a trailing
 * NUL in the input is never looked for nor stripped; 'len > 0' guards the
 * read of send[-1] */
const bool has_trailing_NUL = free_me_ptr && len > 0 && *(send - 1) == '\0';

/* Return the input unchanged if the flag indicates to do so, and there
 * are no characters that differ when represented in UTF-8, and the
 * original is NUL-terminated */
/* On this path nothing is allocated and *lenp is left as the caller passed
 * it */
if (free_me_ptr != NULL && variant_count == 0 && has_trailing_NUL) {
*free_me_ptr = NULL;
return (U8 *) s;
}

U8 *d;
U8 *dst;

PERL_ARGS_ASSERT_BYTES_TO_UTF8;
PERL_UNUSED_CONTEXT;

/* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
/* 1 for each byte except any trailing NUL
 * + 1 for each byte that expands to two
 * + 1 for the trailing NUL */
/* has_trailing_NUL is a bool used arithmetically here (0 or 1) */
Newx(d, len - has_trailing_NUL + variant_count + 1, U8);
/* Remember the start of the buffer; 'd' is advanced as bytes are appended */
dst = d;

while (s < send) {
/* Stop one byte early when the input's final NUL was detected, so that NUL
 * is not copied; the terminator is written explicitly after the loop */
while (s < send - has_trailing_NUL) {
append_utf8_from_native_byte(*s, &d);
s++;
}

/* Terminate the output; this NUL is not counted in the length stored next */
*d = '\0';
*lenp = d-dst;
*lenp = d - dst;

/* Hand the address of the new buffer back through free_me_ptr when the
 * caller supplied one */
if (free_me_ptr != NULL) {
*free_me_ptr = dst;
}

return dst;
}
Expand Down
1 change: 1 addition & 0 deletions utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -1328,6 +1328,7 @@ point's representation.

#define Perl_is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)

#define Perl_bytes_to_utf8(mTHX, s, lenp) Perl_bytes_to_utf8_free_me(aTHX_ s, lenp, NULL)
typedef enum {
PL_utf8_to_bytes_overwrite = 0,
PL_utf8_to_bytes_new_memory,
Expand Down

0 comments on commit 5d31895

Please sign in to comment.