Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new function bytes_to_utf8_free_me #22823

Open
wants to merge 2 commits into
base: blead
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion embed.fnc
Original file line number Diff line number Diff line change
Expand Up @@ -794,8 +794,12 @@ Adp |int |bytes_cmp_utf8 |NN const U8 *b \
Adp |U8 * |bytes_from_utf8|NN const U8 *s \
|NN STRLEN *lenp \
|NN bool *is_utf8p
Adp |U8 * |bytes_to_utf8 |NN const U8 *s \
Admp |U8 * |bytes_to_utf8 |NN const U8 *s \
|NN STRLEN *lenp
Adp |U8 * |bytes_to_utf8_free_me \
|NN const U8 *s \
|NN STRLEN *lenp \
|NULLOK const U8 **free_me
AOdp |SSize_t|call_argv |NN const char *sub_name \
|I32 flags \
|NN char **argv
Expand Down
3 changes: 2 additions & 1 deletion embed.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,8 @@
# define block_start(a) Perl_block_start(aTHX_ a)
# define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
# define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c)
# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
# define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX,a,b)
# define bytes_to_utf8_free_me(a,b,c) Perl_bytes_to_utf8_free_me(aTHX_ a,b,c)
# define c9strict_utf8_to_uv Perl_c9strict_utf8_to_uv
# define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c)
# define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b)
Expand Down
7 changes: 5 additions & 2 deletions proto.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

58 changes: 44 additions & 14 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -3257,20 +3257,35 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
}

/*
=for apidoc bytes_to_utf8
=for apidoc bytes_to_utf8
=for apidoc_item bytes_to_utf8_free_me

Converts a string C<s> of length C<*lenp> bytes from the native encoding into
UTF-8.
Returns a pointer to the newly-created string, and sets C<*lenp> to
reflect the new length in bytes. The caller is responsible for arranging for
the memory used by this string to get freed.
These each convert a string C<s> of length C<*lenp> bytes from the native
encoding into UTF-8 (UTF-EBCDIC on EBCDIC platforms), returning a pointer to
the UTF-8 string, and setting C<*lenp> to its length in bytes. Whenever the
returned string is newly allocated, it is terminated by a C<NUL> character.

They differ in that C<bytes_to_utf8_free_me> takes an extra parameter
C<free_me>. If that parameter is NULL, this function behaves identically to
C<bytes_to_utf8>.

But if not NULL, the function skips allocating new memory if the UTF-8
representation of the input string is the same as its native representation.
In other words, it returns the input string if converting the string would be a
no-op, and it sets C<*free_me> to NULL in that case. Otherwise C<*free_me> is
set to the address of the newly allocated memory. Note that in both cases, you
can pass C<*free_me> (not the function's return value) to C<L</Safefree>> and
it will do the right thing.

If new memory is allocated, it will be C<NUL>-terminated.

Note that when new memory is allocated, the caller is responsible for arranging
for that memory to get freed. (This is transparent to the caller if
C<Safefree> is called with C<*free_me>.)

Upon return, the number of variants in the string can be computed by
having saved the value of C<*lenp> before the call, and subtracting it from the
after-call value of C<*lenp>.

A C<NUL> character will be written after the end of the string.

If you want to convert to UTF-8 from encodings other than
the native (Latin1 or EBCDIC),
see L</sv_recode_to_utf8>().
Expand All @@ -3279,17 +3294,28 @@ see L</sv_recode_to_utf8>().
*/

U8*
Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
Perl_bytes_to_utf8_free_me(pTHX_ const U8 *s, Size_t *lenp,
const U8 ** free_me_ptr)
{
PERL_ARGS_ASSERT_BYTES_TO_UTF8_FREE_ME;
PERL_UNUSED_CONTEXT;

const U8 * const send = s + (*lenp);
const Size_t variant_count = variant_under_utf8_count(s, send);

/* Return the input unchanged if the caller passed a non-NULL free_me_ptr
* and there are no characters that differ when represented in UTF-8.
* NOTE: nothing here checks that the original is NUL-terminated. */
if (free_me_ptr != NULL && variant_count == 0) {
*free_me_ptr = NULL;
return (U8 *) s;
}

U8 *d;
U8 *dst;

PERL_ARGS_ASSERT_BYTES_TO_UTF8;
PERL_UNUSED_CONTEXT;

/* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
Newx(d, (*lenp) + variant_count + 1, U8);
dst = d;

while (s < send) {
Expand All @@ -3298,7 +3324,11 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
}

*d = '\0';
*lenp = d-dst;
*lenp = d - dst;

if (free_me_ptr != NULL) {
*free_me_ptr = dst;
}

return dst;
}
Expand Down
1 change: 1 addition & 0 deletions utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -1330,6 +1330,7 @@ point's representation.

#define Perl_is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)

/* Perl_bytes_to_utf8 is implemented as a macro that forwards to
 * Perl_bytes_to_utf8_free_me(), passing NULL as the final (free_me)
 * argument.  The first macro parameter (mTHX) is not referenced in the
 * expansion; the context is supplied by aTHX_ instead. */
#define Perl_bytes_to_utf8(mTHX, s, lenp) Perl_bytes_to_utf8_free_me(aTHX_ s, lenp, NULL)
typedef enum {
PL_utf8_to_bytes_overwrite = 0,
PL_utf8_to_bytes_new_memory,
Expand Down
Loading