Skip to content

Commit

Permalink
refcounted_he_(new|fetch)_pvn: Don't roll our own code
Browse files Browse the repository at this point in the history
The new function utf8_to_bytes_temp_pv() does a better job than these two
code sections, which are near-duplicates of each other.

It is better for several reasons; for example, for long keys it checks a
word at a time whether the key is downgradable.

This is a follow-on to #22638, which was closed without the original
commits that this one replaces being applied.
  • Loading branch information
khwilliamson committed Dec 3, 2024
1 parent 45e19c8 commit f7257b9
Showing 1 changed file with 8 additions and 64 deletions.
72 changes: 8 additions & 64 deletions hv.c
Original file line number Diff line number Diff line change
Expand Up @@ -3686,39 +3686,11 @@ Perl_refcounted_he_fetch_pvn(pTHX_ const struct refcounted_he *chain,
(UV)flags);
if (!chain)
goto ret;
if (flags & REFCOUNTED_HE_KEY_UTF8) {
/* For searching purposes, canonicalise to Latin-1 where possible. */
const char *keyend = keypv + keylen, *p;
STRLEN nonascii_count = 0;
for (p = keypv; p != keyend; p++) {
if (! UTF8_IS_INVARIANT(*p)) {
if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, keyend)) {
goto canonicalised_key;
}
nonascii_count++;
p++;
}
}
if (nonascii_count) {
char *q;
const char *p = keypv, *keyend = keypv + keylen;
keylen -= nonascii_count;
Newx(q, keylen, char);
SAVEFREEPV(q);
keypv = q;
for (; p != keyend; p++, q++) {
U8 c = (U8)*p;
if (UTF8_IS_INVARIANT(c)) {
*q = (char) c;
}
else {
p++;
*q = (char) EIGHT_BIT_UTF8_TO_NATIVE(c, *p);
}
}
}
/* For searching purposes, canonicalise to Latin-1 where possible. */
if ( flags & REFCOUNTED_HE_KEY_UTF8
&& utf8_to_bytes_temp_pv(&keypv, &keylen))
{
flags &= ~REFCOUNTED_HE_KEY_UTF8;
canonicalised_key: ;
}
utf8_flag = (flags & REFCOUNTED_HE_KEY_UTF8) ? HVhek_UTF8 : 0;
if (!hash)
Expand Down Expand Up @@ -3861,39 +3833,11 @@ Perl_refcounted_he_new_pvn(pTHX_ struct refcounted_he *parent,
}
hekflags = value_type;

if (flags & REFCOUNTED_HE_KEY_UTF8) {
/* Canonicalise to Latin-1 where possible. */
const char *keyend = keypv + keylen, *p;
STRLEN nonascii_count = 0;
for (p = keypv; p != keyend; p++) {
if (! UTF8_IS_INVARIANT(*p)) {
if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, keyend)) {
goto canonicalised_key;
}
nonascii_count++;
p++;
}
}
if (nonascii_count) {
char *q;
const char *p = keypv, *keyend = keypv + keylen;
keylen -= nonascii_count;
Newx(q, keylen, char);
SAVEFREEPV(q);
keypv = q;
for (; p != keyend; p++, q++) {
U8 c = (U8)*p;
if (UTF8_IS_INVARIANT(c)) {
*q = (char) c;
}
else {
p++;
*q = (char) EIGHT_BIT_UTF8_TO_NATIVE(c, *p);
}
}
}
/* Canonicalise to Latin-1 where possible. */
if ( (flags & REFCOUNTED_HE_KEY_UTF8)
&& utf8_to_bytes_temp_pv(&keypv, &keylen))
{
flags &= ~REFCOUNTED_HE_KEY_UTF8;
canonicalised_key: ;
}
if (flags & REFCOUNTED_HE_KEY_UTF8)
hekflags |= HVhek_UTF8;
Expand Down

0 comments on commit f7257b9

Please sign in to comment.