From f7257b9f0fbcf0d9f7abc90cc54d85b92cb1784c Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 12 Oct 2024 05:28:33 -0600 Subject: [PATCH] refcounted_he_(new|fetch)_pvn: Don't roll our own code The new function utf8_to_bytes_temp_pv() does a better job than these code sections that are nearly duplicates of each other. It's better for several reasons; for example, for long keys it checks a word at a time whether the key is downgradable. This is a follow-on to #22638, which was closed without the original commits that this replaces. --- hv.c | 72 +++++++----------------------------------------------------- 1 file changed, 8 insertions(+), 64 deletions(-) diff --git a/hv.c b/hv.c index 1703084d7dd6..30eafc274c72 100644 --- a/hv.c +++ b/hv.c @@ -3686,39 +3686,11 @@ Perl_refcounted_he_fetch_pvn(pTHX_ const struct refcounted_he *chain, (UV)flags); if (!chain) goto ret; - if (flags & REFCOUNTED_HE_KEY_UTF8) { - /* For searching purposes, canonicalise to Latin-1 where possible. */ - const char *keyend = keypv + keylen, *p; - STRLEN nonascii_count = 0; - for (p = keypv; p != keyend; p++) { - if (! UTF8_IS_INVARIANT(*p)) { - if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, keyend)) { - goto canonicalised_key; - } - nonascii_count++; - p++; - } - } - if (nonascii_count) { - char *q; - const char *p = keypv, *keyend = keypv + keylen; - keylen -= nonascii_count; - Newx(q, keylen, char); - SAVEFREEPV(q); - keypv = q; - for (; p != keyend; p++, q++) { - U8 c = (U8)*p; - if (UTF8_IS_INVARIANT(c)) { - *q = (char) c; - } - else { - p++; - *q = (char) EIGHT_BIT_UTF8_TO_NATIVE(c, *p); - } - } - } + /* For searching purposes, canonicalise to Latin-1 where possible. */ + if ( flags & REFCOUNTED_HE_KEY_UTF8 + && utf8_to_bytes_temp_pv(&keypv, &keylen)) + { flags &= ~REFCOUNTED_HE_KEY_UTF8; - canonicalised_key: ; } utf8_flag = (flags & REFCOUNTED_HE_KEY_UTF8) ? 
HVhek_UTF8 : 0; if (!hash) @@ -3861,39 +3833,11 @@ Perl_refcounted_he_new_pvn(pTHX_ struct refcounted_he *parent, } hekflags = value_type; - if (flags & REFCOUNTED_HE_KEY_UTF8) { - /* Canonicalise to Latin-1 where possible. */ - const char *keyend = keypv + keylen, *p; - STRLEN nonascii_count = 0; - for (p = keypv; p != keyend; p++) { - if (! UTF8_IS_INVARIANT(*p)) { - if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, keyend)) { - goto canonicalised_key; - } - nonascii_count++; - p++; - } - } - if (nonascii_count) { - char *q; - const char *p = keypv, *keyend = keypv + keylen; - keylen -= nonascii_count; - Newx(q, keylen, char); - SAVEFREEPV(q); - keypv = q; - for (; p != keyend; p++, q++) { - U8 c = (U8)*p; - if (UTF8_IS_INVARIANT(c)) { - *q = (char) c; - } - else { - p++; - *q = (char) EIGHT_BIT_UTF8_TO_NATIVE(c, *p); - } - } - } + /* Canonicalise to Latin-1 where possible. */ + if ( (flags & REFCOUNTED_HE_KEY_UTF8) + && utf8_to_bytes_temp_pv(&keypv, &keylen)) + { flags &= ~REFCOUNTED_HE_KEY_UTF8; - canonicalised_key: ; } if (flags & REFCOUNTED_HE_KEY_UTF8) hekflags |= HVhek_UTF8;