From 1ed2ac374b4208daa9e8e9a7b819a28f16afa24e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Va=CC=81clav=20Slavi=CC=81k?= Date: Sat, 30 Mar 2024 11:44:29 +0100 Subject: [PATCH] Optimize CatalogItemsComparator for common case Prepare cache for faster comparison. ICU uses UTF-16 internally, we can significantly speed up comparisons by doing string conversion in advance, in O(n) time and space, if the platform's native representation is UTF-32 or -8 (which it is on non-Windows). Moreover, the additional processing of removing accelerators is also done only once on all platforms, resulting in massive speedups on Windows too. These two optimizations combined result in 10-20x speedup depending on platform. --- src/cat_sorting.cpp | 32 ++++++++++++++++++++++++-------- src/cat_sorting.h | 31 +++++++++++++++++++++++-------- src/catalog.h | 3 ++- src/str_helpers.h | 15 ++++++++++++++- src/unicode_helpers.h | 5 +++++ 5 files changed, 68 insertions(+), 18 deletions(-) diff --git a/src/cat_sorting.cpp b/src/cat_sorting.cpp index c8568c771d..37c2068cc4 100644 --- a/src/cat_sorting.cpp +++ b/src/cat_sorting.cpp @@ -94,6 +94,29 @@ CatalogItemsComparator::CatalogItemsComparator(const Catalog& catalog, const Sor m_collator.reset(new unicode::Collator(catalog.GetSourceLanguage(), unicode::Collator::case_insensitive)); break; } + + // Prepare cache for faster comparison. ICU uses UTF-16 internally, we can significantly + // speed up comparisons by doing string conversion in advance, in O(n) time and space, if + // the platform's native representation is UTF-32 or -8 (which it is on non-Windows). + // Moreover, the additional processing of removing accelerators is also done only once + // on all platforms, resulting in massive speedups on Windows too. + switch (m_order.by) + { + case SortOrder::By_Source: + m_sortKeys.reserve(m_catalog.GetCount()); + for (auto& i: m_catalog.items()) + m_sortKeys.push_back(ConvertToSortKey(i->GetString())); + break; + + case SortOrder::By_Translation: + m_sortKeys.reserve(m_catalog.GetCount()); + for (auto& i: m_catalog.items()) + m_sortKeys.push_back(ConvertToSortKey(i->GetTranslation())); + break; + + case SortOrder::By_FileOrder: + break; + } } @@ -156,16 +179,9 @@ bool CatalogItemsComparator::operator()(int i, int j) const } case SortOrder::By_Source: - { - auto r = CompareTranslationStrings(a.GetString(), b.GetString()); - if ( r != 0 ) - return r < 0; - break; - } - case SortOrder::By_Translation: { - auto r = CompareTranslationStrings(a.GetTranslation(), b.GetTranslation()); + auto r = m_collator->compare(m_sortKeys[i], m_sortKeys[j]); if ( r != 0 ) return r < 0; break; diff --git a/src/cat_sorting.h b/src/cat_sorting.h index fb84acd64a..a66bf913e2 100644 --- a/src/cat_sorting.h +++ b/src/cat_sorting.h @@ -62,6 +62,7 @@ struct SortOrder bool errorsFirst; }; + /** Comparator for sorting catalog items by different criteria. */ @@ -81,21 +82,35 @@ class CatalogItemsComparator protected: const CatalogItem& Item(int i) const { return *m_catalog[i]; } - unicode::Collator::result_type CompareTranslationStrings(wxString a, wxString b) const + // Pre-process given string and return it in a form efficient for comparing + // with ICU collator. This does two things: + // 1. Converts to UTF-16 (matters on non-Windows platforms where wchar_t is UTF-32) + // 2. Perform substitution of accelerator characters in the string + static str::UCharBuffer ConvertToSortKey(const wxString& a) { - a.Replace("&", ""); - a.Replace("_", ""); - - b.Replace("&", ""); - b.Replace("_", ""); - - return m_collator->compare(a, b); + if (a.find_first_of(L"&_") == wxString::npos) + { + return str::to_icu(a); + } + else + { + wxString a_(a); + a_.Replace("&", ""); + a_.Replace("_", ""); + + auto buf = str::to_icu(a_); + // on Windows to_icu() returns shallow view of the string, we need to make + // a deep copy because a_ is a local temporary variable: + buf.ensure_owned(); + return buf; + } } private: const Catalog& m_catalog; SortOrder m_order; std::unique_ptr m_collator; + std::vector m_sortKeys; }; diff --git a/src/catalog.h b/src/catalog.h index f360097883..045d18ede8 100644 --- a/src/catalog.h +++ b/src/catalog.h @@ -144,7 +144,8 @@ class CatalogItem unsigned GetPluralFormsCount() const; /// Returns the nth-translation. - wxString GetTranslation(unsigned n = 0) const; + wxString GetTranslation(unsigned n) const; + const wxString& GetTranslation() const { return m_translations[0]; } /// Returns all translations. const wxArrayString& GetTranslations() const { return m_translations; } diff --git a/src/str_helpers.h b/src/str_helpers.h index a9c5c9970d..a7d8b51ac9 100644 --- a/src/str_helpers.h +++ b/src/str_helpers.h @@ -194,9 +194,22 @@ class UCharBuffer operator const UChar*() const { return m_data; } UChar* data() { return const_cast(m_data); } - // available buffer size, only for owned versions, returns 0 for read-only non-owned + /// Available buffer size, only for owned versions, returns 0 for read-only non-owned int32_t capacity() { return m_capacity; } + /// Ensure the buffer has a deep copy of the data, if it's not already owned + void ensure_owned() + { + if (m_owned) + return; + if (m_capacity == -1) + m_capacity = u_strlen(m_data) + 1; + auto copy = new UChar[m_capacity]; + memcpy(copy, m_data, m_capacity * sizeof(UChar)); + m_data = copy; + m_owned = true; + } + private: UCharBuffer(bool owned, const UChar *data, int32_t capacity) : m_owned(owned), m_data(data), m_capacity(capacity) {} diff --git a/src/unicode_helpers.h b/src/unicode_helpers.h index 340d5437ab..1c9709f243 100644 --- a/src/unicode_helpers.h +++ b/src/unicode_helpers.h @@ -87,6 +87,11 @@ class Collator return ucol_strcoll(m_coll, str::to_icu(a), -1, str::to_icu(b), -1); } + result_type compare(const UChar *a, const UChar *b) const + { + return ucol_strcoll(m_coll, a, -1, b, -1); + } + template bool operator()(const T& a, const T& b) const {