From 0f6d35538fb51d95a7aafc5a7ef3a4730e3d77e2 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 13 Jul 2024 10:22:33 -0400 Subject: [PATCH] `frequency`: fix column with all unique values detection; remove unneeded utf8 conversion --- src/cmd/frequency.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/cmd/frequency.rs b/src/cmd/frequency.rs index 869f16053..c386614bd 100644 --- a/src/cmd/frequency.rs +++ b/src/cmd/frequency.rs @@ -242,12 +242,20 @@ impl Args { let unique_counts_len = counts.len(); if self.flag_lmt_threshold == 0 || self.flag_lmt_threshold >= unique_counts_len { // check if the column has all unique values - // by checking if counts length is equal to ftable length + // do this by looking at the counts vec + // and seeing if it has a count of 1, indicating all unique values + let all_unique = counts[if self.flag_asc { + unique_counts_len - 1 + } else { + 0 + }] + .1 == 1; + let abs_limit = self.flag_limit.unsigned_abs(); - let unique_limited = if self.flag_limit > 0 + let unique_limited = if all_unique + && self.flag_limit > 0 && self.flag_unq_limit != abs_limit && self.flag_unq_limit > 0 - && unique_counts_len == ftab.len() { counts.truncate(self.flag_unq_limit); true @@ -435,13 +443,9 @@ impl Args { if self.flag_no_trim { // case-sensitive, don't trim whitespace for (i, field) in nsel.select(row_buffer.into_iter()).enumerate() { - field_buffer = { - if let Ok(s) = simdutf8::basic::from_utf8(field) { - s.as_bytes().to_vec() - } else { - field.to_vec() - } - }; + // no need to convert to string and back to bytes for a "case-sensitive" + // comparison - we can just use the field directly + field_buffer = field.to_vec(); // safety: we do get_unchecked_mut on freq_tables for the same reason above if !field_buffer.is_empty() {