From 0f6d35538fb51d95a7aafc5a7ef3a4730e3d77e2 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 13 Jul 2024 10:22:33 -0400 Subject: [PATCH 1/2] `frequency`: fix column with all unique values detection; remove unneeded utf8 conversion --- src/cmd/frequency.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/cmd/frequency.rs b/src/cmd/frequency.rs index 869f16053..c386614bd 100644 --- a/src/cmd/frequency.rs +++ b/src/cmd/frequency.rs @@ -242,12 +242,20 @@ impl Args { let unique_counts_len = counts.len(); if self.flag_lmt_threshold == 0 || self.flag_lmt_threshold >= unique_counts_len { // check if the column has all unique values - // by checking if counts length is equal to ftable length + // do this by looking at the counts vec + // and see if it has a count of 1, indicating all unique values + let all_unique = counts[if self.flag_asc { + unique_counts_len - 1 + } else { + 0 + }] + .1 == 1; + let abs_limit = self.flag_limit.unsigned_abs(); - let unique_limited = if self.flag_limit > 0 + let unique_limited = if all_unique + && self.flag_limit > 0 && self.flag_unq_limit != abs_limit && self.flag_unq_limit > 0 - && unique_counts_len == ftab.len() { counts.truncate(self.flag_unq_limit); true @@ -435,13 +443,9 @@ impl Args { if self.flag_no_trim { // case-sensitive, don't trim whitespace for (i, field) in nsel.select(row_buffer.into_iter()).enumerate() { - field_buffer = { - if let Ok(s) = simdutf8::basic::from_utf8(field) { - s.as_bytes().to_vec() - } else { - field.to_vec() - } - }; + // no need to convert to string and back to bytes for a "case-sensitive" + // comparison we can just use the field directly + field_buffer = field.to_vec(); // safety: we do get_unchecked_mut on freq_tables for the same reason above if !field_buffer.is_empty() { From 6d97805d742b6e0a4bb030a3e53dda4e96cbc3e7 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 13 Jul 2024 10:23:12 -0400 Subject: [PATCH 2/2] `tests`: add test data for issue 1962 --- resources/test/data1962.csv | 301 ++++++++++++++++++++++++++++++++++++ tests/test_frequency.rs | 70 +++++++++ 2 files changed, 371 insertions(+) create mode 100644 resources/test/data1962.csv diff --git a/resources/test/data1962.csv b/resources/test/data1962.csv new file mode 100644 index 000000000..57a8a6ad3 --- /dev/null +++ b/resources/test/data1962.csv @@ -0,0 +1,301 @@ +year +2001 +2002 +2002 +2003 +2003 +2003 +2004 +2004 +2004 +2004 +2005 +2005 +2005 +2005 +2005 +2006 +2006 +2006 +2006 +2006 +2006 +2007 +2007 +2007 +2007 +2007 +2007 +2007 +2008 +2008 +2008 +2008 +2008 +2008 +2008 +2008 +2009 +2009 +2009 +2009 +2009 +2009 +2009 +2009 +2009 +2010 +2010 +2010 +2010 +2010 +2010 +2010 +2010 +2010 +2010 +2011 +2011 +2011 +2011 +2011 +2011 +2011 +2011 +2011 +2011 +2011 +2012 +2012 +2012 +2012 +2012 +2012 +2012 +2012 +2012 +2012 +2012 +2012 +2013 +2013 +2013 +2013 +2013 +2013 +2013 +2013 +2013 +2013 +2013 +2013 +2013 +2014 +2014 +2014 +2014 +2014 +2014 +2014 +2014 +2014 +2014 +2014 +2014 +2014 +2014 +2015 +2015 +2015 +2015 +2015 +2015 +2015 +2015 +2015 +2015 +2015 +2015 +2015 +2015 +2015 +2016 +2016 +2016 +2016 +2016 +2016 +2016 +2016 +2016 +2016 +2016 +2016 +2016 +2016 +2016 +2016 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2017 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2018 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2019 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2020 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2021 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2022 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2023 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 +2024 diff --git a/tests/test_frequency.rs b/tests/test_frequency.rs index f0946c1f8..b583a5b7b 100644 --- a/tests/test_frequency.rs +++ b/tests/test_frequency.rs @@ -446,6 +446,76 @@ fn frequency_select() { assert_eq!(got, expected); } +#[test] +fn frequency_all_unique() { + let wrk = Workdir::new("frequency_all_unique"); + let testdata = wrk.load_test_file("boston311-100.csv"); + let mut cmd = wrk.command("frequency"); + cmd.args(["--select", "1"]).arg(testdata); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["field", "value", "count", "percentage"], + svec!["case_enquiry_id", "101004113298", "1", "1"], + svec!["case_enquiry_id", "101004113313", "1", "1"], + svec!["case_enquiry_id", "101004113348", "1", "1"], + svec!["case_enquiry_id", "101004113363", "1", "1"], + svec!["case_enquiry_id", "101004113371", "1", "1"], + svec!["case_enquiry_id", "101004113385", "1", "1"], + svec!["case_enquiry_id", "101004113386", "1", "1"], + svec!["case_enquiry_id", "101004113391", "1", "1"], + svec!["case_enquiry_id", "101004113394", "1", "1"], + svec!["case_enquiry_id", "101004113403", "1", "1"], + svec!["case_enquiry_id", "Other (90)", "90", "90"], + ]; + assert_eq!(got, expected); +} + +#[test] +fn frequency_issue1962() { + let wrk = Workdir::new("frequency_1962"); + let testdata = wrk.load_test_file("data1962.csv"); + let mut cmd = wrk.command("frequency"); + cmd.args(["--limit", "15"]).arg(testdata.clone()); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["field", "value", "count", "percentage"], + svec!["year", "2024", "24", "8"], + svec!["year", "2023", "23", "7.66667"], + svec!["year", "2022", "22", "7.33333"], + svec!["year", "2021", "21", "7"], + svec!["year", "2020", "20", "6.66667"], + svec!["year", "2019", "19", "6.33333"], + svec!["year", "2018", "18", "6"], + svec!["year", "2017", "17", "5.66667"], + svec!["year", "2016", "16", "5.33333"], + svec!["year", "2015", "15", "5"], + svec!["year", "2014", "14", "4.66667"], + svec!["year", "2013", "13", "4.33333"], + svec!["year", "2012", "12", "4"], + svec!["year", "2011", "11", "3.66667"], + svec!["year", "2010", "10", "3.33333"], + svec!["year", "Other (9)", "45", "15"], + ]; + assert_eq!(got, expected); + + let mut cmd = wrk.command("frequency"); + cmd.args(["--limit", "5"]).arg(testdata); + + let got: Vec> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["field", "value", "count", "percentage"], + svec!["year", "2024", "24", "8"], + svec!["year", "2023", "23", "7.66667"], + svec!["year", "2022", "22", "7.33333"], + svec!["year", "2021", "21", "7"], + svec!["year", "2020", "20", "6.66667"], + svec!["year", "Other (19)", "190", "63.33333"], + ]; + assert_eq!(got, expected); +} + // This tests that a frequency table computed by `qsv` is always the same // as the frequency table computed in memory. #[test]