From 9fb16abce15391314fa61fdd609fcfdae2b2d5e7 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sat, 28 Dec 2024 11:09:16 -0500 Subject: [PATCH 01/12] chore: initial check-in of expanded `stats` to accomodate `outlier` command reqts --- src/cmd/mod.rs | 1 + src/cmd/outliers.rs | 406 +++++++++++++++++++++++------------------ src/cmd/stats.rs | 61 ++++++- src/main.rs | 2 + tests/test_outliers.rs | 161 ++++++++++++++++ tests/tests.rs | 1 + 6 files changed, 445 insertions(+), 187 deletions(-) create mode 100644 tests/test_outliers.rs diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs index 6b1ea16ab..8bc8917b2 100644 --- a/src/cmd/mod.rs +++ b/src/cmd/mod.rs @@ -59,6 +59,7 @@ pub mod jsonl; pub mod lens; #[cfg(feature = "luau")] pub mod luau; +pub mod outliers; #[cfg(any(feature = "feature_capable", feature = "lite"))] pub mod partition; #[cfg(all( diff --git a/src/cmd/outliers.rs b/src/cmd/outliers.rs index fec7d97fe..eb3701296 100644 --- a/src/cmd/outliers.rs +++ b/src/cmd/outliers.rs @@ -19,8 +19,6 @@ outliers options: Common options: -h, --help Display this message -o, --output Write output to instead of stdout. - -n, --no-headers When set, the first row will not be interpreted - as headers. -d, --delimiter The field delimiter for reading CSV data. Must be a single character. (default: ,) @@ -42,26 +40,49 @@ Examples: qsv outliers -m both -q data.csv "#; -use polars::prelude::*; -use std::collections::HashMap; +use std::{collections::HashMap, fs::File, io, path::Path, str}; + +use csv::{ByteRecord, Reader}; use indicatif::{ProgressBar, ProgressStyle}; +use serde::Deserialize; + +use crate::{ + cmd::stats::StatsData, + config::{Config, Delimiter}, + select::SelectColumns, + util, + util::{get_stats_records, StatsMode}, + CliResult, +}; + +#[derive(Deserialize)] +struct Args { + arg_input: Option, + flag_select: SelectColumns, + flag_method: Option, + flag_force: bool, + flag_quiet: bool, + flag_delimiter: Option, + flag_output: Option, +} #[derive(Debug)] struct OutlierResult { - column: String, - data_type: String, - outlier_count: usize, + column: String, + data_type: String, + outlier_count: usize, outlier_details: Vec, } #[derive(Debug)] struct OutlierDetail { - value: String, - reason: String, + value: String, + reason: String, fence_type: FenceType, // inner or outer + record_no: u64, // Add this field } -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Clone)] enum FenceType { Inner, Outer, @@ -85,17 +106,38 @@ fn is_outlier(value: f64, lower_fence: f64, upper_fence: f64) -> bool { } fn process_outliers( - df: &DataFrame, + // rdr: &mut Reader>, + rdr: &mut Reader>, // Add + Send trait bound stats: &[StatsData], method: FenceType, quiet: bool, ) -> CliResult> { - let mut results = Vec::new(); + let mut results: Vec = stats + .iter() + .map(|stat| OutlierResult { + column: stat.field.clone(), + data_type: stat.r#type.clone(), + outlier_count: 0, + outlier_details: Vec::new(), + }) + .collect(); + + eprintln!("results: {:#?}", results); + + // Create index map for column positions + let headers = rdr.headers()?.clone(); + let col_indices: HashMap<_, _> = headers + .iter() + .enumerate() + .map(|(i, name)| (name.to_string(), i)) + .collect(); + eprintln!("col_indices: {:#?}", col_indices); + let pb = if !quiet { - let pb = ProgressBar::new(stats.len() as u64); + let pb = ProgressBar::new_spinner(); pb.set_style( - ProgressStyle::default_bar() - .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} columns") + ProgressStyle::default_spinner() + .template("{spinner:.green} [{elapsed_precise}] Processing record {pos}") .unwrap(), ); Some(pb) @@ -103,189 +145,199 @@ fn process_outliers( None }; - for stat in stats { + let mut record = ByteRecord::new(); + let mut record_count = 0; + while rdr.read_byte_record(&mut record)? { + record_count += 1; if let Some(pb) = &pb { - pb.inc(1); + pb.set_position(record_count); } - let mut outlier_details = Vec::new(); - - match stat.r#type.as_str() { - "Integer" | "Float" => { - // Process numeric outliers using fences - if let (Some(lower_inner), Some(upper_inner), Some(lower_outer), Some(upper_outer)) = ( - stat.lower_inner_fence, - stat.upper_inner_fence, - stat.lower_outer_fence, - stat.upper_outer_fence, - ) { - let col = df.column(&stat.field)?; - let values = col.f64()?; - - values.into_iter().flatten().enumerate().for_each(|(idx, val)| { - let (is_inner, is_outer) = ( - is_outlier(val, lower_inner, upper_inner), - is_outlier(val, lower_outer, upper_outer), - ); - - match (method, is_inner, is_outer) { - (FenceType::Inner, true, _) | - (FenceType::Outer, _, true) | - (FenceType::Both, true, _) => { - outlier_details.push(OutlierDetail { - value: val.to_string(), - reason: format!("Outside {} fences ({:.2}, {:.2})", - if is_outer { "outer" } else { "inner" }, - if is_outer { lower_outer } else { lower_inner }, - if is_outer { upper_outer } else { upper_inner }), - fence_type: if is_outer { FenceType::Outer } else { FenceType::Inner }, - }); - }, - _ => {}, - } - }); - } - }, - "Date" | "DateTime" => { - // Process date outliers using fences (converted to days) - if let (Some(lower_inner), Some(upper_inner), Some(lower_outer), Some(upper_outer)) = ( - stat.lower_inner_fence, - stat.upper_inner_fence, - stat.lower_outer_fence, - stat.upper_outer_fence, - ) { - let col = df.column(&stat.field)?; - if let Ok(dates) = col.datetime() { - dates.into_iter().flatten().enumerate().for_each(|(idx, val)| { - let days = val.timestamp_millis() as f64 / (24.0 * 60.0 * 60.0 * 1000.0); + for (result_idx, stat) in stats.iter().enumerate() { + let col_idx = match col_indices.get(&stat.field) { + Some(idx) => idx, + None => continue, + }; + + // Get the field as a byte slice + let field = record.get(*col_idx).unwrap_or_default(); + + match stat.r#type.as_str() { + "Integer" | "Float" => { + if let ( + Some(lower_inner), + Some(upper_inner), + Some(lower_outer), + Some(upper_outer), + ) = ( + stat.lower_inner_fence, + stat.upper_inner_fence, + stat.lower_outer_fence, + stat.upper_outer_fence, + ) { + // Parse the bytes directly as a float + // if let Ok(val) = str::from_utf8(field) + // .ok() + // .and_then(|s| s.parse::().ok()) + // { + // let (is_inner, is_outer) = ( + // is_outlier(val, lower_inner, upper_inner), + // is_outlier(val, lower_outer, upper_outer), + // ); + if let Some(val) = str::from_utf8(field) + .ok() + .and_then(|s| s.parse::().ok()) + { let (is_inner, is_outer) = ( - is_outlier(days, lower_inner, upper_inner), - is_outlier(days, lower_outer, upper_outer), + is_outlier(val, lower_inner, upper_inner), + is_outlier(val, lower_outer, upper_outer), ); - - match (method, is_inner, is_outer) { - (FenceType::Inner, true, _) | - (FenceType::Outer, _, true) | - (FenceType::Both, true, _) => { - outlier_details.push(OutlierDetail { - value: val.to_string(), - reason: format!("Outside {} fences", - if is_outer { "outer" } else { "inner" }), - fence_type: if is_outer { FenceType::Outer } else { FenceType::Inner }, + + match (method.clone(), is_inner, is_outer) { + (FenceType::Inner, true, _) + | (FenceType::Outer, _, true) + | (FenceType::Both, true, _) => { + results[result_idx].outlier_count += 1; + results[result_idx].outlier_details.push(OutlierDetail { + value: val.to_string(), + reason: format!( + "Outside {} fences ({:.2}, {:.2})", + if is_outer { "outer" } else { "inner" }, + if is_outer { lower_outer } else { lower_inner }, + if is_outer { upper_outer } else { upper_inner } + ), + fence_type: if is_outer { + FenceType::Outer + } else { + FenceType::Inner + }, + record_no: record_count, }); }, _ => {}, } - }); - } - } - }, - "String" => { - // Process string outliers using length statistics - if let (Some(mean_len), Some(stddev)) = (stat.avg_length, stat.stddev) { - let col = df.column(&stat.field)?; - let strings = col.utf8()?; - - strings.into_iter().flatten().enumerate().for_each(|(idx, val)| { - let len = val.len() as f64; - let z_score = (len - mean_len) / stddev; - - if z_score.abs() > 3.0 { - outlier_details.push(OutlierDetail { - value: val.to_string(), - reason: format!("Unusual length: {} (z-score: {:.2})", len, z_score), - fence_type: FenceType::Both, - }); } - }); - } - - // Also check for rare categories using antimode information - if let Some(ref antimode) = stat.antimode { - if !antimode.starts_with("*ALL") { // Skip if all values are unique - let antimodes: Vec<&str> = antimode.split(',').collect(); - let col = df.column(&stat.field)?; - let strings = col.utf8()?; - - strings.into_iter().flatten().enumerate().for_each(|(idx, val)| { - if antimodes.contains(&val) { - outlier_details.push(OutlierDetail { - value: val.to_string(), - reason: "Rare category (antimode)".to_string(), + } + }, + "String" => { + // Convert bytes to string only when needed + if let Ok(val) = str::from_utf8(field) { + // Check string length outliers + if let (Some(mean_len), Some(stddev_len)) = + (stat.avg_length, stat.stddev_length) + { + let len = val.len() as f64; + let z_score = (len - mean_len) / stddev_len; + + if z_score.abs() > 3.0 { + results[result_idx].outlier_count += 1; + results[result_idx].outlier_details.push(OutlierDetail { + value: val.to_string(), + reason: format!( + "Unusual length: {} (z-score: {:.2})", + len, z_score + ), fence_type: FenceType::Both, + record_no: record_count, }); } - }); - } - } - }, - _ => {}, // Skip other types - } + } - if !outlier_details.is_empty() { - results.push(OutlierResult { - column: stat.field.clone(), - data_type: stat.r#type.clone(), - outlier_count: outlier_details.len(), - outlier_details, - }); + // Check rare categories + if let Some(ref antimode) = stat.antimode { + if !antimode.starts_with("*ALL") { + let antimodes: Vec<&str> = antimode.split(',').collect(); + if antimodes.contains(&val) { + results[result_idx].outlier_count += 1; + results[result_idx].outlier_details.push(OutlierDetail { + value: val.to_string(), + reason: "Rare category (antimode)".to_string(), + fence_type: FenceType::Both, + record_no: record_count, + }); + } + } + } + } + }, + _ => {}, + } } } if let Some(pb) = &pb { - pb.finish_with_message("Analysis complete"); + pb.finish_with_message(format!("Processed {} records", record_count)); } + results.retain(|result| result.outlier_count > 0); Ok(results) } pub fn run(argv: &[&str]) -> CliResult<()> { let args: Args = util::get_args(USAGE, argv)?; - + // Get stats records let schema_args = util::SchemaArgs { - flag_enum_threshold: 0, - flag_ignore_case: false, - flag_strict_dates: false, + flag_enum_threshold: 0, + flag_ignore_case: false, + flag_strict_dates: false, flag_pattern_columns: crate::select::SelectColumns::parse("").unwrap(), flag_dates_whitelist: String::new(), - flag_prefer_dmy: false, - flag_force: args.flag_force, - flag_stdout: false, - flag_jobs: None, - flag_no_headers: args.flag_no_headers, - flag_delimiter: args.flag_delimiter.clone(), - arg_input: Some(args.arg_input.clone()), - flag_memcheck: false, + flag_prefer_dmy: false, + flag_force: args.flag_force, + flag_stdout: false, + flag_jobs: None, + flag_no_headers: false, + flag_delimiter: args.flag_delimiter.clone(), + arg_input: args.arg_input.clone(), + flag_memcheck: false, }; - let (csv_fields, csv_stats) = get_stats_records(&schema_args, StatsMode::FrequencyForceStats)?; + let (_csv_fields, csv_stats) = get_stats_records(&schema_args, StatsMode::FrequencyForceStats)?; + + // Read CSV file using Config + let rconfig = Config::new(args.arg_input.as_ref()) + .delimiter(args.flag_delimiter) + .select(args.flag_select); + + let mut rdr = rconfig.reader()?; + + let headers = rdr.byte_headers()?.clone(); + let sel = rconfig.selection(&headers)?; // Read the CSV file - let mut csv_reader = LazyCsvReader::new(&args.arg_input) - .with_has_header(!args.flag_no_headers) - .with_delimiter(args.flag_delimiter.unwrap_or(Delimiter(b',')).0); + // let mut csv_reader = LazyCsvReader::new(&args.arg_input) + // .with_has_header(!args.flag_no_headers) + // .with_delimiter(args.flag_delimiter.unwrap_or(Delimiter(b',')).0); - if args.flag_infer_dates { - csv_reader = csv_reader.with_try_parse_dates(true); - } + // let df = csv_reader.finish()?.collect()?; - let df = csv_reader.finish()?.collect()?; + // Process selected columns + // let selected_stats = if let Some(select) = args.flag_select { + // let selected: Vec = select.split(',').map(String::from).collect(); + // csv_stats + // .into_iter() + // .filter(|stat| selected.contains(&stat.field)) + // .collect() + // } else { + // csv_stats + // }; // Process selected columns - let selected_stats = if let Some(select) = args.flag_select { - let selected: Vec = select.split(',').map(String::from).collect(); - csv_stats - .into_iter() - .filter(|stat| selected.contains(&stat.field)) - .collect() - } else { - csv_stats - }; + // let selected_stats: Vec = csv_stats.into_iter().filter(|(_, stat)| + // sel.contains(&stat.field)).collect(); + + let mut selected_stats: Vec = Vec::new(); + for (idx, stat) in csv_stats.iter().enumerate() { + if sel.contains(&idx) { + selected_stats.push(stat.clone()); + } + } + eprintln!("selected_stats: {:#?}", selected_stats); // Process outliers let method = FenceType::from_str(args.flag_method.as_deref().unwrap_or("outer")); - let results = process_outliers(&df, &selected_stats, method, args.flag_quiet)?; + let results = process_outliers(&mut rdr, &selected_stats, method, args.flag_quiet)?; // Write results let mut wtr: Box = match args.flag_output { @@ -294,25 +346,25 @@ pub fn run(argv: &[&str]) -> CliResult<()> { }; // Write summary - writeln!(wtr, "\nOutlier Analysis Summary:")?; - writeln!(wtr, "=======================")?; - - for result in &results { - writeln!( - wtr, - "\nColumn: {} ({})", - result.column, result.data_type - )?; - writeln!(wtr, "Found {} outliers", result.outlier_count)?; - - if !args.flag_quiet { - writeln!(wtr, "\nOutlier Details:")?; - for detail in &result.outlier_details { - writeln!( - wtr, - " - Value: {:<20} | Reason: {}", - detail.value, detail.reason - )?; + if results.is_empty() { + writeln!(wtr, "No outliers found")?; + } else { + writeln!(wtr, "\nOutlier Analysis Summary:")?; + writeln!(wtr, "=======================")?; + + for result in &results { + writeln!(wtr, "\nColumn: {} ({})", result.column, result.data_type)?; + writeln!(wtr, "Found {} outliers", result.outlier_count)?; + + if !args.flag_quiet { + writeln!(wtr, "\nOutlier Details:")?; + for detail in &result.outlier_details { + writeln!( + wtr, + " - Record #{:<6} | Value: {:<20} | Reason: {}", + detail.record_no, detail.value, detail.reason + )?; + } } } } diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index abe87eaa1..c42613697 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -381,7 +381,7 @@ impl StatsArgs { } } -#[derive(Clone, Serialize, Deserialize, PartialEq, Default)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Default, Debug)] pub struct StatsData { pub field: String, // type is a reserved keyword in Rust @@ -398,6 +398,9 @@ pub struct StatsData { pub max_length: Option, pub sum_length: Option, pub avg_length: Option, + pub stddev_length: Option, + pub variance_length: Option, + pub cv_length: Option, pub mean: Option, pub sem: Option, pub stddev: Option, @@ -448,6 +451,9 @@ const STATSDATA_TYPES_ARRAY: [JsonTypes; MAX_STAT_COLUMNS] = [ JsonTypes::Int, //max_length JsonTypes::Int, //sum_length JsonTypes::Float, //avg_length + JsonTypes::Float, //stddev_length + JsonTypes::Float, //variance_length + JsonTypes::Float, //cv_length JsonTypes::Float, //mean JsonTypes::Float, //sem JsonTypes::Float, //geometric_mean @@ -494,7 +500,7 @@ const MS_IN_DAY_INT: i64 = 86_400_000; const DAY_DECIMAL_PLACES: u32 = 5; // maximum number of output columns -const MAX_STAT_COLUMNS: usize = 39; +const MAX_STAT_COLUMNS: usize = 42; // maximum number of antimodes to display const MAX_ANTIMODES: usize = 10; @@ -1192,6 +1198,9 @@ impl Args { "max_length", "sum_length", "avg_length", + "stddev_length", + "variance_length", + "cv_length", "mean", "sem", "geometric_mean", @@ -1336,6 +1345,7 @@ pub struct Stats { sum_stotlen: u64, minmax: Option, online: Option, + online_len: Option, nullcount: u64, max_precision: u16, modes: Option>>, @@ -1361,8 +1371,16 @@ fn timestamp_ms_to_rfc3339(timestamp: i64, typ: FieldType) -> String { impl Stats { fn new(which: WhichStats) -> Stats { - let (mut sum, mut minmax, mut online, mut modes, mut median, mut quartiles, mut mad) = - (None, None, None, None, None, None, None); + let ( + mut sum, + mut minmax, + mut online, + mut online_len, + mut modes, + mut median, + mut quartiles, + mut mad, + ) = (None, None, None, None, None, None, None, None); if which.sum { sum = Some(TypedSum::default()); } @@ -1371,6 +1389,7 @@ impl Stats { } if which.dist { online = Some(stats::OnlineStats::default()); + online_len = Some(stats::OnlineStats::default()); } if which.mode || which.cardinality { modes = Some(stats::Unsorted::default()); @@ -1390,6 +1409,7 @@ impl Stats { sum_stotlen: 0, minmax, online, + online_len, nullcount: 0, max_precision: 0, modes, @@ -1433,6 +1453,9 @@ impl Stats { match t { TString => { self.is_ascii &= sample.is_ascii(); + if let Some(v) = self.online_len.as_mut() { + v.add(&sample.len()); + } }, TFloat | TInteger => { if sample_type == TNull { @@ -1705,11 +1728,11 @@ impl Stats { // actually append it here - to preserve legacy ordering of columns pieces.extend_from_slice(&minmax_range_sortorder_pieces); - // min/max/sum/avg length + // min/max/sum/avg/stddev/variance/cv length if typ == FieldType::TDate || typ == FieldType::TDateTime { // returning min/max length for dates doesn't make sense // especially since we convert the date stats to rfc3339 format - pieces.extend_from_slice(&[empty(), empty(), empty(), empty()]); + pieces.extend_from_slice(&[empty(), empty(), empty(), empty(), empty(), empty(), empty()]); } else if let Some(mm) = self.minmax.as_ref().and_then(TypedMinMax::len_range) { pieces.extend_from_slice(&[mm.0, mm.1]); // we have a sum_length @@ -1718,20 +1741,37 @@ impl Stats { // so we can compute avg_length pieces.push(itoa::Buffer::new().format(stotlen).to_owned()); #[allow(clippy::cast_precision_loss)] - pieces.push(util::round_num(stotlen as f64 / record_count as f64, 4)); + let avg_len = stotlen as f64 / record_count as f64; + pieces.push(util::round_num(avg_len, round_places)); + + // Add stddev_length/variance_length for strings + if let Some(vl) = self.online_len.as_ref() { + let vlen_stddev = vl.stddev(); + let vlen_variance = vl.variance(); + pieces.push(util::round_num(vlen_stddev, round_places)); + pieces.push(util::round_num(vlen_variance, round_places)); + pieces.push(util::round_num(vlen_stddev / avg_len, round_places)); + } else { + pieces.push(empty()); + pieces.push(empty()); + pieces.push(empty()); + } } else { // however, we saturated the sum, it means we had an overflow - // so we return OVERFLOW_STRING for sum and avg length + // so we return OVERFLOW_STRING for sum,avg,stddev,variance length pieces.extend_from_slice(&[ OVERFLOW_STRING.to_string(), OVERFLOW_STRING.to_string(), + OVERFLOW_STRING.to_string(), + OVERFLOW_STRING.to_string(), + OVERFLOW_STRING.to_string(), ]); } } else { - pieces.extend_from_slice(&[empty(), empty()]); + pieces.extend_from_slice(&[empty(), empty(), empty(), empty(), empty()]); } } else { - pieces.extend_from_slice(&[empty(), empty(), empty(), empty()]); + pieces.extend_from_slice(&[empty(), empty(), empty(), empty(), empty(), empty(), empty()]); } // mean, sem, geometric_mean, harmonic_mean, stddev, variance & cv @@ -1959,6 +1999,7 @@ impl Commute for Stats { self.sum_stotlen = self.sum_stotlen.saturating_add(other.sum_stotlen); self.minmax.merge(other.minmax); self.online.merge(other.online); + self.online_len.merge(other.online_len); self.nullcount += other.nullcount; self.max_precision = std::cmp::max(self.max_precision, other.max_precision); self.modes.merge(other.modes); diff --git a/src/main.rs b/src/main.rs index 3d86d4f53..9ca008952 100644 --- a/src/main.rs +++ b/src/main.rs @@ -380,6 +380,7 @@ enum Command { Lens, #[cfg(all(feature = "luau", feature = "feature_capable"))] Luau, + Outliers, Partition, #[cfg(all(feature = "polars", feature = "feature_capable"))] PivotP, @@ -480,6 +481,7 @@ impl Command { Command::Lens => cmd::lens::run(argv), #[cfg(all(feature = "luau", feature = "feature_capable"))] Command::Luau => cmd::luau::run(argv), + Command::Outliers => cmd::outliers::run(argv), Command::Partition => cmd::partition::run(argv), #[cfg(all(feature = "polars", feature = "feature_capable"))] Command::PivotP => cmd::pivotp::run(argv), diff --git a/tests/test_outliers.rs b/tests/test_outliers.rs new file mode 100644 index 000000000..4f113d2a5 --- /dev/null +++ b/tests/test_outliers.rs @@ -0,0 +1,161 @@ +use crate::workdir::Workdir; + +#[test] +fn test_outliers_basic() { + let wrk = Workdir::new("outliers"); + wrk.create( + "data.csv", + vec![ + svec!["number", "value"], + svec!["1", "10"], + svec!["2", "12"], + svec!["3", "15"], + svec!["4", "100"], // Outlier + svec!["5", "13"], + svec!["6", "11"], + svec!["7", "14"], + ], + ); + + let mut cmd = wrk.command("outliers"); + cmd.arg("data.csv"); + + wrk.assert_success(&mut cmd); + + let got = wrk.output_stderr(&mut cmd); + assert!(got.contains("Found 1 outlier")); + assert!(got.contains("value: 100")); +} + +#[test] +fn test_outliers_multiple_columns() { + let wrk = Workdir::new("outliers_multiple"); + wrk.create( + "data.csv", + vec![ + svec!["temp", "pressure", "humidity"], + svec!["20", "1013", "45"], + svec!["22", "1014", "48"], + svec!["21", "1012", "46"], + svec!["50", "900", "99"], // All outliers + svec!["23", "1015", "47"], + ], + ); + + let mut cmd = wrk.command("outliers"); + cmd.arg("-s").arg("temp,pressure,humidity").arg("data.csv"); + + wrk.assert_success(&mut cmd); + + let got = wrk.output_stderr(&mut cmd); + assert!(got.contains("temp: Found 1 outlier")); + assert!(got.contains("pressure: Found 1 outlier")); + assert!(got.contains("humidity: Found 1 outlier")); +} + +#[test] +fn test_outliers_inner_fence() { + let wrk = Workdir::new("outliers_inner"); + wrk.create( + "data.csv", + vec![ + svec!["value"], + svec!["10"], + svec!["12"], + svec!["15"], + svec!["30"], // Outlier with inner fence + svec!["13"], + svec!["11"], + svec!["14"], + ], + ); + + let mut cmd = wrk.command("outliers"); + cmd.arg("-m").arg("inner").arg("data.csv"); + + wrk.assert_success(&mut cmd); + + let got = wrk.output_stderr(&mut cmd); + assert!(got.contains("Found 1 outlier")); + assert!(got.contains("value: 30")); +} + +#[test] +fn test_outliers_quiet_mode() { + let wrk = Workdir::new("outliers_quiet"); + wrk.create( + "data.csv", + vec![ + svec!["value"], + svec!["10"], + svec!["12"], + svec!["15"], + svec!["100"], // Outlier + svec!["13"], + ], + ); + + let mut cmd = wrk.command("outliers"); + cmd.arg("-q").arg("data.csv"); + + wrk.assert_success(&mut cmd); + + let got = wrk.output_stderr(&mut cmd); + assert!(got.contains("Found 1 outlier")); + assert!(!got.contains("value: 100")); // Detailed output should be suppressed +} + +#[test] +fn test_outliers_string_column() { + let wrk = Workdir::new("outliers_string"); + wrk.create( + "data.csv", + vec![ + svec!["text"], + svec!["normal"], + svec!["typical"], + svec!["regular"], + svec!["very very very very long text"], // Length outlier + svec!["usual"], + ], + ); + + let mut cmd = wrk.command("outliers"); + cmd.arg("data.csv"); + + wrk.assert_success(&mut cmd); + + let got = wrk.output_stderr(&mut cmd); + assert!(got.contains("Found 1 outlier")); + assert!(got.contains("very very very very long text")); +} + +#[test] +fn test_outliers_both_fences() { + let wrk = Workdir::new("outliers_both"); + wrk.create( + "data.csv", + vec![ + svec!["value"], + svec!["10"], + svec!["12"], + svec!["15"], + svec!["30"], // Inner fence outlier + svec!["100"], // Outer fence outlier + svec!["13"], + svec!["11"], + svec!["14"], + ], + ); + + let mut cmd = wrk.command("outliers"); + cmd.arg("-m").arg("both").arg("data.csv"); + + wrk.assert_success(&mut cmd); + + let got = wrk.output_stderr(&mut cmd); + assert!(got.contains("Inner fence outliers:")); + assert!(got.contains("Outer fence outliers:")); + assert!(got.contains("value: 30")); + assert!(got.contains("value: 100")); +} diff --git a/tests/tests.rs b/tests/tests.rs index 522bfc3f0..24740667c 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -91,6 +91,7 @@ mod test_json; mod test_jsonl; #[cfg(feature = "luau")] mod test_luau; +mod test_outliers; #[cfg(any(feature = "feature_capable", feature = "lite"))] mod test_partition; #[cfg(feature = "polars")] From a1884fdddd08a9e735e203b443363b024a3a47e1 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 29 Dec 2024 14:26:37 -0500 Subject: [PATCH 02/12] tests: adjust `stats` tests to account for new addl length stats - `stddev_length`, `variance_length` and `cv_length` --- .../test/boston311-10-boolean-1or0-stats.csv | 70 +++++------ .../test/boston311-10-boolean-tf-stats.csv | 70 +++++------ .../boston311-100-antimodes-len500-stats.csv | 68 +++++----- ...boston311-100-everything-8places-stats.csv | 2 +- ...-everything-date-stats-variance-stddev.csv | 68 +++++----- .../boston311-100-everything-date-stats.csv | 2 +- ...ton311-100-everything-datenotime-stats.csv | 2 +- ...hing-inferdates-defaultwhitelist-stats.csv | 2 +- .../boston311-100-everything-nodate-stats.csv | 2 +- resources/test/boston311-100-stats.csv | 2 +- .../boston311-100-with-nonascii-stats.csv | 2 +- tests/test_index.rs | 23 +++- tests/test_stats.rs | 117 +++++++++++------- 13 files changed, 242 insertions(+), 188 deletions(-) diff --git a/resources/test/boston311-10-boolean-1or0-stats.csv b/resources/test/boston311-10-boolean-1or0-stats.csv index d52d28d4b..f7ff1ee28 100644 --- a/resources/test/boston311-10-boolean-1or0-stats.csv +++ b/resources/test/boston311-10-boolean-1or0-stats.csv @@ -1,35 +1,35 @@ -field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value -case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10, -open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,,,,,,,,0,,0,10, -target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,,,,,,,,4,,0.4,6, -closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,,,,,,,,5,,0.5,6, -ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,,,,,,,,0,,0,2, -case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,,,,,,,,0,,0,2, -case_status_boolean,Boolean,,5,0,1,1,Unsorted,1,1,10,1,0.5,0.1581,0,,0.5,0.25,100,0,,0,2, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,,,,,,,,0,,0,6, -case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,,,,,,,,0,,0,8, -subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,,,,,,,,0,,0,5, -reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,,,,,,,,0,,0,7, -type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,,,,,,,,0,,0,8, -queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,,,,,,,,0,,0,7, -department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,,,,,,,,0,,0,5, -submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1, -closedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1, -location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,,,,,,,,0,,0,10, -fire_district,String,true,, ,9,,Unsorted,1,1,10,1,,,,,,,,0,,0,4, -pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,,,,,,,,0,,0,6, -city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,,,,,,,,0,,0,6, -police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,,,,,,,,0,,0,6, -neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,,,,,,,,0,,0,8, -neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,,,,,,,,0,,0,7, -ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,,,,,,,,0,,0,8, -precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,,,,,,,,0,,0,9, -location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,,,,,,,,1,,0.1,10, -location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,,,,,,,,1,,0.1,8, -latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9, -longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10, -source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,,,,,,,,0,,0,2, -qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,10 -qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,30 -qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,3887 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,1a4c2204a401f6791b6e5efde990955e1b6c59aec5b3de300686fadb63ee457b +field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value +case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,,,,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10, +open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,0,0,0,,,,,,,,0,,0,10, +target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,9.3081,86.64,0.8165,,,,,,,,4,,0.4,6, +closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,9.4412,89.1358,0.9938,,,,,,,,5,,0.5,6, +ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,0.4,0.16,0.0645,,,,,,,,0,,0,2, +case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,1,1,0.2,,,,,,,,0,,0,2, +case_status_boolean,Boolean,,5,0,1,1,Unsorted,1,1,10,1,,,,0.5,0.1581,0,,0.5,0.25,100,0,,0,2, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,34.5543,1194,0.9873,,,,,,,,0,,0,6, +case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,14.1156,199.25,0.6007,,,,,,,,0,,0,8, +subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,2.6552,7.05,0.113,,,,,,,,0,,0,5, +reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,7.9019,62.44,0.4541,,,,,,,,0,,0,7, +type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,11.619,135,0.4841,,,,,,,,0,,0,8, +queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,10.1272,102.56,0.3723,,,,,,,,0,,0,7, +department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,0.4,0.16,0.1053,,,,,,,,0,,0,5, +submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1, +closedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1, +location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,10.4062,108.29,0.3368,,,,,,,,0,,0,10, +fire_district,String,true,, ,9,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,4, +pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,0.3,0.09,0.1579,,,,,,,,0,,0,6, +city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,6, +police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,0.5385,0.29,0.2564,,,,,,,,0,,0,6, +neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,3.2696,10.69,0.3593,,,,,,,,0,,0,8, +neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,0.4899,0.24,0.3499,,,,,,,,0,,0,7, +ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,1.9519,3.81,0.3683,,,,,,,,0,,0,8, +precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,0.9,0.81,0.2432,,,,,,,,0,,0,9, +location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,2.7889,7.7778,0.2324,,,,,,,,1,,0.1,10, +location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,0,0,0,,,,,,,,1,,0.1,8, +latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,,,,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9, +longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,,,,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10, +source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,0.4583,0.21,0.0292,,,,,,,,0,,0,2, +qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,10 +qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,30 +qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,3887 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,71b0f8ff9ddfe2ed63633fd0f29bddaadd1613d73b622b54b3be54c6dea56b0d diff --git a/resources/test/boston311-10-boolean-tf-stats.csv b/resources/test/boston311-10-boolean-tf-stats.csv index f0b085e95..53e034a23 100644 --- a/resources/test/boston311-10-boolean-tf-stats.csv +++ b/resources/test/boston311-10-boolean-tf-stats.csv @@ -1,35 +1,35 @@ -field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value -case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10, -open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,,,,,,,,0,,0,10, -target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,,,,,,,,4,,0.4,6, -closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,,,,,,,,5,,0.5,6, -ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,,,,,,,,0,,0,2, -case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,,,,,,,,0,,0,2, -case_status_boolean,Boolean,true,,False,True,,Unsorted,4,5,45,4.5,,,,,,,,0,,0,2, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,,,,,,,,0,,0,6, -case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,,,,,,,,0,,0,8, -subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,,,,,,,,0,,0,5, -reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,,,,,,,,0,,0,7, -type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,,,,,,,,0,,0,8, -queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,,,,,,,,0,,0,7, -department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,,,,,,,,0,,0,5, -submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1, -closedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1, -location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,,,,,,,,0,,0,10, -fire_district,String,true,, ,9,,Unsorted,1,1,10,1,,,,,,,,0,,0,4, -pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,,,,,,,,0,,0,6, -city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,,,,,,,,0,,0,6, -police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,,,,,,,,0,,0,6, -neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,,,,,,,,0,,0,8, -neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,,,,,,,,0,,0,7, -ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,,,,,,,,0,,0,8, -precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,,,,,,,,0,,0,9, -location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,,,,,,,,1,,0.1,10, -location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,,,,,,,,1,,0.1,8, -latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9, -longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10, -source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,,,,,,,,0,,0,2, -qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,10 -qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,30 -qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,3922 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,dd97ad46b4b34efa66aa634d6c54188eebaf44ef5aaa5dde38180c3435a9ddaa +field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value +case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,,,,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10, +open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,0,0,0,,,,,,,,0,,0,10, +target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,9.3081,86.64,0.8165,,,,,,,,4,,0.4,6, +closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,9.4412,89.1358,0.9938,,,,,,,,5,,0.5,6, +ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,0.4,0.16,0.0645,,,,,,,,0,,0,2, +case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,1,1,0.2,,,,,,,,0,,0,2, +case_status_boolean,Boolean,true,,False,True,,Unsorted,4,5,45,4.5,0.5,0.25,0.1111,,,,,,,,0,,0,2, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,34.5543,1194,0.9873,,,,,,,,0,,0,6, +case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,14.1156,199.25,0.6007,,,,,,,,0,,0,8, +subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,2.6552,7.05,0.113,,,,,,,,0,,0,5, +reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,7.9019,62.44,0.4541,,,,,,,,0,,0,7, +type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,11.619,135,0.4841,,,,,,,,0,,0,8, +queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,10.1272,102.56,0.3723,,,,,,,,0,,0,7, +department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,0.4,0.16,0.1053,,,,,,,,0,,0,5, +submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1, +closedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1, +location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,10.4062,108.29,0.3368,,,,,,,,0,,0,10, +fire_district,String,true,, ,9,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,4, +pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,0.3,0.09,0.1579,,,,,,,,0,,0,6, +city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,6, +police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,0.5385,0.29,0.2564,,,,,,,,0,,0,6, +neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,3.2696,10.69,0.3593,,,,,,,,0,,0,8, +neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,0.4899,0.24,0.3499,,,,,,,,0,,0,7, +ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,1.9519,3.81,0.3683,,,,,,,,0,,0,8, +precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,0.9,0.81,0.2432,,,,,,,,0,,0,9, +location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,2.7889,7.7778,0.2324,,,,,,,,1,,0.1,10, +location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,0,0,0,,,,,,,,1,,0.1,8, +latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,,,,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9, +longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,,,,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10, +source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,0.4583,0.21,0.0292,,,,,,,,0,,0,2, +qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,10 +qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,30 +qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,3922 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,874abe7cd02691b113acc7122097731ef6011f9e8e96dfd63ebbddc6724d19ef diff --git a/resources/test/boston311-100-antimodes-len500-stats.csv b/resources/test/boston311-100-antimodes-len500-stats.csv index d90fcc68c..c6553de46 100644 --- a/resources/test/boston311-100-antimodes-len500-stats.csv +++ b/resources/test/boston311-100-antimodes-len500-stats.csv @@ -1,34 +1,34 @@ -field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value -case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,101004116451.8,790.552,101004116451.8012,101004116451.7994,7905.5202,62497248.9138,0,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, -open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,1900,19,,,,,,,,0,,0,,,,,,,,,,,100,,0,0,*ALL,0,1, -target_dt,String,true,,2022-01-03 10:32:34,2022-05-20 13:03:21,,Unsorted,0,19,1691,16.91,,,,,,,,11,,0.11,,,,,,,,,,,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04 10:42:18,2022-01-04 11:26:45,2022-01-04 11:31:57,2022-01-04 12:13:47,2022-01-04 12:19:43,2022-01-04 15:51:30",34,1, -closed_dt,String,true,,2022-01-01 12:56:14,2022-04-25 14:30:31,,Unsorted,0,19,1615,16.15,,,,,,,,15,,0.15,,,,,,,,,,,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01 19:07:41,2022-01-02 06:43:42,2022-01-02 06:49:07,2022-01-02 08:54:11,2022-01-02 11:03:10,2022-01-02 12:45:49",85,1, -ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,,,,,,,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, -case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,,,,,,,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,,,,,,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved NO OVERFLOW NO ADDITIONAL CART DELV 1/11/22 ,Case Closed. Closed date : 2022-01-02 23:40:14.32 Case Resolved CLEAR ,Case Closed. Closed date : 2022-01-02 23:54:13.31 Case Resolved BOSTON POLICE WERE THERE TO TICKET ,Case Closed. Closed date : 2022-01-02 23:55:44.4 Case Resolved CLEAR ,Case Closed. Closed date : 2022-01-02 23:59:18.367 Case Resolved CLEAR ,Case Closed. Closed date : 2022-01-03 00:...",85,1, -case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,,,,,,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental,Electrical,Graffiti: Ward 8 0803 ,Loud Parties/Music/People,Misc. Snow Complaint,Missed ""Other"" Trash: District 07,Missed Trash/Recycling/Yard Waste/Bulk Item",24,1, -subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,,,,,,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, -reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,,,,,,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturbance,Programs,Sidewalk Cover / Manhole",7,1, -type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,,,,,,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or Policy,General Lighting Request,Knockdown Replacement,Loud Parties/Music/People,Misc. Snow Complaint,New Sign Crosswalk or Pavement Marking,Schedule a Bulk Item Pickup SS",15,1, -queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,,,,,,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Sign Installation,BTDT_Traffic Signal_Graffiti,ISD_Building (INTERNAL),PARK_Maintenance_Region 6,PWDx_Contractor Complaints,PWDx_District 04: Allston/Brighton,PWDx_District 06: West Roxbury and Roslindale,PWDx_District 09: East Boston",15,1, -department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,,,,,,,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, -submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,,,,,,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d0627a05bbcf180c297a39/photo_20220101_091656.jpg,https://311.boston.gov/media/boston/report/photos/61d063e505bbcf180c297b6a/photo_20220101_092319.jpg,https://311.boston.gov/media/boston/report/photos/61d0688a05bbcf180c297be9/report.jpg,https://311.boston.gov/media/boston/report/photos/61d074c005bbcf180c298048/report.jpg,https://311.boston.gov/media/b...",42,1, -closedphoto,NULL,,,,,,,0,0,,,,,,,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, -location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,,,,,,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Ave Hyde Park MA 02136,119 L St South Boston MA 02127,12 Derne St Boston MA 02114,126 Elm St Charlestown MA 02129,1270 Commonwealth Ave Allston MA 02134,130 Shirley St Roxbury MA 02119,131 Arlington St Boston MA 02116",96,1, -fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,,,,,,,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, -pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,,,,,,,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, -city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,,,,,,,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, -police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,,,,,,,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, -neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,,,,,,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, -neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,,,,,,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, -ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,,,,,,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, -precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,,,,,,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, -location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,,,,,,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,1270 Commonwealth Ave,130 Shirley St,131 Arlington St",94,1, -location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,,,,,,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, -latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0.0031,42.3367,42.3367,0.0305,0.0009,0.072,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, -longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0.0031,,,0.0311,0.001,-0.0437,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, -source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,,,,,,,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, -qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 -qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 -qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1551f88101b999f0ca88c62c062668a881513d9b2ee4af8741855501ebcdba0e +field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value +case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,,,,101004116451.8,790.552,101004116451.8012,101004116451.7994,7905.5202,62497248.9138,0,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, +open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,1900,19,0,0,0,,,,,,,,0,,0,,,,,,,,,,,100,,0,0,*ALL,0,1, +target_dt,String,true,,2022-01-03 10:32:34,2022-05-20 13:03:21,,Unsorted,0,19,1691,16.91,5.9449,35.3419,0.3516,,,,,,,,11,,0.11,,,,,,,,,,,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04 10:42:18,2022-01-04 11:26:45,2022-01-04 11:31:57,2022-01-04 12:13:47,2022-01-04 12:19:43,2022-01-04 15:51:30",34,1, +closed_dt,String,true,,2022-01-01 12:56:14,2022-04-25 14:30:31,,Unsorted,0,19,1615,16.15,6.6205,43.8312,0.4099,,,,,,,,15,,0.15,,,,,,,,,,,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01 19:07:41,2022-01-02 06:43:42,2022-01-02 06:49:07,2022-01-02 08:54:11,2022-01-02 11:03:10,2022-01-02 12:45:49",85,1, +ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,0.3756,0.1411,0.0609,,,,,,,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, +case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,0.7141,0.51,0.1253,,,,,,,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,55.0262,3027.8804,0.6618,,,,,,,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved NO OVERFLOW NO ADDITIONAL CART DELV 1/11/22 ,Case Closed. Closed date : 2022-01-02 23:40:14.32 Case Resolved CLEAR ,Case Closed. Closed date : 2022-01-02 23:54:13.31 Case Resolved BOSTON POLICE WERE THERE TO TICKET ,Case Closed. Closed date : 2022-01-02 23:55:44.4 Case Resolved CLEAR ,Case Closed. Closed date : 2022-01-02 23:59:18.367 Case Resolved CLEAR ,Case Closed. Closed date : 2022-01-03 00:...",85,1, +case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,9.452,89.3404,0.3961,,,,,,,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental,Electrical,Graffiti: Ward 8 0803 ,Loud Parties/Music/People,Misc. Snow Complaint,Missed ""Other"" Trash: District 07,Missed Trash/Recycling/Yard Waste/Bulk Item",24,1, +subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,4.9041,24.05,0.1908,,,,,,,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, +reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,8.4056,70.6536,0.4443,,,,,,,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturbance,Programs,Sidewalk Cover / Manhole",7,1, +type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,8.034,64.5444,0.3545,,,,,,,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or Policy,General Lighting Request,Knockdown Replacement,Loud Parties/Music/People,Misc. Snow Complaint,New Sign Crosswalk or Pavement Marking,Schedule a Bulk Item Pickup SS",15,1, +queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,8.0224,64.3596,0.2863,,,,,,,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Sign Installation,BTDT_Traffic Signal_Graffiti,ISD_Building (INTERNAL),PARK_Maintenance_Region 6,PWDx_Contractor Complaints,PWDx_District 04: Allston/Brighton,PWDx_District 06: West Roxbury and Roslindale,PWDx_District 09: East Boston",15,1, +department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,0.2713,0.0736,0.0692,,,,,,,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, +submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,43.2585,1871.2989,1.1907,,,,,,,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d0627a05bbcf180c297a39/photo_20220101_091656.jpg,https://311.boston.gov/media/boston/report/photos/61d063e505bbcf180c297b6a/photo_20220101_092319.jpg,https://311.boston.gov/media/boston/report/photos/61d0688a05bbcf180c297be9/report.jpg,https://311.boston.gov/media/boston/report/photos/61d074c005bbcf180c298048/report.jpg,https://311.boston.gov/media/b...",42,1, +closedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, +location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,9.6247,92.6356,0.2444,,,,,,,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Ave Hyde Park MA 02136,119 L St South Boston MA 02127,12 Derne St Boston MA 02114,126 Elm St Charlestown MA 02129,1270 Commonwealth Ave Allston MA 02134,130 Shirley St Roxbury MA 02119,131 Arlington St Boston MA 02116",96,1, +fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,0.3363,0.1131,0.2976,,,,,,,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, +pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,0.3192,0.1019,0.1527,,,,,,,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, +city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,0,0,0,,,,,,,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, +police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,0.444,0.1971,0.1991,,,,,,,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, +neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,9.8671,97.3604,0.664,,,,,,,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, +neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,0.4877,0.2379,0.3509,,,,,,,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, +ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,2.2293,4.9699,0.4468,,,,,,,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, +precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,0.4951,0.2451,0.126,,,,,,,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, +location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,9.3995,88.3508,0.5222,,,,,,,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,1270 Commonwealth Ave,130 Shirley St,131 Arlington St",94,1, +location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,1.8405,3.3874,0.4435,,,,,,,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, +latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,,,,42.3367,0.0031,42.3367,42.3367,0.0305,0.0009,0.072,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, +longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,,,,-71.0727,0.0031,,,0.0311,0.001,-0.0437,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, +source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,2.3473,5.5099,0.1303,,,,,,,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, +qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 +qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 +qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5c687cf954ac8a31ab901cb8a61b0d0bb2f4109df893511259e3ef8e33847e63 diff --git a/resources/test/boston311-100-everything-8places-stats.csv b/resources/test/boston311-100-everything-8places-stats.csv index b886a6471..2b92fd146 100644 --- a/resources/test/boston311-100-everything-8places-stats.csv +++ b/resources/test/boston311-100-everything-8places-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,bc8660240b24f489683c31066951cf6ccd248c0d688589a42963395372e03d43 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,d05e1be5fa05c794ff47fa4ac9aa43a98845f1ca2bf3997e9a314c01ccfc4ee5 diff --git a/resources/test/boston311-100-everything-date-stats-variance-stddev.csv b/resources/test/boston311-100-everything-date-stats-variance-stddev.csv index 969a054af..c69643d0a 100644 --- a/resources/test/boston311-100-everything-date-stats-variance-stddev.csv +++ b/resources/test/boston311-100-everything-date-stats-variance-stddev.csv @@ -1,34 +1,34 @@ -field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value -case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,101004116451.8,790.552,101004116451.8012,101004116451.7994,7905.5202,62497248.9138,0,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, -open_dt,DateTime,,,2022-01-01T00:16:00+00:00,2022-01-31T11:46:00+00:00,30.47917,Unsorted,,,,,2022-01-04T07:07:45.050+00:00,0.5568,18996.29623,18996.29542,5.568,31.00259,0.0293,0,,0,0.76261,2021-12-27T14:16:49+00:00,2021-12-30T06:00:07+00:00,2022-01-01T21:43:25+00:00,2022-01-03T07:02:14+00:00,2022-01-03T16:12:17+00:00,1.77005,2022-01-06T07:55:35+00:00,2022-01-08T23:38:53+00:00,-0.5684,100,,0,0,*ALL,0,1, -target_dt,DateTime,,,2022-01-03T10:32:34+00:00,2022-05-20T13:03:21+00:00,137.10471,Unsorted,,,,,2022-01-17T03:14:16.404+00:00,2.86258,19009.11578,19009.0967,27.00551,729.29774,0.1421,11,,0.11,1,2021-11-26T08:30:00+00:00,2021-12-15T20:30:00+00:00,2022-01-04T08:30:00+00:00,2022-01-05T08:30:00+00:00,2022-01-17T08:30:00+00:00,13,2022-02-05T20:30:00+00:00,2022-02-25T08:30:00+00:00,0.8462,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04...",34,1, -closed_dt,DateTime,,,2022-01-01T12:56:14+00:00,2022-04-25T14:30:31+00:00,114.06547,Unsorted,,,,,2022-01-08T01:10:44.411+00:00,1.71655,19000.04255,19000.036,15.82577,250.4549,0.0833,15,,0.15,0.77213,2021-12-29T15:13:29+00:00,2021-12-31T19:50:08.750+00:00,2022-01-03T00:26:48.500+00:00,2022-01-03T12:15:23+00:00,2022-01-04T11:31:15+00:00,1.46142,2022-01-06T16:07:54.750+00:00,2022-01-08T20:44:34.500+00:00,0.3266,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01...",85,1, -ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,,,,,,,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, -case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,,,,,,,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,,,,,,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved ...",85,1, -case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,,,,,,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental...",24,1, -subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,,,,,,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, -reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,,,,,,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturban...",7,1, -type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,,,,,,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or...",15,1, -queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,,,,,,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Si...",15,1, -department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,,,,,,,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, -submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,,,,,,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,http...",42,1, -closedphoto,NULL,,,,,,,0,0,,,,,,,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, -location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,,,,,,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Av...",96,1, -fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,,,,,,,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, -pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,,,,,,,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, -city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,,,,,,,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, -police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,,,,,,,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, -neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,,,,,,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, -neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,,,,,,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, -ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,,,,,,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, -precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,,,,,,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, -location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,,,,,,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,127...",94,1, -location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,,,,,,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, -latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0.0031,42.3367,42.3367,0.0305,0.0009,0.072,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, -longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0.0031,,,0.0311,0.001,-0.0437,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, -source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,,,,,,,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, -qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 -qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 -qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ebf67188b7f1cf11f3d943b7f97e32d80dba08a12f26d4eb76da2088460cf29d +field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value +case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,,,,101004116451.8,790.552,101004116451.8012,101004116451.7994,7905.5202,62497248.9138,0,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, +open_dt,DateTime,,,2022-01-01T00:16:00+00:00,2022-01-31T11:46:00+00:00,30.47917,Unsorted,,,,,,,,2022-01-04T07:07:45.050+00:00,0.5568,18996.29623,18996.29542,5.568,31.00259,0.0293,0,,0,0.76261,2021-12-27T14:16:49+00:00,2021-12-30T06:00:07+00:00,2022-01-01T21:43:25+00:00,2022-01-03T07:02:14+00:00,2022-01-03T16:12:17+00:00,1.77005,2022-01-06T07:55:35+00:00,2022-01-08T23:38:53+00:00,-0.5684,100,,0,0,*ALL,0,1, +target_dt,DateTime,,,2022-01-03T10:32:34+00:00,2022-05-20T13:03:21+00:00,137.10471,Unsorted,,,,,,,,2022-01-17T03:14:16.404+00:00,2.86258,19009.11578,19009.0967,27.00551,729.29774,0.1421,11,,0.11,1,2021-11-26T08:30:00+00:00,2021-12-15T20:30:00+00:00,2022-01-04T08:30:00+00:00,2022-01-05T08:30:00+00:00,2022-01-17T08:30:00+00:00,13,2022-02-05T20:30:00+00:00,2022-02-25T08:30:00+00:00,0.8462,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04...",34,1, +closed_dt,DateTime,,,2022-01-01T12:56:14+00:00,2022-04-25T14:30:31+00:00,114.06547,Unsorted,,,,,,,,2022-01-08T01:10:44.411+00:00,1.71655,19000.04255,19000.036,15.82577,250.4549,0.0833,15,,0.15,0.77213,2021-12-29T15:13:29+00:00,2021-12-31T19:50:08.750+00:00,2022-01-03T00:26:48.500+00:00,2022-01-03T12:15:23+00:00,2022-01-04T11:31:15+00:00,1.46142,2022-01-06T16:07:54.750+00:00,2022-01-08T20:44:34.500+00:00,0.3266,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01...",85,1, +ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,0.3756,0.1411,0.0609,,,,,,,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, +case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,0.7141,0.51,0.1253,,,,,,,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,55.0262,3027.8804,0.6618,,,,,,,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved ...",85,1, +case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,9.452,89.3404,0.3961,,,,,,,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental...",24,1, +subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,4.9041,24.05,0.1908,,,,,,,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, +reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,8.4056,70.6536,0.4443,,,,,,,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturban...",7,1, +type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,8.034,64.5444,0.3545,,,,,,,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or...",15,1, +queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,8.0224,64.3596,0.2863,,,,,,,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Si...",15,1, +department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,0.2713,0.0736,0.0692,,,,,,,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, +submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,43.2585,1871.2989,1.1907,,,,,,,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,http...",42,1, +closedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, +location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,9.6247,92.6356,0.2444,,,,,,,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Av...",96,1, +fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,0.3363,0.1131,0.2976,,,,,,,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, +pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,0.3192,0.1019,0.1527,,,,,,,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, +city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,0,0,0,,,,,,,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, +police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,0.444,0.1971,0.1991,,,,,,,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, +neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,9.8671,97.3604,0.664,,,,,,,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, +neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,0.4877,0.2379,0.3509,,,,,,,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, +ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,2.2293,4.9699,0.4468,,,,,,,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, +precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,0.4951,0.2451,0.126,,,,,,,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, +location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,9.3995,88.3508,0.5222,,,,,,,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,127...",94,1, +location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,1.8405,3.3874,0.4435,,,,,,,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, +latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,,,,42.3367,0.0031,42.3367,42.3367,0.0305,0.0009,0.072,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, +longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,,,,-71.0727,0.0031,,,0.0311,0.001,-0.0437,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, +source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,2.3473,5.5099,0.1303,,,,,,,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, +qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 +qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 +qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,a668d977d70c381c6edc4d7365032b09225e3f2605848237a49dd8cb149feef4 diff --git a/resources/test/boston311-100-everything-date-stats.csv b/resources/test/boston311-100-everything-date-stats.csv index 85e35cd78..35e5d930f 100644 --- a/resources/test/boston311-100-everything-date-stats.csv +++ b/resources/test/boston311-100-everything-date-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ebf67188b7f1cf11f3d943b7f97e32d80dba08a12f26d4eb76da2088460cf29d +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,a668d977d70c381c6edc4d7365032b09225e3f2605848237a49dd8cb149feef4 diff --git a/resources/test/boston311-100-everything-datenotime-stats.csv b/resources/test/boston311-100-everything-datenotime-stats.csv index cb4411fe8..917bfa6f7 100644 --- a/resources/test/boston311-100-everything-datenotime-stats.csv +++ b/resources/test/boston311-100-everything-datenotime-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,45236 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4142a9338de2e31210d6673e0c0a3d2533895823fecc36700c1ee540ee637b46 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,fc0bb0988a0c91f373ae403b143349025dec0fef26fbc6a1fe6940c0ae4fcb47 diff --git a/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv b/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv index 8a9ebf39f..e9dfad15d 100644 --- a/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv +++ b/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e7537bd732c9a6eb2d1c34855ef7b803629d03f6316a125e89f377f74e9e7fd7 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15f3cea14ecb4f5a1a8078c687d03a2856eab33e22fe22dad46ceb92f868dff6 diff --git a/resources/test/boston311-100-everything-nodate-stats.csv b/resources/test/boston311-100-everything-nodate-stats.csv index 4678776d5..068b1fe21 100644 --- a/resources/test/boston311-100-everything-nodate-stats.csv +++ b/resources/test/boston311-100-everything-nodate-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1551f88101b999f0ca88c62c062668a881513d9b2ee4af8741855501ebcdba0e +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5c687cf954ac8a31ab901cb8a61b0d0bb2f4109df893511259e3ef8e33847e63 diff --git a/resources/test/boston311-100-stats.csv b/resources/test/boston311-100-stats.csv index c9b80a70c..e26402d5e 100644 --- a/resources/test/boston311-100-stats.csv +++ b/resources/test/boston311-100-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,47702 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,ebf67188b7f1cf11f3d943b7f97e32d80dba08a12f26d4eb76da2088460cf29d +qsv__fingerprint_hash,,,,,,,,,,,,,,,,a668d977d70c381c6edc4d7365032b09225e3f2605848237a49dd8cb149feef4 diff --git a/resources/test/boston311-100-with-nonascii-stats.csv b/resources/test/boston311-100-with-nonascii-stats.csv index 28aa0469f..3cc616c01 100644 --- a/resources/test/boston311-100-with-nonascii-stats.csv +++ b/resources/test/boston311-100-with-nonascii-stats.csv @@ -31,4 +31,4 @@ source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01 qsv__rowcount,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,29 qsv__filesize_bytes,,,,,,,,,,,,,,,,47704 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,f476d634e6251c59b71ed07431e30fb654b06da7095629657fa6fed28ea7adcf +qsv__fingerprint_hash,,,,,,,,,,,,,,,,a3bba3bb353ae38ace256314690467197a01916a3d73b2432118da3c038e8a79 diff --git a/tests/test_index.rs b/tests/test_index.rs index 416a69473..e1f85150f 100644 --- a/tests/test_index.rs +++ b/tests/test_index.rs @@ -77,6 +77,9 @@ fn index_outdated_stats() { "max_length", "sum_length", "avg_length", + "stddev_length", + "variance_length", + "cv_length", "mean", "sem", "geometric_mean", @@ -102,6 +105,9 @@ fn index_outdated_stats() { "1", "3", "1", + "0", + "0", + "0", "", "", "", @@ -127,6 +133,9 @@ fn index_outdated_stats() { "1", "3", "1", + "", + "", + "", "2", "0.4714", "1.8171", @@ -162,6 +171,9 @@ fn index_outdated_stats() { "", "", "", + "", + "", + "", "3" ], svec![ @@ -187,6 +199,9 @@ fn index_outdated_stats() { "", "", "", + "", + "", + "", "2" ], svec![ @@ -212,6 +227,9 @@ fn index_outdated_stats() { "", "", "", + "", + "", + "", "26" ], svec![ @@ -237,7 +255,10 @@ fn index_outdated_stats() { "", "", "", - "09b55353162931d7a4617e04939bee06546049eae0b4b5969021ef02572a2193" + "", + "", + "", + "ed38d8c4f2747d82eec243d750fd8448ae4b5ebfb2c85c2323085e3e75b64b6a" ], ]; diff --git a/tests/test_stats.rs b/tests/test_stats.rs index fd4a26cdb..f4d28eecd 100644 --- a/tests/test_stats.rs +++ b/tests/test_stats.rs @@ -629,9 +629,10 @@ fn stats_prefer_dmy() { wrk.create("in2.csv", got); - // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values + // removed stddev_length, variance_length, cv_length, variance, geometric_mean, harmonic_mean, + // stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + cmd.arg("!/stddev_length|variance_length|cv_length|variance|geometric_mean|harmonic_mean|stddev|sem|cv/") .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); @@ -655,9 +656,10 @@ fn stats_prefer_mdy() { wrk.create("in2.csv", got); - // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values + // removed stddev_length, variance_length, cv_length, variance, geometric_mean, harmonic_mean, + // stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + cmd.arg("!/stddev_length|variance_length|cv_length|variance|geometric_mean|harmonic_mean|stddev|sem|cv/") .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); @@ -681,9 +683,10 @@ fn stats_rounding() { wrk.create("in2.csv", got); - // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values + // removed stddev_length, variance_length, cv_length, variance, geometric_mean, harmonic_mean, + // stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + cmd.arg("!/stddev_length|variance_length|cv_length|variance|geometric_mean|harmonic_mean|stddev|sem|cv/") .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); @@ -725,9 +728,10 @@ fn stats_no_date_inference() { wrk.create("in2.csv", got); - // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values + // removed stddev_length, variance_length, cv_length, variance, geometric_mean, harmonic_mean, + // stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + cmd.arg("!/stddev_length|variance_length|cv_length|variance|geometric_mean|harmonic_mean|stddev|sem|cv/") .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); @@ -752,9 +756,10 @@ fn stats_with_date_inference() { wrk.create("in2.csv", got); - // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values + // removed stddev_length, variance_length, cv_length, variance, geometric_mean, harmonic_mean, + // stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + cmd.arg("!/stddev_length|variance_length|cv_length|variance|geometric_mean|harmonic_mean|stddev|sem|cv/") .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); @@ -775,9 +780,10 @@ fn stats_with_date_inference_default_whitelist() { wrk.create("in2.csv", got); - // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values + // removed stddev_length, variance_length, cv_length, variance, geometric_mean, harmonic_mean, + // stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + cmd.arg("!/stddev_length|variance_length|cv_length|variance|geometric_mean|harmonic_mean|stddev|sem|cv/") .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); @@ -826,9 +832,10 @@ fn stats_with_date_type() { wrk.create("in2.csv", got); - // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values + // removed stddev_length, variance_length, cv_length, variance, geometric_mean, harmonic_mean, + // stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + cmd.arg("!/stddev_length|variance_length|cv_length|variance|geometric_mean|harmonic_mean|stddev|sem|cv/") .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); @@ -1141,9 +1148,10 @@ fn stats_is_ascii() { wrk.create("in2.csv", got); - // removed variance, stddev, sem & cv columns as its causing flaky CI test for float values + // removed stddev_length, variance_length, cv_length, variance, geometric_mean, harmonic_mean, + // stddev, sem & cv columns as its causing flaky CI test for float values let mut cmd = wrk.command("select"); - cmd.arg("!/variance|geometric_mean|harmonic_mean|stddev|sem|cv/") + cmd.arg("!/stddev_length|variance_length|cv_length|variance|geometric_mean|harmonic_mean|stddev|sem|cv/") .arg("in2.csv"); let got2: String = wrk.stdout(&mut cmd); @@ -1241,6 +1249,9 @@ fn stats_zero_cv() { "max_length", "sum_length", "avg_length", + "stddev_length", + "variance_length", + "cv_length", "mean", "sem", "geometric_mean", @@ -1266,6 +1277,9 @@ fn stats_zero_cv() { "1", "5", "1", + "", + "", + "", "3", "0.6325", "2.6052", @@ -1291,6 +1305,9 @@ fn stats_zero_cv() { "3", "9", "1.8", + "", + "", + "", "0", "3.1623", "0", @@ -1316,6 +1333,9 @@ fn stats_zero_cv() { "6", "25", "5", + "", + "", + "", "0", "28.8472", "0", @@ -1330,7 +1350,8 @@ fn stats_zero_cv() { ], svec![ "col4", "Integer", "", "935", "-900", "1000", "1900", "Unsorted", "1", "4", "14", - "2.8", "187", "304.3603", "0", "", "680.5703", "463176", "363.9414", "0", "", "0", "" + "2.8", "", "", "", "187", "304.3603", "0", "", "680.5703", "463176", "363.9414", "0", + "", "0", "" ], svec![ "qsv__rowcount", @@ -1355,6 +1376,9 @@ fn stats_zero_cv() { "", "", "", + "", + "", + "", "5" ], svec![ @@ -1380,6 +1404,9 @@ fn stats_zero_cv() { "", "", "", + "", + "", + "", "4" ], svec![ @@ -1405,6 +1432,9 @@ fn stats_zero_cv() { "", "", "", + "", + "", + "", "93" ], svec![ @@ -1430,7 +1460,10 @@ fn stats_zero_cv() { "", "", "", - "228f039bafd53f7562c1418b74114a3a03f9c64e7be4c6965e67f2e7a3938267" + "", + "", + "", + "bfe0b46361bf7532d2ea6fcdc1e2c25c07ad06d15ba448271d5aeea377d2d506" ], ]; assert_eq!(got, expected); @@ -1460,14 +1493,14 @@ fn stats_output_tab_delimited() { wrk.assert_success(&mut cmd); let got = std::fs::read_to_string(out_file).unwrap(); - let expected = r#"field type is_ascii sum min max range sort_order min_length max_length sum_length avg_length mean sem geometric_mean harmonic_mean stddev variance cv nullcount max_precision sparsity qsv__value -col1 Integer 15 1 5 4 Ascending 1 1 5 1 3 0.6325 2.6052 2.1898 1.4142 2 47.1405 0 0 -col2 Integer 10644 0 4321 4321 Descending 1 4 17 3.4 2128.8 685.6979 0 1533.267 2350907.76 72.0249 0 0 -col3 String true 01 10 Ascending 2 2 10 2 0 0 -qsv__rowcount 5 -qsv__columncount 3 -qsv__filesize_bytes 62 -qsv__fingerprint_hash a61c70d1eda11fb60d4300481c11610493487aa22654a22f637147aede3c8c0c + let expected = r#"field type is_ascii sum min max range sort_order min_length max_length sum_length avg_length stddev_length variance_length cv_length mean sem geometric_mean harmonic_mean stddev variance cv nullcount max_precision sparsity qsv__value +col1 Integer 15 1 5 4 Ascending 1 1 5 1 3 0.6325 2.6052 2.1898 1.4142 2 47.1405 0 0 +col2 Integer 10644 0 4321 4321 Descending 1 4 17 3.4 2128.8 685.6979 0 1533.267 2350907.76 72.0249 0 0 +col3 String true 01 10 Ascending 2 2 10 2 0 0 0 0 0 +qsv__rowcount 5 +qsv__columncount 3 +qsv__filesize_bytes 62 +qsv__fingerprint_hash 14a30758a66a00ca7f90b3b763d16a4195fa7b7427f5ce5afb32ff87aece8d0c "#; assert_eq!(got, expected); } @@ -1496,14 +1529,14 @@ fn stats_output_ssv_delimited() { wrk.assert_success(&mut cmd); let got = std::fs::read_to_string(out_file).unwrap(); - let expected = r#"field;type;is_ascii;sum;min;max;range;sort_order;min_length;max_length;sum_length;avg_length;mean;sem;geometric_mean;harmonic_mean;stddev;variance;cv;nullcount;max_precision;sparsity;qsv__value -col1;Integer;;15;1;5;4;Ascending;1;1;5;1;3;0.6325;2.6052;2.1898;1.4142;2;47.1405;0;;0; -col2;Integer;;10644;0;4321;4321;Descending;1;4;17;3.4;2128.8;685.6979;0;;1533.267;2350907.76;72.0249;0;;0; -col3;String;true;;01;10;;Ascending;2;2;10;2;;;;;;;;0;;0; -qsv__rowcount;;;;;;;;;;;;;;;;;;;;;;5 -qsv__columncount;;;;;;;;;;;;;;;;;;;;;;3 -qsv__filesize_bytes;;;;;;;;;;;;;;;;;;;;;;62 -qsv__fingerprint_hash;;;;;;;;;;;;;;;;;;;;;;a61c70d1eda11fb60d4300481c11610493487aa22654a22f637147aede3c8c0c + let expected = r#"field;type;is_ascii;sum;min;max;range;sort_order;min_length;max_length;sum_length;avg_length;stddev_length;variance_length;cv_length;mean;sem;geometric_mean;harmonic_mean;stddev;variance;cv;nullcount;max_precision;sparsity;qsv__value +col1;Integer;;15;1;5;4;Ascending;1;1;5;1;;;;3;0.6325;2.6052;2.1898;1.4142;2;47.1405;0;;0; +col2;Integer;;10644;0;4321;4321;Descending;1;4;17;3.4;;;;2128.8;685.6979;0;;1533.267;2350907.76;72.0249;0;;0; +col3;String;true;;01;10;;Ascending;2;2;10;2;0;0;0;;;;;;;;0;;0; +qsv__rowcount;;;;;;;;;;;;;;;;;;;;;;;;;5 +qsv__columncount;;;;;;;;;;;;;;;;;;;;;;;;;3 +qsv__filesize_bytes;;;;;;;;;;;;;;;;;;;;;;;;;62 +qsv__fingerprint_hash;;;;;;;;;;;;;;;;;;;;;;;;;14a30758a66a00ca7f90b3b763d16a4195fa7b7427f5ce5afb32ff87aece8d0c "#; assert_eq!(got, expected); } @@ -1535,14 +1568,14 @@ fn stats_output_csvsz_delimited() { cmd.arg("decompress").arg(out_file.clone()); let got: String = wrk.stdout(&mut cmd); - let expected = r#"field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,qsv__value -col1,Integer,,15,1,5,4,Ascending,1,1,5,1,3,0.6325,2.6052,2.1898,1.4142,2,47.1405,0,,0, -col2,Integer,,10644,0,4321,4321,Descending,1,4,17,3.4,2128.8,685.6979,0,,1533.267,2350907.76,72.0249,0,,0, -col3,String,true,,01,10,,Ascending,2,2,10,2,,,,,,,,0,,0, -qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,5 -qsv__columncount,,,,,,,,,,,,,,,,,,,,,,3 -qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,62 -qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,a61c70d1eda11fb60d4300481c11610493487aa22654a22f637147aede3c8c0c"#; + let expected = r#"field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,qsv__value +col1,Integer,,15,1,5,4,Ascending,1,1,5,1,,,,3,0.6325,2.6052,2.1898,1.4142,2,47.1405,0,,0, +col2,Integer,,10644,0,4321,4321,Descending,1,4,17,3.4,,,,2128.8,685.6979,0,,1533.267,2350907.76,72.0249,0,,0, +col3,String,true,,01,10,,Ascending,2,2,10,2,0,0,0,,,,,,,,0,,0, +qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,5 +qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,3 +qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,62 +qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,14a30758a66a00ca7f90b3b763d16a4195fa7b7427f5ce5afb32ff87aece8d0c"#; assert_eq!(got, expected); } From 8cfe74385e00439884d307227beeb03e5b701b21 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 29 Dec 2024 14:27:44 -0500 Subject: [PATCH 03/12] feat: `stats` addl length stats - stdev_length, variance_length, cv_length --- src/cmd/stats.rs | 53 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index c42613697..f2bb81d5e 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -5,17 +5,18 @@ Compute summary statistics & infers data types for each column in a CSV. UTF-8 encoded. If you encounter problems generating stats, use `qsv validate` to confirm the input CSV is valid. -Summary statistics includes sum, min/max/range, sort order, min/max/sum/avg length, mean, -standard error of the mean (SEM), geometric mean, harmonic mean, stddev, variance, coefficient +Summary statistics includes sum, min/max/range, sort order, min/max/sum/avg/stddev/variance/cv length, +mean, standard error of the mean (SEM), geometric mean, harmonic mean, stddev, variance, coefficient of variation (CV), nullcount, max_precision, sparsity, Median Absolute Deviation (MAD), quartiles, interquartile range (IQR), lower/upper fences, skewness, median, cardinality, mode/s & "antimode/s". Note that some stats require loading the entire file into memory, so they must be enabled explicitly. By default, the following "streaming" statistics are reported for *every* column: -sum, min/max/range values, sort order, min/max/sum/avg length, mean, sem, geometric_mean, harmonic_mean, -stddev, variance, cv, nullcount, max_precision & sparsity. The default set of statistics corresponds to -ones that can be computed efficiently on a stream of data (i.e., constant memory) and works with -arbitrarily large CSVs. + sum, min/max/range values, sort order, min/max/sum/avg/stddev/variance/cv length, mean, sem, + geometric_mean, harmonic_mean,stddev, variance, cv, nullcount, max_precision & sparsity. + +The default set of statistics corresponds to ones that can be computed efficiently on a stream of data +(i.e., constant memory) and works with arbitrarily large CSVs. The following additional "non-streaming" statistics require loading the entire file into memory: cardinality, modes/antimodes, median, MAD, quartiles and its related measures (q1, q2, q3, IQR, @@ -400,7 +401,7 @@ pub struct StatsData { pub avg_length: Option, pub stddev_length: Option, pub variance_length: Option, - pub cv_length: Option, + pub cv_length: Option, pub mean: Option, pub sem: Option, pub stddev: Option, @@ -502,6 +503,9 @@ const DAY_DECIMAL_PLACES: u32 = 5; // maximum number of output columns const MAX_STAT_COLUMNS: usize = 42; +// the first N columns are fingerprint hash columns +const FINGERPRINT_HASH_COLUMNS: usize = 25; + // maximum number of antimodes to display const MAX_ANTIMODES: usize = 10; // default length of antimode string before truncating and appending "..." @@ -793,7 +797,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // vec we use to compute dataset-level fingerprint hash let mut stats_br_vec: Vec = Vec::with_capacity(stats_sr_vec.len()); - let stats_headers_sr = args.stat_headers(); + let stats_headers_sr = args.stats_headers(); wtr.write_record(&stats_headers_sr)?; let fields = headers.iter().zip(stats_sr_vec); for (i, (header, stat)) in fields.enumerate() { @@ -842,13 +846,13 @@ pub fn run(argv: &[&str]) -> CliResult<()> { // Compute hash of stats for data fingerprinting let stats_hash = { - // the first 22 stats columns are used for the fingerprint hash - let mut hash_input = Vec::with_capacity(22); + // the first FINGERPRINT_HASH_COLUMNS are used for the fingerprint hash + let mut hash_input = Vec::with_capacity(FINGERPRINT_HASH_COLUMNS); // First, create a stable representation of the stats for record in &stats_br_vec { - // Take first 22 columns only - for field in record.iter().take(22) { + // Take FINGERPRINT_HASH_COLUMNS columns only + for field in record.iter().take(FINGERPRINT_HASH_COLUMNS) { let s = String::from_utf8_lossy(field); // Standardize number format if let Ok(f) = s.parse::() { @@ -1175,7 +1179,7 @@ impl Args { stats } - pub fn stat_headers(&self) -> csv::StringRecord { + pub fn stats_headers(&self) -> csv::StringRecord { if self.flag_typesonly { return csv::StringRecord::from(vec!["field", "type"]); } @@ -1184,7 +1188,8 @@ impl Args { let mut fields = Vec::with_capacity(MAX_STAT_COLUMNS); // these are the standard stats columns that are always output - // the "streaming" stats + // the "streaming" stats that are always included in stats output + // aka the 25 FINGERPINT_HASH_COLUMNS fields.extend_from_slice(&[ "field", "type", @@ -1732,7 +1737,15 @@ impl Stats { if typ == FieldType::TDate || typ == FieldType::TDateTime { // returning min/max length for dates doesn't make sense // especially since we convert the date stats to rfc3339 format - pieces.extend_from_slice(&[empty(), empty(), empty(), empty(), empty(), empty(), empty()]); + pieces.extend_from_slice(&[ + empty(), + empty(), + empty(), + empty(), + empty(), + empty(), + empty(), + ]); } else if let Some(mm) = self.minmax.as_ref().and_then(TypedMinMax::len_range) { pieces.extend_from_slice(&[mm.0, mm.1]); // we have a sum_length @@ -1771,7 +1784,15 @@ impl Stats { pieces.extend_from_slice(&[empty(), empty(), empty(), empty(), empty()]); } } else { - pieces.extend_from_slice(&[empty(), empty(), empty(), empty(), empty(), empty(), empty()]); + pieces.extend_from_slice(&[ + empty(), + empty(), + empty(), + empty(), + empty(), + empty(), + empty(), + ]); } // mean, sem, geometric_mean, harmonic_mean, stddev, variance & cv From 8e3617a7c6d1925c4d23a8ea6e50d6cacedfffcc Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Sun, 29 Dec 2024 14:31:17 -0500 Subject: [PATCH 04/12] fix: typo --- src/cmd/stats.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index f2bb81d5e..a6851e727 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -1189,7 +1189,7 @@ impl Args { // these are the standard stats columns that are always output // the "streaming" stats that are always included in stats output - // aka the 25 FINGERPINT_HASH_COLUMNS + // aka the 25 FINGERPRINT_HASH_COLUMNS fields.extend_from_slice(&[ "field", "type", From 661fd058351510aaebe842b9607aad55cfd8583c Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 30 Dec 2024 13:03:48 -0500 Subject: [PATCH 05/12] refactor: `stats` - change default modes separator to "|" from ";"; disable max anitmodes len; --- src/cmd/stats.rs | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index a6851e727..38a57744c 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -140,9 +140,11 @@ stats options: and the 2 values' first characters are 0/1, t/f & y/n case-insensitive, the data type is inferred as boolean. --mode Compute the mode/s & antimode/s. Multimodal-aware. - This requires loading all CSV data in memory. + This requires loading CSV data in memory proportionate to the + cardinality of each column. --cardinality Compute the cardinality. - This requires loading all CSV data in memory. + This requires loading CSV data in memory proportionate to the + number of unique values in each column. --median Compute the median. This requires loading all CSV data in memory. --mad Compute the median absolute deviation (MAD). @@ -487,6 +489,7 @@ const STATSDATA_TYPES_ARRAY: [JsonTypes; MAX_STAT_COLUMNS] = [ static INFER_DATE_FLAGS: OnceLock> = OnceLock::new(); static RECORD_COUNT: OnceLock = OnceLock::new(); static ANTIMODES_LEN: OnceLock = OnceLock::new(); +static ANTIMODES_SEPARATOR: OnceLock = OnceLock::new(); // standard overflow and underflow strings // for sum, sum_length and avg_length @@ -510,7 +513,7 @@ const FINGERPRINT_HASH_COLUMNS: usize = 25; const MAX_ANTIMODES: usize = 10; // default length of antimode string before truncating and appending "..." const DEFAULT_ANTIMODES_LEN: usize = 100; -const MAX_ANTIMODES_LEN: usize = 5192; +const DEFAULT_MODES_SEPARATOR: &str = "|"; // we do this so this is evaluated at compile-time pub const fn get_stats_data_types() -> [JsonTypes; MAX_STAT_COLUMNS] { @@ -1607,6 +1610,12 @@ impl Stats { mc_pieces.push(itoa::Buffer::new().format(cardinality).to_owned()); } if self.which.mode { + // get the modes separator + let modes_separator = ANTIMODES_SEPARATOR.get_or_init(|| { + std::env::var("QSV_MODES_SEPARATOR") + .unwrap_or(DEFAULT_MODES_SEPARATOR.to_string()) + }); + // mode/s if cardinality == record_count { // all values unique, short-circuit modes calculation as there is none @@ -1616,7 +1625,7 @@ impl Stats { let modes_list = modes_result .iter() .map(|c| String::from_utf8_lossy(c)) - .join(","); + .join(modes_separator); mc_pieces.extend_from_slice(&[ modes_list, modes_count.to_string(), @@ -1642,10 +1651,11 @@ impl Stats { .map(|val| { let parsed = val.parse::().unwrap_or(DEFAULT_ANTIMODES_LEN); + // if 0, disable length limiting if parsed == 0 { - MAX_ANTIMODES_LEN + usize::MAX } else { - parsed.min(MAX_ANTIMODES_LEN) + parsed } }) .unwrap_or(DEFAULT_ANTIMODES_LEN) @@ -1662,8 +1672,11 @@ impl Stats { let antimodes_vals = &antimodes_result .iter() .map(|c| String::from_utf8_lossy(c)) - .join(","); - if antimodes_vals.starts_with(',') { + .join(modes_separator); + + // if the antimodes result starts with the separator, + // it indicates that NULL is the first antimode. Add NULL to the list. + if antimodes_vals.starts_with(modes_separator) { antimodes_list.push_str("NULL"); } antimodes_list.push_str(antimodes_vals); From 7cdf06da6ec167f46850064ba9c32b6e747f5fc3 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 30 Dec 2024 13:04:53 -0500 Subject: [PATCH 06/12] tests: update `stats` tests to reflect "|" as default modes separator --- .../boston311-100-antimodes-len500-stats.csv | 36 +++++++++--------- ...boston311-100-everything-8places-stats.csv | 36 +++++++++--------- ...-everything-date-stats-variance-stddev.csv | 36 +++++++++--------- .../boston311-100-everything-date-stats.csv | 36 +++++++++--------- ...ton311-100-everything-datenotime-stats.csv | 38 +++++++++---------- ...hing-inferdates-defaultwhitelist-stats.csv | 36 +++++++++--------- .../boston311-100-everything-nodate-stats.csv | 36 +++++++++--------- tests/test_stats.rs | 10 ++--- 8 files changed, 132 insertions(+), 132 deletions(-) diff --git a/resources/test/boston311-100-antimodes-len500-stats.csv b/resources/test/boston311-100-antimodes-len500-stats.csv index c6553de46..c08aab90d 100644 --- a/resources/test/boston311-100-antimodes-len500-stats.csv +++ b/resources/test/boston311-100-antimodes-len500-stats.csv @@ -1,32 +1,32 @@ field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,,,,101004116451.8,790.552,101004116451.8012,101004116451.7994,7905.5202,62497248.9138,0,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,1900,19,0,0,0,,,,,,,,0,,0,,,,,,,,,,,100,,0,0,*ALL,0,1, -target_dt,String,true,,2022-01-03 10:32:34,2022-05-20 13:03:21,,Unsorted,0,19,1691,16.91,5.9449,35.3419,0.3516,,,,,,,,11,,0.11,,,,,,,,,,,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04 10:42:18,2022-01-04 11:26:45,2022-01-04 11:31:57,2022-01-04 12:13:47,2022-01-04 12:19:43,2022-01-04 15:51:30",34,1, -closed_dt,String,true,,2022-01-01 12:56:14,2022-04-25 14:30:31,,Unsorted,0,19,1615,16.15,6.6205,43.8312,0.4099,,,,,,,,15,,0.15,,,,,,,,,,,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01 19:07:41,2022-01-02 06:43:42,2022-01-02 06:49:07,2022-01-02 08:54:11,2022-01-02 11:03:10,2022-01-02 12:45:49",85,1, +target_dt,String,true,,2022-01-03 10:32:34,2022-05-20 13:03:21,,Unsorted,0,19,1691,16.91,5.9449,35.3419,0.3516,,,,,,,,11,,0.11,,,,,,,,,,,42,2022-01-04 08:30:00,1,25,*PREVIEW: 2022-01-03 10:32:34|2022-01-03 11:58:12|2022-01-04 09:58:36|2022-01-04 10:41:29|2022-01-04 10:42:18|2022-01-04 11:26:45|2022-01-04 11:31:57|2022-01-04 12:13:47|2022-01-04 12:19:43|2022-01-04 15:51:30,34,1, +closed_dt,String,true,,2022-01-01 12:56:14,2022-04-25 14:30:31,,Unsorted,0,19,1615,16.15,6.6205,43.8312,0.4099,,,,,,,,15,,0.15,,,,,,,,,,,86,,1,15,*PREVIEW: 2022-01-01 12:56:14|2022-01-01 14:17:15|2022-01-01 14:59:41|2022-01-01 15:10:16|2022-01-01 19:07:41|2022-01-02 06:43:42|2022-01-02 06:49:07|2022-01-02 08:54:11|2022-01-02 11:03:10|2022-01-02 12:45:49,85,1, ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,0.3756,0.1411,0.0609,,,,,,,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,0.7141,0.51,0.1253,,,,,,,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,55.0262,3027.8804,0.6618,,,,,,,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved NO OVERFLOW NO ADDITIONAL CART DELV 1/11/22 ,Case Closed. Closed date : 2022-01-02 23:40:14.32 Case Resolved CLEAR ,Case Closed. Closed date : 2022-01-02 23:54:13.31 Case Resolved BOSTON POLICE WERE THERE TO TICKET ,Case Closed. Closed date : 2022-01-02 23:55:44.4 Case Resolved CLEAR ,Case Closed. Closed date : 2022-01-02 23:59:18.367 Case Resolved CLEAR ,Case Closed. Closed date : 2022-01-03 00:...",85,1, -case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,9.452,89.3404,0.3961,,,,,,,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental,Electrical,Graffiti: Ward 8 0803 ,Loud Parties/Music/People,Misc. Snow Complaint,Missed ""Other"" Trash: District 07,Missed Trash/Recycling/Yard Waste/Bulk Item",24,1, -subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,4.9041,24.05,0.1908,,,,,,,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, -reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,8.4056,70.6536,0.4443,,,,,,,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturbance,Programs,Sidewalk Cover / Manhole",7,1, -type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,8.034,64.5444,0.3545,,,,,,,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or Policy,General Lighting Request,Knockdown Replacement,Loud Parties/Music/People,Misc. Snow Complaint,New Sign Crosswalk or Pavement Marking,Schedule a Bulk Item Pickup SS",15,1, -queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,8.0224,64.3596,0.2863,,,,,,,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Sign Installation,BTDT_Traffic Signal_Graffiti,ISD_Building (INTERNAL),PARK_Maintenance_Region 6,PWDx_Contractor Complaints,PWDx_District 04: Allston/Brighton,PWDx_District 06: West Roxbury and Roslindale,PWDx_District 09: East Boston",15,1, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,55.0262,3027.8804,0.6618,,,,,,,,0,,0,,,,,,,,,,,86, ,1,15,*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 |Case Closed Case Resolved NO OVERFLOW NO ADDITIONAL CART DELV 1/11/22 |Case Closed. Closed date : 2022-01-02 23:40:14.32 Case Resolved CLEAR |Case Closed. Closed date : 2022-01-02 23:54:13.31 Case Resolved BOSTON POLICE WERE THERE TO TICKET |Case Closed. Closed date : 2022-01-02 23:55:44.4 Case Resolved CLEAR |Case Closed. Closed date : 2022-01-02 23:59:18.367 Case Resolved CLEAR |Case Closed. Closed date : 2022-01-03 00:...,85,1, +case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,9.452,89.3404,0.3961,,,,,,,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request|BTDT: Complaint|City/State Snow Issues|DISPATCHED Short Term Rental|Electrical|Graffiti: Ward 8 0803 |Loud Parties/Music/People|Misc. Snow Complaint|Missed ""Other"" Trash: District 07|Missed Trash/Recycling/Yard Waste/Bulk Item",24,1, +subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,4.9041,24.05,0.1908,,,,,,,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,Animal Control|Boston Police Department|Boston Water & Sewer Commission,3,1, +reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,8.4056,70.6536,0.4443,,,,,,,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,Administrative & General Requests|Animal Issues|Building|Employee & General Comments|Noise Disturbance|Programs|Sidewalk Cover / Manhole,7,1, +type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,8.034,64.5444,0.3545,,,,,,,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|City/State Snow Issues|Electrical|General Comments For a Program or Policy|General Lighting Request|Knockdown Replacement|Loud Parties/Music/People|Misc. Snow Complaint|New Sign Crosswalk or Pavement Marking|Schedule a Bulk Item Pickup SS,15,1, +queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,8.0224,64.3596,0.2863,,,,,,,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,*PREVIEW: BTDT_BostonBikes|BTDT_Engineering_New Sign and Pavement Marking Requests|BTDT_Sign Shop_Sign Installation|BTDT_Traffic Signal_Graffiti|ISD_Building (INTERNAL)|PARK_Maintenance_Region 6|PWDx_Contractor Complaints|PWDx_District 04: Allston/Brighton|PWDx_District 06: West Roxbury and Roslindale|PWDx_District 09: East Boston,15,1, department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,0.2713,0.0736,0.0692,,,,,,,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, -submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,43.2585,1871.2989,1.1907,,,,,,,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d0627a05bbcf180c297a39/photo_20220101_091656.jpg,https://311.boston.gov/media/boston/report/photos/61d063e505bbcf180c297b6a/photo_20220101_092319.jpg,https://311.boston.gov/media/boston/report/photos/61d0688a05bbcf180c297be9/report.jpg,https://311.boston.gov/media/boston/report/photos/61d074c005bbcf180c298048/report.jpg,https://311.boston.gov/media/b...",42,1, +submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,43.2585,1871.2989,1.1907,,,,,,,,58,,0.58,,,,,,,,,,,43,,1,58,*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg|https://311.boston.gov/media/boston/report/photos/61d0627a05bbcf180c297a39/photo_20220101_091656.jpg|https://311.boston.gov/media/boston/report/photos/61d063e505bbcf180c297b6a/photo_20220101_092319.jpg|https://311.boston.gov/media/boston/report/photos/61d0688a05bbcf180c297be9/report.jpg|https://311.boston.gov/media/boston/report/photos/61d074c005bbcf180c298048/report.jpg|https://311.boston.gov/media/b...,42,1, closedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, -location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,9.6247,92.6356,0.2444,,,,,,,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Ave Hyde Park MA 02136,119 L St South Boston MA 02127,12 Derne St Boston MA 02114,126 Elm St Charlestown MA 02129,1270 Commonwealth Ave Allston MA 02134,130 Shirley St Roxbury MA 02119,131 Arlington St Boston MA 02116",96,1, +location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,9.6247,92.6356,0.2444,,,,,,,,0,,0,,,,,,,,,,,98,563 Columbus Ave Roxbury MA 02118|INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ,2,2,*PREVIEW: |103 N Beacon St Brighton MA 02135|11 Aberdeen St Boston MA 02215|1148 Hyde Park Ave Hyde Park MA 02136|119 L St South Boston MA 02127|12 Derne St Boston MA 02114|126 Elm St Charlestown MA 02129|1270 Commonwealth Ave Allston MA 02134|130 Shirley St Roxbury MA 02119|131 Arlington St Boston MA 02116,96,1, fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,0.3363,0.1131,0.2976,,,,,,,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,0.3192,0.1019,0.1527,,,,,,,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,0,0,0,,,,,,,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,0.444,0.1971,0.1991,,,,,,,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, -neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,9.8671,97.3604,0.664,,,,,,,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, -neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,0.4877,0.2379,0.3509,,,,,,,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, -ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,2.2293,4.9699,0.4468,,,,,,,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, -precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,0.4951,0.2451,0.126,,,,,,,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, -location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,9.3995,88.3508,0.5222,,,,,,,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,1270 Commonwealth Ave,130 Shirley St,131 Arlington St",94,1, -location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,1.8405,3.3874,0.4435,,,,,,,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, -latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,,,,42.3367,0.0031,42.3367,42.3367,0.0305,0.0009,0.072,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, -longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,,,,-71.0727,0.0031,,,0.0311,0.001,-0.0437,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, +neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,9.8671,97.3604,0.664,,,,,,,,0,,0,,,,,,,,,,,19,Dorchester,1,15, |Brighton|Mission Hill,3,1, +neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,0.4877,0.2379,0.3509,,,,,,,,0,,0,,,,,,,,,,,16,3,1,15, |12,2,1, +ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,2.2293,4.9699,0.4468,,,,,,,,0,,0,,,,,,,,,,,42,Ward 3,1,10,*PREVIEW: |01|02|04|06|07|1|10|16|18,23,1, +precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,0.4951,0.2451,0.126,,,,,,,,1,,0.01,,,,,,,,,,,76,0306,1,5,*PREVIEW: NULL| |0102|0105|0108|0109|0201|0204|0305|0307,61,1, +location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,9.3995,88.3508,0.5222,,,,,,,,1,,0.01,,,,,,,,,,,97,20 Washington St|563 Columbus Ave|INTERSECTION Gallivan Blvd & Washington St,3,2,*PREVIEW: NULL|103 N Beacon St|11 Aberdeen St|1148 Hyde Park Ave|119 L St|12 Derne St|126 Elm St|1270 Commonwealth Ave|130 Shirley St|131 Arlington St,94,1, +location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,1.8405,3.3874,0.4435,,,,,,,,17,,0.17,,,,,,,,,,,24,,1,17,02126|02134|02210|02215,4,1, +latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,,,,42.3367,0.0031,42.3367,42.3367,0.0305,0.0009,0.072,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,*PREVIEW: 42.2553|42.2601|42.2609|42.2645|42.2674|42.2789|42.2797|42.2804|42.2821|42.2878,74,1, +longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,,,,-71.0727,0.0031,,,0.0311,0.001,-0.0437,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,*PREVIEW: -71.0298|-71.0301|-71.0309|-71.0323|-71.0325|-71.0329|-71.0336|-71.0338|-71.034|-71.0355,72,1, source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,2.3473,5.5099,0.1303,,,,,,,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 diff --git a/resources/test/boston311-100-everything-8places-stats.csv b/resources/test/boston311-100-everything-8places-stats.csv index 2b92fd146..53bf7e59c 100644 --- a/resources/test/boston311-100-everything-8places-stats.csv +++ b/resources/test/boston311-100-everything-8places-stats.csv @@ -1,32 +1,32 @@ field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,101004116451.80003357,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.09379509,100,,0,0,*ALL,0,1, open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,1900,19,,0,,0,,,,,,,,,,,100,,0,0,*ALL,0,1, -target_dt,String,true,,2022-01-03 10:32:34,2022-05-20 13:03:21,,Unsorted,0,19,1691,16.91,,11,,0.11,,,,,,,,,,,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04...",34,1, -closed_dt,String,true,,2022-01-01 12:56:14,2022-04-25 14:30:31,,Unsorted,0,19,1615,16.15,,15,,0.15,,,,,,,,,,,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01...",85,1, +target_dt,String,true,,2022-01-03 10:32:34,2022-05-20 13:03:21,,Unsorted,0,19,1691,16.91,,11,,0.11,,,,,,,,,,,42,2022-01-04 08:30:00,1,25,*PREVIEW: 2022-01-03 10:32:34|2022-01-03 11:58:12|2022-01-04 09:58:36|2022-01-04 10:41:29|2022-01-04...,34,1, +closed_dt,String,true,,2022-01-01 12:56:14,2022-04-25 14:30:31,,Unsorted,0,19,1615,16.15,,15,,0.15,,,,,,,,,,,86,,1,15,*PREVIEW: 2022-01-01 12:56:14|2022-01-01 14:17:15|2022-01-01 14:59:41|2022-01-01 15:10:16|2022-01-01...,85,1, ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved ...",85,1, -case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental...",24,1, -subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, -reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturban...",7,1, -type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or...",15,1, -queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Si...",15,1, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,0,,0,,,,,,,,,,,86, ,1,15,*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 |Case Closed Case Resolved ...,85,1, +case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|BTDT: Complaint|City/State Snow Issues|DISPATCHED Short Term Rental...,24,1, +subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,Animal Control|Boston Police Department|Boston Water & Sewer Commission,3,1, +reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,Administrative & General Requests|Animal Issues|Building|Employee & General Comments|Noise Disturban...,7,1, +type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|City/State Snow Issues|Electrical|General Comments For a Program or...,15,1, +queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,*PREVIEW: BTDT_BostonBikes|BTDT_Engineering_New Sign and Pavement Marking Requests|BTDT_Sign Shop_Si...,15,1, department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, -submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,http...",42,1, +submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,58,,0.58,,,,,,,,,,,43,,1,58,*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg|http...,42,1, closedphoto,NULL,,,,,,,0,0,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, -location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Av...",96,1, +location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,0,,0,,,,,,,,,,,98,563 Columbus Ave Roxbury MA 02118|INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ,2,2,*PREVIEW: |103 N Beacon St Brighton MA 02135|11 Aberdeen St Boston MA 02215|1148 Hyde Park Av...,96,1, fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, -neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, -neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, -ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, -precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, -location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,127...",94,1, -location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, -latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.336674,0,4,0,0.0163,42.2034,42.2619,42.3204,42.34315,42.3594,0.039,42.4179,42.4764,-0.16666667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, -longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.072688,0,4,0,0.01205,-71.17405,-71.129425,-71.0848,-71.06085,-71.05505,0.02975,-71.010425,-70.9658,-0.61008403,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, +neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,0,,0,,,,,,,,,,,19,Dorchester,1,15, |Brighton|Mission Hill,3,1, +neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,0,,0,,,,,,,,,,,16,3,1,15, |12,2,1, +ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,0,,0,,,,,,,,,,,42,Ward 3,1,10,*PREVIEW: |01|02|04|06|07|1|10|16|18,23,1, +precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,1,,0.01,,,,,,,,,,,76,0306,1,5,*PREVIEW: NULL| |0102|0105|0108|0109|0201|0204|0305|0307,61,1, +location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,1,,0.01,,,,,,,,,,,97,20 Washington St|563 Columbus Ave|INTERSECTION Gallivan Blvd & Washington St,3,2,*PREVIEW: NULL|103 N Beacon St|11 Aberdeen St|1148 Hyde Park Ave|119 L St|12 Derne St|126 Elm St|127...,94,1, +location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,17,,0.17,,,,,,,,,,,24,,1,17,02126|02134|02210|02215,4,1, +latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.336674,0,4,0,0.0163,42.2034,42.2619,42.3204,42.34315,42.3594,0.039,42.4179,42.4764,-0.16666667,78,42.3594,1,20,*PREVIEW: 42.2553|42.2601|42.2609|42.2645|42.2674|42.2789|42.2797|42.2804|42.2821|42.2878,74,1, +longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.072688,0,4,0,0.01205,-71.17405,-71.129425,-71.0848,-71.06085,-71.05505,0.02975,-71.010425,-70.9658,-0.61008403,77,-71.0587,1,19,*PREVIEW: -71.0298|-71.0301|-71.0309|-71.0323|-71.0325|-71.0329|-71.0336|-71.0338|-71.034|-71.0355,72,1, source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 diff --git a/resources/test/boston311-100-everything-date-stats-variance-stddev.csv b/resources/test/boston311-100-everything-date-stats-variance-stddev.csv index c69643d0a..d2ec13321 100644 --- a/resources/test/boston311-100-everything-date-stats-variance-stddev.csv +++ b/resources/test/boston311-100-everything-date-stats-variance-stddev.csv @@ -1,32 +1,32 @@ field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,,,,101004116451.8,790.552,101004116451.8012,101004116451.7994,7905.5202,62497248.9138,0,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, open_dt,DateTime,,,2022-01-01T00:16:00+00:00,2022-01-31T11:46:00+00:00,30.47917,Unsorted,,,,,,,,2022-01-04T07:07:45.050+00:00,0.5568,18996.29623,18996.29542,5.568,31.00259,0.0293,0,,0,0.76261,2021-12-27T14:16:49+00:00,2021-12-30T06:00:07+00:00,2022-01-01T21:43:25+00:00,2022-01-03T07:02:14+00:00,2022-01-03T16:12:17+00:00,1.77005,2022-01-06T07:55:35+00:00,2022-01-08T23:38:53+00:00,-0.5684,100,,0,0,*ALL,0,1, -target_dt,DateTime,,,2022-01-03T10:32:34+00:00,2022-05-20T13:03:21+00:00,137.10471,Unsorted,,,,,,,,2022-01-17T03:14:16.404+00:00,2.86258,19009.11578,19009.0967,27.00551,729.29774,0.1421,11,,0.11,1,2021-11-26T08:30:00+00:00,2021-12-15T20:30:00+00:00,2022-01-04T08:30:00+00:00,2022-01-05T08:30:00+00:00,2022-01-17T08:30:00+00:00,13,2022-02-05T20:30:00+00:00,2022-02-25T08:30:00+00:00,0.8462,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04...",34,1, -closed_dt,DateTime,,,2022-01-01T12:56:14+00:00,2022-04-25T14:30:31+00:00,114.06547,Unsorted,,,,,,,,2022-01-08T01:10:44.411+00:00,1.71655,19000.04255,19000.036,15.82577,250.4549,0.0833,15,,0.15,0.77213,2021-12-29T15:13:29+00:00,2021-12-31T19:50:08.750+00:00,2022-01-03T00:26:48.500+00:00,2022-01-03T12:15:23+00:00,2022-01-04T11:31:15+00:00,1.46142,2022-01-06T16:07:54.750+00:00,2022-01-08T20:44:34.500+00:00,0.3266,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01...",85,1, +target_dt,DateTime,,,2022-01-03T10:32:34+00:00,2022-05-20T13:03:21+00:00,137.10471,Unsorted,,,,,,,,2022-01-17T03:14:16.404+00:00,2.86258,19009.11578,19009.0967,27.00551,729.29774,0.1421,11,,0.11,1,2021-11-26T08:30:00+00:00,2021-12-15T20:30:00+00:00,2022-01-04T08:30:00+00:00,2022-01-05T08:30:00+00:00,2022-01-17T08:30:00+00:00,13,2022-02-05T20:30:00+00:00,2022-02-25T08:30:00+00:00,0.8462,42,2022-01-04 08:30:00,1,25,*PREVIEW: 2022-01-03 10:32:34|2022-01-03 11:58:12|2022-01-04 09:58:36|2022-01-04 10:41:29|2022-01-04...,34,1, +closed_dt,DateTime,,,2022-01-01T12:56:14+00:00,2022-04-25T14:30:31+00:00,114.06547,Unsorted,,,,,,,,2022-01-08T01:10:44.411+00:00,1.71655,19000.04255,19000.036,15.82577,250.4549,0.0833,15,,0.15,0.77213,2021-12-29T15:13:29+00:00,2021-12-31T19:50:08.750+00:00,2022-01-03T00:26:48.500+00:00,2022-01-03T12:15:23+00:00,2022-01-04T11:31:15+00:00,1.46142,2022-01-06T16:07:54.750+00:00,2022-01-08T20:44:34.500+00:00,0.3266,86,,1,15,*PREVIEW: 2022-01-01 12:56:14|2022-01-01 14:17:15|2022-01-01 14:59:41|2022-01-01 15:10:16|2022-01-01...,85,1, ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,0.3756,0.1411,0.0609,,,,,,,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,0.7141,0.51,0.1253,,,,,,,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,55.0262,3027.8804,0.6618,,,,,,,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved ...",85,1, -case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,9.452,89.3404,0.3961,,,,,,,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental...",24,1, -subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,4.9041,24.05,0.1908,,,,,,,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, -reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,8.4056,70.6536,0.4443,,,,,,,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturban...",7,1, -type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,8.034,64.5444,0.3545,,,,,,,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or...",15,1, -queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,8.0224,64.3596,0.2863,,,,,,,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Si...",15,1, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,55.0262,3027.8804,0.6618,,,,,,,,0,,0,,,,,,,,,,,86, ,1,15,*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 |Case Closed Case Resolved ...,85,1, +case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,9.452,89.3404,0.3961,,,,,,,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|BTDT: Complaint|City/State Snow Issues|DISPATCHED Short Term Rental...,24,1, +subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,4.9041,24.05,0.1908,,,,,,,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,Animal Control|Boston Police Department|Boston Water & Sewer Commission,3,1, +reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,8.4056,70.6536,0.4443,,,,,,,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,Administrative & General Requests|Animal Issues|Building|Employee & General Comments|Noise Disturban...,7,1, +type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,8.034,64.5444,0.3545,,,,,,,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|City/State Snow Issues|Electrical|General Comments For a Program or...,15,1, +queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,8.0224,64.3596,0.2863,,,,,,,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,*PREVIEW: BTDT_BostonBikes|BTDT_Engineering_New Sign and Pavement Marking Requests|BTDT_Sign Shop_Si...,15,1, department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,0.2713,0.0736,0.0692,,,,,,,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, -submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,43.2585,1871.2989,1.1907,,,,,,,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,http...",42,1, +submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,43.2585,1871.2989,1.1907,,,,,,,,58,,0.58,,,,,,,,,,,43,,1,58,*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg|http...,42,1, closedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, -location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,9.6247,92.6356,0.2444,,,,,,,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Av...",96,1, +location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,9.6247,92.6356,0.2444,,,,,,,,0,,0,,,,,,,,,,,98,563 Columbus Ave Roxbury MA 02118|INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ,2,2,*PREVIEW: |103 N Beacon St Brighton MA 02135|11 Aberdeen St Boston MA 02215|1148 Hyde Park Av...,96,1, fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,0.3363,0.1131,0.2976,,,,,,,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,0.3192,0.1019,0.1527,,,,,,,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,0,0,0,,,,,,,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,0.444,0.1971,0.1991,,,,,,,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, -neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,9.8671,97.3604,0.664,,,,,,,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, -neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,0.4877,0.2379,0.3509,,,,,,,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, -ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,2.2293,4.9699,0.4468,,,,,,,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, -precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,0.4951,0.2451,0.126,,,,,,,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, -location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,9.3995,88.3508,0.5222,,,,,,,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,127...",94,1, -location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,1.8405,3.3874,0.4435,,,,,,,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, -latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,,,,42.3367,0.0031,42.3367,42.3367,0.0305,0.0009,0.072,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, -longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,,,,-71.0727,0.0031,,,0.0311,0.001,-0.0437,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, +neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,9.8671,97.3604,0.664,,,,,,,,0,,0,,,,,,,,,,,19,Dorchester,1,15, |Brighton|Mission Hill,3,1, +neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,0.4877,0.2379,0.3509,,,,,,,,0,,0,,,,,,,,,,,16,3,1,15, |12,2,1, +ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,2.2293,4.9699,0.4468,,,,,,,,0,,0,,,,,,,,,,,42,Ward 3,1,10,*PREVIEW: |01|02|04|06|07|1|10|16|18,23,1, +precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,0.4951,0.2451,0.126,,,,,,,,1,,0.01,,,,,,,,,,,76,0306,1,5,*PREVIEW: NULL| |0102|0105|0108|0109|0201|0204|0305|0307,61,1, +location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,9.3995,88.3508,0.5222,,,,,,,,1,,0.01,,,,,,,,,,,97,20 Washington St|563 Columbus Ave|INTERSECTION Gallivan Blvd & Washington St,3,2,*PREVIEW: NULL|103 N Beacon St|11 Aberdeen St|1148 Hyde Park Ave|119 L St|12 Derne St|126 Elm St|127...,94,1, +location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,1.8405,3.3874,0.4435,,,,,,,,17,,0.17,,,,,,,,,,,24,,1,17,02126|02134|02210|02215,4,1, +latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,,,,42.3367,0.0031,42.3367,42.3367,0.0305,0.0009,0.072,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,*PREVIEW: 42.2553|42.2601|42.2609|42.2645|42.2674|42.2789|42.2797|42.2804|42.2821|42.2878,74,1, +longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,,,,-71.0727,0.0031,,,0.0311,0.001,-0.0437,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,*PREVIEW: -71.0298|-71.0301|-71.0309|-71.0323|-71.0325|-71.0329|-71.0336|-71.0338|-71.034|-71.0355,72,1, source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,2.3473,5.5099,0.1303,,,,,,,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 diff --git a/resources/test/boston311-100-everything-date-stats.csv b/resources/test/boston311-100-everything-date-stats.csv index 35e5d930f..487105f1c 100644 --- a/resources/test/boston311-100-everything-date-stats.csv +++ b/resources/test/boston311-100-everything-date-stats.csv @@ -1,32 +1,32 @@ field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,101004116451.8,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, open_dt,DateTime,,,2022-01-01T00:16:00+00:00,2022-01-31T11:46:00+00:00,30.47917,Unsorted,,,,,2022-01-04T07:07:45.050+00:00,0,,0,0.76261,2021-12-27T14:16:49+00:00,2021-12-30T06:00:07+00:00,2022-01-01T21:43:25+00:00,2022-01-03T07:02:14+00:00,2022-01-03T16:12:17+00:00,1.77005,2022-01-06T07:55:35+00:00,2022-01-08T23:38:53+00:00,-0.5684,100,,0,0,*ALL,0,1, -target_dt,DateTime,,,2022-01-03T10:32:34+00:00,2022-05-20T13:03:21+00:00,137.10471,Unsorted,,,,,2022-01-17T03:14:16.404+00:00,11,,0.11,1,2021-11-26T08:30:00+00:00,2021-12-15T20:30:00+00:00,2022-01-04T08:30:00+00:00,2022-01-05T08:30:00+00:00,2022-01-17T08:30:00+00:00,13,2022-02-05T20:30:00+00:00,2022-02-25T08:30:00+00:00,0.8462,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04...",34,1, -closed_dt,DateTime,,,2022-01-01T12:56:14+00:00,2022-04-25T14:30:31+00:00,114.06547,Unsorted,,,,,2022-01-08T01:10:44.411+00:00,15,,0.15,0.77213,2021-12-29T15:13:29+00:00,2021-12-31T19:50:08.750+00:00,2022-01-03T00:26:48.500+00:00,2022-01-03T12:15:23+00:00,2022-01-04T11:31:15+00:00,1.46142,2022-01-06T16:07:54.750+00:00,2022-01-08T20:44:34.500+00:00,0.3266,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01...",85,1, +target_dt,DateTime,,,2022-01-03T10:32:34+00:00,2022-05-20T13:03:21+00:00,137.10471,Unsorted,,,,,2022-01-17T03:14:16.404+00:00,11,,0.11,1,2021-11-26T08:30:00+00:00,2021-12-15T20:30:00+00:00,2022-01-04T08:30:00+00:00,2022-01-05T08:30:00+00:00,2022-01-17T08:30:00+00:00,13,2022-02-05T20:30:00+00:00,2022-02-25T08:30:00+00:00,0.8462,42,2022-01-04 08:30:00,1,25,*PREVIEW: 2022-01-03 10:32:34|2022-01-03 11:58:12|2022-01-04 09:58:36|2022-01-04 10:41:29|2022-01-04...,34,1, +closed_dt,DateTime,,,2022-01-01T12:56:14+00:00,2022-04-25T14:30:31+00:00,114.06547,Unsorted,,,,,2022-01-08T01:10:44.411+00:00,15,,0.15,0.77213,2021-12-29T15:13:29+00:00,2021-12-31T19:50:08.750+00:00,2022-01-03T00:26:48.500+00:00,2022-01-03T12:15:23+00:00,2022-01-04T11:31:15+00:00,1.46142,2022-01-06T16:07:54.750+00:00,2022-01-08T20:44:34.500+00:00,0.3266,86,,1,15,*PREVIEW: 2022-01-01 12:56:14|2022-01-01 14:17:15|2022-01-01 14:59:41|2022-01-01 15:10:16|2022-01-01...,85,1, ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved ...",85,1, -case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental...",24,1, -subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, -reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturban...",7,1, -type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or...",15,1, -queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Si...",15,1, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,0,,0,,,,,,,,,,,86, ,1,15,*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 |Case Closed Case Resolved ...,85,1, +case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|BTDT: Complaint|City/State Snow Issues|DISPATCHED Short Term Rental...,24,1, +subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,Animal Control|Boston Police Department|Boston Water & Sewer Commission,3,1, +reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,Administrative & General Requests|Animal Issues|Building|Employee & General Comments|Noise Disturban...,7,1, +type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|City/State Snow Issues|Electrical|General Comments For a Program or...,15,1, +queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,*PREVIEW: BTDT_BostonBikes|BTDT_Engineering_New Sign and Pavement Marking Requests|BTDT_Sign Shop_Si...,15,1, department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, -submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,http...",42,1, +submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,58,,0.58,,,,,,,,,,,43,,1,58,*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg|http...,42,1, closedphoto,NULL,,,,,,,0,0,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, -location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Av...",96,1, +location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,0,,0,,,,,,,,,,,98,563 Columbus Ave Roxbury MA 02118|INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ,2,2,*PREVIEW: |103 N Beacon St Brighton MA 02135|11 Aberdeen St Boston MA 02215|1148 Hyde Park Av...,96,1, fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, -neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, -neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, -ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, -precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, -location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,127...",94,1, -location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, -latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, -longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, +neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,0,,0,,,,,,,,,,,19,Dorchester,1,15, |Brighton|Mission Hill,3,1, +neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,0,,0,,,,,,,,,,,16,3,1,15, |12,2,1, +ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,0,,0,,,,,,,,,,,42,Ward 3,1,10,*PREVIEW: |01|02|04|06|07|1|10|16|18,23,1, +precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,1,,0.01,,,,,,,,,,,76,0306,1,5,*PREVIEW: NULL| |0102|0105|0108|0109|0201|0204|0305|0307,61,1, +location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,1,,0.01,,,,,,,,,,,97,20 Washington St|563 Columbus Ave|INTERSECTION Gallivan Blvd & Washington St,3,2,*PREVIEW: NULL|103 N Beacon St|11 Aberdeen St|1148 Hyde Park Ave|119 L St|12 Derne St|126 Elm St|127...,94,1, +location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,17,,0.17,,,,,,,,,,,24,,1,17,02126|02134|02210|02215,4,1, +latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,*PREVIEW: 42.2553|42.2601|42.2609|42.2645|42.2674|42.2789|42.2797|42.2804|42.2821|42.2878,74,1, +longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,*PREVIEW: -71.0298|-71.0301|-71.0309|-71.0323|-71.0325|-71.0329|-71.0336|-71.0338|-71.034|-71.0355,72,1, source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 diff --git a/resources/test/boston311-100-everything-datenotime-stats.csv b/resources/test/boston311-100-everything-datenotime-stats.csv index 917bfa6f7..d6a7b797c 100644 --- a/resources/test/boston311-100-everything-datenotime-stats.csv +++ b/resources/test/boston311-100-everything-datenotime-stats.csv @@ -1,32 +1,32 @@ field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,101004116451.8,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, -open_dt,Date,,,2022-01-01,2022-01-31,30,Unsorted,,,,,2022-01-03,0,,0,1,2021-12-26,2021-12-29,2022-01-01,2022-01-03,2022-01-03,2,2022-01-06,2022-01-09,-1,10,2022-01-03,1,38,"2022-01-06,2022-01-08,2022-01-19,2022-01-21",4,1, -target_dt,Date,,,2022-01-03,2022-05-20,137,Unsorted,,,,,2022-01-16,11,,0.11,1,2021-11-26,2021-12-15,2022-01-04,2022-01-05,2022-01-17,13,2022-02-05,2022-02-25,0.8462,22,2022-01-04,1,37,"2022-01-11,2022-01-19,2022-02-04,2022-02-14,2022-02-17,2022-03-02,2022-03-10,2022-04-01,2022-05-20",9,1, -closed_dt,Date,,,2022-01-01,2022-04-25,114,Unsorted,,,,,2022-01-07,15,,0.15,1,2021-12-31,2022-01-01,2022-01-03,2022-01-03,2022-01-04,1,2022-01-05,2022-01-07,1,18,2022-01-03,1,35,"2022-01-07,2022-01-09,2022-01-13,2022-01-19,2022-02-12,2022-02-28,2022-03-09,2022-04-25",8,1, +open_dt,Date,,,2022-01-01,2022-01-31,30,Unsorted,,,,,2022-01-03,0,,0,1,2021-12-26,2021-12-29,2022-01-01,2022-01-03,2022-01-03,2,2022-01-06,2022-01-09,-1,10,2022-01-03,1,38,2022-01-06|2022-01-08|2022-01-19|2022-01-21,4,1, +target_dt,Date,,,2022-01-03,2022-05-20,137,Unsorted,,,,,2022-01-16,11,,0.11,1,2021-11-26,2021-12-15,2022-01-04,2022-01-05,2022-01-17,13,2022-02-05,2022-02-25,0.8462,22,2022-01-04,1,37,2022-01-11|2022-01-19|2022-02-04|2022-02-14|2022-02-17|2022-03-02|2022-03-10|2022-04-01|2022-05-20,9,1, +closed_dt,Date,,,2022-01-01,2022-04-25,114,Unsorted,,,,,2022-01-07,15,,0.15,1,2021-12-31,2022-01-01,2022-01-03,2022-01-03,2022-01-04,1,2022-01-05,2022-01-07,1,18,2022-01-03,1,35,2022-01-07|2022-01-09|2022-01-13|2022-01-19|2022-02-12|2022-02-28|2022-03-09|2022-04-25,8,1, ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved ...",85,1, -case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental...",24,1, -subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, -reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturban...",7,1, -type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or...",15,1, -queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Si...",15,1, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,0,,0,,,,,,,,,,,86, ,1,15,*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 |Case Closed Case Resolved ...,85,1, +case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|BTDT: Complaint|City/State Snow Issues|DISPATCHED Short Term Rental...,24,1, +subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,Animal Control|Boston Police Department|Boston Water & Sewer Commission,3,1, +reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,Administrative & General Requests|Animal Issues|Building|Employee & General Comments|Noise Disturban...,7,1, +type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|City/State Snow Issues|Electrical|General Comments For a Program or...,15,1, +queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,*PREVIEW: BTDT_BostonBikes|BTDT_Engineering_New Sign and Pavement Marking Requests|BTDT_Sign Shop_Si...,15,1, department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, -submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,http...",42,1, +submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,58,,0.58,,,,,,,,,,,43,,1,58,*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg|http...,42,1, closedphoto,NULL,,,,,,,0,0,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, -location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Av...",96,1, +location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,0,,0,,,,,,,,,,,98,563 Columbus Ave Roxbury MA 02118|INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ,2,2,*PREVIEW: |103 N Beacon St Brighton MA 02135|11 Aberdeen St Boston MA 02215|1148 Hyde Park Av...,96,1, fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, -neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, -neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, -ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, -precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, -location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,127...",94,1, -location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, -latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, -longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, +neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,0,,0,,,,,,,,,,,19,Dorchester,1,15, |Brighton|Mission Hill,3,1, +neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,0,,0,,,,,,,,,,,16,3,1,15, |12,2,1, +ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,0,,0,,,,,,,,,,,42,Ward 3,1,10,*PREVIEW: |01|02|04|06|07|1|10|16|18,23,1, +precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,1,,0.01,,,,,,,,,,,76,0306,1,5,*PREVIEW: NULL| |0102|0105|0108|0109|0201|0204|0305|0307,61,1, +location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,1,,0.01,,,,,,,,,,,97,20 Washington St|563 Columbus Ave|INTERSECTION Gallivan Blvd & Washington St,3,2,*PREVIEW: NULL|103 N Beacon St|11 Aberdeen St|1148 Hyde Park Ave|119 L St|12 Derne St|126 Elm St|127...,94,1, +location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,17,,0.17,,,,,,,,,,,24,,1,17,02126|02134|02210|02215,4,1, +latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,*PREVIEW: 42.2553|42.2601|42.2609|42.2645|42.2674|42.2789|42.2797|42.2804|42.2821|42.2878,74,1, +longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,*PREVIEW: -71.0298|-71.0301|-71.0309|-71.0323|-71.0325|-71.0329|-71.0336|-71.0338|-71.034|-71.0355,72,1, source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 diff --git a/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv b/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv index e9dfad15d..afced7f6e 100644 --- a/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv +++ b/resources/test/boston311-100-everything-inferdates-defaultwhitelist-stats.csv @@ -1,32 +1,32 @@ field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,101004116451.8,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, open_dt,DateTime,,,2022-01-01T00:16:00+00:00,2022-01-31T11:46:00+00:00,30.47917,Unsorted,,,,,2022-01-04T07:07:45.050+00:00,0,,0,0.76261,2021-12-27T14:16:49+00:00,2021-12-30T06:00:07+00:00,2022-01-01T21:43:25+00:00,2022-01-03T07:02:14+00:00,2022-01-03T16:12:17+00:00,1.77005,2022-01-06T07:55:35+00:00,2022-01-08T23:38:53+00:00,-0.5684,100,,0,0,*ALL,0,1, -target_dt,String,true,,2022-01-03 10:32:34,2022-05-20 13:03:21,,Unsorted,0,19,1691,16.91,,11,,0.11,,,,,,,,,,,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04...",34,1, -closed_dt,DateTime,,,2022-01-01T12:56:14+00:00,2022-04-25T14:30:31+00:00,114.06547,Unsorted,,,,,2022-01-08T01:10:44.411+00:00,15,,0.15,0.77213,2021-12-29T15:13:29+00:00,2021-12-31T19:50:08.750+00:00,2022-01-03T00:26:48.500+00:00,2022-01-03T12:15:23+00:00,2022-01-04T11:31:15+00:00,1.46142,2022-01-06T16:07:54.750+00:00,2022-01-08T20:44:34.500+00:00,0.3266,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01...",85,1, +target_dt,String,true,,2022-01-03 10:32:34,2022-05-20 13:03:21,,Unsorted,0,19,1691,16.91,,11,,0.11,,,,,,,,,,,42,2022-01-04 08:30:00,1,25,*PREVIEW: 2022-01-03 10:32:34|2022-01-03 11:58:12|2022-01-04 09:58:36|2022-01-04 10:41:29|2022-01-04...,34,1, +closed_dt,DateTime,,,2022-01-01T12:56:14+00:00,2022-04-25T14:30:31+00:00,114.06547,Unsorted,,,,,2022-01-08T01:10:44.411+00:00,15,,0.15,0.77213,2021-12-29T15:13:29+00:00,2021-12-31T19:50:08.750+00:00,2022-01-03T00:26:48.500+00:00,2022-01-03T12:15:23+00:00,2022-01-04T11:31:15+00:00,1.46142,2022-01-06T16:07:54.750+00:00,2022-01-08T20:44:34.500+00:00,0.3266,86,,1,15,*PREVIEW: 2022-01-01 12:56:14|2022-01-01 14:17:15|2022-01-01 14:59:41|2022-01-01 15:10:16|2022-01-01...,85,1, ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved ...",85,1, -case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental...",24,1, -subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, -reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturban...",7,1, -type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or...",15,1, -queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Si...",15,1, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,0,,0,,,,,,,,,,,86, ,1,15,*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 |Case Closed Case Resolved ...,85,1, +case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|BTDT: Complaint|City/State Snow Issues|DISPATCHED Short Term Rental...,24,1, +subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,Animal Control|Boston Police Department|Boston Water & Sewer Commission,3,1, +reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,Administrative & General Requests|Animal Issues|Building|Employee & General Comments|Noise Disturban...,7,1, +type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|City/State Snow Issues|Electrical|General Comments For a Program or...,15,1, +queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,*PREVIEW: BTDT_BostonBikes|BTDT_Engineering_New Sign and Pavement Marking Requests|BTDT_Sign Shop_Si...,15,1, department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, -submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,http...",42,1, +submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,58,,0.58,,,,,,,,,,,43,,1,58,*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg|http...,42,1, closedphoto,NULL,,,,,,,0,0,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, -location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Av...",96,1, +location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,0,,0,,,,,,,,,,,98,563 Columbus Ave Roxbury MA 02118|INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ,2,2,*PREVIEW: |103 N Beacon St Brighton MA 02135|11 Aberdeen St Boston MA 02215|1148 Hyde Park Av...,96,1, fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, -neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, -neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, -ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, -precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, -location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,127...",94,1, -location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, -latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, -longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, +neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,0,,0,,,,,,,,,,,19,Dorchester,1,15, |Brighton|Mission Hill,3,1, +neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,0,,0,,,,,,,,,,,16,3,1,15, |12,2,1, +ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,0,,0,,,,,,,,,,,42,Ward 3,1,10,*PREVIEW: |01|02|04|06|07|1|10|16|18,23,1, +precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,1,,0.01,,,,,,,,,,,76,0306,1,5,*PREVIEW: NULL| |0102|0105|0108|0109|0201|0204|0305|0307,61,1, +location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,1,,0.01,,,,,,,,,,,97,20 Washington St|563 Columbus Ave|INTERSECTION Gallivan Blvd & Washington St,3,2,*PREVIEW: NULL|103 N Beacon St|11 Aberdeen St|1148 Hyde Park Ave|119 L St|12 Derne St|126 Elm St|127...,94,1, +location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,17,,0.17,,,,,,,,,,,24,,1,17,02126|02134|02210|02215,4,1, +latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,*PREVIEW: 42.2553|42.2601|42.2609|42.2645|42.2674|42.2789|42.2797|42.2804|42.2821|42.2878,74,1, +longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,*PREVIEW: -71.0298|-71.0301|-71.0309|-71.0323|-71.0325|-71.0329|-71.0336|-71.0338|-71.034|-71.0355,72,1, source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 diff --git a/resources/test/boston311-100-everything-nodate-stats.csv b/resources/test/boston311-100-everything-nodate-stats.csv index 068b1fe21..f76e15039 100644 --- a/resources/test/boston311-100-everything-nodate-stats.csv +++ b/resources/test/boston311-100-everything-nodate-stats.csv @@ -1,32 +1,32 @@ field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,nullcount,max_precision,sparsity,mad,lower_outer_fence,lower_inner_fence,q1,q2_median,q3,iqr,upper_inner_fence,upper_outer_fence,skewness,cardinality,mode,mode_count,mode_occurrences,antimode,antimode_count,antimode_occurrences,qsv__value case_enquiry_id,Integer,,10100411645180,101004113298,101004155594,42296,Unsorted,12,12,1200,12,101004116451.8,0,,0,673,101004109567,101004111646,101004113725,101004114353,101004115111,1386,101004117190,101004119269,0.0938,100,,0,0,*ALL,0,1, open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,1900,19,,0,,0,,,,,,,,,,,100,,0,0,*ALL,0,1, -target_dt,String,true,,2022-01-03 10:32:34,2022-05-20 13:03:21,,Unsorted,0,19,1691,16.91,,11,,0.11,,,,,,,,,,,42,2022-01-04 08:30:00,1,25,"*PREVIEW: 2022-01-03 10:32:34,2022-01-03 11:58:12,2022-01-04 09:58:36,2022-01-04 10:41:29,2022-01-04...",34,1, -closed_dt,String,true,,2022-01-01 12:56:14,2022-04-25 14:30:31,,Unsorted,0,19,1615,16.15,,15,,0.15,,,,,,,,,,,86,,1,15,"*PREVIEW: 2022-01-01 12:56:14,2022-01-01 14:17:15,2022-01-01 14:59:41,2022-01-01 15:10:16,2022-01-01...",85,1, +target_dt,String,true,,2022-01-03 10:32:34,2022-05-20 13:03:21,,Unsorted,0,19,1691,16.91,,11,,0.11,,,,,,,,,,,42,2022-01-04 08:30:00,1,25,*PREVIEW: 2022-01-03 10:32:34|2022-01-03 11:58:12|2022-01-04 09:58:36|2022-01-04 10:41:29|2022-01-04...,34,1, +closed_dt,String,true,,2022-01-01 12:56:14,2022-04-25 14:30:31,,Unsorted,0,19,1615,16.15,,15,,0.15,,,,,,,,,,,86,,1,15,*PREVIEW: 2022-01-01 12:56:14|2022-01-01 14:17:15|2022-01-01 14:59:41|2022-01-01 15:10:16|2022-01-01...,85,1, ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,617,6.17,,0,,0,,,,,,,,,,,2,ONTIME,1,83,OVERDUE,1,17, case_status,String,true,,Closed,Open,,Unsorted,4,6,570,5.7,,0,,0,,,,,,,,,,,2,Closed,1,85,Open,1,15, -closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,0,,0,,,,,,,,,,,86, ,1,15,"*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 ,Case Closed Case Resolved ...",85,1, -case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,BTDT: Complaint,City/State Snow Issues,DISPATCHED Short Term Rental...",24,1, -subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,"Animal Control,Boston Police Department,Boston Water & Sewer Commission",3,1, -reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,"Administrative & General Requests,Animal Issues,Building,Employee & General Comments,Noise Disturban...",7,1, -type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,"*PREVIEW: Animal Generic Request,City/State Snow Issues,Electrical,General Comments For a Program or...",15,1, -queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,"*PREVIEW: BTDT_BostonBikes,BTDT_Engineering_New Sign and Pavement Marking Requests,BTDT_Sign Shop_Si...",15,1, +closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,284,8314,83.14,,0,,0,,,,,,,,,,,86, ,1,15,*PREVIEW: Case Closed Case Resolved NEW CART#21026466 DELV ON 1/11/22 |Case Closed Case Resolved ...,85,1, +case_title,String,true,,Abandoned Vehicles,Traffic Signal Inspection,,Unsorted,10,57,2386,23.86,,0,,0,,,,,,,,,,,42,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|BTDT: Complaint|City/State Snow Issues|DISPATCHED Short Term Rental...,24,1, +subject,String,true,,Animal Control,Transportation - Traffic Division,,Unsorted,14,33,2570,25.7,,0,,0,,,,,,,,,,,9,Public Works Department,1,51,Animal Control|Boston Police Department|Boston Water & Sewer Commission,3,1, +reason,String,true,,Administrative & General Requests,Street Lights,,Unsorted,7,33,1892,18.92,,0,,0,,,,,,,,,,,20,Enforcement & Abandoned Vehicles,1,23,Administrative & General Requests|Animal Issues|Building|Employee & General Comments|Noise Disturban...,7,1, +type,String,true,,Abandoned Vehicles,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,10,47,2266,22.66,,0,,0,,,,,,,,,,,36,Parking Enforcement,1,20,*PREVIEW: Animal Generic Request|City/State Snow Issues|Electrical|General Comments For a Program or...,15,1, +queue,String,true,,BTDT_AVRS Interface Queue,PWDx_Street Light_General Lighting Request,,Unsorted,13,55,2802,28.02,,0,,0,,,,,,,,,,,35,BTDT_Parking Enforcement,1,21,*PREVIEW: BTDT_BostonBikes|BTDT_Engineering_New Sign and Pavement Marking Requests|BTDT_Sign Shop_Si...,15,1, department,String,true,,BTDT,PWDx,,Unsorted,3,4,392,3.92,,0,,0,,,,,,,,,,,7,PWDx,1,49,GEN_,1,2, -submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,58,,0.58,,,,,,,,,,,43,,1,58,"*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,http...",42,1, +submittedphoto,String,true,,https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg,https://311.boston.gov/media/boston/report/photos/61d75bba05bbcf180c2d41de/report.jpg,,Unsorted,0,100,3633,36.33,,58,,0.58,,,,,,,,,,,43,,1,58,*PREVIEW: https://311.boston.gov/media/boston/report/photos/61d03f0d05bbcf180c2965fd/report.jpg|http...,42,1, closedphoto,NULL,,,,,,,0,0,,,,100,,1,,,,,,,,,,,1,,1,100,,0,0, -location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,0,,0,,,,,,,,,,,98,"563 Columbus Ave Roxbury MA 02118,INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ",2,2,"*PREVIEW: ,103 N Beacon St Brighton MA 02135,11 Aberdeen St Boston MA 02215,1148 Hyde Park Av...",96,1, +location,String,true,, ,INTERSECTION of Verdun St & Gallivan Blvd Dorchester MA ,,Unsorted,1,63,3938,39.38,,0,,0,,,,,,,,,,,98,563 Columbus Ave Roxbury MA 02118|INTERSECTION of Gallivan Blvd & Washington St Dorchester MA ,2,2,*PREVIEW: |103 N Beacon St Brighton MA 02135|11 Aberdeen St Boston MA 02215|1148 Hyde Park Av...,96,1, fire_district,String,true,, ,9,,Unsorted,1,2,113,1.13,,0,,0,,,,,,,,,,,10,3,1,19, ,1,1, pwd_district,String,true,, ,1C,,Unsorted,1,3,209,2.09,,0,,0,,,,,,,,,,,14,1B,1,16, ,1,1, city_council_district,String,true,, ,9,,Unsorted,1,1,100,1,,0,,0,,,,,,,,,,,10,1,1,22, ,1,1, police_district,String,true,, ,E5,,Unsorted,1,3,223,2.23,,0,,0,,,,,,,,,,,13,A1,1,20, ,1,1, -neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,0,,0,,,,,,,,,,,19,Dorchester,1,15," ,Brighton,Mission Hill",3,1, -neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,0,,0,,,,,,,,,,,16,3,1,15," ,12",2,1, -ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,0,,0,,,,,,,,,,,42,Ward 3,1,10,"*PREVIEW: ,01,02,04,06,07,1,10,16,18",23,1, -precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,1,,0.01,,,,,,,,,,,76,0306,1,5,"*PREVIEW: NULL, ,0102,0105,0108,0109,0201,0204,0305,0307",61,1, -location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,1,,0.01,,,,,,,,,,,97,"20 Washington St,563 Columbus Ave,INTERSECTION Gallivan Blvd & Washington St",3,2,"*PREVIEW: NULL,103 N Beacon St,11 Aberdeen St,1148 Hyde Park Ave,119 L St,12 Derne St,126 Elm St,127...",94,1, -location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,17,,0.17,,,,,,,,,,,24,,1,17,"02126,02134,02210,02215",4,1, -latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,"*PREVIEW: 42.2553,42.2601,42.2609,42.2645,42.2674,42.2789,42.2797,42.2804,42.2821,42.2878",74,1, -longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,"*PREVIEW: -71.0298,-71.0301,-71.0309,-71.0323,-71.0325,-71.0329,-71.0336,-71.0338,-71.034,-71.0355",72,1, +neighborhood,String,true,, ,West Roxbury,,Unsorted,1,38,1486,14.86,,0,,0,,,,,,,,,,,19,Dorchester,1,15, |Brighton|Mission Hill,3,1, +neighborhood_services_district,String,true,, ,9,,Unsorted,1,2,139,1.39,,0,,0,,,,,,,,,,,16,3,1,15, |12,2,1, +ward,String,true,, ,Ward 9,,Unsorted,1,7,499,4.99,,0,,0,,,,,,,,,,,42,Ward 3,1,10,*PREVIEW: |01|02|04|06|07|1|10|16|18,23,1, +precinct,String,true,, ,2210,,Unsorted,0,4,393,3.93,,1,,0.01,,,,,,,,,,,76,0306,1,5,*PREVIEW: NULL| |0102|0105|0108|0109|0201|0204|0305|0307,61,1, +location_street_name,String,true,,103 N Beacon St,INTERSECTION Verdun St & Gallivan Blvd,,Unsorted,0,45,1800,18,,1,,0.01,,,,,,,,,,,97,20 Washington St|563 Columbus Ave|INTERSECTION Gallivan Blvd & Washington St,3,2,*PREVIEW: NULL|103 N Beacon St|11 Aberdeen St|1148 Hyde Park Ave|119 L St|12 Derne St|126 Elm St|127...,94,1, +location_zipcode,String,true,,02109,02215,,Unsorted,0,5,415,4.15,,17,,0.17,,,,,,,,,,,24,,1,17,02126|02134|02210|02215,4,1, +latitude,Float,,4233.6674,42.2553,42.3806,0.1253,Unsorted,6,7,694,6.94,42.3367,0,4,0,0.0163,42.2034,42.2619,42.3204,42.3432,42.3594,0.039,42.4179,42.4764,-0.1667,78,42.3594,1,20,*PREVIEW: 42.2553|42.2601|42.2609|42.2645|42.2674|42.2789|42.2797|42.2804|42.2821|42.2878,74,1, +longitude,Float,,-7107.2688,-71.1626,-71.0298,0.1328,Unsorted,6,8,791,7.91,-71.0727,0,4,0,0.0121,-71.1741,-71.1294,-71.0848,-71.0609,-71.055,0.0298,-71.0104,-70.9658,-0.6101,77,-71.0587,1,19,*PREVIEW: -71.0298|-71.0301|-71.0309|-71.0323|-71.0325|-71.0329|-71.0336|-71.0338|-71.034|-71.0355,72,1, source,String,true,,Citizens Connect App,Self Service,,Unsorted,12,20,1801,18.01,,0,,0,,,,,,,,,,,4,Citizens Connect App,1,56,Self Service,1,3, qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100 qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,29 diff --git a/tests/test_stats.rs b/tests/test_stats.rs index f4d28eecd..f4741631a 100644 --- a/tests/test_stats.rs +++ b/tests/test_stats.rs @@ -329,25 +329,25 @@ stats_tests!( stats_multiple_modes, "mode", &["a", "a", "b", "b", "c", "d", "e", "e"], - "a,b,e,3,1" + "a|b|e|3|1" ); stats_tests!( stats_multiple_modes_num, "mode", &["5", "5", "33", "33", "42", "17", "99", "99"], - "33,5,99,3,1" + "33|5|99|3|1" ); stats_tests!( stats_multiple_antimodes, "antimode", &["a", "a", "b", "b", "c", "d", "e", "e"], - "c,d,2,1" + "c|d|2|1" ); stats_tests!( stats_multiple_antimodes_num, "antimode", &["5", "5", "33", "33", "42", "17", "98", "99", "99"], - "17,42,98,3,1" + "17|42|98|3|1" ); stats_tests!( stats_range, @@ -527,7 +527,7 @@ stats_tests!( stats_antimode_null, "antimode", &["", "a", "b", "a"], - "NULL,b,2,1" + "NULL|b|2|1" ); stats_tests!(stats_median, "median", &["1", "2", "3"], "2"); stats_tests!(stats_median_null, "median", &["", "1", "2", "3"], "2"); From 2a9d9a5eb4642108cf4c428f427ea297df5f0e57 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Mon, 30 Dec 2024 13:05:35 -0500 Subject: [PATCH 07/12] docs: document `QSV_ANTIMODES_LEN` - setting it to 0 disables length limiting --- docs/ENVIRONMENT_VARIABLES.md | 2 +- dotenv.template | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ENVIRONMENT_VARIABLES.md b/docs/ENVIRONMENT_VARIABLES.md index 1f6a62b54..1d892cfa8 100644 --- a/docs/ENVIRONMENT_VARIABLES.md +++ b/docs/ENVIRONMENT_VARIABLES.md @@ -7,7 +7,7 @@ | `QSV_SNIFF_DELIMITER` | if set, the delimiter is automatically detected. Overrides `QSV_DEFAULT_DELIMITER` & `--delimiter` option. Note that this does not work with stdin. | | `QSV_NO_HEADERS` | if set, the first row will **NOT** be interpreted as headers. Supersedes `QSV_TOGGLE_HEADERS`. | | `QSV_TOGGLE_HEADERS` | if set to `1`, toggles header setting - i.e. inverts qsv header behavior, with no headers being the default, & setting `--no-headers` will actually mean headers will not be ignored. | -| `QSV_ANTIMODES_LEN` | set to the maximum number of characters when listing "antimodes" in `stats`. Otherwise, the default is 100 (max: 5192). | +| `QSV_ANTIMODES_LEN` | set to the maximum number of characters when listing "antimodes" in `stats`. Otherwise, the default is 100. Set to 0 to disable length limiting. | | `QSV_AUTOINDEX_SIZE` | if set, specifies the minimum file size (in bytes) of a CSV file before an index is automatically created. Note that stale indices are automatically updated regardless of this setting. | | `QSV_CACHE_DIR` | The directory to use for caching downloaded lookup_table resources using the `luau` qsv_register_lookup() helper function. | | `QSV_CKAN_API` | The CKAN Action API endpoint to use with the `luau` qsv_register_lookup() helper function when using the "ckan://" scheme. | diff --git a/dotenv.template b/dotenv.template index 8f41d9646..6a371b4bd 100644 --- a/dotenv.template +++ b/dotenv.template @@ -41,7 +41,7 @@ QSV_NO_HEADERS = False # QSV_TOGGLE_HEADERS = False # set to the maximum number of characters when listing "antimodes" in `stats`. Otherwise, the default is 100. -# max length is 5192 characters +# set to 0 to disable length limiting # QSV_ANTIMODES_LEN = 100 # if set, specifies the minimum file size (in bytes) of a CSV file before an From aee7bee7be5c2b50c1c07dd637b5f238fc434d6e Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Tue, 31 Dec 2024 23:50:38 -0500 Subject: [PATCH 08/12] refactor: `outliers` command - WIP --- src/cmd/outliers.rs | 417 ++++++++++++++++++-------------------------- src/util.rs | 16 +- 2 files changed, 189 insertions(+), 244 deletions(-) diff --git a/src/cmd/outliers.rs b/src/cmd/outliers.rs index eb3701296..88282b626 100644 --- a/src/cmd/outliers.rs +++ b/src/cmd/outliers.rs @@ -1,33 +1,32 @@ static USAGE: &str = r#" -Detect outliers in numeric columns using statistical methods. +Identify or remove outliers in CSV data. Usage: + qsv outliers remove [] qsv outliers [options] [] qsv outliers --help outliers options: - -s, --select Select specific columns to analyze for outliers - (comma separated). By default all numeric columns - are analyzed. - -m, --method Method to use for outlier detection: + -s, --select Select specific columns to analyze for outliers + By default all columns are analyzed. + See 'qsv select --help' for the format details. + -m, --method Method to use for outlier detection: outer - Use outer fences (Q3 + 3.0×IQR) [default] inner - Use inner fences (Q3 + 1.5×IQR) both - Show outliers using both fence types --force Force recomputing stats even if cache exists - -q, --quiet Don't show detailed outlier information, only summary + -q, --quiet Don't show detailed outlier information, only summary Common options: -h, --help Display this message -o, --output Write output to instead of stdout. -d, --delimiter The field delimiter for reading CSV data. - Must be a single character. (default: ,) + Must be a single character. (default: ,) Notes: - Uses the stats cache if available (see 'qsv stats --help') - For numeric columns: Values outside the IQR fences are considered outliers - For dates: Values are converted to days before outlier detection - - Outputs both a summary count and detailed list of outliers per column - - The --quiet flag suppresses detailed outlier listings Examples: # Find outliers in all numeric columns using outer fences @@ -40,9 +39,9 @@ Examples: qsv outliers -m both -q data.csv "#; -use std::{collections::HashMap, fs::File, io, path::Path, str}; +use std::{fs::File, io, path::Path, str}; -use csv::{ByteRecord, Reader}; +use csv::ByteRecord; use indicatif::{ProgressBar, ProgressStyle}; use serde::Deserialize; @@ -57,6 +56,7 @@ use crate::{ #[derive(Deserialize)] struct Args { + cmd_remove: bool, arg_input: Option, flag_select: SelectColumns, flag_method: Option, @@ -66,22 +66,6 @@ struct Args { flag_output: Option, } -#[derive(Debug)] -struct OutlierResult { - column: String, - data_type: String, - outlier_count: usize, - outlier_details: Vec, -} - -#[derive(Debug)] -struct OutlierDetail { - value: String, - reason: String, - fence_type: FenceType, // inner or outer - record_no: u64, // Add this field -} - #[derive(Debug, PartialEq, Clone)] enum FenceType { Inner, @@ -93,7 +77,7 @@ impl FenceType { fn from_str(s: &str) -> FenceType { match s.to_lowercase().as_str() { "inner" => FenceType::Inner, - "outer" => FenceType::Outer, + // "outer" => FenceType::Outer, "both" => FenceType::Both, _ => FenceType::Outer, // default } @@ -105,35 +89,70 @@ fn is_outlier(value: f64, lower_fence: f64, upper_fence: f64) -> bool { value < lower_fence || value > upper_fence } -fn process_outliers( - // rdr: &mut Reader>, - rdr: &mut Reader>, // Add + Send trait bound - stats: &[StatsData], - method: FenceType, - quiet: bool, -) -> CliResult> { - let mut results: Vec = stats - .iter() - .map(|stat| OutlierResult { - column: stat.field.clone(), - data_type: stat.r#type.clone(), - outlier_count: 0, - outlier_details: Vec::new(), - }) - .collect(); +pub fn run(argv: &[&str]) -> CliResult<()> { + let args: Args = util::get_args(USAGE, argv)?; - eprintln!("results: {:#?}", results); + // Get stats records (we still need these for the fences/thresholds) + let schema_args = util::SchemaArgs { + flag_enum_threshold: 0, + flag_ignore_case: false, + flag_strict_dates: false, + flag_pattern_columns: crate::select::SelectColumns::parse("").unwrap(), + flag_dates_whitelist: String::new(), + flag_prefer_dmy: false, + flag_force: args.flag_force, + flag_stdout: false, + flag_jobs: None, + flag_no_headers: false, + flag_delimiter: args.flag_delimiter, + arg_input: args.arg_input.clone(), + flag_memcheck: false, + }; + let (_csv_fields, csv_stats) = get_stats_records(&schema_args, StatsMode::Outliers)?; + + // Setup CSV reader with selection + let rconfig = Config::new(args.arg_input.as_ref()) + .delimiter(args.flag_delimiter) + .select(args.flag_select); + let mut rdr = rconfig.reader()?; - // Create index map for column positions - let headers = rdr.headers()?.clone(); - let col_indices: HashMap<_, _> = headers + // Get headers and create selection + let headers = rdr.byte_headers()?.clone(); + let sel = rconfig.selection(&headers)?; + + // Filter stats to only include selected columns + let selected_stats: Vec = csv_stats .iter() .enumerate() - .map(|(i, name)| (name.to_string(), i)) + .filter(|(idx, _)| sel.contains(idx)) + .map(|(_, stat)| stat.clone()) .collect(); - eprintln!("col_indices: {:#?}", col_indices); - let pb = if !quiet { + // Setup CSV writer + let wtr: Box = match args.flag_output { + Some(ref output_path) => Box::new(File::create(Path::new(output_path))?), + None => Box::new(io::stdout()), + }; + let mut csv_wtr = csv::WriterBuilder::new() + .delimiter(args.flag_delimiter.unwrap_or(Delimiter(b',')).0) + .from_writer(wtr); + + // Write CSV headers + csv_wtr.write_record([ + "column", + "data_type", + "value", + "record_number", + "fence_type", + "reason", + "lower_fence", + "upper_fence", + ])?; + + // Setup progress bar if not quiet + let pb = if args.flag_quiet { + None + } else { let pb = ProgressBar::new_spinner(); pb.set_style( ProgressStyle::default_spinner() @@ -141,123 +160,35 @@ fn process_outliers( .unwrap(), ); Some(pb) - } else { - None }; + // Process records one at a time + let method = FenceType::from_str(args.flag_method.as_deref().unwrap_or("outer")); let mut record = ByteRecord::new(); - let mut record_count = 0; + let mut record_count = 0u64; + while rdr.read_byte_record(&mut record)? { record_count += 1; if let Some(pb) = &pb { pb.set_position(record_count); } - for (result_idx, stat) in stats.iter().enumerate() { - let col_idx = match col_indices.get(&stat.field) { - Some(idx) => idx, - None => continue, - }; - - // Get the field as a byte slice - let field = record.get(*col_idx).unwrap_or_default(); + // Process each selected column + for (col_idx, stat) in selected_stats.iter().enumerate() { + let field = record.get(sel[col_idx]).unwrap_or_default(); match stat.r#type.as_str() { "Integer" | "Float" => { - if let ( - Some(lower_inner), - Some(upper_inner), - Some(lower_outer), - Some(upper_outer), - ) = ( - stat.lower_inner_fence, - stat.upper_inner_fence, - stat.lower_outer_fence, - stat.upper_outer_fence, - ) { - // Parse the bytes directly as a float - // if let Ok(val) = str::from_utf8(field) - // .ok() - // .and_then(|s| s.parse::().ok()) - // { - // let (is_inner, is_outer) = ( - // is_outlier(val, lower_inner, upper_inner), - // is_outlier(val, lower_outer, upper_outer), - // ); - if let Some(val) = str::from_utf8(field) - .ok() - .and_then(|s| s.parse::().ok()) - { - let (is_inner, is_outer) = ( - is_outlier(val, lower_inner, upper_inner), - is_outlier(val, lower_outer, upper_outer), - ); - - match (method.clone(), is_inner, is_outer) { - (FenceType::Inner, true, _) - | (FenceType::Outer, _, true) - | (FenceType::Both, true, _) => { - results[result_idx].outlier_count += 1; - results[result_idx].outlier_details.push(OutlierDetail { - value: val.to_string(), - reason: format!( - "Outside {} fences ({:.2}, {:.2})", - if is_outer { "outer" } else { "inner" }, - if is_outer { lower_outer } else { lower_inner }, - if is_outer { upper_outer } else { upper_inner } - ), - fence_type: if is_outer { - FenceType::Outer - } else { - FenceType::Inner - }, - record_no: record_count, - }); - }, - _ => {}, - } - } + if let Some(val) = str::from_utf8(field) + .ok() + .and_then(|s| s.parse::().ok()) + { + check_numeric_outlier(val, stat, &method, record_count, &mut csv_wtr)?; } }, "String" => { - // Convert bytes to string only when needed if let Ok(val) = str::from_utf8(field) { - // Check string length outliers - if let (Some(mean_len), Some(stddev_len)) = - (stat.avg_length, stat.stddev_length) - { - let len = val.len() as f64; - let z_score = (len - mean_len) / stddev_len; - - if z_score.abs() > 3.0 { - results[result_idx].outlier_count += 1; - results[result_idx].outlier_details.push(OutlierDetail { - value: val.to_string(), - reason: format!( - "Unusual length: {} (z-score: {:.2})", - len, z_score - ), - fence_type: FenceType::Both, - record_no: record_count, - }); - } - } - - // Check rare categories - if let Some(ref antimode) = stat.antimode { - if !antimode.starts_with("*ALL") { - let antimodes: Vec<&str> = antimode.split(',').collect(); - if antimodes.contains(&val) { - results[result_idx].outlier_count += 1; - results[result_idx].outlier_details.push(OutlierDetail { - value: val.to_string(), - reason: "Rare category (antimode)".to_string(), - fence_type: FenceType::Both, - record_no: record_count, - }); - } - } - } + check_string_outlier(val, stat, record_count, &mut csv_wtr)?; } }, _ => {}, @@ -266,108 +197,108 @@ fn process_outliers( } if let Some(pb) = &pb { - pb.finish_with_message(format!("Processed {} records", record_count)); + pb.finish_with_message(format!("Processed {record_count} records")); } - results.retain(|result| result.outlier_count > 0); - Ok(results) + csv_wtr.flush()?; + Ok(()) } -pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = util::get_args(USAGE, argv)?; - - // Get stats records - let schema_args = util::SchemaArgs { - flag_enum_threshold: 0, - flag_ignore_case: false, - flag_strict_dates: false, - flag_pattern_columns: crate::select::SelectColumns::parse("").unwrap(), - flag_dates_whitelist: String::new(), - flag_prefer_dmy: false, - flag_force: args.flag_force, - flag_stdout: false, - flag_jobs: None, - flag_no_headers: false, - flag_delimiter: args.flag_delimiter.clone(), - arg_input: args.arg_input.clone(), - flag_memcheck: false, - }; - - let (_csv_fields, csv_stats) = get_stats_records(&schema_args, StatsMode::FrequencyForceStats)?; - - // Read CSV file using Config - let rconfig = Config::new(args.arg_input.as_ref()) - .delimiter(args.flag_delimiter) - .select(args.flag_select); - - let mut rdr = rconfig.reader()?; - - let headers = rdr.byte_headers()?.clone(); - let sel = rconfig.selection(&headers)?; +// Helper function to check numeric outliers +fn check_numeric_outlier( + value: f64, + stat: &StatsData, + method: &FenceType, + record_no: u64, + csv_wtr: &mut csv::Writer>, +) -> CliResult<()> { + if let (Some(lower_inner), Some(upper_inner), Some(lower_outer), Some(upper_outer)) = ( + stat.lower_inner_fence, + stat.upper_inner_fence, + stat.lower_outer_fence, + stat.upper_outer_fence, + ) { + let (is_inner, is_outer) = ( + is_outlier(value, lower_inner, upper_inner), + is_outlier(value, lower_outer, upper_outer), + ); - // Read the CSV file - // let mut csv_reader = LazyCsvReader::new(&args.arg_input) - // .with_has_header(!args.flag_no_headers) - // .with_delimiter(args.flag_delimiter.unwrap_or(Delimiter(b',')).0); - - // let df = csv_reader.finish()?.collect()?; - - // Process selected columns - // let selected_stats = if let Some(select) = args.flag_select { - // let selected: Vec = select.split(',').map(String::from).collect(); - // csv_stats - // .into_iter() - // .filter(|stat| selected.contains(&stat.field)) - // .collect() - // } else { - // csv_stats - // }; - - // Process selected columns - // let selected_stats: Vec = csv_stats.into_iter().filter(|(_, stat)| - // sel.contains(&stat.field)).collect(); - - let mut selected_stats: Vec = Vec::new(); - for (idx, stat) in csv_stats.iter().enumerate() { - if sel.contains(&idx) { - selected_stats.push(stat.clone()); + match (method, is_inner, is_outer) { + (FenceType::Inner | FenceType::Both, true, _) | (FenceType::Outer, _, true) => { + let (fence_type, lower, upper) = if is_outer { + (FenceType::Outer, lower_outer, upper_outer) + } else { + (FenceType::Inner, lower_inner, upper_inner) + }; + + csv_wtr.write_record([ + &stat.field, + &stat.r#type, + &value.to_string(), + &record_no.to_string(), + &format!("{fence_type:?}"), + &format!( + "Outside {} fences ({:.2}, {:.2})", + if is_outer { "outer" } else { "inner" }, + lower, + upper + ), + &lower.to_string(), + &upper.to_string(), + ])?; + }, + _ => {}, } } - eprintln!("selected_stats: {:#?}", selected_stats); - - // Process outliers - let method = FenceType::from_str(args.flag_method.as_deref().unwrap_or("outer")); - let results = process_outliers(&mut rdr, &selected_stats, method, args.flag_quiet)?; + Ok(()) +} - // Write results - let mut wtr: Box = match args.flag_output { - Some(ref output_path) => Box::new(File::create(Path::new(output_path))?), - None => Box::new(io::stdout()), - }; +// Helper function to check string outliers +fn check_string_outlier( + value: &str, + stat: &StatsData, + record_no: u64, + csv_wtr: &mut csv::Writer>, +) -> CliResult<()> { + // Check string length outliers + if let (Some(mean_len), Some(stddev_len)) = (stat.avg_length, stat.stddev_length) { + println!("mean_len: {mean_len}, stddev_len: {stddev_len} value_len: {}", value.len()); + #[allow(clippy::cast_precision_loss)] + let len = value.len() as f64; + let z_score = (len - mean_len) / stddev_len; + + if z_score.abs() > 3.0 { + csv_wtr.write_record([ + &stat.field, + &stat.r#type, + value, + &record_no.to_string(), + "Both", + &format!("Unusual length: {len} (z-score: {z_score:.2})"), + "", + "", + ])?; + } + } - // Write summary - if results.is_empty() { - writeln!(wtr, "No outliers found")?; - } else { - writeln!(wtr, "\nOutlier Analysis Summary:")?; - writeln!(wtr, "=======================")?; - - for result in &results { - writeln!(wtr, "\nColumn: {} ({})", result.column, result.data_type)?; - writeln!(wtr, "Found {} outliers", result.outlier_count)?; - - if !args.flag_quiet { - writeln!(wtr, "\nOutlier Details:")?; - for detail in &result.outlier_details { - writeln!( - wtr, - " - Record #{:<6} | Value: {:<20} | Reason: {}", - detail.record_no, detail.value, detail.reason - )?; - } + // Check rare categories + if let Some(ref antimode) = stat.antimode { + if !antimode.starts_with("*ALL") { + // let antimodes: Vec<&str> = antimode.split(',').collect(); + if antimode.split('|').any(|x| x == value) { + csv_wtr.write_record([ + &stat.field, + &stat.r#type, + value, + &record_no.to_string(), + "Both", + "Rare category (antimode)", + "", + "", + ])?; } } - } + } Ok(()) } diff --git a/src/util.rs b/src/util.rs index 4809f7a6b..2431237cb 100644 --- a/src/util.rs +++ b/src/util.rs @@ -61,6 +61,7 @@ pub enum StatsMode { FrequencyForceStats, #[cfg(feature = "polars")] PolarsSchema, + Outliers, None, } @@ -2122,6 +2123,11 @@ pub fn get_stats_records( // we need data types, ranges & cardinality format!("stats\t{input}\t--cardinality\t--stats-jsonl\t--output\t{tempfile_path}") }, + StatsMode::Outliers => { + // StatsMode::Outliers + // we need data types, ranges, cardinality, quartiles, mad and modes/antimodes + format!("stats\t{input}\t--cardinality\t--quartiles\t--mad\t--mode\t--stats-jsonl\t--output\t{tempfile_path}") + }, StatsMode::None => unreachable!(), // we returned early on None earlier }; if args.flag_prefer_dmy { @@ -2140,12 +2146,20 @@ pub fn get_stats_records( if let Some(jobs) = stats_args.flag_jobs { stats_args_str = format!("{stats_args_str}\t--jobs\t{jobs}"); } + if stats_args.flag_nulls { + stats_args_str = format!("{stats_args_str}\t--nulls"); + } let stats_args_vec: Vec<&str> = stats_args_str.split('\t').collect(); let qsv_bin = std::env::current_exe().unwrap(); let mut stats_cmd = std::process::Command::new(qsv_bin); - stats_cmd.args(stats_args_vec); + if mode == StatsMode::Outliers { + // set the max length for antimodes + stats_cmd.env("QSV_ANTIMODES_LEN", "0").args(stats_args_vec); + } else { + stats_cmd.args(stats_args_vec); + } let status = stats_cmd.output()?.status; if !status.success() { let status_code = status.code(); From f95cd1ee20611aa7b38c1cfd20407ce4c2eba8dd Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Wed, 1 Jan 2025 09:59:36 -0500 Subject: [PATCH 09/12] refactor: `outliers` to cache antimodes and unify CSV setup for both modes --- src/cmd/outliers.rs | 262 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 216 insertions(+), 46 deletions(-) diff --git a/src/cmd/outliers.rs b/src/cmd/outliers.rs index 88282b626..ed3be1cfd 100644 --- a/src/cmd/outliers.rs +++ b/src/cmd/outliers.rs @@ -54,6 +54,26 @@ use crate::{ CliResult, }; +use std::collections::HashSet; +use std::sync::{Mutex, OnceLock}; + +static ANTIMODE_CACHE: OnceLock>> = OnceLock::new(); + +// Helper function to get or create cached antimodes +fn get_cached_antimodes(antimode: &str) -> HashSet { + let cache = ANTIMODE_CACHE.get_or_init(|| Mutex::new(HashSet::new())); + let mut cache = cache.lock().unwrap(); + if cache.is_empty() { + cache.extend( + antimode + .split('|') + .map(String::from) + .collect::>() + ); + } + cache.clone() +} + #[derive(Deserialize)] struct Args { cmd_remove: bool, @@ -89,31 +109,23 @@ fn is_outlier(value: f64, lower_fence: f64, upper_fence: f64) -> bool { value < lower_fence || value > upper_fence } -pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = util::get_args(USAGE, argv)?; - - // Get stats records (we still need these for the fences/thresholds) - let schema_args = util::SchemaArgs { - flag_enum_threshold: 0, - flag_ignore_case: false, - flag_strict_dates: false, - flag_pattern_columns: crate::select::SelectColumns::parse("").unwrap(), - flag_dates_whitelist: String::new(), - flag_prefer_dmy: false, - flag_force: args.flag_force, - flag_stdout: false, - flag_jobs: None, - flag_no_headers: false, - flag_delimiter: args.flag_delimiter, - arg_input: args.arg_input.clone(), - flag_memcheck: false, - }; - let (_csv_fields, csv_stats) = get_stats_records(&schema_args, StatsMode::Outliers)?; +struct CsvSetup { + reader: csv::Reader>, + writer: csv::Writer>, + headers: ByteRecord, + selected_stats: Vec, + progress_bar: Option, +} +fn setup_csv( + args: &Args, + csv_stats: &[StatsData], + write_outlier_headers: bool, +) -> CliResult { // Setup CSV reader with selection let rconfig = Config::new(args.arg_input.as_ref()) .delimiter(args.flag_delimiter) - .select(args.flag_select); + .select(args.flag_select.clone()); let mut rdr = rconfig.reader()?; // Get headers and create selection @@ -137,20 +149,24 @@ pub fn run(argv: &[&str]) -> CliResult<()> { .delimiter(args.flag_delimiter.unwrap_or(Delimiter(b',')).0) .from_writer(wtr); - // Write CSV headers - csv_wtr.write_record([ - "column", - "data_type", - "value", - "record_number", - "fence_type", - "reason", - "lower_fence", - "upper_fence", - ])?; - - // Setup progress bar if not quiet - let pb = if args.flag_quiet { + // Write headers based on mode + if write_outlier_headers { + csv_wtr.write_record([ + "column", + "data_type", + "value", + "record_number", + "fence_type", + "reason", + "lower_fence", + "upper_fence", + ])?; + } else { + csv_wtr.write_record(&headers)?; + } + + // Setup progress bar + let progress_bar = if args.flag_quiet { None } else { let pb = ProgressBar::new_spinner(); @@ -162,20 +178,172 @@ pub fn run(argv: &[&str]) -> CliResult<()> { Some(pb) }; - // Process records one at a time + Ok(CsvSetup { + reader: rdr, + writer: csv_wtr, + headers, + selected_stats, + progress_bar, + }) +} + +pub fn run(argv: &[&str]) -> CliResult<()> { + let args: Args = util::get_args(USAGE, argv)?; + + // Get stats records (we still need these for the fences/thresholds) + let schema_args = util::SchemaArgs { + flag_enum_threshold: 0, + flag_ignore_case: false, + flag_strict_dates: false, + flag_pattern_columns: crate::select::SelectColumns::parse("").unwrap(), + flag_dates_whitelist: String::new(), + flag_prefer_dmy: false, + flag_force: args.flag_force, + flag_stdout: false, + flag_jobs: None, + flag_no_headers: false, + flag_delimiter: args.flag_delimiter, + arg_input: args.arg_input.clone(), + flag_memcheck: false, + }; + let (_csv_fields, csv_stats) = get_stats_records(&schema_args, StatsMode::Outliers)?; + eprintln!("csv_stats: {:#?}", csv_stats); + + if args.cmd_remove { + remove_outliers(&args, &csv_stats) + } else { + identify_outliers(&args, &csv_stats) + } +} + +// New function to handle the remove subcommand +fn remove_outliers(args: &Args, csv_stats: &[StatsData]) -> CliResult<()> { + let mut setup = setup_csv(args, csv_stats, false)?; + let method = FenceType::from_str(args.flag_method.as_deref().unwrap_or("outer")); + let mut record = ByteRecord::new(); + let mut record_count = 0u64; + let mut removed_count = 0u64; + + while setup.reader.read_byte_record(&mut record)? { + record_count += 1; + if let Some(pb) = &setup.progress_bar { + pb.set_position(record_count); + } + + let mut is_outlier = false; + + // Check each selected column for outliers + for (col_idx, stat) in setup.selected_stats.iter().enumerate() { + let field = record.get(col_idx).unwrap_or_default(); + + match stat.r#type.as_str() { + "Integer" | "Float" => { + if let Some(val) = str::from_utf8(field) + .ok() + .and_then(|s| s.parse::().ok()) + { + is_outlier |= is_numeric_outlier(val, stat, &method); + } + }, + "String" => { + if let Ok(val) = str::from_utf8(field) { + is_outlier |= is_string_outlier(val, stat); + } + }, + _ => {}, + } + + if is_outlier { + break; // No need to check other columns if we found an outlier + } + } + + // Write record only if it's not an outlier + if is_outlier { + removed_count += 1; + } else { + setup.writer.write_record(&record)?; + } + } + + if let Some(pb) = &setup.progress_bar { + pb.finish_with_message(format!( + "Processed {record_count} records, removed {removed_count} outliers" + )); + } + + setup.writer.flush()?; + Ok(()) +} + +// New helper function for checking numeric outliers without writing +fn is_numeric_outlier(value: f64, stat: &StatsData, method: &FenceType) -> bool { + if let (Some(lower_inner), Some(upper_inner), Some(lower_outer), Some(upper_outer)) = ( + stat.lower_inner_fence, + stat.upper_inner_fence, + stat.lower_outer_fence, + stat.upper_outer_fence, + ) { + let (is_inner, is_outer) = ( + is_outlier(value, lower_inner, upper_inner), + is_outlier(value, lower_outer, upper_outer), + ); + + match method { + FenceType::Inner => is_inner, + FenceType::Outer => is_outer, + FenceType::Both => is_inner || is_outer, + } + } else { + false + } +} + +// Helper function for checking string outliers +fn is_string_outlier(value: &str, stat: &StatsData) -> bool { + // Check string length outliers + if let (Some(mean_len), Some(stddev_len)) = (stat.avg_length, stat.stddev_length) { + #[allow(clippy::cast_precision_loss)] + let len = value.len() as f64; + let z_score = (len - mean_len) / stddev_len; + if z_score.abs() > 3.0 { + return true; + } + } + + // Check rare categories with cached antimodes + if let Some(ref antimode) = stat.antimode { + if !antimode.starts_with("*ALL") { + let cached_antimodes = get_cached_antimodes(antimode); + if cached_antimodes.contains(value) { + return true; + } + } + } + + false +} + +fn identify_outliers(args: &Args, csv_stats: &[StatsData]) -> CliResult<()> { + let mut setup = setup_csv(args, csv_stats, true)?; let method = FenceType::from_str(args.flag_method.as_deref().unwrap_or("outer")); let mut record = ByteRecord::new(); let mut record_count = 0u64; - while rdr.read_byte_record(&mut record)? { + while setup.reader.read_byte_record(&mut record)? { record_count += 1; - if let Some(pb) = &pb { + if let Some(pb) = &setup.progress_bar { pb.set_position(record_count); } // Process each selected column - for (col_idx, stat) in selected_stats.iter().enumerate() { - let field = record.get(sel[col_idx]).unwrap_or_default(); + for stat in &setup.selected_stats { + let col_idx = setup + .headers + .iter() + .position(|h| h == stat.field.as_bytes()) + .unwrap_or(0); + let field = record.get(col_idx).unwrap_or_default(); match stat.r#type.as_str() { "Integer" | "Float" => { @@ -183,12 +351,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> { .ok() .and_then(|s| s.parse::().ok()) { - check_numeric_outlier(val, stat, &method, record_count, &mut csv_wtr)?; + check_numeric_outlier(val, stat, &method, record_count, &mut setup.writer)?; } }, "String" => { if let Ok(val) = str::from_utf8(field) { - check_string_outlier(val, stat, record_count, &mut csv_wtr)?; + check_string_outlier(val, stat, record_count, &mut setup.writer)?; } }, _ => {}, @@ -196,11 +364,11 @@ pub fn run(argv: &[&str]) -> CliResult<()> { } } - if let Some(pb) = &pb { + if let Some(pb) = &setup.progress_bar { pb.finish_with_message(format!("Processed {record_count} records")); } - csv_wtr.flush()?; + setup.writer.flush()?; Ok(()) } @@ -262,7 +430,10 @@ fn check_string_outlier( ) -> CliResult<()> { // Check string length outliers if let (Some(mean_len), Some(stddev_len)) = (stat.avg_length, stat.stddev_length) { - println!("mean_len: {mean_len}, stddev_len: {stddev_len} value_len: {}", value.len()); + eprintln!( + "mean_len: {mean_len}, stddev_len: {stddev_len} value_len: {}", + value.len() + ); #[allow(clippy::cast_precision_loss)] let len = value.len() as f64; let z_score = (len - mean_len) / stddev_len; @@ -298,7 +469,6 @@ fn check_string_outlier( ])?; } } - } Ok(()) } From db05c79b0682fcb0dcaaa6cfb434e342505a14da Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 2 Jan 2025 01:36:20 -0500 Subject: [PATCH 10/12] chore: defer `outliers` command until v3 --- src/cmd/outliers.rs | 474 ----------------------------------------- tests/test_outliers.rs | 161 -------------- 2 files changed, 635 deletions(-) delete mode 100644 src/cmd/outliers.rs delete mode 100644 tests/test_outliers.rs diff --git a/src/cmd/outliers.rs b/src/cmd/outliers.rs deleted file mode 100644 index ed3be1cfd..000000000 --- a/src/cmd/outliers.rs +++ /dev/null @@ -1,474 +0,0 @@ -static USAGE: &str = r#" -Identify or remove outliers in CSV data. - -Usage: - qsv outliers remove [] - qsv outliers [options] [] - qsv outliers --help - -outliers options: - -s, --select Select specific columns to analyze for outliers - By default all columns are analyzed. - See 'qsv select --help' for the format details. - -m, --method Method to use for outlier detection: - outer - Use outer fences (Q3 + 3.0×IQR) [default] - inner - Use inner fences (Q3 + 1.5×IQR) - both - Show outliers using both fence types - --force Force recomputing stats even if cache exists - -q, --quiet Don't show detailed outlier information, only summary - -Common options: - -h, --help Display this message - -o, --output Write output to instead of stdout. - -d, --delimiter The field delimiter for reading CSV data. - Must be a single character. (default: ,) - -Notes: - - Uses the stats cache if available (see 'qsv stats --help') - - For numeric columns: Values outside the IQR fences are considered outliers - - For dates: Values are converted to days before outlier detection - -Examples: - # Find outliers in all numeric columns using outer fences - qsv outliers data.csv - - # Find outliers in specific columns using inner fences - qsv outliers -s "temperature,pressure" -m inner data.csv - - # Show both inner and outer fence outliers with minimal output - qsv outliers -m both -q data.csv -"#; - -use std::{fs::File, io, path::Path, str}; - -use csv::ByteRecord; -use indicatif::{ProgressBar, ProgressStyle}; -use serde::Deserialize; - -use crate::{ - cmd::stats::StatsData, - config::{Config, Delimiter}, - select::SelectColumns, - util, - util::{get_stats_records, StatsMode}, - CliResult, -}; - -use std::collections::HashSet; -use std::sync::{Mutex, OnceLock}; - -static ANTIMODE_CACHE: OnceLock>> = OnceLock::new(); - -// Helper function to get or create cached antimodes -fn get_cached_antimodes(antimode: &str) -> HashSet { - let cache = ANTIMODE_CACHE.get_or_init(|| Mutex::new(HashSet::new())); - let mut cache = cache.lock().unwrap(); - if cache.is_empty() { - cache.extend( - antimode - .split('|') - .map(String::from) - .collect::>() - ); - } - cache.clone() -} - -#[derive(Deserialize)] -struct Args { - cmd_remove: bool, - arg_input: Option, - flag_select: SelectColumns, - flag_method: Option, - flag_force: bool, - flag_quiet: bool, - flag_delimiter: Option, - flag_output: Option, -} - -#[derive(Debug, PartialEq, Clone)] -enum FenceType { - Inner, - Outer, - Both, -} - -impl FenceType { - fn from_str(s: &str) -> FenceType { - match s.to_lowercase().as_str() { - "inner" => FenceType::Inner, - // "outer" => FenceType::Outer, - "both" => FenceType::Both, - _ => FenceType::Outer, // default - } - } -} - -// Helper function to determine if a value is an outlier based on fences -fn is_outlier(value: f64, lower_fence: f64, upper_fence: f64) -> bool { - value < lower_fence || value > upper_fence -} - -struct CsvSetup { - reader: csv::Reader>, - writer: csv::Writer>, - headers: ByteRecord, - selected_stats: Vec, - progress_bar: Option, -} - -fn setup_csv( - args: &Args, - csv_stats: &[StatsData], - write_outlier_headers: bool, -) -> CliResult { - // Setup CSV reader with selection - let rconfig = Config::new(args.arg_input.as_ref()) - .delimiter(args.flag_delimiter) - .select(args.flag_select.clone()); - let mut rdr = rconfig.reader()?; - - // Get headers and create selection - let headers = rdr.byte_headers()?.clone(); - let sel = rconfig.selection(&headers)?; - - // Filter stats to only include selected columns - let selected_stats: Vec = csv_stats - .iter() - .enumerate() - .filter(|(idx, _)| sel.contains(idx)) - .map(|(_, stat)| stat.clone()) - .collect(); - - // Setup CSV writer - let wtr: Box = match args.flag_output { - Some(ref output_path) => Box::new(File::create(Path::new(output_path))?), - None => Box::new(io::stdout()), - }; - let mut csv_wtr = csv::WriterBuilder::new() - .delimiter(args.flag_delimiter.unwrap_or(Delimiter(b',')).0) - .from_writer(wtr); - - // Write headers based on mode - if write_outlier_headers { - csv_wtr.write_record([ - "column", - "data_type", - "value", - "record_number", - "fence_type", - "reason", - "lower_fence", - "upper_fence", - ])?; - } else { - csv_wtr.write_record(&headers)?; - } - - // Setup progress bar - let progress_bar = if args.flag_quiet { - None - } else { - let pb = ProgressBar::new_spinner(); - pb.set_style( - ProgressStyle::default_spinner() - .template("{spinner:.green} [{elapsed_precise}] Processing record {pos}") - .unwrap(), - ); - Some(pb) - }; - - Ok(CsvSetup { - reader: rdr, - writer: csv_wtr, - headers, - selected_stats, - progress_bar, - }) -} - -pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = util::get_args(USAGE, argv)?; - - // Get stats records (we still need these for the fences/thresholds) - let schema_args = util::SchemaArgs { - flag_enum_threshold: 0, - flag_ignore_case: false, - flag_strict_dates: false, - flag_pattern_columns: crate::select::SelectColumns::parse("").unwrap(), - flag_dates_whitelist: String::new(), - flag_prefer_dmy: false, - flag_force: args.flag_force, - flag_stdout: false, - flag_jobs: None, - flag_no_headers: false, - flag_delimiter: args.flag_delimiter, - arg_input: args.arg_input.clone(), - flag_memcheck: false, - }; - let (_csv_fields, csv_stats) = get_stats_records(&schema_args, StatsMode::Outliers)?; - eprintln!("csv_stats: {:#?}", csv_stats); - - if args.cmd_remove { - remove_outliers(&args, &csv_stats) - } else { - identify_outliers(&args, &csv_stats) - } -} - -// New function to handle the remove subcommand -fn remove_outliers(args: &Args, csv_stats: &[StatsData]) -> CliResult<()> { - let mut setup = setup_csv(args, csv_stats, false)?; - let method = FenceType::from_str(args.flag_method.as_deref().unwrap_or("outer")); - let mut record = ByteRecord::new(); - let mut record_count = 0u64; - let mut removed_count = 0u64; - - while setup.reader.read_byte_record(&mut record)? { - record_count += 1; - if let Some(pb) = &setup.progress_bar { - pb.set_position(record_count); - } - - let mut is_outlier = false; - - // Check each selected column for outliers - for (col_idx, stat) in setup.selected_stats.iter().enumerate() { - let field = record.get(col_idx).unwrap_or_default(); - - match stat.r#type.as_str() { - "Integer" | "Float" => { - if let Some(val) = str::from_utf8(field) - .ok() - .and_then(|s| s.parse::().ok()) - { - is_outlier |= is_numeric_outlier(val, stat, &method); - } - }, - "String" => { - if let Ok(val) = str::from_utf8(field) { - is_outlier |= is_string_outlier(val, stat); - } - }, - _ => {}, - } - - if is_outlier { - break; // No need to check other columns if we found an outlier - } - } - - // Write record only if it's not an outlier - if is_outlier { - removed_count += 1; - } else { - setup.writer.write_record(&record)?; - } - } - - if let Some(pb) = &setup.progress_bar { - pb.finish_with_message(format!( - "Processed {record_count} records, removed {removed_count} outliers" - )); - } - - setup.writer.flush()?; - Ok(()) -} - -// New helper function for checking numeric outliers without writing -fn is_numeric_outlier(value: f64, stat: &StatsData, method: &FenceType) -> bool { - if let (Some(lower_inner), Some(upper_inner), Some(lower_outer), Some(upper_outer)) = ( - stat.lower_inner_fence, - stat.upper_inner_fence, - stat.lower_outer_fence, - stat.upper_outer_fence, - ) { - let (is_inner, is_outer) = ( - is_outlier(value, lower_inner, upper_inner), - is_outlier(value, lower_outer, upper_outer), - ); - - match method { - FenceType::Inner => is_inner, - FenceType::Outer => is_outer, - FenceType::Both => is_inner || is_outer, - } - } else { - false - } -} - -// Helper function for checking string outliers -fn is_string_outlier(value: &str, stat: &StatsData) -> bool { - // Check string length outliers - if let (Some(mean_len), Some(stddev_len)) = (stat.avg_length, stat.stddev_length) { - #[allow(clippy::cast_precision_loss)] - let len = value.len() as f64; - let z_score = (len - mean_len) / stddev_len; - if z_score.abs() > 3.0 { - return true; - } - } - - // Check rare categories with cached antimodes - if let Some(ref antimode) = stat.antimode { - if !antimode.starts_with("*ALL") { - let cached_antimodes = get_cached_antimodes(antimode); - if cached_antimodes.contains(value) { - return true; - } - } - } - - false -} - -fn identify_outliers(args: &Args, csv_stats: &[StatsData]) -> CliResult<()> { - let mut setup = setup_csv(args, csv_stats, true)?; - let method = FenceType::from_str(args.flag_method.as_deref().unwrap_or("outer")); - let mut record = ByteRecord::new(); - let mut record_count = 0u64; - - while setup.reader.read_byte_record(&mut record)? { - record_count += 1; - if let Some(pb) = &setup.progress_bar { - pb.set_position(record_count); - } - - // Process each selected column - for stat in &setup.selected_stats { - let col_idx = setup - .headers - .iter() - .position(|h| h == stat.field.as_bytes()) - .unwrap_or(0); - let field = record.get(col_idx).unwrap_or_default(); - - match stat.r#type.as_str() { - "Integer" | "Float" => { - if let Some(val) = str::from_utf8(field) - .ok() - .and_then(|s| s.parse::().ok()) - { - check_numeric_outlier(val, stat, &method, record_count, &mut setup.writer)?; - } - }, - "String" => { - if let Ok(val) = str::from_utf8(field) { - check_string_outlier(val, stat, record_count, &mut setup.writer)?; - } - }, - _ => {}, - } - } - } - - if let Some(pb) = &setup.progress_bar { - pb.finish_with_message(format!("Processed {record_count} records")); - } - - setup.writer.flush()?; - Ok(()) -} - -// Helper function to check numeric outliers -fn check_numeric_outlier( - value: f64, - stat: &StatsData, - method: &FenceType, - record_no: u64, - csv_wtr: &mut csv::Writer>, -) -> CliResult<()> { - if let (Some(lower_inner), Some(upper_inner), Some(lower_outer), Some(upper_outer)) = ( - stat.lower_inner_fence, - stat.upper_inner_fence, - stat.lower_outer_fence, - stat.upper_outer_fence, - ) { - let (is_inner, is_outer) = ( - is_outlier(value, lower_inner, upper_inner), - is_outlier(value, lower_outer, upper_outer), - ); - - match (method, is_inner, is_outer) { - (FenceType::Inner | FenceType::Both, true, _) | (FenceType::Outer, _, true) => { - let (fence_type, lower, upper) = if is_outer { - (FenceType::Outer, lower_outer, upper_outer) - } else { - (FenceType::Inner, lower_inner, upper_inner) - }; - - csv_wtr.write_record([ - &stat.field, - &stat.r#type, - &value.to_string(), - &record_no.to_string(), - &format!("{fence_type:?}"), - &format!( - "Outside {} fences ({:.2}, {:.2})", - if is_outer { "outer" } else { "inner" }, - lower, - upper - ), - &lower.to_string(), - &upper.to_string(), - ])?; - }, - _ => {}, - } - } - Ok(()) -} - -// Helper function to check string outliers -fn check_string_outlier( - value: &str, - stat: &StatsData, - record_no: u64, - csv_wtr: &mut csv::Writer>, -) -> CliResult<()> { - // Check string length outliers - if let (Some(mean_len), Some(stddev_len)) = (stat.avg_length, stat.stddev_length) { - eprintln!( - "mean_len: {mean_len}, stddev_len: {stddev_len} value_len: {}", - value.len() - ); - #[allow(clippy::cast_precision_loss)] - let len = value.len() as f64; - let z_score = (len - mean_len) / stddev_len; - - if z_score.abs() > 3.0 { - csv_wtr.write_record([ - &stat.field, - &stat.r#type, - value, - &record_no.to_string(), - "Both", - &format!("Unusual length: {len} (z-score: {z_score:.2})"), - "", - "", - ])?; - } - } - - // Check rare categories - if let Some(ref antimode) = stat.antimode { - if !antimode.starts_with("*ALL") { - // let antimodes: Vec<&str> = antimode.split(',').collect(); - if antimode.split('|').any(|x| x == value) { - csv_wtr.write_record([ - &stat.field, - &stat.r#type, - value, - &record_no.to_string(), - "Both", - "Rare category (antimode)", - "", - "", - ])?; - } - } - } - Ok(()) -} diff --git a/tests/test_outliers.rs b/tests/test_outliers.rs deleted file mode 100644 index 4f113d2a5..000000000 --- a/tests/test_outliers.rs +++ /dev/null @@ -1,161 +0,0 @@ -use crate::workdir::Workdir; - -#[test] -fn test_outliers_basic() { - let wrk = Workdir::new("outliers"); - wrk.create( - "data.csv", - vec![ - svec!["number", "value"], - svec!["1", "10"], - svec!["2", "12"], - svec!["3", "15"], - svec!["4", "100"], // Outlier - svec!["5", "13"], - svec!["6", "11"], - svec!["7", "14"], - ], - ); - - let mut cmd = wrk.command("outliers"); - cmd.arg("data.csv"); - - wrk.assert_success(&mut cmd); - - let got = wrk.output_stderr(&mut cmd); - assert!(got.contains("Found 1 outlier")); - assert!(got.contains("value: 100")); -} - -#[test] -fn test_outliers_multiple_columns() { - let wrk = Workdir::new("outliers_multiple"); - wrk.create( - "data.csv", - vec![ - svec!["temp", "pressure", "humidity"], - svec!["20", "1013", "45"], - svec!["22", "1014", "48"], - svec!["21", "1012", "46"], - svec!["50", "900", "99"], // All outliers - svec!["23", "1015", "47"], - ], - ); - - let mut cmd = wrk.command("outliers"); - cmd.arg("-s").arg("temp,pressure,humidity").arg("data.csv"); - - wrk.assert_success(&mut cmd); - - let got = wrk.output_stderr(&mut cmd); - assert!(got.contains("temp: Found 1 outlier")); - assert!(got.contains("pressure: Found 1 outlier")); - assert!(got.contains("humidity: Found 1 outlier")); -} - -#[test] -fn test_outliers_inner_fence() { - let wrk = Workdir::new("outliers_inner"); - wrk.create( - "data.csv", - vec![ - svec!["value"], - svec!["10"], - svec!["12"], - svec!["15"], - svec!["30"], // Outlier with inner fence - svec!["13"], - svec!["11"], - svec!["14"], - ], - ); - - let mut cmd = wrk.command("outliers"); - cmd.arg("-m").arg("inner").arg("data.csv"); - - wrk.assert_success(&mut cmd); - - let got = wrk.output_stderr(&mut cmd); - assert!(got.contains("Found 1 outlier")); - assert!(got.contains("value: 30")); -} - -#[test] -fn test_outliers_quiet_mode() { - let wrk = Workdir::new("outliers_quiet"); - wrk.create( - "data.csv", - vec![ - svec!["value"], - svec!["10"], - svec!["12"], - svec!["15"], - svec!["100"], // Outlier - svec!["13"], - ], - ); - - let mut cmd = wrk.command("outliers"); - cmd.arg("-q").arg("data.csv"); - - wrk.assert_success(&mut cmd); - - let got = wrk.output_stderr(&mut cmd); - assert!(got.contains("Found 1 outlier")); - assert!(!got.contains("value: 100")); // Detailed output should be suppressed -} - -#[test] -fn test_outliers_string_column() { - let wrk = Workdir::new("outliers_string"); - wrk.create( - "data.csv", - vec![ - svec!["text"], - svec!["normal"], - svec!["typical"], - svec!["regular"], - svec!["very very very very long text"], // Length outlier - svec!["usual"], - ], - ); - - let mut cmd = wrk.command("outliers"); - cmd.arg("data.csv"); - - wrk.assert_success(&mut cmd); - - let got = wrk.output_stderr(&mut cmd); - assert!(got.contains("Found 1 outlier")); - assert!(got.contains("very very very very long text")); -} - -#[test] -fn test_outliers_both_fences() { - let wrk = Workdir::new("outliers_both"); - wrk.create( - "data.csv", - vec![ - svec!["value"], - svec!["10"], - svec!["12"], - svec!["15"], - svec!["30"], // Inner fence outlier - svec!["100"], // Outer fence outlier - svec!["13"], - svec!["11"], - svec!["14"], - ], - ); - - let mut cmd = wrk.command("outliers"); - cmd.arg("-m").arg("both").arg("data.csv"); - - wrk.assert_success(&mut cmd); - - let got = wrk.output_stderr(&mut cmd); - assert!(got.contains("Inner fence outliers:")); - assert!(got.contains("Outer fence outliers:")); - assert!(got.contains("value: 30")); - assert!(got.contains("value: 100")); -} From 3323163418da841055e131aafd2c7e2439b5957e Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 2 Jan 2025 01:37:22 -0500 Subject: [PATCH 11/12] chore: minor changes for upcoming `outliers` command --- src/cmd/stats.rs | 4 ++-- src/select.rs | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 38a57744c..f52030526 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -513,7 +513,7 @@ const FINGERPRINT_HASH_COLUMNS: usize = 25; const MAX_ANTIMODES: usize = 10; // default length of antimode string before truncating and appending "..." const DEFAULT_ANTIMODES_LEN: usize = 100; -const DEFAULT_MODES_SEPARATOR: &str = "|"; +pub const DEFAULT_MODES_SEPARATOR: &str = "|"; // we do this so this is evaluated at compile-time pub const fn get_stats_data_types() -> [JsonTypes; MAX_STAT_COLUMNS] { @@ -1613,7 +1613,7 @@ impl Stats { // get the modes separator let modes_separator = ANTIMODES_SEPARATOR.get_or_init(|| { std::env::var("QSV_MODES_SEPARATOR") - .unwrap_or(DEFAULT_MODES_SEPARATOR.to_string()) + .unwrap_or_else(|_| DEFAULT_MODES_SEPARATOR.to_string()) }); // mode/s diff --git a/src/select.rs b/src/select.rs index b9ee64275..b961b21ef 100644 --- a/src/select.rs +++ b/src/select.rs @@ -64,10 +64,9 @@ impl SelectColumns { Ok(Selection(map)) } - // commented out because it's unused - // pub fn is_empty(&self) -> bool { - // self.selectors.is_empty() - // } + pub fn is_empty(&self) -> bool { + self.selectors.is_empty() + } } impl fmt::Debug for SelectColumns { From 8f6c31a389d578b7f9c30bd4881b8848f37a3259 Mon Sep 17 00:00:00 2001 From: Joel Natividad <1980690+jqnatividad@users.noreply.github.com> Date: Thu, 2 Jan 2025 01:40:36 -0500 Subject: [PATCH 12/12] chore: deferring `outliers` till v3 --- src/cmd/mod.rs | 1 - src/main.rs | 2 -- tests/tests.rs | 1 - 3 files changed, 4 deletions(-) diff --git a/src/cmd/mod.rs b/src/cmd/mod.rs index 8bc8917b2..6b1ea16ab 100644 --- a/src/cmd/mod.rs +++ b/src/cmd/mod.rs @@ -59,7 +59,6 @@ pub mod jsonl; pub mod lens; #[cfg(feature = "luau")] pub mod luau; -pub mod outliers; #[cfg(any(feature = "feature_capable", feature = "lite"))] pub mod partition; #[cfg(all( diff --git a/src/main.rs b/src/main.rs index 9ca008952..3d86d4f53 100644 --- a/src/main.rs +++ b/src/main.rs @@ -380,7 +380,6 @@ enum Command { Lens, #[cfg(all(feature = "luau", feature = "feature_capable"))] Luau, - Outliers, Partition, #[cfg(all(feature = "polars", feature = "feature_capable"))] PivotP, @@ -481,7 +480,6 @@ impl Command { Command::Lens => cmd::lens::run(argv), #[cfg(all(feature = "luau", feature = "feature_capable"))] Command::Luau => cmd::luau::run(argv), - Command::Outliers => cmd::outliers::run(argv), Command::Partition => cmd::partition::run(argv), #[cfg(all(feature = "polars", feature = "feature_capable"))] Command::PivotP => cmd::pivotp::run(argv), diff --git a/tests/tests.rs b/tests/tests.rs index 24740667c..522bfc3f0 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -91,7 +91,6 @@ mod test_json; mod test_jsonl; #[cfg(feature = "luau")] mod test_luau; -mod test_outliers; #[cfg(any(feature = "feature_capable", feature = "lite"))] mod test_partition; #[cfg(feature = "polars")]