Skip to content

Commit

Permalink
Merge pull request #2390 from dathere/outlier-stats-revamp
Browse files Browse the repository at this point in the history
`stats`: add string length stats to set the stage for the upcoming "smart" `outliers` command, which will quickly identify outliers using stats/frequency info
  • Loading branch information
jqnatividad authored Jan 2, 2025
2 parents d2ab1a9 + 8f6c31a commit 0e99fdf
Show file tree
Hide file tree
Showing 19 changed files with 464 additions and 643 deletions.
2 changes: 1 addition & 1 deletion docs/ENVIRONMENT_VARIABLES.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
| `QSV_SNIFF_DELIMITER` | if set, the delimiter is automatically detected. Overrides `QSV_DEFAULT_DELIMITER` & `--delimiter` option. Note that this does not work with stdin. |
| `QSV_NO_HEADERS` | if set, the first row will **NOT** be interpreted as headers. Supersedes `QSV_TOGGLE_HEADERS`. |
| `QSV_TOGGLE_HEADERS` | if set to `1`, toggles header setting - i.e. inverts qsv header behavior, with no headers being the default, & setting `--no-headers` will actually mean headers will not be ignored. |
| `QSV_ANTIMODES_LEN` | set to the maximum number of characters when listing "antimodes" in `stats`. Otherwise, the default is 100 (max: 5192). |
| `QSV_ANTIMODES_LEN` | the maximum number of characters to use when listing "antimodes" in `stats`. If not set, the default is 100. Set to 0 to disable length limiting. |
| `QSV_AUTOINDEX_SIZE` | if set, specifies the minimum file size (in bytes) of a CSV file before an index is automatically created. Note that stale indices are automatically updated regardless of this setting. |
| `QSV_CACHE_DIR` | The directory to use for caching downloaded lookup_table resources using the `luau` qsv_register_lookup() helper function. |
| `QSV_CKAN_API` | The CKAN Action API endpoint to use with the `luau` qsv_register_lookup() helper function when using the "ckan://" scheme. |
Expand Down
2 changes: 1 addition & 1 deletion dotenv.template
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ QSV_NO_HEADERS = False
# QSV_TOGGLE_HEADERS = False

# the maximum number of characters to use when listing "antimodes" in `stats`. If not set, the default is 100.
# max length is 5192 characters
# set to 0 to disable length limiting
# QSV_ANTIMODES_LEN = 100

# if set, specifies the minimum file size (in bytes) of a CSV file before an
Expand Down
70 changes: 35 additions & 35 deletions resources/test/boston311-10-boolean-1or0-stats.csv
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value
case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10,
open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,,,,,,,,0,,0,10,
target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,,,,,,,,4,,0.4,6,
closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,,,,,,,,5,,0.5,6,
ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,,,,,,,,0,,0,2,
case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,,,,,,,,0,,0,2,
case_status_boolean,Boolean,,5,0,1,1,Unsorted,1,1,10,1,0.5,0.1581,0,,0.5,0.25,100,0,,0,2,
closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,,,,,,,,0,,0,6,
case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,,,,,,,,0,,0,8,
subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,,,,,,,,0,,0,5,
reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,,,,,,,,0,,0,7,
type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,,,,,,,,0,,0,8,
queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,,,,,,,,0,,0,7,
department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,,,,,,,,0,,0,5,
submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1,
closedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1,
location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,,,,,,,,0,,0,10,
fire_district,String,true,, ,9,,Unsorted,1,1,10,1,,,,,,,,0,,0,4,
pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,,,,,,,,0,,0,6,
city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,,,,,,,,0,,0,6,
police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,,,,,,,,0,,0,6,
neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,,,,,,,,0,,0,8,
neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,,,,,,,,0,,0,7,
ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,,,,,,,,0,,0,8,
precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,,,,,,,,0,,0,9,
location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,,,,,,,,1,,0.1,10,
location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,,,,,,,,1,,0.1,8,
latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9,
longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10,
source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,,,,,,,,0,,0,2,
qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,10
qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,30
qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,3887
qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,1a4c2204a401f6791b6e5efde990955e1b6c59aec5b3de300686fadb63ee457b
field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value
case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,,,,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10,
open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,0,0,0,,,,,,,,0,,0,10,
target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,9.3081,86.64,0.8165,,,,,,,,4,,0.4,6,
closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,9.4412,89.1358,0.9938,,,,,,,,5,,0.5,6,
ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,0.4,0.16,0.0645,,,,,,,,0,,0,2,
case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,1,1,0.2,,,,,,,,0,,0,2,
case_status_boolean,Boolean,,5,0,1,1,Unsorted,1,1,10,1,,,,0.5,0.1581,0,,0.5,0.25,100,0,,0,2,
closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,34.5543,1194,0.9873,,,,,,,,0,,0,6,
case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,14.1156,199.25,0.6007,,,,,,,,0,,0,8,
subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,2.6552,7.05,0.113,,,,,,,,0,,0,5,
reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,7.9019,62.44,0.4541,,,,,,,,0,,0,7,
type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,11.619,135,0.4841,,,,,,,,0,,0,8,
queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,10.1272,102.56,0.3723,,,,,,,,0,,0,7,
department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,0.4,0.16,0.1053,,,,,,,,0,,0,5,
submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1,
closedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1,
location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,10.4062,108.29,0.3368,,,,,,,,0,,0,10,
fire_district,String,true,, ,9,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,4,
pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,0.3,0.09,0.1579,,,,,,,,0,,0,6,
city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,6,
police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,0.5385,0.29,0.2564,,,,,,,,0,,0,6,
neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,3.2696,10.69,0.3593,,,,,,,,0,,0,8,
neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,0.4899,0.24,0.3499,,,,,,,,0,,0,7,
ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,1.9519,3.81,0.3683,,,,,,,,0,,0,8,
precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,0.9,0.81,0.2432,,,,,,,,0,,0,9,
location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,2.7889,7.7778,0.2324,,,,,,,,1,,0.1,10,
location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,0,0,0,,,,,,,,1,,0.1,8,
latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,,,,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9,
longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,,,,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10,
source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,0.4583,0.21,0.0292,,,,,,,,0,,0,2,
qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,10
qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,30
qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,3887
qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,71b0f8ff9ddfe2ed63633fd0f29bddaadd1613d73b622b54b3be54c6dea56b0d
70 changes: 35 additions & 35 deletions resources/test/boston311-10-boolean-tf-stats.csv
Original file line number Diff line number Diff line change
@@ -1,35 +1,35 @@
field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value
case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10,
open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,,,,,,,,0,,0,10,
target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,,,,,,,,4,,0.4,6,
closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,,,,,,,,5,,0.5,6,
ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,,,,,,,,0,,0,2,
case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,,,,,,,,0,,0,2,
case_status_boolean,Boolean,true,,False,True,,Unsorted,4,5,45,4.5,,,,,,,,0,,0,2,
closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,,,,,,,,0,,0,6,
case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,,,,,,,,0,,0,8,
subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,,,,,,,,0,,0,5,
reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,,,,,,,,0,,0,7,
type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,,,,,,,,0,,0,8,
queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,,,,,,,,0,,0,7,
department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,,,,,,,,0,,0,5,
submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1,
closedphoto,NULL,,,,,,,0,0,,,,,,,,,,10,,1,1,
location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,,,,,,,,0,,0,10,
fire_district,String,true,, ,9,,Unsorted,1,1,10,1,,,,,,,,0,,0,4,
pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,,,,,,,,0,,0,6,
city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,,,,,,,,0,,0,6,
police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,,,,,,,,0,,0,6,
neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,,,,,,,,0,,0,8,
neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,,,,,,,,0,,0,7,
ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,,,,,,,,0,,0,8,
precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,,,,,,,,0,,0,9,
location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,,,,,,,,1,,0.1,10,
location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,,,,,,,,1,,0.1,8,
latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9,
longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10,
source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,,,,,,,,0,,0,2,
qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,10
qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,30
qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,3922
qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,dd97ad46b4b34efa66aa634d6c54188eebaf44ef5aaa5dde38180c3435a9ddaa
field,type,is_ascii,sum,min,max,range,sort_order,min_length,max_length,sum_length,avg_length,stddev_length,variance_length,cv_length,mean,sem,geometric_mean,harmonic_mean,stddev,variance,cv,nullcount,max_precision,sparsity,cardinality,qsv__value
case_enquiry_id,Integer,,1010041354742,101004113298,101004155594,42296,Unsorted,12,12,120,12,,,,101004135474.2,4663.4961,101004135474.1991,101004135474.1978,14747.2697,217481962.3498,0,0,,0,10,
open_dt,String,true,,2022-01-01 00:16:00,2022-01-31 11:46:00,,Unsorted,19,19,190,19,0,0,0,,,,,,,,0,,0,10,
target_dt,String,true,,2022-01-11 08:30:00,2022-05-20 13:03:21,,Unsorted,0,19,114,11.4,9.3081,86.64,0.8165,,,,,,,,4,,0.4,6,
closed_dt,String,true,,2022-01-09 06:43:06,2022-01-20 08:45:12,,Unsorted,0,19,95,9.5,9.4412,89.1358,0.9938,,,,,,,,5,,0.5,6,
ontime,String,true,,ONTIME,OVERDUE,,Unsorted,6,7,62,6.2,0.4,0.16,0.0645,,,,,,,,0,,0,2,
case_status,String,true,,Closed,Open,,Unsorted,4,6,50,5,1,1,0.2,,,,,,,,0,,0,2,
case_status_boolean,Boolean,true,,False,True,,Unsorted,4,5,45,4.5,0.5,0.25,0.1111,,,,,,,,0,,0,2,
closure_reason,String,true,, ,Case Closed. Closed date : Wed Jan 19 11:42:16 EST 2022 Resolved Removed df ,,Unsorted,1,82,350,35,34.5543,1194,0.9873,,,,,,,,0,,0,6,
case_title,String,true,,BTDT: Complaint,Sidewalk Cover / Manhole,,Unsorted,13,57,235,23.5,14.1156,199.25,0.6007,,,,,,,,0,,0,8,
subject,String,true,,Boston Police Department,Public Works Department,,Unsorted,21,31,235,23.5,2.6552,7.05,0.113,,,,,,,,0,,0,5,
reason,String,true,,Administrative & General Requests,Street Cleaning,,Unsorted,7,33,174,17.4,7.9019,62.44,0.4541,,,,,,,,0,,0,7,
type,String,true,,CE Collection,Unsatisfactory Utilities - Electrical Plumbing,,Unsorted,13,47,240,24,11.619,135,0.4841,,,,,,,,0,,0,8,
queue,String,true,,BTDT_Parking Enforcement,PWDx_Snow Cases,,Unsorted,15,46,272,27.2,10.1272,102.56,0.3723,,,,,,,,0,,0,7,
department,String,true,,BTDT,PWDx,,Unsorted,3,4,38,3.8,0.4,0.16,0.1053,,,,,,,,0,,0,5,
submittedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1,
closedphoto,NULL,,,,,,,0,0,,,,,,,,,,,,,10,,1,1,
location,String,true,, ,850 South St Roslindale MA 02131,,Unsorted,1,40,309,30.9,10.4062,108.29,0.3368,,,,,,,,0,,0,10,
fire_district,String,true,, ,9,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,4,
pwd_district,String,true,, ,1C,,Unsorted,1,2,19,1.9,0.3,0.09,0.1579,,,,,,,,0,,0,6,
city_council_district,String,true,, ,8,,Unsorted,1,1,10,1,0,0,0,,,,,,,,0,,0,6,
police_district,String,true,, ,E5,,Unsorted,1,3,21,2.1,0.5385,0.29,0.2564,,,,,,,,0,,0,6,
neighborhood,String,true,, ,South End,,Unsorted,1,13,91,9.1,3.2696,10.69,0.3593,,,,,,,,0,,0,8,
neighborhood_services_district,String,true,, ,6,,Unsorted,1,2,14,1.4,0.4899,0.24,0.3499,,,,,,,,0,,0,7,
ward,String,true,, ,Ward 9,,Unsorted,1,7,53,5.3,1.9519,3.81,0.3683,,,,,,,,0,,0,8,
precinct,String,true,, ,2004,,Unsorted,1,4,37,3.7,0.9,0.81,0.2432,,,,,,,,0,,0,9,
location_street_name,String,true,,12 Derne St,850 South St,,Unsorted,0,20,120,12,2.7889,7.7778,0.2324,,,,,,,,1,,0.1,10,
location_zipcode,String,true,,02113,02131,,Unsorted,0,5,45,4.5,0,0,0,,,,,,,,1,,0.1,8,
latitude,Float,,423.4656,42.2884,42.3735,0.0851,Unsorted,7,7,70,7,,,,42.3466,0.008,42.3466,42.3465,0.0252,0.0006,0.0595,0,4,0,9,
longitude,Float,,-710.782,-71.133,-71.0566,0.0764,Unsorted,6,8,77,7.7,,,,-71.0782,0.0078,,,0.0246,0.0006,-0.0346,0,4,0,10,
source,String,true,,City Worker App,Constituent Call,,Unsorted,15,16,157,15.7,0.4583,0.21,0.0292,,,,,,,,0,,0,2,
qsv__rowcount,,,,,,,,,,,,,,,,,,,,,,,,,,10
qsv__columncount,,,,,,,,,,,,,,,,,,,,,,,,,,30
qsv__filesize_bytes,,,,,,,,,,,,,,,,,,,,,,,,,,3922
qsv__fingerprint_hash,,,,,,,,,,,,,,,,,,,,,,,,,,874abe7cd02691b113acc7122097731ef6011f9e8e96dfd63ebbddc6724d19ef
Loading

0 comments on commit 0e99fdf

Please sign in to comment.