-
Notifications
You must be signed in to change notification settings - Fork 73
/
input.rs
276 lines (242 loc) · 10.5 KB
/
input.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
// Help/usage text for the `input` command.
// `run` passes this string together with argv to `util::get_args`, which
// deserializes the parsed options into `Args`.
// NOTE(review): this looks like a docopt-style usage spec, in which case the
// option names here must stay in sync with the `flag_*`/`arg_*` fields of
// `Args` - confirm against `util::get_args`.
static USAGE: &str = r#"
Read CSV data with special commenting, quoting, trimming, line-skipping &
non UTF-8 encoding rules and transforms it to a "normalized", UTF-8 encoded CSV.
Generally, all qsv commands support basic options like specifying the delimiter
used in CSV data. However, this does not cover all possible types of CSV data. For
example, some CSV files don't use '"' for quotes or use different escaping styles.
Also, CSVs with preamble lines can have them skipped with the --skip-lines & --auto-skip
options. Similarly, --skip-lastlines allows epilogue lines to be skipped.
Finally, non UTF-8 encoded files are "lossy" saved to UTF-8 by default, replacing all
invalid UTF-8 sequences with �. Note though that this is not true transcoding.
If you need to properly transcode non UTF-8 files, you'll need to use a tool like `iconv`
before processing it with qsv - e.g. to convert an ISO-8859-1 encoded file to UTF-8:
`iconv -f ISO-8859-1 -t UTF-8 input.csv -o utf8_output.csv`.
You can change this behavior with the --encoding-errors option.
See https://github.com/jqnatividad/qsv#utf-8-encoding for more details.
This command is typically used at the beginning of a data pipeline (thus the name `input`)
to normalize & prepare CSVs for further processing with other qsv commands.
For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_input.rs.
Usage:
qsv input [options] [<input>]
qsv input --help
input options:
--quote <arg> The quote character to use. [default: "]
--escape <arg> The escape character to use. When not specified,
quotes are escaped by doubling them.
--no-quoting Disable quoting completely.
Otherwise, input uses csv::QuoteStyle::NonNumeric,
which puts quotes around all fields that are non-numeric.
Namely, when writing a field that doesn't parse as a valid
float or integer, quotes will be used.
This makes CSV files more portable.
--skip-lines <arg> The number of preamble lines to skip.
--auto-skip Sniffs a CSV for preamble lines and automatically
skips them. Takes precedence over --skip-lines option.
Does not work with <stdin>.
--skip-lastlines <arg> The number of epilogue lines to skip.
--trim-headers Trim leading & trailing whitespace & quotes from header values.
--trim-fields Trim leading & trailing whitespace from field values.
--comment <char> The comment character to use. When set, lines
starting with this character will be skipped.
--encoding-errors <arg> How to handle UTF-8 encoding errors.
Possible values: replace, skip, strict.
replace: Replace invalid UTF-8 sequences with �.
skip: Fields with encoding errors are "<SKIPPED>".
strict: Fail on any encoding errors.
[default: replace]
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
"#;
use std::{env, str::FromStr};
use log::{debug, info, warn};
use serde::Deserialize;
use strum_macros::EnumString;
use crate::{
config::{Config, Delimiter},
util, CliResult,
};
/// Policy applied to a field whose bytes are not valid UTF-8, chosen with the
/// `--encoding-errors` option.
///
/// `run` parses the option value with `EncodingHandling::from_str`; the
/// `ascii_case_insensitive` strum attribute makes that parse ignore ASCII case.
#[derive(EnumString, Clone, Copy)]
#[strum(ascii_case_insensitive)]
#[allow(non_camel_case_types)]
enum EncodingHandling {
/// Write the field through `String::from_utf8_lossy`, i.e. with invalid
/// sequences replaced by U+FFFD.
Replace,
/// Write the literal `<SKIPPED>` in place of the offending field.
Skip,
/// Abort the command with an encoding error at the first invalid field.
Strict,
}
/// Command-line arguments of the `input` command, deserialized by
/// `util::get_args(USAGE, argv)` in `run`.
#[derive(Deserialize)]
struct Args {
// `<input>`: optional input path, handed to `Config::new` for the reader.
arg_input: Option<String>,
// `--output`: optional output path, handed to `Config::new` for the writer.
flag_output: Option<String>,
// `--delimiter`: field delimiter used when reading.
flag_delimiter: Option<Delimiter>,
// `--quote`: quote character; USAGE declares `"` as its default, hence not an Option.
flag_quote: Delimiter,
// `--escape`: escape character; when given, `run` also disables double-quote escaping.
flag_escape: Option<Delimiter>,
// `--no-quoting`: disable quoting; otherwise `run` selects QuoteStyle::NonNumeric.
flag_no_quoting: bool,
// `--skip-lines`: number of preamble lines to skip.
flag_skip_lines: Option<u64>,
// `--skip-lastlines`: number of epilogue lines to skip.
flag_skip_lastlines: Option<u64>,
// `--auto-skip`: detect preamble lines automatically; checked before --skip-lines in `run`.
flag_auto_skip: bool,
// `--trim-headers`: trim whitespace and quotes from header values.
flag_trim_headers: bool,
// `--trim-fields`: trim whitespace from field values.
flag_trim_fields: bool,
// `--comment`: comment character; `run` only uses it when the QSV_COMMENT_CHAR
// environment variable is not set.
flag_comment: Option<char>,
// `--encoding-errors`: raw option value, parsed into `EncodingHandling` by `run`.
flag_encoding_errors: String,
}
/// Runs the `input` command: reads a CSV using the quoting, escaping, comment,
/// trimming and line-skipping rules given on the command line and writes it
/// back out as UTF-8 encoded CSV.
///
/// Processing order:
/// 1. parse the arguments and build the reader/writer configurations;
/// 2. if `--skip-lastlines` is given, count the rows first so the number of
///    rows that may be written is known up front;
/// 3. skip the preamble rows (`--auto-skip` or `--skip-lines`);
/// 4. when headers are to be trimmed, trim and write the first record;
/// 5. copy the remaining records, applying the `--encoding-errors` policy to
///    every field that is not valid UTF-8.
///
/// # Errors
///
/// Returns an error when
/// * `--encoding-errors` is not one of replace/skip/strict,
/// * `--comment` is a character that does not fit in a single byte,
/// * `--skip-lastlines` is larger than the number of rows in the input,
/// * a record cannot be read or written,
/// * the policy is `strict` and a field is not valid UTF-8.
pub fn run(argv: &[&str]) -> CliResult<()> {
    let args: Args = util::get_args(USAGE, argv)?;

    let trim_setting = match (args.flag_trim_headers, args.flag_trim_fields) {
        (false, false) => csv::Trim::None,
        (true, true) => csv::Trim::All,
        (true, false) => csv::Trim::Headers,
        (false, true) => csv::Trim::Fields,
    };

    let Ok(encode_handler) = EncodingHandling::from_str(&args.flag_encoding_errors) else {
        return fail_incorrectusage_clierror!(
            "Invalid --encoding-errors option: {}. Valid values: replace, skip, strict.",
            args.flag_encoding_errors
        );
    };

    // The QSV_COMMENT_CHAR environment variable wins over --comment.
    // An empty variable has no first byte, so it is treated like an unset one
    // instead of panicking.
    let comment_char: Option<u8> = match env::var("QSV_COMMENT_CHAR") {
        Ok(cmt_char) if !cmt_char.is_empty() => cmt_char.as_bytes().first().copied(),
        _ => match args.flag_comment {
            // The reader matches on a single byte; a code point above U+00FF would be
            // silently truncated to an unrelated byte, so reject it explicitly.
            Some(c) if u32::from(c) > u32::from(u8::MAX) => {
                return fail_incorrectusage_clierror!(
                    "Invalid --comment character: {c}. It must fit in a single byte."
                );
            },
            Some(c) => Some(c as u8),
            None => None,
        },
    };

    // QSV_SNIFF_PREAMBLE is only set while the reader Config is constructed
    // (presumably Config::new consults it to fill in `preamble_rows`).
    if args.flag_auto_skip {
        std::env::set_var("QSV_SNIFF_PREAMBLE", "1");
    }

    let mut rconfig = Config::new(&args.arg_input)
        .delimiter(args.flag_delimiter)
        .no_headers(true)
        .quote(args.flag_quote.as_byte())
        .comment(comment_char)
        .trim(trim_setting);

    if args.flag_auto_skip {
        std::env::remove_var("QSV_SNIFF_PREAMBLE");
    }

    let wconfig = Config::new(&args.flag_output);

    if let Some(escape) = args.flag_escape {
        rconfig = rconfig.escape(Some(escape.as_byte())).double_quote(false);
    }
    if args.flag_no_quoting {
        rconfig = rconfig.quoting(false);
    } else {
        rconfig = rconfig.quote_style(csv::QuoteStyle::NonNumeric);
    }
    // preamble/epilogue lines may have a different number of fields than the data rows
    if args.flag_auto_skip || args.flag_skip_lines.is_some() || args.flag_skip_lastlines.is_some() {
        rconfig = rconfig.flexible(true);
    }

    // Maximum number of records that may still be consumed from the input before
    // the epilogue starts. `None` means "no limit"; `Some(0)` really means zero,
    // so skipping every row of the input writes nothing.
    let mut rows_limit: Option<u64> = None;
    if let Some(skip_llines) = args.flag_skip_lastlines {
        // use the regular count_rows to get the row_count
        // as Polars doesn't support skipping last lines
        let row_count = util::count_rows_regular(&rconfig)?;
        if skip_llines > row_count {
            return fail_incorrectusage_clierror!(
                "--skip-lastlines: {skip_llines} is greater than row_count: {row_count}."
            );
        }
        info!("Set to skip last {skip_llines} lines...");
        rows_limit = Some(row_count - skip_llines);
    }

    let mut rdr = rconfig.reader()?;
    let mut wtr = wconfig.writer()?;
    let mut row = csv::ByteRecord::new();
    let mut str_row = csv::StringRecord::new();

    // --auto-skip takes precedence over --skip-lines
    let preamble_rows: u64 = if args.flag_auto_skip {
        info!("auto-skip on...");
        rconfig.preamble_rows
    } else {
        args.flag_skip_lines.unwrap_or(0)
    };

    if preamble_rows > 0 {
        info!("skipping {preamble_rows} preamble rows...");
        for _ in 0..preamble_rows {
            rdr.read_byte_record(&mut row)?;
        }
        // the skipped preamble rows are part of the counted rows
        rows_limit = rows_limit.map(|limit| limit.saturating_sub(preamble_rows));
    }

    // the first rdr record is the header, since we have no_headers = true.
    // If trim_setting is equal to Headers or All, we "manually" trim the first record
    if matches!(trim_setting, csv::Trim::Headers | csv::Trim::All) && rows_limit != Some(0) {
        info!("trimming headers...");
        rdr.read_byte_record(&mut row)?;
        row.trim();
        for field in &row {
            // we also trim excess quotes from the header, to be consistent with safenames
            str_row.push_field(String::from_utf8_lossy(field).trim_matches('"'));
        }
        wtr.write_record(&str_row)?;
        // The header is one of the counted rows, exactly as it is when it is written
        // by the main loop below (assumes count_rows_regular counts it as a record).
        rows_limit = rows_limit.map(|limit| limit.saturating_sub(1));
    }

    // 1-based number of the record handled by the main loop; used in messages only
    let mut idx = 1_u64;
    let mut not_utf8 = false;
    let debug_log = log::log_enabled!(log::Level::Debug);

    'main: loop {
        // stop before touching the first epilogue row
        if let Some(limit) = rows_limit {
            if idx > limit {
                break 'main;
            }
        }

        match rdr.read_byte_record(&mut row) {
            Ok(true) => {},
            Ok(false) => break 'main,
            Err(e) => {
                return fail_clierror!("Invalid CSV. Last valid row ({idx}): {e}");
            },
        }

        str_row.clear();
        for field in &row {
            if let Ok(utf8_field) = simdutf8::basic::from_utf8(field) {
                str_row.push_field(utf8_field);
                continue;
            }

            match encode_handler {
                EncodingHandling::Replace => {
                    let lossy_field = String::from_utf8_lossy(field);
                    str_row.push_field(&lossy_field);
                    if debug_log {
                        debug!("REPLACE: Invalid UTF8 - row {idx} in \"{lossy_field}\".");
                    }
                    not_utf8 = true;
                },
                EncodingHandling::Skip => {
                    str_row.push_field("<SKIPPED>");
                    if debug_log {
                        // the lossy conversion is only needed for the log message
                        let lossy_field = String::from_utf8_lossy(field);
                        debug!("SKIPPED: Invalid UTF8 - row {idx} in \"{lossy_field}\".");
                    }
                    not_utf8 = true;
                },
                EncodingHandling::Strict => {
                    let lossy_field = String::from_utf8_lossy(field);
                    return fail_encoding_clierror!(
                        "STRICT. Invalid UTF8 - row {idx} in \"{lossy_field}\"."
                    );
                },
            }
        }
        wtr.write_record(&str_row)?;
        idx += 1;
    }

    if not_utf8 {
        match encode_handler {
            EncodingHandling::Replace => warn!(
                "Some rows contained invalid UTF-8 sequences. These sequences were replaced with \
                 the U+FFFD (�) replacement character."
            ),
            EncodingHandling::Skip => warn!(
                "Some fields contained invalid UTF-8 sequences. These fields set to \"<SKIPPED>\"."
            ),
            // STRICT is unreachable because we return early if we encounter invalid UTF-8
            EncodingHandling::Strict => unreachable!(),
        }
    }

    info!("Wrote {} rows...", idx - 1);
    Ok(wtr.flush()?)
}