-
Notifications
You must be signed in to change notification settings - Fork 74
/
select.rs
198 lines (158 loc) · 6.73 KB
/
select.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
// Help/usage text for the `select` command.
// `run` passes this string, together with the raw argv, to `util::get_args` to build `Args`,
// so the option and argument names below must stay in sync with the fields of `Args`.
// NOTE(review): the text is consumed at runtime - presumably by a docopt-style parser, confirm in
// `util::get_args` - so edits here can change how the command line is parsed.
static USAGE: &str = r#"
Select columns from CSV data efficiently.
This command lets you manipulate the columns in CSV data. You can re-order,
duplicate, reverse or drop them. Columns can be referenced by index or by
name if there is a header row (duplicate column names can be disambiguated with
more indexing). Column ranges can also be specified. Finally, columns can be
selected using regular expressions.
Select the first and fourth columns:
$ qsv select 1,4
Select the first 4 columns (by index and by name):
$ qsv select 1-4
$ qsv select Header1-Header4
Ignore the first 2 columns (by range and by omission):
$ qsv select 3-
$ qsv select '!1-2'
Select the third column named 'Foo':
$ qsv select 'Foo[2]'
Select the first and last columns, _ is a special character for the last column:
$ qsv select 1,_
Reverse the order of columns:
$ qsv select _-1
Sort the columns lexicographically (i.e. by their byte values)
$ qsv select 1- --sort
Select some columns and then sort them:
$ qsv select 1,4,5-7 --sort
Randomly shuffle the columns:
$ qsv select 1- --random
# with a seed
$ qsv select 1- --random --seed 42
Select some columns and then shuffle them with a seed:
$ qsv select 1,4,5-7 --random --seed 42
Select columns using a regex using '/<regex>/':
# select columns starting with 'a'
$ qsv select /^a/
# select columns with a digit
$ qsv select '/^.*\d.*$/'
# remove SSN, account_no and password columns
$ qsv select '!/SSN|account_no|password/'
Re-order and duplicate columns arbitrarily using different types of selectors:
$ qsv select 3-1,Header3-Header1,Header1,Foo[2],Header1
Quote column names that conflict with selector syntax:
$ qsv select '\"Date - Opening\",\"Date - Actual Closing\"'
For more examples, see https://github.com/dathere/qsv/blob/master/tests/test_select.rs.
Usage:
qsv select [options] [--] <selection> [<input>]
qsv select --help
select arguments:
<selection> The columns to select.
You can select columns by index, by name, by range, by regex and
any combination of these. If the first character is '!', the
selection will be inverted. If the selection contains embedded
spaces or characters that conflict with selector syntax, it must
be quoted. See examples above.
select options:
These options only apply to the `select` command, not the `--select` option in other commands.
-R, --random Randomly shuffle the columns in the selection.
--seed <number> Seed for the random number generator.
-S, --sort Sort the selected columns lexicographically,
i.e. by their byte values.
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-n, --no-headers When set, the first row will not be interpreted
as headers. (i.e., They are not searched, analyzed,
sliced, etc.)
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
"#;
use rand::{seq::SliceRandom, SeedableRng};
use serde::Deserialize;
use crate::{
config::{Config, Delimiter},
select::SelectColumns,
util, CliResult,
};
// Parsed command-line arguments for the `select` command.
// `run` obtains this struct from `util::get_args(USAGE, argv)`. The field names mirror the
// arguments and options listed in `USAGE` (presumably `arg_*` for positionals and `flag_*`
// for options - confirm the mapping in `util::get_args`), so do not rename them casually.
#[derive(Deserialize)]
struct Args {
    // `<input>`: optional input source handed to `Config::new` for the reader.
    arg_input: Option<String>,
    // `<selection>`: the column selection applied to the reader configuration.
    arg_selection: SelectColumns,
    // `-R, --random`: shuffle the selected columns. Rejected together with `flag_sort`.
    flag_random: bool,
    // `--seed <number>`: seeds the RNG used for `--random`; when absent the RNG is seeded
    // from entropy. Only read when `flag_random` is set.
    flag_seed: Option<u64>,
    // `-S, --sort`: order the selected columns by header bytes. Rejected together with
    // `flag_random`.
    flag_sort: bool,
    // `-o, --output <file>`: optional output target handed to `Config::new` for the writer.
    flag_output: Option<String>,
    // `-n, --no-headers`: forwarded to the reader configuration; when set, `run` does not
    // write a header record.
    flag_no_headers: bool,
    // `-d, --delimiter <arg>`: optional field delimiter forwarded to the reader configuration.
    flag_delimiter: Option<Delimiter>,
}
/// Entry point for the `select` command.
///
/// Parses `argv` against `USAGE`, resolves the requested column selection against the first
/// record of the input, optionally re-orders that selection (`--random` or `--sort`), and
/// writes the selected columns of every record to the output.
///
/// # Errors
///
/// Fails when both `--random` and `--sort` are given, and propagates any error raised while
/// parsing arguments, opening the reader or writer, resolving the selection, or reading and
/// writing records.
pub fn run(argv: &[&str]) -> CliResult<()> {
    let args: Args = util::get_args(USAGE, argv)?;

    if args.flag_sort && args.flag_random {
        return fail_clierror!("Cannot use both --random and --sort options.");
    }

    let rconfig = Config::new(args.arg_input.as_ref())
        .delimiter(args.flag_delimiter)
        .no_headers(args.flag_no_headers)
        .select(args.arg_selection);

    let mut rdr = rconfig.reader()?;
    let mut wtr = Config::new(args.flag_output.as_ref()).writer()?;

    let headers = rdr.byte_headers()?.clone();
    let initial_selection = rconfig.selection(&headers)?;

    // When a re-ordering flag is set, compute the new order as 0-based column indices.
    // Working on indices (rather than header names) keeps columns with duplicate names,
    // embedded quotes or non-UTF-8 bytes distinct and addressable.
    let reordered: Option<Vec<usize>> = if args.flag_random {
        // Use seed if it is provided when initializing the random number generator.
        let mut rng = if let Some(seed) = args.flag_seed {
            // we add the DevSkim ignore comment here because we don't need to worry about
            // cryptographic security in this context.
            rand::rngs::StdRng::seed_from_u64(seed) // DevSkim: ignore DS148264
        } else {
            rand::rngs::StdRng::from_entropy()
        };

        let mut indices: Vec<usize> = initial_selection.iter().copied().collect();
        indices.shuffle(&mut rng);
        Some(indices)
    } else if args.flag_sort {
        let mut indices: Vec<usize> = initial_selection.iter().copied().collect();
        // Sort lexicographically by the header's byte values. The sort is stable so that
        // columns sharing a name keep their order from the initial selection.
        indices.sort_by(|&a, &b| headers[a].cmp(&headers[b]));
        Some(indices)
    } else {
        None
    };

    let sel = match reordered {
        Some(indices) => {
            // Convert the re-ordered indices into a comma-separated, 1-indexed selector
            // string and resolve it like any user-supplied selection.
            let reordered_selection_string = indices
                .iter()
                .map(|&i| (i + 1).to_string())
                .collect::<Vec<String>>()
                .join(",");
            let reordered_selection = SelectColumns::parse(&reordered_selection_string)?;

            // `select` takes the config by value, so work on a clone to keep `rconfig`
            // available for the header check below.
            rconfig
                .clone()
                .select(reordered_selection)
                .selection(&headers)?
        },
        None => initial_selection,
    };

    if !rconfig.no_headers {
        wtr.write_record(sel.iter().map(|&i| &headers[i]))?;
    }

    // Reuse a single record buffer for the whole input.
    let mut record = csv::ByteRecord::new();
    while rdr.read_byte_record(&mut record)? {
        wtr.write_record(sel.iter().map(|&i| &record[i]))?;
    }
    wtr.flush()?;
    Ok(())
}