diff --git a/docs/src/customization.md b/docs/src/customization.md index 5a787ad4fc..cbc69928f3 100644 --- a/docs/src/customization.md +++ b/docs/src/customization.md @@ -50,7 +50,7 @@ and the `--csv` part will automatically be understood. If you do want to process * You can include any command-line flags, except the "terminal" ones such as `--help`. -* The `--prepipe`, `--load`, and `--mload` flags aren't allowed in `.mlrrc` as they control code execution, and could result in your scripts running things you don't expect if you receive data from someone with a `./.mlrrc` in it. You can use `--prepipe-bz2`, `--prepipe-gunzip`, and `--prepipe-zcat` in `.mlrrc`, though. +* The `--prepipe`, `--load`, and `--mload` flags aren't allowed in `.mlrrc` as they control code execution, and could result in your scripts running things you don't expect if you receive data from someone with a `./.mlrrc` in it. You can use `--prepipe-bz2`, `--prepipe-gunzip`, `--prepipe-zcat`, and `--prepipe-zstdcat` in `.mlrrc`, though. * The formatting rule is you need to put one flag beginning with `--` per line: for example, `--csv` on one line and `--nr-progress-mod 1000` on a separate line. diff --git a/docs/src/customization.md.in b/docs/src/customization.md.in index 9a1d2894b7..00367b2f76 100644 --- a/docs/src/customization.md.in +++ b/docs/src/customization.md.in @@ -34,7 +34,7 @@ and the `--csv` part will automatically be understood. If you do want to process * You can include any command-line flags, except the "terminal" ones such as `--help`. -* The `--prepipe`, `--load`, and `--mload` flags aren't allowed in `.mlrrc` as they control code execution, and could result in your scripts running things you don't expect if you receive data from someone with a `./.mlrrc` in it. You can use `--prepipe-bz2`, `--prepipe-gunzip`, and `--prepipe-zcat` in `.mlrrc`, though. 
+* The `--prepipe`, `--load`, and `--mload` flags aren't allowed in `.mlrrc` as they control code execution, and could result in your scripts running things you don't expect if you receive data from someone with a `./.mlrrc` in it. You can use `--prepipe-bz2`, `--prepipe-gunzip`, `--prepipe-zcat`, and `--prepipe-zstdcat` in `.mlrrc`, though. * The formatting rule is you need to put one flag beginning with `--` per line: for example, `--csv` on one line and `--nr-progress-mod 1000` on a separate line. diff --git a/docs/src/data-diving-examples.md b/docs/src/data-diving-examples.md index 39738f193d..100716ec26 100644 --- a/docs/src/data-diving-examples.md +++ b/docs/src/data-diving-examples.md @@ -160,11 +160,11 @@ CITRUS COUNTY 1332.9 79974.9 483785.1 stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012
-tiv_2011_tiv_2012_corr 0.9730497632351692 -tiv_2011_tiv_2012_ols_m 0.9835583980337723 -tiv_2011_tiv_2012_ols_b 433854.6428968317 +tiv_2011_tiv_2012_corr 0.9730497632351701 +tiv_2011_tiv_2012_ols_m 0.9835583980337732 +tiv_2011_tiv_2012_ols_b 433854.6428968301 tiv_2011_tiv_2012_ols_n 36634 -tiv_2011_tiv_2012_r2 0.9468258417320189 +tiv_2011_tiv_2012_r2 0.9468258417320204
@@ -322,7 +322,7 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
u_v_corr w_x_corr -0.1334180491027861 -0.011319841199866178 +0.1334180491027861 -0.011319841199852926
@@ -332,22 +332,22 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
color shape u_v_corr w_x_corr - red circle 0.9807984401887236 -0.01856553658708754 -orange square 0.17685855992752927 -0.07104431573806054 - green circle 0.05764419437577255 0.01179572988801509 - red square 0.05574477124893523 -0.0006801456507510942 -yellow triangle 0.04457273771962798 0.024604310103081825 -yellow square 0.04379172927296089 -0.04462197201631237 -purple circle 0.03587354936895086 0.1341133954140899 - blue square 0.03241153095761164 -0.053507648119643196 - blue triangle 0.015356427073158766 -0.0006089997461435399 -orange circle 0.010518953877704048 -0.16279397329279383 - red triangle 0.00809782571528034 0.012486621357942596 -purple triangle 0.005155190909099334 -0.045057909256220656 -purple square -0.025680276963377404 0.05769429647930396 - green square -0.0257760734502851 -0.003265173252087127 -orange triangle -0.030456661186085785 -0.1318699981926352 -yellow circle -0.06477331572781474 0.07369449819706045 - blue circle -0.10234761901929677 -0.030528539069837757 - green triangle -0.10901825107358765 -0.04848782060162929 + red circle 0.9807984401887242 -0.018565536587084836 +orange square 0.17685855992752933 -0.07104431573805543 + green circle 0.05764419437577257 0.011795729888018455 + red square 0.0557447712489348 -0.0006801456507506415 +yellow triangle 0.0445727377196281 0.024604310103079844 +yellow square 0.0437917292729612 -0.044621972016306265 +purple circle 0.03587354936895115 0.13411339541407613 + blue square 0.03241153095761152 -0.05350764811965621 + blue triangle 0.015356427073158612 -0.0006089997461408209 +orange circle 0.010518953877704181 -0.1627939732927932 + red triangle 0.00809782571528054 0.01248662135795501 +purple triangle 0.005155190909099739 -0.04505790925621933 +purple square -0.02568027696337717 0.057694296479293694 + green square -0.025776073450284875 -0.0032651732520739014 +orange triangle -0.030456661186085584 -0.13186999819263814 +yellow circle -0.06477331572781515 0.0736944981970553 + blue circle -0.1023476190192966 
-0.030528539069839333 + green triangle -0.10901825107358747 -0.04848782060162855diff --git a/docs/src/example-mlr-s-script b/docs/src/example-mlr-s-script index 7b9cdb972c..50e3f5db45 100755 --- a/docs/src/example-mlr-s-script +++ b/docs/src/example-mlr-s-script @@ -1,5 +1,5 @@ #!/usr/bin/env mlr -s --c2p -filter '$quantity != 20' +filter '$quantity != 20' # Here is a comment then count-distinct -f shape then fraction -f count diff --git a/docs/src/glossary.md b/docs/src/glossary.md index bb731297b7..774975c41e 100644 --- a/docs/src/glossary.md +++ b/docs/src/glossary.md @@ -905,3 +905,8 @@ See also the [arrays page](reference-main-arrays.md), as well as the page on A [data-compression format supported by Miller](reference-main-compressed-data.md). Files compressed using ZLIB compression normally end in `.z`. + +## ZSTD / .zst + +A [data-compression format supported by Miller](reference-main-compressed-data.md). +Files compressed using ZSTD compression normally end in `.zst`. diff --git a/docs/src/glossary.md.in b/docs/src/glossary.md.in index 7e03b7d11b..b8eb8f4177 100644 --- a/docs/src/glossary.md.in +++ b/docs/src/glossary.md.in @@ -889,3 +889,8 @@ See also the [arrays page](reference-main-arrays.md), as well as the page on A [data-compression format supported by Miller](reference-main-compressed-data.md). Files compressed using ZLIB compression normally end in `.z`. + +## ZSTD / .zst + +A [data-compression format supported by Miller](reference-main-compressed-data.md). +Files compressed using ZSTD compression normally end in `.zst`.
diff --git a/docs/src/manpage.md b/docs/src/manpage.md index 08ffd80225..d801934335 100644 --- a/docs/src/manpage.md +++ b/docs/src/manpage.md @@ -194,12 +194,13 @@ MILLER(1) MILLER(1) 1mVERB LIST0m altkv bar bootstrap case cat check clean-whitespace count-distinct count count-similar cut decimate fill-down fill-empty filter flatten format-values - fraction gap grep group-by group-like having-fields head histogram json-parse - json-stringify join label latin1-to-utf8 least-frequent merge-fields - most-frequent nest nothing put regularize remove-empty-columns rename reorder - repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records - sort sort-within-records split stats1 stats2 step summary tac tail tee - template top utf8-to-latin1 unflatten uniq unspace unsparsify + fraction gap grep group-by group-like gsub having-fields head histogram + json-parse json-stringify join label latin1-to-utf8 least-frequent + merge-fields most-frequent nest nothing put regularize remove-empty-columns + rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle + skip-trivial-records sort sort-within-records split ssub stats1 stats2 step + sub summary tac tail tee template top utf8-to-latin1 unflatten uniq unspace + unsparsify 1mFUNCTION LIST0m abs acos acosh any append apply arrayify asin asinh asserting_absent @@ -262,7 +263,7 @@ MILLER(1) MILLER(1) Miller offers a few different ways to handle reading data files which have been compressed. - * Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` + * Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` `--zstdin` * Decompression done outside the Miller process: `--prepipe` `--prepipex` Using `--prepipe` and `--prepipex` you can specify an action to be @@ -285,7 +286,7 @@ MILLER(1) MILLER(1) Lastly, note that if `--prepipe` or `--prepipex` is specified, it replaces any decisions that might have been made based on the file suffix.
Likewise, - `--gzin`/`--bz2in`/`--zin` are ignored if `--prepipe` is also specified. + `--gzin`/`--bz2in`/`--zin`/`--zstdin` are ignored if `--prepipe` is also specified. --bz2in Uncompress bzip2 within the Miller process. Done by default if file ends in `.bz2`. @@ -302,6 +303,8 @@ MILLER(1) MILLER(1) `.mlrrc`. --prepipe-zcat Same as `--prepipe zcat`, except this is allowed in `.mlrrc`. + --prepipe-zstdcat Same as `--prepipe zstdcat`, except this is allowed + in `.mlrrc`. --prepipex {decompression command} Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. Useful @@ -310,6 +313,8 @@ MILLER(1) MILLER(1) in `.mlrrc` to avoid unexpected code execution. --zin Uncompress zlib within the Miller process. Done by default if file ends in `.z`. + --zstdin Uncompress zstd within the Miller process. Done by + default if file ends in `.zst`. 1mCSV/TSV-ONLY FLAGS0m These are flags which are applicable to CSV format. @@ -572,6 +577,11 @@ MILLER(1) MILLER(1) to be modified, except when input is from `tail -f`. See also https://miller.readthedocs.io/en/latest/reference-main-flag-list/. + --s-no-comment-strip {file name} + Take command-line flags from file name, like -s, but + with no comment-stripping. For more information + please see + https://miller.readthedocs.io/en/latest/scripting/. --seed {n} with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`. --tz {timezone} Specify timezone, overriding `$TZ` environment @@ -1236,6 +1246,15 @@ MILLER(1) MILLER(1) Options: -h|--help Show this message. + 1mgsub0m + Usage: mlr gsub [options] + Replaces old string with new string in specified field(s), with regex support + for the old string and handling multiple matches, like the `gsub` DSL function. + See also the `sub` and `ssub` verbs. + Options: + -f {a,b,c} Field names to convert. + -h|--help Show this message.
+ 1mhaving-fields0m Usage: mlr having-fields [options] Conditionally passes through records depending on each record's field names. @@ -1844,6 +1863,14 @@ MILLER(1) MILLER(1) See also the "tee" DSL function which lets you do more ad-hoc customization. + 1mssub0m + Usage: mlr ssub [options] + Replaces old string with new string in specified field(s), without regex support for + the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs. + Options: + -f {a,b,c} Field names to convert. + -h|--help Show this message. + 1mstats10m Usage: mlr stats1 [options] Computes univariate statistics for one or more given fields, accumulated across @@ -1981,6 +2008,15 @@ MILLER(1) MILLER(1) https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average for more information on EWMA. + 1msub0m + Usage: mlr sub [options] + Replaces old string with new string in specified field(s), with regex support + for the old string and not handling multiple matches, like the `sub` DSL function. + See also the `gsub` and `ssub` verbs. + Options: + -f {a,b,c} Field names to convert. + -h|--help Show this message. + 1msummary0m Usage: mlr summary [options] Show summary statistics about the input data. 
diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt index 58ff3991fd..0c04fc330e 100644 --- a/docs/src/manpage.txt +++ b/docs/src/manpage.txt @@ -173,12 +173,13 @@ MILLER(1) MILLER(1) 1mVERB LIST0m altkv bar bootstrap case cat check clean-whitespace count-distinct count count-similar cut decimate fill-down fill-empty filter flatten format-values - fraction gap grep group-by group-like having-fields head histogram json-parse - json-stringify join label latin1-to-utf8 least-frequent merge-fields - most-frequent nest nothing put regularize remove-empty-columns rename reorder - repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records - sort sort-within-records split stats1 stats2 step summary tac tail tee - template top utf8-to-latin1 unflatten uniq unspace unsparsify + fraction gap grep group-by group-like gsub having-fields head histogram + json-parse json-stringify join label latin1-to-utf8 least-frequent + merge-fields most-frequent nest nothing put regularize remove-empty-columns + rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle + skip-trivial-records sort sort-within-records split ssub stats1 stats2 step + sub summary tac tail tee template top utf8-to-latin1 unflatten uniq unspace + unsparsify 1mFUNCTION LIST0m abs acos acosh any append apply arrayify asin asinh asserting_absent @@ -241,7 +242,7 @@ MILLER(1) MILLER(1) Miller offers a few different ways to handle reading data files which have been compressed. - * Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` + * Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` `--zstdin` * Decompression done outside the Miller process: `--prepipe` `--prepipex` Using `--prepipe` and `--prepipex` you can specify an action to be @@ -264,7 +265,7 @@ MILLER(1) MILLER(1) Lastly, note that if `--prepipe` or `--prepipex` is specified, it replaces any decisions that might have been made based on the file suffix.
Likewise, - `--gzin`/`--bz2in`/`--zin` are ignored if `--prepipe` is also specified. + `--gzin`/`--bz2in`/`--zin`/`--zstdin` are ignored if `--prepipe` is also specified. --bz2in Uncompress bzip2 within the Miller process. Done by default if file ends in `.bz2`. @@ -281,6 +282,8 @@ MILLER(1) MILLER(1) `.mlrrc`. --prepipe-zcat Same as `--prepipe zcat`, except this is allowed in `.mlrrc`. + --prepipe-zstdcat Same as `--prepipe zstdcat`, except this is allowed + in `.mlrrc`. --prepipex {decompression command} Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. Useful @@ -289,6 +292,8 @@ MILLER(1) MILLER(1) in `.mlrrc` to avoid unexpected code execution. --zin Uncompress zlib within the Miller process. Done by default if file ends in `.z`. + --zstdin Uncompress zstd within the Miller process. Done by + default if file ends in `.zst`. 1mCSV/TSV-ONLY FLAGS0m These are flags which are applicable to CSV format. @@ -551,6 +556,11 @@ MILLER(1) MILLER(1) to be modified, except when input is from `tail -f`. See also https://miller.readthedocs.io/en/latest/reference-main-flag-list/. + --s-no-comment-strip {file name} + Take command-line flags from file name, like -s, but + with no comment-stripping. For more information + please see + https://miller.readthedocs.io/en/latest/scripting/. --seed {n} with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`. --tz {timezone} Specify timezone, overriding `$TZ` environment @@ -1215,6 +1225,15 @@ MILLER(1) MILLER(1) Options: -h|--help Show this message. + 1mgsub0m + Usage: mlr gsub [options] + Replaces old string with new string in specified field(s), with regex support + for the old string and handling multiple matches, like the `gsub` DSL function. + See also the `sub` and `ssub` verbs. + Options: + -f {a,b,c} Field names to convert. + -h|--help Show this message.
+ 1mhaving-fields0m Usage: mlr having-fields [options] Conditionally passes through records depending on each record's field names. @@ -1823,6 +1842,14 @@ MILLER(1) MILLER(1) See also the "tee" DSL function which lets you do more ad-hoc customization. + 1mssub0m + Usage: mlr ssub [options] + Replaces old string with new string in specified field(s), without regex support for + the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs. + Options: + -f {a,b,c} Field names to convert. + -h|--help Show this message. + 1mstats10m Usage: mlr stats1 [options] Computes univariate statistics for one or more given fields, accumulated across @@ -1960,6 +1987,15 @@ MILLER(1) MILLER(1) https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average for more information on EWMA. + 1msub0m + Usage: mlr sub [options] + Replaces old string with new string in specified field(s), with regex support + for the old string and not handling multiple matches, like the `sub` DSL function. + See also the `gsub` and `ssub` verbs. + Options: + -f {a,b,c} Field names to convert. + -h|--help Show this message. + 1msummary0m Usage: mlr summary [options] Show summary statistics about the input data. diff --git a/docs/src/new-in-miller-6.md b/docs/src/new-in-miller-6.md index 3170819c9d..32633b6f8e 100644 --- a/docs/src/new-in-miller-6.md +++ b/docs/src/new-in-miller-6.md @@ -143,7 +143,7 @@ the `TZ` environment variable. Please see [DSL datetime/timezone functions](refe ### In-process support for compressed input -In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z` and `.bz2` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information. +In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. 
In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z`, `.bz2`, and `.zst` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information. ### Support for reading web URLs diff --git a/docs/src/new-in-miller-6.md.in b/docs/src/new-in-miller-6.md.in index 43ea44d905..c450a96224 100644 --- a/docs/src/new-in-miller-6.md.in +++ b/docs/src/new-in-miller-6.md.in @@ -125,7 +125,7 @@ the `TZ` environment variable. Please see [DSL datetime/timezone functions](refe ### In-process support for compressed input -In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z` and `.bz2` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information. +In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z`, `.bz2`, and `.zst` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information. ### Support for reading web URLs diff --git a/docs/src/reference-main-compressed-data.md b/docs/src/reference-main-compressed-data.md index a54ed8026b..729cf5bbcc 100644 --- a/docs/src/reference-main-compressed-data.md +++ b/docs/src/reference-main-compressed-data.md @@ -16,13 +16,13 @@ Quick links: # Compressed data -As of [Miller 6](new-in-miller-6.md), Miller supports reading GZIP, BZIP2, and -ZLIB formats transparently, and in-process. 
And (as before Miller 6) you have a +As of [Miller 6](new-in-miller-6.md), Miller supports reading GZIP, BZIP2, ZLIB, and +ZSTD formats transparently, and in-process. And (as before Miller 6) you have a more general `--prepipe` option to support other decompression programs. ## Automatic detection on input -If your files end in `.gz`, `.bz2`, or `.z` then Miller will autodetect by file extension: +If your files end in `.gz`, `.bz2`, `.z`, or `.zst` then Miller will autodetect by file extension:
file gz-example.csv.gz @@ -52,7 +52,7 @@ This will decompress the input data on the fly, while leaving the disk file unmo ## Manual detection on input -If the filename doesn't in in `.gz`, `.bz2`, or `.z` then you can use the flags `--gzin`, `--bz2in`, or `--zin` to let Miller know: +If the filename doesn't end in `.gz`, `.bz2`, `.z`, or `.zst` then you can use the flags `--gzin`, `--bz2in`, `--zin`, or `--zstdin` to let Miller know:## step @@ -3574,6 +3671,55 @@ $ each 10 uptime | mlr -p step -a delta -f 11 +## sub + +mlr --csv --gzin sort -f color myfile.bin # myfile.bin has gzip contents @@ -94,7 +94,7 @@ If the command has flags, quote them: e.g. `mlr --prepipe 'zcat -cf'`. In your [.mlrrc file](customization.md), `--prepipe` and `--prepipex` are not allowed as they could be used for unexpected code execution. You can use -`--prepipe-bz2`, `--prepipe-gunzip`, and `--prepipe-zcat` in `.mlrrc`, though. +`--prepipe-bz2`, `--prepipe-gunzip`, `--prepipe-zcat`, and `--prepipe-zstdcat` in `.mlrrc`, though. Note that this feature is quite general and is not limited to decompression utilities. You can use it to apply per-file filters of your choice: e.g. `mlr @@ -107,7 +107,7 @@ There is a `--prepipe` and a `--prepipex`: Lastly, note that if `--prepipe` or `--prepipex` is specified on the Miller command line, it replaces any autodetect decisions that might have been made -based on the filename extension. Likewise, `--gzin`/`--bz2in`/`--zin` are ignored if +based on the filename extension. Likewise, `--gzin`/`--bz2in`/`--zin`/`--zstdin` are ignored if `--prepipe` or `--prepipex` is also specified.
## Compressed output diff --git a/docs/src/reference-main-compressed-data.md.in b/docs/src/reference-main-compressed-data.md.in index b13e5e7327..cbca6a3c34 100644 --- a/docs/src/reference-main-compressed-data.md.in +++ b/docs/src/reference-main-compressed-data.md.in @@ -1,12 +1,12 @@ # Compressed data -As of [Miller 6](new-in-miller-6.md), Miller supports reading GZIP, BZIP2, and -ZLIB formats transparently, and in-process. And (as before Miller 6) you have a +As of [Miller 6](new-in-miller-6.md), Miller supports reading GZIP, BZIP2, ZLIB, and +ZSTD formats transparently, and in-process. And (as before Miller 6) you have a more general `--prepipe` option to support other decompression programs. ## Automatic detection on input -If your files end in `.gz`, `.bz2`, or `.z` then Miller will autodetect by file extension: +If your files end in `.gz`, `.bz2`, `.z`, or `.zst` then Miller will autodetect by file extension: GENMD-CARDIFY-HIGHLIGHT-ONE file gz-example.csv.gz @@ -21,7 +21,7 @@ This will decompress the input data on the fly, while leaving the disk file unmo ## Manual detection on input -If the filename doesn't in in `.gz`, `.bz2`, or `.z` then you can use the flags `--gzin`, `--bz2in`, or `--zin` to let Miller know: +If the filename doesn't end in `.gz`, `.bz2`, `.z`, or `.zst` then you can use the flags `--gzin`, `--bz2in`, `--zin`, or `--zstdin` to let Miller know: GENMD-CARDIFY-HIGHLIGHT-ONE mlr --csv --gzin sort -f color myfile.bin # myfile.bin has gzip contents @@ -50,7 +50,7 @@ If the command has flags, quote them: e.g. `mlr --prepipe 'zcat -cf'`. In your [.mlrrc file](customization.md), `--prepipe` and `--prepipex` are not allowed as they could be used for unexpected code execution. You can use -`--prepipe-bz2`, `--prepipe-gunzip`, and `--prepipe-zcat` in `.mlrrc`, though. +`--prepipe-bz2`, `--prepipe-gunzip`, `--prepipe-zcat`, and `--prepipe-zstdcat` in `.mlrrc`, though.
Note that this feature is quite general and is not limited to decompression utilities. You can use it to apply per-file filters of your choice: e.g. `mlr @@ -63,7 +63,7 @@ There is a `--prepipe` and a `--prepipex`: Lastly, note that if `--prepipe` or `--prepipex` is specified on the Miller command line, it replaces any autodetect decisions that might have been made -based on the filename extension. Likewise, `--gzin`/`--bz2in`/`--zin` are ignored if +based on the filename extension. Likewise, `--gzin`/`--bz2in`/`--zin`/`--zstdin` are ignored if `--prepipe` or `--prepipex` is also specified. ## Compressed output diff --git a/docs/src/reference-main-flag-list.md b/docs/src/reference-main-flag-list.md index 24e2cbc388..8e2daf9d02 100644 --- a/docs/src/reference-main-flag-list.md +++ b/docs/src/reference-main-flag-list.md @@ -72,7 +72,7 @@ Notes: Miller offers a few different ways to handle reading data files which have been compressed. -* Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` +* Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` `--zstdin` * Decompression done outside the Miller process: `--prepipe` `--prepipex` Using `--prepipe` and `--prepipex` you can specify an action to be @@ -95,7 +95,7 @@ compression (or other) utilities, simply pipe the output: Lastly, note that if `--prepipe` or `--prepipex` is specified, it replaces any decisions that might have been made based on the file suffix. Likewise, -`--gzin`/`--bz2in`/`--zin` are ignored if `--prepipe` is also specified. +`--gzin`/`--bz2in`/`--zin`/`--zstdin` are ignored if `--prepipe` is also specified. **Flags:** @@ -106,8 +106,10 @@ decisions that might have been made based on the file suffix. Likewise, * `--prepipe-bz2`: Same as `--prepipe bz2`, except this is allowed in `.mlrrc`. * `--prepipe-gunzip`: Same as `--prepipe gunzip`, except this is allowed in `.mlrrc`. * `--prepipe-zcat`: Same as `--prepipe zcat`, except this is allowed in `.mlrrc`.
+* `--prepipe-zstdcat`: Same as `--prepipe zstdcat`, except this is allowed in `.mlrrc`. * `--prepipex {decompression command}`: Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. Useful for some commands like `unzip -qc` which don't read standard input. Allowed at the command line, but not in `.mlrrc` to avoid unexpected code execution. * `--zin`: Uncompress zlib within the Miller process. Done by default if file ends in `.z`. +* `--zstdin`: Uncompress zstd within the Miller process. Done by default if file ends in `.zst`. ## CSV/TSV-only flags @@ -281,6 +283,7 @@ These are flags which don't fit into any other category. * `--ofmtf {n}`: Use --ofmtf 6 as shorthand for --ofmt %.6f, etc. * `--ofmtg {n}`: Use --ofmtg 6 as shorthand for --ofmt %.6g, etc. * `--records-per-batch {n}`: This is an internal parameter for maximum number of records in a batch size. Normally this does not need to be modified, except when input is from `tail -f`. See also https://miller.readthedocs.io/en/latest/reference-main-flag-list/. +* `--s-no-comment-strip {file name}`: Take command-line flags from file name, like -s, but with no comment-stripping. For more information please see https://miller.readthedocs.io/en/latest/scripting/. * `--seed {n}`: with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`. * `--tz {timezone}`: Specify timezone, overriding `$TZ` environment variable (if any). * `-I`: Process files in-place. For each file name on the command line, output is written to a temp file in the same directory, which is then renamed over the original. Each file is processed in isolation: if the output format is CSV, CSV headers will be present in each output file, statistics are only over each file's own records; and so on.
diff --git a/docs/src/reference-verbs.md b/docs/src/reference-verbs.md index 998900ddf6..6e9fbb4780 100644 --- a/docs/src/reference-verbs.md +++ b/docs/src/reference-verbs.md @@ -1447,6 +1447,55 @@ record_count resource 150 /path/to/second/file+## gsub + ++mlr gsub -h +++Usage: mlr gsub [options] +Replaces old string with new string in specified field(s), with regex support +for the old string and handling multiple matches, like the `gsub` DSL function. +See also the `sub` and `ssub` verbs. +Options: +-f {a,b,c} Field names to convert. +-h|--help Show this message. ++ ++mlr --icsv --opprint --from example.csv cat --filename then sub -f color,shape l X +++filename color shape flag k index quantity rate +example.csv yeXlow triangXe true 1 11 43.6498 9.8870 +example.csv red square true 2 15 79.2778 0.0130 +example.csv red circXe true 3 16 13.8103 2.9010 +example.csv red square false 4 48 77.5542 7.4670 +example.csv purpXe triangXe false 5 51 81.2290 8.5910 +example.csv red square false 6 64 77.1991 9.5310 +example.csv purpXe triangXe false 7 65 80.1405 5.8240 +example.csv yeXlow circXe true 8 73 63.9785 4.2370 +example.csv yeXlow circXe true 9 87 63.5058 8.3350 +example.csv purpXe square false 10 91 72.3735 8.2430 ++ ++mlr --icsv --opprint --from example.csv cat --filename then gsub -f color,shape l X +++filename color shape flag k index quantity rate +example.csv yeXXow triangXe true 1 11 43.6498 9.8870 +example.csv red square true 2 15 79.2778 0.0130 +example.csv red circXe true 3 16 13.8103 2.9010 +example.csv red square false 4 48 77.5542 7.4670 +example.csv purpXe triangXe false 5 51 81.2290 8.5910 +example.csv red square false 6 64 77.1991 9.5310 +example.csv purpXe triangXe false 7 65 80.1405 5.8240 +example.csv yeXXow circXe true 8 73 63.9785 4.2370 +example.csv yeXXow circXe true 9 87 63.5058 8.3350 +example.csv purpXe square false 10 91 72.3735 8.2430 ++ ## having-fields@@ -3120,6 +3169,54 @@ then there will be split_yellow_triangle.csv, 
split_yellow_square.csv, etc. See also the "tee" DSL function which lets you do more ad-hoc customization.+## ssub + ++mlr ssub -h +++Usage: mlr ssub [options] +Replaces old string with new string in specified field(s), without regex support for +the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs. +Options: +-f {a,b,c} Field names to convert. +-h|--help Show this message. ++ ++mlr --icsv --opprint --from example.csv cat --filename then sub -f filename . o +++filename color shape flag k index quantity rate +oxample.csv yellow triangle true 1 11 43.6498 9.8870 +oxample.csv red square true 2 15 79.2778 0.0130 +oxample.csv red circle true 3 16 13.8103 2.9010 +oxample.csv red square false 4 48 77.5542 7.4670 +oxample.csv purple triangle false 5 51 81.2290 8.5910 +oxample.csv red square false 6 64 77.1991 9.5310 +oxample.csv purple triangle false 7 65 80.1405 5.8240 +oxample.csv yellow circle true 8 73 63.9785 4.2370 +oxample.csv yellow circle true 9 87 63.5058 8.3350 +oxample.csv purple square false 10 91 72.3735 8.2430 ++ ++mlr --icsv --opprint --from example.csv cat --filename then ssub -f filename . o +++filename color shape flag k index quantity rate +exampleocsv yellow triangle true 1 11 43.6498 9.8870 +exampleocsv red square true 2 15 79.2778 0.0130 +exampleocsv red circle true 3 16 13.8103 2.9010 +exampleocsv red square false 4 48 77.5542 7.4670 +exampleocsv purple triangle false 5 51 81.2290 8.5910 +exampleocsv red square false 6 64 77.1991 9.5310 +exampleocsv purple triangle false 7 65 80.1405 5.8240 +exampleocsv yellow circle true 8 73 63.9785 4.2370 +exampleocsv yellow circle true 9 87 63.5058 8.3350 +exampleocsv purple square false 10 91 72.3735 8.2430 ++ ## stats1@@ -3307,14 +3404,14 @@ fields, optionally categorized by one or more fields. 
data/medium-x_y_cov 0.000042574820827444476 -x_y_corr 0.0005042001844467462 -y_y_cov 0.08461122467974003 +x_y_cov 0.00004257482082749404 +x_y_corr 0.0005042001844473328 +y_y_cov 0.08461122467974005 y_y_corr 1 -x2_xy_cov 0.04188382281779374 -x2_xy_corr 0.630174342037994 -x2_y2_cov -0.00030953725962542085 -x2_y2_corr -0.0034249088761121966 +x2_xy_cov 0.041883822817793716 +x2_xy_corr 0.6301743420379936 +x2_y2_cov -0.0003095372596253918 +x2_y2_corr -0.003424908876111875@@ -3323,12 +3420,12 @@ x2_y2_corr -0.0034249088761121966 data/medium-a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2 -pan 0.01702551273681908 0.5004028922897639 2081 0.00028691820445814767 1 0 2081 1 0.8781320866715662 0.11908230147563566 2081 0.41749827377311266 -eks 0.0407804923685586 0.48140207967651016 1965 0.0016461239223448587 1 0 1965 1 0.8978728611690183 0.10734054433612333 1965 0.45563223864254526 -wye -0.03915349075204814 0.5255096523974456 1966 0.0015051268704373607 1 0 1966 1 0.8538317334220835 0.1267454301662969 1966 0.38991721818599295 -zee 0.0027812364960399147 0.5043070448033061 2047 0.000007751652858786137 1 0 2047 1 0.8524439912011013 0.12401684308018937 2047 0.39356598090006495 -hat -0.018620577041095078 0.5179005397264935 1941 0.0003520036646055585 1 0 1941 1 0.8412305086345014 0.13557328318623216 1941 0.3687944261732265 +a x_y_ols_m x_y_ols_b x_y_ols_n x_y_r2 y_y_ols_m y_y_ols_b y_y_ols_n y_y_r2 xy_y2_ols_m xy_y2_ols_b xy_y2_ols_n xy_y2_r2 +pan 0.017025512736819345 0.500402892289764 2081 0.00028691820445815624 1 -0.00000000000000002890430283104539 2081 1 0.8781320866715664 0.11908230147563569 2081 0.4174982737731127 +eks 0.04078049236855813 0.4814020796765104 1965 0.0016461239223448218 1 0.00000000000000017862676354313703 1965 1 0.897872861169018 0.1073405443361234 1965 0.4556322386425451 +wye -0.03915349075204785 0.5255096523974457 1966 0.0015051268704373377 1 0.00000000000000004464425401127647 1966 1 
0.8538317334220837 0.1267454301662969 1966 0.3899172181859931 +zee 0.0027812364960401333 0.5043070448033061 2047 0.000007751652858787357 1 0.00000000000000004819404567023685 2047 1 0.8524439912011011 0.12401684308018947 2047 0.39356598090006495 +hat -0.018620577041095272 0.5179005397264937 1941 0.00035200366460556604 1 -0.00000000000000003400445761787692 1941 1 0.8412305086345017 0.13557328318623207 1941 0.3687944261732266Here's an example simple line-fit. The `x` and `y` @@ -3414,11 +3511,11 @@ upsec_count_pca_quality 0.9999590846136102 donesec 92.33051350964094 color purple -upsec_count_pca_m -39.03009744795354 -upsec_count_pca_b 979.9883413064914 +upsec_count_pca_m -39.030097447953594 +upsec_count_pca_b 979.9883413064917 upsec_count_pca_n 21 upsec_count_pca_quality 0.9999908956206317 -donesec 25.10852919630297 +donesec 25.108529196302943
+mlr sub -h ++
+Usage: mlr sub [options] +Replaces old string with new string in specified field(s), with regex support +for the old string and not handling multiple matches, like the `sub` DSL function. +See also the `gsub` and `ssub` verbs. +Options: +-f {a,b,c} Field names to convert. +-h|--help Show this message. ++ +
+mlr --icsv --opprint --from example.csv cat --filename then sub -f color,shape l X ++
+filename color shape flag k index quantity rate +example.csv yeXlow triangXe true 1 11 43.6498 9.8870 +example.csv red square true 2 15 79.2778 0.0130 +example.csv red circXe true 3 16 13.8103 2.9010 +example.csv red square false 4 48 77.5542 7.4670 +example.csv purpXe triangXe false 5 51 81.2290 8.5910 +example.csv red square false 6 64 77.1991 9.5310 +example.csv purpXe triangXe false 7 65 80.1405 5.8240 +example.csv yeXlow circXe true 8 73 63.9785 4.2370 +example.csv yeXlow circXe true 9 87 63.5058 8.3350 +example.csv purpXe square false 10 91 72.3735 8.2430 ++ +
+mlr --icsv --opprint --from example.csv cat --filename then gsub -f color,shape l X ++
+filename color shape flag k index quantity rate +example.csv yeXXow triangXe true 1 11 43.6498 9.8870 +example.csv red square true 2 15 79.2778 0.0130 +example.csv red circXe true 3 16 13.8103 2.9010 +example.csv red square false 4 48 77.5542 7.4670 +example.csv purpXe triangXe false 5 51 81.2290 8.5910 +example.csv red square false 6 64 77.1991 9.5310 +example.csv purpXe triangXe false 7 65 80.1405 5.8240 +example.csv yeXXow circXe true 8 73 63.9785 4.2370 +example.csv yeXXow circXe true 9 87 63.5058 8.3350 +example.csv purpXe square false 10 91 72.3735 8.2430 ++ ## summary
@@ -3646,9 +3792,9 @@ distinct_count 5 5 10000 10000 10000 mode pan wye 1 0.3467901443380824 0.7268028627434533 sum 0 0 50005000 4986.019681679581 5062.057444929905 mean - - 5000.5 0.49860196816795804 0.5062057444929905 -stddev - - 2886.8956799071675 0.2902925151144007 0.290880086426933 -var - - 8334166.666666667 0.08426974433144456 0.08461122467974003 -skewness - - 0 -0.0006899591185521965 -0.017849760120133784 +stddev - - 2886.8956799071675 0.29029251511440074 0.2908800864269331 +var - - 8334166.666666667 0.08426974433144457 0.08461122467974005 +skewness - - 0 -0.0006899591185517494 -0.01784976012013298 minlen 3 3 1 15 13 maxlen 3 3 5 22 22 min eks eks 1 0.00004509679127584487 0.00008818962627266114 diff --git a/docs/src/reference-verbs.md.in b/docs/src/reference-verbs.md.in index 0ff0bd15dd..44feda3deb 100644 --- a/docs/src/reference-verbs.md.in +++ b/docs/src/reference-verbs.md.in @@ -487,6 +487,20 @@ GENMD-RUN-COMMAND mlr --opprint group-like data/het.dkvp GENMD-EOF +## gsub + +GENMD-RUN-COMMAND +mlr gsub -h +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --icsv --opprint --from example.csv cat --filename then sub -f color,shape l X +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --icsv --opprint --from example.csv cat --filename then gsub -f color,shape l X +GENMD-EOF + ## having-fields GENMD-RUN-COMMAND @@ -987,6 +1001,20 @@ GENMD-RUN-COMMAND mlr split --help GENMD-EOF +## ssub + +GENMD-RUN-COMMAND +mlr ssub -h +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --icsv --opprint --from example.csv cat --filename then sub -f filename . o +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --icsv --opprint --from example.csv cat --filename then ssub -f filename . 
o +GENMD-EOF + ## stats1 GENMD-RUN-COMMAND @@ -1095,6 +1123,20 @@ Example deriving uptime-delta from system uptime: GENMD-INCLUDE-ESCAPED(data/ping-delta-example.txt) +## sub + +GENMD-RUN-COMMAND +mlr sub -h +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --icsv --opprint --from example.csv cat --filename then sub -f color,shape l X +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --icsv --opprint --from example.csv cat --filename then gsub -f color,shape l X +GENMD-EOF + ## summary GENMD-RUN-COMMAND diff --git a/docs/src/scripting.md b/docs/src/scripting.md index 29cac3fb71..71c6b22a0c 100644 --- a/docs/src/scripting.md +++ b/docs/src/scripting.md @@ -137,7 +137,7 @@ Here instead of putting `#!/bin/bash` on the first line, we can put `mlr` direct#!/usr/bin/env mlr -s --c2p -filter '$quantity != 20' +filter '$quantity != 20' # Here is a comment then count-distinct -f shape then fraction -f count@@ -149,6 +149,7 @@ Points: * You leave off the initial `mlr` since that's present on line 1. * You don't need all the backslashing for line-continuations. * You don't need the explicit `--` or `"$@"`. +* All text from `#` to end of line is stripped out. If for any reason you need to suppress this, please use `mlr --s-no-comment-strip` in place of `mlr -s`. Then you can do diff --git a/docs/src/scripting.md.in b/docs/src/scripting.md.in index 0e4afc9ac4..3234c93984 100644 --- a/docs/src/scripting.md.in +++ b/docs/src/scripting.md.in @@ -67,6 +67,7 @@ Points: * You leave off the initial `mlr` since that's present on line 1. * You don't need all the backslashing for line-continuations. * You don't need the explicit `--` or `"$@"`. +* All text from `#` to end of line is stripped out. If for any reason you need to suppress this, please use `mlr --s-no-comment-strip` in place of `mlr -s`. 
Then you can do diff --git a/docs/src/two-pass-algorithms.md b/docs/src/two-pass-algorithms.md index 146f3a81e1..e475aebf3b 100644 --- a/docs/src/two-pass-algorithms.md +++ b/docs/src/two-pass-algorithms.md @@ -598,8 +598,8 @@ hat pan 0.4643355557376876 x_count 10000 x_sum 4986.019681679581 x_mean 0.49860196816795804 -x_var 0.08426974433144456 -x_stddev 0.2902925151144007 +x_var 0.08426974433144457 +x_stddev 0.29029251511440074
diff --git a/go.mod b/go.mod index b9e11f7eb3..2373dea148 100644 --- a/go.mod +++ b/go.mod @@ -34,6 +34,7 @@ require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/felixge/fgprof v0.9.3 // indirect github.com/google/pprof v0.0.0-20211214055906-6f57359322fd // indirect + github.com/klauspost/compress v1.16.7 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index e896c8f4e9..84593de572 100644 --- a/go.sum +++ b/go.sum @@ -15,6 +15,8 @@ github.com/johnkerl/lumin v1.0.0 h1:CV34cHZOJ92Y02RbQ0rd4gA0C06Qck9q8blOyaPoWpU= github.com/johnkerl/lumin v1.0.0/go.mod h1:eLf5AdQOaLvzZ2zVy4REr/DSeEwG+CZreHwNLICqv9E= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= +github.com/klauspost/compress v1.16.7 h1:2mk3MPGNzKyxErAw8YaohYh69+pa4sIQSC0fPGCFR9I= +github.com/klauspost/compress v1.16.7/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE= github.com/lestrrat-go/envload v0.0.0-20180220234015-a3eb8ddeffcc h1:RKf14vYWi2ttpEmkA4aQ3j4u9dStX2t4M8UM6qqNsG8= github.com/lestrrat-go/envload v0.0.0-20180220234015-a3eb8ddeffcc/go.mod h1:kopuH9ugFRkIXf3YoqHKyrJ9YfUFsckUU9S7B+XP+is= github.com/lestrrat-go/strftime v1.0.6 h1:CFGsDEt1pOpFNU+TJB0nhz9jl+K0hZSLE205AhTIGQQ= diff --git a/internal/pkg/cli/option_parse.go b/internal/pkg/cli/option_parse.go index cb01c27410..0ee362f2b3 100644 --- a/internal/pkg/cli/option_parse.go +++ b/internal/pkg/cli/option_parse.go @@ -2200,7 +2200,8 @@ func CompressedDataPrintInfo() { fmt.Print(`Miller offers a few different ways to handle reading data files which have been compressed. 
-* Decompression done within the Miller process itself: ` + "`--bz2in`" + ` ` + "`--gzin`" + ` ` + "`--zin`" + ` +* Decompression done within the Miller process itself: ` + "`--bz2in`" + ` ` + "`--gzin`" + ` ` + "`--zin`" + ` ` + "`--zstdin`" + + ` * Decompression done outside the Miller process: ` + "`--prepipe`" + ` ` + "`--prepipex`" + ` Using ` + "`--prepipe`" + ` and ` + "`--prepipex`" + ` you can specify an action to be @@ -2223,7 +2224,7 @@ compression (or other) utilities, simply pipe the output: Lastly, note that if ` + "`--prepipe`" + ` or ` + "`--prepipex`" + ` is specified, it replaces any decisions that might have been made based on the file suffix. Likewise, -` + "`--gzin`" + `/` + "`--bz2in`" + `/` + "`--zin`" + ` are ignored if ` + "`--prepipe`" + ` is also specified. +` + "`--gzin`" + `/` + "`--bz2in`" + `/` + "`--zin`" + `/` + "`--zstdin`" + ` are ignored if ` + "`--prepipe`" + ` is also specified. `) } @@ -2278,6 +2279,16 @@ var CompressedDataFlagSection = FlagSection{ }, }, + { + name: "--prepipe-zstdcat", + help: "Same as `--prepipe zstdcat`, except this is allowed in `.mlrrc`.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.Prepipe = "zstdcat" + options.ReaderOptions.PrepipeIsRaw = false + *pargi += 1 + }, + }, + { name: "--prepipe-bz2", help: "Same as `--prepipe bz2`, except this is allowed in `.mlrrc`.", @@ -2314,6 +2325,15 @@ var CompressedDataFlagSection = FlagSection{ *pargi += 1 }, }, + + { + name: "--zstdin", + help: "Uncompress zstd within the Miller process. Done by default if file ends in `.zst`.", + parser: func(args []string, argc int, pargi *int, options *TOptions) { + options.ReaderOptions.FileInputEncoding = lib.FileInputEncodingZstd + *pargi += 1 + }, + }, }, } @@ -2988,5 +3008,16 @@ has its own overhead.`, *pargi += 2 }, }, + + { + name: "--s-no-comment-strip", + arg: "{file name}", + help: `Take command-line flags from file name, like -s, but with no comment-stripping.
For more information please see ` + + lib.DOC_URL + `/en/latest/scripting/.`, + parser: func(args []string, argc int, pargi *int, options *TOptions) { + // Already handled in main(). Nothing to do here except to accept this as valid syntax. + *pargi += 2 + }, + }, }, } diff --git a/internal/pkg/climain/mlrcli_shebang.go b/internal/pkg/climain/mlrcli_shebang.go index 99811e6a80..e465ed2b76 100644 --- a/internal/pkg/climain/mlrcli_shebang.go +++ b/internal/pkg/climain/mlrcli_shebang.go @@ -3,6 +3,7 @@ package climain import ( "fmt" "io/ioutil" + "regexp" "strings" "github.com/johnkerl/miller/internal/pkg/lib" @@ -25,10 +26,16 @@ import ( // * This is how shebang lines work // * There are Miller verbs with -s flags and we don't want to disrupt their behavior. func maybeInterpolateDashS(args []string) ([]string, error) { + stripComments := true + if len(args) < 2 { return args, nil } - if args[1] != "-s" { // Normal case + if args[1] == "-s" { + stripComments = true + } else if args[1] == "--s-no-comment-strip" { + stripComments = false + } else { // Normal case return args, nil } if len(args) < 3 { @@ -59,9 +66,12 @@ func maybeInterpolateDashS(args []string) ([]string, error) { } } - // TODO: maybe support comment lines deeper within the script-file. - // Make sure they're /^[\s]+#/ since we don't want to disrupt a "#" within - // strings which are not actually comment characters. + if stripComments { + re := regexp.MustCompile(`#.*`) + for i, _ := range lines { + lines[i] = re.ReplaceAllString(lines[i], "") + } + } // Re-join lines to strings, and pass off to a shell-parser to split into // an args[]-style array. 
diff --git a/internal/pkg/lib/file_readers.go b/internal/pkg/lib/file_readers.go index fa42688efd..1511200406 100644 --- a/internal/pkg/lib/file_readers.go +++ b/internal/pkg/lib/file_readers.go @@ -25,6 +25,7 @@ import ( "compress/gzip" "compress/zlib" "fmt" + "github.com/klauspost/compress/zstd" "io" "net/http" "os" @@ -38,6 +39,7 @@ const ( FileInputEncodingBzip2 FileInputEncodingGzip FileInputEncodingZlib + FileInputEncodingZstd ) // OpenFileForRead: If prepipe is non-empty, popens "{prepipe} < {filename}" @@ -160,6 +162,8 @@ func openEncodedHandleForRead( return gzip.NewReader(handle) case FileInputEncodingZlib: return zlib.NewReader(handle) + case FileInputEncodingZstd: + return NewZstdReadCloser(handle) } InternalCodingErrorIf(encoding != FileInputEncodingDefault) @@ -173,6 +177,9 @@ func openEncodedHandleForRead( if strings.HasSuffix(filename, ".z") { return zlib.NewReader(handle) } + if strings.HasSuffix(filename, ".zst") { + return NewZstdReadCloser(handle) + } // Pass along os.Stdin or os.Open(filename) return handle, nil @@ -200,6 +207,33 @@ func (rc *BZip2ReadCloser) Close() error { return rc.originalHandle.Close() } +// ---------------------------------------------------------------- +// ZstdReadCloser adapts the *zstd.Decoder returned by zstd.NewReader to io.ReadCloser: Read decodes from the wrapped handle, and Close releases the decoder and then closes the wrapped handle.
+type ZstdReadCloser struct { + originalHandle io.ReadCloser + zstdHandle *zstd.Decoder +} + +func NewZstdReadCloser(handle io.ReadCloser) (*ZstdReadCloser, error) { + zstdHandle, err := zstd.NewReader(handle) + if err != nil { + return nil, err + } + return &ZstdReadCloser{ + originalHandle: handle, + zstdHandle: zstdHandle, + }, nil +} + +func (rc *ZstdReadCloser) Read(p []byte) (n int, err error) { + return rc.zstdHandle.Read(p) +} + +func (rc *ZstdReadCloser) Close() error { + rc.zstdHandle.Close() // release the decoder's resources first; Decoder.Close returns no error + return rc.originalHandle.Close() +} + // ---------------------------------------------------------------- // IsEOF handles the following problem: reading past end of files opened with diff --git a/internal/pkg/platform/getargs_windows.go b/internal/pkg/platform/getargs_windows.go index 536a6288e1..4349e43462 100644 --- a/internal/pkg/platform/getargs_windows.go +++ b/internal/pkg/platform/getargs_windows.go @@ -11,6 +11,7 @@ package platform import ( "fmt" "os" + "path/filepath" "strings" shellquote "github.com/kballard/go-shellquote" @@ -76,7 +77,20 @@ func GetArgs() []string { } } //printArgs(retargs, "NEW") - return retargs + + globbed := make([]string, 0) + for i, _ := range retargs { + // Expand things like *.csv + matches, err := filepath.Glob(retargs[i]) + if matches != nil && err == nil { + globbed = append(globbed, matches...)
+ } else { + globbed = append(globbed, retargs[i]) + } + } + //printArgs(globbed, "NEW") + + return globbed } // ---------------------------------------------------------------- diff --git a/internal/pkg/transformers/aaa_transformer_table.go b/internal/pkg/transformers/aaa_transformer_table.go index 60f490e0d8..ece90a8584 100644 --- a/internal/pkg/transformers/aaa_transformer_table.go +++ b/internal/pkg/transformers/aaa_transformer_table.go @@ -33,6 +33,7 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{ GrepSetup, GroupBySetup, GroupLikeSetup, + GsubSetup, HavingFieldsSetup, HeadSetup, HistogramSetup, @@ -62,9 +63,11 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{ SortSetup, SortWithinRecordsSetup, SplitSetup, + SsubSetup, Stats1Setup, Stats2Setup, StepSetup, + SubSetup, SummarySetup, TacSetup, TailSetup, diff --git a/internal/pkg/transformers/gsub.go b/internal/pkg/transformers/gsub.go new file mode 100644 index 0000000000..550aeda5af --- /dev/null +++ b/internal/pkg/transformers/gsub.go @@ -0,0 +1,157 @@ +package transformers + +import ( + "container/list" + "fmt" + "os" + "strings" + + "github.com/johnkerl/miller/internal/pkg/bifs" + "github.com/johnkerl/miller/internal/pkg/cli" + "github.com/johnkerl/miller/internal/pkg/mlrval" + "github.com/johnkerl/miller/internal/pkg/types" +) + +// ---------------------------------------------------------------- +const verbNameGsub = "gsub" + +var GsubSetup = TransformerSetup{ + Verb: verbNameGsub, + UsageFunc: transformerGsubUsage, + ParseCLIFunc: transformerGsubParseCLI, + IgnoresInput: false, +} + +func transformerGsubUsage( + o *os.File, +) { + fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameGsub) + fmt.Fprintf(o, "Replaces old string with new string in specified field(s), with regex support\n") + fmt.Fprintf(o, "for the old string and handling multiple matches, like the `gsub` DSL function.\n") + fmt.Fprintf(o, "See also the `sub` and `ssub` verbs.\n") + fmt.Fprintf(o, "Options:\n") + 
fmt.Fprintf(o, "-f {a,b,c} Field names to convert.\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") +} + +func transformerGsubParseCLI( + pargi *int, + argc int, + args []string, + _ *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + + // Skip the verb name from the current spot in the mlr command line + argi := *pargi + verb := args[argi] + argi++ + + // Parse local flags + var fieldNames []string = nil + var oldText string + var newText string + + for argi < argc /* variable increment: 1 or 2 depending on flag */ { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break // No more flag options to process + } + if args[argi] == "--" { + break // All transformers must do this so main-flags can follow verb-flags + } + argi++ + + if opt == "-h" || opt == "--help" { + transformerGsubUsage(os.Stdout) + os.Exit(0) + + } else if opt == "-f" { + fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + } else { + transformerGsubUsage(os.Stderr) + os.Exit(1) + } + } + + if fieldNames == nil { + transformerGsubUsage(os.Stderr) + os.Exit(1) + } + + // Get the old and new text from the command line + if (argc - argi) < 2 { + transformerGsubUsage(os.Stderr) + os.Exit(1) + } + oldText = args[argi] + newText = args[argi+1] + + argi += 2 + + *pargi = argi + if !doConstruct { // All transformers must do this for main command-line parsing + return nil + } + + transformer, err := NewTransformerGsub( + fieldNames, + oldText, + newText, + ) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + + return transformer +} + +// ---------------------------------------------------------------- +type TransformerGsub struct { + fieldNames []string + oldText *mlrval.Mlrval + newText *mlrval.Mlrval +} + +// ---------------------------------------------------------------- +func NewTransformerGsub( + fieldNames []string, + oldText string, + newText string, +) (*TransformerGsub, error) 
{ + tr := &TransformerGsub{ + fieldNames: fieldNames, + oldText: mlrval.FromString(oldText), + newText: mlrval.FromString(newText), + } + return tr, nil +} + +func (tr *TransformerGsub) Transform( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) + + if !inrecAndContext.EndOfStream { + inrec := inrecAndContext.Record + + for _, fieldName := range tr.fieldNames { + oldValue := inrec.Get(fieldName) + if oldValue == nil { + continue + } + + newValue := bifs.BIF_gsub(oldValue, tr.oldText, tr.newText) + + inrec.PutReference(fieldName, newValue) + } + + outputRecordsAndContexts.PushBack(inrecAndContext) + } else { + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker + } +} diff --git a/internal/pkg/transformers/ssub.go b/internal/pkg/transformers/ssub.go new file mode 100644 index 0000000000..bd8e542473 --- /dev/null +++ b/internal/pkg/transformers/ssub.go @@ -0,0 +1,156 @@ +package transformers + +import ( + "container/list" + "fmt" + "os" + "strings" + + "github.com/johnkerl/miller/internal/pkg/bifs" + "github.com/johnkerl/miller/internal/pkg/cli" + "github.com/johnkerl/miller/internal/pkg/mlrval" + "github.com/johnkerl/miller/internal/pkg/types" +) + +// ---------------------------------------------------------------- +const verbNameSsub = "ssub" + +var SsubSetup = TransformerSetup{ + Verb: verbNameSsub, + UsageFunc: transformerSsubUsage, + ParseCLIFunc: transformerSsubParseCLI, + IgnoresInput: false, +} + +func transformerSsubUsage( + o *os.File, +) { + fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameSsub) + fmt.Fprintf(o, "Replaces old string with new string in specified field(s), without regex support for\n") + fmt.Fprintf(o, "the old string, like the `ssub` DSL function. 
See also the `gsub` and `sub` verbs.\n") + fmt.Fprintf(o, "Options:\n") + fmt.Fprintf(o, "-f {a,b,c} Field names to convert.\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") +} + +func transformerSsubParseCLI( + pargi *int, + argc int, + args []string, + _ *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + + // Skip the verb name from the current spot in the mlr command line + argi := *pargi + verb := args[argi] + argi++ + + // Parse local flags + var fieldNames []string = nil + var oldText string + var newText string + + for argi < argc /* variable increment: 1 or 2 depending on flag */ { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break // No more flag options to process + } + if args[argi] == "--" { + break // All transformers must do this so main-flags can follow verb-flags + } + argi++ + + if opt == "-h" || opt == "--help" { + transformerSsubUsage(os.Stdout) + os.Exit(0) + + } else if opt == "-f" { + fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + } else { + transformerSsubUsage(os.Stderr) + os.Exit(1) + } + } + + if fieldNames == nil { + transformerSsubUsage(os.Stderr) + os.Exit(1) + } + + // Get the old and new text from the command line + if (argc - argi) < 2 { + transformerSsubUsage(os.Stderr) + os.Exit(1) + } + oldText = args[argi] + newText = args[argi+1] + + argi += 2 + + *pargi = argi + if !doConstruct { // All transformers must do this for main command-line parsing + return nil + } + + transformer, err := NewTransformerSsub( + fieldNames, + oldText, + newText, + ) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + + return transformer +} + +// ---------------------------------------------------------------- +type TransformerSsub struct { + fieldNames []string + oldText *mlrval.Mlrval + newText *mlrval.Mlrval +} + +// ---------------------------------------------------------------- +func NewTransformerSsub( + fieldNames 
[]string, + oldText string, + newText string, +) (*TransformerSsub, error) { + tr := &TransformerSsub{ + fieldNames: fieldNames, + oldText: mlrval.FromString(oldText), + newText: mlrval.FromString(newText), + } + return tr, nil +} + +func (tr *TransformerSsub) Transform( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) + + if !inrecAndContext.EndOfStream { + inrec := inrecAndContext.Record + + for _, fieldName := range tr.fieldNames { + oldValue := inrec.Get(fieldName) + if oldValue == nil { + continue + } + + newValue := bifs.BIF_ssub(oldValue, tr.oldText, tr.newText) + + inrec.PutReference(fieldName, newValue) + } + + outputRecordsAndContexts.PushBack(inrecAndContext) + } else { + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker + } +} diff --git a/internal/pkg/transformers/sub.go b/internal/pkg/transformers/sub.go new file mode 100644 index 0000000000..eee7783624 --- /dev/null +++ b/internal/pkg/transformers/sub.go @@ -0,0 +1,157 @@ +package transformers + +import ( + "container/list" + "fmt" + "os" + "strings" + + "github.com/johnkerl/miller/internal/pkg/bifs" + "github.com/johnkerl/miller/internal/pkg/cli" + "github.com/johnkerl/miller/internal/pkg/mlrval" + "github.com/johnkerl/miller/internal/pkg/types" +) + +// ---------------------------------------------------------------- +const verbNameSub = "sub" + +var SubSetup = TransformerSetup{ + Verb: verbNameSub, + UsageFunc: transformerSubUsage, + ParseCLIFunc: transformerSubParseCLI, + IgnoresInput: false, +} + +func transformerSubUsage( + o *os.File, +) { + fmt.Fprintf(o, "Usage: %s %s [options]\n", "mlr", verbNameSub) + fmt.Fprintf(o, "Replaces old string with new string in specified field(s), with regex support\n") + fmt.Fprintf(o, "for 
the old string and not handling multiple matches, like the `sub` DSL function.\n") + fmt.Fprintf(o, "See also the `gsub` and `ssub` verbs.\n") + fmt.Fprintf(o, "Options:\n") + fmt.Fprintf(o, "-f {a,b,c} Field names to convert.\n") + fmt.Fprintf(o, "-h|--help Show this message.\n") +} + +func transformerSubParseCLI( + pargi *int, + argc int, + args []string, + _ *cli.TOptions, + doConstruct bool, // false for first pass of CLI-parse, true for second pass +) IRecordTransformer { + + // Skip the verb name from the current spot in the mlr command line + argi := *pargi + verb := args[argi] + argi++ + + // Parse local flags + var fieldNames []string = nil + var oldText string + var newText string + + for argi < argc /* variable increment: 1 or 2 depending on flag */ { + opt := args[argi] + if !strings.HasPrefix(opt, "-") { + break // No more flag options to process + } + if args[argi] == "--" { + break // All transformers must do this so main-flags can follow verb-flags + } + argi++ + + if opt == "-h" || opt == "--help" { + transformerSubUsage(os.Stdout) + os.Exit(0) + + } else if opt == "-f" { + fieldNames = cli.VerbGetStringArrayArgOrDie(verb, opt, args, &argi, argc) + } else { + transformerSubUsage(os.Stderr) + os.Exit(1) + } + } + + if fieldNames == nil { + transformerSubUsage(os.Stderr) + os.Exit(1) + } + + // Get the old and new text from the command line + if (argc - argi) < 2 { + transformerSubUsage(os.Stderr) + os.Exit(1) + } + oldText = args[argi] + newText = args[argi+1] + + argi += 2 + + *pargi = argi + if !doConstruct { // All transformers must do this for main command-line parsing + return nil + } + + transformer, err := NewTransformerSub( + fieldNames, + oldText, + newText, + ) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + + return transformer +} + +// ---------------------------------------------------------------- +type TransformerSub struct { + fieldNames []string + oldText *mlrval.Mlrval + newText *mlrval.Mlrval +} + +// 
---------------------------------------------------------------- +func NewTransformerSub( + fieldNames []string, + oldText string, + newText string, +) (*TransformerSub, error) { + tr := &TransformerSub{ + fieldNames: fieldNames, + oldText: mlrval.FromString(oldText), + newText: mlrval.FromString(newText), + } + return tr, nil +} + +func (tr *TransformerSub) Transform( + inrecAndContext *types.RecordAndContext, + outputRecordsAndContexts *list.List, // list of *types.RecordAndContext + inputDownstreamDoneChannel <-chan bool, + outputDownstreamDoneChannel chan<- bool, +) { + HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel) + + if !inrecAndContext.EndOfStream { + inrec := inrecAndContext.Record + + for _, fieldName := range tr.fieldNames { + oldValue := inrec.Get(fieldName) + if oldValue == nil { + continue + } + + newValue := bifs.BIF_sub(oldValue, tr.oldText, tr.newText) + + inrec.PutReference(fieldName, newValue) + } + + outputRecordsAndContexts.PushBack(inrecAndContext) + } else { + outputRecordsAndContexts.PushBack(inrecAndContext) // emit end-of-stream marker + } +} diff --git a/man/manpage.txt b/man/manpage.txt index 58ff3991fd..0c04fc330e 100644 --- a/man/manpage.txt +++ b/man/manpage.txt @@ -173,12 +173,13 @@ MILLER(1) MILLER(1) 1mVERB LIST0m altkv bar bootstrap case cat check clean-whitespace count-distinct count count-similar cut decimate fill-down fill-empty filter flatten format-values - fraction gap grep group-by group-like having-fields head histogram json-parse - json-stringify join label latin1-to-utf8 least-frequent merge-fields - most-frequent nest nothing put regularize remove-empty-columns rename reorder - repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records - sort sort-within-records split stats1 stats2 step summary tac tail tee - template top utf8-to-latin1 unflatten uniq unspace unsparsify + fraction gap grep group-by group-like gsub having-fields head histogram + json-parse 
json-stringify join label latin1-to-utf8 least-frequent + merge-fields most-frequent nest nothing put regularize remove-empty-columns + rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle + skip-trivial-records sort sort-within-records split ssub stats1 stats2 step + sub summary tac tail tee template top utf8-to-latin1 unflatten uniq unspace + unsparsify 1mFUNCTION LIST0m abs acos acosh any append apply arrayify asin asinh asserting_absent @@ -241,7 +242,7 @@ MILLER(1) MILLER(1) Miller offers a few different ways to handle reading data files which have been compressed. - * Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` + * Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` `--zstdin` * Decompression done outside the Miller process: `--prepipe` `--prepipex` Using `--prepipe` and `--prepipex` you can specify an action to be @@ -264,7 +265,7 @@ MILLER(1) MILLER(1) Lastly, note that if `--prepipe` or `--prepipex` is specified, it replaces any decisions that might have been made based on the file suffix. Likewise, - `--gzin`/`--bz2in`/`--zin` are ignored if `--prepipe` is also specified. + `--gzin`/`--bz2in`/`--zin`/`--zstdin` are ignored if `--prepipe` is also specified. --bz2in Uncompress bzip2 within the Miller process. Done by default if file ends in `.bz2`. @@ -281,6 +282,8 @@ MILLER(1) MILLER(1) `.mlrrc`. --prepipe-zcat Same as `--prepipe zcat`, except this is allowed in `.mlrrc`. + --prepipe-zstdcat Same as `--prepipe zstdcat`, except this is allowed + in `.mlrrc`. --prepipex {decompression command} Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. Useful @@ -289,6 +292,8 @@ MILLER(1) MILLER(1) in `.mlrrc` to avoid unexpected code execution. --zin Uncompress zlib within the Miller process. Done by default if file ends in `.z`. + --zstdin Uncompress zstd within the Miller process. Done by + default if file ends in `.zst`.
1mCSV/TSV-ONLY FLAGS0m These are flags which are applicable to CSV format. @@ -551,6 +556,11 @@ MILLER(1) MILLER(1) to be modified, except when input is from `tail -f`. See also https://miller.readthedocs.io/en/latest/reference-main-flag-list/. + --s-no-comment-strip {file name} + Take command-line flags from file name, like -s, but + with no comment-stripping. For more information + please see + https://miller.readthedocs.io/en/latest/scripting/. --seed {n} with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`. --tz {timezone} Specify timezone, overriding `$TZ` environment @@ -1215,6 +1225,15 @@ MILLER(1) MILLER(1) Options: -h|--help Show this message. + 1mgsub0m + Usage: mlr gsub [options] + Replaces old string with new string in specified field(s), with regex support + for the old string and handling multiple matches, like the `gsub` DSL function. + See also the `sub` and `ssub` verbs. + Options: + -f {a,b,c} Field names to convert. + -h|--help Show this message. + 1mhaving-fields0m Usage: mlr having-fields [options] Conditionally passes through records depending on each record's field names. @@ -1823,6 +1842,14 @@ MILLER(1) MILLER(1) See also the "tee" DSL function which lets you do more ad-hoc customization. + 1mssub0m + Usage: mlr ssub [options] + Replaces old string with new string in specified field(s), without regex support for + the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs. + Options: + -f {a,b,c} Field names to convert. + -h|--help Show this message. + 1mstats10m Usage: mlr stats1 [options] Computes univariate statistics for one or more given fields, accumulated across @@ -1960,6 +1987,15 @@ MILLER(1) MILLER(1) https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average for more information on EWMA. 
+ 1msub0m + Usage: mlr sub [options] + Replaces old string with new string in specified field(s), with regex support + for the old string and not handling multiple matches, like the `sub` DSL function. + See also the `gsub` and `ssub` verbs. + Options: + -f {a,b,c} Field names to convert. + -h|--help Show this message. + 1msummary0m Usage: mlr summary [options] Show summary statistics about the input data. diff --git a/man/mlr.1 b/man/mlr.1 index 50d617ebcc..ab56c69bb3 100644 --- a/man/mlr.1 +++ b/man/mlr.1 @@ -214,12 +214,13 @@ for all things with "map" in their names. .nf altkv bar bootstrap case cat check clean-whitespace count-distinct count count-similar cut decimate fill-down fill-empty filter flatten format-values -fraction gap grep group-by group-like having-fields head histogram json-parse -json-stringify join label latin1-to-utf8 least-frequent merge-fields -most-frequent nest nothing put regularize remove-empty-columns rename reorder -repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records -sort sort-within-records split stats1 stats2 step summary tac tail tee -template top utf8-to-latin1 unflatten uniq unspace unsparsify +fraction gap grep group-by group-like gsub having-fields head histogram +json-parse json-stringify join label latin1-to-utf8 least-frequent +merge-fields most-frequent nest nothing put regularize remove-empty-columns +rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle +skip-trivial-records sort sort-within-records split ssub stats1 stats2 step +sub summary tac tail tee template top utf8-to-latin1 unflatten uniq unspace +unsparsify .fi .if n \{\ .RE @@ -304,7 +305,7 @@ Notes: Miller offers a few different ways to handle reading data files which have been compressed. 
-* Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` +* Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` `--zstdin` * Decompression done outside the Miller process: `--prepipe` `--prepipex` Using `--prepipe` and `--prepipex` you can specify an action to be @@ -327,7 +328,7 @@ compression (or other) utilities, simply pipe the output: Lastly, note that if `--prepipe` or `--prepipex` is specified, it replaces any decisions that might have been made based on the file suffix. Likewise, -`--gzin`/`--bz2in`/`--zin` are ignored if `--prepipe` is also specified. +`--gzin`/`--bz2in`/`--zin`/`--zstdin` are ignored if `--prepipe` is also specified. --bz2in Uncompress bzip2 within the Miller process. Done by default if file ends in `.bz2`. @@ -344,6 +345,8 @@ decisions that might have been made based on the file suffix. Likewise, `.mlrrc`. --prepipe-zcat Same as `--prepipe zcat`, except this is allowed in `.mlrrc`. +--prepipe-zstdcat Same as `--prepipe zstdcat`, except this is allowed + in `.mlrrc`. --prepipex {decompression command} Like `--prepipe` with one exception: doesn't insert `<` between command and filename at runtime. Useful @@ -352,6 +355,8 @@ decisions that might have been made based on the file suffix. Likewise, in `.mlrrc` to avoid unexpected code execution. --zin Uncompress zlib within the Miller process. Done by default if file ends in `.z`. +--zstdin Uncompress zstd within the Miller process. Done by + default if file ends in `.zstd`. .fi .if n \{\ .RE @@ -670,6 +675,11 @@ These are flags which don't fit into any other category. to be modified, except when input is from `tail -f`. See also https://miller.readthedocs.io/en/latest/reference-main-flag-list/. +--s-no-comment-strip {file name} + Take command-line flags from file name, like -s, but + with no comment-stripping. For more information + please see + https://miller.readthedocs.io/en/latest/scripting/. 
--seed {n} with `n` of the form `12345678` or `0xcafefeed`. For `put`/`filter` `urand`, `urandint`, and `urand32`. --tz {timezone} Specify timezone, overriding `$TZ` environment @@ -1520,6 +1530,21 @@ Options: .fi .if n \{\ .RE +.SS "gsub" +.if n \{\ +.RS 0 +.\} +.nf +Usage: mlr gsub [options] +Replaces old string with new string in specified field(s), with regex support +for the old string and handling multiple matches, like the `gsub` DSL function. +See also the `sub` and `ssub` verbs. +Options: +-f {a,b,c} Field names to convert. +-h|--help Show this message. +.fi +.if n \{\ +.RE .SS "having-fields" .if n \{\ .RS 0 @@ -2302,6 +2327,20 @@ See also the "tee" DSL function which lets you do more ad-hoc customization. .fi .if n \{\ .RE +.SS "ssub" +.if n \{\ +.RS 0 +.\} +.nf +Usage: mlr ssub [options] +Replaces old string with new string in specified field(s), without regex support for +the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs. +Options: +-f {a,b,c} Field names to convert. +-h|--help Show this message. +.fi +.if n \{\ +.RE .SS "stats1" .if n \{\ .RS 0 @@ -2457,6 +2496,21 @@ for more information on EWMA. .fi .if n \{\ .RE +.SS "sub" +.if n \{\ +.RS 0 +.\} +.nf +Usage: mlr sub [options] +Replaces old string with new string in specified field(s), with regex support +for the old string and not handling multiple matches, like the `sub` DSL function. +See also the `gsub` and `ssub` verbs. +Options: +-f {a,b,c} Field names to convert. +-h|--help Show this message. +.fi +.if n \{\ +.RE .SS "summary" .if n \{\ .RS 0 diff --git a/test/cases/cli-help/0001/expout b/test/cases/cli-help/0001/expout index d6f70fe41a..55efea8ac7 100644 --- a/test/cases/cli-help/0001/expout +++ b/test/cases/cli-help/0001/expout @@ -379,6 +379,16 @@ Outputs records in batches having identical field names. Options: -h|--help Show this message. 
+================================================================ +gsub +Usage: mlr gsub [options] +Replaces old string with new string in specified field(s), with regex support +for the old string and handling multiple matches, like the `gsub` DSL function. +See also the `sub` and `ssub` verbs. +Options: +-f {a,b,c} Field names to convert. +-h|--help Show this message. + ================================================================ having-fields Usage: mlr having-fields [options] @@ -1016,6 +1026,15 @@ then there will be split_yellow_triangle.csv, split_yellow_square.csv, etc. See also the "tee" DSL function which lets you do more ad-hoc customization. +================================================================ +ssub +Usage: mlr ssub [options] +Replaces old string with new string in specified field(s), without regex support for +the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs. +Options: +-f {a,b,c} Field names to convert. +-h|--help Show this message. + ================================================================ stats1 Usage: mlr stats1 [options] @@ -1156,6 +1175,16 @@ Please see https://miller.readthedocs.io/en/latest/reference-verbs.html#filter o https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average for more information on EWMA. +================================================================ +sub +Usage: mlr sub [options] +Replaces old string with new string in specified field(s), with regex support +for the old string and not handling multiple matches, like the `sub` DSL function. +See also the `gsub` and `ssub` verbs. +Options: +-f {a,b,c} Field names to convert. +-h|--help Show this message. 
+ ================================================================ summary Usage: mlr summary [options] diff --git a/test/cases/globbing/0001/a.csv b/test/cases/globbing/0001/a.csv new file mode 100644 index 0000000000..bfde6bfa0b --- /dev/null +++ b/test/cases/globbing/0001/a.csv @@ -0,0 +1,2 @@ +a,b,c +1,2,3 diff --git a/test/cases/globbing/0001/b.csv b/test/cases/globbing/0001/b.csv new file mode 100644 index 0000000000..a9411aa9de --- /dev/null +++ b/test/cases/globbing/0001/b.csv @@ -0,0 +1,2 @@ +a,b,c +4,5,6 diff --git a/test/cases/globbing/0001/cmd b/test/cases/globbing/0001/cmd new file mode 100644 index 0000000000..a5eecc5776 --- /dev/null +++ b/test/cases/globbing/0001/cmd @@ -0,0 +1 @@ +mlr --c2p cat ${CASEDIR}/*.csv diff --git a/test/cases/globbing/0001/experr b/test/cases/globbing/0001/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/globbing/0001/expout b/test/cases/globbing/0001/expout new file mode 100644 index 0000000000..d0c04ad137 --- /dev/null +++ b/test/cases/globbing/0001/expout @@ -0,0 +1,3 @@ +a b c +1 2 3 +4 5 6 diff --git a/test/cases/io-compressed-input/0014/cmd b/test/cases/io-compressed-input/0014/cmd new file mode 100644 index 0000000000..f6141361ef --- /dev/null +++ b/test/cases/io-compressed-input/0014/cmd @@ -0,0 +1 @@ +mlr count -g a test/input/medium.zst diff --git a/test/cases/io-compressed-input/0014/experr b/test/cases/io-compressed-input/0014/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-compressed-input/0014/expout b/test/cases/io-compressed-input/0014/expout new file mode 100644 index 0000000000..7dcf142127 --- /dev/null +++ b/test/cases/io-compressed-input/0014/expout @@ -0,0 +1,5 @@ +a=pan,count=8 +a=eks,count=10 +a=wye,count=7 +a=zee,count=8 +a=hat,count=7 diff --git a/test/cases/io-compressed-input/0015/cmd b/test/cases/io-compressed-input/0015/cmd new file mode 100644 index 0000000000..8a6e18c1e2 --- /dev/null +++ 
b/test/cases/io-compressed-input/0015/cmd @@ -0,0 +1 @@ +mlr --zstdin count -g a < test/input/medium.zst diff --git a/test/cases/io-compressed-input/0015/experr b/test/cases/io-compressed-input/0015/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-compressed-input/0015/expout b/test/cases/io-compressed-input/0015/expout new file mode 100644 index 0000000000..7dcf142127 --- /dev/null +++ b/test/cases/io-compressed-input/0015/expout @@ -0,0 +1,5 @@ +a=pan,count=8 +a=eks,count=10 +a=wye,count=7 +a=zee,count=8 +a=hat,count=7 diff --git a/test/cases/io-compressed-input/0016/cmd b/test/cases/io-compressed-input/0016/cmd new file mode 100644 index 0000000000..7d38bc22ac --- /dev/null +++ b/test/cases/io-compressed-input/0016/cmd @@ -0,0 +1 @@ +mlr --zstdin count -g a test/input/medium.zst diff --git a/test/cases/io-compressed-input/0016/experr b/test/cases/io-compressed-input/0016/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-compressed-input/0016/expout b/test/cases/io-compressed-input/0016/expout new file mode 100644 index 0000000000..7dcf142127 --- /dev/null +++ b/test/cases/io-compressed-input/0016/expout @@ -0,0 +1,5 @@ +a=pan,count=8 +a=eks,count=10 +a=wye,count=7 +a=zee,count=8 +a=hat,count=7 diff --git a/test/cases/verb-case/x b/test/cases/verb-case/x deleted file mode 100644 index a24cc18bdf..0000000000 --- a/test/cases/verb-case/x +++ /dev/null @@ -1,13 +0,0 @@ -mkdir 0020; echo mlr --from test/input.cases-csv --c2j case -u > 0020/cmd -mkdir 0021; echo mlr --from test/input.cases-csv --c2j case -l > 0021/cmd -mkdir 0022; echo mlr --from test/input.cases-csv --c2j case -s > 0022/cmd -mkdir 0023; echo mlr --from test/input.cases-csv --c2j case -t > 0023/cmd -mkdir 0024; echo mlr --from test/input.cases-csv --c2j case -k -u > 0024/cmd -mkdir 0025; echo mlr --from test/input.cases-csv --c2j case -k -l > 0025/cmd -mkdir 0026; echo mlr --from test/input.cases-csv --c2j case -k -s > 0026/cmd 
-mkdir 0027; echo mlr --from test/input.cases-csv --c2j case -k -t > 0027/cmd -mkdir 0028; echo mlr --from test/input.cases-csv --c2j case -v -u > 0028/cmd -mkdir 0029; echo mlr --from test/input.cases-csv --c2j case -v -l > 0029/cmd -mkdir 0030; echo mlr --from test/input.cases-csv --c2j case -v -s > 0030/cmd -mkdir 0031; echo mlr --from test/input.cases-csv --c2j case -v -t > 0031/cmd -mkdir 0032; echo mlr --from test/input.cases-csv --c2j case -u apple,ball then case -l cat,dog > 0032/cmd diff --git a/test/cases/verb-sub-gsub-ssub/0001/cmd b/test/cases/verb-sub-gsub-ssub/0001/cmd new file mode 100644 index 0000000000..7d4cec775c --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/0001/cmd @@ -0,0 +1 @@ +mlr --d2p --from test/input/abixy sub -f a,b e X diff --git a/test/cases/verb-sub-gsub-ssub/0001/experr b/test/cases/verb-sub-gsub-ssub/0001/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/verb-sub-gsub-ssub/0001/expout b/test/cases/verb-sub-gsub-ssub/0001/expout new file mode 100644 index 0000000000..917c3f5ed6 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/0001/expout @@ -0,0 +1,11 @@ +a b i x y +pan pan 1 0.34679014 0.72680286 +Xks pan 2 0.75867996 0.52215111 +wyX wyX 3 0.20460331 0.33831853 +Xks wyX 4 0.38139939 0.13418874 +wyX pan 5 0.57328892 0.86362447 +zXe pan 6 0.52712616 0.49322129 +Xks zXe 7 0.61178406 0.18788492 +zXe wyX 8 0.59855401 0.97618139 +hat wyX 9 0.03144188 0.74955076 +pan wyX 10 0.50262601 0.95261836 diff --git a/test/cases/verb-sub-gsub-ssub/0002/cmd b/test/cases/verb-sub-gsub-ssub/0002/cmd new file mode 100644 index 0000000000..f33200891d --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/0002/cmd @@ -0,0 +1 @@ +mlr --d2p --from test/input/abixy gsub -f a,b e X diff --git a/test/cases/verb-sub-gsub-ssub/0002/experr b/test/cases/verb-sub-gsub-ssub/0002/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/verb-sub-gsub-ssub/0002/expout b/test/cases/verb-sub-gsub-ssub/0002/expout 
new file mode 100644 index 0000000000..49d53727b3 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/0002/expout @@ -0,0 +1,11 @@ +a b i x y +pan pan 1 0.34679014 0.72680286 +Xks pan 2 0.75867996 0.52215111 +wyX wyX 3 0.20460331 0.33831853 +Xks wyX 4 0.38139939 0.13418874 +wyX pan 5 0.57328892 0.86362447 +zXX pan 6 0.52712616 0.49322129 +Xks zXX 7 0.61178406 0.18788492 +zXX wyX 8 0.59855401 0.97618139 +hat wyX 9 0.03144188 0.74955076 +pan wyX 10 0.50262601 0.95261836 diff --git a/test/cases/verb-sub-gsub-ssub/0003/cmd b/test/cases/verb-sub-gsub-ssub/0003/cmd new file mode 100644 index 0000000000..ff6b15c4ac --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/0003/cmd @@ -0,0 +1 @@ +mlr --d2p --from test/input/abixy sub -f a,b . X diff --git a/test/cases/verb-sub-gsub-ssub/0003/experr b/test/cases/verb-sub-gsub-ssub/0003/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/verb-sub-gsub-ssub/0003/expout b/test/cases/verb-sub-gsub-ssub/0003/expout new file mode 100644 index 0000000000..a8b8e86432 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/0003/expout @@ -0,0 +1,11 @@ +a b i x y +Xan Xan 1 0.34679014 0.72680286 +Xks Xan 2 0.75867996 0.52215111 +Xye Xye 3 0.20460331 0.33831853 +Xks Xye 4 0.38139939 0.13418874 +Xye Xan 5 0.57328892 0.86362447 +Xee Xan 6 0.52712616 0.49322129 +Xks Xee 7 0.61178406 0.18788492 +Xee Xye 8 0.59855401 0.97618139 +Xat Xye 9 0.03144188 0.74955076 +Xan Xye 10 0.50262601 0.95261836 diff --git a/test/cases/verb-sub-gsub-ssub/0004/cmd b/test/cases/verb-sub-gsub-ssub/0004/cmd new file mode 100644 index 0000000000..8770d578d5 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/0004/cmd @@ -0,0 +1 @@ +mlr --d2p --from test/input/abixy ssub -f a,b e X diff --git a/test/cases/verb-sub-gsub-ssub/0004/experr b/test/cases/verb-sub-gsub-ssub/0004/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/verb-sub-gsub-ssub/0004/expout b/test/cases/verb-sub-gsub-ssub/0004/expout new file mode 100644 index 
0000000000..917c3f5ed6 --- /dev/null +++ b/test/cases/verb-sub-gsub-ssub/0004/expout @@ -0,0 +1,11 @@ +a b i x y +pan pan 1 0.34679014 0.72680286 +Xks pan 2 0.75867996 0.52215111 +wyX wyX 3 0.20460331 0.33831853 +Xks wyX 4 0.38139939 0.13418874 +wyX pan 5 0.57328892 0.86362447 +zXe pan 6 0.52712616 0.49322129 +Xks zXe 7 0.61178406 0.18788492 +zXe wyX 8 0.59855401 0.97618139 +hat wyX 9 0.03144188 0.74955076 +pan wyX 10 0.50262601 0.95261836 diff --git a/test/input/medium.zst b/test/input/medium.zst new file mode 100644 index 0000000000..f7b5c9a0d7 Binary files /dev/null and b/test/input/medium.zst differ