Merge branch 'kerl/utf8-a0-ff' of github.com:johnkerl/miller into ker…

…l/utf8-a0-ff
johnkerl · Aug 20, 2023 · c48a61a · c48a61a
2 parents 099d85b + e9957f9
commit c48a61a
Show file tree

Hide file tree

Showing 59 changed files with 1,135 additions and 116 deletions.
diff --git a/docs/src/customization.md b/docs/src/customization.md
@@ -50,7 +50,7 @@ and the `--csv` part will automatically be understood. If you do want to process
 
 * You can include any command-line flags, except the "terminal" ones such as `--help`.
 
-* The `--prepipe`, `--load`, and `--mload` flags aren't allowed in `.mlrrc` as they control code execution, and could result in your scripts running things you don't expect if you receive data from someone with a `./.mlrrc` in it. You can use `--prepipe-bz2`, `--prepipe-gunzip`, and `--prepipe-zcat` in `.mlrrc`, though.
+* The `--prepipe`, `--load`, and `--mload` flags aren't allowed in `.mlrrc` as they control code execution, and could result in your scripts running things you don't expect if you receive data from someone with a `./.mlrrc` in it. You can use `--prepipe-bz2`, `--prepipe-gunzip`, `--prepipe-zcat`, and `--prepipe-zstdcat` in `.mlrrc`, though.
 
 * The formatting rule is you need to put one flag beginning with `--` per line: for example, `--csv` on one line and `--nr-progress-mod 1000` on a separate line.
 

diff --git a/docs/src/customization.md.in b/docs/src/customization.md.in
@@ -34,7 +34,7 @@ and the `--csv` part will automatically be understood. If you do want to process
 
 * You can include any command-line flags, except the "terminal" ones such as `--help`.
 
-* The `--prepipe`, `--load`, and `--mload` flags aren't allowed in `.mlrrc` as they control code execution, and could result in your scripts running things you don't expect if you receive data from someone with a `./.mlrrc` in it. You can use `--prepipe-bz2`, `--prepipe-gunzip`, and `--prepipe-zcat` in `.mlrrc`, though.
+* The `--prepipe`, `--load`, and `--mload` flags aren't allowed in `.mlrrc` as they control code execution, and could result in your scripts running things you don't expect if you receive data from someone with a `./.mlrrc` in it. You can use `--prepipe-bz2`, `--prepipe-gunzip`, `--prepipe-zcat`, and `--prepipe-zstdcat` in `.mlrrc`, though.
 
 * The formatting rule is you need to put one flag beginning with `--` per line: for example, `--csv` on one line and `--nr-progress-mod 1000` on a separate line.
 

diff --git a/docs/src/data-diving-examples.md b/docs/src/data-diving-examples.md
@@ -160,11 +160,11 @@ CITRUS COUNTY       1332.9                 79974.9                483785.1
 <b>  stats2 -a corr,linreg-ols,r2 -f tiv_2011,tiv_2012</b>
 </pre>
 <pre class="pre-non-highlight-in-pair">
-tiv_2011_tiv_2012_corr  0.9730497632351692
-tiv_2011_tiv_2012_ols_m 0.9835583980337723
-tiv_2011_tiv_2012_ols_b 433854.6428968317
+tiv_2011_tiv_2012_corr  0.9730497632351701
+tiv_2011_tiv_2012_ols_m 0.9835583980337732
+tiv_2011_tiv_2012_ols_b 433854.6428968301
 tiv_2011_tiv_2012_ols_n 36634
-tiv_2011_tiv_2012_r2    0.9468258417320189
+tiv_2011_tiv_2012_r2    0.9468258417320204
 </pre>
 
 <pre class="pre-highlight-in-pair">
@@ -322,7 +322,7 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
 </pre>
 <pre class="pre-non-highlight-in-pair">
           u_v_corr              w_x_corr
-0.1334180491027861 -0.011319841199866178
+0.1334180491027861 -0.011319841199852926
 </pre>
 
 <pre class="pre-highlight-in-pair">
@@ -332,22 +332,22 @@ Look at bivariate stats by color and shape. In particular, `u,v` pairwise correl
 </pre>
 <pre class="pre-non-highlight-in-pair">
  color    shape              u_v_corr               w_x_corr
-   red   circle    0.9807984401887236   -0.01856553658708754
-orange   square   0.17685855992752927   -0.07104431573806054
- green   circle   0.05764419437577255    0.01179572988801509
-   red   square   0.05574477124893523 -0.0006801456507510942
-yellow triangle   0.04457273771962798   0.024604310103081825
-yellow   square   0.04379172927296089   -0.04462197201631237
-purple   circle   0.03587354936895086     0.1341133954140899
-  blue   square   0.03241153095761164  -0.053507648119643196
-  blue triangle  0.015356427073158766 -0.0006089997461435399
-orange   circle  0.010518953877704048   -0.16279397329279383
-   red triangle   0.00809782571528034   0.012486621357942596
-purple triangle  0.005155190909099334  -0.045057909256220656
-purple   square -0.025680276963377404    0.05769429647930396
- green   square   -0.0257760734502851  -0.003265173252087127
-orange triangle -0.030456661186085785    -0.1318699981926352
-yellow   circle  -0.06477331572781474    0.07369449819706045
-  blue   circle  -0.10234761901929677  -0.030528539069837757
- green triangle  -0.10901825107358765   -0.04848782060162929
+   red   circle    0.9807984401887242  -0.018565536587084836
+orange   square   0.17685855992752933   -0.07104431573805543
+ green   circle   0.05764419437577257   0.011795729888018455
+   red   square    0.0557447712489348 -0.0006801456507506415
+yellow triangle    0.0445727377196281   0.024604310103079844
+yellow   square    0.0437917292729612  -0.044621972016306265
+purple   circle   0.03587354936895115    0.13411339541407613
+  blue   square   0.03241153095761152   -0.05350764811965621
+  blue triangle  0.015356427073158612 -0.0006089997461408209
+orange   circle  0.010518953877704181    -0.1627939732927932
+   red triangle   0.00809782571528054    0.01248662135795501
+purple triangle  0.005155190909099739   -0.04505790925621933
+purple   square  -0.02568027696337717   0.057694296479293694
+ green   square -0.025776073450284875 -0.0032651732520739014
+orange triangle -0.030456661186085584   -0.13186999819263814
+yellow   circle  -0.06477331572781515     0.0736944981970553
+  blue   circle   -0.1023476190192966  -0.030528539069839333
+ green triangle  -0.10901825107358747   -0.04848782060162855
 </pre>
diff --git a/docs/src/example-mlr-s-script b/docs/src/example-mlr-s-script
@@ -1,5 +1,5 @@
 #!/usr/bin/env mlr -s
 --c2p
-filter '$quantity != 20'
+filter '$quantity != 20' # Here is a comment
 then count-distinct -f shape
 then fraction -f count
diff --git a/docs/src/glossary.md b/docs/src/glossary.md
@@ -905,3 +905,8 @@ See also the [arrays page](reference-main-arrays.md), as well as the page on
 
 A [data-compression format supported by Miller](reference-main-compressed-data.md).
 Files compressed using ZLIB compression normally end in `.z`.
+
+## ZSTD / .zst
+
+A [data-compression format supported by Miller](reference-main-compressed-data.md).
+Files compressed using ZSTD compression normally end in `.zst`.
diff --git a/docs/src/glossary.md.in b/docs/src/glossary.md.in
@@ -889,3 +889,8 @@ See also the [arrays page](reference-main-arrays.md), as well as the page on
 
 A [data-compression format supported by Miller](reference-main-compressed-data.md).
 Files compressed using ZLIB compression normally end in `.z`.
+
+## ZSTD / .zst
+
+A [data-compression format supported by Miller](reference-main-compressed-data.md).
+Files compressed using ZSTD compression normally end in `.zst`.
diff --git a/docs/src/manpage.md b/docs/src/manpage.md
@@ -194,12 +194,13 @@ MILLER(1)                                                            MILLER(1)
 1mVERB LIST0m
        altkv bar bootstrap case cat check clean-whitespace count-distinct count
        count-similar cut decimate fill-down fill-empty filter flatten format-values
-       fraction gap grep group-by group-like having-fields head histogram json-parse
-       json-stringify join label latin1-to-utf8 least-frequent merge-fields
-       most-frequent nest nothing put regularize remove-empty-columns rename reorder
-       repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records
-       sort sort-within-records split stats1 stats2 step summary tac tail tee
-       template top utf8-to-latin1 unflatten uniq unspace unsparsify
+       fraction gap grep group-by group-like gsub having-fields head histogram
+       json-parse json-stringify join label latin1-to-utf8 least-frequent
+       merge-fields most-frequent nest nothing put regularize remove-empty-columns
+       rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle
+       skip-trivial-records sort sort-within-records split ssub stats1 stats2 step
+       sub summary tac tail tee template top utf8-to-latin1 unflatten uniq unspace
+       unsparsify
 
 1mFUNCTION LIST0m
        abs acos acosh any append apply arrayify asin asinh asserting_absent
@@ -262,7 +263,7 @@ MILLER(1)                                                            MILLER(1)
        Miller offers a few different ways to handle reading data files
             which have been compressed.
 
-       * Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin`
+       * Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` `--zstdin`
        * Decompression done outside the Miller process: `--prepipe` `--prepipex`
 
        Using `--prepipe` and `--prepipex` you can specify an action to be
@@ -285,7 +286,7 @@ MILLER(1)                                                            MILLER(1)
 
        Lastly, note that if `--prepipe` or `--prepipex` is specified, it replaces any
        decisions that might have been made based on the file suffix. Likewise,
-       `--gzin`/`--bz2in`/`--zin` are ignored if `--prepipe` is also specified.
+       `--gzin`/`--bz2in`/`--zin`/`--zstdin` are ignored if `--prepipe` is also specified.
 
        --bz2in                  Uncompress bzip2 within the Miller process. Done by
                                 default if file ends in `.bz2`.
@@ -302,6 +303,8 @@ MILLER(1)                                                            MILLER(1)
                                 `.mlrrc`.
        --prepipe-zcat           Same as `--prepipe zcat`, except this is allowed in
                                 `.mlrrc`.
+       --prepipe-zstdcat        Same as `--prepipe zstdcat`, except this is allowed
+                                in `.mlrrc`.
        --prepipex {decompression command}
                                 Like `--prepipe` with one exception: doesn't insert
                                 `&lt;` between command and filename at runtime. Useful
@@ -310,6 +313,8 @@ MILLER(1)                                                            MILLER(1)
                                 in `.mlrrc` to avoid unexpected code execution.
        --zin                    Uncompress zlib within the Miller process. Done by
                                 default if file ends in `.z`.
+       --zstdin                 Uncompress zstd within the Miller process. Done by
+                                default if file ends in `.zst`.
 
 1mCSV/TSV-ONLY FLAGS0m
        These are flags which are applicable to CSV format.
@@ -572,6 +577,11 @@ MILLER(1)                                                            MILLER(1)
                                 to be modified, except when input is from `tail -f`.
                                 See also
                                 https://miller.readthedocs.io/en/latest/reference-main-flag-list/.
+       --s-no-comment-strip {file name}
+                                Take command-line flags from file name, like -s, but
+                                with no comment-stripping. For more information
+                                please see
+                                https://miller.readthedocs.io/en/latest/scripting/.
        --seed {n}               with `n` of the form `12345678` or `0xcafefeed`. For
                                 `put`/`filter` `urand`, `urandint`, and `urand32`.
        --tz {timezone}          Specify timezone, overriding `$TZ` environment
@@ -1236,6 +1246,15 @@ MILLER(1)                                                            MILLER(1)
        Options:
        -h|--help Show this message.
 
+   1mgsub0m
+       Usage: mlr gsub [options]
+       Replaces old string with new string in specified field(s), with regex support
+       for the old string and handling multiple matches, like the `gsub` DSL function.
+       See also the `sub` and `ssub` verbs.
+       Options:
+       -f {a,b,c}  Field names to convert.
+       -h|--help   Show this message.
+
    1mhaving-fields0m
        Usage: mlr having-fields [options]
        Conditionally passes through records depending on each record's field names.
@@ -1844,6 +1863,14 @@ MILLER(1)                                                            MILLER(1)
 
        See also the "tee" DSL function which lets you do more ad-hoc customization.
 
+   1mssub0m
+       Usage: mlr ssub [options]
+       Replaces old string with new string in specified field(s), without regex support for
+       the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs.
+       Options:
+       -f {a,b,c}  Field names to convert.
+       -h|--help   Show this message.
+
    1mstats10m
        Usage: mlr stats1 [options]
        Computes univariate statistics for one or more given fields, accumulated across
@@ -1981,6 +2008,15 @@ MILLER(1)                                                            MILLER(1)
        https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average
        for more information on EWMA.
 
+   1msub0m
+       Usage: mlr sub [options]
+       Replaces old string with new string in specified field(s), with regex support
+       for the old string and not handling multiple matches, like the `sub` DSL function.
+       See also the `gsub` and `ssub` verbs.
+       Options:
+       -f {a,b,c}  Field names to convert.
+       -h|--help   Show this message.
+
    1msummary0m
        Usage: mlr summary [options]
        Show summary statistics about the input data.

diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt
@@ -173,12 +173,13 @@ MILLER(1)                                                            MILLER(1)
 1mVERB LIST0m
        altkv bar bootstrap case cat check clean-whitespace count-distinct count
        count-similar cut decimate fill-down fill-empty filter flatten format-values
-       fraction gap grep group-by group-like having-fields head histogram json-parse
-       json-stringify join label latin1-to-utf8 least-frequent merge-fields
-       most-frequent nest nothing put regularize remove-empty-columns rename reorder
-       repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle skip-trivial-records
-       sort sort-within-records split stats1 stats2 step summary tac tail tee
-       template top utf8-to-latin1 unflatten uniq unspace unsparsify
+       fraction gap grep group-by group-like gsub having-fields head histogram
+       json-parse json-stringify join label latin1-to-utf8 least-frequent
+       merge-fields most-frequent nest nothing put regularize remove-empty-columns
+       rename reorder repeat reshape sample sec2gmtdate sec2gmt seqgen shuffle
+       skip-trivial-records sort sort-within-records split ssub stats1 stats2 step
+       sub summary tac tail tee template top utf8-to-latin1 unflatten uniq unspace
+       unsparsify
 
 1mFUNCTION LIST0m
        abs acos acosh any append apply arrayify asin asinh asserting_absent
@@ -241,7 +242,7 @@ MILLER(1)                                                            MILLER(1)
        Miller offers a few different ways to handle reading data files
             which have been compressed.
 
-       * Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin`
+       * Decompression done within the Miller process itself: `--bz2in` `--gzin` `--zin` `--zstdin`
        * Decompression done outside the Miller process: `--prepipe` `--prepipex`
 
        Using `--prepipe` and `--prepipex` you can specify an action to be
@@ -264,7 +265,7 @@ MILLER(1)                                                            MILLER(1)
 
        Lastly, note that if `--prepipe` or `--prepipex` is specified, it replaces any
        decisions that might have been made based on the file suffix. Likewise,
-       `--gzin`/`--bz2in`/`--zin` are ignored if `--prepipe` is also specified.
+       `--gzin`/`--bz2in`/`--zin`/`--zstdin` are ignored if `--prepipe` is also specified.
 
        --bz2in                  Uncompress bzip2 within the Miller process. Done by
                                 default if file ends in `.bz2`.
@@ -281,6 +282,8 @@ MILLER(1)                                                            MILLER(1)
                                 `.mlrrc`.
        --prepipe-zcat           Same as `--prepipe zcat`, except this is allowed in
                                 `.mlrrc`.
+       --prepipe-zstdcat        Same as `--prepipe zstdcat`, except this is allowed
+                                in `.mlrrc`.
        --prepipex {decompression command}
                                 Like `--prepipe` with one exception: doesn't insert
                                 `<` between command and filename at runtime. Useful
@@ -289,6 +292,8 @@ MILLER(1)                                                            MILLER(1)
                                 in `.mlrrc` to avoid unexpected code execution.
        --zin                    Uncompress zlib within the Miller process. Done by
                                 default if file ends in `.z`.
+       --zstdin                 Uncompress zstd within the Miller process. Done by
+                                default if file ends in `.zst`.
 
 1mCSV/TSV-ONLY FLAGS0m
        These are flags which are applicable to CSV format.
@@ -551,6 +556,11 @@ MILLER(1)                                                            MILLER(1)
                                 to be modified, except when input is from `tail -f`.
                                 See also
                                 https://miller.readthedocs.io/en/latest/reference-main-flag-list/.
+       --s-no-comment-strip {file name}
+                                Take command-line flags from file name, like -s, but
+                                with no comment-stripping. For more information
+                                please see
+                                https://miller.readthedocs.io/en/latest/scripting/.
        --seed {n}               with `n` of the form `12345678` or `0xcafefeed`. For
                                 `put`/`filter` `urand`, `urandint`, and `urand32`.
        --tz {timezone}          Specify timezone, overriding `$TZ` environment
@@ -1215,6 +1225,15 @@ MILLER(1)                                                            MILLER(1)
        Options:
        -h|--help Show this message.
 
+   1mgsub0m
+       Usage: mlr gsub [options]
+       Replaces old string with new string in specified field(s), with regex support
+       for the old string and handling multiple matches, like the `gsub` DSL function.
+       See also the `sub` and `ssub` verbs.
+       Options:
+       -f {a,b,c}  Field names to convert.
+       -h|--help   Show this message.
+
    1mhaving-fields0m
        Usage: mlr having-fields [options]
        Conditionally passes through records depending on each record's field names.
@@ -1823,6 +1842,14 @@ MILLER(1)                                                            MILLER(1)
 
        See also the "tee" DSL function which lets you do more ad-hoc customization.
 
+   1mssub0m
+       Usage: mlr ssub [options]
+       Replaces old string with new string in specified field(s), without regex support for
+       the old string, like the `ssub` DSL function. See also the `gsub` and `sub` verbs.
+       Options:
+       -f {a,b,c}  Field names to convert.
+       -h|--help   Show this message.
+
    1mstats10m
        Usage: mlr stats1 [options]
        Computes univariate statistics for one or more given fields, accumulated across
@@ -1960,6 +1987,15 @@ MILLER(1)                                                            MILLER(1)
        https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average
        for more information on EWMA.
 
+   1msub0m
+       Usage: mlr sub [options]
+       Replaces old string with new string in specified field(s), with regex support
+       for the old string and not handling multiple matches, like the `sub` DSL function.
+       See also the `gsub` and `ssub` verbs.
+       Options:
+       -f {a,b,c}  Field names to convert.
+       -h|--help   Show this message.
+
    1msummary0m
        Usage: mlr summary [options]
        Show summary statistics about the input data.