Skip to content

Some small utilities for statistic analysis in shell

License

Notifications You must be signed in to change notification settings

bknotwell/shell-stats

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

7 Commits
 
 
 
 
 
 

Repository files navigation

Shell stats

Background, rationale and constraints

The R language has several functions that would be useful in various shell scripts:

summary
a function that takes numeric input and returns min, max, mean, median, 25th and 75th percentile cutoffs.
stem
a function that takes numeric input and creates a stem and leaf plot.

Q&A

Why bother?
Small shell utiities will be more easily used to display simple analyses if the user needs to install fewer dependencies.
What dependencies are needed?
for maximum portability and installation convenience, POSIX-compliant methods will be used unless the mechanism doesn’t exist.
Won’t this be slow?
probably. This is about convenience for the user as a means of avoiding installing R when they need to run a small utility.

Summary

As part of building the summary function, the following functions must be built:

min
the minimum value in the data set.
max
the maximum value in the data set.
mean
the average value in the data set.
median
the point in a data set with 50% of elements above and 50% below.
quartile
the 25% or 75% cutoff.
stem
output a stem and leaf plot

An extractable shell script (source shell-stats.sh) follows:

minmaxhelper() {
    tr ' ' '\n' | sort -nr | $1 -1
}

max() {
    minmaxhelper head
}

min() {
    minmaxhelper tail
}

mean() {
    if [ $# -eq 1 ]
    then
        dp="$1k"
    else
        dp="0k"
    fi

    # the - to _ conversion handles negative numbers
    tr ' ' '\n' | tr '-' '_' | sed -e ' 
    1i \
    0 sC 0
    s/$/ + lC 1 + sC/
    $a \
      lC '"$dp"' / p' | dc
}

quartile() {
  tr ' ' '\n' | sort -n |  awk -vquartile="$1" '
        BEGIN { i = 0; idx=0 }
        { arr[i] = $1; i++ }
        END {
            if(quartile == 1) {
                idx = (i / 4)
            }
            else if(quartile == 3) {
                idx = (i / 4) * 3
            }
            print(arr[int(idx)])
        }'
}

median() {
  tr ' ' '\n' | sort -n |  awk '
        BEGIN { i = 0; idx=0 }
        { arr[i] = $1; i++; if(i > 0 && ((i % 2) == 0))idx++; }
        END {
            if((i % 2) == 0) {
                printf("%0.1f", (arr[idx] + arr[idx - 1]) / 2)
            }
            else {
                printf("%d", arr[idx])
            }
        }'
}

summary() {
    mean_=$(tee $$.tmp | mean)
    median_=$(median < $$.tmp)
    first_=$(quartile 1 < $$.tmp)
    third_=$(quartile 3 < $$.tmp)
    min_=$(min < $$.tmp)
    max_=$(max < $$.tmp)
    rm $$.tmp
    printf "Summary data: min(%d) first(%d) mean(%0.1f) median(%0.1f) third(%d) max(%d)\n" \
           $min_ $first_ $mean_ $median_ $third_ $max_
}

stem() {
    tr ' ' '\n' | sort -n | awk -v scale=$1 -v width=$2 -v atom=$3 '
function imin2(a, b) {
   return(int(a < b ? a : b));
}

function imax2(a, b) {
   return(int(a > b ? a : b));
}

function fabs(x) {
    return(x < 0 ? -x : x);
}

function abs(x) {
    return int(fabs(x));
}

function log10(x) {
    return(log(x) / log(10));
}

function stem_print(close_, dist, ndigits) {
   if((close_ / 10 == 0) && (dist < 0))
       printf("  %*s | ", ndigits, "-0");
   else
       printf("  %*d | ", ndigits, close_/10);
}

function stem_leaf(x, scale, width, atom) {
    printf("\n");
    mu=10
    if(x[length(x) - 1] > x[0]) {
        r = atom + (x[length(x) - 1]  - x[0]) / scale
        t = log10(r);
        if(t < 0)
           t--; 
        c = 10 ^ int(1.0 - int(t));
        mm = imin2(2, imax2(0, int(r * c / 25)))

        k = int(3 * mm + 2 - int(150 / (length(x) + 50)))
        if(((k-1) * (k-2) * (k-5)) == 0)
            c = c * 10;

        if((k * (k-4) * (k-8)) == 0)
            mu = 5;

        if(((k-1) * (k-5) * (k-6)) == 0)
            mu = 20;
    } else {
        r = atom + fabs(x[0]) / scale;
        c = 10 ^ int(1.0 - int(log10(r)));
    }

    t = int(x[0] * c / mu);
    if(x[0] < 0)
        t--;
    lo = t * mu;

    t = int(x[length(x) - 1] * c / mu);
    if(x[length(x) - 1] < 0)
       t--;

    hi = t * mu;

    ldigits = (lo < 0 ? int(log10(-lo)) + 1 : 0);
    hdigits = (hi > 0 ? int(log10(hi)) : 0);

    if(ldigits < hdigits)
        ndigits = hdigits;
    else
        ndigits = ldigits;

    if(lo < 0 && int(x[0]*c) == lo)
        lo = lo - mu;
    hi = lo + mu;
    if(int(x[0]*c+0.5) > hi) {
       lo = hi;
       hi = lo + mu;
    }

    t = log10(c) + 0.5;
    if(t < 0)
       t--;
    pdigits = 1 - int(t);
    printf("  The decimal point is ");
    if(pdigits == 0)
        printf("at the |\n\n");
    else
        printf("%d digit(s) to the %s of the |\n\n", fabs(pdigits),
               pdigits > 0 ? "right" : "left");

    i = 0;
    do {
        if(lo < 0)
            stem_print(int(hi), int(lo), ndigits);
        else
            stem_print(int(lo), int(hi), ndigits);

        j = 0;
        do {
           if(x[i] < 0)
               xi = int(x[i]*c - .5);
           else
               xi = int(x[i]*c + .5)
           if((hi == 0 && x[i] >= 0) ||
              (lo < 0 && xi > hi) ||
              (lo >= 0 && xi >= hi))
               break;

           j++;

           if(j <= width - 12)
               printf("%1d", abs(xi) % 10);
 
           i++;
        } while(i < length(x));

        if(j > width)
            printf("+%d", j - width);
        printf("\n");
        if(i >= length(x))
            break;

        hi += mu;
        lo += mu;
    } while(1);
    printf("\n");
}

{
   arr[length(arr)] = $1;
}

END {
    stem_leaf(arr, scale, width, atom);
}
    '
}

if [ ! -z $TESTIT  ]
then
    dotest() {
        fn=$1
        name=$2
        res="$3"
        shift; shift; shift

        if [ "$(echo $td | $fn $*)" == "$res" ]
        then
            echo "$name passed $(echo $td | $fn $*)"
        else
            echo "$name failed $(echo $td | $fn $*)"
        fi
    }
    td="2 1 0 10 15 10 4 3 1 33 66 99 44"
    dotest min Min 0
    dotest max Max 99
    dotest mean Mean 22
    dotest mean Mean 22.1 1
    dotest median Median 10
    dotest quartile Quartile 2 1
    dotest quartile Quartile 33 3
    dotest summary Summary "Summary data: min(0) first(2) mean(22.0) median(10.0) third(33) max(99)"
    td="$td 9"  # test median with an even number of elements"
    dotest median Median 9.5
    stemgolden="
  The decimal point is 1 digit(s) to the right of the |

  0 | 0112349005
  2 | 3
  4 | 4
  6 | 6
  8 | 9"
    dotest stem Stem "$stemgolden" 1 80 .00000001
fi

All functions take the same interface–a list of numbers separated by spaces or newlines that’s presented to the function on stdin.

Other stem implementations

The initial stem code was difficult to get right in awk so it was initially implemented in C and Python:

C

#include <stdio.h>
#include <math.h>
#include <limits.h> /* INT_MAX */
#include <stdlib.h> /* abs */

static int imin2(int x, int y)
{
    return (x < y) ? x : y;
}

static int imax2(int x, int y)
{
    return (x < y) ? y : x;
}


static void stem_print(int close, int dist, int ndigits)
{
    if((close/10 == 0) && (dist < 0))
	printf("  %*s | ", ndigits, "-0");
    else
        printf("  %*d | ", ndigits, close/10);
}

static int cmp(const void* a, const void* b) {
   return((*(double*)a) == (*(double*)b) ? 0 : ((*(double*)a) < (*(double*)b) ? -1 : 1));
}

static int
stem_leaf(double *x, int n, double scale, int width, double atom)
{
    double r, c, x1, x2;
    double mu, lo, hi;
    int mm, k, i, j, xi;
    int ldigits, hdigits, ndigits, pdigits;

    if(n <= 1)
	return EXIT_FAILURE;

    qsort(x, n, sizeof(*x), cmp);

    printf("\n");
    mu = 10;
    if(x[n-1] > x[0]) {
	r = atom + (x[n-1] - x[0])/scale;
	c = pow(10.0, (int)(1.0 - floor(log10(r))));
	mm = imin2(2, imax2(0, (int)(r*c/25)));
	k = 3*mm + 2 - 150/(n + 50);
	if ((k-1)*(k-2)*(k-5) == 0) {
	    c *= 10.;
        }
	/* need to ensure that x[i]*c does not integer overflow */
	x1 = fabs(x[0]); x2 = fabs(x[n-1]);
	if(x2 > x1) x1 = x2;
	while(x1*c > INT_MAX) c /= 10;
	if (k*(k-4)*(k-8) == 0) mu = 5;
	if ((k-1)*(k-5)*(k-6) == 0) mu = 20;
    } else {
	r = atom + fabs(x[0])/scale;
	c = pow(10.0, (int)(1.0 - floor(log10(r))));
    }
    
    /* Find the print width of the stem. */

    lo = floor(x[0]*c/mu)*mu;
    hi = floor(x[n-1]*c/mu)*mu;
    ldigits = (lo < 0) ? (int) floor(log10(-(double)lo)) + 1 : 0;
    hdigits = (hi > 0) ? (int) floor(log10((double)hi)): 0;
    ndigits = (ldigits < hdigits) ? hdigits : ldigits;

    /* Starting cell */

    if(lo < 0 && floor(x[0]*c) == lo) lo = lo - mu;
    hi = lo + mu;
    if(floor(x[0]*c+0.5) > hi) {
	lo = hi;
	hi = lo + mu;
    }

    /* Print out the info about the decimal place */

    pdigits = 1 - (int) floor(log10(c) + 0.5);

    printf("  The decimal point is ");
    if(pdigits == 0)
	printf("at the |\n\n");
    else
	printf("%d digit(s) to the %s of the |\n\n",abs(pdigits),
		(pdigits > 0) ? "right" : "left");
    i = 0;
    do {
	if(lo < 0)
	    stem_print((int)hi, (int)lo, ndigits);
	else
	    stem_print((int)lo, (int)hi, ndigits);
	j = 0;
	do {
	    if(x[i] < 0)xi = (int) (x[i]*c - .5);
	    else	xi = (int) (x[i]*c + .5);

	    if( (hi == 0 && x[i] >= 0)||
		(lo <  0 && xi >  hi) ||
		(lo >= 0 && xi >= hi) )
		break;

	    j++;
	    if(j <= width-12)
		printf("%1d", abs(xi) % 10);
	    i++;
	} while(i < n);
	if(j > width)
	    printf("+%d", j - width);
	printf("\n");
	if(i >= n)
	    break;
	hi += mu;
	lo += mu;
    } while(1);
    printf("\n");
    return EXIT_SUCCESS;
}

Python

import math

def imin2(a, b):
   if(a < b):
      return(int(a))
   return(int(b))

def imax2(a, b):
   if(a > b):
      return(int(a))
   return(int(b))

def stem_print(close, dist, ndigits):
   if((close / 10 == 0) and (dist < 0)):
       print("  %*s | " % (ndigits, "-0"), end='')
   else:
       print("  %*d | " % (ndigits, close/10), end='')

def stem_leaf(x, scale=1, width=80, atom=0.00000001):
    x.sort()

    print('')
    mu = 10

    if x[-1] > x[0]:
       r = atom + (x[-1] - x[0]) / scale
       c = math.pow(10, int(1.0 - math.floor(math.log10(r))))
       mm = imin2(2, imax2(0, int(r * c / 25)))
       k = int(3 * mm + 2 - 150 // (len(x) + 50))
       if((k-1) * (k-2) * (k-5) == 0):
           c = c * 10
       if(k*(k-4)*(k-8) == 0):
           mu = 5;
       if((k-1)*(k-5)*(k-6) == 0):
           mu = 20;
    else:
       r = atom + math.fabs(x[0]) / scale
       c = math.pow(10, int(1.0 - math.floor(math.log10(r))))

    lo = math.floor(x[0]*c/mu)*mu;
    hi = math.floor(x[-1]*c/mu)*mu;

    if lo < 0:
        ldigits = int(math.floor(math.log10(-lo))) + 1
    else:
        ldigits = 0
    if hi > 0:
        hdigits = int(math.floor(math.log10(hi)))
    else:
        hdigits = 0

    if(ldigits < hdigits):
       ndigits = hdigits
    else:
       ndigits = ldigits

    if(lo < 0 and math.floor(x[0]*c) == lo):
        lo = lo - mu
    hi = lo + mu
    if(math.floor(x[0]*c+0.5) > hi):
        lo = hi
        hi = lo + mu

    pdigits = 1 - int(math.floor(math.log10(c) + 0.5))
    print('  The decimal point is ',end=''),
    if(pdigits == 0):
        print("at the |")
    else:
        if pdigits > 0:
             side = 'right'
        else:
             side = 'left'

        print("%d digit(s) to the %s of the |" % (math.fabs(pdigits), side))
    print()
    i = 0

    while True:
        if(lo < 0):
            stem_print(int(hi), int(lo), ndigits)
        else:
            stem_print(int(lo), int(hi), ndigits)

        j = 0
        while True:
            if(x[i] < 0):
               xi = int(x[i]*c - .5)
            else: 
               xi = int(x[i]*c + .5)

            if((hi == 0 and x[i] >= 0) or
               (lo < 0 and xi > hi) or
               (lo >= 0 and xi >= hi)):
               break

            j = j + 1
            if(j <= (width - 12)):
                print("%1d" % (int(math.fabs(xi)) % 10), end='') 
            i = i + 1

            if(i >= len(x)):
                break
        if(j > width):
            print("+%d" % (j - width), end='')
        print()
        if(i >= len(x)):
            break

        hi = hi + mu
        lo = lo + mu
    print()

About

Some small utilities for statistic analysis in shell

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published