diff --git a/Dockerfile b/Dockerfile index 8c20063..fb35084 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,3 @@ - FROM r-base:latest MAINTAINER Tesera Systems Inc. @@ -17,8 +16,9 @@ RUN apt-get update && apt-get install -y \ bats \ && rm -rf /var/lib/apt/lists/* -ENV PYLEARN_REF master -ENV RLEARN_REF master +ENV PYLEARN_REF v1.0.1 +ENV PRELURN_REF v1.0.0 +ENV RLEARN_REF v1.0.1 ENV WD /opt/learn ENV HISTFILE $WD/.bash_history diff --git a/README.md b/README.md index 415da66..4e5f41a 100644 --- a/README.md +++ b/README.md @@ -2,24 +2,18 @@ [ ![Codeship Status for tesera/learn-cli](https://codeship.com/projects/f2a31230-b7e8-0133-9192-1269d3e58a72/status?branch=master)](https://codeship.com/projects/134949) -learn-cli performs variable selection, model development and target dataset processing. It uses [pylearn](https://github.com/tesera/pylearn) and [rlearn](https://github.com/tesera/rlearn) libraries. The cli invokes rlearn function via rpy2. - -Although the cli is docker ready you can choose to run the cli locally the old fashion way. Running the cli in Docker will simplify the efforts tremendously but Docker is not required. - -### Prerequisites - -* R -* Python 2.7 -* rlearn `library('devtools'); install_github(repo='tesera/rlearn', dependencies=TRUE, ref='master');` -* AWS Access Keys (optional: for using S3 data location) - -### Install - -```console -$ pip install git+https://github.com/tesera/learn-cli.git -``` - -### Usage +learn-cli performs machine learning tasks, including variable selection, model +development and target dataset processing. It uses +[pylearn](https://github.com/tesera/pylearn), +[prelurn](https://github.com/tesera/prelurn), and +[rlearn](https://github.com/tesera/rlearn) libraries. The cli invokes rlearn +functions via rpy2. 
+ +Support for developing and using the CLI is only provided if you are using +docker, as the CLI has a fairly complex set of requirements (packages, +runtimes, etc.) + +## Usage ```console $ learn --help Usage: @@ -54,17 +48,13 @@ Examples: learn discrat --xy-data s3://bucket/xy_reference.csv --x-data s3://bucket/x_filtered.csv --dfunct s3://bucket/dfunct.csv --idf s3://bucket/idf.csv --varset 18 --output s3://bucket/varsel ``` -### Testing - -`bats ./tests/intergration` - -### Docker +## Setup with Docker If you are using docker-machine make sure you have a machine running and that you have evaluated the machine environment. -#### Creating a Docker Machine Host VM +### Creating a Docker Machine Host VM -#####Windows Powershell +#### Windows Powershell ```console $ docker-machine create --driver virtualbox --virtualbox-host-dns-resolver default $ docker-machine env --shell powershell default | Invoke-Expression @@ -76,7 +66,7 @@ $ docker-machine create --driver virtualbox default $ eval "$(docker-machine env default)" ``` -#### Running the container +### Running the container ```console $ docker build -t learn . @@ -84,11 +74,11 @@ $ docker run learn /bin/bash root@1e36bb3275b5:/opt/learn# learn --help ``` -#### Development +### Development -During development you will want to bring in the codebase with you in the container. You can simply use the Docker Compose command bellow. Once in the container run the `install-dependencies.sh` script passing in the `--dev` flag to make the project editable. This wil install all the Python dependencies in the project root under the `pysite folder and the R dependencies under the rlibs folder. You will only need to run this once unless you dependencies change. +During development you will want to bring in the codebase with you in the container. You can simply use the Docker Compose command below. Once in the container run the `install-dependencies.sh` script passing in the `--dev` flag to make the project editable. 
This will install all the Python dependencies in the project root under the `pysite` folder and the R dependencies under the `rlibs` folder. You will only need to run this once unless your dependencies change. -You will need to add a `dev.env` file with at least `PYLEARN_REF` and `RLEARN_REF` variables set to the Github ref/version of the respective libraries. Optionaly you can also add you AWS Access Keys and region in order to use S3 as a data location. +You will need to add a `dev.env` file with at least `PYLEARN_REF`, `RLEARN_REF` and `PRELURN_REF` variables set to the Github ref (branch or tag) of the respective libraries. Optionally you can also add your AWS Access Keys and region in order to use S3 as a data location. ```console $ cat dev.env @@ -107,11 +97,27 @@ root@1e36bb3275b5:/opt/learn# bash ./install-dependencies --dev root@1e36bb3275b5:/opt/learn# learn --help ``` -#### Testing +### Testing + +You can run the tests, which are written with bats, using the following docker compose task: `docker-compose run tests` -### Contributing +You can also enter the container and run specific tests as follows: + +``` +> dc run dev +root@4d3df46d52c7:/opt/learn# bats tests/integration/ +.DS_Store output/ test_cli_discrat.bats test_cli_varsel.bats +input/ test_cli_describe.bats test_cli_lda.bats +root@4d3df46d52c7:/opt/learn# bats tests/integration/test_cli_describe.bats + ✓ describe runs and output expected files + + 1 test, 0 failures +``` + +## Contributing -- [Python Style Guide](https://www.python.org/dev/peps/pep-0008/) -- [R Style Guide](http://adv-r.had.co.nz/Style.html) +Refer to the [pylearn](https://github.com/tesera/pylearn#contribution-guidelines) and +[rlearn](https://github.com/tesera/rlearn#contribution-guidelines) for guides on how to +contribute. 
diff --git a/bin/install-dependencies.sh b/bin/install-dependencies.sh index b0bd1b2..f7371b1 100644 --- a/bin/install-dependencies.sh +++ b/bin/install-dependencies.sh @@ -6,12 +6,12 @@ mkdir -p {pysite,rlibs} install2.r -l $R_LIBS_USER devtools -r ./bin/installGithub2.r tesera/rlearn -d TRUE -r ${RLEARN_REF-master} +r ./bin/installGithub2.r tesera/rlearn -d TRUE -r ${RLEARN_REF-v1.0.1} pip install --user scipy awscli -pip install --user "git+https://github.com/tesera/pylearn.git@${PYLEARN_REF-master}" -pip install --user "git+https://github.com/tesera/prelurn.git@${PRELURN_REF-master}" +pip install --user "git+https://github.com/tesera/pylearn.git@${PYLEARN_REF-v1.0.1}" +pip install --user "git+https://github.com/tesera/prelurn.git@${PRELURN_REF-v1.0.0}" rm -rf /tmp/downloaded_packages/ /tmp/*.rds diff --git a/docker-compose.yml b/docker-compose.yml index 4145086..6744474 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,5 +8,7 @@ dev: test: container_name: learn build: . 
+ volumes: + - ./:/opt/learn env_file: ./env/dev.env - command: ['bats ./tests/integration'] + command: ['bats', './tests/integration'] diff --git a/learn/clients/analyze.py b/learn/clients/analyze.py index 747b8c8..d11e07c 100644 --- a/learn/clients/analyze.py +++ b/learn/clients/analyze.py @@ -1,15 +1,15 @@ import os import logging - import pandas as pd from rpy2.robjects import pandas2ri pandas2ri.activate() - import rpy2.robjects as robjects from rpy2.robjects.packages import importr - +from schema import Schema, And, Or from pylearn.lda import cohens_khat, combine_evaluation_datasets +from learn.utils import is_s3_url + logger = logging.getLogger('pylearn') importr('MASS') importr('logging') @@ -35,9 +35,29 @@ def analyze(xy, config, yvar, output): class Analyze(object): + @staticmethod + def _validate(args): + schema = Schema({ + '--xy-data': Or( + os.path.isfile, is_s3_url, + error=' should exist and be readable.'), + '--config': Or( + os.path.isfile, is_s3_url, + error='--config should exist and be readable.'), + '--output': Or( + os.path.exists, is_s3_url, + error='--output should exist and be writable.'), + '--yvar': And(str, len), + }, ignore_extra_keys=True) + args = schema.validate(args) + return args + def run(self, args): # disable cloudwatch rlearn logging until it is prod ready rlearn.logger_init(log_toAwslogs=False) + + logger.info('Validating args') + args = self._validate(args) outdir = args['--output'] yvar = args['--yvar'] diff --git a/learn/clients/describe.py b/learn/clients/describe.py index 22a8a07..d477e56 100644 --- a/learn/clients/describe.py +++ b/learn/clients/describe.py @@ -2,7 +2,9 @@ import logging import pandas as pd import prelurn +from schema import Schema, And, Or, Optional +from learn.utils import is_s3_url logger = logging.getLogger('pylearn') @@ -11,6 +13,25 @@ class Describe(object): def __init__(self): pass + @staticmethod + def _validate(args): + schema = Schema({ + '--xy-data': Or( + os.path.isfile, is_s3_url, + error=' 
should exist and be readable.'), + Optional('--quantile-type'): And( + str, + lambda s: s in ('decile', 'quartile'), + error='--config should exist and be readable.'), + '--output': Or( + os.path.exists, is_s3_url, + error='--output should exist and be writable.'), + Optional('--format'): And(str, lambda s: s in ('json', 'csv')), + }, ignore_extra_keys=True) + args = schema.validate(args) + return args + + def run(self, args): logger.info('Running describe') diff --git a/learn/clients/discrating.py b/learn/clients/discrating.py index a0afbc3..3e9d18a 100644 --- a/learn/clients/discrating.py +++ b/learn/clients/discrating.py @@ -1,9 +1,11 @@ import os +import sys import logging import pandas as pd - +from schema import Schema, And, Or from pylearn.discrating import predict +from learn.utils import is_s3_url logger = logging.getLogger('pylearn') @@ -13,14 +15,45 @@ class Discrating(object): def __init__(self): logger.info("running dicsriminant ratings...") + @staticmethod + def _validate(args): + schema = Schema({ + '--xy-data': Or( + os.path.isfile, is_s3_url, + error='<--xy-data should exist and be readable.'), + '--x-data': Or( + os.path.isfile, is_s3_url, + error='--x-data should exist and be readable.'), + '--dfunct': Or( + os.path.isfile, is_s3_url, + error='--dfunct should exist and be readable.'), + '--idf': Or( + os.path.exists, is_s3_url, + error='--idf should exist and be writable.'), + '--output': Or( + os.path.exists, is_s3_url, + error='--output should exist and be writable.'), + '--yvar': And(str, len), + }, ignore_extra_keys=True) + args = schema.validate(args) + return args + def run(self, args): - logger.info("invoking predict with varset: %s", args['--varset']) + varset = int(args['--varset']) + dfunct = pd.read_csv(args['--dfunct']) + + # this is a hack to avoid handling this in pylearn right now in pylearn + # an exception should be raised which we can catch when running predict + if varset not in dfunct.VARSET3.unique(): + msg = "varset '%d' 
missing from dfunct" %varset + logger.error(msg) + sys.exit(msg) pargs = { 'xy': pd.read_csv(args['--xy-data']), 'x_filtered': pd.read_csv(args['--x-data']), - 'dfunct': pd.read_csv(args['--dfunct']), - 'varset': int(args['--varset']), + 'dfunct': dfunct, + 'varset': varset, 'yvar': args['--yvar'], 'idf': pd.read_csv(args['--idf']), } diff --git a/learn/clients/varselect.py b/learn/clients/varselect.py index b07ecc0..92de092 100644 --- a/learn/clients/varselect.py +++ b/learn/clients/varselect.py @@ -1,12 +1,12 @@ import os import logging import pandas as pd +from learn.utils import is_s3_url from rpy2.robjects import pandas2ri pandas2ri.activate() - import rpy2.robjects as robjects from rpy2.robjects.packages import importr - +from schema import Schema, And, Or, Optional from pylearn.varselect import (count_xvars, rank_xvars, extract_xvar_combos, remove_high_corvar) @@ -20,14 +20,16 @@ def var_select(xy, config, args): args['--nSolutions'], args['--minNvar'], args['--maxNvar'] = args['--iteration'].split(':') - varselect = rlearn.vs_selectVars(xy=xy, config=config, + varselect = rlearn.vs_selectVars( + xy=xy, config=config, yName=args['--yvar'], removeRowValue=-1, removeRowColName='SORTGRP', improveCriteriaVarName=args['--criteria'], minNumVar=int(args['--minNvar']), maxNumVar=int(args['--maxNvar']), - nSolutions=int(args['--nSolutions'])) + nSolutions=int(args['--nSolutions']) + ) return pandas2ri.ri2py(varselect) @@ -60,6 +62,29 @@ def varselect(data_xy, xy_config, args): class VarSelect(object): + @staticmethod + def _validate(args): + schema = Schema({ + '--xy-data': Or( + os.path.isfile, is_s3_url, + error=' should exist and be readable.'), + '--config': Or( + os.path.isfile, is_s3_url, + error='--config should exist and be readable.'), + Optional('--output'): Or( + os.path.exists, is_s3_url, + error='--output should exist and be writable.'), + Optional('--yvar'): And(str, len), + Optional('--iteration'): And(str, len), + Optional('--criteria'): And( + str, + 
lambda s: s in ('ccr12', 'Wilkes', 'xi2', 'zeta2') + ), + }, ignore_extra_keys=True) + + args = schema.validate(args) + return args + def run(self, args): # disable cloudwatch rlearn logging until it is prod ready rlearn.logger_init(log_toAwslogs=False) diff --git a/learn/utils.py b/learn/utils.py new file mode 100644 index 0000000..b2b2680 --- /dev/null +++ b/learn/utils.py @@ -0,0 +1,4 @@ +from urlparse import urlparse + +def is_s3_url(url): + return urlparse(url).scheme == 's3' diff --git a/setup.py b/setup.py index f077e08..967f609 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='learn-cli', - version='0.1.1', + version='1.0.0', description=u"Learn Model Builder", classifiers=[], keywords='', @@ -19,9 +19,6 @@ 'boto3', 'rpy2' ], - extras_require={ - 'test': ['pytest'], - }, entry_points={ 'console_scripts': [ 'learn=learn.cli:cli' diff --git a/tests/integration/test_cli_describe.bats b/tests/integration/test_cli_describe.bats index 6e553d0..62e0c89 100644 --- a/tests/integration/test_cli_describe.bats +++ b/tests/integration/test_cli_describe.bats @@ -4,21 +4,43 @@ setup () { data_xy="data_xy.csv" input="./tests/integration/input" - output="./tests/integration/output" + output_dir="./tests/integration/output" - mkdir -p "$output" + mkdir -p "$output_dir" +} + +@test "describe runs and output expected files without optional args" { + run learn describe \ + --xy-data=$input/$data_xy \ + --output=$output_dir + + [ -f "$output_dir/describe.json" ] + [ -f "pylearn.log" ] +} - learn describe \ +@test "describe runs and output expected files with json and quartile" { + run learn describe \ --xy-data=$input/$data_xy \ - --output=$output + --output=$output_dir \ + --format=json \ + --quantile-type=quartile + + [ -f "$output_dir/describe.json" ] + [ -f "pylearn.log" ] } -@test "describe runs and output expected files" { - [ -f "$output/describe.json" ] +@test "describe runs and output expected files with csv and decile" { + run learn describe \ + 
--xy-data=$input/$data_xy \ + --output=$output_dir \ + --format=csv \ + --quantile-type=decile + + [ -f "$output_dir/describe.csv" ] [ -f "pylearn.log" ] } teardown () { - rm $output/* + rm $output_dir/* rm *.log } diff --git a/tests/integration/test_cli_discrat.bats b/tests/integration/test_cli_discrat.bats index 6aeeea2..2712682 100644 --- a/tests/integration/test_cli_discrat.bats +++ b/tests/integration/test_cli_discrat.bats @@ -1,31 +1,43 @@ #!/usr/bin/env bats setup () { - data_xy="data_xy.csv" data_x_filtered="data_xy.csv" lda_x_dfunct="lda_x_dfunct.csv" data_idf="data_idf.csv" input="./tests/integration/input" - output="./tests/integration/output" + output_dir="./tests/integration/output" - mkdir -p "$output" + mkdir -p "$output_dir" +} - learn discrat \ +@test "discrat runs and output expected files" { + run learn discrat \ --xy-data=$input/$data_xy \ --x-data=$input/$data_x_filtered \ --dfunct=$input/$lda_x_dfunct \ --idf=$input/$data_idf \ --varset=10 \ - --output=$output -} + --output=$output_dir -@test "discrat runs and output expected files" { - [ -f "$output/forecasts.csv" ] + [ -f "$output_dir/forecasts.csv" ] [ -f "pylearn.log" ] } +@test "discrat doesn't run if varset missing" { + run learn discrat \ + --xy-data=$input/$data_xy \ + --x-data=$input/$data_x_filtered \ + --dfunct=$input/$lda_x_dfunct \ + --idf=$input/$data_idf \ + --varset=100 \ + --output=$output_dir + + [ "$output" = "varset '100' missing from dfunct" ] + [ "$status" -eq 1 ] +} + teardown () { - rm $output/* + rm -f $output_dir/* rm *.log } diff --git a/tests/integration/test_cli_lda.bats b/tests/integration/test_cli_lda.bats index 846f6b1..0840164 100644 --- a/tests/integration/test_cli_lda.bats +++ b/tests/integration/test_cli_lda.bats @@ -5,26 +5,26 @@ setup () { data_xy="data_xy.csv" vsel_x="vsel_x.csv" input="./tests/integration/input" - output="./tests/integration/output" + output_dir="./tests/integration/output" - mkdir -p "$output" + mkdir -p "$output_dir" learn lda \ 
--xy-data "$input/$data_xy" \ --config "$input/$vsel_x" \ --yvar VAR47 \ - --output "$output" + --output "$output_dir" } @test "lda runs and output expected files" { - [ -f "$output/lda_x_assess.csv" ] - [ -f "$output/lda_x_dfunct.csv" ] + [ -f "$output_dir/lda_x_assess.csv" ] + [ -f "$output_dir/lda_x_dfunct.csv" ] [ -f "pylearn.log" ] [ -f "rlearn.log" ] } teardown () { - rm $output/* + rm $output_dir/* rm *.log } diff --git a/tests/integration/test_cli_varsel.bats b/tests/integration/test_cli_varsel.bats index 2ab91e2..f048e1e 100644 --- a/tests/integration/test_cli_varsel.bats +++ b/tests/integration/test_cli_varsel.bats @@ -5,26 +5,26 @@ setup () { data_xy="data_xy.csv" vsel_xy_config="vsel_xy_config.csv" input="./tests/integration/input" - output="./tests/integration/output" + output_dir="./tests/integration/output" - mkdir -p "$output" + mkdir -p "$output_dir" learn varsel \ --xy-data "$input/$data_xy" \ --config "$input/$vsel_xy_config" \ --yvar VAR47 \ - --output "$output" + --output "$output_dir" } @test "varsels and output expected files" { - [ -f "$output/vsel_x.csv" ] - [ -f "$output/vsel_varrank.csv" ] + [ -f "$output_dir/vsel_x.csv" ] + [ -f "$output_dir/vsel_varrank.csv" ] [ -f "pylearn.log" ] [ -f "rlearn.log" ] } teardown () { - rm $output/* + rm $output_dir/* rm *.log }