Create the cognoml package to implement an MVP API (#51)
* Begin constructing an MVP machine learner
* Export JSON API input for Hippo pathway
* Ignore __pycache__
* classify() functioning with mock input
* Save output corresponding to hippo-input.json
  From the `cognoml` directory, ran:
  ```
  python analysis.py > ../data/api/hippo-output.json
  ```
* Export model information to JSON output
  Also filter zero-variance features.
* Return unselected observations
  Unselected observations (samples in the dataset that were not selected by the user) are now returned. These observations receive predictions but are missing (-1 encoded) for fields such as `testing` and `status`. Sorted model parameters by key.
* Save grid_search performance metrics
* Move classifier and pipeline to its own module
* Add setup.py to make the module installable
* Review comments: spacing and results doc
* Check whether pipeline has function before calling
  Meant to address https://git.io/vPvtI
* Acquire data from figshare
* Update for sklearn 0.18.0, fix pipeline
  Fix pipeline according to: scikit-learn/scikit-learn#7536 (comment)
  Extract selected feature names according to: scikit-learn/scikit-learn#7536 (comment)
* Semantic improvements of get_feature_df
* Update API JSON files
* Mention hippo-output-schema.json in docstring
* Address @gwaygenomics review comments
  Does not address "Lasso or Ridge only?"
* Grid search: optimize alpha, not l1_ratio
  Do not optimize `l1_ratio`; instead use the default of 0.15. Search a denser grid for `alpha`. See #56
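For orientation, the MVP entry point is `cognoml.analysis.classify(**payload)`. The contents of `hippo-input.json` are not shown in this diff, but the `classify()` signature implies the payload carries `sample_id`, `mutation_status`, and `data_version`. A minimal sketch, with invented sample IDs purely for illustration:

```python
# Hypothetical payload mirroring the keys classify() expects.
# The sample IDs and statuses below are made up for illustration.
payload = {
    'sample_id': ['TCGA-22-4593-01', 'TCGA-2G-AAEW-01'],  # invented IDs
    'mutation_status': [1, 0],  # 1 = mutated, 0 = wild-type
    'data_version': 4,          # figshare article version to use
}

from cognoml.analysis import classify
results = classify(**payload)
```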
Showing 14 changed files with 157,957 additions and 0 deletions.
`.gitignore`:

```diff
@@ -3,3 +3,7 @@
 # Large downloaded data files
 download/
+
+# Python
+__pycache__/
+*.egg-info/
```
`cognoml/__init__.py` (empty file; the package path is inferred from the imports below).
`cognoml/analysis.py` (new file, 114 lines):

```python
import collections
import os
import warnings

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

from cognoml import utils
from cognoml.classifiers.logistic_regression import grid_search
from cognoml.figshare import download_files

# expression_path = os.path.join('download', 'mutation-matrix.tsv.bz2')
data_directory = "download"


def read_data(version=None):
    """
    Read data.
    """
    v_dir = download_files(directory=data_directory, article_id=3487685, version=version)
    # Read expression data
    path = os.path.join(v_dir, 'expression-matrix.tsv.bz2')
    X = pd.read_table(path, index_col=0)
    return X


def classify(sample_id, mutation_status, data_version, **kwargs):
    """
    Perform an analysis.

    Parameters
    ----------
    sample_id : list
        Sample IDs of the observations.
    mutation_status : list
        Mutation status (0 or 1) of each sample.
    data_version : int
        Version of the figshare expression data article to use.
        Additional keyword arguments are accepted and ignored.

    Returns
    -------
    results : dict
        A JSON-serializable object. OrderedDicts are used for dictionaries to
        enable reproducible export. See `data/api/hippo-output-schema.json`
        for the JSON schema.
    """
    results = collections.OrderedDict()

    obs_df = pd.DataFrame.from_items([
        ('sample_id', sample_id),
        ('status', mutation_status),
    ])

    X_whole = read_data(version=data_version)
    X = X_whole.loc[obs_df.sample_id, :]
    y = obs_df.status

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0, stratify=y)
    obs_df['testing'] = obs_df.sample_id.isin(X_test.index).astype(int)

    grid_search.fit(X=X_train, y=y_train)

    predict_df = pd.DataFrame.from_items([
        ('sample_id', X_whole.index),
        ('predicted_status', grid_search.predict(X_whole)),
    ])
    if hasattr(grid_search, 'decision_function'):
        predict_df['predicted_score'] = grid_search.decision_function(X_whole)
    if hasattr(grid_search, 'predict_proba'):
        predict_df['predicted_prob'] = grid_search.predict_proba(X_whole)[:, 1]

    # obs_df switches to containing non-selected samples
    obs_df = obs_df.merge(predict_df, how='right', sort=True)
    obs_df['selected'] = obs_df.sample_id.isin(sample_id).astype(int)
    for column in 'status', 'testing', 'selected':
        obs_df[column] = obs_df[column].fillna(-1).astype(int)
    obs_train_df = obs_df.query("testing == 0")
    obs_test_df = obs_df.query("testing == 1")

    # y_pred_train = obs_df.query("testing == 0").predicted_score
    # y_pred_test = obs_df.query("testing == 1").predicted_score

    dimensions = collections.OrderedDict()
    dimensions['observations_selected'] = sum(obs_df.selected == 1)
    dimensions['observations_unselected'] = sum(obs_df.selected == 0)
    dimensions['features'] = len(X.columns)
    dimensions['positives'] = sum(obs_df.status == 1)
    dimensions['negatives'] = sum(obs_df.status == 0)
    dimensions['positive_prevalence'] = y.mean().round(5)
    dimensions['training_observations'] = len(obs_train_df)
    dimensions['testing_observations'] = len(obs_test_df)
    results['dimensions'] = utils.value_map(dimensions, round, ndigits=5)

    performance = collections.OrderedDict()
    for part, df in ('training', obs_train_df), ('testing', obs_test_df):
        y_true = df.status
        y_pred = df.predicted_status
        metrics = utils.class_metrics(y_true, y_pred)
        metrics.update(utils.threshold_metrics(y_true, y_pred))
        performance[part] = utils.value_map(metrics, round, ndigits=5)
    performance['cv'] = {'auroc': round(grid_search.best_score_, 5)}
    results['performance'] = performance

    gs = collections.OrderedDict()
    gs['cv_scores'] = utils.cv_results_to_df(grid_search.cv_results_)
    gs = utils.value_map(gs, utils.df_to_datatables)
    results['grid_search'] = gs

    results['model'] = utils.model_info(grid_search.best_estimator_.steps[-1][1])

    feature_df = utils.get_feature_df(grid_search, X.columns)
    results['model']['features'] = utils.df_to_datatables(feature_df)

    results['observations'] = utils.df_to_datatables(obs_df)
    return results
```
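For reference, a schematic of the `results` object assembled above. This shows only the key structure; every value below is a placeholder, not output from a real run:

```python
# Shape of classify()'s return value; all values are placeholders.
results = {
    'dimensions': {'observations_selected': 0, 'observations_unselected': 0,
                   'features': 0, 'positives': 0, 'negatives': 0, '...': 0},
    'performance': {
        'training': {'precision': 0.0, 'recall': 0.0, '...': 0.0},
        'testing': {'...': 0.0},
        'cv': {'auroc': 0.0},
    },
    'grid_search': {'cv_scores': {'columns': ['...'], 'data': [['...']]}},
    'model': {'class': 'SGDClassifier', 'module': '...',
              'parameters': {'...': None}, 'features': {'...': None}},
    'observations': {'columns': ['sample_id', 'status', '...'], 'data': [['...']]},
}
```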
`cognoml/classifiers/__init__.py` (empty file; the path is inferred from the `cognoml.classifiers` import above).
`cognoml/classifiers/logistic_regression.py` (new file, 28 lines):

```python
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline(steps=[
    ('select', VarianceThreshold()),
    ('standardize', StandardScaler()),
    ('classify', SGDClassifier())
])

param_grid = {
    'classify__random_state': [0],
    'classify__class_weight': ['balanced'],
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': 10.0 ** np.linspace(-3, 1, 10),
    'classify__l1_ratio': [0.15],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    scoring='roc_auc'
)
```
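A minimal smoke test of this pipeline, assuming the sklearn 0.18-era API this commit targets. The data here are synthetic and purely illustrative; the grid search tries the ten `alpha` values above (from 1e-3 to 10) in parallel and keeps the one with the best cross-validated AUROC:

```python
import numpy as np
from cognoml.classifiers.logistic_regression import grid_search

# Synthetic data: 100 samples, 20 features; the label depends on feature 0.
rng = np.random.RandomState(0)
X = rng.randn(100, 20)
y = (X[:, 0] > 0).astype(int)

grid_search.fit(X, y)
print(grid_search.best_params_['classify__alpha'])  # selected elastic-net alpha
print(round(grid_search.best_score_, 3))            # mean CV AUROC
```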
`cognoml/figshare.py` (new file, 62 lines):

```python
import os
from urllib.request import urlretrieve
import json

import requests


def get_article_versions(article_id):
    """
    Get a version_to_url dictionary for a figshare article.
    """
    url = 'https://api.figshare.com/v2/articles/{}/versions'.format(article_id)
    response = requests.get(url)
    version_to_url = {d['version']: d['url'] for d in response.json()}
    return version_to_url


def download_files(directory, article_id=3487685, version=None):
    """
    Download files for a specific figshare article_id and version. Creates a
    version-specific subdirectory in `directory` and downloads all files from
    the figshare article into this subdirectory.

    Parameters
    ----------
    directory : str
        Directory to download files to. Files are written inside a versioned
        directory (e.g. `v2`).
    article_id : int
        article_id on figshare
    version : int or None
        figshare version. `None` means latest.

    Returns
    -------
    str
        The path of the version-specific subdirectory containing the
        downloaded files.
    """
    version_to_url = get_article_versions(article_id)
    if version is None:
        version = max(version_to_url.keys())
    url = version_to_url[version]
    response = requests.get(url)
    article = response.json()
    version_directory = os.path.join(directory, 'v{}'.format(version))

    if not os.path.exists(version_directory):
        os.mkdir(version_directory)

    # Save the article metadata alongside the downloaded files
    path = os.path.join(version_directory, 'info.json')
    with open(path, 'w') as write_file:
        json.dump(article, write_file, indent=2, ensure_ascii=False, sort_keys=True)

    # Download the files specified by the metadata
    for file_info in article['files']:
        name = file_info['name']
        path = os.path.join(version_directory, name)
        if os.path.exists(path):
            continue
        url = file_info['download_url']
        print('Downloading {} to `{}`'.format(url, name))
        urlretrieve(url, path)

    return version_directory
```
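With the default article, a call like the sketch below fetches the latest version into a `download/vN` subdirectory. Note that `os.mkdir` only creates the leaf directory, so the parent `download` directory must already exist:

```python
from cognoml.figshare import download_files

# Downloads the article's files (e.g. expression-matrix.tsv.bz2) and
# writes info.json into a versioned subdirectory.
v_dir = download_files(directory='download', article_id=3487685)
print(v_dir)  # e.g. 'download/v4'; the number depends on the latest release
```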
Command-line entry point (new file, 16 lines; the file path is not shown in this capture):

```python
import json

import requests

from cognoml.analysis import classify
from cognoml import utils

if __name__ == '__main__':
    # Create a classifier using mock input. Print output to stdout.
    url = 'https://github.com/cognoma/machine-learning/raw/876b8131bab46878cb49ae7243e459ec0acd2b47/data/api/hippo-input.json'
    response = requests.get(url)
    payload = response.json()
    payload['data_version'] = 4
    results = classify(**payload)
    json_results = json.dumps(results, indent=2, cls=utils.JSONEncoder)
    print(json_results)
```
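Per the commit message, the committed `data/api/hippo-output.json` was produced by redirecting this script's stdout from the `cognoml` directory:

```
python analysis.py > ../data/api/hippo-output.json
```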
`cognoml/utils.py` (new file, 132 lines):

```python
import collections
import itertools
import json

import numpy as np
import pandas as pd
import sklearn.metrics


def cv_results_to_df(cv_results):
    """
    Convert a `sklearn.model_selection.GridSearchCV.cv_results_` attribute to
    a tidy pandas DataFrame where each row is a hyperparameter combination.
    """
    cv_results_df = pd.DataFrame(cv_results)
    columns = [x for x in cv_results_df.columns if x.startswith('param_')]
    columns += ['mean_train_score', 'mean_test_score', 'std_test_score']
    cv_results_df = cv_results_df[columns]
    return cv_results_df


def expand_grid(data_dict):
    """
    Create a dataframe from every combination of given values.
    """
    rows = itertools.product(*data_dict.values())
    grid_df = pd.DataFrame.from_records(rows, columns=data_dict.keys())
    return grid_df


def df_to_datatables(df, double_precision=5, indent=2):
    """
    Convert a pandas dataframe to a JSON object formatted for datatables input.
    """
    dump_str = df.to_json(orient='split', double_precision=double_precision)
    obj = json.loads(dump_str)
    del obj['index']
    obj = collections.OrderedDict(obj)
    obj.move_to_end('data')
    return obj


def json_sanitize(obj, object_pairs_hook=collections.OrderedDict):
    """
    Sanitize an object containing pandas/numpy objects so it's JSON
    serializable. Does not preserve order since `pandas.json.dumps()` does not
    respect OrderedDict objects. Hence, it's recommended to just use the
    builtin `json.dump` function with `cls=JSONEncoder`.
    """
    obj_str = pd.json.dumps(obj)
    obj = json.loads(obj_str, object_pairs_hook=object_pairs_hook)
    return obj


class JSONEncoder(json.JSONEncoder):
    """
    A JSONEncoder that supports numpy types by converting them to standard
    python types.
    """

    def default(self, o):
        if type(o).__module__ == 'numpy':
            return o.item()
        return super().default(o)


def value_map(dictionary, function, *args, **kwargs):
    """
    Edits a dictionary-like object in place to apply a function to its values.
    """
    for key, value in dictionary.items():
        dictionary[key] = function(value, *args, **kwargs)
    return dictionary


def class_metrics(y_true, y_pred):
    metrics = collections.OrderedDict()
    metrics['precision'] = sklearn.metrics.precision_score(y_true, y_pred)
    metrics['recall'] = sklearn.metrics.recall_score(y_true, y_pred)
    metrics['f1'] = sklearn.metrics.f1_score(y_true, y_pred)
    metrics['accuracy'] = sklearn.metrics.accuracy_score(y_true, y_pred)
    # See https://github.com/scikit-learn/scikit-learn/pull/6752
    metrics['balanced_accuracy'] = sklearn.metrics.recall_score(
        y_true, y_pred, pos_label=None, average='macro')
    return metrics


def threshold_metrics(y_true, y_pred):
    metrics = collections.OrderedDict()
    metrics['auroc'] = sklearn.metrics.roc_auc_score(y_true, y_pred)
    metrics['auprc'] = sklearn.metrics.average_precision_score(y_true, y_pred)
    return metrics


def model_info(estimator):
    model = collections.OrderedDict()
    model['class'] = type(estimator).__name__
    model['module'] = estimator.__module__
    model['parameters'] = sort_dict(estimator.get_params())
    return model


def get_feature_df(grid_search, features):
    """
    Return the feature names and coefficients from the final classifier of the
    best pipeline found by GridSearchCV. See https://git.io/vPWLI. This
    function assumes every selection step of the pipeline has a name starting
    with `select`.

    Params
    ------
    grid_search: GridSearchCV object
        A post-fit GridSearchCV object where the estimator is a Pipeline.
    features: list
        initial feature names

    Returns
    -------
    pandas.DataFrame
        Dataframe of feature name and coefficient values
    """
    features = np.array(features)
    pipeline = grid_search.best_estimator_
    for name, transformer in pipeline.steps:
        if name.startswith('select'):
            X_index = np.arange(len(features)).reshape(1, -1)
            indexes = transformer.transform(X_index).tolist()
            features = features[indexes]
    step_name, classifier = pipeline.steps[-1]
    coefficients, = classifier.coef_
    feature_df = pd.DataFrame.from_items([
        ('feature', features),
        ('coefficient', coefficients),
    ])
    return feature_df


def sort_dict(dictionary):
    """
    Return a dictionary as an OrderedDict sorted by keys.
    """
    items = sorted(dictionary.items())
    return collections.OrderedDict(items)
```
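The datatables helper is easiest to see on a tiny frame. A quick round trip, where the gene names are invented for illustration:

```python
import pandas as pd
from cognoml import utils

# Hypothetical two-feature coefficient table.
df = pd.DataFrame({'feature': ['TP53', 'KRAS'],
                   'coefficient': [0.123456, -0.5]},
                  columns=['feature', 'coefficient'])
obj = utils.df_to_datatables(df)
# obj is an OrderedDict with 'columns' first and 'data' moved to the end;
# values are rounded to double_precision=5 digits:
# OrderedDict([('columns', ['feature', 'coefficient']),
#              ('data', [['TP53', 0.12346], ['KRAS', -0.5]])])
```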