Create the cognoml package to implement an MVP API (#51)
* Begin constructing an MVP machine learner

* Export JSON API input for Hippo pathway

* Ignore __pycache__

* classify() functioning with mock input

* Save output corresponding to hippo-input.json

From the `cognoml` directory, ran:

```
python analysis.py > ../data/api/hippo-output.json
```

* Export model information to JSON output

Also filter zero-variance features.

* Return unselected observations

Unselected observations (samples in the dataset that were not selected
by the user) are now returned. These observations receive predictions
but have missing (-1 encoded) values for fields such as `testing` and `status`.

Sorted model parameters by key.

* Save grid_search performance metrics

* Move classifier and pipeline to their own module

* Add setup.py to make module installable

* Review comments: spacing and results doc

* Check whether pipeline has function before calling

Meant to address https://git.io/vPvtI

* Acquire data from figshare

* Update for sklearn 0.18.0, Fix pipeline

Fix pipeline according to:
scikit-learn/scikit-learn#7536 (comment)

Extract selected feature names according to:
scikit-learn/scikit-learn#7536 (comment)

* Semantic improvements to `get_feature_df`

* Update API JSON files

* Mention hippo-output-schema.json in docstring

* Address @gwaygenomics review comments

Does not address "Lasso or Ridge only?"

* Grid search: optimize alpha not l1_ratio

Do not optimize `l1_ratio`. Instead, use the default of 0.15 and search a
denser grid for `alpha`. See #56 and the sketch below.
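
For reference, a minimal sketch (not part of the commit) of the `alpha` grid this produces:

```python
import numpy as np

# 10 logarithmically spaced values from 1e-3 to 1e1,
# matching `10.0 ** np.linspace(-3, 1, 10)` in the param grid
alphas = 10.0 ** np.linspace(-3, 1, 10)
print(alphas.round(5))
# approximately: 0.001, 0.00278, 0.00774, 0.02154, 0.05995,
#                0.16681, 0.46416, 1.29155, 3.59381, 10.0
```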
dhimmel authored Oct 11, 2016
1 parent 5199430 commit 2cb9f34
Showing 14 changed files with 157,957 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -3,3 +3,7 @@

# Large downloaded data files
download/

# Python
__pycache__/
*.egg-info/
8 changes: 8 additions & 0 deletions README.md
@@ -20,3 +20,11 @@ conda env update --file environment.yml
```

Activate the environment by running `source activate cognoma-machine-learning` on Linux or OS X and `activate cognoma-machine-learning` on Windows. Once this environment is active in a terminal, run `jupyter notebook` to start a notebook server.

## `cognoml` package

This repository contains a Python package named `cognoml`. For development, the package can be installed locally. First, make sure the `cognoma-machine-learning` environment is activated, so that `cognoml` is installed only for this environment. Then install it by running the following command from this repository's root directory:

```sh
pip install --editable .
```
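
To verify the editable install (a quick check, assuming the environment is active):

```sh
python -c "import cognoml; print(cognoml.__file__)"
```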
Empty file added cognoml/__init__.py
Empty file.
114 changes: 114 additions & 0 deletions cognoml/analysis.py
@@ -0,0 +1,114 @@
import collections
import os
import warnings

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

from cognoml import utils
from cognoml.classifiers.logistic_regression import grid_search
from cognoml.figshare import download_files

# expression_path = os.path.join('download', 'mutation-matrix.tsv.bz2')
data_directory = "download"

def read_data(version=None):
    """
    Read the expression matrix for the given figshare data version.
    """
    v_dir = download_files(directory=data_directory, article_id=3487685, version=version)
    # Read expression data
    path = os.path.join(v_dir, 'expression-matrix.tsv.bz2')
    X = pd.read_table(path, index_col=0)
    return X

def classify(sample_id, mutation_status, data_version, **kwargs):
    """
    Perform an analysis.

    Parameters
    ----------
    sample_id : list
        Sample IDs of the observations.
    mutation_status : list
        Mutation status (0 or 1) of each sample.
    data_version : int
        figshare data version passed to `read_data`.
    **kwargs
        Additional keyword arguments are accepted and ignored.

    Returns
    -------
    results : dict
        A JSON-serializable object. OrderedDicts are used for dictionaries to
        enable reproducible export. See `data/api/hippo-output-schema.json`
        for the JSON schema.
    """
    results = collections.OrderedDict()

    obs_df = pd.DataFrame.from_items([
        ('sample_id', sample_id),
        ('status', mutation_status),
    ])

    X_whole = read_data(version=data_version)
    X = X_whole.loc[obs_df.sample_id, :]
    y = obs_df.status

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0, stratify=y)
    obs_df['testing'] = obs_df.sample_id.isin(X_test.index).astype(int)

    grid_search.fit(X=X_train, y=y_train)

    predict_df = pd.DataFrame.from_items([
        ('sample_id', X_whole.index),
        ('predicted_status', grid_search.predict(X_whole)),
    ])
    if hasattr(grid_search, 'decision_function'):
        predict_df['predicted_score'] = grid_search.decision_function(X_whole)
    if hasattr(grid_search, 'predict_proba'):
        predict_df['predicted_prob'] = grid_search.predict_proba(X_whole)[:, 1]

    # obs_df switches to containing non-selected samples
    obs_df = obs_df.merge(predict_df, how='right', sort=True)
    obs_df['selected'] = obs_df.sample_id.isin(sample_id).astype(int)
    for column in 'status', 'testing', 'selected':
        obs_df[column] = obs_df[column].fillna(-1).astype(int)
    obs_train_df = obs_df.query("testing == 0")
    obs_test_df = obs_df.query("testing == 1")

    #y_pred_train = obs_df.query("testing == 0").predicted_score
    #y_pred_test = obs_df.query("testing == 1").predicted_score

    dimensions = collections.OrderedDict()
    dimensions['observations_selected'] = sum(obs_df.selected == 1)
    dimensions['observations_unselected'] = sum(obs_df.selected == 0)
    dimensions['features'] = len(X.columns)
    dimensions['positives'] = sum(obs_df.status == 1)
    dimensions['negatives'] = sum(obs_df.status == 0)
    dimensions['positive_prevalence'] = y.mean().round(5)
    dimensions['training_observations'] = len(obs_train_df)
    dimensions['testing_observations'] = len(obs_test_df)
    results['dimensions'] = utils.value_map(dimensions, round, ndigits=5)

    performance = collections.OrderedDict()
    for part, df in ('training', obs_train_df), ('testing', obs_test_df):
        y_true = df.status
        y_pred = df.predicted_status
        metrics = utils.class_metrics(y_true, y_pred)
        metrics.update(utils.threshold_metrics(y_true, y_pred))
        performance[part] = utils.value_map(metrics, round, ndigits=5)
    performance['cv'] = {'auroc': round(grid_search.best_score_, 5)}
    results['performance'] = performance

    gs = collections.OrderedDict()
    gs['cv_scores'] = utils.cv_results_to_df(grid_search.cv_results_)
    gs = utils.value_map(gs, utils.df_to_datatables)
    results['grid_search'] = gs

    results['model'] = utils.model_info(grid_search.best_estimator_.steps[-1][1])

    feature_df = utils.get_feature_df(grid_search, X.columns)
    results['model']['features'] = utils.df_to_datatables(feature_df)

    results['observations'] = utils.df_to_datatables(obs_df)
    return results
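
As a rough usage sketch (the sample IDs below are hypothetical placeholders; the real mock payload lives in `data/api/hippo-input.json`):

```python
from cognoml.analysis import classify

# Hypothetical placeholder IDs -- a real call uses sample IDs present in the
# expression matrix, and enough samples for the stratified train/test split
sample_id = ['TCGA-02-0001-01', 'TCGA-02-0003-01']  # ... more IDs
mutation_status = [1, 0]  # ... matching statuses
results = classify(sample_id, mutation_status, data_version=4)
print(results['dimensions'])
```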
Empty file added cognoml/classifiers/__init__.py
Empty file.
28 changes: 28 additions & 0 deletions cognoml/classifiers/logistic_regression.py
@@ -0,0 +1,28 @@
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline(steps=[
    ('select', VarianceThreshold()),
    ('standardize', StandardScaler()),
    ('classify', SGDClassifier())
])

param_grid = {
    'classify__random_state': [0],
    'classify__class_weight': ['balanced'],
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': 10.0 ** np.linspace(-3, 1, 10),
    'classify__l1_ratio': [0.15],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    scoring='roc_auc'
)
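
A minimal sketch (assuming the package is installed) of exercising this grid search on synthetic data:

```python
from sklearn.datasets import make_classification

from cognoml.classifiers.logistic_regression import grid_search

# Synthetic stand-in for the expression matrix
X, y = make_classification(n_samples=200, n_features=50, random_state=0)
grid_search.fit(X, y)
print(grid_search.best_params_['classify__alpha'])
print(round(grid_search.best_score_, 5))  # mean cross-validated AUROC
```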
62 changes: 62 additions & 0 deletions cognoml/figshare.py
@@ -0,0 +1,62 @@
import os
from urllib.request import urlretrieve
import json

import requests

def get_article_versions(article_id):
    """
    Get a version-to-URL dictionary for a figshare article.
    """
    url = 'https://api.figshare.com/v2/articles/{}/versions'.format(article_id)
    response = requests.get(url)
    version_to_url = {d['version']: d['url'] for d in response.json()}
    return version_to_url

def download_files(directory, article_id=3487685, version=None):
    """
    Download files for a specific figshare article_id and version. Creates a
    version-specific subdirectory in `directory` and downloads all files from
    the figshare article into this subdirectory.

    Parameters
    ----------
    directory : str
        Directory to download files to. Files are written inside a versioned
        directory (e.g. `v2`).
    article_id : int
        article_id on figshare
    version : int or None
        figshare version. `None` means latest.

    Returns
    -------
    str
        The version-specific directory containing the downloaded files.
    """
    version_to_url = get_article_versions(article_id)
    if version is None:
        version = max(version_to_url.keys())
    url = version_to_url[version]
    response = requests.get(url)
    article = response.json()
    version_directory = os.path.join(directory, 'v{}'.format(version))

    if not os.path.exists(version_directory):
        os.mkdir(version_directory)

    path = os.path.join(version_directory, 'info.json')
    with open(path, 'w') as write_file:
        json.dump(article, write_file, indent=2, ensure_ascii=False, sort_keys=True)

    # Download the files specified by the article metadata
    for file_info in article['files']:
        name = file_info['name']
        path = os.path.join(version_directory, name)
        if os.path.exists(path):
            continue
        url = file_info['download_url']
        print('Downloading {} to `{}`'.format(url, name))
        urlretrieve(url, path)

    return version_directory
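
Sketch of a typical call (assuming a writable `download/` directory already exists):

```python
from cognoml.figshare import download_files

# Download (or reuse) files for version 4 of figshare article 3487685
v_dir = download_files(directory='download', article_id=3487685, version=4)
print(v_dir)  # download/v4
```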
16 changes: 16 additions & 0 deletions cognoml/main.py
@@ -0,0 +1,16 @@
import json

import requests

from cognoml.analysis import classify
from cognoml import utils

if __name__ == '__main__':
    # Create a classifier using mock input. Print output to stdout.
    url = 'https://github.com/cognoma/machine-learning/raw/876b8131bab46878cb49ae7243e459ec0acd2b47/data/api/hippo-input.json'
    response = requests.get(url)
    payload = response.json()
    payload['data_version'] = 4
    results = classify(**payload)
    json_results = json.dumps(results, indent=2, cls=utils.JSONEncoder)
    print(json_results)
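
With the package installed, the mock analysis can presumably be regenerated from the repository root with:

```sh
python cognoml/main.py > data/api/hippo-output.json
```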
132 changes: 132 additions & 0 deletions cognoml/utils.py
@@ -0,0 +1,132 @@
import collections
import itertools
import json

import numpy as np
import pandas as pd
import sklearn.metrics

def cv_results_to_df(cv_results):
    """
    Convert a `sklearn.model_selection.GridSearchCV.cv_results_` attribute to
    a tidy pandas DataFrame where each row is a hyperparameter combination.
    """
    cv_results_df = pd.DataFrame(cv_results)
    columns = [x for x in cv_results_df.columns if x.startswith('param_')]
    columns += ['mean_train_score', 'mean_test_score', 'std_test_score']
    cv_results_df = cv_results_df[columns]
    return cv_results_df

def expand_grid(data_dict):
    """
    Create a dataframe from every combination of given values.
    """
    rows = itertools.product(*data_dict.values())
    grid_df = pd.DataFrame.from_records(rows, columns=data_dict.keys())
    return grid_df

def df_to_datatables(df, double_precision=5, indent=2):
    """
    Convert a pandas dataframe to a JSON object formatted for datatables input.
    """
    dump_str = df.to_json(orient='split', double_precision=double_precision)
    obj = json.loads(dump_str)
    del obj['index']
    obj = collections.OrderedDict(obj)
    obj.move_to_end('data')
    return obj

def json_sanitize(obj, object_pairs_hook=collections.OrderedDict):
    """
    Sanitize an object containing pandas/numpy objects so it's JSON
    serializable. Does not preserve order since `pandas.json.dumps()` does not
    respect OrderedDict objects. Hence, it's recommended to just use the
    builtin `json.dump` function with `cls=JSONEncoder`.
    """
    obj_str = pd.json.dumps(obj)
    obj = json.loads(obj_str, object_pairs_hook=object_pairs_hook)
    return obj

class JSONEncoder(json.JSONEncoder):
    """
    A JSONEncoder that supports numpy types by converting them to standard
    python types.
    """

    def default(self, o):
        if type(o).__module__ == 'numpy':
            return o.item()
        return super().default(o)

def value_map(dictionary, function, *args, **kwargs):
    """
    Edit a dictionary-like object in place to apply a function to its values.
    """
    for key, value in dictionary.items():
        dictionary[key] = function(value, *args, **kwargs)
    return dictionary

def class_metrics(y_true, y_pred):
    metrics = collections.OrderedDict()
    metrics['precision'] = sklearn.metrics.precision_score(y_true, y_pred)
    metrics['recall'] = sklearn.metrics.recall_score(y_true, y_pred)
    metrics['f1'] = sklearn.metrics.f1_score(y_true, y_pred)
    metrics['accuracy'] = sklearn.metrics.accuracy_score(y_true, y_pred)
    # See https://github.com/scikit-learn/scikit-learn/pull/6752
    metrics['balanced_accuracy'] = sklearn.metrics.recall_score(
        y_true, y_pred, pos_label=None, average='macro')
    return metrics

def threshold_metrics(y_true, y_pred):
    metrics = collections.OrderedDict()
    metrics['auroc'] = sklearn.metrics.roc_auc_score(y_true, y_pred)
    metrics['auprc'] = sklearn.metrics.average_precision_score(y_true, y_pred)
    return metrics

def model_info(estimator):
    model = collections.OrderedDict()
    model['class'] = type(estimator).__name__
    model['module'] = estimator.__module__
    model['parameters'] = sort_dict(estimator.get_params())
    return model

def get_feature_df(grid_search, features):
    """
    Return the feature names and coefficients from the final classifier of the
    best pipeline found by GridSearchCV. See https://git.io/vPWLI. This
    function assumes every selection step of the pipeline has a name starting
    with `select`.

    Parameters
    ----------
    grid_search: GridSearchCV object
        A post-fit GridSearchCV object where the estimator is a Pipeline.
    features: list
        initial feature names

    Returns
    -------
    pandas.DataFrame
        Dataframe of feature name and coefficient values
    """
    features = np.array(features)
    pipeline = grid_search.best_estimator_
    for name, transformer in pipeline.steps:
        if name.startswith('select'):
            # Pass feature indexes through the selection step to learn which
            # features survive. transform() returns a (1, n_selected) array.
            X_index = np.arange(len(features)).reshape(1, -1)
            indexes = transformer.transform(X_index).tolist()[0]
            features = features[indexes]
    step_name, classifier = pipeline.steps[-1]
    coefficients, = classifier.coef_
    feature_df = pd.DataFrame.from_items([
        ('feature', features),
        ('coefficient', coefficients),
    ])
    return feature_df

def sort_dict(dictionary):
    """
    Return a dictionary as an OrderedDict sorted by keys.
    """
    items = sorted(dictionary.items())
    return collections.OrderedDict(items)
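
A small sketch (not part of the commit) exercising two of these helpers:

```python
import pandas as pd

from cognoml import utils

df = pd.DataFrame({'feature': ['TP53', 'KRAS'], 'coefficient': [0.12345678, -0.5]})
obj = utils.df_to_datatables(df)
print(list(obj))  # ['columns', 'data'] -- 'index' removed, 'data' moved last
print(utils.sort_dict({'b': 1, 'a': 2}))  # OrderedDict([('a', 2), ('b', 1)])
```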