Create the cognoml package to implement an MVP API (#51)
* Begin constructing an MVP machine learner
* Export JSON API input for Hippo pathway
* Ignore __pycache__
* classify() functioning with mock input
* Save output corresponding to hippo-input.json
  From the `cognoml` directory, ran:
  ```
  python analysis.py > ../data/api/hippo-output.json
  ```
* Export model information to JSON output
  Also filter zero-variance features.
* Return unselected observations
  Unselected observations (samples in the dataset that were not selected by the user) are now returned. These observations receive predictions but are missing (-1 encoded) for fields such as `testing` and `status`. Sorted model parameters by key.
* Save grid_search performance metrics
* Move classifier and pipeline to its own module
* Add setup.py to make the module installable
* Review comments: spacing and results doc
* Check whether pipeline has function before calling
  Meant to address https://git.io/vPvtI
* Acquire data from figshare
* Update for sklearn 0.18.0, fix pipeline
  Fix pipeline according to: scikit-learn/scikit-learn#7536 (comment)
  Extract selected feature names according to: scikit-learn/scikit-learn#7536 (comment)
* Semantic improvements of get_feature_df
* Update API JSON files
* Mention hippo-output-schema.json in docstring
* Address @gwaygenomics review comments
  Does not address "Lasso or Ridge only?"
* Grid search: optimize alpha, not l1_ratio
  Do not optimize `l1_ratio`; instead use the default of 0.15. Search a denser grid for `alpha`. See #56
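For orientation, the MVP entry point is `cognoml.analysis.classify(**payload)`. The contents of `hippo-input.json` are not shown in this diff, but the `classify()` signature implies the payload carries `sample_id`, `mutation_status`, and `data_version`. A minimal sketch, with invented sample IDs purely for illustration:

```python
# Hypothetical payload mirroring the keys classify() expects.
# The sample IDs and statuses below are made up for illustration.
payload = {
    'sample_id': ['TCGA-22-4593-01', 'TCGA-2G-AAEW-01'],  # invented IDs
    'mutation_status': [1, 0],  # 1 = mutated, 0 = wild-type
    'data_version': 4,          # figshare article version to use
}

from cognoml.analysis import classify
results = classify(**payload)
```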
Showing 14 changed files with 157,957 additions and 0 deletions.
`.gitignore`:

```diff
@@ -3,3 +3,7 @@
 # Large downloaded data files
 download/
+
+# Python
+__pycache__/
+*.egg-info/
```
`cognoml/__init__.py` (empty file; the package path is inferred from the imports below).
`cognoml/analysis.py` (new file, 114 lines):

```python
import collections
import os
import warnings

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

from cognoml import utils
from cognoml.classifiers.logistic_regression import grid_search
from cognoml.figshare import download_files

# expression_path = os.path.join('download', 'mutation-matrix.tsv.bz2')
data_directory = "download"


def read_data(version=None):
    """
    Read data.
    """
    v_dir = download_files(directory=data_directory, article_id=3487685, version=version)
    # Read expression data
    path = os.path.join(v_dir, 'expression-matrix.tsv.bz2')
    X = pd.read_table(path, index_col=0)
    return X


def classify(sample_id, mutation_status, data_version, **kwargs):
    """
    Perform an analysis.

    Parameters
    ----------
    sample_id : list
        Sample IDs of the observations.
    mutation_status : list
        Mutation status (0 or 1) of each sample.
    data_version : int
        Version of the figshare expression data article to use.
        Additional keyword arguments are accepted and ignored.

    Returns
    -------
    results : dict
        A JSON-serializable object. OrderedDicts are used for dictionaries to
        enable reproducible export. See `data/api/hippo-output-schema.json`
        for the JSON schema.
    """
    results = collections.OrderedDict()

    obs_df = pd.DataFrame.from_items([
        ('sample_id', sample_id),
        ('status', mutation_status),
    ])

    X_whole = read_data(version=data_version)
    X = X_whole.loc[obs_df.sample_id, :]
    y = obs_df.status

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0, stratify=y)
    obs_df['testing'] = obs_df.sample_id.isin(X_test.index).astype(int)

    grid_search.fit(X=X_train, y=y_train)

    predict_df = pd.DataFrame.from_items([
        ('sample_id', X_whole.index),
        ('predicted_status', grid_search.predict(X_whole)),
    ])
    if hasattr(grid_search, 'decision_function'):
        predict_df['predicted_score'] = grid_search.decision_function(X_whole)
    if hasattr(grid_search, 'predict_proba'):
        predict_df['predicted_prob'] = grid_search.predict_proba(X_whole)[:, 1]

    # obs_df switches to containing non-selected samples
    obs_df = obs_df.merge(predict_df, how='right', sort=True)
    obs_df['selected'] = obs_df.sample_id.isin(sample_id).astype(int)
    for column in 'status', 'testing', 'selected':
        obs_df[column] = obs_df[column].fillna(-1).astype(int)
    obs_train_df = obs_df.query("testing == 0")
    obs_test_df = obs_df.query("testing == 1")

    # y_pred_train = obs_df.query("testing == 0").predicted_score
    # y_pred_test = obs_df.query("testing == 1").predicted_score

    dimensions = collections.OrderedDict()
    dimensions['observations_selected'] = sum(obs_df.selected == 1)
    dimensions['observations_unselected'] = sum(obs_df.selected == 0)
    dimensions['features'] = len(X.columns)
    dimensions['positives'] = sum(obs_df.status == 1)
    dimensions['negatives'] = sum(obs_df.status == 0)
    dimensions['positive_prevalence'] = y.mean().round(5)
    dimensions['training_observations'] = len(obs_train_df)
    dimensions['testing_observations'] = len(obs_test_df)
    results['dimensions'] = utils.value_map(dimensions, round, ndigits=5)

    performance = collections.OrderedDict()
    for part, df in ('training', obs_train_df), ('testing', obs_test_df):
        y_true = df.status
        y_pred = df.predicted_status
        metrics = utils.class_metrics(y_true, y_pred)
        metrics.update(utils.threshold_metrics(y_true, y_pred))
        performance[part] = utils.value_map(metrics, round, ndigits=5)
    performance['cv'] = {'auroc': round(grid_search.best_score_, 5)}
    results['performance'] = performance

    gs = collections.OrderedDict()
    gs['cv_scores'] = utils.cv_results_to_df(grid_search.cv_results_)
    gs = utils.value_map(gs, utils.df_to_datatables)
    results['grid_search'] = gs

    results['model'] = utils.model_info(grid_search.best_estimator_.steps[-1][1])

    feature_df = utils.get_feature_df(grid_search, X.columns)
    results['model']['features'] = utils.df_to_datatables(feature_df)

    results['observations'] = utils.df_to_datatables(obs_df)
    return results
```
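For reference, a schematic of the `results` object assembled above. This shows only the key structure; every value below is a placeholder, not output from a real run:

```python
# Shape of classify()'s return value; all values are placeholders.
results = {
    'dimensions': {'observations_selected': 0, 'observations_unselected': 0,
                   'features': 0, 'positives': 0, 'negatives': 0, '...': 0},
    'performance': {
        'training': {'precision': 0.0, 'recall': 0.0, '...': 0.0},
        'testing': {'...': 0.0},
        'cv': {'auroc': 0.0},
    },
    'grid_search': {'cv_scores': {'columns': ['...'], 'data': [['...']]}},
    'model': {'class': 'SGDClassifier', 'module': '...',
              'parameters': {'...': None}, 'features': {'...': None}},
    'observations': {'columns': ['sample_id', 'status', '...'], 'data': [['...']]},
}
```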
`cognoml/classifiers/__init__.py` (empty file; the path is inferred from the `cognoml.classifiers` import above).
`cognoml/classifiers/logistic_regression.py` (new file, 28 lines):

```python
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline(steps=[
    ('select', VarianceThreshold()),
    ('standardize', StandardScaler()),
    ('classify', SGDClassifier())
])

param_grid = {
    'classify__random_state': [0],
    'classify__class_weight': ['balanced'],
    'classify__loss': ['log'],
    'classify__penalty': ['elasticnet'],
    'classify__alpha': 10.0 ** np.linspace(-3, 1, 10),
    'classify__l1_ratio': [0.15],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    scoring='roc_auc'
)
```
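A minimal smoke test of this pipeline, assuming the sklearn 0.18-era API this commit targets. The data here are synthetic and purely illustrative; the grid search tries the ten `alpha` values above (from 1e-3 to 10) in parallel and keeps the one with the best cross-validated AUROC:

```python
import numpy as np
from cognoml.classifiers.logistic_regression import grid_search

# Synthetic data: 100 samples, 20 features; the label depends on feature 0.
rng = np.random.RandomState(0)
X = rng.randn(100, 20)
y = (X[:, 0] > 0).astype(int)

grid_search.fit(X, y)
print(grid_search.best_params_['classify__alpha'])  # selected elastic-net alpha
print(round(grid_search.best_score_, 3))            # mean CV AUROC
```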
`cognoml/figshare.py` (new file, 62 lines):

```python
import os
from urllib.request import urlretrieve
import json

import requests


def get_article_versions(article_id):
    """
    Get a version_to_url dictionary for a figshare article.
    """
    url = 'https://api.figshare.com/v2/articles/{}/versions'.format(article_id)
    response = requests.get(url)
    version_to_url = {d['version']: d['url'] for d in response.json()}
    return version_to_url


def download_files(directory, article_id=3487685, version=None):
    """
    Download files for a specific figshare article_id and version. Creates a
    version-specific subdirectory in `directory` and downloads all files from
    the figshare article into this subdirectory.

    Parameters
    ----------
    directory : str
        Directory to download files to. Files are written inside a versioned
        directory (e.g. `v2`).
    article_id : int
        article_id on figshare
    version : int or None
        figshare version. `None` means latest.

    Returns
    -------
    str
        The path of the version-specific subdirectory containing the
        downloaded files.
    """
    version_to_url = get_article_versions(article_id)
    if version is None:
        version = max(version_to_url.keys())
    url = version_to_url[version]
    response = requests.get(url)
    article = response.json()
    version_directory = os.path.join(directory, 'v{}'.format(version))

    if not os.path.exists(version_directory):
        os.mkdir(version_directory)

    # Save the article metadata alongside the downloaded files
    path = os.path.join(version_directory, 'info.json')
    with open(path, 'w') as write_file:
        json.dump(article, write_file, indent=2, ensure_ascii=False, sort_keys=True)

    # Download the files specified by the metadata
    for file_info in article['files']:
        name = file_info['name']
        path = os.path.join(version_directory, name)
        if os.path.exists(path):
            continue
        url = file_info['download_url']
        print('Downloading {} to `{}`'.format(url, name))
        urlretrieve(url, path)

    return version_directory
```
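With the default article, a call like the sketch below fetches the latest version into a `download/vN` subdirectory. Note that `os.mkdir` only creates the leaf directory, so the parent `download` directory must already exist:

```python
from cognoml.figshare import download_files

# Downloads the article's files (e.g. expression-matrix.tsv.bz2) and
# writes info.json into a versioned subdirectory.
v_dir = download_files(directory='download', article_id=3487685)
print(v_dir)  # e.g. 'download/v4'; the number depends on the latest release
```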
Command-line entry point (new file, 16 lines; the file path is not shown in this capture):

```python
import json

import requests

from cognoml.analysis import classify
from cognoml import utils

if __name__ == '__main__':
    # Create a classifier using mock input. Print output to stdout.
    url = 'https://github.com/cognoma/machine-learning/raw/876b8131bab46878cb49ae7243e459ec0acd2b47/data/api/hippo-input.json'
    response = requests.get(url)
    payload = response.json()
    payload['data_version'] = 4
    results = classify(**payload)
    json_results = json.dumps(results, indent=2, cls=utils.JSONEncoder)
    print(json_results)
```
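Per the commit message, the committed `data/api/hippo-output.json` was produced by redirecting this script's stdout from the `cognoml` directory:

```
python analysis.py > ../data/api/hippo-output.json
```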
`cognoml/utils.py` (new file, 132 lines):

```python
import collections
import itertools
import json

import numpy as np
import pandas as pd
import sklearn.metrics


def cv_results_to_df(cv_results):
    """
    Convert a `sklearn.model_selection.GridSearchCV.cv_results_` attribute to
    a tidy pandas DataFrame where each row is a hyperparameter combination.
    """
    cv_results_df = pd.DataFrame(cv_results)
    columns = [x for x in cv_results_df.columns if x.startswith('param_')]
    columns += ['mean_train_score', 'mean_test_score', 'std_test_score']
    cv_results_df = cv_results_df[columns]
    return cv_results_df


def expand_grid(data_dict):
    """
    Create a dataframe from every combination of given values.
    """
    rows = itertools.product(*data_dict.values())
    grid_df = pd.DataFrame.from_records(rows, columns=data_dict.keys())
    return grid_df


def df_to_datatables(df, double_precision=5, indent=2):
    """
    Convert a pandas dataframe to a JSON object formatted for datatables input.
    """
    dump_str = df.to_json(orient='split', double_precision=double_precision)
    obj = json.loads(dump_str)
    del obj['index']
    obj = collections.OrderedDict(obj)
    obj.move_to_end('data')
    return obj


def json_sanitize(obj, object_pairs_hook=collections.OrderedDict):
    """
    Sanitize an object containing pandas/numpy objects so it's JSON
    serializable. Does not preserve order since `pandas.json.dumps()` does not
    respect OrderedDict objects. Hence, it's recommended to just use the
    builtin `json.dump` function with `cls=JSONEncoder`.
    """
    obj_str = pd.json.dumps(obj)
    obj = json.loads(obj_str, object_pairs_hook=object_pairs_hook)
    return obj


class JSONEncoder(json.JSONEncoder):
    """
    A JSONEncoder that supports numpy types by converting them to standard
    python types.
    """

    def default(self, o):
        if type(o).__module__ == 'numpy':
            return o.item()
        return super().default(o)


def value_map(dictionary, function, *args, **kwargs):
    """
    Edits a dictionary-like object in place to apply a function to its values.
    """
    for key, value in dictionary.items():
        dictionary[key] = function(value, *args, **kwargs)
    return dictionary


def class_metrics(y_true, y_pred):
    metrics = collections.OrderedDict()
    metrics['precision'] = sklearn.metrics.precision_score(y_true, y_pred)
    metrics['recall'] = sklearn.metrics.recall_score(y_true, y_pred)
    metrics['f1'] = sklearn.metrics.f1_score(y_true, y_pred)
    metrics['accuracy'] = sklearn.metrics.accuracy_score(y_true, y_pred)
    # See https://github.com/scikit-learn/scikit-learn/pull/6752
    metrics['balanced_accuracy'] = sklearn.metrics.recall_score(
        y_true, y_pred, pos_label=None, average='macro')
    return metrics


def threshold_metrics(y_true, y_pred):
    metrics = collections.OrderedDict()
    metrics['auroc'] = sklearn.metrics.roc_auc_score(y_true, y_pred)
    metrics['auprc'] = sklearn.metrics.average_precision_score(y_true, y_pred)
    return metrics


def model_info(estimator):
    model = collections.OrderedDict()
    model['class'] = type(estimator).__name__
    model['module'] = estimator.__module__
    model['parameters'] = sort_dict(estimator.get_params())
    return model


def get_feature_df(grid_search, features):
    """
    Return the feature names and coefficients from the final classifier of the
    best pipeline found by GridSearchCV. See https://git.io/vPWLI. This
    function assumes every selection step of the pipeline has a name starting
    with `select`.

    Params
    ------
    grid_search: GridSearchCV object
        A post-fit GridSearchCV object where the estimator is a Pipeline.
    features: list
        initial feature names

    Returns
    -------
    pandas.DataFrame
        Dataframe of feature name and coefficient values
    """
    features = np.array(features)
    pipeline = grid_search.best_estimator_
    for name, transformer in pipeline.steps:
        if name.startswith('select'):
            X_index = np.arange(len(features)).reshape(1, -1)
            indexes = transformer.transform(X_index).tolist()
            features = features[indexes]
    step_name, classifier = pipeline.steps[-1]
    coefficients, = classifier.coef_
    feature_df = pd.DataFrame.from_items([
        ('feature', features),
        ('coefficient', coefficients),
    ])
    return feature_df


def sort_dict(dictionary):
    """
    Return a dictionary as an OrderedDict sorted by keys.
    """
    items = sorted(dictionary.items())
    return collections.OrderedDict(items)
```
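The datatables helper is easiest to see on a tiny frame. A quick round trip, where the gene names are invented for illustration:

```python
import pandas as pd
from cognoml import utils

# Hypothetical two-feature coefficient table.
df = pd.DataFrame({'feature': ['TP53', 'KRAS'],
                   'coefficient': [0.123456, -0.5]},
                  columns=['feature', 'coefficient'])
obj = utils.df_to_datatables(df)
# obj is an OrderedDict with 'columns' first and 'data' moved to the end;
# values are rounded to double_precision=5 digits:
# OrderedDict([('columns', ['feature', 'coefficient']),
#              ('data', [['TP53', 0.12346], ['KRAS', -0.5]])])
```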