Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ukbb mri neck to knee ozery #155

Open
wants to merge 44 commits into
base: ukbb_mri_neck_to_knee
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
b5f0f52
when reading features csv file, the file columns is now of the form "…
michalozeryflato Jul 26, 2022
ed45421
moving generation of explainability maps to a new method & running_mo…
michalozeryflato Jul 27, 2022
14d9ca3
update num workers
Jul 27, 2022
4bd7b3b
saving volume box of max attention
Jul 27, 2022
534ae20
made model wrapper more general and with documentation
Jul 28, 2022
bbf20e0
added types to wrapper
Jul 28, 2022
4e9c6d6
modified to find best bounding box
Jul 28, 2022
5073ae6
need to find other solution for multi maximums in 3d
Jul 28, 2022
d62c71a
supporting additional running configurations
michalozeryflato Jul 28, 2022
adef979
Merge branch 'ukbb_mri_neck_to_knee_ozery' of github.com:IBM/fuse-med…
michalozeryflato Jul 28, 2022
8ac2e98
removing config.yaml
michalozeryflato Jul 28, 2022
2f717df
changing file to file_pattern
michalozeryflato Jul 30, 2022
21b7a54
adding men_no_neoplasms and men_cancer_genital
michalozeryflato Aug 1, 2022
808bd65
draw several bbox, find best to center works
Aug 1, 2022
f861318
Merge branch 'ukbb_mri_neck_to_knee_ozery' of https://github.com/IBM/…
Aug 1, 2022
7540642
move ukbb req to examples
Aug 1, 2022
6d2aea6
change req
Aug 1, 2022
71f72b2
merge req
Aug 1, 2022
ba5d4a9
merge req
Aug 1, 2022
6696e34
resolved merge conlict
Aug 1, 2022
9abab55
cohort_seleciton: adding filter_out to cohort definition, writing to log
michalozeryflato Aug 1, 2022
3c18219
Merge branch 'ukbb_mri_neck_to_knee_ozery' of github.com:IBM/fuse-med…
michalozeryflato Aug 1, 2022
845a9c4
cosmetic changes in cohort_and_label_def.py
michalozeryflato Aug 1, 2022
d824738
minor change in runner
michalozeryflato Aug 1, 2022
70256e0
runner: write cohort def to log
michalozeryflato Aug 1, 2022
7b66562
supporting 'resume_from_checkpoint' in runner.py
michalozeryflato Aug 2, 2022
da36d80
Merge branch 'ukbb_mri_neck_to_knee' of github.com:IBM/fuse-med-ml in…
michalozeryflato Aug 2, 2022
7f1a6d0
add kidney cancer experiment
michalozeryflato Aug 2, 2022
1515eb2
revising the definition of cohorts
michalozeryflato Aug 5, 2022
a7ea109
revise cohort and target definition
michalozeryflato Aug 6, 2022
2801f7a
minor changes
michalozeryflato Aug 6, 2022
a91487f
adding optional evaluation of subgoups
michalozeryflato Aug 7, 2022
515e89c
fix file path
Aug 8, 2022
c54bd1a
merge
michalozeryflato Aug 8, 2022
71bd518
create file for creating cohort for genetic features
michalozeryflato Aug 9, 2022
a160691
added op for volume around point, explain print point
Aug 9, 2022
cbcf1f4
Merge branch 'ukbb_mri_neck_to_knee_ozery' of https://github.com/IBM/…
Aug 9, 2022
85739ad
added support for centerpoints inference and usage, fixed warning in …
Aug 10, 2022
0c87e6f
added parallel support for explain
Aug 10, 2022
8aec7ea
split explain to new file
Aug 10, 2022
2a72c8a
fix support batching, data parallel still a problem
Aug 11, 2022
0d5ff75
fixed problem with non caching and bad sample
Aug 15, 2022
05fa933
ipdate explain
Aug 25, 2022
7098a04
updated to new UKBB op and run explain
Sep 7, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

import typing
from typing import Optional

import numpy as np
Expand All @@ -7,55 +7,58 @@
from fuse.utils import NDict


def get_samples_for_cohort(cohort_config: NDict, clinical_data_file:str, seed:Optional[int]=222):
df = pd.read_csv(clinical_data_file)
sample_ids = df['file_pattern'].values
selected = np.zeros(df.shape[0], dtype=bool)
group_ids = cohort_config['group_ids']
def get_samples_for_cohort(cohort_config: NDict, var_namespace:typing.Dict, seed:Optional[int]=222, lgr=None):

def write_log_info(s):
if lgr is not None:
lgr.info(s)

max_group_size = cohort_config[ 'max_group_size']
max_group_size = None if max_group_size <= 0 else max_group_size
np.random.seed(seed)
for group_id in group_ids:
if group_id == 'all':
group_filter = np.ones(df.shape[0], dtype=bool)
elif group_id == 'men':
group_filter = df['is female']==0
elif group_id == 'men_no_cancer':
group_filter = (df['is female'] == 0) & (df['preindex cancer'] == 0)
elif group_id == 'men_no_neoplasms':
group_filter = (df['is female'] == 0) & (df['preindex neoplasms'] == 0)
elif group_id == 'men_prostate_cancer':
group_filter = (df['is female'] == 0) & (df['preindex prostate cancer'] >0)
elif group_id == 'men_prostate_cancer_no_prostatectomy':
group_filter = (df['is female'] == 0) & (df['preindex prostate cancer'] >0) & (df['preindex prostatectomy'] == 0)
elif group_id == 'men_prostatectomy':
group_filter = (df['is female'] == 0) & (df['preindex prostatectomy']>0)
elif group_id == 'men_no_prostatectomy':
group_filter = (df['is female'] == 0) & (df['preindex prostatectomy'] == 0)
elif group_id == 'men_cancer_genital':
group_filter = (df['is female'] == 0) & (df['blocks preindex C60-C63 Malignant neoplasms of male genital organs']>0)
else:
raise NotImplementedError(group_id)

selected = eval(cohort_config['inclusion'], {}, var_namespace)

y = var_namespace[cohort_config['group_id_vec']]
y_vals = np.unique(y)

n = 0

for y_val in y_vals:
group_filter = (y == y_val) & selected
group_size = group_filter.sum()

write_log_info(f'target={y_val} size={group_size}')

if max_group_size is not None and group_size > max_group_size:
all_indexes = np.where(group_filter)[0]
rand_perm = np.random.permutation(group_size)
n_remove = group_size -max_group_size
indexes_to_remove = all_indexes[rand_perm[:n_remove]]
assert np.all(group_filter[indexes_to_remove])
selected[indexes_to_remove] = False
group_filter[indexes_to_remove] = False
assert np.sum(group_filter) == max_group_size
print( group_id, "size:", group_size, "=>", max_group_size, "First removed index=", indexes_to_remove[0])
else:
print(group_id, "size:", group_size)
selected |= group_filter
write_log_info( f"target={y_val} size: {group_size} => {max_group_size}, First removed index= {indexes_to_remove[0]}")
n += np.sum(group_filter)
print("cohort size=", np.sum(selected))
return sample_ids[selected].tolist()
assert np.sum(selected) == n
return var_namespace[cohort_config['sample_id_col']][selected].tolist()



def get_clinical_vars_namespace(df, columns_to_add: typing.Optional[typing.Mapping[str, str]]) -> typing.Dict[str, typing.Any]:
    """Build a name -> column-values mapping usable as an ``eval`` namespace.

    Every column of ``df`` is exposed under a sanitised name in which spaces,
    commas and hyphens are each replaced by an underscore (for example
    ``"is female"`` becomes ``is_female``). If two columns sanitise to the
    same name, the later one in ``df.columns`` wins.

    Derived columns are then evaluated in the iteration order of
    ``columns_to_add``; each expression may refer to the raw columns and to
    any derived column defined before it. A derived column whose name matches
    an existing entry replaces that entry.

    :param df: table whose columns seed the namespace (``df[col].values`` is stored).
    :param columns_to_add: mapping of new variable name -> Python expression
        string, or ``None`` / empty when there is nothing to derive.
    :return: dict mapping variable names to column values / evaluated results.
    """
    # One translation pass instead of three chained ``str.replace`` calls.
    sanitize = str.maketrans(" ,-", "___")
    var_namespace = {col.translate(sanitize): df[col].values for col in df.columns}

    # An empty ``columns_to_add:`` key in a YAML config arrives here as None.
    if columns_to_add is None:
        return var_namespace

    for col_name, col_expression in columns_to_add.items():
        # SECURITY: ``eval`` executes arbitrary Python. The expressions must come
        # from a trusted configuration file, never from external/user input.
        var_namespace[col_name] = eval(col_expression, {}, var_namespace)

    return var_namespace

def get_class_names(label_type:str):
#todo: need to revisit. define class names in config
if label_type == "classification":
class_names = ["Male", "Female","Male-prostate-excision"]
elif label_type == "is female":
Expand All @@ -67,5 +70,5 @@ def get_class_names(label_type:str):
elif label_type == "preindex cancer":
class_names = ["no-cancer", "cancer"]
else:
raise NotImplementedError("unsuported target!!")
class_names = [f'no {label_type}', label_type]
return class_names

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
columns_to_add:
women: is_female > 0
men: is_female == 0
preindex_cancer_prostate : preindex_C61_Malignant_neoplasm_of_prostate > 0
postindex_cancer_prostate : postindex_C61_Malignant_neoplasm_of_prostate > 0
prostate_hyperplasia_preindex : preindex_prostate_hyperplasia > 0
prepostindex_cancer_prostate: preindex_cancer_prostate | postindex_cancer_prostate
preindex_cancer_male_genital : preindex_C60_C63_Malignant_neoplasms_of_male_genital_organs > 0
postindex_cancer_male_genital : postindex_C60_C63_Malignant_neoplasms_of_male_genital_organs > 0
preindex_cancer_urinary_tract : preindex_C64_C68_Malignant_neoplasms_of_urinary_tract > 0
postindex_cancer_urinary_tract : postindex_C64_C68_Malignant_neoplasms_of_urinary_tract > 0
preindex_kidney_cancer : preindex_C64_Malignant_neoplasm_of_kidney__except_renal_pelvis > 0
postindex_kidney_cancer : postindex_C64_Malignant_neoplasm_of_kidney__except_renal_pelvis > 0
cancer_prostate_preindex_postindex: preindex_cancer_prostate | postindex_cancer_prostate
cohort_men_with_prostate: men & (preindex_prostatectomy==0)
cohort_cancer_urinary_tract_detection: (preindex_neoplasms==0) | preindex_cancer_urinary_tract # contains kidney cancer
cohort_cancer_male_genital_detection: men & (preindex_prostatectomy==0) & (preindex_cancer_male_genital | (preindex_neoplasms==0))
cohort_cancer_prostate_preindex: men & (preindex_prostatectomy==0) & (preindex_cancer_prostate | (preindex_neoplasms==0))
cohort_cancer_prostate_prepostindex: men & (preindex_prostatectomy==0) & (preindex_cancer_prostate | postindex_cancer_prostate | (preindex_neoplasms==0))
cohort_cancer_prostate_postindex: men & (preindex_prostatectomy==0) & (preindex_neoplasms==0)

max_group_size : 1000

# target: preindex_prostatectomy
# cohort : men

target: prostate_hyperplasia_preindex
cohort: cohort_men_with_prostate

# target: preindex_cancer_prostate
# cohort: cohort_cancer_prostate_preindex

# target: postindex_cancer_prostate
# cohort : cohort_cancer_prostate_postindex

# target: prepostindex_cancer_prostate
# cohort : cohort_cancer_prostate_prepostindex



label : ${cohort}_${max_group_size}_${target}
sample_id_col : file_pattern
paths:
my_ukbb_dir: /dccstor/mm_hcls/usr/${oc.env:USER}/fuse_example_runs/ukbb_mri_body/ #/projects/msieve_dev3/usr/${oc.env:USER}/fuse_examples/ukbb #
data_dir: ${oc.env:UKBB_MRI_BODY_DATA_PATH}
cache_dir: ${paths.my_ukbb_dir}/cache
model_dir : ${paths.my_ukbb_dir}/model_${label}
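data_split_filename: ${paths.my_ukbb_dir}/ukbb_split_${label}.pkl # NOTE(review): the file name only encodes ${label} (cohort name, max_group_size, target); if a cohort's expression changes while its name stays the same, an existing split file may no longer match the cohort - confirm how the runner handles this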
clinical_data_file: ${paths.data_dir}/body_clinical_data_v3.1.csv #label_prostatectomy_v5_331.csv
data_misc_dir: ${paths.my_ukbb_dir}/data_misc
inference_dir : ${paths.model_dir}/infer_dir
eval_dir : ${paths.model_dir}/eval_dir
sample_ids : ${paths.data_misc_dir}/sample_ids.csv
run:
running_modes : [train, infer,eval] #[train, infer, eval, explain]
train:
cos:
bucket_name: body-mri-data
credentials_file: ${paths.data_dir}/cos_body.json
fields: [ 20201 ]
columns_to_add: ${columns_to_add}
target: ${target}
cohort:
inclusion: ${cohort}
# filter_out: women
group_id_vec: ${target}
max_group_size: ${max_group_size}
sample_id_col: ${sample_id_col}
series_config:
series: Dixon_BH_17s_W
station: 4
reset_cache: False
num_workers : 12
num_folds : 5
train_folds : [0,1,2]
validation_folds : [3]
batch_size: 4
learning_rate : 1e-4
weight_decay : 0
resume_checkpoint_filename :
trainer:
accelerator : gpu
devices : 1
num_epochs : 20
ckpt_path :
infer:
infer_filename : validation_set_infer.gz
checkpoint : best_epoch.ckpt
infer_folds : [4]
target : ${target}
columns_to_add: ${columns_to_add}
num_workers : 12
eval:
cohorts: #[cohort_cancer_prostate_preindex, cohort_cancer_prostate_postindex]
columns_to_add: ${columns_to_add}
sample_id_col: ${sample_id_col}
explain:
sample_ids : ${paths.data_misc_dir}/sample_ids.csv
attention_dir : attention
centerpoints_dir_name : centerpoints
infer_filename : validation_set_infer.gz
checkpoint : best_epoch.ckpt
target : ${target}
columns_to_add: ${columns_to_add}
num_workers : 12
debug: False
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from hydra import compose, initialize
from fuse.utils import file_io
import numpy as np
import pandas as pd


def _build_patient_split_table(folds, num_folds, train_folds):
    """Return a DataFrame with one row per patient: ``eid`` and an ``is_test`` flag."""
    train_folds = set(train_folds)
    df_list = []
    for fold in range(num_folds):
        # The patient id is taken to be the part of a sample id before the first
        # underscore. Sorting the de-duplicated ids keeps the output row order
        # reproducible (plain set iteration order varies between runs).
        patient_ids = sorted({s.split('_')[0] for s in folds[fold]})
        df = pd.DataFrame({'eid': patient_ids})
        df['is_test'] = 0 if fold in train_folds else 1
        df_list.append(df)

    df_all = pd.concat(df_list, axis=0)
    # A patient appearing in more than one fold would leak between train and test.
    if df_all.eid.nunique() != df_all.shape[0]:
        raise ValueError("some patients appear in more than one fold")
    return df_all


def main(cfg_overrides=None, num_folds=5, train_folds=(0, 1, 2)):
    """Export the patient-level train/test assignment of a data split to CSV.

    Loads the pickled split referenced by ``paths.data_split_filename`` in the
    composed Hydra config, reduces sample ids to unique patient ids, marks
    patients from ``train_folds`` with ``is_test=0`` and all others with
    ``is_test=1``, and writes the table next to the split file with the suffix
    ``_for_genetics.csv``.

    :param cfg_overrides: Hydra override strings; defaults to the prostate
        cancer pre/post-index cohort used for the genetic features.
    :param num_folds: number of folds the split file is expected to contain.
    :param train_folds: fold indices whose patients are flagged as train.
    :raises ValueError: if the split does not contain ``num_folds`` folds, or
        if a patient id occurs in more than one fold.
    """
    import os  # local import: only needed to derive the output path

    if cfg_overrides is None:
        cfg_overrides = ['target=prepostindex_cancer_prostate', 'cohort=cohort_cancer_prostate_prepostindex', 'max_group_size=1000']

    initialize(config_path="conf", job_name="test_app")  # only allows relative path
    cfg = compose(config_name="config", overrides=cfg_overrides)

    input_split_file = cfg["paths"]["data_split_filename"]
    print("using split file", input_split_file)
    # Swap only the extension. ``str.replace(".pkl", ...)`` would return the input
    # path unchanged when ".pkl" is absent, and the CSV would then overwrite the split.
    output_file = os.path.splitext(input_split_file)[0] + "_for_genetics.csv"

    folds = file_io.load_pickle(input_split_file)
    if len(folds) != num_folds:
        raise ValueError(f"expected {num_folds} folds in {input_split_file}, found {len(folds)}")
    print(f"Using patients in folds {sorted(train_folds)} as train")

    df_all = _build_patient_split_table(folds, num_folds, train_folds)
    df_all.to_csv(output_file, index=False)
    print("wrote", output_file)

    # Read the file back as a sanity check and report the size of each group.
    df_test = pd.read_csv(output_file)
    print(df_test.groupby('is_test').count())


if __name__ == '__main__':
    main()
Loading