From b5f0f529a43eb73cd7a644cfa268d6ce4ca478b1 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Tue, 26 Jul 2022 13:53:59 +0300 Subject: [PATCH 01/38] when reading features csv file, the file columns is now of the form "eid_*__0", since in kidney MRI there could be 2 zip files per patient --- fuseimg/datasets/ukbb_neck_to_knee.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fuseimg/datasets/ukbb_neck_to_knee.py b/fuseimg/datasets/ukbb_neck_to_knee.py index 271b2dcab..b6722ac98 100644 --- a/fuseimg/datasets/ukbb_neck_to_knee.py +++ b/fuseimg/datasets/ukbb_neck_to_knee.py @@ -30,7 +30,7 @@ import shutil import skimage import skimage.transform -import sys +from glob import glob from matplotlib import pyplot as plt def dump(img, filename, slice): @@ -67,7 +67,10 @@ def __call__(self, sample_dict: NDict, series : str , station : int, key_in:str, ''' scans = [] - zip_filename = os.path.join(self._dir_path,sample_dict[key_in]) + zip_filenames = glob(os.path.join(self._dir_path,sample_dict[key_in])) + if len(zip_filenames) >1: + raise NotImplementedError(f"{sample_dict[key_in]} has more then one match. Currently not supported") + zip_filename = zip_filenames[0] try: zip_file = zipfile.ZipFile(zip_filename) except: @@ -210,11 +213,15 @@ def dataset( :param is_female filter only male/females from database :return: DatasetDefault object """ - if is_female == None: - all_sample_ids = [file for file in os.listdir(data_dir) if '.zip' in file] - else: - all_sample_ids = [file for file in os.listdir(data_dir) if '.zip' in file] - all_sample_ids = list(set(input_source_gt[input_source_gt['is female'] == is_female][input_source_gt['file'].isin(all_sample_ids)]['file'].to_list())) + + existing_files = [file for file in os.listdir(data_dir) if '.zip' in file] + existing_sample_id_fields = [f.split("_") for f in existing_files] + existing_sample_ids = set([a[0] + "_*_" + a[2] + "_" + a[3] for a in existing_sample_id_fields]) + a_filter = input_source_gt['file'].isin(existing_sample_ids) + if is_female is not None: + a_filter &= input_source_gt['is female'] == is_female + + all_sample_ids = list(set(input_source_gt[a_filter]['file'].to_list())) if sample_ids is None: sample_ids = all_sample_ids From ed4542159a6aa816910aa3411106c6a6bd6a4797 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Wed, 27 Jul 2022 11:17:48 +0300 Subject: [PATCH 02/38] moving generation of explainability maps to a new method & running_mode (="explain") --- .../classification/ukbb_prostate/runner.py | 91 ++++++++++++------- 1 file changed, 60 insertions(+), 31 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index c000ada69..a709b6f99 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -268,30 +268,30 @@ def run_infer(train : NDict, paths : NDict , infer: NDict): fuse_logger_start(output_path=paths["inference_dir"], console_verbose_level=logging.INFO) lgr = logging.getLogger('Fuse') lgr.info('Fuse Inference', {'attrs': ['bold', 'underline']}) + + pl_module, pl_trainer, infer_dataloader = load_model_and_test_data(train, paths, infer) + infer_file = os.path.join(paths['inference_dir'], infer['infer_filename']) - checkpoint_file = os.path.join(paths["model_dir"], infer["checkpoint"]) - lgr.info(f'infer_filename={checkpoint_file}', {'color': 'magenta'}) - lgr.info('Model:', {'attrs': 'bold'}) + pl_module.set_predictions_keys(['model.output.head_0', 'data.gt.classification']) # which keys to extract and dump into file + # create a trainer instance + predictions = pl_trainer.predict(pl_module, infer_dataloader, return_predictions=True) - model, pl_trainer, num_classes, gt_label , class_names = create_model(train, paths) - lgr.info('Model: Done', {'attrs': 'bold'}) - ## Data - folds = load_pickle(os.path.join( paths["data_misc_dir"], paths["data_split_filename"])) # assume exists and created in train func + # convert list of batch outputs into a dataframe + infer_df = convert_predictions_to_dataframe(predictions) + save_dataframe(infer_df, infer_file) + +###################################### +# Explain Template +###################################### +def run_explain(train : NDict, paths : NDict, infer: NDict): + fuse_logger_start(output_path=None, console_verbose_level=logging.INFO) + lgr = logging.getLogger('Fuse') + lgr.info('Fuse Explain', {'attrs': ['bold', 'underline']}) + + pl_module, _, infer_dataloader = load_model_and_test_data(train, paths, infer) - infer_sample_ids = [] - for fold in infer["infer_folds"]: - infer_sample_ids += folds[fold] - input_source_gt = pd.read_csv(paths["gt_file"]) - test_dataset = UKBB.dataset(paths["data_dir"], infer['target'], input_source_gt, paths["cache_dir"], sample_ids=infer_sample_ids, train=False , is_female = train["is_female"]) - ## Create dataloader - infer_dataloader = DataLoader(dataset=test_dataset, - shuffle=False, drop_last=False, - collate_fn=CollateDefault(), - num_workers=infer["num_workers"]) - # load python lightning module - pl_module = LightningModuleDefault.load_from_checkpoint(checkpoint_file, model_dir=paths["model_dir"], model=model, map_location="cpu", strict=True) model = ModelWrapDictToSeq(pl_module._model) model = medcam.inject(model, output_dir="attention_maps", backend='gcam', save_maps=True, layer='auto',return_attention=True) for i, batch in enumerate(infer_dataloader): @@ -305,16 +305,39 @@ def run_infer(train : NDict, paths : NDict , infer: NDict): nib.save(original, filename=os.path.join('attention_maps','original_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) nib.save(attention_map, filename=os.path.join('attention_maps','attention_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) - # lgr.info(f'Test Data: Done', {'attrs': 'bold'}) - # #set the prediction keys to extract (the ones used be the evaluation function). - # pl_module.set_predictions_keys(['model.output.head_0', 'data.gt.classification']) # which keys to extract and dump into file - # # create a trainer instance - # predictions = pl_trainer.predict(pl_module, infer_dataloader, return_predictions=True) - # convert list of batch outputs into a dataframe - # infer_df = convert_predictions_to_dataframe(predictions) - # save_dataframe(infer_df, infer_file) - + +def load_model_and_test_data(train : NDict, paths : NDict, infer: NDict): + lgr = logging.getLogger('Fuse') + + checkpoint_file = os.path.join(paths["model_dir"], infer["checkpoint"]) + lgr.info(f'checkpoint_file={checkpoint_file}', {'color': 'magenta'}) + + # load model + lgr.info('Model:', {'attrs': 'bold'}) + model, pl_trainer, num_classes, gt_label, class_names = create_model(train, paths) + lgr.info('Model: Done', {'attrs': 'bold'}) + + ## Data + folds = load_pickle(os.path.join(paths["data_misc_dir"], paths["data_split_filename"])) # assume exists and created in train func + + infer_sample_ids = [] + for fold in infer["infer_folds"]: + infer_sample_ids += folds[fold] + input_source_gt = pd.read_csv(paths["gt_file"]) + test_dataset = UKBB.dataset(paths["data_dir"], infer['target'], input_source_gt, paths["cache_dir"], sample_ids=infer_sample_ids, train=False, + is_female=train["is_female"]) + + ## Create dataloader + infer_dataloader = DataLoader(dataset=test_dataset, + shuffle=False, drop_last=False, + collate_fn=CollateDefault(), + num_workers=infer["num_workers"]) + + pl_module = LightningModuleDefault.load_from_checkpoint(checkpoint_file, model_dir=paths["model_dir"], model=model, map_location="cpu", strict=True) + + return pl_module, pl_trainer, infer_dataloader + def show_attention_on_image(img: np.ndarray, mask: np.ndarray, colormap: int = cv2.COLORMAP_JET) -> np.ndarray: @@ -346,7 +369,7 @@ def show_attention_on_image(img: np.ndarray, ###################################### # Analyze Template ###################################### -def run_eval(paths : NDict , infer: NDict): +def run_eval(paths : NDict, infer: NDict): fuse_logger_start(output_path=None, console_verbose_level=logging.INFO) lgr = logging.getLogger('Fuse') lgr.info('Fuse Eval', {'attrs': ['bold', 'underline']}) @@ -388,13 +411,19 @@ def main(cfg : DictConfig) -> None: else: assert "Expecting train mode to be set." - # infer + + # infer (infer set) if 'infer' in cfg["run.running_modes"]: run_infer(cfg["train"], cfg["paths"] , cfg["infer"]) # - # analyze + # evaluate (infer set) if 'eval' in cfg["run.running_modes"]: run_eval(cfg["paths"] ,cfg["infer"]) + + # explain (infer set) + if 'explain' in cfg["run.running_modes"]: + run_explain(cfg["train"], cfg["paths"], cfg["infer"]) + if __name__ == "__main__": sys.argv.append('hydra.run.dir=working_dir') main() From 14d9ca3ebfc4f72a59af41ffb7db3ed3062186f8 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Wed, 27 Jul 2022 14:42:44 +0300 Subject: [PATCH 03/38] update num workers --- .../imaging/classification/ukbb_prostate/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index a709b6f99..ca9e9946e 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -325,7 +325,7 @@ def load_model_and_test_data(train : NDict, paths : NDict, infer: NDict): for fold in infer["infer_folds"]: infer_sample_ids += folds[fold] input_source_gt = pd.read_csv(paths["gt_file"]) - test_dataset = UKBB.dataset(paths["data_dir"], infer['target'], input_source_gt, paths["cache_dir"], sample_ids=infer_sample_ids, train=False, + test_dataset = UKBB.dataset(paths["data_dir"], infer['target'], input_source_gt, paths["cache_dir"], num_workers = infer['num_workers'], sample_ids=infer_sample_ids, train=False, is_female=train["is_female"]) ## Create dataloader From 4bd7b3bed7173f850050b7d8208fc8d235a68380 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Wed, 27 Jul 2022 19:03:56 +0300 Subject: [PATCH 04/38] saving volume box of max attention --- .../classification/ukbb_prostate/runner.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index ca9e9946e..444005025 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -296,13 +296,22 @@ def run_explain(train : NDict, paths : NDict, infer: NDict): model = medcam.inject(model, output_dir="attention_maps", backend='gcam', save_maps=True, layer='auto',return_attention=True) for i, batch in enumerate(infer_dataloader): logit, attention_map = model(batch['data.input.img'],batch['data.gt.classification']) - max_volume = np.unravel_index(attention_map.argmax(), attention_map.shape) - print(i,max_volume) + attention_map = attention_map[0][0].numpy() batch['data.input.img'] = batch['data.input.img'][0][0].numpy() - attention_map = show_attention_on_image(batch['data.input.img'],attention_map[0][0].numpy()) + original_attention_map = nib.load(os.path.join('attention_maps','model.backbone.layer4','attention_map_'+str(i)+'_0_0.nii.gz')).get_fdata() + scale_ratio = [batch['data.input.img'].shape[i]/value for i,value in enumerate(original_attention_map.shape)] + max_volume = np.unravel_index(original_attention_map.argmax(), original_attention_map.shape) + bouding_box_indices = [(int((max_volume[i]-1)*scale_ratio[i]),int((max_volume[i]+1)*scale_ratio[i])+1) for i in range(3)] + print(scale_ratio) + print(i,max_volume) + print(bouding_box_indices) + volume_box = batch['data.input.img'][bouding_box_indices[0][0]:bouding_box_indices[0][1],bouding_box_indices[1][0]:bouding_box_indices[1][1],bouding_box_indices[2][0]:bouding_box_indices[2][1]] + attention_map = show_attention_on_image(batch['data.input.img'],attention_map) batch['data.input.img'] = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) original = nib.Nifti1Image(batch['data.input.img'], affine=np.eye(4)) + volume_box = nib.Nifti1Image(volume_box, affine=np.eye(4)) nib.save(original, filename=os.path.join('attention_maps','original_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) + nib.save(volume_box, filename=os.path.join('attention_maps','maxvolume_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) nib.save(attention_map, filename=os.path.join('attention_maps','attention_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) From 534ae20b2cfb0495360461c6e3e6a9bfad5e6d07 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Thu, 28 Jul 2022 09:08:20 +0300 Subject: [PATCH 05/38] made model wrapper more general and with documentation --- .../imaging/classification/ukbb_prostate/runner.py | 2 +- fuse/dl/models/model_wrapper.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 444005025..32443421f 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -292,7 +292,7 @@ def run_explain(train : NDict, paths : NDict, infer: NDict): pl_module, _, infer_dataloader = load_model_and_test_data(train, paths, infer) - model = ModelWrapDictToSeq(pl_module._model) + model = ModelWrapDictToSeq(pl_module._model, output_key = 'head_0') model = medcam.inject(model, output_dir="attention_maps", backend='gcam', save_maps=True, layer='auto',return_attention=True) for i, batch in enumerate(infer_dataloader): logit, attention_map = model(batch['data.input.img'],batch['data.gt.classification']) diff --git a/fuse/dl/models/model_wrapper.py b/fuse/dl/models/model_wrapper.py index eabd48f7c..ab25d7d8c 100644 --- a/fuse/dl/models/model_wrapper.py +++ b/fuse/dl/models/model_wrapper.py @@ -95,12 +95,15 @@ class ModelWrapDictToSeq(torch.nn.Module): def __init__(self, fuse_model): super().__init__() self.model = fuse_model - def forward(self, input): + def forward(self, input, output_key): batch_dict = NDict() + #find input key fuse_input = self.model.conv_inputs[0][0] batch_dict[fuse_input] = input + #feed fuse model with dict as he excpect ans_ndict = self.model(batch_dict) - output =ans_ndict['output']['head_0'] + #extract model output from dict + output =ans_ndict['output'][output_key] return output \ No newline at end of file From bbf20e015a31c9a55c3a643cb4803238ffc6827b Mon Sep 17 00:00:00 2001 From: itaiguez Date: Thu, 28 Jul 2022 10:30:46 +0300 Subject: [PATCH 06/38] added types to wrapper --- fuse/dl/models/model_wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fuse/dl/models/model_wrapper.py b/fuse/dl/models/model_wrapper.py index ab25d7d8c..136767f08 100644 --- a/fuse/dl/models/model_wrapper.py +++ b/fuse/dl/models/model_wrapper.py @@ -92,10 +92,10 @@ class ModelWrapDictToSeq(torch.nn.Module): Fuse model wrapper for wrapping torch modules and passing through Fuse """ - def __init__(self, fuse_model): + def __init__(self, fuse_model : torch.nn.Module): super().__init__() self.model = fuse_model - def forward(self, input, output_key): + def forward(self, input : torch.tensor, output_key : str): batch_dict = NDict() #find input key fuse_input = self.model.conv_inputs[0][0] From 4e9c6d6de53b1b605e39c9d081ffd9cb17a4a6e9 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Thu, 28 Jul 2022 17:23:57 +0300 Subject: [PATCH 07/38] modified to find best bounding box --- .../classification/ukbb_prostate/runner.py | 29 ++++++++++++++++--- fuse/dl/models/model_wrapper.py | 7 +++-- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 32443421f..c80545bb2 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -299,23 +299,38 @@ def run_explain(train : NDict, paths : NDict, infer: NDict): attention_map = attention_map[0][0].numpy() batch['data.input.img'] = batch['data.input.img'][0][0].numpy() original_attention_map = nib.load(os.path.join('attention_maps','model.backbone.layer4','attention_map_'+str(i)+'_0_0.nii.gz')).get_fdata() - scale_ratio = [batch['data.input.img'].shape[i]/value for i,value in enumerate(original_attention_map.shape)] - max_volume = np.unravel_index(original_attention_map.argmax(), original_attention_map.shape) + original_transposed = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) + scale_ratio = [ original_transposed.shape[i]/value for i,value in enumerate(original_attention_map.shape)] + max_volumes = largest_indices(original_attention_map, 3) + center = tuple([index/2 for index in original_attention_map.shape]) + min_dist = 99999999999999999 + max_volume = max_volumes[0] + for point in max_volumes : + dist = np.linalg.norm(point-center) + if dist < min_dist : + min_dist = dist + max_volume = point bouding_box_indices = [(int((max_volume[i]-1)*scale_ratio[i]),int((max_volume[i]+1)*scale_ratio[i])+1) for i in range(3)] print(scale_ratio) print(i,max_volume) print(bouding_box_indices) - volume_box = batch['data.input.img'][bouding_box_indices[0][0]:bouding_box_indices[0][1],bouding_box_indices[1][0]:bouding_box_indices[1][1],bouding_box_indices[2][0]:bouding_box_indices[2][1]] attention_map = show_attention_on_image(batch['data.input.img'],attention_map) batch['data.input.img'] = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) original = nib.Nifti1Image(batch['data.input.img'], affine=np.eye(4)) + volume_box = np.zeros(batch['data.input.img'].shape) + for slice in range(bouding_box_indices[2][0],bouding_box_indices[2][1] - 1): + for x in range(bouding_box_indices[0][0],bouding_box_indices[0][1] - 1) : + volume_box[x,bouding_box_indices[1][0],slice] = 1 + volume_box[x,bouding_box_indices[1][1],slice] = 1 + for y in range(bouding_box_indices[1][0],bouding_box_indices[1][1] - 1) : + volume_box[bouding_box_indices[0][0],y,slice] = 1 + volume_box[bouding_box_indices[0][1],y,slice] = 1 volume_box = nib.Nifti1Image(volume_box, affine=np.eye(4)) nib.save(original, filename=os.path.join('attention_maps','original_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) nib.save(volume_box, filename=os.path.join('attention_maps','maxvolume_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) nib.save(attention_map, filename=os.path.join('attention_maps','attention_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) - def load_model_and_test_data(train : NDict, paths : NDict, infer: NDict): lgr = logging.getLogger('Fuse') @@ -374,6 +389,12 @@ def show_attention_on_image(img: np.ndarray, nifti = nib.Nifti1Image(np.concatenate( cams, axis=2 ), np.eye(4)) return nifti +def largest_indices(ary, n): + """Returns the n largest indices from a numpy array.""" + flat = ary.flatten() + indices = np.argpartition(flat, -n)[-n:] + indices = indices[np.argsort(-flat[indices])] + return np.unravel_index(indices, ary.shape) ###################################### # Analyze Template diff --git a/fuse/dl/models/model_wrapper.py b/fuse/dl/models/model_wrapper.py index 136767f08..dffe016b9 100644 --- a/fuse/dl/models/model_wrapper.py +++ b/fuse/dl/models/model_wrapper.py @@ -92,10 +92,11 @@ class ModelWrapDictToSeq(torch.nn.Module): Fuse model wrapper for wrapping torch modules and passing through Fuse """ - def __init__(self, fuse_model : torch.nn.Module): + def __init__(self, fuse_model : torch.nn.Module, output_key : str): super().__init__() self.model = fuse_model - def forward(self, input : torch.tensor, output_key : str): + self.output_key = output_key + def forward(self, input : torch.tensor): batch_dict = NDict() #find input key fuse_input = self.model.conv_inputs[0][0] @@ -103,7 +104,7 @@ def forward(self, input : torch.tensor, output_key : str): #feed fuse model with dict as he excpect ans_ndict = self.model(batch_dict) #extract model output from dict - output =ans_ndict['output'][output_key] + output =ans_ndict['output'][self.output_key] return output \ No newline at end of file From 5073ae65c2fdc6d2815605f64d4beb7a787d4086 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Thu, 28 Jul 2022 17:51:13 +0300 Subject: [PATCH 08/38] need to find other solution for multi maximums in 3d --- .../classification/ukbb_prostate/runner.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index c80545bb2..0337b7d36 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -301,16 +301,17 @@ def run_explain(train : NDict, paths : NDict, infer: NDict): original_attention_map = nib.load(os.path.join('attention_maps','model.backbone.layer4','attention_map_'+str(i)+'_0_0.nii.gz')).get_fdata() original_transposed = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) scale_ratio = [ original_transposed.shape[i]/value for i,value in enumerate(original_attention_map.shape)] - max_volumes = largest_indices(original_attention_map, 3) - center = tuple([index/2 for index in original_attention_map.shape]) - min_dist = 99999999999999999 - max_volume = max_volumes[0] - for point in max_volumes : - dist = np.linalg.norm(point-center) - if dist < min_dist : - min_dist = dist - max_volume = point - bouding_box_indices = [(int((max_volume[i]-1)*scale_ratio[i]),int((max_volume[i]+1)*scale_ratio[i])+1) for i in range(3)] + # max_volumes = largest_indices(original_attention_map, 3) + # center = tuple([index/2 for index in original_attention_map.shape]) + # min_dist = 99999999999999999 + # max_volume = max_volumes[0] + # for point in max_volumes : + # dist = np.linalg.norm(point-center) + # if dist < min_dist : + # min_dist = dist + # max_volume = point + max_volume = np.unravel_index(original_attention_map.argmax(), original_attention_map.shape) + bouding_box_indices = [(int((max_volume[i]-1)*scale_ratio[i]),int((max_volume[i]+1)*scale_ratio[i])) for i in range(3)] print(scale_ratio) print(i,max_volume) print(bouding_box_indices) From d62c71a0f861369c91468818d0cc2c84b4c256fd Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Thu, 28 Jul 2022 22:16:10 +0300 Subject: [PATCH 09/38] supporting additional running configurations --- .../ukbb_prostate/cohort_and_label_def.py | 65 ++++++++++ .../conf/config_prostate_cancer.yaml | 44 +++++++ .../conf/config_prostatectomy.yaml | 44 +++++++ .../ukbb_prostate/files_download_from_cos.py | 117 ++++++++++++++++++ .../classification/ukbb_prostate/runner.py | 69 +++++------ fuseimg/datasets/ukbb_neck_to_knee.py | 48 ++++--- 6 files changed, 332 insertions(+), 55 deletions(-) create mode 100644 examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py create mode 100644 examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml create mode 100644 examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostatectomy.yaml create mode 100644 examples/fuse_examples/imaging/classification/ukbb_prostate/files_download_from_cos.py diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py new file mode 100644 index 000000000..fddf8098a --- /dev/null +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py @@ -0,0 +1,65 @@ + +from typing import Optional + +import numpy as np +import pandas as pd + +from fuse.utils import NDict + + +def get_samples_for_cohort(cohort_config: NDict, clinical_data_file:str, seed:Optional[int]=222): + df = pd.read_csv(clinical_data_file) + sample_ids = df['file'].values + selected = np.zeros(df.shape[0], dtype=bool) + group_ids = cohort_config['group_ids'] + max_group_size = cohort_config[ 'max_group_size'] + max_group_size = None if max_group_size <= 0 else max_group_size + np.random.seed(seed) + for group_id in group_ids: + if group_id == 'all': + group_filter = np.ones(df.shape[0], dtype=bool) + elif group_id == 'men': + group_filter = df['is female']==0 + elif group_id == 'men_no_cancer': + group_filter = (df['is female'] == 0) & (df['preindex cancer'] == 0) + elif group_id == 'men_prostate_cancer': + group_filter = (df['is female'] == 0) & (df['preindex prostate cancer'] == 1) + elif group_id == 'men_prostate_cancer_no_prostatectomy': + group_filter = (df['is female'] == 0) & (df['preindex prostate cancer'] == 1) & (df['preindex prostatectomy'] == 0) + elif group_id == 'men_prostatectomy': + group_filter = (df['is female'] == 0) & (df['preindex prostatectomy'] == 1) + elif group_id == 'men_no_prostatectomy': + group_filter = (df['is female'] == 0) & (df['preindex prostatectomy'] == 0) + else: + raise NotImplementedError(group_id) + + group_size = group_filter.sum() + + if max_group_size is not None and group_size > max_group_size: + all_indexes = np.where(group_filter)[0] + rand_perm = np.random.permutation(group_size) + n_remove = group_size -max_group_size + indexes_to_remove = all_indexes[rand_perm[:n_remove]] + assert np.all(group_filter[indexes_to_remove]) + group_filter[indexes_to_remove] = False + assert np.sum(group_filter) == max_group_size + print( group_id, "size:", group_size, "=>", max_group_size, "First removed index=", indexes_to_remove[0]) + else: + print(group_id, "size:", group_size) + selected |= group_filter + print("cohort size=", np.sum(selected)) + return sample_ids[selected].tolist() + + +def get_class_names(label_type:str): + if label_type == "classification": + class_names = ["Male", "Female","Male-prostate-excision"] + elif label_type == "is female": + class_names = ["Male", "Female"] + elif label_type == "preindex prostatectomy": + class_names = ["No-surgery", "surgery"] + elif label_type == "preindex prostate cancer": + class_names = ["no-cancer", "prostate-cancer"] + else: + raise NotImplementedError("unsuported target!!") + return class_names diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml new file mode 100644 index 000000000..7de5eaf3f --- /dev/null +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml @@ -0,0 +1,44 @@ +label: prostate_cancer +paths: + my_ukbb_dir: /projects/msieve_dev3/usr/ozery/fuse_examples/ukbb + data_dir: ${oc.env:UKBB_MRI_BODY_DATA_PATH} + cache_dir: ${paths.my_ukbb_dir}/cache + model_dir : ${paths.my_ukbb_dir}/model_${label} + data_split_filename: ${paths.my_ukbb_dir}/ukbb_split_${label}.pkl # what if the cohort changes?? + clinical_data_file: /projects/msieve/Data/ukbb/body-mri-data/label_prostatectomy_v5_331.csv + data_misc_dir: ${paths.my_ukbb_dir}/data_misc + inference_dir : ${paths.model_dir}/infer_dir + eval_dir : ${paths.model_dir}/eval_dir + sample_ids : ${paths.data_misc_dir}/sample_ids.csv +run: + running_modes : ['train','infer', 'eval'] #['train','infer', 'eval', 'explain'] + +train: + cos: + bucket_name: body-mri-data + credentials_file: ${paths.data_dir}/cos_body.json + fields: [ 20201 ] + cohort: + group_ids: [ men_no_cancer, men_prostate_cancer_no_prostatectomy ] + max_group_size: 500 + target: preindex prostate cancer + reset_cache: False + num_workers : 12 + num_folds : 5 + train_folds : [0,1,2] + validation_folds : [3] + batch_size: 4 + learning_rate : 1e-4 + weight_decay : 0 + resume_checkpoint_filename : + trainer: + accelerator : gpu + devices : 1 + num_epochs : 10 + ckpt_path : +infer: + infer_filename : validation_set_infer.gz + checkpoint : best_epoch.ckpt + infer_folds : [4] + target : preindex prostate cancer + num_workers : 12 \ No newline at end of file diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostatectomy.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostatectomy.yaml new file mode 100644 index 000000000..e6dd8cef5 --- /dev/null +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostatectomy.yaml @@ -0,0 +1,44 @@ +label: prostatectomy +paths: + my_ukbb_dir: /projects/msieve_dev3/usr/ozery/fuse_examples/ukbb + data_dir: ${oc.env:UKBB_MRI_BODY_DATA_PATH} + cache_dir: ${paths.my_ukbb_dir}/cache2 + model_dir : ${paths.my_ukbb_dir}/model_${label} + data_split_filename: ${paths.my_ukbb_dir}/ukbb_split_${label}.pkl # what if the cohort changes?? + clinical_data_file: /projects/msieve/Data/ukbb/body-mri-data/label_prostatectomy_v5_331.csv + data_misc_dir: ${paths.my_ukbb_dir}/data_misc + inference_dir : ${paths.model_dir}/infer_dir + eval_dir : ${paths.model_dir}/eval_dir + sample_ids : ${paths.data_misc_dir}/sample_ids.csv +run: + running_modes : ['train','infer', 'eval'] #['train','infer', 'eval', 'explain'] + +train: + cos: + bucket_name: body-mri-data + credentials_file: ${paths.data_dir}/cos_body.json + fields: [ 20201 ] + cohort: + group_ids: [ men_no_prostatectomy, men_prostatectomy ] + max_group_size: 500 + target: preindex prostatectomy + reset_cache: False + num_workers : 12 + num_folds : 5 + train_folds : [0,1,2] + validation_folds : [3] + batch_size: 4 + learning_rate : 1e-4 + weight_decay : 0 + resume_checkpoint_filename : + trainer: + accelerator : gpu + devices : 1 + num_epochs : 10 + ckpt_path : +infer: + infer_filename : validation_set_infer.gz + checkpoint : best_epoch.ckpt + infer_folds : [4] + target : preindex prostatectomy + num_workers : 12 \ No newline at end of file diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/files_download_from_cos.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/files_download_from_cos.py new file mode 100644 index 000000000..1d07c35a3 --- /dev/null +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/files_download_from_cos.py @@ -0,0 +1,117 @@ +from botocore.client import Config, ClientError +import ibm_boto3 +import os +import json +import time +import glob + +from typing import List +from fuse.utils import NDict + +def download_sample_files(sample_ids: List[str], mri_output_dir:str, cos_cfg:NDict): + if not os.path.exists(mri_output_dir): + os.makedirs(mri_output_dir) + print("created", mri_output_dir) + + fields = cos_cfg["fields"] + potentially_missing_files_set = set() + + for field in fields: + potentially_requested_files_set = set([s.replace("*", str(field)) for s in sample_ids]) + existing_files_disk_set = set([os.path.basename(f) for f in glob.glob(os.path.join(mri_output_dir, f"*_{field}_*_0.zip"))]) + + potentially_missing_files_set |= potentially_requested_files_set - existing_files_disk_set + + if len(potentially_missing_files_set)==0: + print("No missing files.") + return + + print(f"checking download of {len(potentially_missing_files_set)} potnetially missing files") + + mri_type_bucket_name = cos_cfg["bucket_name"] + + + cos_credentials_file = cos_cfg["credentials_file"] + + with open(cos_credentials_file, 'r') as f: + cos_credentials = json.load(f) + print(cos_credentials) + + auth_endpoint = 'https://iam.bluemix.net/oidc/token' + service_endpoint = 'https://s3-api.us-geo.objectstorage.softlayer.net' + + + t0 = time.time() + cosClient = ibm_boto3.client('s3', + ibm_api_key_id=cos_credentials['apikey'], + ibm_service_instance_id=cos_credentials['resource_instance_id'], + ibm_auth_endpoint=auth_endpoint, + config=Config(signature_version='oauth'), + endpoint_url=service_endpoint) + print(f"connected to COS in time={time.time() - t0:.2f} seconds") + + # List the objects in the bucket + existing_files_cos = listFilesFromCOS(cosClient, mri_type_bucket_name) + print(len(existing_files_cos)) + + + filenames_to_download = list(set(existing_files_cos).intersection(potentially_missing_files_set)) + + downloadFilesFromCOS(cosClient, filenames_to_download, mri_type_bucket_name, destinationDirectory=mri_output_dir) + + +# Create method to get a complete list of the files in a bucket (each call is limited to 1000 files) +def listFilesFromCOS(cosClient, bucketName): + # Initialize result + existingFileSet = set() + + # Initialize the continuation token + ContinuationToken = '' + + # Loop in case there are more than 1000 files + while True: + + # Get the file list (include continuation token for paging) + res = cosClient.list_objects_v2(Bucket=bucketName, ContinuationToken=ContinuationToken) + + # Put the files in the set + if 'Contents' in res: + for r in res['Contents']: + existingFileSet.add(r['Key']) + + # Check if there are more files and grab the continuation token if true + if res['IsTruncated'] == True: + ContinuationToken = res['NextContinuationToken'] + else: + break + + # Return the file set + return list(existingFileSet) + + +def downloadFilesFromCOS(cosClient, fileList, myBucketName, destinationDirectory='.'): + print("prepare to download", len(fileList)) + destFileList = [] + + # Loop over the files + n_download = 0 + n_exist = 0 + for sourceFileName in fileList: + # Get the destination file name + destFileName = os.path.join(destinationDirectory, os.path.basename(sourceFileName)) + if os.path.exists(destFileName): + print("File already exists", destFileName, "==> skipping") + n_exist += 1 + continue + + # Copy file from my bucket + cosClient.download_file(Filename=destFileName, Bucket=myBucketName, Key=sourceFileName) + + print(n_download, sourceFileName, ' --> ', destFileName) + n_download += 1 + + destFileList.append(destFileName) + + print("Files:", n_download,"downloaded,",n_exist, "already existed") + return destFileList + diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index a709b6f99..04dae360d 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -17,6 +17,11 @@ from collections import OrderedDict import os + +from fuse_examples.imaging.classification.ukbb_prostate import cohort_and_label_def, files_download_from_cos + + +os.environ['UKBB_MRI_BODY_DATA_PATH']='/projects/msieve/Data/ukbb/body-mri-data' import sys import copy from fuse.eval.metrics.classification.metrics_thresholding_common import MetricApplyThresholds @@ -35,8 +40,6 @@ from fuse.data.utils.samplers import BatchSamplerDefault from fuse.data.utils.collates import CollateDefault from fuse.data.utils.split import dataset_balanced_division_to_folds -from fuse.dl.models import ModelMultiHead -from fuse.utils.file_io.file_io import load_pickle from fuse.dl.losses.loss_default import LossDefault from fuse.eval.metrics.classification.metrics_classification_common import MetricAUCROC, MetricAccuracy @@ -48,7 +51,6 @@ from fuse.eval.evaluator import EvaluatorDefault import torch import hydra -from typing import Dict from omegaconf import DictConfig, OmegaConf from fuse.dl.models.backbones.backbone_resnet_3d import BackboneResnet3D @@ -59,7 +61,6 @@ from medcam import medcam import numpy as np from cv2 import cv2 -import skimage import nibabel as nib assert "UKBB_DATA_PATH" in os.environ, "Expecting environment variable UKBB_DATA_PATH to be set. Follow the instruction in example README file to download and set the path to the data" @@ -75,20 +76,9 @@ def create_model(train: NDict,paths: NDict) -> torch.nn.Module: See HeadGlobalPoolingClassifier for details """ #### Train Data - if train['target'] == "classification" : - num_classes = 3 - gt_label = "data.gt.classification" - class_names = ["Male", "Female","Male-prostate-excision"] - elif train['target'] == "preindex prostatectomy" : - num_classes = 2 - gt_label = "data.gt.classification" - class_names = ["No-surgery","surgery"] - elif train['target'] == "is female" : - num_classes = 2 - gt_label = "data.gt.classification" - class_names = ["Male","Female"] - else: - raise("unsuported target!!") + gt_label_key = "data.gt.classification" + class_names = cohort_and_label_def.get_class_names(train['target']) + num_classes = len(class_names) model = ModelMultiHead( conv_inputs=(('data.input.img', 1),), backbone=BackboneResnet3D(in_channels=1), @@ -110,7 +100,7 @@ def create_model(train: NDict,paths: NDict) -> torch.nn.Module: devices=train['trainer']['devices'], num_sanity_val_steps = -1, auto_select_gpus=True) - return model, pl_trainer, num_classes, gt_label , class_names + return model, pl_trainer, num_classes, gt_label_key , class_names ################################# # Train Template @@ -122,8 +112,13 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO) lgr = logging.getLogger('Fuse') + sample_ids = cohort_and_label_def.get_samples_for_cohort(train['cohort'], paths['clinical_data_file']) + # Download data - # TBD + # instructions how to get the ukbb data + # 1. apply for access in his website https://www.ukbiobank.ac.uk/enable-your-research/apply-for-access + # 2. download all data to the path configured in os env variable UKBB_DATA_PATH + files_download_from_cos.download_sample_files(sample_ids=sample_ids, mri_output_dir=paths["data_dir"], cos_cfg=train["cos"]) lgr.info('\nFuse Train', {'attrs': ['bold', 'underline']}) @@ -138,14 +133,17 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: # split to folds randomly - temp - samples_path = os.path.join(paths["data_misc_dir"],"samples.csv") - if os.path.isfile(samples_path) : - sample_ids = pd.read_csv(samples_path)['file'].to_list() - print(sample_ids) - else: - sample_ids = None - input_source_gt = pd.read_csv(paths["gt_file"]) - dataset_all = UKBB.dataset(paths["data_dir"], train['target'], input_source_gt, paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=sample_ids,train=True , is_female = train["is_female"]) + # samples_path = os.path.join(paths["data_misc_dir"],"samples.csv") + # if os.path.isfile(samples_path) : + # sample_ids = pd.read_csv(samples_path)['file'].to_list() + # print(sample_ids) + # else: + # sample_ids = None + input_source_gt = pd.read_csv(paths["clinical_data_file"]) + dataset_all = UKBB.dataset(paths["data_dir"], train['target'], input_source_gt, paths["cache_dir"], + reset_cache=False, num_workers=train["num_workers"], sample_ids=sample_ids, + train=True + ) print("dataset size",len(dataset_all)) folds = dataset_balanced_division_to_folds(dataset=dataset_all, @@ -162,9 +160,9 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: for fold in train["validation_folds"]: validation_sample_ids += folds[fold] - train_dataset = UKBB.dataset(paths["data_dir"], train['target'], input_source_gt, paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=train_sample_ids, train=True , is_female = train["is_female"]) + train_dataset = UKBB.dataset(paths["data_dir"], train['target'], input_source_gt, paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=train_sample_ids, train=True) - validation_dataset = UKBB.dataset(paths["data_dir"], train['target'], input_source_gt, paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=validation_sample_ids , is_female = train["is_female"]) + validation_dataset = UKBB.dataset(paths["data_dir"], train['target'], input_source_gt, paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=validation_sample_ids) ## Create sampler lgr.info(f'- Create sampler:') @@ -324,9 +322,8 @@ def load_model_and_test_data(train : NDict, paths : NDict, infer: NDict): infer_sample_ids = [] for fold in infer["infer_folds"]: infer_sample_ids += folds[fold] - input_source_gt = pd.read_csv(paths["gt_file"]) - test_dataset = UKBB.dataset(paths["data_dir"], infer['target'], input_source_gt, paths["cache_dir"], sample_ids=infer_sample_ids, train=False, - is_female=train["is_female"]) + input_source_gt = pd.read_csv(paths["clinical_data_file"]) + test_dataset = UKBB.dataset(paths["data_dir"], infer['target'], input_source_gt, paths["cache_dir"], sample_ids=infer_sample_ids, train=False) ## Create dataloader infer_dataloader = DataLoader(dataset=test_dataset, @@ -392,6 +389,7 @@ def run_eval(paths : NDict, infer: NDict): return results + @hydra.main(config_path="conf", config_name="config") def main(cfg : DictConfig) -> None: cfg = NDict(OmegaConf.to_object(cfg)) @@ -400,11 +398,6 @@ def main(cfg : DictConfig) -> None: force_gpus = None # [0] choose_and_enable_multiple_gpus(cfg["train.trainer.devices"], force_gpus=force_gpus) - # instructions how to get the ukbb data - # 1. apply for access in his website https://www.ukbiobank.ac.uk/enable-your-research/apply-for-access - # 2. download all data to the path configured in os env variable UKBB_DATA_PATH - - # train if 'train' in cfg["run.running_modes"]: run_train(cfg["paths"] ,cfg["train"]) diff --git a/fuseimg/datasets/ukbb_neck_to_knee.py b/fuseimg/datasets/ukbb_neck_to_knee.py index b6722ac98..6b3a14530 100644 --- a/fuseimg/datasets/ukbb_neck_to_knee.py +++ b/fuseimg/datasets/ukbb_neck_to_knee.py @@ -121,8 +121,30 @@ class UKBB: # 2. folder named body-mri-data which is the downloaded data folder """ # bump whenever the static pipeline modified - CMMD_DATASET_VER = 0 + UKBB_DATASET_VER = 0 + @staticmethod + def download(path: str) -> None: + ''' + Automatic download is not supported, please follow instructions in STOIC21 class header to download + ''' + assert len(UKBB.sample_ids(path)) > 0, "automatic download is not supported, please follow instructions in STOIC21 class header to download" + + + @staticmethod + def sample_ids(path: str): + return UKBB.get_existing_sample_ids(path) + + @staticmethod + def get_existing_sample_ids(path: str): + """ + get all the sample ids that have a zip file in the specified path + """ + existing_files = glob(os.path.join(path, "*_*_*_0.zip")) + existing_sample_id_fields = [f.split("_") for f in existing_files] + existing_sample_ids = set([a[0] + "_*_" + a[2] + "_" + a[3] for a in existing_sample_id_fields]) + + return existing_sample_ids @staticmethod def static_pipeline(data_dir: str) -> PipelineDefault: """ @@ -133,11 +155,11 @@ def static_pipeline(data_dir: str) -> PipelineDefault: # decoding sample ID (OpUKBBSampleIDDecode(), dict()), # will save image and seg path to "data.input.img_path", "data.gt.seg_path" (OpLoadUKBBZip(data_dir), dict(key_in="data.input.img_path", key_out="data.input.img", unique_id_out="data.ID", series="Dixon_BH_17s_W", station = 4)), - (OpLambda(partial(skimage.transform.resize, - output_shape=(32, 256, 256), - mode='reflect', - anti_aliasing=True, - preserve_range=True)), dict(key="data.input.img")), + # (OpLambda(partial(skimage.transform.resize, + # output_shape=(32, 256, 256), + # mode='reflect', + # anti_aliasing=True, + # preserve_range=True)), dict(key="data.input.img")), (OpNormalizeAgainstSelf(), dict(key="data.input.img")), (OpToNumpy(), dict(key='data.input.img', dtype=np.float32)), # (OpLambda(partial(dump, filename="first.png", slice = 25)), dict(key="data.input.img")), @@ -197,8 +219,7 @@ def dataset( reset_cache : bool = True, num_workers:int = 10, sample_ids: Optional[Sequence[Hashable]] = None, - train: bool = False, - is_female: int = None) : + train: bool = False) : """ Creates Fuse Dataset single object (either for training, validation and test or user defined set) @@ -210,20 +231,13 @@ def dataset( :param num_workers: number of processes used for caching :param sample_ids: dataset including the specified sample_ids or None for all the samples. sample_id is case_{id:05d} (for example case_00001 or case_00100). :param train: True if used for training - adds augmentation operations to the pipeline - :param is_female filter only male/females from database :return: DatasetDefault object """ - existing_files = [file for file in os.listdir(data_dir) if '.zip' in file] - existing_sample_id_fields = [f.split("_") for f in existing_files] - existing_sample_ids = set([a[0] + "_*_" + a[2] + "_" + a[3] for a in existing_sample_id_fields]) - a_filter = input_source_gt['file'].isin(existing_sample_ids) - if is_female is not None: - a_filter &= input_source_gt['is female'] == is_female - all_sample_ids = list(set(input_source_gt[a_filter]['file'].to_list())) if sample_ids is None: - sample_ids = all_sample_ids + sample_ids = UKBB.sample_ids(data_dir) + static_pipeline = UKBB.static_pipeline(data_dir) dynamic_pipeline = UKBB.dynamic_pipeline(input_source_gt, target,train=train) From 8ac2e98e67a9562954244e20c56ca7c16a52887e Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Thu, 28 Jul 2022 22:28:29 +0300 Subject: [PATCH 10/38] removing config.yaml --- .gitignore | 3 +- .../ukbb_prostate/conf/config.yaml | 35 ------------------- requirements.txt | 4 +++ 3 files changed, 6 insertions(+), 36 deletions(-) delete mode 100644 examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config.yaml diff --git a/.gitignore b/.gitignore index c371cc2ea..098aa7ab7 100755 --- a/.gitignore +++ b/.gitignore @@ -38,4 +38,5 @@ fuse_examples/imaging/classification/knight/baseline/model_dir fuse_examples/imaging/classification/mnist/examples fuse_examples/imaging/hello_world/examples/ .vscode/ -examples/fuse_examples/imaging/classification/knight/baseline/sandbox.ipynb \ No newline at end of file +examples/fuse_examples/imaging/classification/knight/baseline/sandbox.ipynb +/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config.yaml diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config.yaml deleted file mode 100644 index a14f06eef..000000000 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config.yaml +++ /dev/null @@ -1,35 +0,0 @@ -paths: - model_dir : model_new/BackboneResnet3D - inference_dir : model_new/infer_dir - eval_dir : model_new/eval_dir - cache_dir : examples/UKBB_cache_dir - data_misc_dir : data_misc - data_split_filename: ukbb_split.pkl - data_dir : ${oc.env:UKBB_DATA_PATH} - sample_ids : sample_ids.csv - gt_file : data_misc/ground_truth.csv -run: - running_modes : ['infer', 'eval'] -train: - target : preindex prostatectomy - is_female : 0 - reset_cache: False - num_workers : 12 - num_folds : 5 - train_folds : [0,1,2] - validation_folds : [3] - batch_size: 4 - learning_rate : 1e-4 - weight_decay : 0 - resume_checkpoint_filename : - trainer: - accelerator : gpu - devices : 1 - num_epochs : 10 - ckpt_path : -infer: - infer_filename : validation_set_infer.gz - checkpoint : best_epoch.ckpt - infer_folds : [4] - target : preindex prostatectomy - num_workers : 12 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7aa10304e..03cb65d7d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,3 +33,7 @@ medpy pytorch_lightning hydra-core omegaconf +# # for ukbb example: +# medcam +# ibm-cos-sdk +# botocore \ No newline at end of file From 2f717df8d385c2a9aca60f3b9efd4103d86ee24f Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Sat, 30 Jul 2022 09:48:51 +0300 Subject: [PATCH 11/38] changing file to file_pattern --- .../classification/ukbb_prostate/cohort_and_label_def.py | 2 +- .../imaging/classification/ukbb_prostate/runner.py | 7 ++----- fuseimg/datasets/ukbb_neck_to_knee.py | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py index fddf8098a..1a8032eec 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py @@ -9,7 +9,7 @@ def get_samples_for_cohort(cohort_config: NDict, clinical_data_file:str, seed:Optional[int]=222): df = pd.read_csv(clinical_data_file) - sample_ids = df['file'].values + sample_ids = df['file_pattern'].values selected = np.zeros(df.shape[0], dtype=bool) group_ids = cohort_config['group_ids'] max_group_size = cohort_config[ 'max_group_size'] diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index f98584fea..4865b8307 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -347,12 +347,9 @@ def load_model_and_test_data(train : NDict, paths : NDict, infer: NDict): infer_sample_ids = [] for fold in infer["infer_folds"]: infer_sample_ids += folds[fold] - input_source_gt = pd.read_csv(paths["gt_file"]) - test_dataset = UKBB.dataset(paths["data_dir"], infer['target'], input_source_gt, paths["cache_dir"], num_workers = infer['num_workers'], - sample_ids=infer_sample_ids, train=False,) input_source_gt = pd.read_csv(paths["clinical_data_file"]) - test_dataset = UKBB.dataset(paths["data_dir"], infer['target'], input_source_gt, paths["cache_dir"], sample_ids=infer_sample_ids, train=False) - + test_dataset = UKBB.dataset(paths["data_dir"], infer['target'], input_source_gt, paths["cache_dir"], num_workers = infer['num_workers'], + sample_ids=infer_sample_ids, train=False) ## Create dataloader infer_dataloader = DataLoader(dataset=test_dataset, shuffle=False, drop_last=False, diff --git a/fuseimg/datasets/ukbb_neck_to_knee.py b/fuseimg/datasets/ukbb_neck_to_knee.py index 6b3a14530..be648c079 100644 --- a/fuseimg/datasets/ukbb_neck_to_knee.py +++ b/fuseimg/datasets/ukbb_neck_to_knee.py @@ -174,7 +174,7 @@ def dynamic_pipeline(data_source : pd.DataFrame, target: str, train: bool = Fals """ dynamic_pipeline = PipelineDefault("cmmd_dynamic", [ (OpReadDataframe(data_source, - key_column="file", columns_to_extract=['file','patient_id', target], + key_column="file_pattern", columns_to_extract=['file_pattern','patient_id', target], rename_columns={'patient_id' :"data.patientID", target: "data.gt.classification" }), dict()), (OpToTensor(), dict(key="data.input.img",dtype=torch.float32)), (OpToTensor(), dict(key="data.gt.classification", dtype=torch.long)), From 21b7a541c07780e0dcc2c20e1eeab8f0c40fda19 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Mon, 1 Aug 2022 13:18:37 +0300 Subject: [PATCH 12/38] adding men_no_neoplasms and men_cancer_genital --- .../ukbb_prostate/cohort_and_label_def.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py index 1a8032eec..5374a44bf 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py @@ -22,14 +22,18 @@ def get_samples_for_cohort(cohort_config: NDict, clinical_data_file:str, seed:Op group_filter = df['is female']==0 elif group_id == 'men_no_cancer': group_filter = (df['is female'] == 0) & (df['preindex cancer'] == 0) + elif group_id == 'men_no_neoplasms': + group_filter = (df['is female'] == 0) & (df['preindex neoplasms'] == 0) elif group_id == 'men_prostate_cancer': - group_filter = (df['is female'] == 0) & (df['preindex prostate cancer'] == 1) + group_filter = (df['is female'] == 0) & (df['preindex prostate cancer'] >0) elif group_id == 'men_prostate_cancer_no_prostatectomy': - group_filter = (df['is female'] == 0) & (df['preindex prostate cancer'] == 1) & (df['preindex prostatectomy'] == 0) + group_filter = (df['is female'] == 0) & (df['preindex prostate cancer'] >0) & (df['preindex prostatectomy'] == 0) elif group_id == 'men_prostatectomy': - group_filter = (df['is female'] == 0) & (df['preindex prostatectomy'] == 1) + group_filter = (df['is female'] == 0) & (df['preindex prostatectomy']>0) elif group_id == 'men_no_prostatectomy': group_filter = (df['is female'] == 0) & (df['preindex prostatectomy'] == 0) + elif group_id == 'men_cancer_genital': + group_filter = (df['is female'] == 0) & (df['blocks preindex C60-C63 Malignant neoplasms of male genital organs']>0) else: raise NotImplementedError(group_id) @@ -60,6 +64,8 @@ def get_class_names(label_type:str): class_names = ["No-surgery", "surgery"] elif label_type == "preindex prostate cancer": class_names = ["no-cancer", "prostate-cancer"] + elif label_type == "preindex cancer": + class_names = ["no-cancer", "cancer"] else: raise NotImplementedError("unsuported target!!") return class_names From 808bd6523c9b01a71634cdc2f1e4e5919babc647 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Mon, 1 Aug 2022 14:41:45 +0300 Subject: [PATCH 13/38] draw several bbox, find best to center works --- .../classification/ukbb_prostate/runner.py | 58 ++++++++++++------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 4865b8307..f66cbdff8 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -298,37 +298,51 @@ def run_explain(train : NDict, paths : NDict, infer: NDict): original_attention_map = nib.load(os.path.join('attention_maps','model.backbone.layer4','attention_map_'+str(i)+'_0_0.nii.gz')).get_fdata() original_transposed = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) scale_ratio = [ original_transposed.shape[i]/value for i,value in enumerate(original_attention_map.shape)] - # max_volumes = largest_indices(original_attention_map, 3) - # center = tuple([index/2 for index in original_attention_map.shape]) - # min_dist = 99999999999999999 - # max_volume = max_volumes[0] - # for point in max_volumes : - # dist = np.linalg.norm(point-center) - # if dist < min_dist : - # min_dist = dist - # max_volume = point - max_volume = np.unravel_index(original_attention_map.argmax(), original_attention_map.shape) - bouding_box_indices = [(int((max_volume[i]-1)*scale_ratio[i]),int((max_volume[i]+1)*scale_ratio[i])) for i in range(3)] - print(scale_ratio) - print(i,max_volume) - print(bouding_box_indices) + points = [] + max_value = original_attention_map.argmax() + current_max = max_value + while True: + current_max = original_attention_map.argmax() + max_volume = np.unravel_index(current_max, original_attention_map.shape) + if current_max < max_value : + break + points.append(np.asarray(max_volume)) + original_attention_map[max_volume] = 0.0 + print("sample ",i) + print("points",points) + max_volume = points[0] + center = [int(index/2) for index in original_attention_map.shape] + min_dist = 99999999999999999 + for point in points : + dist = np.linalg.norm(point-center) + if dist < min_dist : + min_dist = dist + max_volume = point + print("best",max_volume) attention_map = show_attention_on_image(batch['data.input.img'],attention_map) batch['data.input.img'] = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) original = nib.Nifti1Image(batch['data.input.img'], affine=np.eye(4)) volume_box = np.zeros(batch['data.input.img'].shape) - for slice in range(bouding_box_indices[2][0],bouding_box_indices[2][1] - 1): - for x in range(bouding_box_indices[0][0],bouding_box_indices[0][1] - 1) : - volume_box[x,bouding_box_indices[1][0],slice] = 1 - volume_box[x,bouding_box_indices[1][1],slice] = 1 - for y in range(bouding_box_indices[1][0],bouding_box_indices[1][1] - 1) : - volume_box[bouding_box_indices[0][0],y,slice] = 1 - volume_box[bouding_box_indices[0][1],y,slice] = 1 + for point in points: + bouding_box_indices = [(int((point[i]-1)*scale_ratio[i]),int((point[i]+1)*scale_ratio[i])) for i in range(3)] + print(bouding_box_indices) + volume_box = draw_bbox_around_volume(volume_box,bouding_box_indices, 1) + #bouding_box_indices = [(int((max_volume[i]-1)*scale_ratio[i]),int((max_volume[i]+1)*scale_ratio[i])) for i in range(3)] + # volume_box = draw_bbox_around_volume(volume_box,max_volume, 2) volume_box = nib.Nifti1Image(volume_box, affine=np.eye(4)) nib.save(original, filename=os.path.join('attention_maps','original_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) nib.save(volume_box, filename=os.path.join('attention_maps','maxvolume_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) nib.save(attention_map, filename=os.path.join('attention_maps','attention_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) - +def draw_bbox_around_volume(volume_box, bouding_box_indices, color): + for slice in range(bouding_box_indices[2][0],bouding_box_indices[2][1] - 1): + for x in range(bouding_box_indices[0][0],bouding_box_indices[0][1] - 1) : + volume_box[x,bouding_box_indices[1][0],slice] = color + volume_box[x,bouding_box_indices[1][1],slice] = color + for y in range(bouding_box_indices[1][0],bouding_box_indices[1][1] - 1) : + volume_box[bouding_box_indices[0][0],y,slice] = color + volume_box[bouding_box_indices[0][1],y,slice] = color + return volume_box def load_model_and_test_data(train : NDict, paths : NDict, infer: NDict): lgr = logging.getLogger('Fuse') From 7540642c4245f6f550dd11d0d9fbfbd2cc8ebcef Mon Sep 17 00:00:00 2001 From: itaiguez Date: Mon, 1 Aug 2022 17:03:39 +0300 Subject: [PATCH 14/38] move ukbb req to examples --- examples/requirements.txt | 6 +++++- requirements.txt | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/requirements.txt b/examples/requirements.txt index 75f4127cb..742d54c22 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -1,2 +1,6 @@ # All requirements -# python>=3.7 \ No newline at end of file +# python>=3.7 +# for ukbb example: +medcam +ibm-cos-sdk +botocore \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 03cb65d7d..22b68d056 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,7 +33,7 @@ medpy pytorch_lightning hydra-core omegaconf -# # for ukbb example: -# medcam -# ibm-cos-sdk -# botocore \ No newline at end of file +# for ukbb example: +medcam +ibm-cos-sdk +botocore \ No newline at end of file From 6d2aea6f17273e1cb66b4cbfd8f4ff4e35c52b47 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Mon, 1 Aug 2022 17:09:19 +0300 Subject: [PATCH 15/38] change req --- requirements.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 22b68d056..9f5c4520b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,8 +32,4 @@ ipykernel medpy pytorch_lightning hydra-core -omegaconf -# for ukbb example: -medcam -ibm-cos-sdk -botocore \ No newline at end of file +omegaconf \ No newline at end of file From 71f72b283c15b1f4e2d9d7f5e0bd597e293d64f4 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Mon, 1 Aug 2022 17:14:54 +0300 Subject: [PATCH 16/38] merge req --- requirements.txt | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9f5c4520b..b568f0154 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,30 +6,28 @@ pandas>=1.2 tqdm>=4.52.0 scipy>=1.5.4 matplotlib>=3.3.3 -scikit-image>=0.17.2 scikit-learn>=0.23.2 termcolor>=1.1.0 -torch>=1.5.0 -torchvision>=0.8.1 +torch>=1.5.0,<=1.11.0 # the higher limit is temporary - to avoid from error in unitests: "Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method" +torchvision>=0.8.1,<=0.12.0 tensorboard -SimpleITK>=1.2.0 wget -opencv-python<=4.3.0.36 ipython -pydicom h5py hdf5plugin deepdiff statsmodels -nibabel -pycocotools>=2.0.1 xmlrunner paramiko tables psutil testbook ipykernel -medpy pytorch_lightning hydra-core -omegaconf \ No newline at end of file +omegaconf +pycocotools>=2.0.1 +nibabel +mypy +flake8 +black \ No newline at end of file From ba5d4a9239876de1870df55852d2d500caddb6a2 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Mon, 1 Aug 2022 17:16:15 +0300 Subject: [PATCH 17/38] merge req --- requirements.txt | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index b568f0154..9f5c4520b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,28 +6,30 @@ pandas>=1.2 tqdm>=4.52.0 scipy>=1.5.4 matplotlib>=3.3.3 +scikit-image>=0.17.2 scikit-learn>=0.23.2 termcolor>=1.1.0 -torch>=1.5.0,<=1.11.0 # the higher limit is temporary - to avoid from error in unitests: "Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method" -torchvision>=0.8.1,<=0.12.0 +torch>=1.5.0 +torchvision>=0.8.1 tensorboard +SimpleITK>=1.2.0 wget +opencv-python<=4.3.0.36 ipython +pydicom h5py hdf5plugin deepdiff statsmodels +nibabel +pycocotools>=2.0.1 xmlrunner paramiko tables psutil testbook ipykernel +medpy pytorch_lightning hydra-core -omegaconf -pycocotools>=2.0.1 -nibabel -mypy -flake8 -black \ No newline at end of file +omegaconf \ No newline at end of file From 9abab55173362f8c44ec8bc0f40554f045c2aad5 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Mon, 1 Aug 2022 19:02:14 +0300 Subject: [PATCH 18/38] cohort_seleciton: adding filter_out to cohort definition, writing to log --- .../ukbb_prostate/cohort_and_label_def.py | 84 ++++++++++++------- .../classification/ukbb_prostate/runner.py | 2 +- 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py index 5374a44bf..0dc1b30c0 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py @@ -7,38 +7,40 @@ from fuse.utils import NDict -def get_samples_for_cohort(cohort_config: NDict, clinical_data_file:str, seed:Optional[int]=222): +def get_samples_for_cohort(cohort_config: NDict, clinical_data_file:str, seed:Optional[int]=222, lgr=None): + + def write_log_info(s): + if lgr is not None: + lgr.info(s) + df = pd.read_csv(clinical_data_file) - sample_ids = df['file_pattern'].values - selected = np.zeros(df.shape[0], dtype=bool) - group_ids = cohort_config['group_ids'] + + filter_out_groups = cohort_config.get("filter_out") + if filter_out_groups is not None: + filter_out = np.zeros(df.shape[0], dtype=bool) + for group_id in filter_out_groups: + group_filter = get_group_filter(group_id, df) + filter_out |= group_filter + else: + filter_out = None + max_group_size = cohort_config[ 'max_group_size'] max_group_size = None if max_group_size <= 0 else max_group_size np.random.seed(seed) - for group_id in group_ids: - if group_id == 'all': - group_filter = np.ones(df.shape[0], dtype=bool) - elif group_id == 'men': - group_filter = df['is female']==0 - elif group_id == 'men_no_cancer': - group_filter = (df['is female'] == 0) & (df['preindex cancer'] == 0) - elif group_id == 'men_no_neoplasms': - group_filter = (df['is female'] == 0) & (df['preindex neoplasms'] == 0) - elif group_id == 'men_prostate_cancer': - group_filter = (df['is female'] == 0) & (df['preindex prostate cancer'] >0) - elif group_id == 'men_prostate_cancer_no_prostatectomy': - group_filter = (df['is female'] == 0) & (df['preindex prostate cancer'] >0) & (df['preindex prostatectomy'] == 0) - elif group_id == 'men_prostatectomy': - group_filter = (df['is female'] == 0) & (df['preindex prostatectomy']>0) - elif group_id == 'men_no_prostatectomy': - group_filter = (df['is female'] == 0) & (df['preindex prostatectomy'] == 0) - elif group_id == 'men_cancer_genital': - group_filter = (df['is female'] == 0) & (df['blocks preindex C60-C63 Malignant neoplasms of male genital organs']>0) - else: - raise NotImplementedError(group_id) + selected = np.zeros(df.shape[0], dtype=bool) + group_ids = cohort_config['group_ids'] + + for group_id in group_ids: + group_filter = get_group_filter(group_id, df) group_size = group_filter.sum() + write_log_info(f'{group_id} size={group_size}') + if filter_out is not None: + group_filter &= ~filter_out + group_size = group_filter.sum() + write_log_info(f'{group_id} size={group_size} after filtering') + if max_group_size is not None and group_size > max_group_size: all_indexes = np.where(group_filter)[0] rand_perm = np.random.permutation(group_size) @@ -47,14 +49,38 @@ def get_samples_for_cohort(cohort_config: NDict, clinical_data_file:str, seed:Op assert np.all(group_filter[indexes_to_remove]) group_filter[indexes_to_remove] = False assert np.sum(group_filter) == max_group_size - print( group_id, "size:", group_size, "=>", max_group_size, "First removed index=", indexes_to_remove[0]) - else: - print(group_id, "size:", group_size) + write_log_info( f"{group_id} size: {group_size} => {max_group_size}, First removed index= {indexes_to_remove[0]}") + selected |= group_filter print("cohort size=", np.sum(selected)) - return sample_ids[selected].tolist() + return df['file_pattern'].values[selected].tolist() + + + +def get_group_filter(group_id, df): + if group_id == 'all': + return np.ones(df.shape[0], dtype=bool) + if group_id == 'men': + return df['is female'] == 0 + if group_id == 'women': + return df['is female'] == 1 + + if group_id == 'no_cancer': + return df['preindex cancer'] == 0 + if group_id == 'no_neoplasms': + return df['preindex neoplasms'] == 0 + + if group_id == 'prostate_cancer': + return df['preindex prostate cancer'] > 0 + if group_id == 'prostatectomy': + return df['preindex prostatectomy'] > 0 + if group_id == 'no_prostatectomy': + return df['preindex prostatectomy'] == 0 + if group_id == 'cancer_male_genital': + return df['blocks preindex C60-C63 Malignant neoplasms of male genital organs']>0 + raise NotImplementedError(group_id) def get_class_names(label_type:str): if label_type == "classification": class_names = ["Male", "Female","Male-prostate-excision"] diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 4865b8307..a85edd394 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -111,7 +111,7 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO) lgr = logging.getLogger('Fuse') - sample_ids = cohort_and_label_def.get_samples_for_cohort(train['cohort'], paths['clinical_data_file']) + sample_ids = cohort_and_label_def.get_samples_for_cohort(train['cohort'], paths['clinical_data_file'], lgr=lgr) # Download data # instructions how to get the ukbb data From 845a9c4c9ec89f6d0be9c4e7eb8dd4f99f57ac68 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Mon, 1 Aug 2022 19:14:52 +0300 Subject: [PATCH 19/38] cosmetic changes in cohort_and_label_def.py --- .../ukbb_prostate/cohort_and_label_def.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py index 0dc1b30c0..e84340ce7 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py @@ -60,25 +60,37 @@ def write_log_info(s): def get_group_filter(group_id, df): if group_id == 'all': return np.ones(df.shape[0], dtype=bool) + # sex if group_id == 'men': return df['is female'] == 0 if group_id == 'women': return df['is female'] == 1 - if group_id == 'no_cancer': - return df['preindex cancer'] == 0 + # neoplasms (malignant, in situ, benign) + if group_id == 'neoplasms': + return df['preindex neoplasms'] >0 if group_id == 'no_neoplasms': return df['preindex neoplasms'] == 0 - if group_id == 'prostate_cancer': + # malignant + if group_id == 'cancer': + return df['preindex cancer'] >0 + if group_id == 'no_cancer': + return df['preindex cancer'] == 0 + + # maligant - male genital + if group_id == 'cancer_male_genital': + return df['blocks preindex C60-C63 Malignant neoplasms of male genital organs'] > 0 + # malignent - prostate + if group_id == 'cancer_prostate': return df['preindex prostate cancer'] > 0 + # prostatectomy if group_id == 'prostatectomy': return df['preindex prostatectomy'] > 0 if group_id == 'no_prostatectomy': return df['preindex prostatectomy'] == 0 - if group_id == 'cancer_male_genital': - return df['blocks preindex C60-C63 Malignant neoplasms of male genital organs']>0 + raise NotImplementedError(group_id) def get_class_names(label_type:str): From d824738c04cb36082b75f7936d3eeb910aa5f60d Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Mon, 1 Aug 2022 19:24:01 +0300 Subject: [PATCH 20/38] minor change in runner --- .../imaging/classification/ukbb_prostate/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 6ad00d7cf..84b2a2a4a 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -62,7 +62,7 @@ from cv2 import cv2 import nibabel as nib -assert "UKBB_DATA_PATH" in os.environ, "Expecting environment variable UKBB_DATA_PATH to be set. Follow the instruction in example README file to download and set the path to the data" +assert "UKBB_MRI_BODY_DATA_PATH" in os.environ, "Expecting environment variable UKBB_MRI_BODY_DATA_PATH to be set. Follow the instruction in example README file to download and set the path to the data" ########################################## # Debug modes ########################################## From 70256e0276cc9f97b435d8e6c1fa0047f19ade53 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Mon, 1 Aug 2022 19:27:08 +0300 Subject: [PATCH 21/38] runner: write cohort def to log --- .../fuse_examples/imaging/classification/ukbb_prostate/runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 84b2a2a4a..4d179ebbd 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -120,6 +120,7 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: files_download_from_cos.download_sample_files(sample_ids=sample_ids, mri_output_dir=paths["data_dir"], cos_cfg=train["cos"]) lgr.info('\nFuse Train', {'attrs': ['bold', 'underline']}) + lgr.info('cohort def='+str(train['cohort']), {'color': 'magenta'}) lgr.info(f'model_dir={paths["model_dir"]}', {'color': 'magenta'}) lgr.info(f'cache_dir={paths["cache_dir"]}', {'color': 'magenta'}) From 7b6656282c9b416a8052aa9cca0055945ebfba90 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Tue, 2 Aug 2022 11:19:29 +0300 Subject: [PATCH 22/38] supporting 'resume_from_checkpoint' in runner.py --- .../imaging/classification/ukbb_prostate/runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 4d179ebbd..2ce782cd7 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -98,6 +98,7 @@ def create_model(train: NDict,paths: NDict) -> torch.nn.Module: accelerator=train['trainer']['accelerator'], devices=train['trainer']['devices'], num_sanity_val_steps = -1, + resume_from_checkpoint = train.get('resume_from_checkpoint'), auto_select_gpus=True) return model, pl_trainer, num_classes, gt_label_key , class_names @@ -123,6 +124,9 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: lgr.info('cohort def='+str(train['cohort']), {'color': 'magenta'}) lgr.info(f'model_dir={paths["model_dir"]}', {'color': 'magenta'}) + if train['resume_from_checkpoint'] is not None: + lgr.info(f"resume_from_checkpoint = {train['resume_from_checkpoint']}", {'color': 'magenta'}) + lgr.info(f'cache_dir={paths["cache_dir"]}', {'color': 'magenta'}) # ============================================================================== # Model From 7f1a6d0e2c114dadb5a2f0e0d428fed6de16a9ad Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Tue, 2 Aug 2022 12:12:32 +0300 Subject: [PATCH 23/38] add kidney cancer experiment --- .../ukbb_prostate/cohort_and_label_def.py | 8 +++- .../conf/config_prostate_cancer.yaml | 2 +- .../conf/config_uriinary_track_cancer.yaml | 48 +++++++++++++++++++ .../classification/ukbb_prostate/runner.py | 2 - 4 files changed, 56 insertions(+), 4 deletions(-) create mode 100644 examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_uriinary_track_cancer.yaml diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py index e84340ce7..969fac462 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py @@ -81,9 +81,15 @@ def get_group_filter(group_id, df): # maligant - male genital if group_id == 'cancer_male_genital': return df['blocks preindex C60-C63 Malignant neoplasms of male genital organs'] > 0 + # malignent - urinary tract (covers kidney) + if group_id == 'cancer_urinary_tract': + return df[ 'blocks preindex C64-C68 Malignant neoplasms of urinary tract'] > 0 # malignent - prostate if group_id == 'cancer_prostate': - return df['preindex prostate cancer'] > 0 + return df['preindex C61 Malignant neoplasm of prostate'] > 0 + # malignent - kidney + if group_id == 'cancer_kidney': + return df['preindex C64 Malignant neoplasm of kidney, except renal pelvis'] > 0 # prostatectomy if group_id == 'prostatectomy': diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml index a267e2cea..0e5d812b5 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml @@ -5,7 +5,7 @@ paths: cache_dir: ${paths.my_ukbb_dir}/cache model_dir : ${paths.my_ukbb_dir}/model_${label} data_split_filename: ${paths.my_ukbb_dir}/ukbb_split_${label}.pkl # what if the cohort changes?? - clinical_data_file: /projects/msieve/Data/ukbb/body-mri-data/label_prostatectomy_v5_331.csv + clinical_data_file: /projects/msieve/Data/ukbb/body-mri-data/prostate_clinical_data_v2.csv #label_prostatectomy_v5_331.csv data_misc_dir: ${paths.my_ukbb_dir}/data_misc inference_dir : ${paths.model_dir}/infer_dir eval_dir : ${paths.model_dir}/eval_dir diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_uriinary_track_cancer.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_uriinary_track_cancer.yaml new file mode 100644 index 000000000..3de70a212 --- /dev/null +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_uriinary_track_cancer.yaml @@ -0,0 +1,48 @@ +label: urinary_cancer_500 +paths: + my_ukbb_dir: /projects/msieve_dev3/usr/ozery/fuse_examples/ukbb + data_dir: ${oc.env:UKBB_MRI_BODY_DATA_PATH} + cache_dir: ${paths.my_ukbb_dir}/cache2 + model_dir : ${paths.my_ukbb_dir}/model_${label} + data_split_filename: ${paths.my_ukbb_dir}/ukbb_split_${label}.pkl # what if the cohort changes?? + clinical_data_file: /projects/msieve/Data/ukbb/body-mri-data/prostate_clinical_data_v2.csv #label_prostatectomy_v5_331.csv + data_misc_dir: ${paths.my_ukbb_dir}/data_misc + inference_dir : ${paths.model_dir}/infer_dir + eval_dir : ${paths.model_dir}/eval_dir + sample_ids : ${paths.data_misc_dir}/sample_ids.csv +run: + running_modes : ['train','infer', 'eval'] #['train','infer', 'eval', 'explain'] +train: + cos: + bucket_name: body-mri-data + credentials_file: ${paths.data_dir}/cos_body.json + fields: [ 20201 ] + cohort: + group_ids: [ no_neoplasms, cancer_urinary_tract] #cancer_male_genital] +# group_ids: [ no_prostatectomy, prostatectomy] +# filter_out: [women, prostatectomy] + max_group_size: 500 + series_config: + series: Dixon_BH_17s_W + station: 4 + target: preindex cancer + reset_cache: False + num_workers : 12 + num_folds : 5 + train_folds : [0,1,2] + validation_folds : [3] + batch_size: 4 + learning_rate : 1e-4 + weight_decay : 0 + resume_checkpoint_filename : + trainer: + accelerator : gpu + devices : 1 + num_epochs : 20 + ckpt_path : +infer: + infer_filename : validation_set_infer.gz + checkpoint : best_epoch.ckpt + infer_folds : [4] + target : preindex prostate cancer + num_workers : 12 \ No newline at end of file diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 49b36f5f0..46225c916 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -112,8 +112,6 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO) lgr = logging.getLogger('Fuse') - sample_ids = cohort_and_label_def.get_samples_for_cohort(train['cohort'], paths['clinical_data_file']) - sample_ids = cohort_and_label_def.get_samples_for_cohort(train['cohort'], paths['clinical_data_file'], lgr=lgr) # Download data From 1515eb295a17275a9131b37afa5b90cdd442a8ea Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Fri, 5 Aug 2022 15:57:49 +0300 Subject: [PATCH 24/38] revising the definition of cohorts --- .../ukbb_prostate/cohort_and_label_def.py | 94 ++++++++----------- .../conf/config_prostate_cancer.yaml | 47 ---------- .../conf/config_prostatectomy.yaml | 47 ---------- ...track_cancer.yaml => config_template.yaml} | 13 ++- .../classification/ukbb_prostate/runner.py | 2 +- fuse/utils/utils_logger.py | 6 +- 6 files changed, 53 insertions(+), 156 deletions(-) delete mode 100644 examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml delete mode 100644 examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostatectomy.yaml rename examples/fuse_examples/imaging/classification/ukbb_prostate/conf/{config_uriinary_track_cancer.yaml => config_template.yaml} (66%) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py index 969fac462..0b7c7e7eb 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py @@ -14,13 +14,11 @@ def write_log_info(s): lgr.info(s) df = pd.read_csv(clinical_data_file) + var_namespace = get_clinical_vars_namespace(df) - filter_out_groups = cohort_config.get("filter_out") - if filter_out_groups is not None: - filter_out = np.zeros(df.shape[0], dtype=bool) - for group_id in filter_out_groups: - group_filter = get_group_filter(group_id, df) - filter_out |= group_filter + filter_out_group = cohort_config.get("filter_out") + if filter_out_group is not None: + filter_out = eval(filter_out_group, var_namespace) else: filter_out = None @@ -29,17 +27,17 @@ def write_log_info(s): np.random.seed(seed) selected = np.zeros(df.shape[0], dtype=bool) - group_ids = cohort_config['group_ids'] + included_groups = cohort_config['groups'] - for group_id in group_ids: - group_filter = get_group_filter(group_id, df) + for included_group in included_groups: + group_filter = eval(included_group, var_namespace) group_size = group_filter.sum() - write_log_info(f'{group_id} size={group_size}') + write_log_info(f'{included_group} size={group_size}') if filter_out is not None: group_filter &= ~filter_out group_size = group_filter.sum() - write_log_info(f'{group_id} size={group_size} after filtering') + write_log_info(f'{included_group} size={group_size} after filtering') if max_group_size is not None and group_size > max_group_size: all_indexes = np.where(group_filter)[0] @@ -49,7 +47,7 @@ def write_log_info(s): assert np.all(group_filter[indexes_to_remove]) group_filter[indexes_to_remove] = False assert np.sum(group_filter) == max_group_size - write_log_info( f"{group_id} size: {group_size} => {max_group_size}, First removed index= {indexes_to_remove[0]}") + write_log_info( f"{included_group} size: {group_size} => {max_group_size}, First removed index= {indexes_to_remove[0]}") selected |= group_filter print("cohort size=", np.sum(selected)) @@ -57,48 +55,34 @@ def write_log_info(s): -def get_group_filter(group_id, df): - if group_id == 'all': - return np.ones(df.shape[0], dtype=bool) - # sex - if group_id == 'men': - return df['is female'] == 0 - if group_id == 'women': - return df['is female'] == 1 - - # neoplasms (malignant, in situ, benign) - if group_id == 'neoplasms': - return df['preindex neoplasms'] >0 - if group_id == 'no_neoplasms': - return df['preindex neoplasms'] == 0 - - # malignant - if group_id == 'cancer': - return df['preindex cancer'] >0 - if group_id == 'no_cancer': - return df['preindex cancer'] == 0 - - # maligant - male genital - if group_id == 'cancer_male_genital': - return df['blocks preindex C60-C63 Malignant neoplasms of male genital organs'] > 0 - # malignent - urinary tract (covers kidney) - if group_id == 'cancer_urinary_tract': - return df[ 'blocks preindex C64-C68 Malignant neoplasms of urinary tract'] > 0 - # malignent - prostate - if group_id == 'cancer_prostate': - return df['preindex C61 Malignant neoplasm of prostate'] > 0 - # malignent - kidney - if group_id == 'cancer_kidney': - return df['preindex C64 Malignant neoplasm of kidney, except renal pelvis'] > 0 - - # prostatectomy - if group_id == 'prostatectomy': - return df['preindex prostatectomy'] > 0 - if group_id == 'no_prostatectomy': - return df['preindex prostatectomy'] == 0 - - - raise NotImplementedError(group_id) +def get_clinical_vars_namespace(df): + mapping = {col.replace(' ', '_'): df[col]>0 for i, col in enumerate(df.columns) if df.dtypes[i] != 'O'} + + # vars in use: 'preindex_neoplasms', 'postindex_neoplasms', 'preindex_cancer', 'postindex_cancer' + # 'prostate_hyperplasia_preindex', 'prostate_hyperplasia_postindex', + # 'preindex_prostatectomy', 'postindex_prostatectomy', + # 'preindex_prostate_resection', 'postindex_prostate_resection' + + mapping['women'] = mapping['is_female'] + mapping['men'] = ~mapping['is_female'] + + acronyms = [('preindex_cancer_prostate', 'preindex_C61_Malignant_neoplasm_of_prostate'), + ('postindex_cancer_prostate', 'postindex_C61_Malignant_neoplasm_of_prostate'), + ('preindex_cancer_male_genital', 'preindex_C60-C63_Malignant_neoplasms_of_male_genital_organs'), + ('postindex_cancer_male_genital', 'postindex_C60-C63_Malignant_neoplasms_of_male_genital_organs'), + ('preindex_cancer_urinary_tract', 'preindex_C64-C68_Malignant_neoplasms_of_urinary_tract'), + ('postindex_cancer_urinary_tract', 'postindex_C64-C68_Malignant_neoplasms_of_urinary_tract'), + ('preindex_kidney_cancer','preindex_C64_Malignant_neoplasm_of_kidney,_except_renal_pelvis'), + ('postindex_kidney_cancer', 'postindex_C64_Malignant_neoplasm_of_kidney,_except_renal_pelvis'), + ] + for s_new, s_old in acronyms: + if s_old in mapping: + mapping[s_new] = mapping[s_old] + else: + print(f"*** {s_old} does not exist in the clinical data file") + + return mapping + def get_class_names(label_type:str): if label_type == "classification": class_names = ["Male", "Female","Male-prostate-excision"] @@ -111,5 +95,5 @@ def get_class_names(label_type:str): elif label_type == "preindex cancer": class_names = ["no-cancer", "cancer"] else: - raise NotImplementedError("unsuported target!!") + class_names = [f'no {label_type}', label_type] return class_names diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml deleted file mode 100644 index 0e5d812b5..000000000 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostate_cancer.yaml +++ /dev/null @@ -1,47 +0,0 @@ -label: prostate_cancer -paths: - my_ukbb_dir: /projects/msieve_dev3/usr/ozery/fuse_examples/ukbb - data_dir: ${oc.env:UKBB_MRI_BODY_DATA_PATH} - cache_dir: ${paths.my_ukbb_dir}/cache - model_dir : ${paths.my_ukbb_dir}/model_${label} - data_split_filename: ${paths.my_ukbb_dir}/ukbb_split_${label}.pkl # what if the cohort changes?? - clinical_data_file: /projects/msieve/Data/ukbb/body-mri-data/prostate_clinical_data_v2.csv #label_prostatectomy_v5_331.csv - data_misc_dir: ${paths.my_ukbb_dir}/data_misc - inference_dir : ${paths.model_dir}/infer_dir - eval_dir : ${paths.model_dir}/eval_dir - sample_ids : ${paths.data_misc_dir}/sample_ids.csv -run: - running_modes : ['train','infer', 'eval'] #['train','infer', 'eval', 'explain'] - -train: - cos: - bucket_name: body-mri-data - credentials_file: ${paths.data_dir}/cos_body.json - fields: [ 20201 ] - cohort: - group_ids: [ men_no_cancer, men_prostate_cancer_no_prostatectomy ] - max_group_size: 500 - series_config: - series : Dixon_BH_17s_W - station: 4 - target: preindex prostate cancer - reset_cache: False - num_workers : 12 - num_folds : 5 - train_folds : [0,1,2] - validation_folds : [3] - batch_size: 4 - learning_rate : 1e-4 - weight_decay : 0 - resume_checkpoint_filename : - trainer: - accelerator : gpu - devices : 1 - num_epochs : 10 - ckpt_path : -infer: - infer_filename : validation_set_infer.gz - checkpoint : best_epoch.ckpt - infer_folds : [4] - target : preindex prostate cancer - num_workers : 12 \ No newline at end of file diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostatectomy.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostatectomy.yaml deleted file mode 100644 index 4b5fb54ad..000000000 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_prostatectomy.yaml +++ /dev/null @@ -1,47 +0,0 @@ -label: prostatectomy -paths: - my_ukbb_dir: /projects/msieve_dev3/usr/ozery/fuse_examples/ukbb - data_dir: ${oc.env:UKBB_MRI_BODY_DATA_PATH} - cache_dir: ${paths.my_ukbb_dir}/cache2 - model_dir : ${paths.my_ukbb_dir}/model_${label} - data_split_filename: ${paths.my_ukbb_dir}/ukbb_split_${label}.pkl # what if the cohort changes?? - clinical_data_file: /projects/msieve/Data/ukbb/body-mri-data/label_prostatectomy_v5_331.csv - data_misc_dir: ${paths.my_ukbb_dir}/data_misc - inference_dir : ${paths.model_dir}/infer_dir - eval_dir : ${paths.model_dir}/eval_dir - sample_ids : ${paths.data_misc_dir}/sample_ids.csv -run: - running_modes : ['train','infer', 'eval'] #['train','infer', 'eval', 'explain'] - -train: - cos: - bucket_name: body-mri-data - credentials_file: ${paths.data_dir}/cos_body.json - fields: [ 20201 ] - cohort: - group_ids: [ men_no_prostatectomy, men_prostatectomy ] - max_group_size: 500 - series_config: - series : Dixon_BH_17s_W - station: 4 - target: preindex prostatectomy - reset_cache: False - num_workers : 12 - num_folds : 5 - train_folds : [0,1,2] - validation_folds : [3] - batch_size: 4 - learning_rate : 1e-4 - weight_decay : 0 - resume_checkpoint_filename : - trainer: - accelerator : gpu - devices : 1 - num_epochs : 10 - ckpt_path : -infer: - infer_filename : validation_set_infer.gz - checkpoint : best_epoch.ckpt - infer_folds : [4] - target : preindex prostatectomy - num_workers : 12 \ No newline at end of file diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_uriinary_track_cancer.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml similarity index 66% rename from examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_uriinary_track_cancer.yaml rename to examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml index 3de70a212..283c990ef 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_uriinary_track_cancer.yaml +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml @@ -5,22 +5,27 @@ paths: cache_dir: ${paths.my_ukbb_dir}/cache2 model_dir : ${paths.my_ukbb_dir}/model_${label} data_split_filename: ${paths.my_ukbb_dir}/ukbb_split_${label}.pkl # what if the cohort changes?? - clinical_data_file: /projects/msieve/Data/ukbb/body-mri-data/prostate_clinical_data_v2.csv #label_prostatectomy_v5_331.csv + clinical_data_file: ${paths.data_dir}/body_clinical_data_v3.1.csv #label_prostatectomy_v5_331.csv data_misc_dir: ${paths.my_ukbb_dir}/data_misc inference_dir : ${paths.model_dir}/infer_dir eval_dir : ${paths.model_dir}/eval_dir sample_ids : ${paths.data_misc_dir}/sample_ids.csv run: running_modes : ['train','infer', 'eval'] #['train','infer', 'eval', 'explain'] +predefined_groups: + cancer_urinary_tract: [~preindex_neoplasms, preindex_cancer_urinary_tract] # contains kidney cancer + cancer_male_genital: [men & ~(preindex_neoplasms | preindex_prostatectomy), men & preindex_cancer_male_genital & ~preindex_prostatectomy] + cancer_prostate: [ men & (~preindex_neoplasms) & (~preindex_prostatectomy), men & preindex_cancer_prostate & ~preindex_prostatectomy] + prostatectomy: [men & (~preindex_prostatectomy), men & preindex_prostatectomy] train: cos: bucket_name: body-mri-data credentials_file: ${paths.data_dir}/cos_body.json fields: [ 20201 ] + cohort: - group_ids: [ no_neoplasms, cancer_urinary_tract] #cancer_male_genital] -# group_ids: [ no_prostatectomy, prostatectomy] -# filter_out: [women, prostatectomy] + groups: ${predefined_groups.cancer_prostate} +# filter_out: women max_group_size: 500 series_config: series: Dixon_BH_17s_W diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 46225c916..323df3e44 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -109,7 +109,7 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: # ============================================================================== # Logger # ============================================================================== - fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO) + fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO, list_of_source_files=['../conf/config.yaml']) lgr = logging.getLogger('Fuse') sample_ids = cohort_and_label_def.get_samples_for_cohort(train['cohort'], paths['clinical_data_file'], lgr=lgr) diff --git a/fuse/utils/utils_logger.py b/fuse/utils/utils_logger.py index a84df71d1..de4f0894e 100644 --- a/fuse/utils/utils_logger.py +++ b/fuse/utils/utils_logger.py @@ -151,10 +151,12 @@ def fuse_logger_start( create_or_reset_dir(source_files_output_path) lgr.info(f"Copy source files to {source_files_output_path}") + if list_of_source_files is None: # copy just the caller function file name - caller_function_file_name = inspect.stack()[1][1] - list_of_source_files = [caller_function_file_name] + list_of_source_files = [] + caller_function_file_name = inspect.stack()[1][1] + list_of_source_files.append(caller_function_file_name) for src_file in list_of_source_files: copyfile(os.path.abspath(src_file), os.path.join(source_files_output_path, os.path.basename(src_file))) From a7ea109c522ec3cadd251304e275fdcd9a46e638 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Sat, 6 Aug 2022 07:48:44 +0300 Subject: [PATCH 25/38] revise cohort and target definition --- .../ukbb_prostate/cohort_and_label_def.py | 74 ++-- .../ukbb_prostate/conf/config_template.yaml | 55 ++- .../classification/ukbb_prostate/runner.py | 318 ++++++++++-------- 3 files changed, 236 insertions(+), 211 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py index 0b7c7e7eb..363c74118 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py @@ -1,4 +1,4 @@ - +import typing from typing import Optional import numpy as np @@ -7,37 +7,28 @@ from fuse.utils import NDict -def get_samples_for_cohort(cohort_config: NDict, clinical_data_file:str, seed:Optional[int]=222, lgr=None): +def get_samples_for_cohort(cohort_config: NDict, var_namespace:typing.Dict, seed:Optional[int]=222, lgr=None): def write_log_info(s): if lgr is not None: lgr.info(s) - df = pd.read_csv(clinical_data_file) - var_namespace = get_clinical_vars_namespace(df) - - filter_out_group = cohort_config.get("filter_out") - if filter_out_group is not None: - filter_out = eval(filter_out_group, var_namespace) - else: - filter_out = None - max_group_size = cohort_config[ 'max_group_size'] max_group_size = None if max_group_size <= 0 else max_group_size np.random.seed(seed) - selected = np.zeros(df.shape[0], dtype=bool) - included_groups = cohort_config['groups'] + selected = eval(cohort_config['inclusion'], var_namespace) + + y = var_namespace[cohort_config['group_id_vec']] + y_vals = np.unique(y) - for included_group in included_groups: - group_filter = eval(included_group, var_namespace) + n = 0 + + for y_val in y_vals: + group_filter = (y == y_val) & selected group_size = group_filter.sum() - write_log_info(f'{included_group} size={group_size}') - if filter_out is not None: - group_filter &= ~filter_out - group_size = group_filter.sum() - write_log_info(f'{included_group} size={group_size} after filtering') + write_log_info(f'target={y_val} size={group_size}') if max_group_size is not None and group_size > max_group_size: all_indexes = np.where(group_filter)[0] @@ -45,45 +36,28 @@ def write_log_info(s): n_remove = group_size -max_group_size indexes_to_remove = all_indexes[rand_perm[:n_remove]] assert np.all(group_filter[indexes_to_remove]) + selected[indexes_to_remove] = False group_filter[indexes_to_remove] = False assert np.sum(group_filter) == max_group_size - write_log_info( f"{included_group} size: {group_size} => {max_group_size}, First removed index= {indexes_to_remove[0]}") - - selected |= group_filter + write_log_info( f"target={y_val} size: {group_size} => {max_group_size}, First removed index= {indexes_to_remove[0]}") + n += np.sum(group_filter) print("cohort size=", np.sum(selected)) - return df['file_pattern'].values[selected].tolist() - - - -def get_clinical_vars_namespace(df): - mapping = {col.replace(' ', '_'): df[col]>0 for i, col in enumerate(df.columns) if df.dtypes[i] != 'O'} + assert np.sum(selected) == n + return var_namespace['file_pattern'][selected].tolist() - # vars in use: 'preindex_neoplasms', 'postindex_neoplasms', 'preindex_cancer', 'postindex_cancer' - # 'prostate_hyperplasia_preindex', 'prostate_hyperplasia_postindex', - # 'preindex_prostatectomy', 'postindex_prostatectomy', - # 'preindex_prostate_resection', 'postindex_prostate_resection' - mapping['women'] = mapping['is_female'] - mapping['men'] = ~mapping['is_female'] - acronyms = [('preindex_cancer_prostate', 'preindex_C61_Malignant_neoplasm_of_prostate'), - ('postindex_cancer_prostate', 'postindex_C61_Malignant_neoplasm_of_prostate'), - ('preindex_cancer_male_genital', 'preindex_C60-C63_Malignant_neoplasms_of_male_genital_organs'), - ('postindex_cancer_male_genital', 'postindex_C60-C63_Malignant_neoplasms_of_male_genital_organs'), - ('preindex_cancer_urinary_tract', 'preindex_C64-C68_Malignant_neoplasms_of_urinary_tract'), - ('postindex_cancer_urinary_tract', 'postindex_C64-C68_Malignant_neoplasms_of_urinary_tract'), - ('preindex_kidney_cancer','preindex_C64_Malignant_neoplasm_of_kidney,_except_renal_pelvis'), - ('postindex_kidney_cancer', 'postindex_C64_Malignant_neoplasm_of_kidney,_except_renal_pelvis'), - ] - for s_new, s_old in acronyms: - if s_old in mapping: - mapping[s_new] = mapping[s_old] - else: - print(f"*** {s_old} does not exist in the clinical data file") +def get_clinical_vars_namespace(df, columns_to_add): + var_namespace = {col.replace(' ', '_').replace(',', '_').replace('-', '_'): + df[col] for i, col in enumerate(df.columns) } - return mapping + for col_name, col_expression in columns_to_add.items(): + x = eval(col_expression, var_namespace) + var_namespace[col_name] = x + return var_namespace def get_class_names(label_type:str): + #todo: need to revisit. define class names in config if label_type == "classification": class_names = ["Male", "Female","Male-prostate-excision"] elif label_type == "is female": diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml index 283c990ef..b84afe33a 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml @@ -1,8 +1,42 @@ -label: urinary_cancer_500 +columns_to_add: + women: is_female > 0 + men: is_female == 0 + preindex_cancer_prostate : preindex_C61_Malignant_neoplasm_of_prostate > 0 + postindex_cancer_prostate : postindex_C61_Malignant_neoplasm_of_prostate > 0 + prepostindex_cancer_prostate: preindex_cancer_prostate | postindex_cancer_prostate + preindex_cancer_male_genital : preindex_C60_C63_Malignant_neoplasms_of_male_genital_organs > 0 + postindex_cancer_male_genital : postindex_C60_C63_Malignant_neoplasms_of_male_genital_organs > 0 + preindex_cancer_urinary_tract : preindex_C64_C68_Malignant_neoplasms_of_urinary_tract > 0 + postindex_cancer_urinary_tract : postindex_C64_C68_Malignant_neoplasms_of_urinary_tract > 0 + preindex_kidney_cancer : preindex_C64_Malignant_neoplasm_of_kidney__except_renal_pelvis > 0 + postindex_kidney_cancer : postindex_C64_Malignant_neoplasm_of_kidney__except_renal_pelvis > 0 + cancer_prostate_preindex_postindex: preindex_cancer_prostate | postindex_cancer_prostate + cohort_men_with_prostate: men & (preindex_prostatectomy==0) + cohort_cancer_urinary_tract_detection: (preindex_neoplasms==0) | preindex_cancer_urinary_tract # contains kidney cancer + cohort_cancer_male_genital_detection: men & (preindex_prostatectomy==0) & (preindex_cancer_male_genital | (preindex_neoplasms==0)) + cohort_cancer_prostate_preindex: men & (preindex_prostatectomy==0) & (preindex_cancer_prostate | (preindex_neoplasms==0)) + cohort_cancer_prostate_prepostindex: men & (preindex_prostatectomy==0) & (preindex_cancer_prostate | postindex_cancer_prostate | (preindex_neoplasms==0)) + cohort_cancer_prostate_postindex: men & (preindex_prostatectomy==0) & (preindex_neoplasms==0) + +max_group_size : 500 + +#target: preindex_prostatectomy +#cohort : men + +#target: prostate_hyperplasia_preindex +#cohort: cohort_men_with_prostate + +#target: preindex_cancer_prostate +#cohort: cohort_cancer_prostate_preindex + +target: prepostindex_cancer_prostate +cohort : cohort_cancer_prostate_prepostindex + +label : ${cohort}_${max_group_size}_${target} paths: my_ukbb_dir: /projects/msieve_dev3/usr/ozery/fuse_examples/ukbb data_dir: ${oc.env:UKBB_MRI_BODY_DATA_PATH} - cache_dir: ${paths.my_ukbb_dir}/cache2 + cache_dir: ${paths.my_ukbb_dir}/cache model_dir : ${paths.my_ukbb_dir}/model_${label} data_split_filename: ${paths.my_ukbb_dir}/ukbb_split_${label}.pkl # what if the cohort changes?? clinical_data_file: ${paths.data_dir}/body_clinical_data_v3.1.csv #label_prostatectomy_v5_331.csv @@ -12,25 +46,21 @@ paths: sample_ids : ${paths.data_misc_dir}/sample_ids.csv run: running_modes : ['train','infer', 'eval'] #['train','infer', 'eval', 'explain'] -predefined_groups: - cancer_urinary_tract: [~preindex_neoplasms, preindex_cancer_urinary_tract] # contains kidney cancer - cancer_male_genital: [men & ~(preindex_neoplasms | preindex_prostatectomy), men & preindex_cancer_male_genital & ~preindex_prostatectomy] - cancer_prostate: [ men & (~preindex_neoplasms) & (~preindex_prostatectomy), men & preindex_cancer_prostate & ~preindex_prostatectomy] - prostatectomy: [men & (~preindex_prostatectomy), men & preindex_prostatectomy] train: cos: bucket_name: body-mri-data credentials_file: ${paths.data_dir}/cos_body.json fields: [ 20201 ] - + columns_to_add: ${columns_to_add} + target: ${target} cohort: - groups: ${predefined_groups.cancer_prostate} + inclusion: ${cohort} # filter_out: women - max_group_size: 500 + group_id_vec: ${target} + max_group_size: ${max_group_size} series_config: series: Dixon_BH_17s_W station: 4 - target: preindex cancer reset_cache: False num_workers : 12 num_folds : 5 @@ -49,5 +79,6 @@ infer: infer_filename : validation_set_infer.gz checkpoint : best_epoch.ckpt infer_folds : [4] - target : preindex prostate cancer + target : ${target} + columns_to_add: ${columns_to_add} num_workers : 12 \ No newline at end of file diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 323df3e44..7ba059fe1 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -1,4 +1,3 @@ - """ (C) Copyright 2021 IBM Corp. @@ -20,7 +19,6 @@ from fuse_examples.imaging.classification.ukbb_prostate import cohort_and_label_def, files_download_from_cos - import sys import copy from fuse.eval.metrics.classification.metrics_thresholding_common import MetricApplyThresholds @@ -69,7 +67,8 @@ mode = 'default' # Options: 'default', 'debug'. See details in FuseDebug debug = FuseDebug(mode) -def create_model(train: NDict,paths: NDict) -> torch.nn.Module: + +def create_model(train: NDict, paths: NDict) -> torch.nn.Module: """ creates the model See HeadGlobalPoolingClassifier for details @@ -79,40 +78,45 @@ def create_model(train: NDict,paths: NDict) -> torch.nn.Module: class_names = cohort_and_label_def.get_class_names(train['target']) num_classes = len(class_names) model = ModelMultiHead( - conv_inputs=(('data.input.img', 1),), - backbone=BackboneResnet3D(in_channels=1), - heads=[ - Head3DClassifier(head_name='head_0', + conv_inputs=(('data.input.img', 1),), + backbone=BackboneResnet3D(in_channels=1), + heads=[ + Head3DClassifier(head_name='head_0', conv_inputs=[("model.backbone_features", 512)], - # dropout_rate=train_params['imaging_dropout'], - # append_dropout_rate=train_params['clinical_dropout'], - # fused_dropout_rate=train_params['fused_dropout'], + # dropout_rate=train_params['imaging_dropout'], + # append_dropout_rate=train_params['clinical_dropout'], + # fused_dropout_rate=train_params['fused_dropout'], num_classes=num_classes, - # append_features=[("data.input.clinical", 8)], - # append_layers_description=(256,128), + # append_features=[("data.input.clinical", 8)], + # append_layers_description=(256,128), ), - ]) + ]) # create lightining trainer. pl_trainer = Trainer(default_root_dir=paths['model_dir'], - max_epochs=train['trainer']['num_epochs'], - accelerator=train['trainer']['accelerator'], - devices=train['trainer']['devices'], - num_sanity_val_steps = -1, - resume_from_checkpoint = train.get('resume_from_checkpoint'), - auto_select_gpus=True) - return model, pl_trainer, num_classes, gt_label_key , class_names + max_epochs=train['trainer']['num_epochs'], + accelerator=train['trainer']['accelerator'], + devices=train['trainer']['devices'], + num_sanity_val_steps=-1, + resume_from_checkpoint=train.get('resume_from_checkpoint'), + auto_select_gpus=True) + return model, pl_trainer, num_classes, gt_label_key, class_names + ################################# # Train Template ################################# -def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: +def run_train(paths: NDict, train: NDict) -> torch.nn.Module: # ============================================================================== # Logger # ============================================================================== fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO, list_of_source_files=['../conf/config.yaml']) lgr = logging.getLogger('Fuse') - sample_ids = cohort_and_label_def.get_samples_for_cohort(train['cohort'], paths['clinical_data_file'], lgr=lgr) + clinical_data_df, var_namespace = read_clinical_data_file(filename=paths["clinical_data_file"], target=train['target'], + columns_to_add=train.get('columns_to_add'), + return_var_namespace=True) + + sample_ids = cohort_and_label_def.get_samples_for_cohort(cohort_config=train['cohort'], var_namespace=var_namespace, lgr=lgr) # Download data # instructions how to get the ukbb data @@ -121,52 +125,40 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: files_download_from_cos.download_sample_files(sample_ids=sample_ids, mri_output_dir=paths["data_dir"], cos_cfg=train["cos"]) lgr.info('\nFuse Train', {'attrs': ['bold', 'underline']}) - lgr.info('cohort def='+str(train['cohort']), {'color': 'magenta'}) + lgr.info('cohort def=' + str(train['cohort']), {'color': 'magenta'}) lgr.info(f'model_dir={paths["model_dir"]}', {'color': 'magenta'}) - if train['resume_from_checkpoint'] is not None: - lgr.info(f"resume_from_checkpoint = {train['resume_from_checkpoint']}", {'color': 'magenta'}) lgr.info(f'cache_dir={paths["cache_dir"]}', {'color': 'magenta'}) # ============================================================================== # Model # ============================================================================== lgr.info('Model:', {'attrs': 'bold'}) - model, pl_trainer, num_classes, gt_label , class_names = create_model(train, paths) + model, pl_trainer, num_classes, gt_label, class_names = create_model(train, paths) lgr.info('Model: Done', {'attrs': 'bold'}) - + # split to folds randomly - temp - - # samples_path = os.path.join(paths["data_misc_dir"],"samples.csv") - # if os.path.isfile(samples_path) : - # sample_ids = pd.read_csv(samples_path)['file'].to_list() - # print(sample_ids) - # else: - # sample_ids = None - input_source_gt = pd.read_csv(paths["clinical_data_file"]) - dataset_all = UKBB.dataset(paths["data_dir"], train['target'], train['series_config'], input_source_gt, paths["cache_dir"], - reset_cache=False, num_workers=train["num_workers"], sample_ids=sample_ids, - train=True - ) + # samples_path = os.path.join(paths["data_misc_dir"],"samples.csv") # if os.path.isfile(samples_path) : # sample_ids = pd.read_csv(samples_path)['file'].to_list() # print(sample_ids) # else: # sample_ids = None - input_source_gt = pd.read_csv(paths["clinical_data_file"]) - dataset_all = UKBB.dataset(paths["data_dir"], train['target'], input_source_gt, paths["cache_dir"], + + dataset_all = UKBB.dataset(data_dir=paths["data_dir"], target=train['target'], series_config=train['series_config'], + input_source_gt=clinical_data_df, cache_dir=paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=sample_ids, train=True ) - print("dataset size",len(dataset_all)) - + print("dataset size", len(dataset_all)) + folds = dataset_balanced_division_to_folds(dataset=dataset_all, - output_split_filename=os.path.join( paths["data_misc_dir"], paths["data_split_filename"]), - id = 'data.patientID', - keys_to_balance=[gt_label], - nfolds=train["num_folds"], - workers= train["num_workers"]) + output_split_filename=os.path.join(paths["data_misc_dir"], paths["data_split_filename"]), + id='data.patientID', + keys_to_balance=[gt_label], + nfolds=train["num_folds"], + workers=train["num_workers"]) train_sample_ids = [] for fold in train["train_folds"]: @@ -175,20 +167,24 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: for fold in train["validation_folds"]: validation_sample_ids += folds[fold] - train_dataset = UKBB.dataset(paths["data_dir"], train['target'], train['series_config'], input_source_gt, paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=train_sample_ids, train=True) - - validation_dataset = UKBB.dataset(paths["data_dir"], train['target'], train['series_config'], input_source_gt, paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=validation_sample_ids) + train_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=train['target'], series_config=train['series_config'], + input_source_gt=clinical_data_df, cache_dir=paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], + sample_ids=train_sample_ids, train=True) + + validation_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=train['target'], series_config=train['series_config'], + input_source_gt=clinical_data_df, cache_dir=paths["cache_dir"], reset_cache=False, + num_workers=train["num_workers"], sample_ids=validation_sample_ids) ## Create sampler lgr.info(f'- Create sampler:') sampler = BatchSamplerDefault(dataset=train_dataset, - balanced_class_name=gt_label, - num_balanced_classes=num_classes, - batch_size=train["batch_size"], - mode = "approx", - workers=train["num_workers"], - balanced_class_weights=None - ) + balanced_class_name=gt_label, + num_balanced_classes=num_classes, + batch_size=train["batch_size"], + mode="approx", + workers=train["num_workers"], + balanced_class_weights=None + ) lgr.info(f'- Create sampler: Done') @@ -218,20 +214,19 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: # ==================================================================================== losses = { 'cls_loss': LossDefault(pred='model.logits.head_0', target=gt_label, - callable=F.cross_entropy, weight=1.0) + callable=F.cross_entropy, weight=1.0) } - # ==================================================================================== # Metrics # ==================================================================================== train_metrics = OrderedDict([ - ('op', MetricApplyThresholds(pred='model.output.head_0')), # will apply argmax - ('auc', MetricAUCROC(pred='model.output.head_0', target=gt_label, class_names = class_names)), + ('op', MetricApplyThresholds(pred='model.output.head_0')), # will apply argmax + ('auc', MetricAUCROC(pred='model.output.head_0', target=gt_label, class_names=class_names)), ('accuracy', MetricAccuracy(pred='results:metrics.op.cls_pred', target=gt_label)), ]) - - validation_metrics = copy.deepcopy(train_metrics) # use the same metrics in validation as well + + validation_metrics = copy.deepcopy(train_metrics) # use the same metrics in validation as well # either a dict with arguments to pass to ModelCheckpoint or list dicts for multiple ModelCheckpoint callbacks (to monitor and save checkpoints for more then one metric). best_epoch_source = dict( @@ -249,15 +244,15 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: optimizer = optim.Adam(model.parameters(), lr=train["learning_rate"], weight_decay=train["weight_decay"]) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer) - - lr_sch_config = dict(scheduler=scheduler,monitor="validation.losses.total_loss") - + + lr_sch_config = dict(scheduler=scheduler, monitor="validation.losses.total_loss") + # optimizier and lr sch - see pl.LightningModule.configure_optimizers return value for all options optimizers_and_lr_schs = dict(optimizer=optimizer, lr_scheduler=lr_sch_config) # create instance of PL module - FuseMedML generic version - - pl_module = LightningModuleDefault(model_dir=paths["model_dir"], + + pl_module = LightningModuleDefault(model_dir=paths["model_dir"], model=model, losses=losses, train_metrics=train_metrics, @@ -265,17 +260,17 @@ def run_train(paths : NDict , train: NDict ) -> torch.nn.Module: best_epoch_source=best_epoch_source, optimizers_and_lr_schs=optimizers_and_lr_schs) - # train from scratch pl_trainer.fit(pl_module, train_dataloader, validation_dataloader, ckpt_path=train['trainer']['ckpt_path']) lgr.info('Train: Done', {'attrs': 'bold'}) - + return model, pl_trainer + ###################################### # Inference Template ###################################### -def run_infer(train : NDict, paths : NDict , infer: NDict): +def run_infer(train: NDict, paths: NDict, infer: NDict): create_dir(paths['inference_dir']) #### Logger fuse_logger_start(output_path=paths["inference_dir"], console_verbose_level=logging.INFO) @@ -286,7 +281,7 @@ def run_infer(train : NDict, paths : NDict , infer: NDict): infer_file = os.path.join(paths['inference_dir'], infer['infer_filename']) - pl_module.set_predictions_keys(['model.output.head_0', 'data.gt.classification']) # which keys to extract and dump into file + pl_module.set_predictions_keys(['model.output.head_0', 'data.gt.classification']) # which keys to extract and dump into file # create a trainer instance predictions = pl_trainer.predict(pl_module, infer_dataloader, return_predictions=True) @@ -294,73 +289,80 @@ def run_infer(train : NDict, paths : NDict , infer: NDict): infer_df = convert_predictions_to_dataframe(predictions) save_dataframe(infer_df, infer_file) + ###################################### # Explain Template ###################################### -def run_explain(train : NDict, paths : NDict, infer: NDict): +def run_explain(train: NDict, paths: NDict, infer: NDict): fuse_logger_start(output_path=None, console_verbose_level=logging.INFO) lgr = logging.getLogger('Fuse') lgr.info('Fuse Explain', {'attrs': ['bold', 'underline']}) pl_module, _, infer_dataloader = load_model_and_test_data(train, paths, infer) - - model = ModelWrapDictToSeq(pl_module._model, output_key = 'head_0') - model = medcam.inject(model, output_dir="attention_maps", backend='gcam', save_maps=True, layer='auto',return_attention=True) + model = ModelWrapDictToSeq(pl_module._model, output_key='head_0') + model = medcam.inject(model, output_dir="attention_maps", backend='gcam', save_maps=True, layer='auto', return_attention=True) for i, batch in enumerate(infer_dataloader): - logit, attention_map = model(batch['data.input.img'],batch['data.gt.classification']) - attention_map = attention_map[0][0].numpy() - batch['data.input.img'] = batch['data.input.img'][0][0].numpy() - original_attention_map = nib.load(os.path.join('attention_maps','model.backbone.layer4','attention_map_'+str(i)+'_0_0.nii.gz')).get_fdata() - original_transposed = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) - scale_ratio = [ original_transposed.shape[i]/value for i,value in enumerate(original_attention_map.shape)] - points = [] - max_value = original_attention_map.argmax() - current_max = max_value - while True: - current_max = original_attention_map.argmax() - max_volume = np.unravel_index(current_max, original_attention_map.shape) - if current_max < max_value : - break - points.append(np.asarray(max_volume)) - original_attention_map[max_volume] = 0.0 - print("sample ",i) - print("points",points) - max_volume = points[0] - center = [int(index/2) for index in original_attention_map.shape] - min_dist = 99999999999999999 - for point in points : - dist = np.linalg.norm(point-center) - if dist < min_dist : - min_dist = dist - max_volume = point - print("best",max_volume) - attention_map = show_attention_on_image(batch['data.input.img'],attention_map) - batch['data.input.img'] = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) - original = nib.Nifti1Image(batch['data.input.img'], affine=np.eye(4)) - volume_box = np.zeros(batch['data.input.img'].shape) - for point in points: - bouding_box_indices = [(int((point[i]-1)*scale_ratio[i]),int((point[i]+1)*scale_ratio[i])) for i in range(3)] - print(bouding_box_indices) - volume_box = draw_bbox_around_volume(volume_box,bouding_box_indices, 1) - #bouding_box_indices = [(int((max_volume[i]-1)*scale_ratio[i]),int((max_volume[i]+1)*scale_ratio[i])) for i in range(3)] - # volume_box = draw_bbox_around_volume(volume_box,max_volume, 2) - volume_box = nib.Nifti1Image(volume_box, affine=np.eye(4)) - nib.save(original, filename=os.path.join('attention_maps','original_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) - nib.save(volume_box, filename=os.path.join('attention_maps','maxvolume_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) - nib.save(attention_map, filename=os.path.join('attention_maps','attention_'+str(i)+'_'+batch['data.input.img_path'][0]+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) + logit, attention_map = model(batch['data.input.img'], batch['data.gt.classification']) + attention_map = attention_map[0][0].numpy() + batch['data.input.img'] = batch['data.input.img'][0][0].numpy() + original_attention_map = nib.load( + os.path.join('attention_maps', 'model.backbone.layer4', 'attention_map_' + str(i) + '_0_0.nii.gz')).get_fdata() + original_transposed = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) + scale_ratio = [original_transposed.shape[i] / value for i, value in enumerate(original_attention_map.shape)] + points = [] + max_value = original_attention_map.argmax() + current_max = max_value + while True: + current_max = original_attention_map.argmax() + max_volume = np.unravel_index(current_max, original_attention_map.shape) + if current_max < max_value: + break + points.append(np.asarray(max_volume)) + original_attention_map[max_volume] = 0.0 + print("sample ", i) + print("points", points) + max_volume = points[0] + center = [int(index / 2) for index in original_attention_map.shape] + min_dist = 99999999999999999 + for point in points: + dist = np.linalg.norm(point - center) + if dist < min_dist: + min_dist = dist + max_volume = point + print("best", max_volume) + attention_map = show_attention_on_image(batch['data.input.img'], attention_map) + batch['data.input.img'] = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) + original = nib.Nifti1Image(batch['data.input.img'], affine=np.eye(4)) + volume_box = np.zeros(batch['data.input.img'].shape) + for point in points: + bouding_box_indices = [(int((point[i] - 1) * scale_ratio[i]), int((point[i] + 1) * scale_ratio[i])) for i in range(3)] + print(bouding_box_indices) + volume_box = draw_bbox_around_volume(volume_box, bouding_box_indices, 1) + # bouding_box_indices = [(int((max_volume[i]-1)*scale_ratio[i]),int((max_volume[i]+1)*scale_ratio[i])) for i in range(3)] + # volume_box = draw_bbox_around_volume(volume_box,max_volume, 2) + volume_box = nib.Nifti1Image(volume_box, affine=np.eye(4)) + nib.save(original, filename=os.path.join('attention_maps', 'original_' + str(i) + '_' + batch['data.input.img_path'][0] + '_label_=' + str( + batch['data.gt.classification']) + '.nii.gz')) + nib.save(volume_box, filename=os.path.join('attention_maps', 'maxvolume_' + str(i) + '_' + batch['data.input.img_path'][0] + '_label_=' + str( + batch['data.gt.classification']) + '.nii.gz')) + nib.save(attention_map, filename=os.path.join('attention_maps', + 'attention_' + str(i) + '_' + batch['data.input.img_path'][0] + '_label_=' + str( + batch['data.gt.classification']) + '.nii.gz')) + def draw_bbox_around_volume(volume_box, bouding_box_indices, color): - for slice in range(bouding_box_indices[2][0],bouding_box_indices[2][1] - 1): - for x in range(bouding_box_indices[0][0],bouding_box_indices[0][1] - 1) : - volume_box[x,bouding_box_indices[1][0],slice] = color - volume_box[x,bouding_box_indices[1][1],slice] = color - for y in range(bouding_box_indices[1][0],bouding_box_indices[1][1] - 1) : - volume_box[bouding_box_indices[0][0],y,slice] = color - volume_box[bouding_box_indices[0][1],y,slice] = color + for slice in range(bouding_box_indices[2][0], bouding_box_indices[2][1] - 1): + for x in range(bouding_box_indices[0][0], bouding_box_indices[0][1] - 1): + volume_box[x, bouding_box_indices[1][0], slice] = color + volume_box[x, bouding_box_indices[1][1], slice] = color + for y in range(bouding_box_indices[1][0], bouding_box_indices[1][1] - 1): + volume_box[bouding_box_indices[0][0], y, slice] = color + volume_box[bouding_box_indices[0][1], y, slice] = color return volume_box -def load_model_and_test_data(train : NDict, paths : NDict, infer: NDict): + +def load_model_and_test_data(train: NDict, paths: NDict, infer: NDict): lgr = logging.getLogger('Fuse') checkpoint_file = os.path.join(paths["model_dir"], infer["checkpoint"]) @@ -377,8 +379,10 @@ def load_model_and_test_data(train : NDict, paths : NDict, infer: NDict): infer_sample_ids = [] for fold in infer["infer_folds"]: infer_sample_ids += folds[fold] - input_source_gt = pd.read_csv(paths["clinical_data_file"]) - test_dataset = UKBB.dataset(paths["data_dir"], infer['target'], train['series_config'], input_source_gt, paths["cache_dir"], num_workers = infer['num_workers'], + input_source_gt = read_clinical_data_file(paths["clinical_data_file"], infer['target'], infer.get('columns_to_add')) + + test_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=infer['target'], series_config=train['series_config'], + input_source_gt=input_source_gt, cache_dir=paths["cache_dir"], num_workers=infer['num_workers'], sample_ids=infer_sample_ids, train=False) ## Create dataloader infer_dataloader = DataLoader(dataset=test_dataset, @@ -386,13 +390,15 @@ def load_model_and_test_data(train : NDict, paths : NDict, infer: NDict): collate_fn=CollateDefault(), num_workers=infer["num_workers"]) - pl_module = LightningModuleDefault.load_from_checkpoint(checkpoint_file, model_dir=paths["model_dir"], model=model, map_location="cpu", strict=True) + pl_module = LightningModuleDefault.load_from_checkpoint(checkpoint_file, model_dir=paths["model_dir"], model=model, map_location="cpu", + strict=True) return pl_module, pl_trainer, infer_dataloader + def show_attention_on_image(img: np.ndarray, - mask: np.ndarray, - colormap: int = cv2.COLORMAP_JET) -> np.ndarray: + mask: np.ndarray, + colormap: int = cv2.COLORMAP_JET) -> np.ndarray: """ This function overlays the cam mask on the image as an heatmap. reference for fusing heat map and original image : https://github.com/jacobgil/pytorch-grad-cam/blob/61e9babae8600351b02b6e90864e4807f44f2d4a/pytorch_grad_cam/utils/image.py#L25 By default the heatmap is in BGR format. @@ -401,22 +407,23 @@ def show_attention_on_image(img: np.ndarray, :param colormap: The OpenCV colormap to be used. :returns: The default image with the cam overlay. """ - heatmaps = [np.float32(cv2.applyColorMap(np.uint8(255 * mask[i]), colormap))/255 for i in range(mask.shape[0])] - images = [cv2.cvtColor(img[i],cv2.COLOR_GRAY2RGB) for i in range(img.shape[0])] + heatmaps = [np.float32(cv2.applyColorMap(np.uint8(255 * mask[i]), colormap)) / 255 for i in range(mask.shape[0])] + images = [cv2.cvtColor(img[i], cv2.COLOR_GRAY2RGB) for i in range(img.shape[0])] RGB_DTYPE = np.dtype([('R', 'u1'), ('G', 'u1'), ('B', 'u1')]) cams = [] - for i in range(len(images)) : + for i in range(len(images)): if np.max(images[i]) > 1: - images[i] *= (1.0/ images[i].max()) - + images[i] *= (1.0 / images[i].max()) + cam = heatmaps[i] + images[i] cam = cam / np.max(cam) cam = np.uint8(255 * cam) - cam = cam.view(RGB_DTYPE ) + cam = cam.view(RGB_DTYPE) cams.append(cam) - nifti = nib.Nifti1Image(np.concatenate( cams, axis=2 ), np.eye(4)) + nifti = nib.Nifti1Image(np.concatenate(cams, axis=2), np.eye(4)) return nifti + def largest_indices(ary, n): """Returns the n largest indices from a numpy array.""" flat = ary.flatten() @@ -424,17 +431,18 @@ def largest_indices(ary, n): indices = indices[np.argsort(-flat[indices])] return np.unravel_index(indices, ary.shape) + ###################################### # Analyze Template ###################################### -def run_eval(paths : NDict, infer: NDict): +def run_eval(paths: NDict, infer: NDict): fuse_logger_start(output_path=None, console_verbose_level=logging.INFO) lgr = logging.getLogger('Fuse') lgr.info('Fuse Eval', {'attrs': ['bold', 'underline']}) - # metrics + # metrics metrics = OrderedDict([ - ('op', MetricApplyThresholds(pred='model.output.head_0')), # will apply argmax + ('op', MetricApplyThresholds(pred='model.output.head_0')), # will apply argmax ('auc', MetricAUCROC(pred='model.output.head_0', target='data.gt.classification')), ('accuracy', MetricAccuracy(pred='results:metrics.op.cls_pred', target='data.gt.classification')), ]) @@ -444,15 +452,15 @@ def run_eval(paths : NDict, infer: NDict): # run results = evaluator.eval(ids=None, - data=os.path.join(paths["inference_dir"], infer["infer_filename"]), - metrics=metrics, - output_dir=paths["eval_dir"]) + data=os.path.join(paths["inference_dir"], infer["infer_filename"]), + metrics=metrics, + output_dir=paths["eval_dir"]) return results @hydra.main(config_path="conf", config_name="config") -def main(cfg : DictConfig) -> None: +def main(cfg: DictConfig) -> None: cfg = NDict(OmegaConf.to_object(cfg)) print(cfg) # uncomment if you want to use specific gpus instead of automatically looking for free ones @@ -461,23 +469,35 @@ def main(cfg : DictConfig) -> None: # train if 'train' in cfg["run.running_modes"]: - run_train(cfg["paths"] ,cfg["train"]) + run_train(cfg["paths"], cfg["train"]) else: assert "Expecting train mode to be set." - # infer (infer set) if 'infer' in cfg["run.running_modes"]: - run_infer(cfg["train"], cfg["paths"] , cfg["infer"]) + run_infer(cfg["train"], cfg["paths"], cfg["infer"]) # # evaluate (infer set) if 'eval' in cfg["run.running_modes"]: - run_eval(cfg["paths"] ,cfg["infer"]) + run_eval(cfg["paths"], cfg["infer"]) # explain (infer set) if 'explain' in cfg["run.running_modes"]: run_explain(cfg["train"], cfg["paths"], cfg["infer"]) + +def read_clinical_data_file(filename, target, columns_to_add, return_var_namespace=False): + df = pd.read_csv(filename) + if (target not in df.columns) or return_var_namespace: + var_namespace = cohort_and_label_def.get_clinical_vars_namespace(df, columns_to_add) + assert target in var_namespace + df[target] = var_namespace[target] + df[target] = df[target].astype(int) + if return_var_namespace: + return df, var_namespace + return df + + if __name__ == "__main__": sys.argv.append('hydra.run.dir=working_dir') main() From 2801f7a872bb630cc145c665d6291addd646e586 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Sat, 6 Aug 2022 11:14:44 +0300 Subject: [PATCH 26/38] minor changes --- .../ukbb_prostate/conf/config_template.yaml | 2 +- .../imaging/classification/ukbb_prostate/runner.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml index b84afe33a..7394b0a08 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml @@ -34,7 +34,7 @@ cohort : cohort_cancer_prostate_prepostindex label : ${cohort}_${max_group_size}_${target} paths: - my_ukbb_dir: /projects/msieve_dev3/usr/ozery/fuse_examples/ukbb + my_ukbb_dir: /projects/msieve_dev3/usr/${oc.env:USER}/fuse_examples/ukbb # /dccstor/mm_hcls/usr/${oc.env:USER}/fuse_example_runs/ukbb_mri_body/ data_dir: ${oc.env:UKBB_MRI_BODY_DATA_PATH} cache_dir: ${paths.my_ukbb_dir}/cache model_dir : ${paths.my_ukbb_dir}/model_${label} diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 7ba059fe1..d61342a5d 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -128,6 +128,8 @@ def run_train(paths: NDict, train: NDict) -> torch.nn.Module: lgr.info('cohort def=' + str(train['cohort']), {'color': 'magenta'}) lgr.info(f'model_dir={paths["model_dir"]}', {'color': 'magenta'}) + if train.get('trainer.ckpt_path') is not None: + lgr.info(f"trainer.ckpt_path = {train['trainer.ckpt_path']}", {'color': 'magenta'}) lgr.info(f'cache_dir={paths["cache_dir"]}', {'color': 'magenta'}) # ============================================================================== @@ -469,22 +471,21 @@ def main(cfg: DictConfig) -> None: # train if 'train' in cfg["run.running_modes"]: - run_train(cfg["paths"], cfg["train"]) + run_train(NDict(cfg["paths"]), NDict(cfg["train"])) else: assert "Expecting train mode to be set." # infer (infer set) if 'infer' in cfg["run.running_modes"]: - run_infer(cfg["train"], cfg["paths"], cfg["infer"]) + run_infer(NDict(cfg["train"]), NDict(cfg["paths"]), NDict(cfg["infer"])) # # evaluate (infer set) if 'eval' in cfg["run.running_modes"]: - run_eval(cfg["paths"], cfg["infer"]) + run_eval(NDict(cfg["paths"]), NDict(cfg["infer"])) # explain (infer set) if 'explain' in cfg["run.running_modes"]: - run_explain(cfg["train"], cfg["paths"], cfg["infer"]) - + run_explain(NDict(cfg["train"]), NDict(cfg["paths"]), NDict(cfg["infer"])) def read_clinical_data_file(filename, target, columns_to_add, return_var_namespace=False): df = pd.read_csv(filename) @@ -496,8 +497,7 @@ def read_clinical_data_file(filename, target, columns_to_add, return_var_namespa if return_var_namespace: return df, var_namespace return df - - + if __name__ == "__main__": sys.argv.append('hydra.run.dir=working_dir') main() From a91487fe201d15a3bb897e06457fdf3de5956a81 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Sun, 7 Aug 2022 09:06:15 +0300 Subject: [PATCH 27/38] adding optional evaluation of subgoups --- .../ukbb_prostate/cohort_and_label_def.py | 9 +- .../ukbb_prostate/conf/config_template.yaml | 33 ++++--- .../classification/ukbb_prostate/runner.py | 85 ++++++++++++------- fuse/eval/evaluator.py | 17 ++-- .../metrics_thresholding_common.py | 4 +- fuse/eval/metrics/utils.py | 5 ++ 6 files changed, 102 insertions(+), 51 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py index 363c74118..e38f804fc 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/cohort_and_label_def.py @@ -17,7 +17,7 @@ def write_log_info(s): max_group_size = None if max_group_size <= 0 else max_group_size np.random.seed(seed) - selected = eval(cohort_config['inclusion'], var_namespace) + selected = eval(cohort_config['inclusion'], {}, var_namespace) y = var_namespace[cohort_config['group_id_vec']] y_vals = np.unique(y) @@ -43,17 +43,18 @@ def write_log_info(s): n += np.sum(group_filter) print("cohort size=", np.sum(selected)) assert np.sum(selected) == n - return var_namespace['file_pattern'][selected].tolist() + return var_namespace[cohort_config['sample_id_col']][selected].tolist() def get_clinical_vars_namespace(df, columns_to_add): var_namespace = {col.replace(' ', '_').replace(',', '_').replace('-', '_'): - df[col] for i, col in enumerate(df.columns) } + df[col].values for col in df.columns } for col_name, col_expression in columns_to_add.items(): - x = eval(col_expression, var_namespace) + x = eval(col_expression, {}, var_namespace) var_namespace[col_name] = x + return var_namespace def get_class_names(label_type:str): diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml index 7394b0a08..cee4a4a7e 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml @@ -18,23 +18,29 @@ columns_to_add: cohort_cancer_prostate_prepostindex: men & (preindex_prostatectomy==0) & (preindex_cancer_prostate | postindex_cancer_prostate | (preindex_neoplasms==0)) cohort_cancer_prostate_postindex: men & (preindex_prostatectomy==0) & (preindex_neoplasms==0) -max_group_size : 500 +max_group_size : 1000 -#target: preindex_prostatectomy -#cohort : men +# target: preindex_prostatectomy +# cohort : men -#target: prostate_hyperplasia_preindex -#cohort: cohort_men_with_prostate +# target: prostate_hyperplasia_preindex +# cohort: cohort_men_with_prostate + +# target: preindex_cancer_prostate +# cohort: cohort_cancer_prostate_preindex + +target: postindex_cancer_prostate +cohort : cohort_cancer_prostate_postindex + +# target: prepostindex_cancer_prostate +# cohort : cohort_cancer_prostate_prepostindex -#target: preindex_cancer_prostate -#cohort: cohort_cancer_prostate_preindex -target: prepostindex_cancer_prostate -cohort : cohort_cancer_prostate_prepostindex label : ${cohort}_${max_group_size}_${target} +sample_id_col : file_pattern paths: - my_ukbb_dir: /projects/msieve_dev3/usr/${oc.env:USER}/fuse_examples/ukbb # /dccstor/mm_hcls/usr/${oc.env:USER}/fuse_example_runs/ukbb_mri_body/ + my_ukbb_dir: /dccstor/mm_hcls/usr/${oc.env:USER}/fuse_example_runs/ukbb_mri_body/ #/projects/msieve_dev3/usr/${oc.env:USER}/fuse_examples/ukbb # data_dir: ${oc.env:UKBB_MRI_BODY_DATA_PATH} cache_dir: ${paths.my_ukbb_dir}/cache model_dir : ${paths.my_ukbb_dir}/model_${label} @@ -58,6 +64,7 @@ train: # filter_out: women group_id_vec: ${target} max_group_size: ${max_group_size} + sample_id_col: ${sample_id_col} series_config: series: Dixon_BH_17s_W station: 4 @@ -81,4 +88,8 @@ infer: infer_folds : [4] target : ${target} columns_to_add: ${columns_to_add} - num_workers : 12 \ No newline at end of file + num_workers : 12 +eval: + cohorts: #[cohort_cancer_prostate_preindex, cohort_cancer_prostate_postindex] + columns_to_add: ${columns_to_add} + sample_id_col: ${sample_id_col} \ No newline at end of file diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index d61342a5d..3d9990e6a 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -16,6 +16,7 @@ from collections import OrderedDict import os +from typing import Union, Dict, Optional, List from fuse_examples.imaging.classification.ukbb_prostate import cohort_and_label_def, files_download_from_cos @@ -112,7 +113,7 @@ def run_train(paths: NDict, train: NDict) -> torch.nn.Module: fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO, list_of_source_files=['../conf/config.yaml']) lgr = logging.getLogger('Fuse') - clinical_data_df, var_namespace = read_clinical_data_file(filename=paths["clinical_data_file"], target=train['target'], + clinical_data_df, var_namespace = read_clinical_data_file(filename=paths["clinical_data_file"], targets=train['target'], columns_to_add=train.get('columns_to_add'), return_var_namespace=True) @@ -381,7 +382,7 @@ def load_model_and_test_data(train: NDict, paths: NDict, infer: NDict): infer_sample_ids = [] for fold in infer["infer_folds"]: infer_sample_ids += folds[fold] - input_source_gt = read_clinical_data_file(paths["clinical_data_file"], infer['target'], infer.get('columns_to_add')) + input_source_gt = read_clinical_data_file(filename=paths["clinical_data_file"], targets=infer['target'], columns_to_add=infer.get('columns_to_add')) test_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=infer['target'], series_config=train['series_config'], input_source_gt=input_source_gt, cache_dir=paths["cache_dir"], num_workers=infer['num_workers'], @@ -437,37 +438,70 @@ def largest_indices(ary, n): ###################################### # Analyze Template ###################################### -def run_eval(paths: NDict, infer: NDict): +def run_eval(paths: NDict, infer: NDict, eval: Optional[Dict]=None): fuse_logger_start(output_path=None, console_verbose_level=logging.INFO) lgr = logging.getLogger('Fuse') lgr.info('Fuse Eval', {'attrs': ['bold', 'underline']}) - # metrics - metrics = OrderedDict([ - ('op', MetricApplyThresholds(pred='model.output.head_0')), # will apply argmax - ('auc', MetricAUCROC(pred='model.output.head_0', target='data.gt.classification')), - ('accuracy', MetricAccuracy(pred='results:metrics.op.cls_pred', target='data.gt.classification')), - ]) + + sample_ids_groups_to_eval = [('all', None)] + if eval is not None and eval.get("cohorts") is not None: + df = read_clinical_data_file(filename=paths["clinical_data_file"], targets=None, columns_to_add=infer.get('columns_to_add')) + sample_ids_series = df[eval['sample_id_col']] + for cohort in eval['cohorts']: + cohort_sample_ids = sample_ids_series[df[cohort]].values + sample_ids_groups_to_eval.append( (cohort, cohort_sample_ids)) # create evaluator evaluator = EvaluatorDefault() - # run - results = evaluator.eval(ids=None, - data=os.path.join(paths["inference_dir"], infer["infer_filename"]), - metrics=metrics, - output_dir=paths["eval_dir"]) - - return results - + pred_key = 'model.output.head_0' + gt_key = 'data.gt.classification' + pred_cls_key = 'pred_cls' + results_all = NDict() + for group_name, group_sample_ids in sample_ids_groups_to_eval: + op_pred_cls_name = f'{group_name}-{pred_cls_key}' + # metrics + metrics = OrderedDict([ + (f'{op_pred_cls_name}', MetricApplyThresholds(pred=pred_key, key_out=pred_cls_key)), # will apply argmax + (f'{group_name}-auc', MetricAUCROC(pred=pred_key, target=gt_key)), + (f'{group_name}-accuracy', MetricAccuracy(pred=f'results:metrics.{op_pred_cls_name}.{pred_cls_key}', target=gt_key)), + ]) + + # run + results = evaluator.eval(ids=group_sample_ids, + data=os.path.join(paths["inference_dir"], infer["infer_filename"]), + metrics=metrics, + output_dir=paths["eval_dir"], + outputfile_basename=f'results_{group_name}', + error_missing_ids=False) + results_all.merge(results) + + return results_all + +def read_clinical_data_file(filename:str, targets: Optional[Union[str, List[str]]]=None, columns_to_add: Optional[List[str]]=None, + return_var_namespace:Optional[bool]=False): + df_org = pd.read_csv(filename) + + var_namespace = cohort_and_label_def.get_clinical_vars_namespace(df_org, columns_to_add) + df = pd.DataFrame.from_dict(var_namespace) + if targets is not None: + if isinstance(targets, str): + targets = [targets] + for target in targets: + df[target] = df[target].astype(int) + if return_var_namespace: + return df, var_namespace + return df @hydra.main(config_path="conf", config_name="config") def main(cfg: DictConfig) -> None: cfg = NDict(OmegaConf.to_object(cfg)) print(cfg) # uncomment if you want to use specific gpus instead of automatically looking for free ones - force_gpus = None # [0] - choose_and_enable_multiple_gpus(cfg["train.trainer.devices"], force_gpus=force_gpus) + if 'train' in cfg["run.running_modes"] or 'infer' in cfg["run.running_modes"] or 'explain' in cfg["run.running_modes"] : + force_gpus = None # [0] + choose_and_enable_multiple_gpus(cfg["train.trainer.devices"], force_gpus=force_gpus) # train if 'train' in cfg["run.running_modes"]: @@ -481,22 +515,13 @@ def main(cfg: DictConfig) -> None: # # evaluate (infer set) if 'eval' in cfg["run.running_modes"]: - run_eval(NDict(cfg["paths"]), NDict(cfg["infer"])) + run_eval(NDict(cfg["paths"]), NDict(cfg["infer"]), cfg.get('eval')) # explain (infer set) if 'explain' in cfg["run.running_modes"]: run_explain(NDict(cfg["train"]), NDict(cfg["paths"]), NDict(cfg["infer"])) -def read_clinical_data_file(filename, target, columns_to_add, return_var_namespace=False): - df = pd.read_csv(filename) - if (target not in df.columns) or return_var_namespace: - var_namespace = cohort_and_label_def.get_clinical_vars_namespace(df, columns_to_add) - assert target in var_namespace - df[target] = var_namespace[target] - df[target] = df[target].astype(int) - if return_var_namespace: - return df, var_namespace - return df + if __name__ == "__main__": sys.argv.append('hydra.run.dir=working_dir') diff --git a/fuse/eval/evaluator.py b/fuse/eval/evaluator.py index 9775e1646..d7486f74d 100644 --- a/fuse/eval/evaluator.py +++ b/fuse/eval/evaluator.py @@ -54,6 +54,8 @@ def eval( id_key: str = "id", batch_size: Optional[int] = None, output_dir: Optional[str] = None, + outputfile_basename: Optional[str] = 'results', + error_missing_ids: Optional[bool] = True ) -> NDict: """ evaluate, return, print and optionally dump results to a file @@ -79,6 +81,8 @@ def eval( data must be an iterator of samples. A batch will be automatically created from batch_size samples :param output_dir: Optional - dump results to directory + :param outputfile_basename: Optional - the basename of the files to which results will be dumped + :param error_missing_ids: Optional - whether to raise error if some of the provided ids are missing :return: dictionary that holds all the results. """ @@ -88,8 +92,11 @@ def eval( ids_df = None # use all samples if batch_size is None: - data_df = self.read_data(data, ids_df, id_key=id_key) + data_df = self.read_data(data, ids_df, id_key=id_key, error_missing_ids=error_missing_ids) data_df["id"] = data_df[id_key] + if not error_missing_ids and (ids_df is not None) and (len(ids_df) >len(data_df.index)): + ids_df = ids_df.loc[data_df["id"].index] + ids = [id for id in ids if id in set(ids_df.index)] # pass data for metric_name, metric in metrics.items(): @@ -139,7 +146,7 @@ def eval( raise # dump results - self.dump_metrics_results(results, output_dir) + self.dump_metrics_results(results, output_dir, outputfile_basename) return results @@ -249,7 +256,7 @@ def read_data( return result_data - def dump_metrics_results(self, metrics_results: NDict, output_dir: str) -> None: + def dump_metrics_results(self, metrics_results: NDict, output_dir: str, filebasename:str) -> None: """ Dump results to a file :param metrics_results: results return from metric.process() @@ -283,9 +290,9 @@ def dump_metrics_results(self, metrics_results: NDict, output_dir: str) -> None: os.makedirs(output_dir, exist_ok=True) # save text results - with open(os.path.join(output_dir, "results.txt"), "w") as output_file: + with open(os.path.join(output_dir, f"{filebasename}.txt"), "w") as output_file: output_file.write(results) # save pickled results - with open(os.path.join(output_dir, "results.pickle"), "wb") as output_file: + with open(os.path.join(output_dir, f"{filebasename}.pickle"), "wb") as output_file: pickle.dump(metrics_results, output_file) diff --git a/fuse/eval/metrics/classification/metrics_thresholding_common.py b/fuse/eval/metrics/classification/metrics_thresholding_common.py index 0d676304f..20a182f06 100644 --- a/fuse/eval/metrics/classification/metrics_thresholding_common.py +++ b/fuse/eval/metrics/classification/metrics_thresholding_common.py @@ -13,6 +13,7 @@ def __init__( pred: str, class_names: Optional[Sequence[str]] = None, operation_point: Union[float, Sequence[Tuple[int, float]], str, None] = None, + key_out: Optional[str] = 'cls_pred', **kwargs ): """ @@ -35,6 +36,7 @@ def __init__( operation_point=operation_point, **kwargs ) + self.key_out = key_out def _apply_thresholds( self, @@ -47,4 +49,4 @@ def _apply_thresholds( # make sure to return the per-sample metric result for the relevant sample ids: per_sample_data = PerSampleData(data=pred_thresholded, ids=ids) - return {"cls_pred": per_sample_data} + return {self.key_out: per_sample_data} diff --git a/fuse/eval/metrics/utils.py b/fuse/eval/metrics/utils.py index 15ca5619b..75ccfa3a2 100644 --- a/fuse/eval/metrics/utils.py +++ b/fuse/eval/metrics/utils.py @@ -25,3 +25,8 @@ def __call__(self, ids: Optional[Sequence[Hashable]] = None) -> Sequence[np.ndar permutation = [original_ids.index(sample_id) for sample_id in required_ids] return [self._data[i] for i in permutation] + + def __str__(self): + n = len(self._ids) + mean_vals = np.mean(self._data) if len(self._data) == n else [np.mean(x) for x in self._data] + return f'{type(self).__name__}: n={n}, mean={mean_vals}' From 515e89cbbaad1e641a19acab33af3c6aefcd48fd Mon Sep 17 00:00:00 2001 From: itaiguez Date: Mon, 8 Aug 2022 13:25:49 +0300 Subject: [PATCH 28/38] fix file path --- .../imaging/classification/ukbb_prostate/runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 3d9990e6a..3e29cb457 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -19,7 +19,7 @@ from typing import Union, Dict, Optional, List from fuse_examples.imaging.classification.ukbb_prostate import cohort_and_label_def, files_download_from_cos - +import pathlib import sys import copy from fuse.eval.metrics.classification.metrics_thresholding_common import MetricApplyThresholds @@ -110,7 +110,7 @@ def run_train(paths: NDict, train: NDict) -> torch.nn.Module: # ============================================================================== # Logger # ============================================================================== - fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO, list_of_source_files=['../conf/config.yaml']) + fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO, list_of_source_files=[os.path.join(pathlib.Path(__file__).parent.resolve(),'conf/config.yaml')]) lgr = logging.getLogger('Fuse') clinical_data_df, var_namespace = read_clinical_data_file(filename=paths["clinical_data_file"], targets=train['target'], @@ -524,5 +524,5 @@ def main(cfg: DictConfig) -> None: if __name__ == "__main__": - sys.argv.append('hydra.run.dir=working_dir') + sys.argv.append('hydra.run.dir=working_dir3') main() From c54bd1a50080e07f1787cac348e817853eb745c1 Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Mon, 8 Aug 2022 15:21:41 +0300 Subject: [PATCH 29/38] merge --- .../classification/ukbb_prostate/conf/config_template.yaml | 2 +- .../imaging/classification/ukbb_prostate/runner.py | 6 +++++- fuse/eval/metrics/utils.py | 3 +-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml index cee4a4a7e..5277fa7b2 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml @@ -51,7 +51,7 @@ paths: eval_dir : ${paths.model_dir}/eval_dir sample_ids : ${paths.data_misc_dir}/sample_ids.csv run: - running_modes : ['train','infer', 'eval'] #['train','infer', 'eval', 'explain'] + running_modes : [train, infer,eval] #[train, infer, eval, explain] train: cos: bucket_name: body-mri-data diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 3e29cb457..700e278bb 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -23,6 +23,7 @@ import sys import copy from fuse.eval.metrics.classification.metrics_thresholding_common import MetricApplyThresholds +from fuse.eval.metrics.stat.metrics_stat_common import MetricUniqueValues import pandas as pd from fuse.utils.utils_debug import FuseDebug from fuse.utils.gpu import choose_and_enable_multiple_gpus @@ -110,7 +111,9 @@ def run_train(paths: NDict, train: NDict) -> torch.nn.Module: # ============================================================================== # Logger # ============================================================================== - fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO, list_of_source_files=[os.path.join(pathlib.Path(__file__).parent.resolve(),'conf/config.yaml')]) + + config_file =os.path.join(os.path.dirname(os.path.realpath(__file__)), 'conf/config.yaml') + fuse_logger_start(output_path=paths["model_dir"], console_verbose_level=logging.INFO, list_of_source_files=[config_file]) lgr = logging.getLogger('Fuse') clinical_data_df, var_namespace = read_clinical_data_file(filename=paths["clinical_data_file"], targets=train['target'], @@ -464,6 +467,7 @@ def run_eval(paths: NDict, infer: NDict, eval: Optional[Dict]=None): # metrics metrics = OrderedDict([ (f'{op_pred_cls_name}', MetricApplyThresholds(pred=pred_key, key_out=pred_cls_key)), # will apply argmax + (f'{group_name}-gt-vals', MetricUniqueValues(key=gt_key)), (f'{group_name}-auc', MetricAUCROC(pred=pred_key, target=gt_key)), (f'{group_name}-accuracy', MetricAccuracy(pred=f'results:metrics.{op_pred_cls_name}.{pred_cls_key}', target=gt_key)), ]) diff --git a/fuse/eval/metrics/utils.py b/fuse/eval/metrics/utils.py index 75ccfa3a2..e8ed2a82a 100644 --- a/fuse/eval/metrics/utils.py +++ b/fuse/eval/metrics/utils.py @@ -28,5 +28,4 @@ def __call__(self, ids: Optional[Sequence[Hashable]] = None) -> Sequence[np.ndar def __str__(self): n = len(self._ids) - mean_vals = np.mean(self._data) if len(self._data) == n else [np.mean(x) for x in self._data] - return f'{type(self).__name__}: n={n}, mean={mean_vals}' + return f'{type(self).__name__}: size={n}' From 71bd518c6292ee989329166b8f28fd2243ca73de Mon Sep 17 00:00:00 2001 From: Michal Ozery-Flato Date: Tue, 9 Aug 2022 16:23:12 +0300 Subject: [PATCH 30/38] create file for creating cohort for genetic features --- .../create_sample_id_file_for_genetic_main.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 examples/fuse_examples/imaging/classification/ukbb_prostate/create_sample_id_file_for_genetic_main.py diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/create_sample_id_file_for_genetic_main.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/create_sample_id_file_for_genetic_main.py new file mode 100644 index 000000000..4e50c6a22 --- /dev/null +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/create_sample_id_file_for_genetic_main.py @@ -0,0 +1,38 @@ +from hydra import compose, initialize +from fuse.utils import file_io +import numpy as np +import pandas as pd + + +def main(): + cfg_overrides = ['target=prepostindex_cancer_prostate', 'cohort=cohort_cancer_prostate_prepostindex', 'max_group_size=1000'] + + initialize(config_path="conf", job_name="test_app") # only allows relative path + cfg = compose(config_name="config", overrides=cfg_overrides) + + input_split_file = cfg["paths"]["data_split_filename"] + print("using split file", input_split_file) + output_file = input_split_file.replace(".pkl", "_for_genetics.csv") #.csv + + folds = file_io.load_pickle(input_split_file) + assert len(folds) == 5 + print("Using patients in folds 0, 1,2 as as train") + df_list = [] + for fold in range(5): + sample_ids = folds[fold] + patient_ids = [s.split('_')[0] for s in sample_ids] + patient_ids = np.asarray(list(set(patient_ids))) + df = pd.DataFrame(patient_ids.reshape(-1, 1), columns=['eid']) + df['is_test'] = 0 if fold <= 2 else 1 + df_list.append(df) + + df_all = pd.concat(df_list, axis=0) + assert df_all.eid.nunique() == df_all.shape[0] + df_all.to_csv(output_file, index=False) + print("wrote", output_file) + + df_test = pd.read_csv(output_file) + print(df_test.groupby('is_test').count()) + +if __name__ == '__main__': + main() From a1606915eafe708ae152fd52ddb8cb05784716a3 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Tue, 9 Aug 2022 17:47:32 +0300 Subject: [PATCH 31/38] added op for volume around point, explain print point --- .../classification/ukbb_prostate/runner.py | 47 ++++++++++++------- fuseimg/datasets/ukbb_neck_to_knee.py | 20 +++++++- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 700e278bb..e9a3cea5e 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -307,13 +307,13 @@ def run_explain(train: NDict, paths: NDict, infer: NDict): pl_module, _, infer_dataloader = load_model_and_test_data(train, paths, infer) model = ModelWrapDictToSeq(pl_module._model, output_key='head_0') - model = medcam.inject(model, output_dir="attention_maps", backend='gcam', save_maps=True, layer='auto', return_attention=True) + model = medcam.inject(model, output_dir=os.path.join('working_dir','attention_maps'), backend='gcam', save_maps=True, layer='auto', return_attention=True) for i, batch in enumerate(infer_dataloader): logit, attention_map = model(batch['data.input.img'], batch['data.gt.classification']) attention_map = attention_map[0][0].numpy() batch['data.input.img'] = batch['data.input.img'][0][0].numpy() original_attention_map = nib.load( - os.path.join('attention_maps', 'model.backbone.layer4', 'attention_map_' + str(i) + '_0_0.nii.gz')).get_fdata() + os.path.join('working_dir','attention_maps',os.listdir(os.path.join('working_dir','attention_maps'))[0], 'attention_map_' + str(i) + '_0_0.nii.gz')).get_fdata() original_transposed = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) scale_ratio = [original_transposed.shape[i] / value for i, value in enumerate(original_attention_map.shape)] points = [] @@ -340,21 +340,32 @@ def run_explain(train: NDict, paths: NDict, infer: NDict): attention_map = show_attention_on_image(batch['data.input.img'], attention_map) batch['data.input.img'] = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) original = nib.Nifti1Image(batch['data.input.img'], affine=np.eye(4)) - volume_box = np.zeros(batch['data.input.img'].shape) - for point in points: - bouding_box_indices = [(int((point[i] - 1) * scale_ratio[i]), int((point[i] + 1) * scale_ratio[i])) for i in range(3)] - print(bouding_box_indices) - volume_box = draw_bbox_around_volume(volume_box, bouding_box_indices, 1) - # bouding_box_indices = [(int((max_volume[i]-1)*scale_ratio[i]),int((max_volume[i]+1)*scale_ratio[i])) for i in range(3)] - # volume_box = draw_bbox_around_volume(volume_box,max_volume, 2) - volume_box = nib.Nifti1Image(volume_box, affine=np.eye(4)) - nib.save(original, filename=os.path.join('attention_maps', 'original_' + str(i) + '_' + batch['data.input.img_path'][0] + '_label_=' + str( - batch['data.gt.classification']) + '.nii.gz')) - nib.save(volume_box, filename=os.path.join('attention_maps', 'maxvolume_' + str(i) + '_' + batch['data.input.img_path'][0] + '_label_=' + str( - batch['data.gt.classification']) + '.nii.gz')) - nib.save(attention_map, filename=os.path.join('attention_maps', - 'attention_' + str(i) + '_' + batch['data.input.img_path'][0] + '_label_=' + str( - batch['data.gt.classification']) + '.nii.gz')) + # volume_box = np.zeros(batch['data.input.img'].shape) + # radiuses = np.array([30.0,30.0,5.0]) + points = np.array(points) + big_point = [float(np.mean(points[:,i])[0])*scale_ratio[i] for i in range(3)] + print(big_point) + # bouding_box_indices = [(max(int((big_point[i]-radiuses[i])),0),min(int((big_point[i]+radiuses[i])),volume_box.shape[i]-1)) for i in range(3)] + # print(bouding_box_indices) + # volume_box = draw_bbox_around_volume(volume_box,bouding_box_indices, 1) + # for point in points: + # bouding_box_indices = [(max(int((point[i]-1)*scale_ratio[i]),0),min(int((point[i]+1)*scale_ratio[i]),volume_box.shape[i]-1)) for i in range(3)] + # print(bouding_box_indices) + # volume_box = draw_bbox_around_volume(volume_box,bouding_box_indices, 1) + # if len(points) > 1 : + # points = np.array(points) + # big_point = [[ min(points[:,i]), max(points[:,i])] for i in range(3)] + # for axis in big_point : + # if abs(axis[0] - axis[1] ) < 2 : + # axis[0] = axis[0] - 1 + # axis[1] = axis[1] + 1 + # print("big_point",big_point) + # bouding_box_indices = [(int((big_point[i][0])*scale_ratio[i]),int((big_point[i][1])*scale_ratio[i])) for i in range(3)] + # print("bouding_box_indices",bouding_box_indices) + # volume_box = draw_bbox_around_volume(volume_box,bouding_box_indices, 2) + identifier = batch['data.input.img_path'][0].replace('*','') + nib.save(original, filename=os.path.join('attention_maps','original_'+str(i)+'_'+identifier+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) + nib.save(attention_map, filename=os.path.join('attention_maps','attention_'+str(i)+'_'+identifier+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) def draw_bbox_around_volume(volume_box, bouding_box_indices, color): @@ -528,5 +539,5 @@ def main(cfg: DictConfig) -> None: if __name__ == "__main__": - sys.argv.append('hydra.run.dir=working_dir3') + sys.argv.append('hydra.run.dir=working_dir') main() diff --git a/fuseimg/datasets/ukbb_neck_to_knee.py b/fuseimg/datasets/ukbb_neck_to_knee.py index 48d62284f..c65f20c66 100644 --- a/fuseimg/datasets/ukbb_neck_to_knee.py +++ b/fuseimg/datasets/ukbb_neck_to_knee.py @@ -12,7 +12,7 @@ from fuse.data.ops.op_base import OpBase from fuse.utils import NDict from functools import partial -from typing import Hashable, Optional, Sequence +from typing import Hashable, List, Optional, Sequence import torch import pandas as pd import numpy as np @@ -122,7 +122,24 @@ def __call__(self, sample_dict: NDict, series_config : NDict, key_in:str, key_ou sample_dict[unique_id_out] = dcm_unique shutil.rmtree(dirpath) return sample_dict +class OpLoadVolumeAroundCenterPoint(OpBase): + ''' + loads a volume (defined by radiuses) from given image around the given center point + ''' + def __init__(self, **kwargs): + super().__init__(**kwargs) + def __call__(self, sample_dict: NDict , key_in:str, key_out: str, center_point : np.array= None, radiuses : np.array= np.array([5.0,30.0,30.0])) -> NDict: + ''' + + ''' + img = sample_dict[key_in] + if center_point == None : + center_point = [i/2 for i in img.shape] + bouding_box_indices = [(max(int((center_point[i]-radiuses[i])),0),min(int((center_point[i]+radiuses[i])),img.shape[i]-1)) for i in range(3)] + img = img[bouding_box_indices[0][0]:bouding_box_indices[0][1],bouding_box_indices[1][0]:bouding_box_indices[1][1],bouding_box_indices[2][0]:bouding_box_indices[2][1]] + sample_dict[key_out] = img + return sample_dict class UKBB: """ # dataset that contains MRI nech-to-knee and metadata from UK patients @@ -166,6 +183,7 @@ def static_pipeline(data_dir: str, series_config: NDict) -> PipelineDefault: # decoding sample ID (OpUKBBSampleIDDecode(), dict()), # will save image and seg path to "data.input.img_path", "data.gt.seg_path" (OpLoadUKBBZip(data_dir), dict(key_in="data.input.img_path", key_out="data.input.img", unique_id_out="data.ID", series_config=series_config)), + (OpLoadVolumeAroundCenterPoint(), dict(key_in="data.input.img", key_out="data.input.img")), # (OpLambda(partial(skimage.transform.resize, # output_shape=(32, 256, 256), # mode='reflect', From 85739ad80aa64339eff708efda46868c8c375b38 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Wed, 10 Aug 2022 16:20:13 +0300 Subject: [PATCH 32/38] added support for centerpoints inference and usage, fixed warning in pipe --- .../ukbb_prostate/conf/config_template.yaml | 21 ++- .../classification/ukbb_prostate/runner.py | 152 ++++-------------- fuseimg/datasets/ukbb_neck_to_knee.py | 44 +++-- 3 files changed, 80 insertions(+), 137 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml index 5277fa7b2..2f93cd7ac 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/conf/config_template.yaml @@ -3,6 +3,7 @@ columns_to_add: men: is_female == 0 preindex_cancer_prostate : preindex_C61_Malignant_neoplasm_of_prostate > 0 postindex_cancer_prostate : postindex_C61_Malignant_neoplasm_of_prostate > 0 + prostate_hyperplasia_preindex : preindex_prostate_hyperplasia > 0 prepostindex_cancer_prostate: preindex_cancer_prostate | postindex_cancer_prostate preindex_cancer_male_genital : preindex_C60_C63_Malignant_neoplasms_of_male_genital_organs > 0 postindex_cancer_male_genital : postindex_C60_C63_Malignant_neoplasms_of_male_genital_organs > 0 @@ -23,14 +24,14 @@ max_group_size : 1000 # target: preindex_prostatectomy # cohort : men -# target: prostate_hyperplasia_preindex -# cohort: cohort_men_with_prostate +target: prostate_hyperplasia_preindex +cohort: cohort_men_with_prostate # target: preindex_cancer_prostate # cohort: cohort_cancer_prostate_preindex -target: postindex_cancer_prostate -cohort : cohort_cancer_prostate_postindex +# target: postindex_cancer_prostate +# cohort : cohort_cancer_prostate_postindex # target: prepostindex_cancer_prostate # cohort : cohort_cancer_prostate_prepostindex @@ -92,4 +93,14 @@ infer: eval: cohorts: #[cohort_cancer_prostate_preindex, cohort_cancer_prostate_postindex] columns_to_add: ${columns_to_add} - sample_id_col: ${sample_id_col} \ No newline at end of file + sample_id_col: ${sample_id_col} +explain: + sample_ids : ${paths.data_misc_dir}/sample_ids.csv + attention_dir : attention + centerpoints_dir_name : centerpoints + infer_filename : validation_set_infer.gz + checkpoint : best_epoch.ckpt + target : ${target} + columns_to_add: ${columns_to_add} + num_workers : 12 + debug: False \ No newline at end of file diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index e9a3cea5e..9e025c760 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -15,10 +15,11 @@ """ from collections import OrderedDict +from genericpath import isdir import os from typing import Union, Dict, Optional, List -from fuse_examples.imaging.classification.ukbb_prostate import cohort_and_label_def, files_download_from_cos +from fuse_examples.imaging.classification.ukbb_prostate import cohort_and_label_def, files_download_from_cos, explain_with_gradcam import pathlib import sys import copy @@ -56,11 +57,6 @@ from fuse.dl.models import ModelMultiHead from fuse.dl.models.heads.head_3D_classifier import Head3DClassifier -from fuse.dl.models.model_wrapper import ModelWrapDictToSeq -from medcam import medcam -import numpy as np -from cv2 import cv2 -import nibabel as nib assert "UKBB_MRI_BODY_DATA_PATH" in os.environ, "Expecting environment variable UKBB_MRI_BODY_DATA_PATH to be set. Follow the instruction in example README file to download and set the path to the data" ########################################## @@ -299,84 +295,35 @@ def run_infer(train: NDict, paths: NDict, infer: NDict): ###################################### # Explain Template ###################################### -def run_explain(train: NDict, paths: NDict, infer: NDict): +def run_explain(train: NDict, paths: NDict, explain: NDict): fuse_logger_start(output_path=None, console_verbose_level=logging.INFO) lgr = logging.getLogger('Fuse') lgr.info('Fuse Explain', {'attrs': ['bold', 'underline']}) - pl_module, _, infer_dataloader = load_model_and_test_data(train, paths, infer) - - model = ModelWrapDictToSeq(pl_module._model, output_key='head_0') - model = medcam.inject(model, output_dir=os.path.join('working_dir','attention_maps'), backend='gcam', save_maps=True, layer='auto', return_attention=True) - for i, batch in enumerate(infer_dataloader): - logit, attention_map = model(batch['data.input.img'], batch['data.gt.classification']) - attention_map = attention_map[0][0].numpy() - batch['data.input.img'] = batch['data.input.img'][0][0].numpy() - original_attention_map = nib.load( - os.path.join('working_dir','attention_maps',os.listdir(os.path.join('working_dir','attention_maps'))[0], 'attention_map_' + str(i) + '_0_0.nii.gz')).get_fdata() - original_transposed = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) - scale_ratio = [original_transposed.shape[i] / value for i, value in enumerate(original_attention_map.shape)] - points = [] - max_value = original_attention_map.argmax() - current_max = max_value - while True: - current_max = original_attention_map.argmax() - max_volume = np.unravel_index(current_max, original_attention_map.shape) - if current_max < max_value: - break - points.append(np.asarray(max_volume)) - original_attention_map[max_volume] = 0.0 - print("sample ", i) - print("points", points) - max_volume = points[0] - center = [int(index / 2) for index in original_attention_map.shape] - min_dist = 99999999999999999 - for point in points: - dist = np.linalg.norm(point - center) - if dist < min_dist: - min_dist = dist - max_volume = point - print("best", max_volume) - attention_map = show_attention_on_image(batch['data.input.img'], attention_map) - batch['data.input.img'] = np.transpose(batch['data.input.img'], axes=(1, 2, 0)) - original = nib.Nifti1Image(batch['data.input.img'], affine=np.eye(4)) - # volume_box = np.zeros(batch['data.input.img'].shape) - # radiuses = np.array([30.0,30.0,5.0]) - points = np.array(points) - big_point = [float(np.mean(points[:,i])[0])*scale_ratio[i] for i in range(3)] - print(big_point) - # bouding_box_indices = [(max(int((big_point[i]-radiuses[i])),0),min(int((big_point[i]+radiuses[i])),volume_box.shape[i]-1)) for i in range(3)] - # print(bouding_box_indices) - # volume_box = draw_bbox_around_volume(volume_box,bouding_box_indices, 1) - # for point in points: - # bouding_box_indices = [(max(int((point[i]-1)*scale_ratio[i]),0),min(int((point[i]+1)*scale_ratio[i]),volume_box.shape[i]-1)) for i in range(3)] - # print(bouding_box_indices) - # volume_box = draw_bbox_around_volume(volume_box,bouding_box_indices, 1) - # if len(points) > 1 : - # points = np.array(points) - # big_point = [[ min(points[:,i]), max(points[:,i])] for i in range(3)] - # for axis in big_point : - # if abs(axis[0] - axis[1] ) < 2 : - # axis[0] = axis[0] - 1 - # axis[1] = axis[1] + 1 - # print("big_point",big_point) - # bouding_box_indices = [(int((big_point[i][0])*scale_ratio[i]),int((big_point[i][1])*scale_ratio[i])) for i in range(3)] - # print("bouding_box_indices",bouding_box_indices) - # volume_box = draw_bbox_around_volume(volume_box,bouding_box_indices, 2) - identifier = batch['data.input.img_path'][0].replace('*','') - nib.save(original, filename=os.path.join('attention_maps','original_'+str(i)+'_'+identifier+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) - nib.save(attention_map, filename=os.path.join('attention_maps','attention_'+str(i)+'_'+identifier+'_label_='+str(batch['data.gt.classification'])+'.nii.gz')) - - -def draw_bbox_around_volume(volume_box, bouding_box_indices, color): - for slice in range(bouding_box_indices[2][0], bouding_box_indices[2][1] - 1): - for x in range(bouding_box_indices[0][0], bouding_box_indices[0][1] - 1): - volume_box[x, bouding_box_indices[1][0], slice] = color - volume_box[x, bouding_box_indices[1][1], slice] = color - for y in range(bouding_box_indices[1][0], bouding_box_indices[1][1] - 1): - volume_box[bouding_box_indices[0][0], y, slice] = color - volume_box[bouding_box_indices[0][1], y, slice] = color - return volume_box + checkpoint_file = os.path.join(paths["model_dir"], explain["checkpoint"]) + lgr.info(f'checkpoint_file={checkpoint_file}', {'color': 'magenta'}) + + # load model + lgr.info('Model:', {'attrs': 'bold'}) + model, pl_trainer, num_classes, gt_label, class_names = create_model(train, paths) + lgr.info('Model: Done', {'attrs': 'bold'}) + + input_source_gt = read_clinical_data_file(filename=paths["clinical_data_file"], targets=explain['target'], columns_to_add=explain.get('columns_to_add')) + + infer_sample_ids = pd.read_csv(paths["sample_ids"])['sample_id'].to_list() + test_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=explain['target'], series_config=train['series_config'], + input_source_gt=input_source_gt, cache_dir=paths["cache_dir"], num_workers=explain['num_workers'], + sample_ids=infer_sample_ids, train=False) + ## Create dataloader + infer_dataloader = DataLoader(dataset=test_dataset, + shuffle=False, drop_last=False, + collate_fn=CollateDefault(), + num_workers=explain["num_workers"]) + + pl_module = LightningModuleDefault.load_from_checkpoint(checkpoint_file, model_dir=paths["model_dir"], model=model, map_location="cpu", + strict=True) + + explain_with_gradcam.save_attention_centerpoint(pl_module , infer_dataloader , explain) def load_model_and_test_data(train: NDict, paths: NDict, infer: NDict): @@ -413,41 +360,6 @@ def load_model_and_test_data(train: NDict, paths: NDict, infer: NDict): return pl_module, pl_trainer, infer_dataloader -def show_attention_on_image(img: np.ndarray, - mask: np.ndarray, - colormap: int = cv2.COLORMAP_JET) -> np.ndarray: - """ This function overlays the cam mask on the image as an heatmap. - reference for fusing heat map and original image : https://github.com/jacobgil/pytorch-grad-cam/blob/61e9babae8600351b02b6e90864e4807f44f2d4a/pytorch_grad_cam/utils/image.py#L25 - By default the heatmap is in BGR format. - :param img: The base image in RGB or BGR format. - :param mask: The cam mask. - :param colormap: The OpenCV colormap to be used. - :returns: The default image with the cam overlay. - """ - heatmaps = [np.float32(cv2.applyColorMap(np.uint8(255 * mask[i]), colormap)) / 255 for i in range(mask.shape[0])] - images = [cv2.cvtColor(img[i], cv2.COLOR_GRAY2RGB) for i in range(img.shape[0])] - RGB_DTYPE = np.dtype([('R', 'u1'), ('G', 'u1'), ('B', 'u1')]) - cams = [] - for i in range(len(images)): - if np.max(images[i]) > 1: - images[i] *= (1.0 / images[i].max()) - - cam = heatmaps[i] + images[i] - cam = cam / np.max(cam) - cam = np.uint8(255 * cam) - cam = cam.view(RGB_DTYPE) - cams.append(cam) - nifti = nib.Nifti1Image(np.concatenate(cams, axis=2), np.eye(4)) - return nifti - - -def largest_indices(ary, n): - """Returns the n largest indices from a numpy array.""" - flat = ary.flatten() - indices = np.argpartition(flat, -n)[-n:] - indices = indices[np.argsort(-flat[indices])] - return np.unravel_index(indices, ary.shape) - ###################################### # Analyze Template @@ -520,24 +432,24 @@ def main(cfg: DictConfig) -> None: # train if 'train' in cfg["run.running_modes"]: - run_train(NDict(cfg["paths"]), NDict(cfg["train"])) + run_train(cfg["paths"], cfg["train"]) else: assert "Expecting train mode to be set." # infer (infer set) if 'infer' in cfg["run.running_modes"]: - run_infer(NDict(cfg["train"]), NDict(cfg["paths"]), NDict(cfg["infer"])) + run_infer(cfg["train"], cfg["paths"], cfg["infer"]) # # evaluate (infer set) if 'eval' in cfg["run.running_modes"]: - run_eval(NDict(cfg["paths"]), NDict(cfg["infer"]), cfg.get('eval')) + run_eval(cfg["paths"], cfg["infer"], cfg.get('eval')) # explain (infer set) if 'explain' in cfg["run.running_modes"]: - run_explain(NDict(cfg["train"]), NDict(cfg["paths"]), NDict(cfg["infer"])) + run_explain(cfg["train"], cfg["paths"], cfg["explain"]) if __name__ == "__main__": - sys.argv.append('hydra.run.dir=working_dir') + sys.argv.append('hydra.run.dir=working_dir2') main() diff --git a/fuseimg/datasets/ukbb_neck_to_knee.py b/fuseimg/datasets/ukbb_neck_to_knee.py index c65f20c66..31e7d6c1c 100644 --- a/fuseimg/datasets/ukbb_neck_to_knee.py +++ b/fuseimg/datasets/ukbb_neck_to_knee.py @@ -97,7 +97,7 @@ def __call__(self, sample_dict: NDict, series_config : NDict, key_in:str, key_ou station_list.append(i+1) dicom_tags['station'] = station_list try : - dcm_unique = dicom_tags[dicom_tags['station'] == series_config['station']][dicom_tags['series'] == series_config['series']]['dcm_unique'].iloc[0] + dcm_unique = dicom_tags[(dicom_tags['station'] == series_config['station']) & (dicom_tags['series'] == series_config['series'])]['dcm_unique'].iloc[0] except: print("requested file",zip_filename,"series description",series_config, "not found!!!") return None @@ -122,6 +122,23 @@ def __call__(self, sample_dict: NDict, series_config : NDict, key_in:str, key_ou sample_dict[unique_id_out] = dcm_unique shutil.rmtree(dirpath) return sample_dict + +class OpLoadCenterPoint(OpBase): + ''' + loads a zip and select a sequence and a station from it + ''' + def __init__(self, dir_path: str, **kwargs): + super().__init__(**kwargs) + self._dir_path = dir_path + + def __call__(self, sample_dict: NDict, key_in:str, key_out: str) -> NDict: + ''' + + ''' + filename = os.path.join(self._dir_path,sample_dict[key_in]) + f = open(filename, 'r').read() + sample_dict[key_out]= np.loadtxt(f, delimiter=',', usecols=(0, 2)) + return sample_dict class OpLoadVolumeAroundCenterPoint(OpBase): ''' loads a volume (defined by radiuses) from given image around the given center point @@ -129,14 +146,16 @@ class OpLoadVolumeAroundCenterPoint(OpBase): def __init__(self, **kwargs): super().__init__(**kwargs) - def __call__(self, sample_dict: NDict , key_in:str, key_out: str, center_point : np.array= None, radiuses : np.array= np.array([5.0,30.0,30.0])) -> NDict: + def __call__(self, sample_dict: NDict , key_in:str, key_out: str, centerpoint : str = None, radiuses : np.array= np.array([5.0,30.0,30.0])) -> NDict: ''' ''' img = sample_dict[key_in] - if center_point == None : - center_point = [i/2 for i in img.shape] - bouding_box_indices = [(max(int((center_point[i]-radiuses[i])),0),min(int((center_point[i]+radiuses[i])),img.shape[i]-1)) for i in range(3)] + if centerpoint == None : + center_point_cord = [i/2 for i in img.shape] + else: + center_point_cord = sample_dict[centerpoint] + bouding_box_indices = [(max(int((center_point_cord[i]-radiuses[i])),0),min(int((center_point_cord[i]+radiuses[i])),img.shape[i]-1)) for i in range(3)] img = img[bouding_box_indices[0][0]:bouding_box_indices[0][1],bouding_box_indices[1][0]:bouding_box_indices[1][1],bouding_box_indices[2][0]:bouding_box_indices[2][1]] sample_dict[key_out] = img return sample_dict @@ -174,7 +193,7 @@ def get_existing_sample_ids(path: str): return existing_sample_ids @staticmethod - def static_pipeline(data_dir: str, series_config: NDict) -> PipelineDefault: + def static_pipeline(data_dir: str, series_config: NDict, centerpoint_dir: str = None) -> PipelineDefault: """ Get suggested static pipeline (which will be cached), typically loading the data plus design choices that we won't experiment with. :param data_path: path to original kits21 data (can be downloaded by KITS21.download()) @@ -183,12 +202,13 @@ def static_pipeline(data_dir: str, series_config: NDict) -> PipelineDefault: # decoding sample ID (OpUKBBSampleIDDecode(), dict()), # will save image and seg path to "data.input.img_path", "data.gt.seg_path" (OpLoadUKBBZip(data_dir), dict(key_in="data.input.img_path", key_out="data.input.img", unique_id_out="data.ID", series_config=series_config)), - (OpLoadVolumeAroundCenterPoint(), dict(key_in="data.input.img", key_out="data.input.img")), - # (OpLambda(partial(skimage.transform.resize, - # output_shape=(32, 256, 256), - # mode='reflect', - # anti_aliasing=True, - # preserve_range=True)), dict(key="data.input.img")), + # (OpLoadCenterPoint(centerpoint_dir), dict(key_in="data.input.img_path", key_out="data.input.centerpoint")), + # (OpLoadVolumeAroundCenterPoint(), dict(key_in="data.input.img", key_out="data.input.img" , centerpoint = "data.input.centerpoint")), + (OpLambda(partial(skimage.transform.resize, + output_shape=(44, 174, 224), + mode='reflect', + anti_aliasing=True, + preserve_range=True)), dict(key="data.input.img")), (OpNormalizeAgainstSelf(), dict(key="data.input.img")), (OpToNumpy(), dict(key='data.input.img', dtype=np.float32)), # (OpLambda(partial(dump, filename="first.png", slice = 25)), dict(key="data.input.img")), From 0c87e6fbeee7eff3e02cca37cf0005039545e857 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Wed, 10 Aug 2022 20:52:47 +0300 Subject: [PATCH 33/38] added parallel support for explain --- .../classification/ukbb_prostate/runner.py | 4 +-- fuseimg/datasets/ukbb_neck_to_knee.py | 31 +++++++++++-------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 9e025c760..c1498b819 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -312,10 +312,10 @@ def run_explain(train: NDict, paths: NDict, explain: NDict): infer_sample_ids = pd.read_csv(paths["sample_ids"])['sample_id'].to_list() test_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=explain['target'], series_config=train['series_config'], - input_source_gt=input_source_gt, cache_dir=paths["cache_dir"], num_workers=explain['num_workers'], + input_source_gt=input_source_gt, cache_dir=paths['cache_dir'], num_workers=explain['num_workers'], sample_ids=infer_sample_ids, train=False) ## Create dataloader - infer_dataloader = DataLoader(dataset=test_dataset, + infer_dataloader = DataLoader(dataset=test_dataset, batch_size=explain['batch_size'], shuffle=False, drop_last=False, collate_fn=CollateDefault(), num_workers=explain["num_workers"]) diff --git a/fuseimg/datasets/ukbb_neck_to_knee.py b/fuseimg/datasets/ukbb_neck_to_knee.py index 31e7d6c1c..e94381984 100644 --- a/fuseimg/datasets/ukbb_neck_to_knee.py +++ b/fuseimg/datasets/ukbb_neck_to_knee.py @@ -292,19 +292,24 @@ def dataset( static_pipeline = UKBB.static_pipeline(data_dir, series_config) dynamic_pipeline = UKBB.dynamic_pipeline(input_source_gt, target,train=train) - - cacher = SamplesCacher(f'cmmd_cache_ver', - static_pipeline, - cache_dirs=[cache_dir], - restart_cache=reset_cache, - audit_first_sample=False, audit_rate=None, - workers=num_workers) - - my_dataset = DatasetDefault(sample_ids=sample_ids, - static_pipeline=static_pipeline, - dynamic_pipeline=dynamic_pipeline, - cacher=cacher, - ) + if cache_dir != None : + cacher = SamplesCacher(f'cmmd_cache_ver', + static_pipeline, + cache_dirs=[cache_dir], + restart_cache=reset_cache, + audit_first_sample=False, audit_rate=None, + workers=num_workers) + + my_dataset = DatasetDefault(sample_ids=sample_ids, + static_pipeline=static_pipeline, + dynamic_pipeline=dynamic_pipeline, + cacher=cacher, + ) + else: + my_dataset = DatasetDefault(sample_ids=sample_ids, + static_pipeline=static_pipeline, + dynamic_pipeline=dynamic_pipeline, + ) my_dataset.create(num_workers = num_workers) return my_dataset From 8aec7ea9451fdbf0067c149c61ec57bce9b400b1 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Wed, 10 Aug 2022 20:53:59 +0300 Subject: [PATCH 34/38] split explain to new file --- .../ukbb_prostate/explain_with_gradcam.py | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py new file mode 100644 index 000000000..01cf2b216 --- /dev/null +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py @@ -0,0 +1,97 @@ +from fuse.dl.models.model_wrapper import ModelWrapDictToSeq +from fuse.utils.ndict import NDict +import os +import numpy as np +from fuse.dl.models.model_wrapper import ModelWrapDictToSeq +from medcam import medcam +from cv2 import cv2 +import nibabel as nib +from multiprocessing import Pool +from tqdm import tqdm +from pqdm.processes import pqdm +from functools import partial + +def save_attention_centerpoint(pl_module,infer_dataloader , explain: NDict) : + if not os.path.isdir(explain['centerpoints_dir_name']): + os.mkdir(explain['centerpoints_dir_name']) + if not os.path.isdir(explain['attention_dir']): + os.mkdir(explain['attention_dir']) + model = ModelWrapDictToSeq(pl_module._model, output_key='head_0') + model = medcam.inject(model, output_dir=os.path.join(explain['attention_dir']),label=explain['label'], backend='gcam', save_maps=True, layer='auto', return_attention=True) + for i, batch in tqdm(enumerate(infer_dataloader)): + logit, attention_map = model(batch['data.input.img'], batch['data.gt.classification']) + params = [] + for j in range(0,batch['data.input.img'].shape[0]) : + params.append({"i": i ,"logit":logit, "attention_map":attention_map ,"j": j ,"batch" : batch , "explain":explain }) + pqdm(params , run_gradcam_on_sample, n_jobs = explain["num_workers"]) +def run_gradcam_on_sample(params): + i = params["i"] + logit = params["logit"] + attention_map = params["attention_map"] + j = params["j"] + batch = params["j"] + explain = params["explain"] + sample = batch['data.input.img'][j][0] + attention_map = attention_map[j][0].numpy() + sample = sample.numpy() + layer_folder = [name for name in os.listdir(explain['attention_dir']) if os.path.isdir(os.path.join(explain['attention_dir'], name))][0] + original_attention_name = os.path.join(explain['attention_dir'],layer_folder, 'attention_map_' + str(i) + '_'+str(j)+'_0.nii.gz') + original_attention_map = nib.load(original_attention_name).get_fdata() + original_transposed = np.transpose(sample, axes=(1, 2, 0)) + scale_ratio = [original_transposed.shape[i] / value for i, value in enumerate(original_attention_map.shape)] + points = [] + max_value = original_attention_map.argmax() + current_max = max_value + while True: + current_max = original_attention_map.argmax() + max_volume = np.unravel_index(current_max, original_attention_map.shape) + if current_max < max_value: + break + points.append(np.asarray(max_volume)) + original_attention_map[max_volume] = 0.0 + points = np.array(points) + big_point = np.array([int(np.mean(points[:,i])*scale_ratio[i]) for i in range(3)]) + identifier = batch['data.input.img_path'][j] + with open(os.path.join(explain['centerpoints_dir_name'],identifier+'.npy'), 'wb') as f: + np.save(f, big_point) + center = np.array([int(index / 2) for index in original_transposed.shape]) + dist = np.linalg.norm(big_point - center) + if logit.detach().numpy()[j][0] < 0.9 and dist > 40 : + print(batch['data.input.img_path'][0],"suspected as wrong") + print("logit",logit,"distate from center",dist,"center=",center) + if explain['debug'] == True: + identifier = batch['data.input.img_path'][0].replace('*','') + attention_map = show_attention_on_image(sample, attention_map) + sample = np.transpose(sample, axes=(1, 2, 0)) + original = nib.Nifti1Image(sample, affine=np.eye(4)) + nib.save(original, filename=os.path.join(explain['attention_dir'],'original_'+identifier+'_label_='+str(batch['data.gt.classification'][j])+str(big_point)+'.nii.gz')) + nib.save(attention_map, filename=os.path.join(explain['attention_dir'],'attention_'+identifier+'_label_='+str(batch['data.gt.classification'][j])+'.nii.gz')) + + + +def show_attention_on_image(img: np.ndarray, + mask: np.ndarray, + colormap: int = cv2.COLORMAP_JET) -> np.ndarray: + """ This function overlays the cam mask on the image as an heatmap. + reference for fusing heat map and original image : https://github.com/jacobgil/pytorch-grad-cam/blob/61e9babae8600351b02b6e90864e4807f44f2d4a/pytorch_grad_cam/utils/image.py#L25 + By default the heatmap is in BGR format. + :param img: The base image in RGB or BGR format. + :param mask: The cam mask. + :param colormap: The OpenCV colormap to be used. + :returns: The default image with the cam overlay. + """ + heatmaps = [np.float32(cv2.applyColorMap(np.uint8(255 * mask[i]), colormap)) / 255 for i in range(mask.shape[0])] + images = [cv2.cvtColor(img[i], cv2.COLOR_GRAY2RGB) for i in range(img.shape[0])] + RGB_DTYPE = np.dtype([('R', 'u1'), ('G', 'u1'), ('B', 'u1')]) + cams = [] + for i in range(len(images)): + if np.max(images[i]) > 1: + images[i] *= (1.0 / images[i].max()) + + cam = heatmaps[i] + images[i] + cam = cam / np.max(cam) + cam = np.uint8(255 * cam) + cam = cam.view(RGB_DTYPE) + cams.append(cam) + nifti = nib.Nifti1Image(np.concatenate(cams, axis=2), np.eye(4)) + return nifti From 2a72c8a8e52b4b4c218e27c021281dbd005e9fec Mon Sep 17 00:00:00 2001 From: itaiguez Date: Thu, 11 Aug 2022 15:55:05 +0300 Subject: [PATCH 35/38] fix support batching, data parallel still a problem --- .../ukbb_prostate/explain_with_gradcam.py | 60 +++++++++++++------ .../classification/ukbb_prostate/runner.py | 5 +- fuseimg/datasets/ukbb_neck_to_knee.py | 4 +- 3 files changed, 49 insertions(+), 20 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py index 01cf2b216..46213e8b1 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py @@ -9,30 +9,53 @@ from multiprocessing import Pool from tqdm import tqdm from pqdm.processes import pqdm -from functools import partial +import pandas as pd +import torch.nn as nn -def save_attention_centerpoint(pl_module,infer_dataloader , explain: NDict) : +def save_attention_centerpoint(pl_module, pl_trainer , infer_dataloader , explain: NDict) : if not os.path.isdir(explain['centerpoints_dir_name']): os.mkdir(explain['centerpoints_dir_name']) if not os.path.isdir(explain['attention_dir']): os.mkdir(explain['attention_dir']) - model = ModelWrapDictToSeq(pl_module._model, output_key='head_0') - model = medcam.inject(model, output_dir=os.path.join(explain['attention_dir']),label=explain['label'], backend='gcam', save_maps=True, layer='auto', return_attention=True) - for i, batch in tqdm(enumerate(infer_dataloader)): - logit, attention_map = model(batch['data.input.img'], batch['data.gt.classification']) + device="cuda:0" + # os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2 ,3,4,5,6,7" + model = ModelWrapDictToSeq(pl_module._model, output_key='head_0').to(device) + model = medcam.inject(model, output_dir=os.path.join(explain['attention_dir']),label=explain['label'], backend='gcam', save_maps=True, layer='auto', return_attention=explain['debug']).to(device) + # model = nn.DataParallel(model, device_ids=[0, 1, 2 ,3,4,5,6,7]) + # pl_trainer.test(model=model, datamodule=infer_dataloader, verbose=True) + results = [] + for i, batch in tqdm(enumerate(infer_dataloader), total=len(infer_dataloader)): + batch['data.input.img'] = batch['data.input.img'].to(device) + batch['data.gt.classification'] = batch['data.gt.classification'].to(device) + if explain['debug'] == True: + logit, attention = model(batch['data.input.img'], batch['data.gt.classification']) + else: + logit, _ = model(batch['data.input.img'], batch['data.gt.classification']) + logit_vector = logit.detach().cpu().numpy() params = [] for j in range(0,batch['data.input.img'].shape[0]) : - params.append({"i": i ,"logit":logit, "attention_map":attention_map ,"j": j ,"batch" : batch , "explain":explain }) - pqdm(params , run_gradcam_on_sample, n_jobs = explain["num_workers"]) + sample = batch['data.input.img'][j][0].cpu() + logit = logit_vector[j][explain['label']] + label = str(batch['data.gt.classification'][j]) + if explain['debug'] == True: + attention_map = attention[j][0].cpu() + else: + attention_map = None + identifier = batch['data.input.img_path'][j].replace('*','') + param = {"i": i ,"logit":logit, "attention_map":attention_map ,"j": j ,"sample" : sample , "explain":explain , "identifier" : identifier, "label":label} + res = run_gradcam_on_sample(param) + results.append(res) + params.append(param) + #pqdm(params , run_gradcam_on_sample, n_jobs = explain["num_workers"]) + df = pd.DataFrame(results, columns=["identifier","logit","dist", "big_point"]) + df.to_csv("prostate_output.csv") def run_gradcam_on_sample(params): i = params["i"] logit = params["logit"] - attention_map = params["attention_map"] j = params["j"] batch = params["j"] explain = params["explain"] - sample = batch['data.input.img'][j][0] - attention_map = attention_map[j][0].numpy() + sample = params["sample"] sample = sample.numpy() layer_folder = [name for name in os.listdir(explain['attention_dir']) if os.path.isdir(os.path.join(explain['attention_dir'], name))][0] original_attention_name = os.path.join(explain['attention_dir'],layer_folder, 'attention_map_' + str(i) + '_'+str(j)+'_0.nii.gz') @@ -51,21 +74,24 @@ def run_gradcam_on_sample(params): original_attention_map[max_volume] = 0.0 points = np.array(points) big_point = np.array([int(np.mean(points[:,i])*scale_ratio[i]) for i in range(3)]) - identifier = batch['data.input.img_path'][j] + identifier = params["identifier"] with open(os.path.join(explain['centerpoints_dir_name'],identifier+'.npy'), 'wb') as f: np.save(f, big_point) center = np.array([int(index / 2) for index in original_transposed.shape]) dist = np.linalg.norm(big_point - center) - if logit.detach().numpy()[j][0] < 0.9 and dist > 40 : - print(batch['data.input.img_path'][0],"suspected as wrong") + if logit < 0.9 and dist > 40 : + print(identifier,"suspected as wrong") print("logit",logit,"distate from center",dist,"center=",center) if explain['debug'] == True: - identifier = batch['data.input.img_path'][0].replace('*','') + attention_map = params["attention_map"] + attention_map = attention_map.numpy() + identifier = identifier.replace('*','') attention_map = show_attention_on_image(sample, attention_map) sample = np.transpose(sample, axes=(1, 2, 0)) original = nib.Nifti1Image(sample, affine=np.eye(4)) - nib.save(original, filename=os.path.join(explain['attention_dir'],'original_'+identifier+'_label_='+str(batch['data.gt.classification'][j])+str(big_point)+'.nii.gz')) - nib.save(attention_map, filename=os.path.join(explain['attention_dir'],'attention_'+identifier+'_label_='+str(batch['data.gt.classification'][j])+'.nii.gz')) + nib.save(original, filename=os.path.join(explain['attention_dir'],'original_'+identifier+'_'+str(big_point)+'.nii.gz')) + nib.save(attention_map, filename=os.path.join(explain['attention_dir'],'attention_'+identifier+'_'+'.nii.gz')) + return [identifier,logit,dist, big_point] diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index c1498b819..486793ab7 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -94,6 +94,7 @@ def create_model(train: NDict, paths: NDict) -> torch.nn.Module: max_epochs=train['trainer']['num_epochs'], accelerator=train['trainer']['accelerator'], devices=train['trainer']['devices'], + strategy = train['trainer']['strategy'], num_sanity_val_steps=-1, resume_from_checkpoint=train.get('resume_from_checkpoint'), auto_select_gpus=True) @@ -312,7 +313,7 @@ def run_explain(train: NDict, paths: NDict, explain: NDict): infer_sample_ids = pd.read_csv(paths["sample_ids"])['sample_id'].to_list() test_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=explain['target'], series_config=train['series_config'], - input_source_gt=input_source_gt, cache_dir=paths['cache_dir'], num_workers=explain['num_workers'], + input_source_gt=input_source_gt, cache_dir=None, reset_cache = False, num_workers=explain['num_workers'], sample_ids=infer_sample_ids, train=False) ## Create dataloader infer_dataloader = DataLoader(dataset=test_dataset, batch_size=explain['batch_size'], @@ -323,7 +324,7 @@ def run_explain(train: NDict, paths: NDict, explain: NDict): pl_module = LightningModuleDefault.load_from_checkpoint(checkpoint_file, model_dir=paths["model_dir"], model=model, map_location="cpu", strict=True) - explain_with_gradcam.save_attention_centerpoint(pl_module , infer_dataloader , explain) + explain_with_gradcam.save_attention_centerpoint(pl_module ,pl_trainer, infer_dataloader , explain) def load_model_and_test_data(train: NDict, paths: NDict, infer: NDict): diff --git a/fuseimg/datasets/ukbb_neck_to_knee.py b/fuseimg/datasets/ukbb_neck_to_knee.py index e94381984..96136d0ab 100644 --- a/fuseimg/datasets/ukbb_neck_to_knee.py +++ b/fuseimg/datasets/ukbb_neck_to_knee.py @@ -308,7 +308,9 @@ def dataset( else: my_dataset = DatasetDefault(sample_ids=sample_ids, static_pipeline=static_pipeline, - dynamic_pipeline=dynamic_pipeline, + dynamic_pipeline=dynamic_pipeline, + cacher=None, + allow_uncached_sample_morphing=False, ) my_dataset.create(num_workers = num_workers) From 0d5ff75ac2ce9e53100a309c203b73fff84442e2 Mon Sep 17 00:00:00 2001 From: itaiguez Date: Mon, 15 Aug 2022 13:19:03 +0300 Subject: [PATCH 36/38] fixed problem with non caching and bad sample --- .../ukbb_prostate/explain_with_gradcam.py | 61 +++++++++++-------- .../classification/ukbb_prostate/runner.py | 2 +- fuse/data/datasets/caching/samples_cacher.py | 2 +- fuse/data/datasets/dataset_default.py | 11 +++- fuse/utils/data/collate.py | 2 +- fuseimg/datasets/ukbb_neck_to_knee.py | 3 +- 6 files changed, 50 insertions(+), 31 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py index 46213e8b1..fa77d9a3c 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py @@ -24,29 +24,40 @@ def save_attention_centerpoint(pl_module, pl_trainer , infer_dataloader , expla # model = nn.DataParallel(model, device_ids=[0, 1, 2 ,3,4,5,6,7]) # pl_trainer.test(model=model, datamodule=infer_dataloader, verbose=True) results = [] - for i, batch in tqdm(enumerate(infer_dataloader), total=len(infer_dataloader)): - batch['data.input.img'] = batch['data.input.img'].to(device) - batch['data.gt.classification'] = batch['data.gt.classification'].to(device) - if explain['debug'] == True: - logit, attention = model(batch['data.input.img'], batch['data.gt.classification']) - else: - logit, _ = model(batch['data.input.img'], batch['data.gt.classification']) - logit_vector = logit.detach().cpu().numpy() - params = [] - for j in range(0,batch['data.input.img'].shape[0]) : - sample = batch['data.input.img'][j][0].cpu() - logit = logit_vector[j][explain['label']] - label = str(batch['data.gt.classification'][j]) - if explain['debug'] == True: - attention_map = attention[j][0].cpu() - else: - attention_map = None - identifier = batch['data.input.img_path'][j].replace('*','') - param = {"i": i ,"logit":logit, "attention_map":attention_map ,"j": j ,"sample" : sample , "explain":explain , "identifier" : identifier, "label":label} - res = run_gradcam_on_sample(param) - results.append(res) - params.append(param) - #pqdm(params , run_gradcam_on_sample, n_jobs = explain["num_workers"]) + i = 0 + try: + for batch in tqdm(infer_dataloader, total=len(infer_dataloader)): + try: + batch['data.input.img'] = batch['data.input.img'].to(device) + batch['data.gt.classification'] = batch['data.gt.classification'].to(device) + if explain['debug'] == True: + logit, attention = model(batch['data.input.img'], batch['data.gt.classification']) + else: + logit = model(batch['data.input.img'], batch['data.gt.classification']) + logit_vector = logit.detach().cpu().numpy() + # params = [] + for j in range(0,batch['data.input.img'].shape[0]) : + sample = batch['data.input.img'][j][0].cpu() + logit = logit_vector[j][explain['label']] + label = str(batch['data.gt.classification'][j]) + if explain['debug'] == True: + attention_map = attention[j][0].cpu() + else: + attention_map = None + identifier = batch['data.input.img_path'][j].replace('*','') + param = {"i": i ,"logit":logit, "attention_map":attention_map ,"j": j ,"sample" : sample , "explain":explain , "identifier" : identifier, "label":label} + res = run_gradcam_on_sample(param) + results.append(res) + i += 1 + # params.append(param) + except Exception as e: + print("got problem in batch containing the following examples",batch['data.input.img_path']) + print(e) + continue + #pqdm(params , run_gradcam_on_sample, n_jobs = explain["num_workers"]) + except Exception as e: + print(e) + pass df = pd.DataFrame(results, columns=["identifier","logit","dist", "big_point"]) df.to_csv("prostate_output.csv") def run_gradcam_on_sample(params): @@ -65,13 +76,15 @@ def run_gradcam_on_sample(params): points = [] max_value = original_attention_map.argmax() current_max = max_value - while True: + index = 0 + while index < 5: current_max = original_attention_map.argmax() max_volume = np.unravel_index(current_max, original_attention_map.shape) if current_max < max_value: break points.append(np.asarray(max_volume)) original_attention_map[max_volume] = 0.0 + index += 1 points = np.array(points) big_point = np.array([int(np.mean(points[:,i])*scale_ratio[i]) for i in range(3)]) identifier = params["identifier"] diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 486793ab7..784245953 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -318,7 +318,7 @@ def run_explain(train: NDict, paths: NDict, explain: NDict): ## Create dataloader infer_dataloader = DataLoader(dataset=test_dataset, batch_size=explain['batch_size'], shuffle=False, drop_last=False, - collate_fn=CollateDefault(), + collate_fn=CollateDefault(raise_error_key_missing = False), num_workers=explain["num_workers"]) pl_module = LightningModuleDefault.load_from_checkpoint(checkpoint_file, model_dir=paths["model_dir"], model=model, map_location="cpu", diff --git a/fuse/data/datasets/caching/samples_cacher.py b/fuse/data/datasets/caching/samples_cacher.py index b15f1e35a..8723bfb6a 100644 --- a/fuse/data/datasets/caching/samples_cacher.py +++ b/fuse/data/datasets/caching/samples_cacher.py @@ -342,7 +342,7 @@ def _cache(self, orig_sample_id: Any): return output_info -def _get_available_write_location(cache_dirs: List[str], max_allowed_used_space=0.95): +def _get_available_write_location(cache_dirs: List[str], max_allowed_used_space=None): """ :param cache_dirs: write directories. Directories are checked in order that they are provided. :param max_allowed_used_space: set to a value between 0.0 to 1.0. diff --git a/fuse/data/datasets/dataset_default.py b/fuse/data/datasets/dataset_default.py index a3820e200..e79ff1d84 100644 --- a/fuse/data/datasets/dataset_default.py +++ b/fuse/data/datasets/dataset_default.py @@ -40,6 +40,7 @@ def __init__( dynamic_pipeline: Optional[PipelineDefault] = None, cacher: Optional[SamplesCacher] = None, allow_uncached_sample_morphing: bool = False, + always_return_dict = True, ): """ :param sample_ids: list of sample_ids included in dataset. @@ -58,6 +59,7 @@ def __init__( self._cacher = cacher self._orig_sample_ids = sample_ids self._allow_uncached_sample_morphing = allow_uncached_sample_morphing + self._always_return_dict = always_return_dict # verify unique names for dynamic pipelines if self._dynamic_pipeline is not None and self._static_pipeline is not None: @@ -178,9 +180,12 @@ def getitem( sample = create_initial_sample(sample_id) sample = self._static_pipeline(sample) if not isinstance(sample, dict): - raise Exception( - f'By default when caching is disabled sample morphing is not allowed, and the output of the static pipeline is expected to be a dict. Instead got {type(sample)}. You can use "allow_uncached_sample_morphing=True" to allow this, but be aware it is slow and should be used only for debugging' - ) + if self._always_return_dict : + raise Exception( + f'By default when caching is disabled sample morphing is not allowed, and the output of the static pipeline is expected to be a dict. Instead got {type(sample)}. You can use "allow_uncached_sample_morphing=True" to allow this, but be aware it is slow and should be used only for debugging' + ) + else: + return None else: orig_sid = self._final_sid_to_orig_sid[sample_id] sample = create_initial_sample(orig_sid) diff --git a/fuse/utils/data/collate.py b/fuse/utils/data/collate.py index 810ae5e48..de736a065 100644 --- a/fuse/utils/data/collate.py +++ b/fuse/utils/data/collate.py @@ -100,7 +100,7 @@ def _collect_values_to_list(self, samples: List[str], key: str) -> Tuple[List, b if self._raise_error_key_missing: raise Exception(f"Error: key {key} does not exist in sample {index}: {sample}") else: - value = None + continue else: value = sample[key] collected_values.append(value) diff --git a/fuseimg/datasets/ukbb_neck_to_knee.py b/fuseimg/datasets/ukbb_neck_to_knee.py index 96136d0ab..02517c304 100644 --- a/fuseimg/datasets/ukbb_neck_to_knee.py +++ b/fuseimg/datasets/ukbb_neck_to_knee.py @@ -310,7 +310,8 @@ def dataset( static_pipeline=static_pipeline, dynamic_pipeline=dynamic_pipeline, cacher=None, - allow_uncached_sample_morphing=False, + allow_uncached_sample_morphing=False, + always_return_dict = False, ) my_dataset.create(num_workers = num_workers) From 05fa933aec49f3dcc57629d9cbec9ad83633125b Mon Sep 17 00:00:00 2001 From: itaiguez Date: Thu, 25 Aug 2022 13:16:19 +0300 Subject: [PATCH 37/38] ipdate explain --- .../classification/ukbb_prostate/explain_with_gradcam.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py index fa77d9a3c..bdd9de3ed 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py @@ -11,6 +11,7 @@ from pqdm.processes import pqdm import pandas as pd import torch.nn as nn +import torch def save_attention_centerpoint(pl_module, pl_trainer , infer_dataloader , explain: NDict) : if not os.path.isdir(explain['centerpoints_dir_name']): @@ -28,6 +29,9 @@ def save_attention_centerpoint(pl_module, pl_trainer , infer_dataloader , expla try: for batch in tqdm(infer_dataloader, total=len(infer_dataloader)): try: + # if isinstance(batch['data.input.img'], list) : + # batch['data.input.img'] = torch.stack(batch['data.input.img']) + # batch['data.gt.classification'] = torch.stack(batch['data.gt.classification'] ) batch['data.input.img'] = batch['data.input.img'].to(device) batch['data.gt.classification'] = batch['data.gt.classification'].to(device) if explain['debug'] == True: @@ -57,6 +61,8 @@ def save_attention_centerpoint(pl_module, pl_trainer , infer_dataloader , expla #pqdm(params , run_gradcam_on_sample, n_jobs = explain["num_workers"]) except Exception as e: print(e) + df = pd.DataFrame(results, columns=["identifier","logit","dist", "big_point"]) + df.to_csv("prostate_output.csv") pass df = pd.DataFrame(results, columns=["identifier","logit","dist", "big_point"]) df.to_csv("prostate_output.csv") From 7098a043aaf33ebfc6ade551fb33272d2ab26d2e Mon Sep 17 00:00:00 2001 From: Iti Guez Date: Wed, 7 Sep 2022 06:33:27 -0500 Subject: [PATCH 38/38] updated to new UKBB op and run explain --- .../ukbb_prostate/explain_with_gradcam.py | 62 ++++--- .../classification/ukbb_prostate/runner.py | 20 +-- fuseimg/data/ops/shape_ops.py | 3 +- fuseimg/datasets/ukbb_neck_to_knee.py | 151 +++++------------- 4 files changed, 93 insertions(+), 143 deletions(-) diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py index bdd9de3ed..1cd4c0959 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/explain_with_gradcam.py @@ -1,29 +1,48 @@ -from fuse.dl.models.model_wrapper import ModelWrapDictToSeq from fuse.utils.ndict import NDict import os import numpy as np -from fuse.dl.models.model_wrapper import ModelWrapDictToSeq from medcam import medcam -from cv2 import cv2 +import cv2 import nibabel as nib from multiprocessing import Pool from tqdm import tqdm -from pqdm.processes import pqdm import pandas as pd import torch.nn as nn import torch +import pqdm -def save_attention_centerpoint(pl_module, pl_trainer , infer_dataloader , explain: NDict) : +class ModelWrapDictToSeq(torch.nn.Module): + """ + Fuse model wrapper for wrapping fuse pytorch model and make him be in basic format- input is tensor and output is tensor + The user need to provide the input and output keys of the fuse model + """ + + def __init__(self, fuse_model: torch.nn.Module, output_key: str, input_key: str): + super().__init__() + self.model = fuse_model + self.output_key = output_key + self.input_key = input_key + + def forward(self, input: torch.tensor): + batch_dict = NDict() + # find input key + batch_dict[self.input_key] = input + # feed fuse model with dict as he excpect + ans_ndict = self.model(batch_dict) + # extract model output from dict + output = NDict(ans_ndict)[self.output_key] + return output + + +def save_attention_centerpoint(pl_module , infer_dataloader , explain: NDict) : if not os.path.isdir(explain['centerpoints_dir_name']): os.mkdir(explain['centerpoints_dir_name']) if not os.path.isdir(explain['attention_dir']): os.mkdir(explain['attention_dir']) device="cuda:0" # os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2 ,3,4,5,6,7" - model = ModelWrapDictToSeq(pl_module._model, output_key='head_0').to(device) + model = ModelWrapDictToSeq(pl_module._model,input_key ='data.input.img', output_key='output.head_0').to(device) model = medcam.inject(model, output_dir=os.path.join(explain['attention_dir']),label=explain['label'], backend='gcam', save_maps=True, layer='auto', return_attention=explain['debug']).to(device) - # model = nn.DataParallel(model, device_ids=[0, 1, 2 ,3,4,5,6,7]) - # pl_trainer.test(model=model, datamodule=infer_dataloader, verbose=True) results = [] i = 0 try: @@ -35,14 +54,14 @@ def save_attention_centerpoint(pl_module, pl_trainer , infer_dataloader , expla batch['data.input.img'] = batch['data.input.img'].to(device) batch['data.gt.classification'] = batch['data.gt.classification'].to(device) if explain['debug'] == True: - logit, attention = model(batch['data.input.img'], batch['data.gt.classification']) + logit, attention = model(batch['data.input.img']) else: - logit = model(batch['data.input.img'], batch['data.gt.classification']) - logit_vector = logit.detach().cpu().numpy() - # params = [] + logit = model(batch['data.input.img']) + logit_vector = logit + params = [] for j in range(0,batch['data.input.img'].shape[0]) : sample = batch['data.input.img'][j][0].cpu() - logit = logit_vector[j][explain['label']] + logit = logit_vector[j][explain['target_int']] label = str(batch['data.gt.classification'][j]) if explain['debug'] == True: attention_map = attention[j][0].cpu() @@ -52,13 +71,14 @@ def save_attention_centerpoint(pl_module, pl_trainer , infer_dataloader , expla param = {"i": i ,"logit":logit, "attention_map":attention_map ,"j": j ,"sample" : sample , "explain":explain , "identifier" : identifier, "label":label} res = run_gradcam_on_sample(param) results.append(res) + #results.append(pqdm(params , run_gradcam_on_sample, n_jobs = explain["num_workers"])) i += 1 - # params.append(param) + params.append(param) except Exception as e: - print("got problem in batch containing the following examples",batch['data.input.img_path']) - print(e) - continue - #pqdm(params , run_gradcam_on_sample, n_jobs = explain["num_workers"]) + print("got problem in batch containing the following examples",batch['data.input.img_path']) + print(e) + continue + except Exception as e: print(e) df = pd.DataFrame(results, columns=["identifier","logit","dist", "big_point"]) @@ -98,9 +118,9 @@ def run_gradcam_on_sample(params): np.save(f, big_point) center = np.array([int(index / 2) for index in original_transposed.shape]) dist = np.linalg.norm(big_point - center) - if logit < 0.9 and dist > 40 : - print(identifier,"suspected as wrong") - print("logit",logit,"distate from center",dist,"center=",center) + # if logit < 0.9 and dist > 40 : + # print(identifier,"suspected as wrong") + # print("logit",logit,"distate from center",dist,"center=",center) if explain['debug'] == True: attention_map = params["attention_map"] attention_map = attention_map.numpy() diff --git a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py index 784245953..c1d905c37 100644 --- a/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py +++ b/examples/fuse_examples/imaging/classification/ukbb_prostate/runner.py @@ -19,7 +19,9 @@ import os from typing import Union, Dict, Optional, List -from fuse_examples.imaging.classification.ukbb_prostate import cohort_and_label_def, files_download_from_cos, explain_with_gradcam +import cohort_and_label_def +import files_download_from_cos +import explain_with_gradcam import pathlib import sys import copy @@ -149,7 +151,7 @@ def run_train(paths: NDict, train: NDict) -> torch.nn.Module: # else: # sample_ids = None - dataset_all = UKBB.dataset(data_dir=paths["data_dir"], target=train['target'], series_config=train['series_config'], + dataset_all = UKBB.dataset(data_dir=paths["data_dir"], target=train['target'], series_config=train['series_config'], aug_params= train['aug_params'], input_source_gt=clinical_data_df, cache_dir=paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=sample_ids, train=True @@ -170,11 +172,11 @@ def run_train(paths: NDict, train: NDict) -> torch.nn.Module: for fold in train["validation_folds"]: validation_sample_ids += folds[fold] - train_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=train['target'], series_config=train['series_config'], + train_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=train['target'], series_config=train['series_config'], aug_params= train['aug_params'], input_source_gt=clinical_data_df, cache_dir=paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=train_sample_ids, train=True) - validation_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=train['target'], series_config=train['series_config'], + validation_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=train['target'], series_config=train['series_config'], aug_params= train['aug_params'], input_source_gt=clinical_data_df, cache_dir=paths["cache_dir"], reset_cache=False, num_workers=train["num_workers"], sample_ids=validation_sample_ids) @@ -301,7 +303,7 @@ def run_explain(train: NDict, paths: NDict, explain: NDict): lgr = logging.getLogger('Fuse') lgr.info('Fuse Explain', {'attrs': ['bold', 'underline']}) - checkpoint_file = os.path.join(paths["model_dir"], explain["checkpoint"]) + checkpoint_file = explain["checkpoint"] lgr.info(f'checkpoint_file={checkpoint_file}', {'color': 'magenta'}) # load model @@ -311,9 +313,9 @@ def run_explain(train: NDict, paths: NDict, explain: NDict): input_source_gt = read_clinical_data_file(filename=paths["clinical_data_file"], targets=explain['target'], columns_to_add=explain.get('columns_to_add')) - infer_sample_ids = pd.read_csv(paths["sample_ids"])['sample_id'].to_list() + infer_sample_ids = pd.read_csv(explain["sample_ids"])['sample_id'].to_list() test_dataset = UKBB.dataset(data_dir=paths["data_dir"], target=explain['target'], series_config=train['series_config'], - input_source_gt=input_source_gt, cache_dir=None, reset_cache = False, num_workers=explain['num_workers'], + input_source_gt=input_source_gt, cache_dir=explain['cache_dir'], reset_cache = False, num_workers=explain['num_workers'], sample_ids=infer_sample_ids, train=False) ## Create dataloader infer_dataloader = DataLoader(dataset=test_dataset, batch_size=explain['batch_size'], @@ -321,10 +323,10 @@ def run_explain(train: NDict, paths: NDict, explain: NDict): collate_fn=CollateDefault(raise_error_key_missing = False), num_workers=explain["num_workers"]) - pl_module = LightningModuleDefault.load_from_checkpoint(checkpoint_file, model_dir=paths["model_dir"], model=model, map_location="cpu", + pl_module = LightningModuleDefault.load_from_checkpoint(checkpoint_file, model_dir=explain["model_dir"], model=model, map_location="cpu", strict=True) - explain_with_gradcam.save_attention_centerpoint(pl_module ,pl_trainer, infer_dataloader , explain) + explain_with_gradcam.save_attention_centerpoint(pl_module , infer_dataloader , explain) def load_model_and_test_data(train: NDict, paths: NDict, infer: NDict): diff --git a/fuseimg/data/ops/shape_ops.py b/fuseimg/data/ops/shape_ops.py index 60fb75c54..d11809328 100755 --- a/fuseimg/data/ops/shape_ops.py +++ b/fuseimg/data/ops/shape_ops.py @@ -74,11 +74,10 @@ class OpSelectSlice(OpBase): def __init__(self, **kwargs): super().__init__(**kwargs) - def __call__(self, sample_dict: NDict, op_id: Optional[str], key: str, slice_idx: int): + def __call__(self, sample_dict: NDict, key: str, slice_idx: int): """ :param slice_idx: the index of the selected slice from the 1st dimmention of an input tensor """ - img = sample_dict[key] if len(img.shape) < 3: return sample_dict diff --git a/fuseimg/datasets/ukbb_neck_to_knee.py b/fuseimg/datasets/ukbb_neck_to_knee.py index 02517c304..eebe9ed85 100644 --- a/fuseimg/datasets/ukbb_neck_to_knee.py +++ b/fuseimg/datasets/ukbb_neck_to_knee.py @@ -25,6 +25,9 @@ import zipfile # from fuseimg.data.ops import ops_mri from fuse.utils.rand.param_sampler import Uniform, RandInt, RandBool +from fuseimg.datasets.ukbb_read_mri_op import OpLoadUKBBZip +from fuseimg.data.ops.aug.geometry import OpAugAffine2D, OpAugSqueeze3Dto2D, OpAugUnsqueeze3DFrom2D +from fuseimg.data.ops.shape_ops import OpSelectSlice import SimpleITK as sitk import tempfile import shutil @@ -54,74 +57,6 @@ def __call__(self, sample_dict: NDict) -> NDict: return sample_dict -class OpLoadUKBBZip(OpBase): - ''' - loads a zip and select a sequence and a station from it - ''' - def __init__(self, dir_path: str, **kwargs): - super().__init__(**kwargs) - self._dir_path = dir_path - - def __call__(self, sample_dict: NDict, series_config : NDict, key_in:str, key_out: str, unique_id_out: str) -> NDict: - ''' - - ''' - scans = [] - zip_filenames = glob(os.path.join(self._dir_path,sample_dict[key_in])) - if len(zip_filenames) >1: - raise NotImplementedError(f"{sample_dict[key_in]} has more then one match. Currently not supported") - zip_filename = zip_filenames[0] - try: - zip_file = zipfile.ZipFile(zip_filename) - except: - print("error in opening",zip_filename, os.path.exists(zip_filename)) - return None - filenames_list = [f.filename for f in zip_file.infolist() if '.dcm' in f.filename] - - for dicom_file in filenames_list: - with zip_file.open(dicom_file) as f: - dcm = pydicom.read_file(io.BytesIO(f.read())) - scans.append({'file': zip_filename.split("/")[-1], 'dcm_unique': dcm[0x0020000e].value, 'time':dcm[0x00080031].value, 'series': dcm[0x0008103e].value}) - - dicom_tags = pd.DataFrame(scans) - dicom_tags['n_slices'] = dicom_tags.groupby(dicom_tags.columns.to_list())['file'].transform('size') - dicom_tags = dicom_tags.drop_duplicates() - dicom_tags = dicom_tags.sort_values(by=['time']) - if series_config['series'] in ['Dixon_noBH_in', 'Dixon_noBH_opp', 'Dixon_noBH_F', 'Dixon_noBH_W', 'Dixon_BH_17s_in', 'Dixon_BH_17s_opp', 'Dixon_BH_17s_F', 'Dixon_BH_17s_W'] : - if len(dicom_tags) != 24: - print(zip_filename, "has missing/extra sequences ",len(dicom_tags),"instead of 24") - return None - station_list = [] - for i in range(6) : - for j in range(4) : - station_list.append(i+1) - dicom_tags['station'] = station_list - try : - dcm_unique = dicom_tags[(dicom_tags['station'] == series_config['station']) & (dicom_tags['series'] == series_config['series'])]['dcm_unique'].iloc[0] - except: - print("requested file",zip_filename,"series description",series_config, "not found!!!") - return None - else: - try: - dcm_unique = dicom_tags[dicom_tags['series'] == series_config['series']]['dcm_unique'].iloc[0] - except: - print("requested file",zip_filename,"series description",series_config, "not found!!!") - return None - dirpath = tempfile.mkdtemp() - # ... do stuff with dirpath - for dicom_file in filenames_list: - with zip_file.open(dicom_file) as f: - if pydicom.read_file(io.BytesIO(f.read()))[0x0020000e].value == dcm_unique : - zip_file.extract(dicom_file, path=dirpath) - reader = sitk.ImageSeriesReader() - dicom_names = reader.GetGDCMSeriesFileNames(dirpath) - reader.SetFileNames(dicom_names) - image = reader.Execute() - numpy_img = sitk.GetArrayFromImage(image) - sample_dict[key_out] = numpy_img - sample_dict[unique_id_out] = dcm_unique - shutil.rmtree(dirpath) - return sample_dict class OpLoadCenterPoint(OpBase): ''' @@ -201,59 +136,51 @@ def static_pipeline(data_dir: str, series_config: NDict, centerpoint_dir: str = static_pipeline = PipelineDefault("cmmd_static", [ # decoding sample ID (OpUKBBSampleIDDecode(), dict()), # will save image and seg path to "data.input.img_path", "data.gt.seg_path" - (OpLoadUKBBZip(data_dir), dict(key_in="data.input.img_path", key_out="data.input.img", unique_id_out="data.ID", series_config=series_config)), - # (OpLoadCenterPoint(centerpoint_dir), dict(key_in="data.input.img_path", key_out="data.input.centerpoint")), - # (OpLoadVolumeAroundCenterPoint(), dict(key_in="data.input.img", key_out="data.input.img" , centerpoint = "data.input.centerpoint")), - (OpLambda(partial(skimage.transform.resize, - output_shape=(44, 174, 224), - mode='reflect', - anti_aliasing=True, - preserve_range=True)), dict(key="data.input.img")), - (OpNormalizeAgainstSelf(), dict(key="data.input.img")), - (OpToNumpy(), dict(key='data.input.img', dtype=np.float32)), - # (OpLambda(partial(dump, filename="first.png", slice = 25)), dict(key="data.input.img")), + (OpLoadUKBBZip(data_dir), dict(key_in="data.input.img_path", key_out="data.input.img", series_config=series_config)), ]) return static_pipeline @staticmethod - def dynamic_pipeline(data_source : pd.DataFrame, target: str, train: bool = False): + def dynamic_pipeline(data_source : pd.DataFrame, target: str, aug_params: NDict = None,series_config:NDict = None,train: bool = False): """ Get suggested dynamic pipeline. including pre-processing that might be modified and augmentation operations. :param train : True iff we request dataset for train purpouse """ - dynamic_pipeline = PipelineDefault("cmmd_dynamic", [ - (OpReadDataframe(data_source, - key_column="file_pattern", columns_to_extract=['file_pattern','patient_id', target], - rename_columns={'patient_id' :"data.patientID", target: "data.gt.classification" }), dict()), - (OpToTensor(), dict(key="data.input.img",dtype=torch.float32)), + sers = series_config['series'] + rel_sers = series_config['relevant_series'] + indices = [i for i, x in enumerate(sers) if x in rel_sers] + + ops = [(OpReadDataframe( + data_source, + key_column="file_pattern", + columns_to_extract=["file_pattern", "patient_id",target] , + rename_columns={"patient_id": "data.patientID", target: "data.gt.classification"}), dict()), + (OpNormalizeAgainstSelf(), dict(key="data.input.img")), + (OpToNumpy(), dict(key="data.input.img", dtype=np.float32)), + (OpSelectSlice(), dict(key='data.input.img', slice_idx=indices)), + (OpToTensor(), dict(key="data.input.img", dtype=torch.float32)), + (OpToTensor(), dict(key="data.gt.classification", dtype=torch.long)), - (OpLambda(partial(torch.unsqueeze, dim=0)), dict(key="data.input.img")) ]) + ] # augmentation if train: - dynamic_pipeline.extend([ - (OpLambda(partial(torch.squeeze, dim=0)), dict(key="data.input.img")), - + ops += [ # affine augmentation - will apply the same affine transformation on each slice - (OpRandApply(OpSample(OpAugAffine2D()), 0.5), dict( - key="data.input.img", - rotate=Uniform(-180.0,180.0), - scale=Uniform(0.8, 1.2), - flip=(RandBool(0.5), RandBool(0.5)), - translate=(RandInt(-15, 15), RandInt(-15, 15)) - )), - - # color augmentation - check if it is useful in CT images - # (OpSample(OpAugColor()), dict( - # key="data.input.img", - # gamma=Uniform(0.8,1.2), - # contrast=Uniform(0.9,1.1), - # add=Uniform(-0.01, 0.01) - # )), - - # add channel dimension -> [C=1, D, H, W] - (OpLambda(partial(torch.unsqueeze, dim=0)), dict(key="data.input.img")), - - ]) + (OpAugSqueeze3Dto2D(), dict(key='data.input.img', axis_squeeze=1))] + ops += [(OpRandApply(OpSample(OpAugAffine2D()), aug_params['apply_aug_prob']), + dict( + key="data.input.img", + rotate=Uniform(*aug_params['rotate']), + scale=Uniform(*aug_params['scale']), + flip=(aug_params['flip'], aug_params['flip']), + translate=(RandInt(*aug_params['translate']), RandInt(*aug_params['translate'])), + ), + ), + (OpAugUnsqueeze3DFrom2D(), dict(key='data.input.img', axis_squeeze=1, channels=len(rel_sers))), + # (OpMasker(), dict()) + ] + + dynamic_pipeline = PipelineDefault("ukbb_dynamic", ops) return dynamic_pipeline @@ -264,6 +191,7 @@ def dataset( data_dir: str, target: str, series_config: NDict, + aug_params: NDict = None, input_source_gt: pd.DataFrame = None, cache_dir : str = None, reset_cache : bool = True, @@ -290,8 +218,9 @@ def dataset( sample_ids = UKBB.sample_ids(data_dir) - static_pipeline = UKBB.static_pipeline(data_dir, series_config) - dynamic_pipeline = UKBB.dynamic_pipeline(input_source_gt, target,train=train) + static_pipeline = UKBB.static_pipeline(data_dir, series_config=series_config) + dynamic_pipeline = UKBB.dynamic_pipeline(input_source_gt, target, aug_params=aug_params, train=train, + series_config=series_config) if cache_dir != None : cacher = SamplesCacher(f'cmmd_cache_ver', static_pipeline,