diff --git a/Pilot1/Attn/attn.py b/Pilot1/Attn/attn.py
index 39eaf1a5..500efe8c 100644
--- a/Pilot1/Attn/attn.py
+++ b/Pilot1/Attn/attn.py
@@ -133,7 +133,7 @@ def load_data(params, seed):
     file_train = params["train_data"]
     cdd = os.environ["CANDLE_DATA_DIR"]
     train_file = candle.get_file(
-        file_train, url + file_train, datadir = cdd, cache_subdir="Pilot1"
+        file_train, url + file_train, datadir=cdd, cache_subdir="Pilot1"
     )
 
     df_x_train_0 = pd.read_hdf(train_file, "x_train_0").astype(np.float32)
diff --git a/Pilot1/ST1/README.md b/Pilot1/ST1/README.md
index 032b0258..b09b8330 100644
--- a/Pilot1/ST1/README.md
+++ b/Pilot1/ST1/README.md
@@ -191,4 +191,3 @@ CHEMBL -- 1.5M training examples (shuffled and resampled so not same 1.5M as cla
 Predicting molecular Weight validation Is also 100K samples non-overlapping.
 
 Regression problem achieves R^2 about .95 after ~20 epochs.
-
diff --git a/Pilot1/ST1/clr_callback.py b/Pilot1/ST1/clr_callback.py
index dbac09d4..fb22a4da 100644
--- a/Pilot1/ST1/clr_callback.py
+++ b/Pilot1/ST1/clr_callback.py
@@ -1,12 +1,13 @@
-from tensorflow.keras.callbacks import *
-from tensorflow.keras import backend as K
 import numpy as np
+from tensorflow.keras import backend as K
+from tensorflow.keras.callbacks import *
+
 
 class CyclicLR(Callback):
     """This callback implements a cyclical learning rate policy (CLR).
     The method cycles the learning rate between two boundaries with
     some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
-    The amplitude of the cycle can be scaled on a per-iteration or
+    The amplitude of the cycle can be scaled on a per-iteration or
     per-cycle basis.
     This class has three built-in policies, as put forth in the paper.
     "triangular":
@@ -14,17 +15,17 @@ class CyclicLR(Callback):
     "triangular2":
         A basic triangular cycle that scales initial amplitude by half each cycle.
     "exp_range":
-        A cycle that scales initial amplitude by gamma**(cycle iterations) at each
+        A cycle that scales initial amplitude by gamma**(cycle iterations) at each
         cycle iteration.
     For more detail, please see paper.
-
+
     # Example
     ```python
         clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                        step_size=2000., mode='triangular')
         model.fit(X_train, Y_train, callbacks=[clr])
     ```
-
+
     Class also supports custom scaling functions:
     ```python
         clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
@@ -32,14 +33,14 @@ class CyclicLR(Callback):
                        step_size=2000., scale_fn=clr_fn,
                        scale_mode='cycle')
         model.fit(X_train, Y_train, callbacks=[clr])
-    ```
+    ```
     # Arguments
         base_lr: initial learning rate which is the
             lower boundary in the cycle.
         max_lr: upper boundary in the cycle. Functionally,
             it defines the cycle amplitude (max_lr - base_lr).
             The lr at any cycle is the sum of base_lr
-            and some scaling of the amplitude; therefore
+            and some scaling of the amplitude; therefore
             max_lr may not actually be reached depending on
             scaling function.
         step_size: number of training iterations per
@@ -52,17 +53,25 @@ class CyclicLR(Callback):
         gamma: constant in 'exp_range' scaling function:
             gamma**(cycle iterations)
         scale_fn: Custom scaling policy defined by a single
-            argument lambda function, where
+            argument lambda function, where
             0 <= scale_fn(x) <= 1 for all x >= 0.
-            mode paramater is ignored
+            mode paramater is ignored
         scale_mode: {'cycle', 'iterations'}.
-            Defines whether scale_fn is evaluated on
+            Defines whether scale_fn is evaluated on
             cycle number or cycle iterations (training
             iterations since start of cycle). Default is 'cycle'.
""" - def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular', - gamma=1., scale_fn=None, scale_mode='cycle'): + def __init__( + self, + base_lr=0.001, + max_lr=0.006, + step_size=2000.0, + mode="triangular", + gamma=1.0, + scale_fn=None, + scale_mode="cycle", + ): super(CyclicLR, self).__init__() self.base_lr = base_lr @@ -71,26 +80,25 @@ def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangula self.mode = mode self.gamma = gamma if scale_fn == None: - if self.mode == 'triangular': - self.scale_fn = lambda x: 1. - self.scale_mode = 'cycle' - elif self.mode == 'triangular2': - self.scale_fn = lambda x: 1/(2.**(x-1)) - self.scale_mode = 'cycle' - elif self.mode == 'exp_range': - self.scale_fn = lambda x: gamma**(x) - self.scale_mode = 'iterations' + if self.mode == "triangular": + self.scale_fn = lambda x: 1.0 + self.scale_mode = "cycle" + elif self.mode == "triangular2": + self.scale_fn = lambda x: 1 / (2.0 ** (x - 1)) + self.scale_mode = "cycle" + elif self.mode == "exp_range": + self.scale_fn = lambda x: gamma ** (x) + self.scale_mode = "iterations" else: self.scale_fn = scale_fn self.scale_mode = scale_mode - self.clr_iterations = 0. - self.trn_iterations = 0. + self.clr_iterations = 0.0 + self.trn_iterations = 0.0 self.history = {} self._reset() - def _reset(self, new_base_lr=None, new_max_lr=None, - new_step_size=None): + def _reset(self, new_base_lr=None, new_max_lr=None, new_step_size=None): """Resets cycle iterations. Optional boundary/step size adjustment. """ @@ -100,34 +108,38 @@ def _reset(self, new_base_lr=None, new_max_lr=None, self.max_lr = new_max_lr if new_step_size != None: self.step_size = new_step_size - self.clr_iterations = 0. - + self.clr_iterations = 0.0 + def clr(self): - cycle = np.floor(1+self.clr_iterations/(2*self.step_size)) - x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1) - if self.scale_mode == 'cycle': - return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle) + cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size)) + x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1) + if self.scale_mode == "cycle": + return self.base_lr + (self.max_lr - self.base_lr) * np.maximum( + 0, (1 - x) + ) * self.scale_fn(cycle) else: - return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations) - + return self.base_lr + (self.max_lr - self.base_lr) * np.maximum( + 0, (1 - x) + ) * self.scale_fn(self.clr_iterations) + def on_train_begin(self, logs={}): logs = logs or {} if self.clr_iterations == 0: K.set_value(self.model.optimizer.lr, self.base_lr) else: - K.set_value(self.model.optimizer.lr, self.clr()) - + K.set_value(self.model.optimizer.lr, self.clr()) + def on_batch_end(self, epoch, logs=None): - + logs = logs or {} self.trn_iterations += 1 self.clr_iterations += 1 - self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr)) - self.history.setdefault('iterations', []).append(self.trn_iterations) + self.history.setdefault("lr", []).append(K.get_value(self.model.optimizer.lr)) + self.history.setdefault("iterations", []).append(self.trn_iterations) for k, v in logs.items(): self.history.setdefault(k, []).append(v) - + K.set_value(self.model.optimizer.lr, self.clr()) diff --git a/Pilot1/ST1/config_st_spe_training.json b/Pilot1/ST1/config_st_spe_training.json index 43f18b90..efca8533 100644 --- a/Pilot1/ST1/config_st_spe_training.json +++ b/Pilot1/ST1/config_st_spe_training.json @@ -1,52 +1,51 @@ 
{ - "general": { - "use_hvd": true, - "batch_size": 64, - "epochs": 400, - "lr": 0.00000991301767144166, - "loss_fn": "mean_squared_error" - }, + "general": { + "use_hvd": true, + "batch_size": 64, + "epochs": 400, + "lr": 0.00000991301767144166, + "loss_fn": "mean_squared_error" + }, - "data_loading": { - "data_path": "/lus/grand/projects/datascience/avasan/Data_Docking/2M-flatten", - "rec": "3CLPro_7BQY_A_1_F", - "pattern": "Orderable_zinc_db_enaHLL.sorted.4col.descriptors.parquet.xform-smiles.csv.reg" - }, + "data_loading": { + "data_path": "/lus/grand/projects/datascience/avasan/Data_Docking/2M-flatten", + "rec": "3CLPro_7BQY_A_1_F", + "pattern": "Orderable_zinc_db_enaHLL.sorted.4col.descriptors.parquet.xform-smiles.csv.reg" + }, - "tokenization": { - "vocab_size": 3132, - "maxlen": 45, - "tokenizer": { - "category": "smilespair", - "spe_file": "VocabFiles/SPE_ChEMBL.txt", - "vocab_file": "VocabFiles/vocab_spe.txt" - } - }, + "tokenization": { + "vocab_size": 3132, + "maxlen": 45, + "tokenizer": { + "category": "smilespair", + "spe_file": "VocabFiles/SPE_ChEMBL.txt", + "vocab_file": "VocabFiles/vocab_spe.txt" + } + }, - "architecture": { - "embedding": { - "embed_dim": 128 - }, - "transformer_block": { - "num_blocks": 5, - "activation": "selu", - "ff_dim": 128, - "num_heads": 21, - "dr1": 0.12717945391278226, - "dr2": 0.12717945391278226, - "drop_mha": true - }, - "regressor_head": { - "activation": "selu", - "dr": 0.04990303516069576 - } + "architecture": { + "embedding": { + "embed_dim": 128 + }, + "transformer_block": { + "num_blocks": 5, + "activation": "selu", + "ff_dim": 128, + "num_heads": 21, + "dr1": 0.12717945391278226, + "dr2": 0.12717945391278226, + "drop_mha": true }, - - "callbacks": { - "checkpt_file": "smile_regress.autosave.model.h5", - "log_csv": "smile_regress.training.log", - "patience_red_lr": 20, - "patience_early_stop": 100 + "regressor_head": { + "activation": "selu", + "dr": 0.04990303516069576 } + }, + "callbacks": { + "checkpt_file": "smile_regress.autosave.model.h5", + "log_csv": "smile_regress.training.log", + "patience_red_lr": 20, + "patience_early_stop": 100 + } } diff --git a/Pilot1/ST1/polaris_sub_hvd.sh b/Pilot1/ST1/polaris_sub_hvd.sh index 8fe26663..7c2c2857 100755 --- a/Pilot1/ST1/polaris_sub_hvd.sh +++ b/Pilot1/ST1/polaris_sub_hvd.sh @@ -36,4 +36,3 @@ else mpiexec --np $NP -ppn $PPN --cpu-bind verbose,list:0,1,2,3,4,5,6,7 -env NCCL_COLLNET_ENABLE=1 -env NCCL_NET_GDR_LEVEL=PHB python smiles_regress_transformer_run_hvd.py --in_train ${DATA_PATH}/${TFIL} --in_vali ${DATA_PATH}/${VFIL} --ep $EP --num_heads $NUMHEAD --DR_TB $DR_TB --DR_ff $DR_ff --activation $ACT --drop_post_MHA $DROP --lr $LR --loss_fn $LOSS --hvd_switch $HVDSWITCH > $OUT fi - diff --git a/Pilot1/ST1/polaris_sub_smiles_regress_transformer_spe.sh b/Pilot1/ST1/polaris_sub_smiles_regress_transformer_spe.sh index 4dc1990b..3964f368 100755 --- a/Pilot1/ST1/polaris_sub_smiles_regress_transformer_spe.sh +++ b/Pilot1/ST1/polaris_sub_smiles_regress_transformer_spe.sh @@ -13,7 +13,7 @@ module load conda/2022-09-08 conda activate -cd /grand/datascience/avasan/ST_Benchmarks/Test_Tokenizers/SMILESPair_Encoder_continue +cd /grand/datascience/avasan/ST_Benchmarks/Test_Tokenizers/SMILESPair_Encoder_continue NP=16 PPN=4 diff --git a/Pilot1/ST1/smiles_pair_encoders_functions.py b/Pilot1/ST1/smiles_pair_encoders_functions.py index 15a8c6a2..1a64b212 100644 --- a/Pilot1/ST1/smiles_pair_encoders_functions.py +++ b/Pilot1/ST1/smiles_pair_encoders_functions.py @@ -2,15 +2,17 @@ # reference: 
https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_bert.py # reference https://github.com/rxn4chemistry/rxnmapper +import codecs import collections import logging import os import re -import codecs import unicodedata from typing import List, Optional -from transformers import PreTrainedTokenizer + from SmilesPE.tokenizer import SPE_Tokenizer +from transformers import PreTrainedTokenizer + def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" @@ -22,21 +24,21 @@ def load_vocab(vocab_file): vocab[token] = index return vocab + class Atomwise_Tokenizer(object): """Run atom-level SMILES tokenization""" def __init__(self): - """ Constructs a atom-level Tokenizer. - """ + """Constructs a atom-level Tokenizer.""" self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])" self.regex = re.compile(self.regex_pattern) def tokenize(self, text): - """ Basic Tokenization of a SMILES. - """ + """Basic Tokenization of a SMILES.""" tokens = [token for token in self.regex.findall(text)] return tokens - + + class SMILES_SPE_Tokenizer(PreTrainedTokenizer): r""" Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE). @@ -95,7 +97,9 @@ def __init__( ) self.vocab = load_vocab(vocab_file) self.spe_vocab = codecs.open(spe_file) - self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()] + ) self.spe_tokenizer = SPE_Tokenizer(self.spe_vocab) @property @@ -106,10 +110,10 @@ def get_vocab(self): return dict(self.vocab, **self.added_tokens_encoder) def _tokenize(self, text): - return self.spe_tokenizer.tokenize(text).split(' ') + return self.spe_tokenizer.tokenize(text).split(" ") def _convert_token_to_id(self, token): - """ Converts a token (str) in an id using the vocab. """ + """Converts a token (str) in an id using the vocab.""" return self.vocab.get(token, self.vocab.get(self.unk_token)) def _convert_id_to_token(self, index): @@ -117,7 +121,7 @@ def _convert_id_to_token(self, index): return self.ids_to_tokens.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): - """ Converts a sequence of tokens (string) in a single string. """ + """Converts a sequence of tokens (string) in a single string.""" out_string = " ".join(tokens).replace(" ##", "").strip() return out_string @@ -145,7 +149,10 @@ def build_inputs_with_special_tokens( return cls + token_ids_0 + sep + token_ids_1 + sep def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False, ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding @@ -167,7 +174,12 @@ def get_special_tokens_mask( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." 
) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return list( + map( + lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0, + ) + ) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] @@ -217,28 +229,30 @@ def save_vocabulary(self, vocab_path): if index != token_index: logger.warning( "Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!".format(vocab_file) + " Please check that the vocabulary is not corrupted!".format( + vocab_file + ) ) index = token_index writer.write(token + "\n") index += 1 return (vocab_file,) + class Atomwise_Tokenizer(object): """Run atom-level SMILES tokenization""" def __init__(self): - """ Constructs a atom-level Tokenizer. - """ + """Constructs a atom-level Tokenizer.""" self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])" self.regex = re.compile(self.regex_pattern) def tokenize(self, text): - """ Basic Tokenization of a SMILES. - """ + """Basic Tokenization of a SMILES.""" tokens = [token for token in self.regex.findall(text)] return tokens + class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer): r""" Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE). @@ -289,7 +303,9 @@ def __init__( "Can't find a vocabulary file at path '{}'.".format(vocab_file) ) self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()] + ) self.tokenizer = Atomwise_Tokenizer() @property @@ -303,7 +319,7 @@ def _tokenize(self, text): return self.tokenizer.tokenize(text) def _convert_token_to_id(self, token): - """ Converts a token (str) in an id using the vocab. """ + """Converts a token (str) in an id using the vocab.""" return self.vocab.get(token, self.vocab.get(self.unk_token)) def _convert_id_to_token(self, index): @@ -311,7 +327,7 @@ def _convert_id_to_token(self, index): return self.ids_to_tokens.get(index, self.unk_token) def convert_tokens_to_string(self, tokens): - """ Converts a sequence of tokens (string) in a single string. """ + """Converts a sequence of tokens (string) in a single string.""" out_string = " ".join(tokens).replace(" ##", "").strip() return out_string @@ -339,7 +355,10 @@ def build_inputs_with_special_tokens( return cls + token_ids_0 + sep + token_ids_1 + sep def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False, ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding @@ -361,7 +380,12 @@ def get_special_tokens_mask( "You should not supply a second sequence if the provided sequence of " "ids is already formated with special tokens for the model." 
) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + return list( + map( + lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0, + ) + ) if token_ids_1 is not None: return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] @@ -411,10 +435,11 @@ def save_vocabulary(self, vocab_path): if index != token_index: logger.warning( "Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!".format(vocab_file) + " Please check that the vocabulary is not corrupted!".format( + vocab_file + ) ) index = token_index writer.write(token + "\n") index += 1 return (vocab_file,) - diff --git a/Pilot1/ST1/smiles_regress_transformer_funcs_hvd.py b/Pilot1/ST1/smiles_regress_transformer_funcs_hvd.py index 54bf04dc..11238beb 100644 --- a/Pilot1/ST1/smiles_regress_transformer_funcs_hvd.py +++ b/Pilot1/ST1/smiles_regress_transformer_funcs_hvd.py @@ -1,17 +1,21 @@ ############# Module Loading ############## import argparse +import json import os -import numpy as np +from functools import partial + import matplotlib +import numpy as np import pandas as pd -import json import ray -from functools import partial matplotlib.use("Agg") +import horovod.keras as hvd # ## importing horovod to use data parallelization in another step +import keras_tuner import tensorflow as tf +from clr_callback import * from tensorflow import keras from tensorflow.keras import backend as K from tensorflow.keras import layers @@ -21,32 +25,33 @@ ModelCheckpoint, ReduceLROnPlateau, ) - from tensorflow.keras.optimizers import Adam from tensorflow.keras.preprocessing import sequence, text -import horovod.keras as hvd ### importing horovod to use data parallelization in another step -import keras_tuner -from clr_callback import * ############## Defining functions ##################### ###################################################### + def r2(y_true, y_pred): SS_res = K.sum(K.square(y_true - y_pred)) SS_tot = K.sum(K.square(y_true - K.mean(y_true))) return 1 - SS_res / (SS_tot + K.epsilon()) + # Implement a Transformer block as a layer # embed_dim: number of tokens. This is used for the key_dim for the multi-head attention calculation # ff_dim: number of nodes in Dense layer # epsilon: needed for numerical stability... not sure what this means to be honest + class TransformerBlock(layers.Layer): # __init__: defining all class variables def __init__(self, embed_dim, num_heads, ff_dim, rate, activation, dropout1): super(TransformerBlock, self).__init__() self.drop_chck = dropout1 - self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)#, activation=activation) + self.att = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) # , activation=activation) self.ffn = keras.Sequential( [ layers.Dense(ff_dim, activation=activation), @@ -57,6 +62,7 @@ def __init__(self, embed_dim, num_heads, ff_dim, rate, activation, dropout1): self.layernorm2 = layers.LayerNormalization(epsilon=1e-6) self.dropout1 = layers.Dropout(rate) self.dropout2 = layers.Dropout(rate) + # call: building simple transformer architecture def call(self, inputs, training): attn_output = self.att(inputs, inputs) @@ -85,29 +91,43 @@ def call(self, x): positions = self.pos_emb(positions) x = self.token_emb(x) return x + positions - + def prep_text(texts, tokenizer, max_sequence_length): # Turns text into into padded sequences. 
- text_sequences = tokenizer.texts_to_sequences(texts) # turns text into tokens - return sequence.pad_sequences(text_sequences, maxlen=max_sequence_length) # pad all sequences so they all have same length - - - -def model_architecture(embed_dim, num_heads, ff_dim, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch): - - vocab_size = 40000 #number of possible 'words' in SMILES data - maxlen = 250 #length of each SMILE sequence in input + text_sequences = tokenizer.texts_to_sequences(texts) # turns text into tokens + return sequence.pad_sequences( + text_sequences, maxlen=max_sequence_length + ) # pad all sequences so they all have same length + + +def model_architecture( + embed_dim, + num_heads, + ff_dim, + DR_TB, + DR_ff, + activation, + dropout1, + lr, + loss_fn, + hvd_switch, +): + + vocab_size = 40000 # number of possible 'words' in SMILES data + maxlen = 250 # length of each SMILE sequence in input inputs = layers.Input(shape=(maxlen,)) embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim) x = embedding_layer(inputs) - transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim, DR_TB, activation, dropout1) + transformer_block = TransformerBlock( + embed_dim, num_heads, ff_dim, DR_TB, activation, dropout1 + ) # Use 4 transformer blocks here x = transformer_block(x) x = transformer_block(x) x = transformer_block(x) x = transformer_block(x) - + x = layers.Reshape((1, 32000), input_shape=(250, 128,))( x ) # reshaping increases parameters but improves accuracy a lot @@ -121,45 +141,54 @@ def model_architecture(embed_dim, num_heads, ff_dim, DR_TB, DR_ff, activation, d x = layers.Dense(16, activation=activation)(x) x = layers.Dropout(DR_ff)(x) outputs = layers.Dense(1, activation=activation)(x) - + model = keras.Model(inputs=inputs, outputs=outputs) - + model.summary() - + # Train and Evaluate opt = Adam(learning_rate=lr) - - #HVD Wrap optimizer in hvd Distributed Optimizer delegates gradient comp to original optimizer, averages gradients, and applies averaged gradients + + # HVD Wrap optimizer in hvd Distributed Optimizer delegates gradient comp to original optimizer, averages gradients, and applies averaged gradients if hvd_switch: opt = hvd.DistributedOptimizer(opt) - model.compile( - loss=loss_fn, optimizer=opt, metrics=["mae", r2] - ) + model.compile(loss=loss_fn, optimizer=opt, metrics=["mae", r2]) return model + def build_model(num_heads, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch): - #units = hp.Int("units", min_value=32, max_value=512, step=32) + # units = hp.Int("units", min_value=32, max_value=512, step=32) embed_dim = 128 ff_dim = 128 # call existing model-building code with the hyperparameter values. 
- model = model_architecture ( - embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, DR_TB=DR_TB, DR_ff = DR_ff, activation=activation, dropout1=dropout1, lr=lr, loss_fn=loss_fn, hvd_switch=hvd_switch + model = model_architecture( + embed_dim=embed_dim, + num_heads=num_heads, + ff_dim=ff_dim, + DR_TB=DR_TB, + DR_ff=DR_ff, + activation=activation, + dropout1=dropout1, + lr=lr, + loss_fn=loss_fn, + hvd_switch=hvd_switch, ) return model + def initialize_hvd(lr, x_train, y_train): - hvd.init() - print("I am rank %d of %d" %(hvd.rank(), hvd.size())) - - #HVD-2: GPU pinning - gpus = tf.config.experimental.list_physical_devices('GPU') + hvd.init() + print("I am rank %d of %d" % (hvd.rank(), hvd.size())) + + # HVD-2: GPU pinning + gpus = tf.config.experimental.list_physical_devices("GPU") # Ping GPU to each9 rank for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu,True) + tf.config.experimental.set_memory_growth(gpu, True) if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU") lr = lr * hvd.size() x_train = np.array_split(x_train, hvd.size()) @@ -172,25 +201,28 @@ def implement_hvd(x_train, y_train): y_train = y_train[hvd.rank()] return (x_train, y_train) -def callback_setting(hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop): - + +def callback_setting( + hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop +): + checkpointer = ModelCheckpoint( - filepath=checkpt_file,#"smile_regress.autosave.model.h5", + filepath=checkpt_file, # "smile_regress.autosave.model.h5", verbose=1, save_weights_only=True, save_best_only=True, ) - - clr = CyclicLR(base_lr = lr, max_lr = 5*lr, step_size=2000.) - - csv_logger = CSVLogger(csv_file)#"smile_regress.training.log") - + + clr = CyclicLR(base_lr=lr, max_lr=5 * lr, step_size=2000.0) + + csv_logger = CSVLogger(csv_file) # "smile_regress.training.log") + # learning rate tuning at each epoch - # is it possible to do batch size tuning at each epoch as well? + # is it possible to do batch size tuning at each epoch as well? 
reduce_lr = ReduceLROnPlateau( monitor="val_loss", factor=0.75, - patience=patience_red_lr,#20, + patience=patience_red_lr, # 20, verbose=1, mode="auto", epsilon=0.0001, @@ -200,16 +232,16 @@ def callback_setting(hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, pa early_stop = EarlyStopping( monitor="val_loss", - patience=patience_early_stop,#100, + patience=patience_early_stop, # 100, verbose=1, mode="auto", - ) + ) if hvd_switch: - #HVD broadcast initial variables from rank0 to all other processes + # HVD broadcast initial variables from rank0 to all other processes hvd_broadcast = hvd.callbacks.BroadcastGlobalVariablesCallback(0) - callbacks = [hvd_broadcast,reduce_lr,clr] + callbacks = [hvd_broadcast, reduce_lr, clr] if hvd.rank() == 0: callbacks.append(csv_logger) @@ -221,35 +253,35 @@ def callback_setting(hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, pa else: return [reduce_lr, clr, csv_logger, early_stop, checkpointer] + def build_model_tuner(hp): - #units = hp.Int("units", min_value=32, max_value=512, step=32) + # units = hp.Int("units", min_value=32, max_value=512, step=32) embed_dim = 128 num_heads = hp.Int("num_heads", min_value=12, max_value=40, step=4) ff_dim = 128 DR_TB = hp.Float("DR_TB", min_value=0.025, max_value=0.5, step=0.025) DR_ff = hp.Float("DR_TB", min_value=0.025, max_value=0.5, step=0.025) activation = hp.Choice("activation", ["relu", "elu", "gelu"]) - #activation="elu" + # activation="elu" dropout1 = hp.Boolean("dropout_aftermulti") lr = hp.Float("lr", min_value=1e-6, max_value=1e-5, step=1e-6) loss_fn = hp.Choice("loss_fn", ["mean_squared_error", "mean_absolute_error"]) # call existing model-building code with the hyperparameter values. - model = model_architecture ( - embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, DR_TB=DR_TB, DR_ff = DR_ff, activation=activation, dropout1=dropout1, lr=lr, loss_fn=loss_fn + model = model_architecture( + embed_dim=embed_dim, + num_heads=num_heads, + ff_dim=ff_dim, + DR_TB=DR_TB, + DR_ff=DR_ff, + activation=activation, + dropout1=dropout1, + lr=lr, + loss_fn=loss_fn, ) return model - - - - - - - - - -#tfm.optimization.lars_optimizer.LARS( +# tfm.optimization.lars_optimizer.LARS( # learning_rate = 0.0000025, # momentum = 0.9, # weight_decay_rate = 0.0, @@ -260,4 +292,3 @@ def build_model_tuner(hp): # exclude_from_layer_adaptation = None, # name = 'LARS', # ) - diff --git a/Pilot1/ST1/smiles_regress_transformer_run_hvd.py b/Pilot1/ST1/smiles_regress_transformer_run_hvd.py index 06e1c4fa..caca5955 100644 --- a/Pilot1/ST1/smiles_regress_transformer_run_hvd.py +++ b/Pilot1/ST1/smiles_regress_transformer_run_hvd.py @@ -1,13 +1,18 @@ ############# Module Loading ############## import argparse import os -import numpy as np + import matplotlib +import numpy as np import pandas as pd matplotlib.use("Agg") +import horovod.keras as hvd # ## importing horovod to use data parallelization in another step +import keras_tuner import tensorflow as tf +from clr_callback import * +from smiles_regress_transformer_funcs_hvd import * from tensorflow import keras from tensorflow.keras import backend as K from tensorflow.keras import layers @@ -17,14 +22,8 @@ ModelCheckpoint, ReduceLROnPlateau, ) - from tensorflow.keras.optimizers import Adam from tensorflow.keras.preprocessing import sequence, text -import horovod.keras as hvd ### importing horovod to use data parallelization in another step -import keras_tuner - -from clr_callback import * -from smiles_regress_transformer_funcs_hvd import * #######Argument 
parsing############# @@ -47,7 +46,7 @@ psr.add_argument("--loss_fn", default="mean_squared_error") psr.add_argument("--hvd_switch", type=bool, default=True) -args = vars(psr.parse_args()) # returns dictionary mapping of an object +args = vars(psr.parse_args()) # returns dictionary mapping of an object ######## Set hyperparameters ######## @@ -59,22 +58,22 @@ dropout1 = args["drop_post_MHA"] lr = args["lr"] loss_fn = args["loss_fn"] -BATCH = 32 # batch size used for training +BATCH = 32 # batch size used for training vocab_size = 40000 maxlen = 250 -#act_fn='elu' -#embed_dim = 128 # Embedding size for each token -#num_heads = 16 # Number of attention heads -#ff_dim = 128 # Hidden layer size in feed forward network inside transformer +# act_fn='elu' +# embed_dim = 128 # Embedding size for each token +# num_heads = 16 # Number of attention heads +# ff_dim = 128 # Hidden layer size in feed forward network inside transformer checkpt_file = "smile_regress.autosave.model.h5" csv_file = "smile_regress.training.log" patience_red_lr = 20 patience_early_stop = 100 hvd_switch = args["hvd_switch"] -########Create training and validation data##### +########Create training and validation data##### -#x: tokenized sequence data, y: single value dock score +# x: tokenized sequence data, y: single value dock score data_path_train = args["in_train"] data_path_vali = args["in_vali"] @@ -84,7 +83,7 @@ data_train.head() # Dataset has type and smiles as the two fields # reshaping: y formatted as [[y_1],[y_2],...] with floats -y_train = data_train["type"].values.reshape(-1, 1) * 1.0 +y_train = data_train["type"].values.reshape(-1, 1) * 1.0 y_val = data_vali["type"].values.reshape(-1, 1) * 1.0 tokenizer = text.Tokenizer(num_words=vocab_size) @@ -99,19 +98,16 @@ x_train, y_train = implement_hvd(x_train, y_train) - ######## Build model ############# +######## Build model ############# + +model = build_model( + num_heads, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch +) -model = build_model(num_heads, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch) - ####### Set callbacks ############## -callbacks = callback_setting ( - hvd_switch, - checkpt_file, - lr, - csv_file, - patience_red_lr, - patience_early_stop - ) +callbacks = callback_setting( + hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop +) ####### Train model! 
######### @@ -124,4 +120,3 @@ validation_data=(x_val, y_val), callbacks=callbacks, ) - diff --git a/Pilot1/ST1/smiles_regress_transformer_spe_funcs.py b/Pilot1/ST1/smiles_regress_transformer_spe_funcs.py index 1082bede..09b7d596 100644 --- a/Pilot1/ST1/smiles_regress_transformer_spe_funcs.py +++ b/Pilot1/ST1/smiles_regress_transformer_spe_funcs.py @@ -1,16 +1,32 @@ ############# Module Loading ############## import argparse +import json import os -import numpy as np +from functools import partial + import matplotlib +import numpy as np import pandas as pd -import json -from functools import partial matplotlib.use("Agg") +import codecs +from itertools import chain, islice, repeat + +import deephyper +import horovod.keras as hvd # ## importing horovod to use data parallelization in another step +import ray import tensorflow as tf +from clr_callback import * +from deephyper.evaluator import Evaluator +from deephyper.evaluator.callback import TqdmCallback +from deephyper.problem import HpProblem +from deephyper.search.hps import CBO + +# from SmilesPE.spe2vec import * +from smiles_pair_encoders_functions import * +from SmilesPE.tokenizer import * from tensorflow import keras from tensorflow.keras import backend as K from tensorflow.keras import layers @@ -20,33 +36,22 @@ ModelCheckpoint, ReduceLROnPlateau, ) -import codecs -from SmilesPE.tokenizer import * -#from SmilesPE.spe2vec import * -from smiles_pair_encoders_functions import * from tensorflow.keras.optimizers import Adam from tensorflow.keras.preprocessing import sequence, text -import horovod.keras as hvd ### importing horovod to use data parallelization in another step -from clr_callback import * -import deephyper -from deephyper.problem import HpProblem from tensorflow.python.client import device_lib -import ray -from deephyper.evaluator import Evaluator -from deephyper.evaluator.callback import TqdmCallback -from deephyper.search.hps import CBO -from itertools import chain, repeat, islice + def pad_infinite(iterable, padding=None): - return chain(iterable, repeat(padding)) + return chain(iterable, repeat(padding)) + def pad(iterable, size, padding=None): - return islice(pad_infinite(iterable, padding), size) + return islice(pad_infinite(iterable, padding), size) def ParamsJson(json_file): with open(json_file) as f: - params = json.load(f) + params = json.load(f) return params @@ -68,9 +73,9 @@ def ArgParsing(): psr.add_argument("--lr", type=float, default=1e-5) psr.add_argument("--loss_fn", default="mean_squared_error") psr.add_argument("--hvd_switch", type=bool, default=True) - - args = vars(psr.parse_args()) # returns dictionary mapping of an object - + + args = vars(psr.parse_args()) # returns dictionary mapping of an object + ######## Set hyperparameters ######## data_path_train = args["in_train"] data_path_vali = args["in_vali"] @@ -85,21 +90,34 @@ def ArgParsing(): loss_fn = args["loss_fn"] hvd_switch = args["hvd_switch"] - return data_path_train, data_path_vali, EPOCH, num_heads, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch + return ( + data_path_train, + data_path_vali, + EPOCH, + num_heads, + DR_TB, + DR_ff, + activation, + dropout1, + lr, + loss_fn, + hvd_switch, + ) + def initialize_hvd(): - hvd.init() - print("I am rank %d of %d" %(hvd.rank(), hvd.size())) - - #HVD-2: GPU pinning - gpus = tf.config.experimental.list_physical_devices('GPU') + hvd.init() + print("I am rank %d of %d" % (hvd.rank(), hvd.size())) + + # HVD-2: GPU pinning + gpus = tf.config.experimental.list_physical_devices("GPU") # Ping GPU to 
each9 rank for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu,True) + tf.config.experimental.set_memory_growth(gpu, True) if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU") - return + return def split_data(data_x, data_y): @@ -108,7 +126,7 @@ def split_data(data_x, data_y): return (data_x, data_y) -#def implement_hvd(x_train, y_train): +# def implement_hvd(x_train, y_train): # x_train = x_train[hvd.rank()] # y_train = y_train[hvd.rank()] # return (x_train, y_train) @@ -116,6 +134,7 @@ def split_data(data_x, data_y): # Implement embedding layer # Two seperate embedding layers, one for tokens, one for token index (positions). + class TokenAndPositionEmbedding(layers.Layer): def __init__(self, maxlen, vocab_size, embed_dim): super(TokenAndPositionEmbedding, self).__init__() @@ -129,78 +148,202 @@ def call(self, x): x = self.token_emb(x) return x + positions + def prep_text(texts, tokenizer, max_sequence_length): # Turns text into into padded sequences. - text_sequences = tokenizer.texts_to_sequences(texts) # turns text into tokens - return sequence.pad_sequences(text_sequences, maxlen=max_sequence_length) # pad all sequences so they all have same length + text_sequences = tokenizer.texts_to_sequences(texts) # turns text into tokens + return sequence.pad_sequences( + text_sequences, maxlen=max_sequence_length + ) # pad all sequences so they all have same length + +# def train_val_data(data_path_train, data_path_vali, hvd_switch, vocab_size, maxlen): -#def train_val_data(data_path_train, data_path_vali, hvd_switch, vocab_size, maxlen): def preprocess_smiles_pair_encoding(data, maxlen, vocab_file, spe_file): # some default tokens from huggingface - default_toks = ['[PAD]', - '[unused1]', '[unused2]', '[unused3]', '[unused4]','[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', - '[UNK]', '[CLS]', '[SEP]', '[MASK]'] - - + default_toks = [ + "[PAD]", + "[unused1]", + "[unused2]", + "[unused3]", + "[unused4]", + "[unused5]", + "[unused6]", + "[unused7]", + "[unused8]", + "[unused9]", + "[unused10]", + "[UNK]", + "[CLS]", + "[SEP]", + "[MASK]", + ] + # atom-level tokens used for trained the spe vocabulary - atom_toks = ['[c-]', '[SeH]', '[N]', '[C@@]', '[Te]', '[OH+]', 'n', '[AsH]', '[B]', 'b', - '[S@@]', 'o', ')', '[NH+]', '[SH]', 'O', 'I', '[C@]', '-', '[As+]', '[Cl+2]', - '[P+]', '[o+]', '[C]', '[C@H]', '[CH2]', '\\', 'P', '[O-]', '[NH-]', '[S@@+]', - '[te]', '[s+]', 's', '[B-]', 'B', 'F', '=', '[te+]', '[H]', '[C@@H]', '[Na]', - '[Si]', '[CH2-]', '[S@+]', 'C', '[se+]', '[cH-]', '6', 'N', '[IH2]', '[As]', - '[Si@]', '[BH3-]', '[Se]', 'Br', '[C+]', '[I+3]', '[b-]', '[P@+]', '[SH2]', '[I+2]', - '%11', '[Ag-3]', '[O]', '9', 'c', '[N-]', '[BH-]', '4', '[N@+]', '[SiH]', '[Cl+3]', '#', - '(', '[O+]', '[S-]', '[Br+2]', '[nH]', '[N+]', '[n-]', '3', '[Se+]', '[P@@]', '[Zn]', '2', - '[NH2+]', '%10', '[SiH2]', '[nH+]', '[Si@@]', '[P@@+]', '/', '1', '[c+]', '[S@]', '[S+]', - '[SH+]', '[B@@-]', '8', '[B@-]', '[C-]', '7', '[P@]', '[se]', 'S', '[n+]', '[PH]', '[I+]', '5', 'p', '[BH2-]', '[N@@+]', '[CH]', 'Cl'] - - tokenizer = SMILES_SPE_Tokenizer(vocab_file=vocab_file, spe_file= spe_file) - - tokenized_data = np.array([list(pad(tokenizer(smi)['input_ids'], maxlen, 0)) for smi in data]) - + atom_toks = [ + "[c-]", + "[SeH]", + "[N]", + "[C@@]", + "[Te]", + "[OH+]", + "n", + "[AsH]", + "[B]", + "b", + "[S@@]", + "o", + ")", + "[NH+]", + "[SH]", + "O", + "I", + 
"[C@]", + "-", + "[As+]", + "[Cl+2]", + "[P+]", + "[o+]", + "[C]", + "[C@H]", + "[CH2]", + "\\", + "P", + "[O-]", + "[NH-]", + "[S@@+]", + "[te]", + "[s+]", + "s", + "[B-]", + "B", + "F", + "=", + "[te+]", + "[H]", + "[C@@H]", + "[Na]", + "[Si]", + "[CH2-]", + "[S@+]", + "C", + "[se+]", + "[cH-]", + "6", + "N", + "[IH2]", + "[As]", + "[Si@]", + "[BH3-]", + "[Se]", + "Br", + "[C+]", + "[I+3]", + "[b-]", + "[P@+]", + "[SH2]", + "[I+2]", + "%11", + "[Ag-3]", + "[O]", + "9", + "c", + "[N-]", + "[BH-]", + "4", + "[N@+]", + "[SiH]", + "[Cl+3]", + "#", + "(", + "[O+]", + "[S-]", + "[Br+2]", + "[nH]", + "[N+]", + "[n-]", + "3", + "[Se+]", + "[P@@]", + "[Zn]", + "2", + "[NH2+]", + "%10", + "[SiH2]", + "[nH+]", + "[Si@@]", + "[P@@+]", + "/", + "1", + "[c+]", + "[S@]", + "[S+]", + "[SH+]", + "[B@@-]", + "8", + "[B@-]", + "[C-]", + "7", + "[P@]", + "[se]", + "S", + "[n+]", + "[PH]", + "[I+]", + "5", + "p", + "[BH2-]", + "[N@@+]", + "[CH]", + "Cl", + ] + + tokenizer = SMILES_SPE_Tokenizer(vocab_file=vocab_file, spe_file=spe_file) + + tokenized_data = np.array( + [list(pad(tokenizer(smi)["input_ids"], maxlen, 0)) for smi in data] + ) + return tokenized_data def train_val_data(hyper_params): - data_path = hyper_params['data_loading']['data_path'] - rec = hyper_params['data_loading']['rec'] - pattern = hyper_params['data_loading']['pattern'] + data_path = hyper_params["data_loading"]["data_path"] + rec = hyper_params["data_loading"]["rec"] + pattern = hyper_params["data_loading"]["pattern"] - tokenizer_params = hyper_params['tokenization']['tokenizer'] - #vocabulary = hyper_params['tokenization']['vocab'] - vocab_size = hyper_params['tokenization']['vocab_size'] - maxlen = hyper_params['tokenization']['maxlen'] - hvd_switch = hyper_params['general']['use_hvd'] + tokenizer_params = hyper_params["tokenization"]["tokenizer"] + # vocabulary = hyper_params['tokenization']['vocab'] + vocab_size = hyper_params["tokenization"]["vocab_size"] + maxlen = hyper_params["tokenization"]["maxlen"] + hvd_switch = hyper_params["general"]["use_hvd"] + + data_train = pd.read_csv(f"{data_path}/ml.{rec}.{pattern}.train") + data_vali = pd.read_csv(f"{data_path}/ml.{rec}.{pattern}.val") - data_train = pd.read_csv(f'{data_path}/ml.{rec}.{pattern}.train') - data_vali = pd.read_csv(f'{data_path}/ml.{rec}.{pattern}.val') - data_train.head() # Dataset has type and smiles as the two fields # reshaping: y formatted as [[y_1],[y_2],...] 
with floats x_smiles_train = data_train["smiles"] x_smiles_val = data_vali["smiles"] - y_train = data_train["type"].values.reshape(-1, 1) * 1.0 + y_train = data_train["type"].values.reshape(-1, 1) * 1.0 y_val = data_vali["type"].values.reshape(-1, 1) * 1.0 if hvd_switch: x_smiles_train, y_train = split_data(x_smiles_train, y_train) - - if tokenizer_params['category'] == 'smilespair': - spe_file = tokenizer_params['spe_file'] - vocab_file = tokenizer_params['vocab_file'] - x_train = preprocess_smiles_pair_encoding(x_smiles_train, - maxlen, - vocab_file, - spe_file) - - x_val = preprocess_smiles_pair_encoding(x_smiles_val, - maxlen, - vocab_file, - spe_file) + + if tokenizer_params["category"] == "smilespair": + spe_file = tokenizer_params["spe_file"] + vocab_file = tokenizer_params["vocab_file"] + x_train = preprocess_smiles_pair_encoding( + x_smiles_train, maxlen, vocab_file, spe_file + ) + + x_val = preprocess_smiles_pair_encoding( + x_smiles_val, maxlen, vocab_file, spe_file + ) print(x_train) else: @@ -209,29 +352,35 @@ def train_val_data(hyper_params): x_train = prep_text(data_train["smiles"], tokenizer, maxlen) x_val = prep_text(data_vali["smiles"], tokenizer, maxlen) - + ######## Implement horovod if necessary ######## - #if hvd_switch: + # if hvd_switch: # x_train, y_train = initialize_hvd(x_train, y_train) # x_train, y_train = implement_hvd(x_train, y_train) return x_train, y_train, x_val, y_val + def get_available_gpus(): local_device_protos = device_lib.list_local_devices() n_gpus = len([x.name for x in local_device_protos if x.device_type == "GPU"]) print(f"Num of gpus is {n_gpus}") if n_gpus > 1: n_gpus -= 1 - + is_gpu_available = n_gpus > 0 - - #if is_gpu_available: - #print(f"{n_gpus} GPU{'s are' if n_gpus > 1 else ' is'} available.") - #else: - #print("No GPU available") - return local_device_protos, [x.name for x in local_device_protos if x.device_type == "GPU"], n_gpus, is_gpu_available + # if is_gpu_available: + # print(f"{n_gpus} GPU{'s are' if n_gpus > 1 else ' is'} available.") + # else: + # print("No GPU available") + + return ( + local_device_protos, + [x.name for x in local_device_protos if x.device_type == "GPU"], + n_gpus, + is_gpu_available, + ) def r2(y_true, y_pred): @@ -245,12 +394,15 @@ def r2(y_true, y_pred): # ff_dim: number of nodes in Dense layer # epsilon: needed for numerical stability... 
not sure what this means to be honest + class TransformerBlock(layers.Layer): # __init__: defining all class variables def __init__(self, embed_dim, num_heads, ff_dim, rate, activation, dropout1): super(TransformerBlock, self).__init__() self.drop_chck = dropout1 - self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)#, activation=activation) + self.att = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) # , activation=activation) self.ffn = keras.Sequential( [ layers.Dense(ff_dim, activation=activation), @@ -261,6 +413,7 @@ def __init__(self, embed_dim, num_heads, ff_dim, rate, activation, dropout1): self.layernorm2 = layers.LayerNormalization(epsilon=1e-6) self.dropout1 = layers.Dropout(rate) self.dropout2 = layers.Dropout(rate) + # call: building simple transformer architecture def call(self, inputs, training): attn_output = self.att(inputs, inputs) @@ -272,43 +425,44 @@ def call(self, inputs, training): return self.layernorm2(out1 + ffn_output) + class ModelArchitecture(layers.Layer): - #def __init__(self, vocab_size, maxlen, embed_dim, num_heads, ff_dim, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch): + # def __init__(self, vocab_size, maxlen, embed_dim, num_heads, ff_dim, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch): def __init__(self, hyper_params): - - lr = hyper_params['general']['lr'] - vocab_size = hyper_params['tokenization']['vocab_size'] - maxlen = hyper_params['tokenization']['maxlen'] - hvd_switch = hyper_params['general']['use_hvd'] - - arch_params = hyper_params['architecture'] - embed_dim = arch_params['embedding']['embed_dim'] - num_heads = arch_params['transformer_block']['num_heads'] - ff_dim = arch_params['transformer_block']['ff_dim'] - DR_TB_1 = arch_params['transformer_block']['dr1'] - DR_TB_2 = arch_params['transformer_block']['dr2'] - DR_ff = arch_params['regressor_head']['dr'] - activation_transformer = arch_params['transformer_block']['activation'] - activation_regressor = arch_params['regressor_head']['activation'] - dropout1 = arch_params['transformer_block']['drop_mha'] - - self.num_tb = arch_params['transformer_block']['num_blocks'] - self.loss_fn = hyper_params['general']['loss_fn'] + + lr = hyper_params["general"]["lr"] + vocab_size = hyper_params["tokenization"]["vocab_size"] + maxlen = hyper_params["tokenization"]["maxlen"] + hvd_switch = hyper_params["general"]["use_hvd"] + + arch_params = hyper_params["architecture"] + embed_dim = arch_params["embedding"]["embed_dim"] + num_heads = arch_params["transformer_block"]["num_heads"] + ff_dim = arch_params["transformer_block"]["ff_dim"] + DR_TB_1 = arch_params["transformer_block"]["dr1"] + DR_TB_2 = arch_params["transformer_block"]["dr2"] + DR_ff = arch_params["regressor_head"]["dr"] + activation_transformer = arch_params["transformer_block"]["activation"] + activation_regressor = arch_params["regressor_head"]["activation"] + dropout1 = arch_params["transformer_block"]["drop_mha"] + + self.num_tb = arch_params["transformer_block"]["num_blocks"] + self.loss_fn = hyper_params["general"]["loss_fn"] self.inputs = layers.Input(shape=(maxlen,)) - self.embedding_layer = TokenAndPositionEmbedding(maxlen, - vocab_size, - embed_dim) + self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim) - self.transformer_block = TransformerBlock(embed_dim, - num_heads, - ff_dim, - DR_TB_1, - activation_transformer, - dropout1) + self.transformer_block = TransformerBlock( + embed_dim, num_heads, ff_dim, DR_TB_1, 
activation_transformer, dropout1 + ) - self.reshape = layers.Reshape((1, maxlen * embed_dim), - input_shape=(maxlen, embed_dim,)) + self.reshape = layers.Reshape( + (1, maxlen * embed_dim), + input_shape=( + maxlen, + embed_dim, + ), + ) self.dropout1 = layers.Dropout(DR_ff) self.dropout2 = layers.Dropout(DR_ff) @@ -324,11 +478,11 @@ def __init__(self, hyper_params): if hvd_switch: lr = lr * hvd.size() - self.opt = Adam(learning_rate=lr) + self.opt = Adam(learning_rate=lr) self.opt = hvd.DistributedOptimizer(self.opt) else: self.opt = Adam(learning_rate=lr) - + def call(self): x = self.embedding_layer(self.inputs) for tb in range(self.num_tb): @@ -344,42 +498,46 @@ def call(self): x = self.dropout3(x) x = self.dense3(x) - + x = self.dropout4(x) x = self.dense4(x) - + x = self.dropout5(x) outputs = self.dense5(x) - + model = keras.Model(inputs=self.inputs, outputs=outputs) model.summary() model.compile( - loss=self.loss_fn, optimizer=self.opt, metrics=["mae", r2], steps_per_execution=100 + loss=self.loss_fn, + optimizer=self.opt, + metrics=["mae", r2], + steps_per_execution=100, ) - + return model + class TrainingAndCallbacks: - #def __init__(self, hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop): + # def __init__(self, hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop): def __init__(self, hyper_params): - self.hvd_switch = hyper_params['general']['use_hvd'] - checkpt_file = hyper_params['callbacks']['checkpt_file'] - csv_file = hyper_params['callbacks']['log_csv'] - patience_red_lr = hyper_params['callbacks']['patience_red_lr'] - patience_early_stop = hyper_params['callbacks']['patience_early_stop'] - lr = hyper_params['general']['lr'] + self.hvd_switch = hyper_params["general"]["use_hvd"] + checkpt_file = hyper_params["callbacks"]["checkpt_file"] + csv_file = hyper_params["callbacks"]["log_csv"] + patience_red_lr = hyper_params["callbacks"]["patience_red_lr"] + patience_early_stop = hyper_params["callbacks"]["patience_early_stop"] + lr = hyper_params["general"]["lr"] if self.hvd_switch: lr = lr * hvd.size() self.checkpointer = ModelCheckpoint( - filepath=checkpt_file,#"smile_regress.autosave.model.h5", + filepath=checkpt_file, # "smile_regress.autosave.model.h5", verbose=1, save_weights_only=True, save_best_only=True, - ) + ) - self.clr = CyclicLR(base_lr = lr, max_lr = 5*lr, step_size=2000.) 
+ self.clr = CyclicLR(base_lr=lr, max_lr=5 * lr, step_size=2000.0) self.csv_logger = CSVLogger(csv_file) self.reduce_lr = ReduceLROnPlateau( @@ -391,17 +549,17 @@ def __init__(self, hyper_params): epsilon=0.0001, cooldown=3, min_lr=0.000000001, - ) + ) self.early_stop = EarlyStopping( monitor="val_loss", patience=patience_early_stop, verbose=1, mode="auto", - ) + ) if self.hvd_switch: - #HVD broadcast initial variables from rank0 to all other processes + # HVD broadcast initial variables from rank0 to all other processes self.hvd_broadcast = hvd.callbacks.BroadcastGlobalVariablesCallback(0) def callback_defining(self): @@ -414,11 +572,17 @@ def callback_defining(self): callbacks.append(self.checkpointer) return callbacks else: - return [self.reduce_lr, self.clr, self.csv_logger, self.early_stop, self.checkpointer] + return [ + self.reduce_lr, + self.clr, + self.csv_logger, + self.early_stop, + self.checkpointer, + ] def training(self, model, x_train, y_train, validation_data, hyper_params): - BATCH = hyper_params['general']['batch_size'] - EPOCH = hyper_params['general']['epochs'] + BATCH = hyper_params["general"]["batch_size"] + EPOCH = hyper_params["general"]["epochs"] callbacks = self.callback_defining() history = model.fit( @@ -434,9 +598,8 @@ def training(self, model, x_train, y_train, validation_data, hyper_params): return history - class RunFnDeepHyper: - def __init__(self, x_train, y_train, x_val, y_val): + def __init__(self, x_train, y_train, x_val, y_val): # Params that are currently static self.vocab_size = 40000 self.maxlen = 250 @@ -446,8 +609,8 @@ def __init__(self, x_train, y_train, x_val, y_val): self.patience_red_lr = 20 self.patience_early_stop = 100 self.hvd_switch = False - self.checkpt_file = 'smile_regress.autosave.model.h5' - self.csv_file = 'smile_regress.training.log' + self.checkpt_file = "smile_regress.autosave.model.h5" + self.csv_file = "smile_regress.training.log" self.x_train = x_train self.y_train = y_train @@ -455,7 +618,7 @@ def __init__(self, x_train, y_train, x_val, y_val): self.y_val = y_val def run(self, config): - + num_heads = config["num_heads"] DR_TB = config["DR_TB"] DR_ff = config["DR_ff"] @@ -466,36 +629,60 @@ def run(self, config): EPOCH = config["epochs"] validation_data = (self.x_val, self.y_val) - model = ModelArchitecture(self.vocab_size, self.maxlen, self.embed_dim, num_heads, self.ff_dim, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, self.hvd_switch).call() - - history = TrainingAndCallbacks(self.hvd_switch, self.checkpt_file, lr, self.csv_file, self.patience_red_lr, self.patience_early_stop).training( model, self.x_train, self.y_train, validation_data, self.BATCH, EPOCH) + model = ModelArchitecture( + self.vocab_size, + self.maxlen, + self.embed_dim, + num_heads, + self.ff_dim, + DR_TB, + DR_ff, + activation, + dropout1, + lr, + loss_fn, + self.hvd_switch, + ).call() + + history = TrainingAndCallbacks( + self.hvd_switch, + self.checkpt_file, + lr, + self.csv_file, + self.patience_red_lr, + self.patience_early_stop, + ).training( + model, self.x_train, self.y_train, validation_data, self.BATCH, EPOCH + ) - return history.history["val_accuracy"] [-1] + return history.history["val_accuracy"][-1] def run(config): - DATA_PATH='/grand/datascience/avasan/ST_Benchmarks/Data/1M-flatten' - - TFIL='ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.train' - - VFIL='ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.val' - - data_path_train = f'{DATA_PATH}/{TFIL}' - 
data_path_vali = f'{DATA_PATH}/{TFIL}' + DATA_PATH = "/grand/datascience/avasan/ST_Benchmarks/Data/1M-flatten" + + TFIL = "ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.train" + + VFIL = "ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.val" + + data_path_train = f"{DATA_PATH}/{TFIL}" + data_path_vali = f"{DATA_PATH}/{TFIL}" hvd_switch = False - BATCH = 32 # batch size used for training + BATCH = 32 # batch size used for training vocab_size = 40000 maxlen = 250 - embed_dim = 128 # Embedding size for each token - ff_dim = 128 # Hidden layer size in feed forward network inside transformer + embed_dim = 128 # Embedding size for each token + ff_dim = 128 # Hidden layer size in feed forward network inside transformer checkpt_file = "smile_regress.autosave.model.h5" csv_file = "smile_regress.training.log" patience_red_lr = 20 patience_early_stop = 100 - - ########Create training and validation data##### - x_train, y_train, x_val, y_val = train_val_data(data_path_train, data_path_vali, hvd_switch, vocab_size, maxlen) + + ########Create training and validation data##### + x_train, y_train, x_val, y_val = train_val_data( + data_path_train, data_path_vali, hvd_switch, vocab_size, maxlen + ) num_heads = config["num_heads"] DR_TB = config["DR_TB"] DR_ff = config["DR_ff"] @@ -506,48 +693,70 @@ def run(config): EPOCH = config["epochs"] validation_data = (x_val, y_val) - model = ModelArchitecture(vocab_size, maxlen, embed_dim, num_heads, ff_dim, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch).call() - - history = TrainingAndCallbacks(hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop).training( model, x_train, y_train, validation_data, BATCH, EPOCH) + model = ModelArchitecture( + vocab_size, + maxlen, + embed_dim, + num_heads, + ff_dim, + DR_TB, + DR_ff, + activation, + dropout1, + lr, + loss_fn, + hvd_switch, + ).call() - return history.history["val_accuracy"] [-1] + history = TrainingAndCallbacks( + hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop + ).training(model, x_train, y_train, validation_data, BATCH, EPOCH) + return history.history["val_accuracy"][-1] def hyper_param_problem(): ACTIVATIONS = [ - "elu", "gelu", "hard_sigmoid", "linear", "relu", "selu", - "sigmoid", "softplus", "softsign", "swish", "tanh", + "elu", + "gelu", + "hard_sigmoid", + "linear", + "relu", + "selu", + "sigmoid", + "softplus", + "softsign", + "swish", + "tanh", ] - LRS = [1e-6 * i for i in range(0,11)] - + LRS = [1e-6 * i for i in range(0, 11)] + LOSSFNS = ["mean_squared_error", "mean_absolute_error"] problem = HpProblem() - problem.add_hyperparameter((12, 32), "num_heads", default_value = 16) - problem.add_hyperparameter((0.025, 0.5), "DR_TB", default_value = 0.1) - problem.add_hyperparameter((0.025, 0.5), "DR_ff", default_value = 0.1) - problem.add_hyperparameter(ACTIVATIONS, "activation", default_value = "elu") - problem.add_hyperparameter((1e-7, 1e-5), "lr", default_value = 1e-6) - problem.add_hyperparameter(LOSSFNS, "loss_fn", default_value = "mean_squared_error") - problem.add_hyperparameter((2,10), "epochs", default_value = 2) - problem.add_hyperparameter([True, False], "dropout_aftermulti", default_value = False) + problem.add_hyperparameter((12, 32), "num_heads", default_value=16) + problem.add_hyperparameter((0.025, 0.5), "DR_TB", default_value=0.1) + problem.add_hyperparameter((0.025, 0.5), "DR_ff", default_value=0.1) + problem.add_hyperparameter(ACTIVATIONS, 
"activation", default_value="elu") + problem.add_hyperparameter((1e-7, 1e-5), "lr", default_value=1e-6) + problem.add_hyperparameter(LOSSFNS, "loss_fn", default_value="mean_squared_error") + problem.add_hyperparameter((2, 10), "epochs", default_value=2) + problem.add_hyperparameter([True, False], "dropout_aftermulti", default_value=False) return problem def default_evaluation(problem, is_gpu_available, n_gpus, run): if is_gpu_available: - if not(ray.is_initialized()): + if not (ray.is_initialized()): ray.init(num_cpus=n_gpus, num_gpus=n_gpus, log_to_driver=False) - run_default = ray.remote(num_cpus=1, num_gpus=1)(run) objective_default = ray.get(run_default.remote(problem.default_configuration)) else: - if not(ray.is_initialized()): + if not (ray.is_initialized()): ray.init(num_cpus=1, log_to_driver=False) run_default = run print(problem.default_configuration) @@ -560,7 +769,7 @@ def get_evaluator(run_function, is_gpu_available, n_gpus): method_kwargs = { "num_cpus": 1, "num_cpus_per_task": 1, - "callbacks": [TqdmCallback()] + "callbacks": [TqdmCallback()], } # If GPU devices are detected then it will create 'n_gpus' workers @@ -572,17 +781,17 @@ def get_evaluator(run_function, is_gpu_available, n_gpus): method_kwargs["num_gpus_per_task"] = 1 evaluator = Evaluator.create( - run_function, - method="ray", - method_kwargs=method_kwargs + run_function, method="ray", method_kwargs=method_kwargs + ) + print( + f"Created new evaluator with {evaluator.num_workers} worker{'s' if evaluator.num_workers > 1 else ''} and config: {method_kwargs}", ) - print(f"Created new evaluator with {evaluator.num_workers} worker{'s' if evaluator.num_workers > 1 else ''} and config: {method_kwargs}", ) return evaluator def build_model_tuner(hp): - #units = hp.Int("units", min_value=32, max_value=512, step=32) + # units = hp.Int("units", min_value=32, max_value=512, step=32) vocab_size = 40000 maxlen = 250 embed_dim = 128 @@ -591,17 +800,30 @@ def build_model_tuner(hp): DR_TB = hp.Float("DR_TB", min_value=0.025, max_value=0.5, step=0.025) DR_ff = hp.Float("DR_TB", min_value=0.025, max_value=0.5, step=0.025) activation = hp.Choice("activation", ["relu", "elu", "gelu"]) - #activation="elu" + # activation="elu" dropout1 = hp.Boolean("dropout_aftermulti") lr = hp.Float("lr", min_value=1e-6, max_value=1e-5, step=1e-6) loss_fn = hp.Choice("loss_fn", ["mean_squared_error", "mean_absolute_error"]) # call existing model-building code with the hyperparameter values. 
- model = ModelArchitecture(vocab_size, maxlen, embed_dim, num_heads, ff_dim, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch).call() + model = ModelArchitecture( + vocab_size, + maxlen, + embed_dim, + num_heads, + ff_dim, + DR_TB, + DR_ff, + activation, + dropout1, + lr, + loss_fn, + hvd_switch, + ).call() return model -#tfm.optimization.lars_optimizer.LARS( +# tfm.optimization.lars_optimizer.LARS( # learning_rate = 0.0000025, # momentum = 0.9, # weight_decay_rate = 0.0, @@ -613,20 +835,34 @@ def build_model_tuner(hp): # name = 'LARS', # ) -def model_architecture(embed_dim, num_heads, ff_dim, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch): - vocab_size = 40000 #number of possible 'words' in SMILES data - maxlen = 250 #length of each SMILE sequence in input +def model_architecture( + embed_dim, + num_heads, + ff_dim, + DR_TB, + DR_ff, + activation, + dropout1, + lr, + loss_fn, + hvd_switch, +): + + vocab_size = 40000 # number of possible 'words' in SMILES data + maxlen = 250 # length of each SMILE sequence in input inputs = layers.Input(shape=(maxlen,)) embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim) x = embedding_layer(inputs) - transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim, DR_TB, activation, dropout1) + transformer_block = TransformerBlock( + embed_dim, num_heads, ff_dim, DR_TB, activation, dropout1 + ) # Use 4 transformer blocks here x = transformer_block(x) x = transformer_block(x) x = transformer_block(x) x = transformer_block(x) - + x = layers.Reshape((1, 32000), input_shape=(250, 128,))( x ) # reshaping increases parameters but improves accuracy a lot @@ -640,43 +876,44 @@ def model_architecture(embed_dim, num_heads, ff_dim, DR_TB, DR_ff, activation, d x = layers.Dense(16, activation=activation)(x) x = layers.Dropout(DR_ff)(x) outputs = layers.Dense(1, activation=activation)(x) - + model = keras.Model(inputs=inputs, outputs=outputs) - + model.summary() # Train and Evaluate opt = Adam(learning_rate=lr) - - #HVD Wrap optimizer in hvd Distributed Optimizer delegates gradient comp to original optimizer, averages gradients, and applies averaged gradients + + # HVD Wrap optimizer in hvd Distributed Optimizer delegates gradient comp to original optimizer, averages gradients, and applies averaged gradients if hvd_switch: opt = hvd.DistributedOptimizer(opt) - model.compile( - loss=loss_fn, optimizer=opt, metrics=["mae", r2] - ) + model.compile(loss=loss_fn, optimizer=opt, metrics=["mae", r2]) return model -def callback_setting(hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop): - + +def callback_setting( + hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop +): + checkpointer = ModelCheckpoint( - filepath=checkpt_file,#"smile_regress.autosave.model.h5", + filepath=checkpt_file, # "smile_regress.autosave.model.h5", verbose=1, save_weights_only=True, save_best_only=True, ) - - clr = CyclicLR(base_lr = lr, max_lr = 5*lr, step_size=2000.) - - csv_logger = CSVLogger(csv_file)#"smile_regress.training.log") - + + clr = CyclicLR(base_lr=lr, max_lr=5 * lr, step_size=2000.0) + + csv_logger = CSVLogger(csv_file) # "smile_regress.training.log") + # learning rate tuning at each epoch - # is it possible to do batch size tuning at each epoch as well? + # is it possible to do batch size tuning at each epoch as well? 
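# build_model_tuner() above follows the KerasTuner hypermodel convention
# (a callable taking an `hp` object and returning a compiled model). The tuner
# driver itself is not part of this diff; a minimal sketch, assuming the
# keras_tuner package, with x_train/y_train/x_val/y_val coming from
# train_val_data() as elsewhere in this file (directory name is illustrative):
import keras_tuner as kt

def sketch_keras_tuner_search(x_train, y_train, x_val, y_val):
    tuner = kt.RandomSearch(
        build_model_tuner,
        objective="val_loss",           # regression, so minimize validation loss
        max_trials=10,
        overwrite=True,
        directory="kt_smiles_regress",  # illustrative output directory
    )
    tuner.search(x_train, y_train, epochs=2, validation_data=(x_val, y_val))
    return tuner.get_best_models(num_models=1)[0]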
reduce_lr = ReduceLROnPlateau( monitor="val_loss", factor=0.75, - patience=patience_red_lr,#20, + patience=patience_red_lr, # 20, verbose=1, mode="auto", epsilon=0.0001, @@ -686,16 +923,16 @@ def callback_setting(hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, pa early_stop = EarlyStopping( monitor="val_loss", - patience=patience_early_stop,#100, + patience=patience_early_stop, # 100, verbose=1, mode="auto", - ) + ) if hvd_switch: - #HVD broadcast initial variables from rank0 to all other processes + # HVD broadcast initial variables from rank0 to all other processes hvd_broadcast = hvd.callbacks.BroadcastGlobalVariablesCallback(0) - callbacks = [hvd_broadcast,reduce_lr,clr] + callbacks = [hvd_broadcast, reduce_lr, clr] if hvd.rank() == 0: callbacks.append(csv_logger) @@ -708,8 +945,17 @@ def callback_setting(hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, pa return [reduce_lr, clr, csv_logger, early_stop, checkpointer] -def build_model_DeepHyper(x_train, y_train, x_val, y_val, config, hvd_switch=False, checkpt_file = 'smile_regress.autosave.model.h5', csv_file = 'smile_regress.training.log'): - #units = hp.Int("units", min_value=32, max_value=512, step=32) +def build_model_DeepHyper( + x_train, + y_train, + x_val, + y_val, + config, + hvd_switch=False, + checkpt_file="smile_regress.autosave.model.h5", + csv_file="smile_regress.training.log", +): + # units = hp.Int("units", min_value=32, max_value=512, step=32) embed_dim = 128 ff_dim = 128 BATCH = 32 @@ -725,20 +971,23 @@ def build_model_DeepHyper(x_train, y_train, x_val, y_val, config, hvd_switch=Fal loss_fn = config["loss_fn"] EPOCH = config["epochs"] - # call existing model-building code with the hyperparameter values. - model = model_architecture ( - embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, DR_TB=DR_TB, DR_ff = DR_ff, activation=activation, dropout1=dropout1, lr=lr, loss_fn=loss_fn + # call existing model-building code with the hyperparameter values. 
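# callback_setting() above only appends Horovod's BroadcastGlobalVariablesCallback
# and restricts logging/checkpointing to rank 0 when hvd_switch is True; the
# process-level initialization is assumed to happen elsewhere and is not shown
# in this diff. A minimal sketch of the conventional Horovod/Keras setup that
# would accompany it (GPU pinning per local rank is an assumption, not code
# from this repo):
import horovod.keras as hvd
import tensorflow as tf

def sketch_initialize_hvd():
    hvd.init()                                             # one process per GPU
    gpus = tf.config.list_physical_devices("GPU")
    if gpus:                                               # pin each rank to its own device
        tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")
    return hvd.rank() == 0                                 # rank 0 handles logs/checkpoints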
+ model = model_architecture( + embed_dim=embed_dim, + num_heads=num_heads, + ff_dim=ff_dim, + DR_TB=DR_TB, + DR_ff=DR_ff, + activation=activation, + dropout1=dropout1, + lr=lr, + loss_fn=loss_fn, + ) + + callbacks = callback_setting( + hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop ) - callbacks = callback_setting ( - hvd_switch, - checkpt_file, - lr, - csv_file, - patience_red_lr, - patience_early_stop - ) - history = model.fit( x_train, y_train, @@ -749,12 +998,10 @@ def build_model_DeepHyper(x_train, y_train, x_val, y_val, config, hvd_switch=Fal callbacks=callbacks, ) - return history.history["val_accuracy"] [-1] - - + return history.history["val_accuracy"][-1] -#def build_model(num_heads, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch): +# def build_model(num_heads, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch): # #units = hp.Int("units", min_value=32, max_value=512, step=32) # embed_dim = 128 # ff_dim = 128 @@ -763,6 +1010,3 @@ def build_model_DeepHyper(x_train, y_train, x_val, y_val, config, hvd_switch=Fal # embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, DR_TB=DR_TB, DR_ff = DR_ff, activation=activation, dropout1=dropout1, lr=lr, loss_fn=loss_fn, hvd_switch=hvd_switch # ) # return model - - - diff --git a/Pilot1/ST1/smiles_regress_transformer_spe_run.py b/Pilot1/ST1/smiles_regress_transformer_spe_run.py index 8124de42..96935459 100644 --- a/Pilot1/ST1/smiles_regress_transformer_spe_run.py +++ b/Pilot1/ST1/smiles_regress_transformer_spe_run.py @@ -1,11 +1,18 @@ ############# Module Loading ############## import argparse import os -import numpy as np + import matplotlib +import numpy as np import pandas as pd + matplotlib.use("Agg") +import json + +import horovod.keras as hvd # ## importing horovod to use data parallelization in another step import tensorflow as tf +from clr_callback import * +from smiles_regress_transformer_spe_funcs import * from tensorflow import keras from tensorflow.keras import backend as K from tensorflow.keras import layers @@ -17,22 +24,17 @@ ) from tensorflow.keras.optimizers import Adam from tensorflow.keras.preprocessing import sequence, text -import horovod.keras as hvd ### importing horovod to use data parallelization in another step - -from clr_callback import * -from smiles_regress_transformer_spe_funcs import * from tensorflow.python.client import device_lib -import json #######HyperParamSetting############# -json_file = 'config_st_spe_training.json' +json_file = "config_st_spe_training.json" hyper_params = ParamsJson(json_file) -if hyper_params['general']['use_hvd']==True: +if hyper_params["general"]["use_hvd"] == True: initialize_hvd() -########Create training and validation data##### +########Create training and validation data##### x_train, y_train, x_val, y_val = train_val_data(hyper_params) ######## Build model ############# @@ -44,10 +46,5 @@ train_and_callbacks = TrainingAndCallbacks(hyper_params) history = train_and_callbacks.training( - model, - x_train, - y_train, - (x_val, y_val), - hyper_params - ) - + model, x_train, y_train, (x_val, y_val), hyper_params +) diff --git a/Pilot1/Uno/topN_to_uno.py b/Pilot1/Uno/topN_to_uno.py index 6b6b05b2..448b12d9 100644 --- a/Pilot1/Uno/topN_to_uno.py +++ b/Pilot1/Uno/topN_to_uno.py @@ -1,40 +1,58 @@ import argparse +import json import os import sys import time -import json from collections import OrderedDict def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('--dataframe_from', type=str, 
default='top21_dataframe_8x8.csv', - help='Dataframe file name contains all data points') - parser.add_argument('--plan', type=str, default='plan.json', - help='Plan data file') - parser.add_argument('--node', type=str, default=None, - help='node number to execute') - parser.add_argument('--incremental', action='store_true', - help='True for building dataset incrementally') - parser.add_argument('--fold', type=str, default=None, - help='pre-calculated indexes for cross fold validation') - parser.add_argument('--cell_feature_selection', default=None, - help='Plain text list for cell feature filtering. one item per line') - parser.add_argument('--drug_feature_selection', default=None, - help='Plain text list for drug feature filtering. one item per line') - parser.add_argument('--output', type=str, default='topN.uno.h5', - help='output filename') - parser.add_argument('--show', action='store_true', help='Simply show the plan node') - parser.add_argument('--raw', action='store_true', help='With --show, also show raw JSON') - parser.add_argument('--convert', help='Convert JSON to text format') + parser.add_argument( + "--dataframe_from", + type=str, + default="top21_dataframe_8x8.csv", + help="Dataframe file name contains all data points", + ) + parser.add_argument("--plan", type=str, default="plan.json", help="Plan data file") + parser.add_argument("--node", type=str, default=None, help="node number to execute") + parser.add_argument( + "--incremental", + action="store_true", + help="True for building dataset incrementally", + ) + parser.add_argument( + "--fold", + type=str, + default=None, + help="pre-calculated indexes for cross fold validation", + ) + parser.add_argument( + "--cell_feature_selection", + default=None, + help="Plain text list for cell feature filtering. one item per line", + ) + parser.add_argument( + "--drug_feature_selection", + default=None, + help="Plain text list for drug feature filtering. one item per line", + ) + parser.add_argument( + "--output", type=str, default="topN.uno.h5", help="output filename" + ) + parser.add_argument("--show", action="store_true", help="Simply show the plan node") + parser.add_argument( + "--raw", action="store_true", help="With --show, also show raw JSON" + ) + parser.add_argument("--convert", help="Convert JSON to text format") args, unparsed = parser.parse_known_args() return args, unparsed + def read_plan(filename, node): - print("read_plan(): reading file {} for node {}" - .format(filename, node)) - with open(filename, 'r') as plan_file: + print("read_plan(): reading file {} for node {}".format(filename, node)) + with open(filename, "r") as plan_file: plan = json.load(plan_file) if node is None: result = plan @@ -49,57 +67,72 @@ def read_plan(filename, node): class topN_NoDataException(Exception): pass + def build_masks(args, df): if args.node is None: - print('node is None. Generate Random split') + print("node is None. Generate Random split") mask = get_random_mask(df) return mask, ~mask - print('from new build_mask: {} {} {}'.format(args.plan, args.node, args.incremental)) + print( + "from new build_mask: {} {} {}".format(args.plan, args.node, args.incremental) + ) import plangen + plan = read_plan(args.plan, None) ids = {} mask = {} - _, _, ids['train'], ids['val'] = plangen.get_subplan_features(plan, args.node, args.incremental) - - for partition in ['train', 'val']: - _mask = df['Sample'] == None # noqa Should keep == operator here. This is a pandas operation. 
- for i in range(len(ids[partition]['CELL'])): - if 'CELL' in ids[partition] and 'DRUG' in ids[partition]: - cl_filter = ids[partition]['CELL'][i] - dr_filter = ids[partition]['DRUG'][i] - __mask = df['Sample'].isin(cl_filter) & df['DRUG1'].isin(dr_filter) - elif 'CELL' in ids[partition]: - cl_filter = ids[partition]['CELL'][i] - __mask = df['Sample'].isin(cl_filter) - elif 'DRUG' in ids[partition]: - dr_filter = ids[partition]['DRUG'][i] - __mask = df['Drug1'].isin(dr_filter) + _, _, ids["train"], ids["val"] = plangen.get_subplan_features( + plan, args.node, args.incremental + ) + + for partition in ["train", "val"]: + _mask = ( + df["Sample"] == None + ) # noqa Should keep == operator here. This is a pandas operation. + for i in range(len(ids[partition]["CELL"])): + if "CELL" in ids[partition] and "DRUG" in ids[partition]: + cl_filter = ids[partition]["CELL"][i] + dr_filter = ids[partition]["DRUG"][i] + __mask = df["Sample"].isin(cl_filter) & df["DRUG1"].isin(dr_filter) + elif "CELL" in ids[partition]: + cl_filter = ids[partition]["CELL"][i] + __mask = df["Sample"].isin(cl_filter) + elif "DRUG" in ids[partition]: + dr_filter = ids[partition]["DRUG"][i] + __mask = df["Drug1"].isin(dr_filter) _mask = _mask | __mask mask[partition] = _mask - return mask['train'], mask['val'] + return mask["train"], mask["val"] def build_masks_w_holdout(args, df): if args.node is None: - print('node is None. Generate Random split') + print("node is None. Generate Random split") mask = get_random_mask(df) return mask, ~mask - print('from new build_mask: {} {} {}'.format(args.plan, args.node, args.incremental)) + print( + "from new build_mask: {} {} {}".format(args.plan, args.node, args.incremental) + ) import plangen + plan = read_plan(args.plan, args.node) ids = {} mask = {} # Dicts {'CELL': [[CCL_510, CCL_577, ...]]} : - _, _, ids['train'], ids['val'] = plangen.get_subplan_features(plan, args.node, args.incremental) - if ids['train'] == None: + _, _, ids["train"], ids["val"] = plangen.get_subplan_features( + plan, args.node, args.incremental + ) + if ids["train"] == None: print("topN: get_subplan_features() returned None!") raise topN_NoDataException() - print("CELL lines in plan for %s: ids train len: " % args.node + - str(len(ids['train']['CELL'][0]))) + print( + "CELL lines in plan for %s: ids train len: " % args.node + + str(len(ids["train"]["CELL"][0])) + ) # holdout from sklearn.model_selection import ShuffleSplit @@ -109,7 +142,7 @@ def build_masks_w_holdout(args, df): splitter = ShuffleSplit(n_splits=1, test_size=0.1, random_state=123) tr_vl_id, test_id = next(splitter.split(X=idx_vec)) - mask['test'] = df.index.isin(test_id) + mask["test"] = df.index.isin(test_id) print("df.info():") df.info() @@ -125,77 +158,95 @@ def build_masks_w_holdout(args, df): start = time.time() df_new = df.iloc[tr_vl_id, :] # index selects part of matrix stop = time.time() - print("split time: %0.3f" % (stop-start)) + print("split time: %0.3f" % (stop - start)) - for partition in ['train', 'val']: - _mask = df['Sample'] == None # noqa Should keep == operator here. This is a pandas operation. - for i in range(len(ids[partition]['CELL'])): + for partition in ["train", "val"]: + _mask = ( + df["Sample"] == None + ) # noqa Should keep == operator here. This is a pandas operation. 
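# The masking code here starts from an all-False Series (the intentional
# df["Sample"] == None comparison) and ORs in one boolean filter per
# CELL/DRUG list from the plan. A toy illustration of the same pandas
# pattern (column names and values are made up for the sketch):
import pandas as pd

toy = pd.DataFrame({"Sample": ["CCL_1", "CCL_2", "CCL_3"], "Drug1": ["D1", "D2", "D3"]})
mask = toy["Sample"] == None  # noqa: E711 -- elementwise compare, all False
for cells, drugs in [(["CCL_1"], ["D1"]), (["CCL_3"], ["D3"])]:
    mask = mask | (toy["Sample"].isin(cells) & toy["Drug1"].isin(drugs))
train_rows = toy[mask]        # rows matched by any (cells, drugs) pair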
+ for i in range(len(ids[partition]["CELL"])): print("i: %i" % i) - if 'CELL' in ids[partition] and 'drug' in ids[partition]: + if "CELL" in ids[partition] and "drug" in ids[partition]: print("IF CD") - cl_filter = ids[partition]['CELL'][i] - dr_filter = ids[partition]['drug'][i] - __mask = df_new['Sample'].isin(cl_filter) & \ - df_new['Drug1'].isin(dr_filter) + cl_filter = ids[partition]["CELL"][i] + dr_filter = ids[partition]["drug"][i] + __mask = df_new["Sample"].isin(cl_filter) & df_new["Drug1"].isin( + dr_filter + ) - elif 'CELL' in ids[partition]: + elif "CELL" in ids[partition]: print("IF C.") - cl_filter = ids[partition]['CELL'][i] - __mask = df_new['Sample'].isin(cl_filter) - elif 'drug' in ids[partition]: + cl_filter = ids[partition]["CELL"][i] + __mask = df_new["Sample"].isin(cl_filter) + elif "drug" in ids[partition]: print("IF D.") - dr_filter = ids[partition]['drug'][i] - __mask = df_new['Drug1'].isin(dr_filter) + dr_filter = ids[partition]["drug"][i] + __mask = df_new["Drug1"].isin(dr_filter) _mask = _mask | __mask mask[partition] = _mask - return mask['train'], mask['val'], mask['test'] + return mask["train"], mask["val"], mask["test"] def get_random_mask(df): import numpy as np + return np.random.rand(len(df)) < 0.8 def read_dataframe(args): - print("in read_dataframe") ; sys.stdout.flush() + print("in read_dataframe") + sys.stdout.flush() import pandas as pd _, ext = os.path.splitext(args.dataframe_from) - if ext == '.h5' or ext == '.hdf5': + if ext == ".h5" or ext == ".hdf5": print("HDFStore r " + str(args.dataframe_from)) - store = pd.HDFStore(args.dataframe_from, 'r') - print("HDFStore opened") ; sys.stdout.flush() - df = store.get('df') - print("HDFStore got df") ; sys.stdout.flush() + store = pd.HDFStore(args.dataframe_from, "r") + print("HDFStore opened") + sys.stdout.flush() + df = store.get("df") + print("HDFStore got df") + sys.stdout.flush() store.close() - print("HDFStore closed") ; sys.stdout.flush() - elif ext == '.feather': + print("HDFStore closed") + sys.stdout.flush() + elif ext == ".feather": print("read feather " + str(args.dataframe_from)) df = pd.read_feather(args.dataframe_from).fillna(0) print("read feather ok." 
+ str(args.dataframe_from)) sys.stdout.flush() - elif ext == '.parquet': + elif ext == ".parquet": df = pd.read_parquet(args.dataframe_from).fillna(0) else: - df = pd.read_csv(args.dataframe_from, low_memory=False, na_values='na').fillna(0) + df = pd.read_csv(args.dataframe_from, low_memory=False, na_values="na").fillna( + 0 + ) - df.rename(columns={'CELL': 'Sample', 'DRUG': 'Drug1'}, inplace=True) - df_y = df[['AUC', 'Sample', 'Drug1']] + df.rename(columns={"CELL": "Sample", "DRUG": "Drug1"}, inplace=True) + df_y = df[["AUC", "Sample", "Drug1"]] cols = df.columns.to_list() - cl_columns = list(filter(lambda x: x.startswith('GE_'), cols)) - dd_columns = list(filter(lambda x: x.startswith('DD_'), cols)) + cl_columns = list(filter(lambda x: x.startswith("GE_"), cols)) + dd_columns = list(filter(lambda x: x.startswith("DD_"), cols)) print("args.cell_feature_selection: " + str(args.cell_feature_selection)) sys.stdout.flush() if args.cell_feature_selection is not None: - features = set(pd.read_csv(args.cell_feature_selection, skip_blank_lines=True, header=None)[0].to_list()) + features = set( + pd.read_csv( + args.cell_feature_selection, skip_blank_lines=True, header=None + )[0].to_list() + ) cl_columns = list(filter(lambda x: x in features, cl_columns)) print("args.drug_feature_selection: " + str(args.drug_feature_selection)) if args.drug_feature_selection is not None: - features = set(pd.read_csv(args.drug_feature_selection, skip_blank_lines=True, header=None)[0].to_list()) + features = set( + pd.read_csv( + args.drug_feature_selection, skip_blank_lines=True, header=None + )[0].to_list() + ) dd_columns = list(filter(lambda x: x in features, dd_columns)) df_cl = df.loc[:, cl_columns] @@ -205,15 +256,19 @@ def read_dataframe(args): def build_dataframe(args): - print("read_dataframe") ; sys.stdout.flush() + print("read_dataframe") + sys.stdout.flush() import pandas as pd + df_y, df_cl, df_dd = read_dataframe(args) - print("read_dataframe OK") ; sys.stdout.flush() + print("read_dataframe OK") + sys.stdout.flush() - print("args.fold " + str(args.fold)) ; sys.stdout.flush() + print("args.fold " + str(args.fold)) + sys.stdout.flush() if args.fold is not None: - tr_id = pd.read_csv('{}_tr_id.csv'.format(args.fold)) - vl_id = pd.read_csv('{}_vl_id.csv'.format(args.fold)) + tr_id = pd.read_csv("{}_tr_id.csv".format(args.fold)) + vl_id = pd.read_csv("{}_vl_id.csv".format(args.fold)) tr_idx = tr_id.iloc[:, 0].dropna().values.astype(int).tolist() vl_idx = vl_id.iloc[:, 0].dropna().values.astype(int).tolist() tr_vl_idx = tr_idx + vl_idx @@ -224,15 +279,15 @@ def build_dataframe(args): x_train_0 = df_cl.iloc[tr_idx, :].reset_index(drop=True) x_train_1 = df_dd.iloc[tr_idx, :].reset_index(drop=True) - x_train_1.columns = [''] * len(x_train_1.columns) + x_train_1.columns = [""] * len(x_train_1.columns) x_val_0 = df_cl.iloc[vl_idx, :].reset_index(drop=True) x_val_1 = df_dd.iloc[vl_idx, :].reset_index(drop=True) - x_val_1.columns = [''] * len(x_val_1.columns) + x_val_1.columns = [""] * len(x_val_1.columns) x_test_0 = df_cl.iloc[~df_cl.index.isin(tr_vl_idx), :].reset_index(drop=True) x_test_1 = df_dd.iloc[~df_dd.index.isin(tr_vl_idx), :].reset_index(drop=True) - x_test_1.columns = [''] * len(x_val_1.columns) + x_test_1.columns = [""] * len(x_val_1.columns) else: # args.fold is None # train_mask, val_mask = build_masks(args, df_y) train_mask, val_mask, test_mask = build_masks_w_holdout(args, df_y) @@ -244,45 +299,51 @@ def build_dataframe(args): x_train_0 = df_cl[train_mask].reset_index(drop=True) x_train_1 = 
df_dd[train_mask].reset_index(drop=True) - x_train_1.columns = [''] * len(x_train_1.columns) + x_train_1.columns = [""] * len(x_train_1.columns) x_val_0 = df_cl[val_mask].reset_index(drop=True) x_val_1 = df_dd[val_mask].reset_index(drop=True) - x_val_1.columns = [''] * len(x_val_1.columns) + x_val_1.columns = [""] * len(x_val_1.columns) x_test_0 = df_cl[test_mask].reset_index(drop=True) x_test_1 = df_dd[test_mask].reset_index(drop=True) - x_test_1.columns = [''] * len(x_test_1.columns) + x_test_1.columns = [""] * len(x_test_1.columns) # store import os.path + output = os.path.realpath(args.output) - print("topN HDFStore w " + output) ; sys.stdout.flush() - store = pd.HDFStore(output, 'w') # , complevel=9, complib='blosc:snappy') - store.put('y_train', y_train, format='table') - store.put('y_val', y_val, format='table') + print("topN HDFStore w " + output) + sys.stdout.flush() + store = pd.HDFStore(output, "w") # , complevel=9, complib='blosc:snappy') + store.put("y_train", y_train, format="table") + store.put("y_val", y_val, format="table") print("DF: x_train_0") x_train_0.info() sys.stdout.flush() - store.put('x_train_0', x_train_0, format='table') + store.put("x_train_0", x_train_0, format="table") print("DF: x_train_0 done.") sys.stdout.flush() - store.put('x_train_1', x_train_1, format='table') - store.put('x_val_0', x_val_0, format='table') - store.put('x_val_1', x_val_1, format='table') + store.put("x_train_1", x_train_1, format="table") + store.put("x_val_0", x_val_0, format="table") + store.put("x_val_1", x_val_1, format="table") # keep input feature list and shape cl_width = len(df_cl.columns) dd_width = len(df_dd.columns) - store.put('model', pd.DataFrame()) - store.get_storer('model').attrs.input_features = OrderedDict([('cell.rnaseq', 'cell.rnaseq'), ('drug1.descriptors', 'drug.descriptors')]) - store.get_storer('model').attrs.feature_shapes = OrderedDict([('cell.rnaseq', (cl_width,)), ('drug.descriptors', (dd_width,))]) + store.put("model", pd.DataFrame()) + store.get_storer("model").attrs.input_features = OrderedDict( + [("cell.rnaseq", "cell.rnaseq"), ("drug1.descriptors", "drug.descriptors")] + ) + store.get_storer("model").attrs.feature_shapes = OrderedDict( + [("cell.rnaseq", (cl_width,)), ("drug.descriptors", (dd_width,))] + ) if y_test is not None: - store.put('y_test', y_test, format='table') - store.put('x_test_0', x_test_0, format='table') - store.put('x_test_1', x_test_1, format='table') + store.put("y_test", y_test, format="table") + store.put("x_test_0", x_test_0, format="table") + store.put("x_test_1", x_test_1, format="table") print("topN HDFStore close " + output) store.close() @@ -334,9 +395,10 @@ def show_node(subtree, fp_out): if key not in dataset: continue partition_keys = dataset[key] - print("%s %ss: count: %i" % - (partition, key, len(partition_keys)), - file=fp_out) + print( + "%s %ss: count: %i" % (partition, key, len(partition_keys)), + file=fp_out, + ) show_list(partition_keys, fp_out) @@ -384,7 +446,7 @@ def convert(args): fp_out.close() -if __name__ == '__main__': +if __name__ == "__main__": parsed, unparsed = parse_arguments() if parsed.show: show(parsed, sys.stdout) diff --git a/Pilot1/Uno/uno_baseline_keras2.py b/Pilot1/Uno/uno_baseline_keras2.py index b7d3d09f..ce0d2970 100644 --- a/Pilot1/Uno/uno_baseline_keras2.py +++ b/Pilot1/Uno/uno_baseline_keras2.py @@ -6,63 +6,65 @@ import os import time +import candle import numpy as np import pandas as pd - import tensorflow as tf +import uno as benchmark +import uno_data +from scipy.stats.stats 
import pearsonr +from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score from tensorflow import keras from tensorflow.keras import backend as K from tensorflow.keras import optimizers +from tensorflow.keras.callbacks import ( + Callback, + LearningRateScheduler, + ModelCheckpoint, + ReduceLROnPlateau, + TensorBoard, +) +from tensorflow.keras.layers import Dense, Dropout, Input from tensorflow.keras.models import Model -from tensorflow.keras.layers import Input, Dense, Dropout -from tensorflow.keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler, TensorBoard -from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error -from scipy.stats.stats import pearsonr - -import uno as benchmark -import candle - -import uno_data -from uno_data import CombinedDataLoader, CombinedDataGenerator, DataFeeder - +from uno_data import CombinedDataGenerator, CombinedDataLoader, DataFeeder logger = logging.getLogger(__name__) -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" tf.compat.v1.disable_eager_execution() def extension_from_parameters(args): """Construct string for saving model with annotation of parameters""" - ext = '' - ext += '.A={}'.format(args.activation) - ext += '.B={}'.format(args.batch_size) - ext += '.E={}'.format(args.epochs) - ext += '.O={}'.format(args.optimizer) + ext = "" + ext += ".A={}".format(args.activation) + ext += ".B={}".format(args.batch_size) + ext += ".E={}".format(args.epochs) + ext += ".O={}".format(args.optimizer) # ext += '.LEN={}'.format(args.maxlen) - ext += '.LR={}'.format(args.learning_rate) - ext += '.CF={}'.format(''.join([x[0] for x in sorted(args.cell_features)])) - ext += '.DF={}'.format(''.join([x[0] for x in sorted(args.drug_features)])) + ext += ".LR={}".format(args.learning_rate) + ext += ".CF={}".format("".join([x[0] for x in sorted(args.cell_features)])) + ext += ".DF={}".format("".join([x[0] for x in sorted(args.drug_features)])) if args.feature_subsample > 0: - ext += '.FS={}'.format(args.feature_subsample) + ext += ".FS={}".format(args.feature_subsample) if args.dropout > 0: - ext += '.DR={}'.format(args.dropout) + ext += ".DR={}".format(args.dropout) if args.warmup_lr: - ext += '.wu_lr' + ext += ".wu_lr" if args.reduce_lr: - ext += '.re_lr' + ext += ".re_lr" if args.residual: - ext += '.res' + ext += ".res" if args.use_landmark_genes: - ext += '.L1000' + ext += ".L1000" if args.no_gen: - ext += '.ng' + ext += ".ng" for i, n in enumerate(args.dense): if n > 0: - ext += '.D{}={}'.format(i + 1, n) + ext += ".D{}={}".format(i + 1, n) if args.dense_feature_layers != args.dense: for i, n in enumerate(args.dense): if n > 0: - ext += '.FD{}={}'.format(i + 1, n) + ext += ".FD{}={}".format(i + 1, n) return ext @@ -72,13 +74,13 @@ def evaluate_prediction(y_true, y_pred): mae = mean_absolute_error(y_true, y_pred) r2 = r2_score(y_true, y_pred) corr, _ = pearsonr(y_true, y_pred) - return {'mse': mse, 'mae': mae, 'r2': r2, 'corr': corr} + return {"mse": mse, "mae": mae, "r2": r2, "corr": corr} -def log_evaluation(metric_outputs, logger, description='Comparing y_true and y_pred:'): +def log_evaluation(metric_outputs, logger, description="Comparing y_true and y_pred:"): logger.info(description) for metric, value in metric_outputs.items(): - logger.info(' {}: {:.8f}'.format(metric, value)) + logger.info(" {}: {:.8f}".format(metric, value)) class LoggingCallback(Callback): @@ -87,7 +89,10 @@ def __init__(self, print_fcn=print): self.print_fcn = print_fcn def 
on_epoch_end(self, epoch, logs={}): - msg = "[Epoch: %i] %s" % (epoch, ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items()))) + msg = "[Epoch: %i] %s" % ( + epoch, + ", ".join("%s: %f" % (k, v) for k, v in sorted(logs.items())), + ) self.print_fcn(msg) @@ -97,14 +102,13 @@ def __init__(self, rate, **kwargs): self.uses_learning_phase = False def call(self, x, mask=None): - if 0. < self.rate < 1.: + if 0.0 < self.rate < 1.0: noise_shape = self._get_noise_shape(x) x = K.dropout(x, self.rate, noise_shape) return x class MultiGPUCheckpoint(ModelCheckpoint): - def set_model(self, model): if isinstance(model.layers[-2], Model): self.model = model.layers[-2] @@ -112,15 +116,23 @@ def set_model(self, model): self.model = model -def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], - kernel_initializer='glorot_normal', - activation='relu', residual=False, - dropout_rate=0, permanent_dropout=True): +def build_feature_model( + input_shape, + name="", + dense_layers=[1000, 1000], + kernel_initializer="glorot_normal", + activation="relu", + residual=False, + dropout_rate=0, + permanent_dropout=True, +): x_input = Input(shape=input_shape) h = x_input for i, layer in enumerate(dense_layers): x = h - h = Dense(layer, activation=activation, kernel_initializer=kernel_initializer)(h) + h = Dense(layer, activation=activation, kernel_initializer=kernel_initializer)( + h + ) if dropout_rate > 0: if permanent_dropout: h = PermanentDropout(dropout_rate)(h) @@ -136,7 +148,6 @@ def build_feature_model(input_shape, name='', dense_layers=[1000, 1000], class SimpleWeightSaver(Callback): - def __init__(self, fname): self.fname = fname @@ -154,25 +165,35 @@ def build_model(loader, args, permanent_dropout=True, silent=False): input_models = {} dropout_rate = args.dropout - initializer = 'glorot_normal' if hasattr(args, 'initialization') is False else args.initialization - kernel_initializer = candle.build_initializer(initializer, candle.keras_default_config(), args.rng_seed) + initializer = ( + "glorot_normal" + if hasattr(args, "initialization") is False + else args.initialization + ) + kernel_initializer = candle.build_initializer( + initializer, candle.keras_default_config(), args.rng_seed + ) for fea_type, shape in loader.feature_shapes.items(): - base_type = fea_type.split('.')[0] - if base_type in ['cell', 'drug']: - if args.dense_cell_feature_layers is not None and base_type == 'cell': + base_type = fea_type.split(".")[0] + if base_type in ["cell", "drug"]: + if args.dense_cell_feature_layers is not None and base_type == "cell": dense_feature_layers = args.dense_cell_feature_layers - elif args.dense_drug_feature_layers is not None and base_type == 'drug': + elif args.dense_drug_feature_layers is not None and base_type == "drug": dense_feature_layers = args.dense_drug_feature_layers else: dense_feature_layers = args.dense_feature_layers - box = build_feature_model(input_shape=shape, name=fea_type, - dense_layers=dense_feature_layers, - kernel_initializer=kernel_initializer, - dropout_rate=dropout_rate, permanent_dropout=permanent_dropout) + box = build_feature_model( + input_shape=shape, + name=fea_type, + dense_layers=dense_feature_layers, + kernel_initializer=kernel_initializer, + dropout_rate=dropout_rate, + permanent_dropout=permanent_dropout, + ) if not silent: - logger.debug('Feature encoding submodel for %s:', fea_type) + logger.debug("Feature encoding submodel for %s:", fea_type) box.summary(print_fn=logger.debug) input_models[fea_type] = box @@ -180,7 +201,7 @@ def 
build_model(loader, args, permanent_dropout=True, silent=False): encoded_inputs = [] for fea_name, fea_type in loader.input_features.items(): shape = loader.feature_shapes[fea_type] - fea_input = Input(shape, name='input.' + fea_name) + fea_input = Input(shape, name="input." + fea_name) inputs.append(fea_input) if fea_type in input_models: input_model = input_models[fea_type] @@ -194,7 +215,9 @@ def build_model(loader, args, permanent_dropout=True, silent=False): h = merged for i, layer in enumerate(args.dense): x = h - h = Dense(layer, activation=args.activation, kernel_initializer=kernel_initializer)(h) + h = Dense( + layer, activation=args.activation, kernel_initializer=kernel_initializer + )(h) if dropout_rate > 0: if permanent_dropout: h = PermanentDropout(dropout_rate)(h) @@ -210,11 +233,16 @@ def build_model(loader, args, permanent_dropout=True, silent=False): return Model(inputs, output) -def initialize_parameters(default_model='uno_default_model.txt'): +def initialize_parameters(default_model="uno_default_model.txt"): # Build benchmark object - unoBmk = benchmark.BenchmarkUno(benchmark.file_path, default_model, 'keras', - prog='uno_baseline', desc='Build neural network based models to predict tumor response to single and paired drugs.') + unoBmk = benchmark.BenchmarkUno( + benchmark.file_path, + default_model, + "keras", + prog="uno_baseline", + desc="Build neural network based models to predict tumor response to single and paired drugs.", + ) # Initialize parameters gParameters = candle.finalize_parameters(unoBmk) @@ -230,11 +258,16 @@ def run(params): candle.verify_path(args.save_path) # prefix = args.save_path + ext logfile = args.logfile if args.logfile else "save/python.log" - candle.set_up_logger(logfile, logger, args.verbose, - fmt_line="%(asctime)s %(message)s",) + candle.set_up_logger( + logfile, + logger, + args.verbose, + fmt_line="%(asctime)s %(message)s", + ) logger.info("UNO RUN ...") import tensorflow as tf + # from tensorflow.python.client import device_lib print("TF version: " + tf.__version__) # gpus = tf.config.list_physical_devices('GPU') @@ -242,10 +275,11 @@ def run(params): # tf.config.set_visible_devices(gpus[0], 'GPU') # print(device_lib.list_local_devices()) - logger.info('Params: {}'.format(params)) + logger.info("Params: {}".format(params)) - if (len(args.gpus) > 0): + if len(args.gpus) > 0: import tensorflow as tf + config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = ",".join(map(str, args.gpus)) @@ -260,120 +294,171 @@ def run(params): datafile = args.use_exported_data loader = CombinedDataLoader(seed=args.rng_seed) - loader.load(cache=args.cache, - ncols=args.feature_subsample, - agg_dose=args.agg_dose, - cell_features=args.cell_features, - drug_features=args.drug_features, - drug_median_response_min=args.drug_median_response_min, - drug_median_response_max=args.drug_median_response_max, - use_landmark_genes=args.use_landmark_genes, - use_filtered_genes=args.use_filtered_genes, - cell_feature_subset_path=args.cell_feature_subset_path or args.feature_subset_path, - drug_feature_subset_path=args.drug_feature_subset_path or args.feature_subset_path, - preprocess_rnaseq=args.preprocess_rnaseq, - single=args.single, - train_sources=args.train_sources, - test_sources=args.test_sources, - embed_feature_source=not args.no_feature_source, - encode_response_source=not args.no_response_source, - use_exported_data=args.use_exported_data, - ) - - target = args.agg_dose or 'Growth' + loader.load( + 
cache=args.cache, + ncols=args.feature_subsample, + agg_dose=args.agg_dose, + cell_features=args.cell_features, + drug_features=args.drug_features, + drug_median_response_min=args.drug_median_response_min, + drug_median_response_max=args.drug_median_response_max, + use_landmark_genes=args.use_landmark_genes, + use_filtered_genes=args.use_filtered_genes, + cell_feature_subset_path=args.cell_feature_subset_path + or args.feature_subset_path, + drug_feature_subset_path=args.drug_feature_subset_path + or args.feature_subset_path, + preprocess_rnaseq=args.preprocess_rnaseq, + single=args.single, + train_sources=args.train_sources, + test_sources=args.test_sources, + embed_feature_source=not args.no_feature_source, + encode_response_source=not args.no_response_source, + use_exported_data=args.use_exported_data, + ) + + target = args.agg_dose or "Growth" val_split = args.val_split train_split = 1 - val_split if args.export_csv: fname = args.export_csv - loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, - cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, - cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) - train_gen = CombinedDataGenerator(loader, batch_size=args.batch_size, shuffle=args.shuffle) - val_gen = CombinedDataGenerator(loader, partition='val', batch_size=args.batch_size, shuffle=args.shuffle) - - x_train_list, y_train = train_gen.get_slice(size=train_gen.size, dataframe=True, single=args.single) - x_val_list, y_val = val_gen.get_slice(size=val_gen.size, dataframe=True, single=args.single) + loader.partition_data( + cv_folds=args.cv, + train_split=train_split, + val_split=val_split, + cell_types=args.cell_types, + by_cell=args.by_cell, + by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, + drug_subset_path=args.drug_subset_path, + ) + train_gen = CombinedDataGenerator( + loader, batch_size=args.batch_size, shuffle=args.shuffle + ) + val_gen = CombinedDataGenerator( + loader, partition="val", batch_size=args.batch_size, shuffle=args.shuffle + ) + + x_train_list, y_train = train_gen.get_slice( + size=train_gen.size, dataframe=True, single=args.single + ) + x_val_list, y_val = val_gen.get_slice( + size=val_gen.size, dataframe=True, single=args.single + ) df_train = pd.concat([y_train] + x_train_list, axis=1) df_val = pd.concat([y_val] + x_val_list, axis=1) df = pd.concat([df_train, df_val]).reset_index(drop=True) if args.growth_bins > 1: - df = uno_data.discretize(df, 'Growth', bins=args.growth_bins) - df.to_csv(fname, sep='\t', index=False, float_format="%.3g") + df = uno_data.discretize(df, "Growth", bins=args.growth_bins) + df.to_csv(fname, sep="\t", index=False, float_format="%.3g") return if args.export_data: fname = args.export_data - loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, - cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, - cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) - train_gen = CombinedDataGenerator(loader, batch_size=args.batch_size, shuffle=args.shuffle) - val_gen = CombinedDataGenerator(loader, partition='val', batch_size=args.batch_size, shuffle=args.shuffle) - store = pd.HDFStore(fname, complevel=9, complib='blosc:snappy') - - config_min_itemsize = {'Sample': 30, 'Drug1': 10} + loader.partition_data( + cv_folds=args.cv, + train_split=train_split, + val_split=val_split, + cell_types=args.cell_types, + by_cell=args.by_cell, + by_drug=args.by_drug, + 
cell_subset_path=args.cell_subset_path, + drug_subset_path=args.drug_subset_path, + ) + train_gen = CombinedDataGenerator( + loader, batch_size=args.batch_size, shuffle=args.shuffle + ) + val_gen = CombinedDataGenerator( + loader, partition="val", batch_size=args.batch_size, shuffle=args.shuffle + ) + store = pd.HDFStore(fname, complevel=9, complib="blosc:snappy") + + config_min_itemsize = {"Sample": 30, "Drug1": 10} if not args.single: - config_min_itemsize['Drug2'] = 10 + config_min_itemsize["Drug2"] = 10 - for partition in ['train', 'val']: - gen = train_gen if partition == 'train' else val_gen + for partition in ["train", "val"]: + gen = train_gen if partition == "train" else val_gen for i in range(gen.steps): - x_list, y = gen.get_slice(size=args.batch_size, dataframe=True, single=args.single) + x_list, y = gen.get_slice( + size=args.batch_size, dataframe=True, single=args.single + ) for j, input_feature in enumerate(x_list): - input_feature.columns = [''] * len(input_feature.columns) - store.append('x_{}_{}'.format(partition, j), input_feature.astype('float32'), format='table', data_columns=True) - store.append('y_{}'.format(partition), y.astype({target: 'float32'}), format='table', data_columns=True, - min_itemsize=config_min_itemsize) - logger.info('Generating {} dataset. {} / {}'.format(partition, i, gen.steps)) + input_feature.columns = [""] * len(input_feature.columns) + store.append( + "x_{}_{}".format(partition, j), + input_feature.astype("float32"), + format="table", + data_columns=True, + ) + store.append( + "y_{}".format(partition), + y.astype({target: "float32"}), + format="table", + data_columns=True, + min_itemsize=config_min_itemsize, + ) + logger.info( + "Generating {} dataset. {} / {}".format(partition, i, gen.steps) + ) # save input_features and feature_shapes from loader - store.put('model', pd.DataFrame()) - store.get_storer('model').attrs.input_features = loader.input_features - store.get_storer('model').attrs.feature_shapes = loader.feature_shapes + store.put("model", pd.DataFrame()) + store.get_storer("model").attrs.input_features = loader.input_features + store.get_storer("model").attrs.feature_shapes = loader.feature_shapes store.close() - logger.info('Completed generating {}'.format(fname)) + logger.info("Completed generating {}".format(fname)) return if args.use_exported_data is None: - loader.partition_data(cv_folds=args.cv, train_split=train_split, val_split=val_split, - cell_types=args.cell_types, by_cell=args.by_cell, by_drug=args.by_drug, - cell_subset_path=args.cell_subset_path, drug_subset_path=args.drug_subset_path) + loader.partition_data( + cv_folds=args.cv, + train_split=train_split, + val_split=val_split, + cell_types=args.cell_types, + by_cell=args.by_cell, + by_drug=args.by_drug, + cell_subset_path=args.cell_subset_path, + drug_subset_path=args.drug_subset_path, + ) model = build_model(loader, args) - logger.info('Combined model:') + logger.info("Combined model:") model.summary(print_fn=logger.info) # plot_model(model, to_file=prefix+'.model.png', show_shapes=True) if args.cp: model_json = model.to_json() - with open('model.json', 'w') as f: + with open("model.json", "w") as f: print(model_json, file=f) def warmup_scheduler(epoch): lr = args.learning_rate or base_lr * args.batch_size / 100 if epoch <= 5: K.set_value(model.optimizer.lr, (base_lr * (5 - epoch) + lr * epoch) / 5) - logger.debug('Epoch {}: lr={:.5g}'.format(epoch, K.get_value(model.optimizer.lr))) + logger.debug( + "Epoch {}: lr={:.5g}".format(epoch, 
K.get_value(model.optimizer.lr)) + ) return K.get_value(model.optimizer.lr) df_pred_list = [] - cv_ext = '' + cv_ext = "" cv = args.cv if args.cv > 1 else 1 for fold in range(cv): if args.cv > 1: - logger.info('Cross validation fold {}/{}:'.format(fold + 1, cv)) - cv_ext = '.cv{}'.format(fold + 1) + logger.info("Cross validation fold {}/{}:".format(fold + 1, cv)) + cv_ext = ".cv{}".format(fold + 1) template_model = build_model(loader, args, silent=True) initial_epoch = 0 logger.info("CKPT CONSTRUCT...") - ckpt = candle.CandleCkptKeras(params, - verbose=True) + ckpt = candle.CandleCkptKeras(params, verbose=True) logger.info("CKPT CONSTRUCT OK.") logger.info("template model: " + str(template_model)) @@ -381,54 +466,60 @@ def warmup_scheduler(epoch): J = ckpt.restart(params) if J is not None: - initial_epoch = J['epoch'] - best_metric_last = J['best_metric_last'] - params['ckpt_best_metric_last'] = best_metric_last - print('initial_epoch: %i' % initial_epoch) + initial_epoch = J["epoch"] + best_metric_last = J["best_metric_last"] + params["ckpt_best_metric_last"] = best_metric_last + print("initial_epoch: %i" % initial_epoch) elif args.initial_weights is not None: - logger.info("Loading initial weights from '{}'" - .format(args.initial_weights)) + logger.info( + "Loading initial weights from '{}'".format(args.initial_weights) + ) start = time.time() template_model.load_weights(args.initial_weights) stop = time.time() duration = stop - start - logger.info("Loaded from initial_weights in %0.3f seconds." % - duration) + logger.info("Loaded from initial_weights in %0.3f seconds." % duration) if len(args.gpus) > 1: from tensorflow.keras.utils import multi_gpu_model + gpu_count = len(args.gpus) logger.info("Multi GPU with {} gpus".format(gpu_count)) model = multi_gpu_model(template_model, cpu_merge=False, gpus=gpu_count) else: model = template_model - optimizer = optimizers.deserialize({'class_name': args.optimizer, 'config': {}}) + optimizer = optimizers.deserialize({"class_name": args.optimizer, "config": {}}) base_lr = args.base_lr or K.get_value(optimizer.lr) if args.learning_rate: K.set_value(optimizer.lr, args.learning_rate) logger.info("COMPILE") - model.compile(loss=args.loss, optimizer=optimizer, metrics=[candle.mae, candle.r2]) + model.compile( + loss=args.loss, optimizer=optimizer, metrics=[candle.mae, candle.r2] + ) # calculate trainable and non-trainable params params.update(candle.compute_trainable_params(model)) candle_monitor = candle.CandleRemoteMonitor(params=params) - timeout_monitor = candle.TerminateOnTimeOut(params['timeout']) + timeout_monitor = candle.TerminateOnTimeOut(params["timeout"]) patience = 10 - if 'patience' in params: - patience = int(params['patience']) - logger.info('setting patience: %i' % patience) - es_monitor = keras.callbacks.EarlyStopping(patience=patience, - verbose=1) - - reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) + if "patience" in params: + patience = int(params["patience"]) + logger.info("setting patience: %i" % patience) + es_monitor = keras.callbacks.EarlyStopping(patience=patience, verbose=1) + + reduce_lr = ReduceLROnPlateau( + monitor="val_loss", factor=0.5, patience=5, min_lr=0.00001 + ) warmup_lr = LearningRateScheduler(warmup_scheduler) # prefix + cv_ext + '. 
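# warmup_scheduler() above linearly ramps the learning rate from the
# optimizer's base_lr at epoch 0 to the target lr at epoch 5, and leaves the
# value alone afterwards. A standalone illustration of the same ramp
# (the base_lr/lr values below are made up for the sketch):
base_lr_demo, lr_demo = 1e-4, 1e-3
for epoch in range(7):
    warm = (base_lr_demo * (5 - epoch) + lr_demo * epoch) / 5 if epoch <= 5 else lr_demo
    print(f"epoch {epoch}: lr={warm:.5g}")   # 1e-4 at epoch 0 rising to 1e-3 by epoch 5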
- checkpointer = MultiGPUCheckpoint('model.h5', save_best_only=True) - tensorboard = TensorBoard(log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext)) + checkpointer = MultiGPUCheckpoint("model.h5", save_best_only=True) + tensorboard = TensorBoard( + log_dir="tb/{}{}{}".format(args.tb_prefix, ext, cv_ext) + ) history_logger = LoggingCallback(logger.debug) callbacks = [candle_monitor, timeout_monitor, history_logger, ckpt] @@ -447,56 +538,119 @@ def warmup_scheduler(epoch): callbacks.append(MultiGPUCheckpoint(args.save_weights)) if args.use_exported_data is not None: - train_gen = DataFeeder(filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose, on_memory=args.on_memory_loader) - val_gen = DataFeeder(partition='val', filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose, on_memory=args.on_memory_loader) - test_gen = DataFeeder(partition='test', filename=args.use_exported_data, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single, agg_dose=args.agg_dose, on_memory=args.on_memory_loader) + train_gen = DataFeeder( + filename=args.use_exported_data, + batch_size=args.batch_size, + shuffle=args.shuffle, + single=args.single, + agg_dose=args.agg_dose, + on_memory=args.on_memory_loader, + ) + val_gen = DataFeeder( + partition="val", + filename=args.use_exported_data, + batch_size=args.batch_size, + shuffle=args.shuffle, + single=args.single, + agg_dose=args.agg_dose, + on_memory=args.on_memory_loader, + ) + test_gen = DataFeeder( + partition="test", + filename=args.use_exported_data, + batch_size=args.batch_size, + shuffle=args.shuffle, + single=args.single, + agg_dose=args.agg_dose, + on_memory=args.on_memory_loader, + ) else: - train_gen = CombinedDataGenerator(loader, fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) - val_gen = CombinedDataGenerator(loader, partition='val', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) - test_gen = CombinedDataGenerator(loader, partition='test', fold=fold, batch_size=args.batch_size, shuffle=args.shuffle, single=args.single) + train_gen = CombinedDataGenerator( + loader, + fold=fold, + batch_size=args.batch_size, + shuffle=args.shuffle, + single=args.single, + ) + val_gen = CombinedDataGenerator( + loader, + partition="val", + fold=fold, + batch_size=args.batch_size, + shuffle=args.shuffle, + single=args.single, + ) + test_gen = CombinedDataGenerator( + loader, + partition="test", + fold=fold, + batch_size=args.batch_size, + shuffle=args.shuffle, + single=args.single, + ) df_val = val_gen.get_response(copy=True) y_val = df_val[target].values y_shuf = np.random.permutation(y_val) - log_evaluation(evaluate_prediction(y_val, y_shuf), logger, - description='Between random pairs in y_val:') + log_evaluation( + evaluate_prediction(y_val, y_shuf), + logger, + description="Between random pairs in y_val:", + ) if args.no_gen: - x_train_list, y_train = train_gen.get_slice(size=train_gen.size, single=args.single) + x_train_list, y_train = train_gen.get_slice( + size=train_gen.size, single=args.single + ) x_val_list, y_val = val_gen.get_slice(size=val_gen.size, single=args.single) - history = model.fit(x_train_list, y_train, - batch_size=args.batch_size, - epochs=args.epochs, - initial_epoch=initial_epoch, - callbacks=callbacks, - validation_data=(x_val_list, y_val)) + history = model.fit( + x_train_list, + y_train, + batch_size=args.batch_size, + 
epochs=args.epochs, + initial_epoch=initial_epoch, + callbacks=callbacks, + validation_data=(x_val_list, y_val), + ) else: - logger.info('Data points per epoch: train = %d, val = %d, test = %d', train_gen.size, val_gen.size, test_gen.size) - logger.info('Steps per epoch: train = %d, val = %d, test = %d', train_gen.steps, val_gen.steps, test_gen.steps) - history = model.fit(train_gen, - epochs=args.epochs, - initial_epoch=initial_epoch, - callbacks=callbacks, - validation_data=val_gen) + logger.info( + "Data points per epoch: train = %d, val = %d, test = %d", + train_gen.size, + val_gen.size, + test_gen.size, + ) + logger.info( + "Steps per epoch: train = %d, val = %d, test = %d", + train_gen.steps, + val_gen.steps, + test_gen.steps, + ) + history = model.fit( + train_gen, + epochs=args.epochs, + initial_epoch=initial_epoch, + callbacks=callbacks, + validation_data=val_gen, + ) # prediction on holdout(test) when exists or use validation set if test_gen.size > 0: df_val = test_gen.get_response(copy=True) y_val = df_val[target].values y_val_pred = model.predict(test_gen, steps=test_gen.steps + 1) - y_val_pred = y_val_pred[:test_gen.size] + y_val_pred = y_val_pred[: test_gen.size] else: if args.no_gen: y_val_pred = model.predict(x_val_list, batch_size=args.batch_size) else: val_gen.reset() y_val_pred = model.predict(val_gen, steps=val_gen.steps + 1) - y_val_pred = y_val_pred[:val_gen.size] + y_val_pred = y_val_pred[: val_gen.size] y_val_pred = y_val_pred.flatten() - if 'loss' in history.history: - history_length = len(history.history['loss']) + if "loss" in history.history: + history_length = len(history.history["loss"]) logger.info("history_length: %i" % history_length) history_expected = args.epochs - initial_epoch if history_length == history_expected: @@ -512,48 +666,68 @@ def warmup_scheduler(epoch): log_evaluation(scores, logger) # df_val = df_val.assign(PredictedGrowth=y_val_pred, GrowthError=y_val_pred - y_val) - df_val['Predicted' + target] = y_val_pred - df_val[target + 'Error'] = y_val_pred - y_val + df_val["Predicted" + target] = y_val_pred + df_val[target + "Error"] = y_val_pred - y_val df_pred_list.append(df_val) - candle.plot_metrics(history, title=None, skip_ep=0, outdir=os.path.dirname(args.save_path), add_lr=True) + candle.plot_metrics( + history, + title=None, + skip_ep=0, + outdir=os.path.dirname(args.save_path), + add_lr=True, + ) - pred_fname = 'predicted.tsv' # prefix + + pred_fname = "predicted.tsv" # prefix + df_pred = pd.concat(df_pred_list) if args.agg_dose: if args.single: - df_pred.sort_values(['Sample', 'Drug1', target], inplace=True) + df_pred.sort_values(["Sample", "Drug1", target], inplace=True) else: - df_pred.sort_values(['Source', 'Sample', 'Drug1', 'Drug2', target], inplace=True) + df_pred.sort_values( + ["Source", "Sample", "Drug1", "Drug2", target], inplace=True + ) else: if args.single: - df_pred.sort_values(['Sample', 'Drug1', 'Dose1', 'Growth'], inplace=True) + df_pred.sort_values(["Sample", "Drug1", "Dose1", "Growth"], inplace=True) else: - df_pred.sort_values(['Sample', 'Drug1', 'Drug2', 'Dose1', 'Dose2', 'Growth'], inplace=True) - df_pred.to_csv(pred_fname, sep='\t', index=False, float_format='%.4g') + df_pred.sort_values( + ["Sample", "Drug1", "Drug2", "Dose1", "Dose2", "Growth"], inplace=True + ) + df_pred.to_csv(pred_fname, sep="\t", index=False, float_format="%.4g") if args.cv > 1: - scores = evaluate_prediction(df_pred[target], df_pred['Predicted' + target]) - log_evaluation(scores, logger, description='Combining cross validation folds:') + 
scores = evaluate_prediction(df_pred[target], df_pred["Predicted" + target]) + log_evaluation(scores, logger, description="Combining cross validation folds:") for test_source in loader.test_sep_sources: - test_gen = CombinedDataGenerator(loader, partition='test', batch_size=args.batch_size, source=test_source) + test_gen = CombinedDataGenerator( + loader, partition="test", batch_size=args.batch_size, source=test_source + ) df_test = test_gen.get_response(copy=True) y_test = df_test[target].values n_test = len(y_test) if n_test == 0: continue if args.no_gen: - x_test_list, y_test = test_gen.get_slice(size=test_gen.size, single=args.single) + x_test_list, y_test = test_gen.get_slice( + size=test_gen.size, single=args.single + ) y_test_pred = model.predict(x_test_list, batch_size=args.batch_size) else: - y_test_pred = model.predict_generator(test_gen.flow(single=args.single), test_gen.steps) - y_test_pred = y_test_pred[:test_gen.size] + y_test_pred = model.predict_generator( + test_gen.flow(single=args.single), test_gen.steps + ) + y_test_pred = y_test_pred[: test_gen.size] y_test_pred = y_test_pred.flatten() scores = evaluate_prediction(y_test, y_test_pred) - log_evaluation(scores, logger, description='Testing on data from {} ({})'.format(test_source, n_test)) + log_evaluation( + scores, + logger, + description="Testing on data from {} ({})".format(test_source, n_test), + ) - if K.backend() == 'tensorflow': + if K.backend() == "tensorflow": K.clear_session() logger.handlers = [] @@ -566,7 +740,7 @@ def main(): run(params) -if __name__ == '__main__': +if __name__ == "__main__": main() - if K.backend() == 'tensorflow': + if K.backend() == "tensorflow": K.clear_session() diff --git a/Pilot3/P3B6/p3b6.py b/Pilot3/P3B6/p3b6.py index c9bda47f..059d08f2 100644 --- a/Pilot3/P3B6/p3b6.py +++ b/Pilot3/P3B6/p3b6.py @@ -16,7 +16,6 @@ {"name": "num_test_samples", "action": "store", "type": int}, {"name": "num_classes", "action": "store", "type": int}, {"name": "eps", "action": "store", "type": float}, - ] required = [ diff --git a/Pilot3/P3B7/p3b7.py b/Pilot3/P3B7/p3b7.py index 51dd7dad..ad503840 100644 --- a/Pilot3/P3B7/p3b7.py +++ b/Pilot3/P3B7/p3b7.py @@ -18,7 +18,6 @@ {"name": "kernel1", "action": "store", "type": int}, {"name": "kernel2", "action": "store", "type": int}, {"name": "kernel3", "action": "store", "type": int}, - ] required = [ diff --git a/examples/ADRP/adrp.py b/examples/ADRP/adrp.py index 758dd96d..f189ade5 100644 --- a/examples/ADRP/adrp.py +++ b/examples/ADRP/adrp.py @@ -18,7 +18,13 @@ additional_definitions = [ {"name": "latent_dim", "action": "store", "type": int, "help": "latent dimensions"}, - {"name": "benchmark_data", "action": "store", "type": candle.str2bool, "default": False, "help": "Use prepared benchmark data"}, + { + "name": "benchmark_data", + "action": "store", + "type": candle.str2bool, + "default": False, + "help": "Use prepared benchmark data", + }, { "name": "residual", "type": candle.str2bool, @@ -231,8 +237,8 @@ def get_model(params): def load_data(params, seed): - if 'benchmark_data' in params and params['benchmark_data'] != "": - if params['train_data'].endswith('.parquet'): + if "benchmark_data" in params and params["benchmark_data"] != "": + if params["train_data"].endswith(".parquet"): header_url = params["header_url"] dh_dict, th_list = load_headers( "descriptor_headers.csv", "training_headers.csv", header_url @@ -243,9 +249,15 @@ def load_data(params, seed): url = params["data_url"] # file_train = params["train_data"] - train_file = 
candle.get_file(params['train_data'], url + params['train_data'], cache_subdir="Pilot1") - test_file = candle.get_file(params['test_data'], url + params['test_data'], cache_subdir="Pilot1") - val_file = candle.get_file(params['val_data'], url + params['val_data'], cache_subdir="Pilot1") + train_file = candle.get_file( + params["train_data"], url + params["train_data"], cache_subdir="Pilot1" + ) + test_file = candle.get_file( + params["test_data"], url + params["test_data"], cache_subdir="Pilot1" + ) + val_file = candle.get_file( + params["val_data"], url + params["val_data"], cache_subdir="Pilot1" + ) # df = (pd.read_csv(data_path,skiprows=1).values).astype('float32') print("Loading data...") @@ -270,11 +282,18 @@ def load_data(params, seed): scaler = StandardScaler() scaler.fit(train_df_x) train_df_x = scaler.fit_transform(train_df_x) - test_df_x = scaler.fit_transform(test_df_x) - val_df_x = scaler.fit_transform(val_df_x) - - return train_df_x, train_df_y, val_df_x, val_df_y, train_df_x.shape[1], histogram - #return X_train, Y_train, X_test, Y_test, X_train.shape[1], histogram + test_df_x = scaler.fit_transform(test_df_x) + val_df_x = scaler.fit_transform(val_df_x) + + return ( + train_df_x, + train_df_y, + val_df_x, + val_df_y, + train_df_x.shape[1], + histogram, + ) + # return X_train, Y_train, X_test, Y_test, X_train.shape[1], histogram else: header_url = params["header_url"] @@ -286,10 +305,14 @@ def load_data(params, seed): url = params["data_url"] file_train = ( - "ml." + params["base_name"] + ".Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet" + "ml." + + params["base_name"] + + ".Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet" ) # file_train = params["train_data"] - train_file = candle.get_file(file_train, url + file_train, cache_subdir="Pilot1") + train_file = candle.get_file( + file_train, url + file_train, cache_subdir="Pilot1" + ) # df = (pd.read_csv(data_path,skiprows=1).values).astype('float32') print("Loading data...") df = pd.read_parquet(train_file) diff --git a/examples/ADRP/adrp_baseline_keras2.py b/examples/ADRP/adrp_baseline_keras2.py index e4bbed9d..b8e052ba 100644 --- a/examples/ADRP/adrp_baseline_keras2.py +++ b/examples/ADRP/adrp_baseline_keras2.py @@ -335,12 +335,12 @@ def run(params): # set up a bunch of callbacks to do work during model training.. 
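# load_data() in adrp.py above calls scaler.fit_transform() separately on the
# train, test and validation frames, so each split is standardized with its
# own statistics. For comparison, the conventional scikit-learn pattern fits
# the scaler once on the training split and only transforms the others; a
# minimal sketch (array names are illustrative, not from this repo):
from sklearn.preprocessing import StandardScaler

def sketch_scale_splits(train_x, val_x, test_x):
    scaler = StandardScaler().fit(train_x)   # statistics from training data only
    return scaler.transform(train_x), scaler.transform(val_x), scaler.transform(test_x)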
- #checkpointer = ModelCheckpoint( + # checkpointer = ModelCheckpoint( # filepath=params["save_path"] + "agg_adrp.autosave.model.h5", # verbose=1, # save_weights_only=False, # save_best_only=True, - #) + # ) initial_epoch = 0 ckpt = candle.CandleCkptKeras(params, verbose=True) ckpt.set_model(model) diff --git a/examples/ADRP/adrp_bmkdata_model.txt b/examples/ADRP/adrp_bmkdata_model.txt index e3678eb4..120d7e01 100644 --- a/examples/ADRP/adrp_bmkdata_model.txt +++ b/examples/ADRP/adrp_bmkdata_model.txt @@ -12,7 +12,7 @@ val_data = 'ml.ADRP_6W02_A_1_H.Orderable_zinc_db_enaHLL.sorted.4col.dd.dedup.v ckpt_restart_mode = 'off' ckpt_save_best = True ckpt_save_best_metric = 'val_loss' -ckpt_skip_epochs = 50 +ckpt_skip_epochs = 50 ckpt_save_interval = 500 ckpt_keep_limit = 1 diff --git a/examples/ADRP/reg_go3.py b/examples/ADRP/reg_go3.py index 1b30a9a7..730f24d3 100644 --- a/examples/ADRP/reg_go3.py +++ b/examples/ADRP/reg_go3.py @@ -9,6 +9,7 @@ matplotlib.use("Agg") import matplotlib.pyplot as plt +import tensorflow as tf import tensorflow.keras as ke from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler @@ -22,23 +23,24 @@ from tensorflow.keras.layers import Dense, Dropout, Input from tensorflow.keras.models import Model, model_from_json, model_from_yaml from tensorflow.keras.optimizers import SGD -import tensorflow as tf file_path = os.path.dirname(os.path.realpath(__file__)) -strategy = tf.distribute.MirroredStrategy([ - '/xpu:0' -# ,'/xpu:1' -# ,'/xpu:2' -# ,'/xpu:3' -# ,'/xpu:4' -# ,'/xpu:5' -# ,'/xpu:6','/xpu:7' -# ,'/xpu:8','/xpu:9' -# ,'/xpu:10','/xpu:11' - ]) -print('tensorflow version: {}'.format(tf.__version__)) -print('Number of devices: {}'.format(strategy.num_replicas_in_sync)) +strategy = tf.distribute.MirroredStrategy( + [ + "/xpu:0" + # ,'/xpu:1' + # ,'/xpu:2' + # ,'/xpu:3' + # ,'/xpu:4' + # ,'/xpu:5' + # ,'/xpu:6','/xpu:7' + # ,'/xpu:8','/xpu:9' + # ,'/xpu:10','/xpu:11' + ] +) +print("tensorflow version: {}".format(tf.__version__)) +print("Number of devices: {}".format(strategy.num_replicas_in_sync)) # parse args psr = argparse.ArgumentParser(description="input csv file") @@ -53,6 +55,7 @@ data_path = args["in"] print(args) + def r2(y_true, y_pred): SS_res = K.sum(K.square(y_true - y_pred)) SS_tot = K.sum(K.square(y_true - K.mean(y_true))) @@ -66,8 +69,8 @@ def load_data(): df = (pd.read_csv(data_path, skiprows=1).values).astype("float32") df_y = df[:, 0].astype("float32") df_x = df[:, 1:PL].astype(np.float32) - print('df_y: {}\n{}'.format(df_y.shape, df_y)) - print('df_x: {}\n{}'.format(df_x.shape, df_x)) + print("df_y: {}\n{}".format(df_y.shape, df_y)) + print("df_x: {}\n{}".format(df_x.shape, df_x)) scaler = StandardScaler() df_x = scaler.fit_transform(df_x) @@ -78,15 +81,16 @@ def load_data(): return X_train, Y_train, X_test, Y_test + def load_data_from_parquet(): - + data_path = args["in"] - df=pd.read_parquet(data_path) - df_y = df['reg'].values.astype("float32") - df_x = df.iloc[:,6:].values.astype("float32") - print('df_y: {}\n{}'.format(df_y.shape, df_y)) - print('df_x: {}\n{}'.format(df_x.shape, df_x)) + df = pd.read_parquet(data_path) + df_y = df["reg"].values.astype("float32") + df_x = df.iloc[:, 6:].values.astype("float32") + print("df_y: {}\n{}".format(df_y.shape, df_y)) + print("df_x: {}\n{}".format(df_x.shape, df_x)) scaler = StandardScaler() df_x = scaler.fit_transform(df_x) @@ -98,31 +102,52 @@ def load_data_from_parquet(): return X_train, Y_train, X_test, Y_test -#X_train, Y_train, X_test, Y_test = load_data() 
+# X_train, Y_train, X_test, Y_test = load_data() X_train, Y_train, X_test, Y_test = load_data_from_parquet() print("X_train shape:", X_train.shape) print("X_test shape:", X_test.shape) print("Y_train_shape: ", Y_train.shape) print("Y_test shape: ", Y_test.shape) -steps = X_train.shape[0]//GLOBAL_BATCH_SIZE -validation_steps = X_test.shape[0]//GLOBAL_BATCH_SIZE -print('samples {}, global_batch_size {}, steps {}'.format(X_train.shape[0], GLOBAL_BATCH_SIZE, steps)) -print('val samples {}, global_batch_size {}, val_steps {}'.format(X_test.shape[0], GLOBAL_BATCH_SIZE, validation_steps)) +steps = X_train.shape[0] // GLOBAL_BATCH_SIZE +validation_steps = X_test.shape[0] // GLOBAL_BATCH_SIZE +print( + "samples {}, global_batch_size {}, steps {}".format( + X_train.shape[0], GLOBAL_BATCH_SIZE, steps + ) +) +print( + "val samples {}, global_batch_size {}, val_steps {}".format( + X_test.shape[0], GLOBAL_BATCH_SIZE, validation_steps + ) +) -train_ds = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).batch(GLOBAL_BATCH_SIZE, - drop_remainder=True, - num_parallel_calls=None, - deterministic=None, - ).repeat(EPOCH) -val_ds = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).batch(GLOBAL_BATCH_SIZE, - drop_remainder=True, - num_parallel_calls=None, - deterministic=None,).repeat(EPOCH) +train_ds = ( + tf.data.Dataset.from_tensor_slices((X_train, Y_train)) + .batch( + GLOBAL_BATCH_SIZE, + drop_remainder=True, + num_parallel_calls=None, + deterministic=None, + ) + .repeat(EPOCH) +) +val_ds = ( + tf.data.Dataset.from_tensor_slices((X_test, Y_test)) + .batch( + GLOBAL_BATCH_SIZE, + drop_remainder=True, + num_parallel_calls=None, + deterministic=None, + ) + .repeat(EPOCH) +) options = tf.data.Options() -options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA +options.experimental_distribute.auto_shard_policy = ( + tf.data.experimental.AutoShardPolicy.DATA +) train_ds = train_ds.with_options(options) val_ds = val_ds.with_options(options) @@ -131,7 +156,7 @@ def load_data_from_parquet(): with strategy.scope(): - #inputs = Input(shape=(PS,)) + # inputs = Input(shape=(PS,)) inputs = Input(shape=(1826,)) x = Dense(250, activation="relu")(inputs) x = Dropout(DR)(x) @@ -173,21 +198,30 @@ def load_data_from_parquet(): early_stop = EarlyStopping(monitor="val_loss", patience=100, verbose=1, mode="auto") from datetime import datetime as dt -print("{} calling model.fit".format(dt.fromtimestamp(dt.timestamp(dt.now())).strftime("%D %H:%M:%S.%s"))) + +print( + "{} calling model.fit".format( + dt.fromtimestamp(dt.timestamp(dt.now())).strftime("%D %H:%M:%S.%s") + ) +) history = model.fit( - #X_train, - #Y_train, + # X_train, + # Y_train, train_dist, batch_size=GLOBAL_BATCH_SIZE, steps_per_epoch=int(steps), epochs=EPOCH, verbose=1, - #validation_data=(X_test, Y_test), + # validation_data=(X_test, Y_test), validation_data=val_dist, validation_steps=validation_steps, callbacks=[checkpointer, csv_logger, reduce_lr, early_stop], ) -print("{} done calling model.fit".format(dt.fromtimestamp(dt.timestamp(dt.now())).strftime("%D %H:%M:%S.%s"))) +print( + "{} done calling model.fit".format( + dt.fromtimestamp(dt.timestamp(dt.now())).strftime("%D %H:%M:%S.%s") + ) +) score = model.evaluate(X_test, Y_test, verbose=0)
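
Not part of the patch above: the reg_go3.py hunks in this diff reformat a multi-step distributed-training setup (MirroredStrategy, a global batch size, tf.data pipelines with DATA auto-sharding, and a model built inside the strategy scope). The following is a minimal, self-contained sketch of that pattern, assuming TF 2.x; the layer sizes, synthetic data, and constants below are illustrative only and are not taken from the benchmark.

```python
# Sketch of the MirroredStrategy + tf.data pattern used in reg_go3.py.
# Synthetic data and a tiny model stand in for the parquet features and the
# real network; only the distribution mechanics are the point here.
import numpy as np
import tensorflow as tf

GLOBAL_BATCH_SIZE = 32
EPOCHS = 2

# With no device list, MirroredStrategy uses all visible GPUs/XPUs.
strategy = tf.distribute.MirroredStrategy()
print("Number of devices: {}".format(strategy.num_replicas_in_sync))

# Illustrative stand-ins for the descriptor matrix and regression target.
X = np.random.rand(1024, 1826).astype("float32")
y = np.random.rand(1024).astype("float32")

train_ds = (
    tf.data.Dataset.from_tensor_slices((X, y))
    .batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
    .repeat(EPOCHS)
)

# Shard by data rather than by file, since the dataset comes from in-memory tensors.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = (
    tf.data.experimental.AutoShardPolicy.DATA
)
train_ds = train_ds.with_options(options)

# Build and compile the model inside the strategy scope so variables are mirrored.
with strategy.scope():
    inputs = tf.keras.Input(shape=(1826,))
    hidden = tf.keras.layers.Dense(250, activation="relu")(inputs)
    outputs = tf.keras.layers.Dense(1)(hidden)
    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer="sgd", loss="mean_squared_error")

steps = X.shape[0] // GLOBAL_BATCH_SIZE
model.fit(train_ds, epochs=EPOCHS, steps_per_epoch=steps, verbose=1)
```

When a tf.data.Dataset is passed to model.fit under a strategy scope, Keras distributes the batches across replicas itself, so an explicit experimental_distribute_dataset call (as reg_go3.py appears to use for train_dist/val_dist) is optional in this simplified form; the GLOBAL_BATCH_SIZE is the combined batch size across all replicas.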