diff --git a/Pilot1/ST1/clr_callback.py b/Pilot1/ST1/clr_callback.py
new file mode 100644
index 00000000..dbac09d4
--- /dev/null
+++ b/Pilot1/ST1/clr_callback.py
@@ -0,0 +1,133 @@
+from tensorflow.keras.callbacks import Callback
+from tensorflow.keras import backend as K
+import numpy as np
+
+class CyclicLR(Callback):
+    """This callback implements a cyclical learning rate policy (CLR).
+    The method cycles the learning rate between two boundaries with
+    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
+    The amplitude of the cycle can be scaled on a per-iteration or
+    per-cycle basis.
+    This class has three built-in policies, as put forth in the paper.
+    "triangular":
+        A basic triangular cycle with no amplitude scaling.
+    "triangular2":
+        A basic triangular cycle that scales the initial amplitude by half each cycle.
+    "exp_range":
+        A cycle that scales the initial amplitude by gamma**(cycle iterations) at each
+        cycle iteration.
+    For more detail, please see the paper.
+
+    # Example
+    ```python
+    clr = CyclicLR(base_lr=0.001, max_lr=0.006,
+                   step_size=2000., mode='triangular')
+    model.fit(X_train, Y_train, callbacks=[clr])
+    ```
+
+    The class also supports custom scaling functions:
+    ```python
+    clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
+    clr = CyclicLR(base_lr=0.001, max_lr=0.006,
+                   step_size=2000., scale_fn=clr_fn,
+                   scale_mode='cycle')
+    model.fit(X_train, Y_train, callbacks=[clr])
+    ```
+
+    # Arguments
+        base_lr: initial learning rate, which is the
+            lower boundary in the cycle.
+        max_lr: upper boundary in the cycle. Functionally,
+            it defines the cycle amplitude (max_lr - base_lr).
+            The lr at any cycle is the sum of base_lr
+            and some scaling of the amplitude; therefore
+            max_lr may not actually be reached depending on the
+            scaling function.
+        step_size: number of training iterations per
+            half cycle. The authors suggest setting step_size to
+            2-8 x the training iterations per epoch.
+        mode: one of {'triangular', 'triangular2', 'exp_range'}.
+            Default 'triangular'.
+            Values correspond to the policies detailed above.
+            If scale_fn is not None, this argument is ignored.
+        gamma: constant in the 'exp_range' scaling function:
+            gamma**(cycle iterations).
+        scale_fn: custom scaling policy defined by a single-argument
+            lambda function, where 0 <= scale_fn(x) <= 1 for all x >= 0.
+            The mode parameter is ignored.
+        scale_mode: one of {'cycle', 'iterations'}.
+            Defines whether scale_fn is evaluated on
+            cycle number or cycle iterations (training
+            iterations since the start of the cycle). Default is 'cycle'.
+    """
+
+    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
+                 gamma=1., scale_fn=None, scale_mode='cycle'):
+        super(CyclicLR, self).__init__()
+
+        self.base_lr = base_lr
+        self.max_lr = max_lr
+        self.step_size = step_size
+        self.mode = mode
+        self.gamma = gamma
+        if scale_fn is None:
+            if self.mode == 'triangular':
+                self.scale_fn = lambda x: 1.
+                self.scale_mode = 'cycle'
+            elif self.mode == 'triangular2':
+                self.scale_fn = lambda x: 1/(2.**(x-1))
+                self.scale_mode = 'cycle'
+            elif self.mode == 'exp_range':
+                self.scale_fn = lambda x: gamma**(x)
+                self.scale_mode = 'iterations'
+        else:
+            self.scale_fn = scale_fn
+            self.scale_mode = scale_mode
+        self.clr_iterations = 0.
+        self.trn_iterations = 0.
+        self.history = {}
+
+        self._reset()
+
+    def _reset(self, new_base_lr=None, new_max_lr=None,
+               new_step_size=None):
+        """Resets cycle iterations.
+        Optional boundary/step size adjustment.
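+        Any of new_base_lr, new_max_lr and new_step_size, when given,
+        replaces the corresponding current setting.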
+        """
+        if new_base_lr is not None:
+            self.base_lr = new_base_lr
+        if new_max_lr is not None:
+            self.max_lr = new_max_lr
+        if new_step_size is not None:
+            self.step_size = new_step_size
+        self.clr_iterations = 0.
+
+    def clr(self):
+        # Triangular schedule: x falls from 1 to 0 and back to 1 over each cycle,
+        # so the lr ramps from base_lr up toward max_lr and back down again.
+        cycle = np.floor(1 + self.clr_iterations/(2*self.step_size))
+        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
+        if self.scale_mode == 'cycle':
+            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
+        else:
+            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
+
+    def on_train_begin(self, logs=None):
+        logs = logs or {}
+
+        if self.clr_iterations == 0:
+            K.set_value(self.model.optimizer.lr, self.base_lr)
+        else:
+            K.set_value(self.model.optimizer.lr, self.clr())
+
+    def on_batch_end(self, batch, logs=None):
+
+        logs = logs or {}
+        self.trn_iterations += 1
+        self.clr_iterations += 1
+
+        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
+        self.history.setdefault('iterations', []).append(self.trn_iterations)
+
+        for k, v in logs.items():
+            self.history.setdefault(k, []).append(v)
+
+        K.set_value(self.model.optimizer.lr, self.clr())
diff --git a/Pilot1/ST1/polaris_sub_hvd.sh b/Pilot1/ST1/polaris_sub_hvd.sh
new file mode 100755
index 00000000..8fe26663
--- /dev/null
+++ b/Pilot1/ST1/polaris_sub_hvd.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#PBS -N st_hvd
+#PBS -l select=2
+#PBS -l walltime=24:00:00
+#PBS -q preemptable
+#PBS -l filesystems=grand
+#PBS -A datascience
+#PBS -o logs/
+#PBS -e logs/
+#PBS -m abe
+#PBS -M avasan@anl.gov
+
+DATA_PATH=/grand/datascience/avasan/ST_Benchmarks/Data/1M-flatten
+
+TFIL=ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.train
+VFIL=ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.val
+
+EP=400
+NUMHEAD=16
+DR_TB=0.1
+DR_ff=0.1
+
+ACT=elu
+DROP=False
+LR=0.0000025
+LOSS=mean_squared_error
+HVDSWITCH=True
+
+if [ "$HVDSWITCH" = "False" ]; then
+    python smiles_regress_transformer_run_hvd.py --in_train ${DATA_PATH}/${TFIL} --in_vali ${DATA_PATH}/${VFIL} --ep $EP --num_heads $NUMHEAD --DR_TB $DR_TB --DR_ff $DR_ff --activation $ACT --drop_post_MHA $DROP --lr $LR --loss_fn $LOSS --hvd_switch $HVDSWITCH
+
+else
+    NP=8
+    PPN=4
+    OUT=logfile.log
+    mpiexec --np $NP -ppn $PPN --cpu-bind verbose,list:0,1,2,3,4,5,6,7 -env NCCL_COLLNET_ENABLE=1 -env NCCL_NET_GDR_LEVEL=PHB python smiles_regress_transformer_run_hvd.py --in_train ${DATA_PATH}/${TFIL} --in_vali ${DATA_PATH}/${VFIL} --ep $EP --num_heads $NUMHEAD --DR_TB $DR_TB --DR_ff $DR_ff --activation $ACT --drop_post_MHA $DROP --lr $LR --loss_fn $LOSS --hvd_switch $HVDSWITCH > $OUT
+
+fi
diff --git a/Pilot1/ST1/smiles_regress_transformer_funcs_hvd.py b/Pilot1/ST1/smiles_regress_transformer_funcs_hvd.py
new file mode 100644
index 00000000..54bf04dc
--- /dev/null
+++ b/Pilot1/ST1/smiles_regress_transformer_funcs_hvd.py
@@ -0,0 +1,263 @@
+############# Module Loading ##############
+
+import argparse
+import os
+import numpy as np
+import matplotlib
+import pandas as pd
+import json
+import ray
+from functools import partial
+
+matplotlib.use("Agg")
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import backend as K
+from tensorflow.keras import layers
+from tensorflow.keras.callbacks import (
+    CSVLogger,
+    EarlyStopping,
+    ModelCheckpoint,
+    ReduceLROnPlateau,
+)
+
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.preprocessing import sequence, text
+import horovod.keras as hvd  # Horovod: data parallelization across worker ranks
+import keras_tuner
+from clr_callback import *
+
+############## Defining functions #####################
+######################################################
+
+def r2(y_true, y_pred):
+    SS_res = K.sum(K.square(y_true - y_pred))
+    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
+    return 1 - SS_res / (SS_tot + K.epsilon())
+
+# Implement a Transformer block as a layer
+# embed_dim: embedding dimension of each token; also used as key_dim for the multi-head attention calculation
+# ff_dim: number of nodes in the feed-forward Dense layer
+# epsilon: small constant added in layer normalization for numerical stability
+
+class TransformerBlock(layers.Layer):
+    # __init__: defining all class variables
+    def __init__(self, embed_dim, num_heads, ff_dim, rate, activation, dropout1):
+        super(TransformerBlock, self).__init__()
+        self.drop_chck = dropout1
+        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)  # , activation=activation
+        self.ffn = keras.Sequential(
+            [
+                layers.Dense(ff_dim, activation=activation),
+                layers.Dense(embed_dim),
+            ]
+        )
+        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
+        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
+        self.dropout1 = layers.Dropout(rate)
+        self.dropout2 = layers.Dropout(rate)
+
+    # call: standard transformer encoder block (self-attention + feed-forward, each with residual and layer norm)
+    def call(self, inputs, training):
+        attn_output = self.att(inputs, inputs)
+        if self.drop_chck:
+            attn_output = self.dropout1(attn_output, training=training)
+        out1 = self.layernorm1(inputs + attn_output)
+        ffn_output = self.ffn(out1)
+        ffn_output = self.dropout2(ffn_output, training=training)
+
+        return self.layernorm2(out1 + ffn_output)
+
+
+# Implement embedding layer
+# Two separate embedding layers: one for tokens, one for token index (positions).
+
+
+class TokenAndPositionEmbedding(layers.Layer):
+    def __init__(self, maxlen, vocab_size, embed_dim):
+        super(TokenAndPositionEmbedding, self).__init__()
+        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
+        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
+
+    def call(self, x):
+        maxlen = tf.shape(x)[-1]
+        positions = tf.range(start=0, limit=maxlen, delta=1)
+        positions = self.pos_emb(positions)
+        x = self.token_emb(x)
+        return x + positions
+
+
+def prep_text(texts, tokenizer, max_sequence_length):
+    # Turns text into padded sequences.
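+    # The tokenizer passed in is expected to have already been fit on the
+    # training SMILES (fit_on_texts is called in the run script).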
+    text_sequences = tokenizer.texts_to_sequences(texts)  # turn text into integer token sequences
+    return sequence.pad_sequences(text_sequences, maxlen=max_sequence_length)  # pad all sequences to the same length
+
+
+def model_architecture(embed_dim, num_heads, ff_dim, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch):
+
+    vocab_size = 40000  # number of possible 'words' in the SMILES data
+    maxlen = 250  # length of each SMILES sequence in the input
+    inputs = layers.Input(shape=(maxlen,))
+    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
+    x = embedding_layer(inputs)
+    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim, DR_TB, activation, dropout1)
+    # Apply the transformer block four times (the same instance is reused, so its weights are shared)
+    x = transformer_block(x)
+    x = transformer_block(x)
+    x = transformer_block(x)
+    x = transformer_block(x)
+
+    x = layers.Reshape((1, 32000), input_shape=(250, 128,))(
+        x
+    )  # reshaping increases parameters but improves accuracy a lot
+    x = layers.Dropout(DR_ff)(x)
+    x = layers.Dense(1024, activation=activation)(x)
+    x = layers.Dropout(DR_ff)(x)
+    x = layers.Dense(256, activation=activation)(x)
+    x = layers.Dropout(DR_ff)(x)
+    x = layers.Dense(64, activation=activation)(x)
+    x = layers.Dropout(DR_ff)(x)
+    x = layers.Dense(16, activation=activation)(x)
+    x = layers.Dropout(DR_ff)(x)
+    outputs = layers.Dense(1, activation=activation)(x)
+
+    model = keras.Model(inputs=inputs, outputs=outputs)
+
+    model.summary()
+
+    # Train and Evaluate
+
+    opt = Adam(learning_rate=lr)
+
+    # HVD: wrapping in DistributedOptimizer delegates gradient computation to the original
+    # optimizer, averages the gradients across ranks, and applies the averaged gradients
+    if hvd_switch:
+        opt = hvd.DistributedOptimizer(opt)
+
+    model.compile(
+        loss=loss_fn, optimizer=opt, metrics=["mae", r2]
+    )
+    return model
+
+
+def build_model(num_heads, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch):
+    # units = hp.Int("units", min_value=32, max_value=512, step=32)
+    embed_dim = 128
+    ff_dim = 128
+    # call existing model-building code with the hyperparameter values.
+    model = model_architecture(
+        embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, DR_TB=DR_TB, DR_ff=DR_ff,
+        activation=activation, dropout1=dropout1, lr=lr, loss_fn=loss_fn, hvd_switch=hvd_switch
+    )
+    return model
+
+
+def initialize_hvd(lr, x_train, y_train):
+    hvd.init()
+    print("I am rank %d of %d" % (hvd.rank(), hvd.size()))
+
+    # HVD-2: GPU pinning
+    gpus = tf.config.experimental.list_physical_devices('GPU')
+    # Pin one GPU to each rank
+    for gpu in gpus:
+        tf.config.experimental.set_memory_growth(gpu, True)
+    if gpus:
+        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
+
+    # Scale the learning rate by the number of workers and shard the training data
+    lr = lr * hvd.size()
+    x_train = np.array_split(x_train, hvd.size())
+    y_train = np.array_split(y_train, hvd.size())
+    return (lr, x_train, y_train)
+
+
+def implement_hvd(x_train, y_train):
+    # Each rank keeps only its own shard of the training data
+    x_train = x_train[hvd.rank()]
+    y_train = y_train[hvd.rank()]
+    return (x_train, y_train)
+
+
+def callback_setting(hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop):
+
+    checkpointer = ModelCheckpoint(
+        filepath=checkpt_file,  # "smile_regress.autosave.model.h5"
+        verbose=1,
+        save_weights_only=True,
+        save_best_only=True,
+    )
+
+    clr = CyclicLR(base_lr=lr, max_lr=5*lr, step_size=2000.)
+
+    csv_logger = CSVLogger(csv_file)  # "smile_regress.training.log"
+
+    # learning rate reduction when val_loss plateaus
+    # is it possible to do batch size tuning at each epoch as well?
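+    # Note: CyclicLR above rewrites the optimizer lr at the end of every batch,
+    # so any reduction applied here by ReduceLROnPlateau only lasts until the next batch.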
+    reduce_lr = ReduceLROnPlateau(
+        monitor="val_loss",
+        factor=0.75,
+        patience=patience_red_lr,  # 20
+        verbose=1,
+        mode="auto",
+        min_delta=0.0001,
+        cooldown=3,
+        min_lr=0.000000001,
+    )
+
+    early_stop = EarlyStopping(
+        monitor="val_loss",
+        patience=patience_early_stop,  # 100
+        verbose=1,
+        mode="auto",
+    )
+
+    if hvd_switch:
+        # HVD: broadcast initial variables from rank 0 to all other processes
+        hvd_broadcast = hvd.callbacks.BroadcastGlobalVariablesCallback(0)
+
+        callbacks = [hvd_broadcast, reduce_lr, clr]
+
+        # Only rank 0 logs, early-stops and checkpoints, to avoid clobbering files
+        if hvd.rank() == 0:
+            callbacks.append(csv_logger)
+            callbacks.append(early_stop)
+            callbacks.append(checkpointer)
+
+        return callbacks
+
+    else:
+        return [reduce_lr, clr, csv_logger, early_stop, checkpointer]
+
+
+def build_model_tuner(hp):
+    # units = hp.Int("units", min_value=32, max_value=512, step=32)
+    embed_dim = 128
+    num_heads = hp.Int("num_heads", min_value=12, max_value=40, step=4)
+    ff_dim = 128
+    DR_TB = hp.Float("DR_TB", min_value=0.025, max_value=0.5, step=0.025)
+    DR_ff = hp.Float("DR_ff", min_value=0.025, max_value=0.5, step=0.025)
+    activation = hp.Choice("activation", ["relu", "elu", "gelu"])
+    # activation = "elu"
+    dropout1 = hp.Boolean("dropout_aftermulti")
+    lr = hp.Float("lr", min_value=1e-6, max_value=1e-5, step=1e-6)
+    loss_fn = hp.Choice("loss_fn", ["mean_squared_error", "mean_absolute_error"])
+    # call existing model-building code with the hyperparameter values.
+    # (the tuner path does not use Horovod, so hvd_switch is fixed to False)
+    model = model_architecture(
+        embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, DR_TB=DR_TB, DR_ff=DR_ff,
+        activation=activation, dropout1=dropout1, lr=lr, loss_fn=loss_fn, hvd_switch=False
+    )
+    return model
+
+
+# tfm.optimization.lars_optimizer.LARS(
+#     learning_rate = 0.0000025,
+#     momentum = 0.9,
+#     weight_decay_rate = 0.0,
+#     eeta = 0.001,
+#     nesterov = False,
+#     classic_momentum = True,
+#     exclude_from_weight_decay = None,
+#     exclude_from_layer_adaptation = None,
+#     name = 'LARS',
+# )
diff --git a/Pilot1/ST1/smiles_regress_transformer_run_hvd.py b/Pilot1/ST1/smiles_regress_transformer_run_hvd.py
new file mode 100644
index 00000000..06e1c4fa
--- /dev/null
+++ b/Pilot1/ST1/smiles_regress_transformer_run_hvd.py
@@ -0,0 +1,127 @@
+############# Module Loading ##############
+import argparse
+import os
+import numpy as np
+import matplotlib
+import pandas as pd
+
+matplotlib.use("Agg")
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import backend as K
+from tensorflow.keras import layers
+from tensorflow.keras.callbacks import (
+    CSVLogger,
+    EarlyStopping,
+    ModelCheckpoint,
+    ReduceLROnPlateau,
+)
+
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.preprocessing import sequence, text
+import horovod.keras as hvd  # Horovod: data parallelization across worker ranks
+import keras_tuner
+
+from clr_callback import *
+from smiles_regress_transformer_funcs_hvd import *
+
+####### Argument parsing #############
+
+file_path = os.path.dirname(os.path.realpath(__file__))
+
+# psr and args take input outside of the script and assign:
+# (1) file paths for data_path_train and data_path_vali
+# (2) number of training epochs
+
+psr = argparse.ArgumentParser(description="input csv file")
+psr.add_argument("--in_train", default="in_train")
+psr.add_argument("--in_vali", default="in_vali")
+psr.add_argument("--ep", type=int, default=400)
+psr.add_argument("--num_heads", type=int, default=16)
+psr.add_argument("--DR_TB", type=float, default=0.1)
+psr.add_argument("--DR_ff", type=float, default=0.1)
+psr.add_argument("--activation", default="activation")
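+# Note: a plain type=bool would turn any non-empty string (including "False") into True,
+# so the boolean flags below convert the string explicitly.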
+psr.add_argument("--drop_post_MHA", type=lambda s: s.lower() == "true", default=True)
+psr.add_argument("--lr", type=float, default=1e-5)
+psr.add_argument("--loss_fn", default="mean_squared_error")
+psr.add_argument("--hvd_switch", type=lambda s: s.lower() == "true", default=True)
+
+args = vars(psr.parse_args())  # returns a dictionary of the parsed arguments
+
+######## Set hyperparameters ########
+
+EPOCH = args["ep"]
+num_heads = args["num_heads"]
+DR_TB = args["DR_TB"]
+DR_ff = args["DR_ff"]
+activation = args["activation"]
+dropout1 = args["drop_post_MHA"]
+lr = args["lr"]
+loss_fn = args["loss_fn"]
+BATCH = 32  # batch size used for training
+vocab_size = 40000
+maxlen = 250
+# act_fn = 'elu'
+# embed_dim = 128  # Embedding size for each token
+# num_heads = 16  # Number of attention heads
+# ff_dim = 128  # Hidden layer size in feed forward network inside transformer
+checkpt_file = "smile_regress.autosave.model.h5"
+csv_file = "smile_regress.training.log"
+patience_red_lr = 20
+patience_early_stop = 100
+hvd_switch = args["hvd_switch"]
+
+######## Create training and validation data ########
+
+# x: tokenized sequence data, y: single-value dock score
+data_path_train = args["in_train"]
+data_path_vali = args["in_vali"]
+
+data_train = pd.read_csv(data_path_train)
+data_vali = pd.read_csv(data_path_vali)
+
+data_train.head()
+# Dataset has "type" and "smiles" as the two fields
+# reshaping: y formatted as [[y_1], [y_2], ...] with floats
+y_train = data_train["type"].values.reshape(-1, 1) * 1.0
+y_val = data_vali["type"].values.reshape(-1, 1) * 1.0
+
+tokenizer = text.Tokenizer(num_words=vocab_size)
+tokenizer.fit_on_texts(data_train["smiles"])
+
+x_train = prep_text(data_train["smiles"], tokenizer, maxlen)
+x_val = prep_text(data_vali["smiles"], tokenizer, maxlen)
+
+######## Implement horovod if necessary ########
+if hvd_switch:
+    lr, x_train, y_train = initialize_hvd(lr, x_train, y_train)
+    x_train, y_train = implement_hvd(x_train, y_train)
+
+
+######## Build model #############
+
+model = build_model(num_heads, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch)
+
+####### Set callbacks ##############
+callbacks = callback_setting(
+    hvd_switch,
+    checkpt_file,
+    lr,
+    csv_file,
+    patience_red_lr,
+    patience_early_stop
+)
+
+####### Train model! #########
+
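+# With Horovod enabled, each rank trains on its own shard of x_train/y_train
+# (see initialize_hvd/implement_hvd); the validation set is not sharded, so
+# every rank evaluates the full validation data.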
+history = model.fit(
+    x_train,
+    y_train,
+    batch_size=BATCH,
+    epochs=EPOCH,
+    verbose=1,
+    validation_data=(x_val, y_val),
+    callbacks=callbacks,
+)
diff --git a/Pilot1/ST1/sub_hvd.sh b/Pilot1/ST1/sub_hvd.sh
new file mode 100755
index 00000000..619bc602
--- /dev/null
+++ b/Pilot1/ST1/sub_hvd.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+module load conda/2022-09-08
+conda activate
+
+DATA_PATH=/grand/datascience/avasan/ST_Benchmarks/Data/1M-flatten
+
+TFIL=ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.train
+VFIL=ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.val
+
+EP=400
+NUMHEAD=16
+DR_TB=0.1
+DR_ff=0.1
+
+ACT=elu
+DROP=False
+LR=0.0000025
+LOSS=mean_squared_error
+HVDSWITCH=True
+
+if [ "$HVDSWITCH" = "False" ]; then
+    python smiles_regress_transformer_run_hvd.py --in_train ${DATA_PATH}/${TFIL} --in_vali ${DATA_PATH}/${VFIL} --ep $EP --num_heads $NUMHEAD --DR_TB $DR_TB --DR_ff $DR_ff --activation $ACT --drop_post_MHA $DROP --lr $LR --loss_fn $LOSS --hvd_switch $HVDSWITCH
+
+else
+    NP=8
+    PPN=4
+    OUT=logfile.log
+    mpiexec --np $NP -ppn $PPN --cpu-bind verbose,list:0,1,2,3,4,5,6,7 -env NCCL_COLLNET_ENABLE=1 -env NCCL_NET_GDR_LEVEL=PHB python smiles_regress_transformer_run_hvd.py --in_train ${DATA_PATH}/${TFIL} --in_vali ${DATA_PATH}/${VFIL} --ep $EP --num_heads $NUMHEAD --DR_TB $DR_TB --DR_ff $DR_ff --activation $ACT --drop_post_MHA $DROP --lr $LR --loss_fn $LOSS --hvd_switch $HVDSWITCH > $OUT
+
+fi
diff --git a/Pilot1/Uno/README.md b/Pilot1/Uno/README.md
index 884a96fb..58163890 100644
--- a/Pilot1/Uno/README.md
+++ b/Pilot1/Uno/README.md
@@ -269,3 +269,41 @@ Current time ....23512.750
 ```
 python uno_infer.py --data All.h5 --model_file model.h5 --n_pred 30
 ```
+
+## top21 Plans
+
+Top21 plans may be generated in JSON format with `plangen.py`.
+
+JSON plans may be converted to a TXT format via
+`topN_to_uno.py --convert`
+
+The TXT format is as follows (comments are not actually supported yet):
+
+```
+metadata:
+key1: value1 # Some number of key value pairs
+key2: value2
+...
+
+node: # Some NODE ID, e.g., 1.2.3
+val_sets: 1 # Number of validation sets
+index: 0 # The current validation set index
+val CELLS: count: 175 # Number of CELLs in the validation set
+  CCL_100 CCL_1000 CCL_1001 CCL_1002 # CCLs up to a blank line
+  CCL_1013 CCL_1016 CCL_1017 CCL_1020
+
+train_sets: 3 # Number of training sets
+index: 0 # The current training set index
+train CELLS: count: # Number of CELLs in the training set
+  ... # CCLs as above
+
+index: 1
+train CELLS: count:
+  ... # CCLs as above
+
+node: # Start next node...
+
+```
+
+The TXT format is much faster to load and search with, e.g.,
+`get-node-txt.sh`.
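+
+As a rough illustration, a TXT plan can be scanned with a few lines of Python.
+The sketch below is based only on the layout shown above; the function name and
+the case-insensitive matching are illustrative, not part of the workflow scripts:
+
+```python
+# Sketch: count validation CELLs per node in a TXT plan.
+from collections import defaultdict
+
+def val_cells_per_node(path):
+    counts = defaultdict(int)
+    node, in_val = None, False
+    with open(path) as fh:
+        for raw in fh:
+            line = raw.rstrip("\n")
+            if line.startswith("node:"):
+                node = line.split(":", 1)[1].strip()
+                in_val = False
+            elif line.lower().startswith("val cells:"):
+                in_val = True
+            elif not line.strip():
+                in_val = False  # a blank line ends the CELL list
+            elif line.startswith(" ") and in_val and node is not None:
+                counts[node] += len(line.split())  # indented lines list CELL IDs
+    return dict(counts)
+```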