diff --git a/Pilot1/ST1/clr_callback.py b/Pilot1/ST1/clr_callback.py
new file mode 100644
index 00000000..dbac09d4
--- /dev/null
+++ b/Pilot1/ST1/clr_callback.py
@@ -0,0 +1,133 @@
+from tensorflow.keras.callbacks import Callback
+from tensorflow.keras import backend as K
+import numpy as np
+
+class CyclicLR(Callback):
+    """This callback implements a cyclical learning rate policy (CLR).
+    The method cycles the learning rate between two boundaries with
+    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
+    The amplitude of the cycle can be scaled on a per-iteration or
+    per-cycle basis.
+    This class has three built-in policies, as put forth in the paper.
+    "triangular":
+        A basic triangular cycle with no amplitude scaling.
+    "triangular2":
+        A basic triangular cycle that scales the initial amplitude by half each cycle.
+    "exp_range":
+        A cycle that scales the initial amplitude by gamma**(cycle iterations) at each
+        cycle iteration.
+    For more detail, please see the paper.
+
+    # Example
+    ```python
+    clr = CyclicLR(base_lr=0.001, max_lr=0.006,
+                   step_size=2000., mode='triangular')
+    model.fit(X_train, Y_train, callbacks=[clr])
+    ```
+
+    The class also supports custom scaling functions:
+    ```python
+    clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
+    clr = CyclicLR(base_lr=0.001, max_lr=0.006,
+                   step_size=2000., scale_fn=clr_fn,
+                   scale_mode='cycle')
+    model.fit(X_train, Y_train, callbacks=[clr])
+    ```
+
+    # Arguments
+        base_lr: initial learning rate, which is the
+            lower boundary in the cycle.
+        max_lr: upper boundary in the cycle. Functionally,
+            it defines the cycle amplitude (max_lr - base_lr).
+            The lr at any cycle is the sum of base_lr
+            and some scaling of the amplitude; therefore
+            max_lr may not actually be reached depending on the
+            scaling function.
+        step_size: number of training iterations per
+            half cycle. The authors suggest setting step_size to
+            2-8 x the training iterations per epoch.
+        mode: one of {'triangular', 'triangular2', 'exp_range'}.
+            Default 'triangular'.
+            Values correspond to the policies detailed above.
+            If scale_fn is not None, this argument is ignored.
+        gamma: constant in the 'exp_range' scaling function:
+            gamma**(cycle iterations).
+        scale_fn: custom scaling policy defined by a single-argument
+            lambda function, where 0 <= scale_fn(x) <= 1 for all x >= 0.
+            The mode parameter is ignored.
+        scale_mode: one of {'cycle', 'iterations'}.
+            Defines whether scale_fn is evaluated on
+            cycle number or cycle iterations (training
+            iterations since the start of the cycle). Default is 'cycle'.
+    """
+
+    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
+                 gamma=1., scale_fn=None, scale_mode='cycle'):
+        super(CyclicLR, self).__init__()
+
+        self.base_lr = base_lr
+        self.max_lr = max_lr
+        self.step_size = step_size
+        self.mode = mode
+        self.gamma = gamma
+        if scale_fn is None:
+            if self.mode == 'triangular':
+                self.scale_fn = lambda x: 1.
+                self.scale_mode = 'cycle'
+            elif self.mode == 'triangular2':
+                self.scale_fn = lambda x: 1/(2.**(x-1))
+                self.scale_mode = 'cycle'
+            elif self.mode == 'exp_range':
+                self.scale_fn = lambda x: gamma**(x)
+                self.scale_mode = 'iterations'
+        else:
+            self.scale_fn = scale_fn
+            self.scale_mode = scale_mode
+        self.clr_iterations = 0.
+        self.trn_iterations = 0.
+        self.history = {}
+
+        self._reset()
+
+    def _reset(self, new_base_lr=None, new_max_lr=None,
+               new_step_size=None):
+        """Resets cycle iterations.
+        Optional boundary/step size adjustment.
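+        Any of new_base_lr, new_max_lr and new_step_size, when given,
+        replaces the corresponding current setting.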
+        """
+        if new_base_lr is not None:
+            self.base_lr = new_base_lr
+        if new_max_lr is not None:
+            self.max_lr = new_max_lr
+        if new_step_size is not None:
+            self.step_size = new_step_size
+        self.clr_iterations = 0.
+
+    def clr(self):
+        # Triangular schedule: x falls from 1 to 0 and back to 1 over each cycle,
+        # so the lr ramps from base_lr up toward max_lr and back down again.
+        cycle = np.floor(1 + self.clr_iterations/(2*self.step_size))
+        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
+        if self.scale_mode == 'cycle':
+            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
+        else:
+            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
+
+    def on_train_begin(self, logs=None):
+        logs = logs or {}
+
+        if self.clr_iterations == 0:
+            K.set_value(self.model.optimizer.lr, self.base_lr)
+        else:
+            K.set_value(self.model.optimizer.lr, self.clr())
+
+    def on_batch_end(self, batch, logs=None):
+
+        logs = logs or {}
+        self.trn_iterations += 1
+        self.clr_iterations += 1
+
+        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
+        self.history.setdefault('iterations', []).append(self.trn_iterations)
+
+        for k, v in logs.items():
+            self.history.setdefault(k, []).append(v)
+
+        K.set_value(self.model.optimizer.lr, self.clr())
diff --git a/Pilot1/ST1/polaris_sub_hvd.sh b/Pilot1/ST1/polaris_sub_hvd.sh
new file mode 100755
index 00000000..8fe26663
--- /dev/null
+++ b/Pilot1/ST1/polaris_sub_hvd.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#PBS -N st_hvd
+#PBS -l select=2
+#PBS -l walltime=24:00:00
+#PBS -q preemptable
+#PBS -l filesystems=grand
+#PBS -A datascience
+#PBS -o logs/
+#PBS -e logs/
+#PBS -m abe
+#PBS -M avasan@anl.gov
+
+DATA_PATH=/grand/datascience/avasan/ST_Benchmarks/Data/1M-flatten
+
+TFIL=ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.train
+VFIL=ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.val
+
+EP=400
+NUMHEAD=16
+DR_TB=0.1
+DR_ff=0.1
+
+ACT=elu
+DROP=False
+LR=0.0000025
+LOSS=mean_squared_error
+HVDSWITCH=True
+
+if [ "$HVDSWITCH" = "False" ]; then
+    python smiles_regress_transformer_run_hvd.py --in_train ${DATA_PATH}/${TFIL} --in_vali ${DATA_PATH}/${VFIL} --ep $EP --num_heads $NUMHEAD --DR_TB $DR_TB --DR_ff $DR_ff --activation $ACT --drop_post_MHA $DROP --lr $LR --loss_fn $LOSS --hvd_switch $HVDSWITCH
+
+else
+    NP=8
+    PPN=4
+    OUT=logfile.log
+    mpiexec --np $NP -ppn $PPN --cpu-bind verbose,list:0,1,2,3,4,5,6,7 -env NCCL_COLLNET_ENABLE=1 -env NCCL_NET_GDR_LEVEL=PHB python smiles_regress_transformer_run_hvd.py --in_train ${DATA_PATH}/${TFIL} --in_vali ${DATA_PATH}/${VFIL} --ep $EP --num_heads $NUMHEAD --DR_TB $DR_TB --DR_ff $DR_ff --activation $ACT --drop_post_MHA $DROP --lr $LR --loss_fn $LOSS --hvd_switch $HVDSWITCH > $OUT
+
+fi
diff --git a/Pilot1/ST1/smiles_regress_transformer_funcs_hvd.py b/Pilot1/ST1/smiles_regress_transformer_funcs_hvd.py
new file mode 100644
index 00000000..54bf04dc
--- /dev/null
+++ b/Pilot1/ST1/smiles_regress_transformer_funcs_hvd.py
@@ -0,0 +1,263 @@
+############# Module Loading ##############
+
+import argparse
+import os
+import numpy as np
+import matplotlib
+import pandas as pd
+import json
+import ray
+from functools import partial
+
+matplotlib.use("Agg")
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import backend as K
+from tensorflow.keras import layers
+from tensorflow.keras.callbacks import (
+    CSVLogger,
+    EarlyStopping,
+    ModelCheckpoint,
+    ReduceLROnPlateau,
+)
+
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.preprocessing import sequence, text
+import horovod.keras as hvd  # Horovod: data parallelization across worker ranks
+import keras_tuner
+from clr_callback import *
+
+############## Defining functions #####################
+######################################################
+
+def r2(y_true, y_pred):
+    SS_res = K.sum(K.square(y_true - y_pred))
+    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
+    return 1 - SS_res / (SS_tot + K.epsilon())
+
+# Implement a Transformer block as a layer
+# embed_dim: embedding dimension of each token; also used as key_dim for the multi-head attention calculation
+# ff_dim: number of nodes in the feed-forward Dense layer
+# epsilon: small constant added in layer normalization for numerical stability
+
+class TransformerBlock(layers.Layer):
+    # __init__: defining all class variables
+    def __init__(self, embed_dim, num_heads, ff_dim, rate, activation, dropout1):
+        super(TransformerBlock, self).__init__()
+        self.drop_chck = dropout1
+        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)  # , activation=activation
+        self.ffn = keras.Sequential(
+            [
+                layers.Dense(ff_dim, activation=activation),
+                layers.Dense(embed_dim),
+            ]
+        )
+        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
+        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
+        self.dropout1 = layers.Dropout(rate)
+        self.dropout2 = layers.Dropout(rate)
+
+    # call: standard transformer encoder block (self-attention + feed-forward, each with residual and layer norm)
+    def call(self, inputs, training):
+        attn_output = self.att(inputs, inputs)
+        if self.drop_chck:
+            attn_output = self.dropout1(attn_output, training=training)
+        out1 = self.layernorm1(inputs + attn_output)
+        ffn_output = self.ffn(out1)
+        ffn_output = self.dropout2(ffn_output, training=training)
+
+        return self.layernorm2(out1 + ffn_output)
+
+
+# Implement embedding layer
+# Two separate embedding layers: one for tokens, one for token index (positions).
+
+
+class TokenAndPositionEmbedding(layers.Layer):
+    def __init__(self, maxlen, vocab_size, embed_dim):
+        super(TokenAndPositionEmbedding, self).__init__()
+        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
+        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
+
+    def call(self, x):
+        maxlen = tf.shape(x)[-1]
+        positions = tf.range(start=0, limit=maxlen, delta=1)
+        positions = self.pos_emb(positions)
+        x = self.token_emb(x)
+        return x + positions
+
+
+def prep_text(texts, tokenizer, max_sequence_length):
+    # Turns text into padded sequences.
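+    # The tokenizer passed in is expected to have already been fit on the
+    # training SMILES (fit_on_texts is called in the run script).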
+    text_sequences = tokenizer.texts_to_sequences(texts)  # turn text into integer token sequences
+    return sequence.pad_sequences(text_sequences, maxlen=max_sequence_length)  # pad all sequences to the same length
+
+
+def model_architecture(embed_dim, num_heads, ff_dim, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch):
+
+    vocab_size = 40000  # number of possible 'words' in the SMILES data
+    maxlen = 250  # length of each SMILES sequence in the input
+    inputs = layers.Input(shape=(maxlen,))
+    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
+    x = embedding_layer(inputs)
+    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim, DR_TB, activation, dropout1)
+    # Apply the transformer block four times (the same instance is reused, so its weights are shared)
+    x = transformer_block(x)
+    x = transformer_block(x)
+    x = transformer_block(x)
+    x = transformer_block(x)
+
+    x = layers.Reshape((1, 32000), input_shape=(250, 128,))(
+        x
+    )  # reshaping increases parameters but improves accuracy a lot
+    x = layers.Dropout(DR_ff)(x)
+    x = layers.Dense(1024, activation=activation)(x)
+    x = layers.Dropout(DR_ff)(x)
+    x = layers.Dense(256, activation=activation)(x)
+    x = layers.Dropout(DR_ff)(x)
+    x = layers.Dense(64, activation=activation)(x)
+    x = layers.Dropout(DR_ff)(x)
+    x = layers.Dense(16, activation=activation)(x)
+    x = layers.Dropout(DR_ff)(x)
+    outputs = layers.Dense(1, activation=activation)(x)
+
+    model = keras.Model(inputs=inputs, outputs=outputs)
+
+    model.summary()
+
+    # Train and Evaluate
+
+    opt = Adam(learning_rate=lr)
+
+    # HVD: wrapping in DistributedOptimizer delegates gradient computation to the original
+    # optimizer, averages the gradients across ranks, and applies the averaged gradients
+    if hvd_switch:
+        opt = hvd.DistributedOptimizer(opt)
+
+    model.compile(
+        loss=loss_fn, optimizer=opt, metrics=["mae", r2]
+    )
+    return model
+
+
+def build_model(num_heads, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch):
+    # units = hp.Int("units", min_value=32, max_value=512, step=32)
+    embed_dim = 128
+    ff_dim = 128
+    # call existing model-building code with the hyperparameter values.
+    model = model_architecture(
+        embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, DR_TB=DR_TB, DR_ff=DR_ff,
+        activation=activation, dropout1=dropout1, lr=lr, loss_fn=loss_fn, hvd_switch=hvd_switch
+    )
+    return model
+
+
+def initialize_hvd(lr, x_train, y_train):
+    hvd.init()
+    print("I am rank %d of %d" % (hvd.rank(), hvd.size()))
+
+    # HVD-2: GPU pinning
+    gpus = tf.config.experimental.list_physical_devices('GPU')
+    # Pin one GPU to each rank
+    for gpu in gpus:
+        tf.config.experimental.set_memory_growth(gpu, True)
+    if gpus:
+        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
+
+    # Scale the learning rate by the number of workers and shard the training data
+    lr = lr * hvd.size()
+    x_train = np.array_split(x_train, hvd.size())
+    y_train = np.array_split(y_train, hvd.size())
+    return (lr, x_train, y_train)
+
+
+def implement_hvd(x_train, y_train):
+    # Each rank keeps only its own shard of the training data
+    x_train = x_train[hvd.rank()]
+    y_train = y_train[hvd.rank()]
+    return (x_train, y_train)
+
+
+def callback_setting(hvd_switch, checkpt_file, lr, csv_file, patience_red_lr, patience_early_stop):
+
+    checkpointer = ModelCheckpoint(
+        filepath=checkpt_file,  # "smile_regress.autosave.model.h5"
+        verbose=1,
+        save_weights_only=True,
+        save_best_only=True,
+    )
+
+    clr = CyclicLR(base_lr=lr, max_lr=5*lr, step_size=2000.)
+
+    csv_logger = CSVLogger(csv_file)  # "smile_regress.training.log"
+
+    # learning rate reduction when val_loss plateaus
+    # is it possible to do batch size tuning at each epoch as well?
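+    # Note: CyclicLR above rewrites the optimizer lr at the end of every batch,
+    # so any reduction applied here by ReduceLROnPlateau only lasts until the next batch.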
+    reduce_lr = ReduceLROnPlateau(
+        monitor="val_loss",
+        factor=0.75,
+        patience=patience_red_lr,  # 20
+        verbose=1,
+        mode="auto",
+        min_delta=0.0001,
+        cooldown=3,
+        min_lr=0.000000001,
+    )
+
+    early_stop = EarlyStopping(
+        monitor="val_loss",
+        patience=patience_early_stop,  # 100
+        verbose=1,
+        mode="auto",
+    )
+
+    if hvd_switch:
+        # HVD: broadcast initial variables from rank 0 to all other processes
+        hvd_broadcast = hvd.callbacks.BroadcastGlobalVariablesCallback(0)
+
+        callbacks = [hvd_broadcast, reduce_lr, clr]
+
+        # Only rank 0 logs, early-stops and checkpoints, to avoid clobbering files
+        if hvd.rank() == 0:
+            callbacks.append(csv_logger)
+            callbacks.append(early_stop)
+            callbacks.append(checkpointer)
+
+        return callbacks
+
+    else:
+        return [reduce_lr, clr, csv_logger, early_stop, checkpointer]
+
+
+def build_model_tuner(hp):
+    # units = hp.Int("units", min_value=32, max_value=512, step=32)
+    embed_dim = 128
+    num_heads = hp.Int("num_heads", min_value=12, max_value=40, step=4)
+    ff_dim = 128
+    DR_TB = hp.Float("DR_TB", min_value=0.025, max_value=0.5, step=0.025)
+    DR_ff = hp.Float("DR_ff", min_value=0.025, max_value=0.5, step=0.025)
+    activation = hp.Choice("activation", ["relu", "elu", "gelu"])
+    # activation = "elu"
+    dropout1 = hp.Boolean("dropout_aftermulti")
+    lr = hp.Float("lr", min_value=1e-6, max_value=1e-5, step=1e-6)
+    loss_fn = hp.Choice("loss_fn", ["mean_squared_error", "mean_absolute_error"])
+    # call existing model-building code with the hyperparameter values.
+    # (the tuner path does not use Horovod, so hvd_switch is fixed to False)
+    model = model_architecture(
+        embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, DR_TB=DR_TB, DR_ff=DR_ff,
+        activation=activation, dropout1=dropout1, lr=lr, loss_fn=loss_fn, hvd_switch=False
+    )
+    return model
+
+
+# tfm.optimization.lars_optimizer.LARS(
+#     learning_rate = 0.0000025,
+#     momentum = 0.9,
+#     weight_decay_rate = 0.0,
+#     eeta = 0.001,
+#     nesterov = False,
+#     classic_momentum = True,
+#     exclude_from_weight_decay = None,
+#     exclude_from_layer_adaptation = None,
+#     name = 'LARS',
+# )
diff --git a/Pilot1/ST1/smiles_regress_transformer_run_hvd.py b/Pilot1/ST1/smiles_regress_transformer_run_hvd.py
new file mode 100644
index 00000000..06e1c4fa
--- /dev/null
+++ b/Pilot1/ST1/smiles_regress_transformer_run_hvd.py
@@ -0,0 +1,127 @@
+############# Module Loading ##############
+import argparse
+import os
+import numpy as np
+import matplotlib
+import pandas as pd
+
+matplotlib.use("Agg")
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import backend as K
+from tensorflow.keras import layers
+from tensorflow.keras.callbacks import (
+    CSVLogger,
+    EarlyStopping,
+    ModelCheckpoint,
+    ReduceLROnPlateau,
+)
+
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.preprocessing import sequence, text
+import horovod.keras as hvd  # Horovod: data parallelization across worker ranks
+import keras_tuner
+
+from clr_callback import *
+from smiles_regress_transformer_funcs_hvd import *
+
+####### Argument parsing #############
+
+file_path = os.path.dirname(os.path.realpath(__file__))
+
+# psr and args take input outside of the script and assign:
+# (1) file paths for data_path_train and data_path_vali
+# (2) number of training epochs
+
+psr = argparse.ArgumentParser(description="input csv file")
+psr.add_argument("--in_train", default="in_train")
+psr.add_argument("--in_vali", default="in_vali")
+psr.add_argument("--ep", type=int, default=400)
+psr.add_argument("--num_heads", type=int, default=16)
+psr.add_argument("--DR_TB", type=float, default=0.1)
+psr.add_argument("--DR_ff", type=float, default=0.1)
+psr.add_argument("--activation", default="activation")
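+# Note: a plain type=bool would turn any non-empty string (including "False") into True,
+# so the boolean flags below convert the string explicitly.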
+psr.add_argument("--drop_post_MHA", type=lambda s: s.lower() == "true", default=True)
+psr.add_argument("--lr", type=float, default=1e-5)
+psr.add_argument("--loss_fn", default="mean_squared_error")
+psr.add_argument("--hvd_switch", type=lambda s: s.lower() == "true", default=True)
+
+args = vars(psr.parse_args())  # returns a dictionary of the parsed arguments
+
+######## Set hyperparameters ########
+
+EPOCH = args["ep"]
+num_heads = args["num_heads"]
+DR_TB = args["DR_TB"]
+DR_ff = args["DR_ff"]
+activation = args["activation"]
+dropout1 = args["drop_post_MHA"]
+lr = args["lr"]
+loss_fn = args["loss_fn"]
+BATCH = 32  # batch size used for training
+vocab_size = 40000
+maxlen = 250
+# act_fn = 'elu'
+# embed_dim = 128  # Embedding size for each token
+# num_heads = 16  # Number of attention heads
+# ff_dim = 128  # Hidden layer size in feed forward network inside transformer
+checkpt_file = "smile_regress.autosave.model.h5"
+csv_file = "smile_regress.training.log"
+patience_red_lr = 20
+patience_early_stop = 100
+hvd_switch = args["hvd_switch"]
+
+######## Create training and validation data ########
+
+# x: tokenized sequence data, y: single-value dock score
+data_path_train = args["in_train"]
+data_path_vali = args["in_vali"]
+
+data_train = pd.read_csv(data_path_train)
+data_vali = pd.read_csv(data_path_vali)
+
+data_train.head()
+# Dataset has "type" and "smiles" as the two fields
+# reshaping: y formatted as [[y_1], [y_2], ...] with floats
+y_train = data_train["type"].values.reshape(-1, 1) * 1.0
+y_val = data_vali["type"].values.reshape(-1, 1) * 1.0
+
+tokenizer = text.Tokenizer(num_words=vocab_size)
+tokenizer.fit_on_texts(data_train["smiles"])
+
+x_train = prep_text(data_train["smiles"], tokenizer, maxlen)
+x_val = prep_text(data_vali["smiles"], tokenizer, maxlen)
+
+######## Implement horovod if necessary ########
+if hvd_switch:
+    lr, x_train, y_train = initialize_hvd(lr, x_train, y_train)
+    x_train, y_train = implement_hvd(x_train, y_train)
+
+
+######## Build model #############
+
+model = build_model(num_heads, DR_TB, DR_ff, activation, dropout1, lr, loss_fn, hvd_switch)
+
+####### Set callbacks ##############
+callbacks = callback_setting(
+    hvd_switch,
+    checkpt_file,
+    lr,
+    csv_file,
+    patience_red_lr,
+    patience_early_stop
+)
+
+####### Train model! #########
+
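+# With Horovod enabled, each rank trains on its own shard of x_train/y_train
+# (see initialize_hvd/implement_hvd); the validation set is not sharded, so
+# every rank evaluates the full validation data.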
+history = model.fit(
+    x_train,
+    y_train,
+    batch_size=BATCH,
+    epochs=EPOCH,
+    verbose=1,
+    validation_data=(x_val, y_val),
+    callbacks=callbacks,
+)
diff --git a/Pilot1/ST1/sub_hvd.sh b/Pilot1/ST1/sub_hvd.sh
new file mode 100755
index 00000000..619bc602
--- /dev/null
+++ b/Pilot1/ST1/sub_hvd.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+module load conda/2022-09-08
+conda activate
+
+DATA_PATH=/grand/datascience/avasan/ST_Benchmarks/Data/1M-flatten
+
+TFIL=ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.train
+VFIL=ml.3CLPro_7BQY_A_1_F.Orderable_zinc_db_enaHLL.sorted.4col.dd.parquet.xform-smiles.csv.reg.val
+
+EP=400
+NUMHEAD=16
+DR_TB=0.1
+DR_ff=0.1
+
+ACT=elu
+DROP=False
+LR=0.0000025
+LOSS=mean_squared_error
+HVDSWITCH=True
+
+if [ "$HVDSWITCH" = "False" ]; then
+    python smiles_regress_transformer_run_hvd.py --in_train ${DATA_PATH}/${TFIL} --in_vali ${DATA_PATH}/${VFIL} --ep $EP --num_heads $NUMHEAD --DR_TB $DR_TB --DR_ff $DR_ff --activation $ACT --drop_post_MHA $DROP --lr $LR --loss_fn $LOSS --hvd_switch $HVDSWITCH
+
+else
+    NP=8
+    PPN=4
+    OUT=logfile.log
+    mpiexec --np $NP -ppn $PPN --cpu-bind verbose,list:0,1,2,3,4,5,6,7 -env NCCL_COLLNET_ENABLE=1 -env NCCL_NET_GDR_LEVEL=PHB python smiles_regress_transformer_run_hvd.py --in_train ${DATA_PATH}/${TFIL} --in_vali ${DATA_PATH}/${VFIL} --ep $EP --num_heads $NUMHEAD --DR_TB $DR_TB --DR_ff $DR_ff --activation $ACT --drop_post_MHA $DROP --lr $LR --loss_fn $LOSS --hvd_switch $HVDSWITCH > $OUT
+
+fi
diff --git a/Pilot1/Uno/README.md b/Pilot1/Uno/README.md
index 884a96fb..58163890 100644
--- a/Pilot1/Uno/README.md
+++ b/Pilot1/Uno/README.md
@@ -269,3 +269,41 @@ Current time ....23512.750
 ```
 python uno_infer.py --data All.h5 --model_file model.h5 --n_pred 30
 ```
+
+## top21 Plans
+
+Top21 plans may be generated in JSON format with `plangen.py`.
+
+JSON plans may be converted to a TXT format via
+`topN_to_uno.py --convert`
+
+The TXT format is as follows (comments are not actually supported yet):
+
+```
+metadata:
+key1: value1 # Some number of key value pairs
+key2: value2
+...
+
+node: # Some NODE ID, e.g., 1.2.3
+val_sets: 1 # Number of validation sets
+index: 0 # The current validation set index
+val CELLS: count: 175 # Number of CELLs in the validation set
+  CCL_100 CCL_1000 CCL_1001 CCL_1002 # CCLs up to a blank line
+  CCL_1013 CCL_1016 CCL_1017 CCL_1020
+
+train_sets: 3 # Number of training sets
+index: 0 # The current training set index
+train CELLS: count: # Number of CELLs in the training set
+  ... # CCLs as above
+
+index: 1
+train CELLS: count:
+  ... # CCLs as above
+
+node: # Start next node...
+
+```
+
+The TXT format is much faster to load and search with, e.g.,
+`get-node-txt.sh`.
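+
+As a rough illustration, a TXT plan can be scanned with a few lines of Python.
+The sketch below is based only on the layout shown above; the function name and
+the case-insensitive matching are illustrative, not part of the workflow scripts:
+
+```python
+# Sketch: count validation CELLs per node in a TXT plan.
+from collections import defaultdict
+
+def val_cells_per_node(path):
+    counts = defaultdict(int)
+    node, in_val = None, False
+    with open(path) as fh:
+        for raw in fh:
+            line = raw.rstrip("\n")
+            if line.startswith("node:"):
+                node = line.split(":", 1)[1].strip()
+                in_val = False
+            elif line.lower().startswith("val cells:"):
+                in_val = True
+            elif not line.strip():
+                in_val = False  # a blank line ends the CELL list
+            elif line.startswith(" ") and in_val and node is not None:
+                counts[node] += len(line.split())  # indented lines list CELL IDs
+    return dict(counts)
+```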