Merge pull request #73 from Arc-rendezvous/issue-66

Issue 66
gunawanlg · Apr 10, 2020 · 04b3d44 · 04b3d44
2 parents 2fc284f + 865116f
commit 04b3d44
Show file tree

Hide file tree

Showing 24 changed files with 34,883 additions and 8 deletions.
diff --git a/gurih/models/model.py b/gurih/models/model.py
@@ -12,7 +12,7 @@
 import tensorflow.keras.backend as K
 from tensorflow.keras import Input
 from tensorflow.keras.models import Model, model_from_json
-from tensorflow.keras.layers import Lambda, Dense, LSTM
+from tensorflow.keras.layers import Lambda, Dense, LSTM, Dropout
 from tensorflow.keras.layers import Masking, Conv1D, Bidirectional, TimeDistributed
 # from tensorflow.keras.metrics import Precision, Recall
 from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
@@ -69,6 +69,10 @@ def __init__(self, dir_path="../../models/", doc_path="../../docs/"):
         self.name = None
         self._show_summary()
 
+    def _create(self):
+        """Create model architecture; not implemented here, subclasses override it."""
+        raise NotImplementedError
+
     def compile(self):
         """Model.compile; not implemented here, subclasses are expected to override it."""
         raise NotImplementedError
@@ -478,3 +482,90 @@ def _predict(self, X_test):
         ctc_matrix = pred_func(X_test)[0]
 
         return ctc_matrix
+
+
+class BaselineASRModelV2(BaselineASRModel):
+    """
+    Baseline CTC ASR model extended with a stack of hidden dense layers.
+
+    Architecture built by `_create`: Masking -> Conv1D -> three
+    (TimeDistributed Dense + Dropout) stages -> Bidirectional LSTM ->
+    TimeDistributed softmax, plus a Lambda layer that computes the CTC loss.
+
+    Parameters
+    ----------
+    input_shape
+        forwarded to BaselineASRModel
+    vocab_len
+        forwarded to BaselineASRModel
+    n_dense : int
+        number of units of each hidden dense layer
+    n_lstm_units : int
+        forwarded to BaselineASRModel
+    filters : int
+        forwarded to BaselineASRModel
+    **kwargs
+        forwarded to BaselineASRModel
+    """
+    def __init__(self,
+                 input_shape,
+                 vocab_len,
+                 n_dense=512,
+                 n_lstm_units=256,
+                 filters=256,
+                 **kwargs):
+        # Assigned before delegating so that _create() can already read it
+        # should a parent constructor build the model eagerly.
+        self._n_dense = n_dense
+        super().__init__(input_shape,
+                         vocab_len,
+                         n_lstm_units=n_lstm_units,
+                         filters=filters,
+                         **kwargs)
+
+    def _create(self):
+        """
+        Create the baseline ASR with CTC Model.
+
+        Sets `self.model` (training graph whose output is the CTC loss) and
+        `self.name`, prints the summary of the acoustic part of the network
+        and, when `self._training` is True, also writes that summary to
+        "summary.txt" under `self._doc_path`.
+
+        Relies on attributes that are not set in this class (`input_shape`,
+        `vocab_len`, `_filters`, `_kernel_size`, `_strides`, `_padding`,
+        `_n_lstm_units`, `_training`, `_doc_path`).
+        """
+        def _ctc_lambda_func(args):
+            """Lambda function to calculate CTC loss in keras"""
+            y_pred, labels, input_length, label_length = args
+            return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
+
+        # Calculate output shape as len of vocab +1 for CTC blank token
+        output_shape = self.vocab_len + 1
+
+        input_in = Input(shape=self.input_shape, name="the_input")
+        mask = Masking(mask_value=0, name="masking")(input_in)
+        hidden = Conv1D(self._filters,
+                        self._kernel_size,
+                        strides=self._strides,
+                        padding=self._padding,
+                        activation='relu',
+                        name="conv1")(mask)
+
+        # Three identical dense stages, each followed by light dropout.
+        for stage in (1, 2, 3):
+            hidden = TimeDistributed(Dense(self._n_dense, activation='relu'),
+                                     name="dense{}".format(stage))(hidden)
+            hidden = Dropout(0.1)(hidden, training=self._training)
+
+        biLSTM = Bidirectional(LSTM(self._n_lstm_units,
+                                    return_sequences=True,
+                                    activation='tanh'),
+                               name="bidirectional")(hidden)
+        y_pred = TimeDistributed(Dense(output_shape, activation='softmax'),
+                                 name="the_output")(biLSTM)
+
+        labels = Input(shape=[None], dtype='float32', name="the_labels")
+        input_length = Input(shape=[1], dtype='int32', name="input_length")
+        label_length = Input(shape=[1], dtype='int32', name="label_length")
+        loss_out = Lambda(_ctc_lambda_func,
+                          output_shape=(1,),
+                          name='ctc')([y_pred, labels, input_length, label_length])
+
+        self.model = Model(inputs=[input_in, labels, input_length, label_length],
+                           outputs=[loss_out])
+
+        # The `ndense` tag carries the hidden dense width so that models that
+        # differ only in n_dense get distinct names.
+        model_name = '_'.join(["BaselineASR",
+                               'f' + str(self._filters),
+                               'k' + str(self._kernel_size),
+                               's' + str(self._strides),
+                               'p' + self._padding,
+                               'nlstm' + str(self._n_lstm_units),
+                               'ndense' + str(self._n_dense)])
+        self.name = model_name
+
+        # See the model summary before calculating custom CTC loss
+        # for clarity of the architecture of the model
+        tmp_model = Model(inputs=input_in, outputs=y_pred)
+        tmp_model._name = model_name
+        tmp_model.summary()
+        if self._training is True:
+            with open(self._doc_path + "summary.txt", 'w') as f:
+                with redirect_stdout(f):
+                    tmp_model.summary()
+
+
+class Seq2SeqModel(_BaseModel):
+    """
+    Model wrapper on top of _BaseModel that additionally stores a latent size.
+
+    NOTE(review): the only base-class constructor signature visible next to
+    this change is `__init__(self, dir_path=..., doc_path=...)`. If that is
+    `_BaseModel.__init__`, the positional `input_shape` and `vocab_len` below
+    would be bound to `dir_path` and `doc_path` -- confirm against _BaseModel.
+    """
+    def __init__(self, input_shape, vocab_len, latent_dim=256, **kwargs):
+        super().__init__(input_shape, vocab_len, **kwargs)
+        # Only stored here; nothing visible in this class reads it.
+        self.latent_dim = latent_dim
diff --git a/gurih/models/my_keras_layers.py b/gurih/models/my_keras_layers.py
@@ -2,8 +2,11 @@
 # KERAS CUSTOM LAYER COMPONENT #
 ################################
 
-from tensorflow.keras import Model
-from tensorflow.keras.layers import Conv1D, LSTM, Bidirectional, Dense
+import tensorflow as tf
+import tensorflow.keras.backend as K
+from tensorflow.keras.layers import Layer, Lambda, Concatenate, Activation, Dot, RepeatVector
+from tensorflow.keras.layers import Conv1D, LSTM, Bidirectional, Dense, MaxPool1D, SimpleRNN
+from tensorflow.keras.models import Model
 
 
 class BaselineASR(Model):
@@ -29,3 +32,205 @@ def call(self, x):
         x = self.bilstm(x)
         x = self.dense(x)
         return x
+
+
+class BasicASREncoder(Model):
+    """Encoder made of one bidirectional LSTM that also returns its states."""
+    def __init__(self, vocab_len, n_lstm, batch_size):
+        super().__init__()
+        self.vocab_len = vocab_len
+        self.n_lstm = n_lstm
+        self.batch_size = batch_size
+
+        # recurrent_initializer could also be 'glorot_uniform'
+        recurrent = LSTM(self.n_lstm,
+                         return_sequences=True,
+                         return_state=True,
+                         recurrent_initializer='orthogonal')
+        self.bilstm = Bidirectional(recurrent)
+
+    def call(self, X, hidden):
+        """
+        Run the bidirectional LSTM over the audio sequence.
+
+        Parameters
+        ----------
+        X : shape=(m, Tx, n_mfcc)
+            audio sequence input
+        hidden : list of tensors
+            initial state handed to the bidirectional LSTM
+
+        Returns
+        -------
+        output : shape=(m, Tx, 2*n_lstm)
+            per-step output, forward and backward halves concatenated
+            (default merge mode of Bidirectional)
+        state : list of tensors, each shape=(m, n_lstm)
+            everything the layer returns after the output, i.e. the final
+            states of both directions
+        """
+        results = self.bilstm(X, initial_state=hidden)
+        return results[0], list(results[1:])
+
+
+class BahdanauAttention(tf.keras.Model):
+    """
+    Additive attention: score = V(tanh(W1(query) + W2(values))).
+
+    All three projections are Dense layers without activation.
+    """
+    def __init__(self, n_dense):
+        super().__init__()
+        self.n_dense = n_dense
+        self.W1 = Dense(n_dense)
+        self.W2 = Dense(n_dense)
+        self.V = Dense(1)
+
+    def call(self, query, values):
+        """
+        Parameters
+        ----------
+        query : shape=(batch_size, hidden_size)
+            query hidden state
+        values : shape=(batch_size, max_length, hidden_size)
+            sequence that is attended over
+
+        Returns
+        -------
+        context_vector : shape=(batch_size, hidden_size)
+            attention-weighted sum of `values` over the time axis
+        attention_weights : shape=(batch_size, max_length, 1)
+            softmax of the scores over the time axis
+        """
+        # Insert a time axis so the query broadcasts against every time step.
+        projected_query = self.W1(tf.expand_dims(query, 1))
+        projected_values = self.W2(values)
+
+        # (batch_size, max_length, units) -> (batch_size, max_length, 1)
+        score = self.V(tf.nn.tanh(projected_query + projected_values))
+        attention_weights = tf.nn.softmax(score, axis=1)
+
+        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
+        return context_vector, attention_weights
+
+
+class PStacker(Layer):
+    """
+    Halve the time axis by pairing neighbouring steps on the feature axis.
+
+    Steps 0, 2, 4, ... and steps 1, 3, 5, ... of axis 1 are sliced out and
+    concatenated along the last axis. With an odd number of time steps the
+    two slices differ in length by one, so an even length is assumed.
+    """
+    def __init__(self):
+        super().__init__()
+        self.even = Lambda(lambda seq: seq[:, 0::2, :])
+        self.odd = Lambda(lambda seq: seq[:, 1::2, :])
+        self.concat = Concatenate(axis=-1)
+
+    def call(self, inputs):
+        """Return the even-step and odd-step slices joined on the last axis."""
+        halves = [self.even(inputs), self.odd(inputs)]
+        return self.concat(halves)
+
+
+class MLPAttention(Layer):
+    """
+    Four stacked Dense layers of width n_dense, n_dense//2, n_dense//4 and 1.
+
+    Only the last layer has an activation (relu).
+    """
+    def __init__(self, n_dense):
+        super().__init__()
+        self.densor_1 = Dense(n_dense)
+        self.densor_2 = Dense(n_dense // 2)
+        self.densor_3 = Dense(n_dense // 4)
+        self.densor_4 = Dense(1, activation='relu')
+
+    def call(self, X):
+        """Feed X through the four dense layers in order."""
+        hidden = X
+        for layer in (self.densor_1, self.densor_2, self.densor_3, self.densor_4):
+            hidden = layer(hidden)
+        return hidden
+
+
+class MLPOutput(Layer):
+    """Two stacked relu Dense layers of width n_dense and n_dense//2."""
+    def __init__(self, n_dense):
+        super().__init__()
+        self.densor_1 = Dense(n_dense, activation='relu')
+        self.densor_2 = Dense(n_dense // 2, activation='relu')
+
+    def call(self, X):
+        """Feed X through both dense layers in order."""
+        hidden = self.densor_1(X)
+        return self.densor_2(hidden)
+
+
+class LuongAttention(Model):
+    """
+    Attention block that scores every encoder step against the decoder states.
+
+    `call` takes a list `[encoder_outputs, *decoder_prev_states]`. The decoder
+    states are concatenated on the last axis, repeated along the time axis,
+    concatenated with the encoder outputs and scored by an MLPAttention block.
+
+    NOTE(review): assumes encoder_outputs is (m, Tx, features) with a static
+    Tx and that every decoder state is (m, units) -- confirm with callers.
+    """
+    def __init__(self, n_dense):
+        super(LuongAttention, self).__init__()
+        self.concatenator = Concatenate(axis=-1)
+        self.densor = MLPAttention(n_dense)
+        # The scores end in an axis of size 1, so the softmax must normalise
+        # over the time axis (axis=1). A softmax over the last axis would make
+        # every attention weight exactly 1.
+        self.activator = Activation(lambda scores: tf.nn.softmax(scores, axis=1),
+                                    name='attention_weights')
+        self.dotor = Dot(axes=1)
+
+    def call(self, inputs):
+        """
+        Parameters
+        ----------
+        inputs : list
+            `[encoder_outputs, *decoder_prev_states]`
+
+        Returns
+        -------
+        context_vector
+            dot product over the time axis of the attention weights and the
+            encoder outputs
+        alphas
+            attention weights, softmax of the scores over the time axis
+        """
+        encoder_outputs, *decoder_prev_states = inputs
+        Tx = K.int_shape(encoder_outputs)[1]
+
+        # Broadcast the joined decoder states to every encoder time step.
+        decoder_prev_states = self.concatenator(decoder_prev_states)
+        decoder_prev_states = RepeatVector(Tx)(decoder_prev_states)
+        concat = self.concatenator([encoder_outputs, decoder_prev_states])
+
+        e = self.densor(concat)
+        alphas = self.activator(e)
+        context_vector = self.dotor([alphas, encoder_outputs])
+
+        return context_vector, alphas
+
+
+class EncoderLSTM(Model):
+    """
+    Three bidirectional LSTMs with a PStacker after the first two.
+
+    The LSTM widths are n_lstm//4, n_lstm//2 and n_lstm; only the last one
+    returns its states.
+    """
+    def __init__(self, n_lstm):
+        super().__init__()
+        self.pstack = PStacker()
+        self.encoder_1 = Bidirectional(LSTM(n_lstm // 4, return_sequences=True))
+        self.encoder_2 = Bidirectional(LSTM(n_lstm // 2, return_sequences=True))
+        self.encoder_3 = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))
+
+    def call(self, inputs):
+        """Return the last layer's output sequence and the list of its states."""
+        hidden = self.encoder_1(inputs)
+        hidden = self.pstack(hidden)
+        hidden = self.encoder_2(hidden)
+        hidden = self.pstack(hidden)
+
+        encoder_outputs, *encoder_states = self.encoder_3(hidden)
+        return encoder_outputs, encoder_states
+
+
+class DecoderLSTM(Model):
+    """Two stacked LSTMs followed by an MLPOutput block and a softmax layer."""
+    def __init__(self, n_lstm, n_dense, vocab_len):
+        super().__init__()
+        self.lstm_1 = LSTM(n_lstm, return_sequences=True, return_state=True)
+        self.lstm_2 = LSTM(n_lstm, return_sequences=True, return_state=True)
+        self.mlp = MLPOutput(n_dense)
+        self.dense = Dense(vocab_len, activation='softmax')
+
+    def call(self, inputs):
+        """
+        Parameters
+        ----------
+        inputs : list
+            `[context_vector, *initial_states]`; the first two states seed
+            the first LSTM, the next two seed the second LSTM
+
+        Returns
+        -------
+        outputs
+            softmax output of the final dense layer
+        states : list
+            states returned by the first LSTM followed by those of the second
+        """
+        context_vector, *initial_states = inputs
+        first_seed = initial_states[0:2]
+        second_seed = initial_states[2:4]
+
+        lstm_1_output, *lstm_1_states = self.lstm_1(context_vector, initial_state=first_seed)
+        lstm_2_output, *lstm_2_states = self.lstm_2(lstm_1_output, initial_state=second_seed)
+
+        outputs = self.dense(self.mlp(lstm_2_output))
+        return outputs, lstm_1_states + lstm_2_states
+
+
+# Below is an implementation of the SpecAugment paper
+class EncoderLSTMSpecAugment(Model):
+    """
+    Two (Conv1D + MaxPool1D) stages followed by four bidirectional LSTMs.
+
+    Only the last LSTM returns its states.
+    """
+    def __init__(self, n_lstm, n_filters=256, kernel_size=11):
+        super().__init__()
+        self.n_lstm = n_lstm
+        self.n_filters = n_filters
+        self.kernel_size = kernel_size
+        self.cnn_1 = Conv1D(n_filters, kernel_size, strides=1, padding='same')
+        self.max_pool_1 = MaxPool1D()
+        self.cnn_2 = Conv1D(n_filters, kernel_size, strides=1, padding='same')
+        self.max_pool_2 = MaxPool1D()
+        self.lstm_1 = Bidirectional(LSTM(n_lstm, return_sequences=True))
+        self.lstm_2 = Bidirectional(LSTM(n_lstm, return_sequences=True))
+        self.lstm_3 = Bidirectional(LSTM(n_lstm, return_sequences=True))
+        self.lstm_4 = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))
+
+    def call(self, inputs):
+        """Return the last LSTM's output sequence and the list of its states."""
+        # Convolutional front end
+        hidden = self.cnn_1(inputs)
+        hidden = self.max_pool_1(hidden)
+        hidden = self.cnn_2(hidden)
+        hidden = self.max_pool_2(hidden)
+
+        # Recurrent stack
+        hidden = self.lstm_1(hidden)
+        hidden = self.lstm_2(hidden)
+        hidden = self.lstm_3(hidden)
+        encoder_outputs, *encoder_states = self.lstm_4(hidden)
+
+        return encoder_outputs, encoder_states
+
+
+class DecoderLSTMSpecAugment(Model):
+    """Two stacked SimpleRNN layers followed by a softmax Dense layer."""
+    def __init__(self, n_rnn, vocab_len):
+        super().__init__()
+        self.rnn_1 = SimpleRNN(n_rnn, return_sequences=True, return_state=True)
+        self.rnn_2 = SimpleRNN(n_rnn, return_sequences=True, return_state=True)
+        self.dense = Dense(vocab_len, activation='softmax')
+
+    def call(self, inputs):
+        """
+        Parameters
+        ----------
+        inputs : list
+            `[context_vector, *initial_states]`; the first state seeds the
+            first RNN and the second state seeds the second RNN
+
+        Returns
+        -------
+        outputs
+            softmax output of the dense layer
+        states : list
+            final state of the first RNN and of the second RNN
+        """
+        context_vector, *initial_states = inputs
+        first_seed, second_seed = initial_states[0], initial_states[1]
+
+        # Each RNN takes a single state tensor; presumably these are the
+        # cell states of the encoder LSTM -- confirm against the caller.
+        rnn_1_output, rnn_1_states = self.rnn_1(context_vector, initial_state=first_seed)
+        rnn_2_output, rnn_2_states = self.rnn_2(rnn_1_output, initial_state=second_seed)
+
+        return self.dense(rnn_2_output), [rnn_1_states, rnn_2_states]
diff --git a/gurih/models/my_keras_losses.py b/gurih/models/my_keras_losses.py
@@ -0,0 +1,21 @@
+import tensorflow as tf
+
+
+class CTCLoss(tf.keras.losses.Loss):
+    """
+    Keras loss wrapping `tf.nn.ctc_loss` with the blank as the last class.
+
+    Every sequence is given the full padded length: the logit length is
+    axis 1 of `y_pred` and the label length is axis 1 of `y_true`, each
+    repeated along axis 0.
+    NOTE(review): taking axis 0 as batch and axis 1 as time looks
+    batch-major regardless of `logits_time_major` -- confirm.
+    """
+    def __init__(self, logits_time_major=False,
+                 reduction=tf.keras.losses.Reduction.SUM, name='ctc'):
+        super(CTCLoss, self).__init__(reduction=reduction, name=name)
+        self.logits_time_major = logits_time_major
+
+    def call(self, y_true, y_pred):
+        """Return the mean CTC loss over the batch as a scalar tensor."""
+        labels = tf.cast(y_true, tf.int32)
+        pred_shape = tf.shape(y_pred)
+        label_shape = tf.shape(labels)
+
+        per_sequence = tf.nn.ctc_loss(
+            labels=labels,
+            logits=y_pred,
+            label_length=tf.fill([label_shape[0]], label_shape[1]),
+            logit_length=tf.fill([pred_shape[0]], pred_shape[1]),
+            logits_time_major=self.logits_time_major,
+            blank_index=-1)
+        return tf.reduce_mean(per_sequence)
diff --git a/gurih/models/my_keras_metrics.py b/gurih/models/my_keras_metrics.py
@@ -0,0 +1,42 @@
+import tensorflow as tf
+
+
+def CER(y_true, y_pred):
+    """
+    Character error rate of a batch, ignoring padding of 0s.
+    Score returned is the summed Levenshtein edit distance of the batch
+    divided by the total number of non-padding characters in the reference
+    truth. Class index 0 is treated as padding in both tensors.
+
+    Parameters
+    ----------
+    y_pred : Tensor[shape=(batch_size, prediction_seq_length, num_classes)]
+        softmax output from model
+    y_true : Tensor[shape=(batch_size, labels_seq_length, num_classes)]
+        one-hot vector of ground truth y_true
+
+    Returns
+    -------
+    cer : float,
+        character error rate; 0 when the reference holds no characters
+    """
+    y_pred = tf.cast(tf.argmax(y_pred, axis=-1), tf.int32)
+    pred_idx = tf.where(tf.not_equal(y_pred, 0))
+    sparse_outputs = tf.SparseTensor(pred_idx,
+                                     tf.gather_nd(y_pred, pred_idx),
+                                     tf.shape(y_pred, out_type=tf.int64))
+
+    y_true = tf.cast(tf.argmax(y_true, axis=-1), tf.int32)
+    label_idx = tf.where(tf.not_equal(y_true, 0))
+    label_sparse_outputs = tf.SparseTensor(label_idx,
+                                           tf.gather_nd(y_true, label_idx),
+                                           tf.shape(y_true, out_type=tf.int64))
+
+    distance = tf.reduce_sum(
+        tf.edit_distance(sparse_outputs, label_sparse_outputs, normalize=False)
+    )
+
+    # One row of label_idx per non-padding reference character, so its row
+    # count is the total reference length of the whole batch. Dividing by
+    # the padded length of a single sequence would count padding and ignore
+    # the batch size.
+    reference_length = tf.cast(tf.shape(label_idx)[0], tf.float32)
+
+    # divide_no_nan returns 0 instead of inf/nan for an empty reference.
+    cer = tf.math.divide_no_nan(distance, reference_length)
+    return cer