Skip to content

Commit

Permalink
Merge pull request #73 from Arc-rendezvous/issue-66
Browse files Browse the repository at this point in the history
Issue 66
  • Loading branch information
gunawanlg authored Apr 10, 2020
2 parents 2fc284f + 865116f commit 04b3d44
Show file tree
Hide file tree
Showing 24 changed files with 34,883 additions and 8 deletions.
93 changes: 92 additions & 1 deletion gurih/models/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import tensorflow.keras.backend as K
from tensorflow.keras import Input
from tensorflow.keras.models import Model, model_from_json
from tensorflow.keras.layers import Lambda, Dense, LSTM
from tensorflow.keras.layers import Lambda, Dense, LSTM, Dropout
from tensorflow.keras.layers import Masking, Conv1D, Bidirectional, TimeDistributed
# from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
Expand Down Expand Up @@ -69,6 +69,10 @@ def __init__(self, dir_path="../../models/", doc_path="../../docs/"):
self.name = None
self._show_summary()

def _create(self):
"""Create model architecture"""
raise NotImplementedError

def compile(self):
    """Compile the model (counterpart of Keras ``Model.compile``).

    This base implementation is only a placeholder: it always raises,
    so each concrete model has to supply its own version.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError()
Expand Down Expand Up @@ -478,3 +482,90 @@ def _predict(self, X_test):
ctc_matrix = pred_func(X_test)[0]

return ctc_matrix


class BaselineASRModelV2(BaselineASRModel):
    """Baseline CTC speech model with a stack of dense layers before the BiLSTM.

    The network built by ``_create`` is:
    Masking -> Conv1D -> 3 x (TimeDistributed Dense + Dropout) ->
    Bidirectional LSTM -> TimeDistributed softmax, wrapped with a CTC
    loss ``Lambda`` layer.

    Parameters
    ----------
    input_shape :
        forwarded unchanged to ``BaselineASRModel``
    vocab_len :
        forwarded unchanged to ``BaselineASRModel``
    n_dense : int, default 512
        number of units in each of the three intermediate Dense layers
    n_lstm_units : int, default 256
        forwarded to ``BaselineASRModel``
    filters : int, default 256
        forwarded to ``BaselineASRModel``
    **kwargs
        forwarded to ``BaselineASRModel``
    """

    def __init__(self,
                 input_shape,
                 vocab_len,
                 n_dense=512,
                 n_lstm_units=256,
                 filters=256,
                 **kwargs):
        super().__init__(input_shape,
                         vocab_len,
                         n_lstm_units=n_lstm_units,
                         filters=filters,
                         **kwargs)
        # NOTE(review): this is assigned after the parent initializer has
        # run; if the parent ever calls ``_create`` from ``__init__`` the
        # attribute would not exist yet -- confirm against the parent class.
        self._n_dense = n_dense

    def _create(self):
        """Build the CTC training graph and store it on the instance.

        Sets ``self.model`` (inputs: features, labels, input length,
        label length; output: CTC loss) and ``self.name``. Prints the
        summary of the acoustic part of the network and, when
        ``self._training is True``, also writes that summary to
        ``self._doc_path + "summary.txt"``.
        """
        def _ctc_lambda_func(args):
            """Compute the batch CTC cost from the packed tensors."""
            y_pred, labels, input_length, label_length = args
            return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

        # One extra output class for the CTC blank token.
        n_classes = self.vocab_len + 1

        input_in = Input(shape=self.input_shape, name="the_input")
        hidden = Masking(mask_value=0, name="masking")(input_in)
        hidden = Conv1D(self._filters,
                        self._kernel_size,
                        strides=self._strides,
                        padding=self._padding,
                        activation='relu',
                        name="conv1")(hidden)

        # Three identical Dense + Dropout stages named dense1..dense3.
        for stage in (1, 2, 3):
            hidden = TimeDistributed(Dense(self._n_dense, activation='relu'),
                                     name="dense" + str(stage))(hidden)
            hidden = Dropout(0.1)(hidden, training=self._training)

        hidden = Bidirectional(LSTM(self._n_lstm_units,
                                    return_sequences=True,
                                    activation='tanh'),
                               name="bidirectional")(hidden)
        y_pred = TimeDistributed(Dense(n_classes, activation='softmax'),
                                 name="the_output")(hidden)

        labels = Input(shape=[None], dtype='float32', name="the_labels")
        input_length = Input(shape=[1], dtype='int32', name="input_length")
        label_length = Input(shape=[1], dtype='int32', name="label_length")
        loss_out = Lambda(_ctc_lambda_func,
                          output_shape=(1,),
                          name='ctc')([y_pred, labels, input_length, label_length])

        # NOTE(review): the prefix is "BaselineASR" and the 'ndense' suffix
        # carries vocab_len rather than self._n_dense -- confirm this is
        # intended, since the name does not reflect the V2-specific setting.
        model_name = '_'.join(["BaselineASR",
                               'f' + str(self._filters),
                               'k' + str(self._kernel_size),
                               's' + str(self._strides),
                               'p' + self._padding,
                               'nlstm' + str(self._n_lstm_units),
                               'ndense' + str(self.vocab_len)])

        self.model = Model(inputs=[input_in, labels, input_length, label_length],
                           outputs=[loss_out])
        self.name = model_name

        # Summarise the network without the CTC wrapper so that the
        # architecture itself is easier to read.
        tmp_model = Model(inputs=input_in, outputs=y_pred)
        tmp_model._name = model_name
        tmp_model.summary()
        if self._training is True:
            with open(self._doc_path + "summary.txt", 'w') as f, redirect_stdout(f):
                tmp_model.summary()


class Seq2SeqModel(_BaseModel):
def __init__(self, input_shape, vocab_len, latent_dim=256, **kwargs):
super().__init__(input_shape, vocab_len, **kwargs)
self.latent_dim = latent_dim
209 changes: 207 additions & 2 deletions gurih/models/my_keras_layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
# KERAS CUSTOM LAYER COMPONENT #
################################

from tensorflow.keras import Model
from tensorflow.keras.layers import Conv1D, LSTM, Bidirectional, Dense
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Layer, Lambda, Concatenate, Activation, Dot, RepeatVector
from tensorflow.keras.layers import Conv1D, LSTM, Bidirectional, Dense, MaxPool1D, SimpleRNN
from tensorflow.keras.models import Model


class BaselineASR(Model):
Expand All @@ -29,3 +32,205 @@ def call(self, x):
x = self.bilstm(x)
x = self.dense(x)
return x


class BasicASREncoder(Model):
    """Encoder made of a single bidirectional LSTM that also returns its states.

    Parameters
    ----------
    vocab_len : int
        stored on the instance; not used by this class itself
    n_lstm : int
        number of units of the wrapped LSTM
    batch_size : int
        stored on the instance; not used by this class itself
    """

    def __init__(self, vocab_len, n_lstm, batch_size):
        super().__init__()
        self.vocab_len = vocab_len
        self.n_lstm = n_lstm
        self.batch_size = batch_size

        recurrent = LSTM(self.n_lstm,
                         return_sequences=True,
                         return_state=True,
                         recurrent_initializer='orthogonal')  # or 'glorot_uniform'
        self.bilstm = Bidirectional(recurrent)

    def call(self, X, hidden):
        """
        Parameters
        ----------
        X : shape=(m, Tx, n_mfcc)
            audio sequence input
        hidden :
            initial state handed to the bidirectional LSTM
        Returns
        -------
        output :
            per-timestep output of the bidirectional LSTM
            (presumably shape=(m, Tx, 2 * n_lstm) with the default
            concat merge mode -- TODO confirm)
        state : list
            the remaining tensors returned by the layer, i.e. its
            final states
        """
        encoded = self.bilstm(X, initial_state=hidden)
        output = encoded[0]
        state = list(encoded[1:])
        return output, state


class BahdanauAttention(tf.keras.Model):
    """Additive attention: score = V(tanh(W1(query) + W2(values))).

    Parameters
    ----------
    n_dense : int
        number of units of the two projection layers ``W1`` and ``W2``
    """

    def __init__(self, n_dense):
        super().__init__()
        self.n_dense = n_dense
        # All three layers are plain linear projections (no activation).
        self.W1 = Dense(n_dense)
        self.W2 = Dense(n_dense)
        self.V = Dense(1)

    def call(self, query, values):
        """Return the context vector and the attention weights.

        Parameters
        ----------
        query : shape=(batch_size, hidden_size)
            query hidden state
        values : shape=(batch_size, max_len, hidden_size)
            sequence that is attended over
        Returns
        -------
        context_vector : shape=(batch_size, hidden_size)
            attention-weighted sum of ``values`` over the time axis
        attention_weights : shape=(batch_size, max_len, 1)
            softmax of the scores over the time axis
        """
        # Insert a time axis so the projected query broadcasts against
        # every time step of the projected values.
        expanded_query = tf.expand_dims(query, 1)

        projected_query = self.W1(expanded_query)
        projected_values = self.W2(values)

        # (batch_size, max_len, n_dense) -> (batch_size, max_len, 1)
        score = self.V(tf.nn.tanh(projected_query + projected_values))

        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights


class PStacker(Layer):
    """Halve the time axis by stacking neighbouring frames on the feature axis.

    Frames at even positions and frames at odd positions are taken
    separately and concatenated along the last axis.
    """

    def __init__(self):
        super().__init__()
        self.even = Lambda(lambda x: x[:, ::2, :])
        self.odd = Lambda(lambda x: x[:, 1::2, :])
        self.concat = Concatenate(axis=-1)

    def call(self, inputs):
        """Return the even/odd frames of ``inputs`` joined on the last axis."""
        # NOTE(review): with an odd number of time steps the two halves
        # differ in length by one, which the concatenation presumably
        # rejects -- confirm callers always pass an even-length sequence.
        halves = [self.even(inputs), self.odd(inputs)]
        return self.concat(halves)


class MLPAttention(Layer):
    """Four-layer MLP that maps each input vector to a single score.

    The layer widths are ``n_dense``, ``n_dense // 2``, ``n_dense // 4``
    and 1. Only the last layer has an activation (ReLU), so the score
    is non-negative.
    """

    def __init__(self, n_dense):
        super().__init__()
        self.densor_1 = Dense(n_dense)
        self.densor_2 = Dense(n_dense // 2)
        self.densor_3 = Dense(n_dense // 4)
        self.densor_4 = Dense(1, activation='relu')

    def call(self, X):
        """Feed ``X`` through the four dense layers in order."""
        hidden = X
        for layer in (self.densor_1, self.densor_2, self.densor_3, self.densor_4):
            hidden = layer(hidden)
        return hidden


class MLPOutput(Layer):
    """Two ReLU dense layers of width ``n_dense`` and ``n_dense // 2``."""

    def __init__(self, n_dense):
        super().__init__()
        self.densor_1 = Dense(n_dense, activation='relu')
        self.densor_2 = Dense(n_dense // 2, activation='relu')

    def call(self, X):
        """Feed ``X`` through both dense layers in order."""
        hidden = self.densor_1(X)
        return self.densor_2(hidden)


class LuongAttention(Model):
    """Attention block scoring each encoder step with an MLP.

    For every encoder time step the encoder output is concatenated with
    the (repeated) previous decoder states, scored by ``MLPAttention``
    and normalised with a softmax over the time axis.

    Parameters
    ----------
    n_dense : int
        width of the first layer of the scoring MLP
    """

    def __init__(self, n_dense):
        super(LuongAttention, self).__init__()
        self.concatenator = Concatenate(axis=-1)
        self.densor = MLPAttention(n_dense)
        # The energies have shape (m, Tx, 1). The softmax must therefore run
        # over the time axis (axis=1); a softmax over the default last axis
        # (size 1) would turn every attention weight into exactly 1.0.
        self.activator = tf.keras.layers.Softmax(axis=1, name='attention_weights')
        self.dotor = Dot(axes=1)

    def call(self, inputs):
        """Return the context vector and the attention weights.

        Parameters
        ----------
        inputs : sequence of tensors
            the encoder outputs, shape=(m, Tx, ...), followed by the
            previous decoder state tensors
        Returns
        -------
        context_vector :
            dot product over the time axis of the attention weights and
            the encoder outputs
        alphas : shape=(m, Tx, 1)
            attention weights, summing to 1 over the time axis
        """
        encoder_outputs, *decoder_prev_states = inputs
        # Tx has to be statically known because it sizes the RepeatVector.
        Tx = K.int_shape(encoder_outputs)[1]

        # Merge all decoder states into one vector and copy it to every
        # encoder time step.
        decoder_prev_states = self.concatenator(decoder_prev_states)
        decoder_prev_states = RepeatVector(Tx)(decoder_prev_states)
        concat = self.concatenator([encoder_outputs, decoder_prev_states])

        e = self.densor(concat)
        alphas = self.activator(e)
        context_vector = self.dotor([alphas, encoder_outputs])

        return context_vector, alphas


class EncoderLSTM(Model):
    """Three stacked bidirectional LSTMs with frame stacking in between.

    The LSTM widths are ``n_lstm // 4``, ``n_lstm // 2`` and ``n_lstm``;
    a ``PStacker`` is applied after each of the first two. Only the last
    LSTM returns its states.
    """

    def __init__(self, n_lstm):
        super().__init__()
        self.pstack = PStacker()
        self.encoder_1 = Bidirectional(LSTM(n_lstm // 4, return_sequences=True))
        self.encoder_2 = Bidirectional(LSTM(n_lstm // 2, return_sequences=True))
        self.encoder_3 = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))

    def call(self, inputs):
        """Return the final encoder outputs and the list of its states."""
        first_level = self.encoder_1(inputs)
        first_level = self.pstack(first_level)

        second_level = self.encoder_2(first_level)
        second_level = self.pstack(second_level)

        encoder_outputs, *encoder_states = self.encoder_3(second_level)
        return encoder_outputs, encoder_states


class DecoderLSTM(Model):
    """Two stacked LSTMs followed by an MLP and a softmax over the vocabulary.

    Parameters
    ----------
    n_lstm : int
        number of units of both LSTM layers
    n_dense : int
        width passed to ``MLPOutput``
    vocab_len : int
        number of units of the softmax output layer
    """

    def __init__(self, n_lstm, n_dense, vocab_len):
        super().__init__()
        self.lstm_1 = LSTM(n_lstm, return_sequences=True, return_state=True)
        self.lstm_2 = LSTM(n_lstm, return_sequences=True, return_state=True)
        self.mlp = MLPOutput(n_dense)
        self.dense = Dense(vocab_len, activation='softmax')

    def call(self, inputs):
        """Run one decoding pass.

        Parameters
        ----------
        inputs : sequence of tensors
            the context vector followed by the initial states; the first
            two states seed ``lstm_1``, the next two seed ``lstm_2``
        Returns
        -------
        outputs :
            softmax output of the final dense layer
        states : list
            states returned by ``lstm_1`` followed by those of ``lstm_2``
        """
        context_vector, *initial_states = inputs
        first_init = initial_states[0:2]
        second_init = initial_states[2:4]

        lstm_1_output, *lstm_1_states = self.lstm_1(context_vector, initial_state=first_init)
        lstm_2_output, *lstm_2_states = self.lstm_2(lstm_1_output, initial_state=second_init)

        outputs = self.dense(self.mlp(lstm_2_output))

        return outputs, lstm_1_states + lstm_2_states


# Below are implementations based on the SpecAugment paper
class EncoderLSTMSpecAugment(Model):
    """Encoder: two Conv1D + max-pool stages followed by four bidirectional LSTMs.

    Parameters
    ----------
    n_lstm : int
        number of units of every LSTM layer
    n_filters : int, default 256
        number of filters of both convolutions
    kernel_size : int, default 11
        kernel size of both convolutions
    """

    def __init__(self, n_lstm, n_filters=256, kernel_size=11):
        super().__init__()
        self.n_lstm = n_lstm
        self.n_filters = n_filters
        self.kernel_size = kernel_size
        self.cnn_1 = Conv1D(n_filters, kernel_size, strides=1, padding='same')
        self.max_pool_1 = MaxPool1D()
        self.cnn_2 = Conv1D(n_filters, kernel_size, strides=1, padding='same')
        self.max_pool_2 = MaxPool1D()
        self.lstm_1 = Bidirectional(LSTM(n_lstm, return_sequences=True))
        self.lstm_2 = Bidirectional(LSTM(n_lstm, return_sequences=True))
        self.lstm_3 = Bidirectional(LSTM(n_lstm, return_sequences=True))
        # Only the last LSTM also returns its states.
        self.lstm_4 = Bidirectional(LSTM(n_lstm, return_sequences=True, return_state=True))

    def call(self, inputs):
        """Return the outputs of the last LSTM and the list of its states."""
        front_end = (self.cnn_1, self.max_pool_1,
                     self.cnn_2, self.max_pool_2,
                     self.lstm_1, self.lstm_2, self.lstm_3)

        features = inputs
        for layer in front_end:
            features = layer(features)

        encoder_outputs, *encoder_states = self.lstm_4(features)
        return encoder_outputs, encoder_states


class DecoderLSTMSpecAugment(Model):
    """Decoder: two stacked SimpleRNN layers and a softmax over the vocabulary.

    Parameters
    ----------
    n_rnn : int
        number of units of both SimpleRNN layers
    vocab_len : int
        number of units of the softmax output layer
    """

    def __init__(self, n_rnn, vocab_len):
        super().__init__()
        self.rnn_1 = SimpleRNN(n_rnn, return_sequences=True, return_state=True)
        self.rnn_2 = SimpleRNN(n_rnn, return_sequences=True, return_state=True)
        self.dense = Dense(vocab_len, activation='softmax')

    def call(self, inputs):
        """Run one decoding pass.

        Parameters
        ----------
        inputs : sequence of tensors
            the context vector followed by the initial states; the first
            state seeds ``rnn_1`` and the second seeds ``rnn_2``
        Returns
        -------
        outputs :
            softmax output of the dense layer
        states : list
            final state of ``rnn_1`` followed by that of ``rnn_2``
        """
        context_vector, *initial_states = inputs

        # Each RNN takes a single state tensor.
        # NOTE(review): the original comment says these are the LSTM cell
        # states of the encoder -- confirm against the caller.
        first_pass = self.rnn_1(context_vector, initial_state=initial_states[0])
        rnn_1_output, rnn_1_states = first_pass

        second_pass = self.rnn_2(rnn_1_output, initial_state=initial_states[1])
        rnn_2_output, rnn_2_states = second_pass

        outputs = self.dense(rnn_2_output)
        return outputs, [rnn_1_states, rnn_2_states]
21 changes: 21 additions & 0 deletions gurih/models/my_keras_losses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import tensorflow as tf


class CTCLoss(tf.keras.losses.Loss):
    """Keras loss wrapper around ``tf.nn.ctc_loss``.

    Parameters
    ----------
    logits_time_major : bool, default False
        if False, ``y_pred`` is laid out as (batch, time, classes);
        if True, as (time, batch, classes)
    reduction : default ``tf.keras.losses.Reduction.SUM``
        reduction passed to the Keras ``Loss`` base class
    name : str, default 'ctc'
        name passed to the Keras ``Loss`` base class
    """

    def __init__(self, logits_time_major=False,
                 reduction=tf.keras.losses.Reduction.SUM, name='ctc'):
        super().__init__(reduction=reduction, name=name)
        self.logits_time_major = logits_time_major

    def call(self, y_true, y_pred):
        """Return the mean CTC loss of the batch as a scalar.

        Parameters
        ----------
        y_true : shape=(batch_size, max_label_length)
            dense label ids; cast to int32 before use
        y_pred :
            logits, laid out according to ``logits_time_major``
        """
        y_true = tf.cast(y_true, tf.int32)

        # The batch and time axes of the logits swap places when the
        # logits are time-major, so pick them according to the layout.
        logits_shape = tf.shape(y_pred)
        if self.logits_time_major:
            num_frames, batch_size = logits_shape[0], logits_shape[1]
        else:
            batch_size, num_frames = logits_shape[0], logits_shape[1]
        logit_length = tf.fill([batch_size], num_frames)

        # NOTE(review): every label row is treated as having the full
        # padded width, so padding positions count as labels -- confirm
        # this is intended for padded batches.
        labels_shape = tf.shape(y_true)
        label_length = tf.fill([labels_shape[0]], labels_shape[1])

        loss = tf.nn.ctc_loss(
            labels=y_true,
            logits=y_pred,
            label_length=label_length,
            logit_length=logit_length,
            logits_time_major=self.logits_time_major,
            blank_index=-1)
        return tf.reduce_mean(loss)
42 changes: 42 additions & 0 deletions gurih/models/my_keras_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import tensorflow as tf


def _nonzero_ids_as_sparse(scores):
    """Argmax ``scores`` over the last axis and keep the non-zero ids as a SparseTensor."""
    ids = tf.cast(tf.argmax(scores, axis=-1), tf.int32)
    nonzero_idx = tf.where(tf.not_equal(ids, 0))
    return tf.SparseTensor(nonzero_idx,
                           tf.gather_nd(ids, nonzero_idx),
                           tf.shape(ids, out_type=tf.int64))


def CER(y_true, y_pred):
    """
    Character error rate, ignoring padding of 0s.
    Score returned is the total Levenshtein edit distance of the batch
    divided by the total number of non-padding reference characters in
    the batch. Returns 0 when the reference contains no characters.
    Parameters
    ----------
    y_pred : Tensor[shape=(batch_size, prediction_seq_length, num_classes)]
        softmax output from model
    y_true : Tensor[shape=(batch_size, labels_seq_length, num_classes)]
        one-hot vector of ground truth y_true
    Returns
    -------
    cer : float,
        character error rate
    """
    hypothesis = _nonzero_ids_as_sparse(y_pred)
    truth = _nonzero_ids_as_sparse(y_true)

    distance = tf.reduce_sum(
        tf.edit_distance(hypothesis, truth, normalize=False)
    )

    # The distance is summed over the whole batch, so the denominator has
    # to be the reference length of the whole batch as well. Using the
    # padded width of a single sequence would make the score grow with
    # the batch size and would count padding as reference characters.
    reference_length = tf.cast(tf.shape(truth.values)[0], tf.float32)

    # divide_no_nan yields 0 instead of NaN/inf for an empty reference.
    cer = tf.math.divide_no_nan(distance, reference_length)
    return cer
Loading

0 comments on commit 04b3d44

Please sign in to comment.