# rollout.py

import tensorflow as tf
import numpy as np


class ROLLOUT(object):
    """Rollout policy that completes partial sequences with a generator's LSTM.

    The constructor builds two TensorFlow graphs from the parameters of
    ``lstm``:

    * ``state_list``: the recurrent state after every prefix of the fed
      sequence ``x``.
    * ``rollout_results``: for each of ``rollout_num`` repetitions and each
      prefix length, the prefix of ``x`` followed by tokens sampled from the
      LSTM, plus the unmodified ``x`` as the final entry.

    ``get_reward`` runs ``rollout_results`` and scores the samples with a
    ranker.
    """

    # Names of the LSTM cell parameters read from ``self.lstm``. The order
    # matches the order in which the corresponding ops are created.
    _RECURRENT_PARAM_NAMES = (
        "Wi", "Ui", "bi",      # input gate
        "Wf", "Uf", "bf",      # forget gate
        "Wog", "Uog", "bog",   # output gate
        "Wc", "Uc", "bc",      # candidate memory cell
    )
    # Names of the output-projection parameters read from ``self.lstm``.
    _OUTPUT_PARAM_NAMES = ("Wo", "bo")

    def __init__(self, lstm, update_rate, rollout_num):
        """Copy hyper-parameters from ``lstm`` and build the rollout graphs.

        Args:
            lstm: generator object exposing ``num_emb``, ``batch_size``,
                ``emb_dim``, ``hidden_dim``, ``seq_len``, ``start_token``,
                ``learning_rate``, ``g_embeddings`` and the weights named in
                ``_RECURRENT_PARAM_NAMES`` / ``_OUTPUT_PARAM_NAMES``.
            update_rate: weight given to the current rollout parameters when
                ``update_params`` blends them with the generator's.
            rollout_num: number of independent rollouts built into
                ``rollout_results``.
        """
        self.lstm = lstm
        self.update_rate = update_rate

        self.num_emb = self.lstm.num_emb
        self.batch_size = self.lstm.batch_size
        self.emb_dim = self.lstm.emb_dim
        self.hidden_dim = self.lstm.hidden_dim
        self.sequence_length = self.lstm.seq_len
        self.start_token = tf.identity(self.lstm.start_token)
        self.learning_rate = self.lstm.learning_rate

        self.rollout_initialization()
        self.get_state_list_graph()
        self.rollout_graph(rollout_num)

    def rollout_initialization(self):
        """Create parameter copies, input placeholders and the zero state."""
        self.g_embeddings = tf.identity(self.lstm.g_embeddings)
        # maps (x_t, state_tm1) to state_t for the generator
        self.g_recurrent_unit = self.create_recurrent_unit()
        # maps state_t to o_t (output token logits)
        self.g_output_unit = self.create_output_unit()

        # Sequence of tokens generated by the generator: batch x seq_length.
        self.x = tf.placeholder(
            tf.int32, shape=[self.batch_size, self.sequence_length])
        # Not referenced anywhere else in this class; kept so that external
        # code accessing the attribute keeps working.
        self.given_num = tf.placeholder(tf.int32)

        # Embedded input, time-major: seq_length x batch_size x emb_dim.
        with tf.device("/cpu:0"):
            self.processed_x = tf.transpose(
                tf.nn.embedding_lookup(self.g_embeddings, self.x),
                perm=[1, 0, 2])

        # Zero state; the recurrent unit expects [hidden_state, memory_cell]
        # stacked along the first axis.
        self.h0 = tf.zeros([self.batch_size, self.hidden_dim])
        self.h0 = tf.stack([self.h0, self.h0])

    def get_state_list_graph(self):
        """Build ``self.state_list``, the recurrent state after each prefix.

        Entry 0 is the state after consuming the start token; entry ``i``
        (``i > 0``) is the state after additionally consuming
        ``x[:, 0] .. x[:, i-1]``.
        """
        states = []
        with tf.variable_scope("STATE_LIST"):
            # Setting reuse is idempotent, so once per scope is enough.
            tf.get_variable_scope().reuse_variables()
            for step in range(self.sequence_length):
                if step == 0:
                    # the first step: feed the start token into the zero state
                    start_emb = tf.nn.embedding_lookup(
                        self.g_embeddings, self.start_token)
                    h_t = self.g_recurrent_unit(start_emb, self.h0)
                else:
                    h_t = self.g_recurrent_unit(
                        self.processed_x[step - 1, :, :], h_t)
                states.append(h_t)

        self.state_list = tf.stack(states)

    def rollout_graph(self, rollout_num):
        """Build ``self.rollout_results``.

        For every rollout and every ``rollout_step`` in
        ``1 .. sequence_length - 1`` the first ``rollout_step`` tokens are
        copied from ``self.x`` and the remaining ones are sampled from the
        LSTM. The unmodified ``self.x`` is appended as the last entry.

        Args:
            rollout_num: number of independent rollouts to build.
        """
        all_rollouts = []
        with tf.variable_scope("LSTM"):
            tf.get_variable_scope().reuse_variables()
            for _ in range(rollout_num):
                per_step = []
                for rollout_step in range(1, self.sequence_length):
                    # before the defined step, just copy the generated tokens
                    tokens = [self.x[:, pos] for pos in range(rollout_step)]

                    # State after the first rollout_step - 1 tokens, and the
                    # embedding of the last given token.
                    h_t = self.state_list[rollout_step - 1]
                    x_tp1 = self.processed_x[rollout_step - 1]

                    # after the defined step, sample the next tokens
                    for _ in range(self.sequence_length - rollout_step):
                        h_t = self.g_recurrent_unit(x_tp1, h_t)
                        o_t = self.g_output_unit(h_t)
                        log_prob = tf.log(tf.nn.softmax(o_t))
                        next_token = tf.cast(
                            tf.reshape(tf.multinomial(log_prob, 1),
                                       [self.batch_size]),
                            tf.int32)
                        x_tp1 = tf.nn.embedding_lookup(
                            self.g_embeddings, next_token)
                        tokens.append(next_token)

                    # batch x seq_length
                    per_step.append(tf.transpose(tf.stack(tokens), (1, 0)))

                # append the complete sentence without rollout
                per_step.append(self.x)
                # rollout_step x batch x seq_length
                all_rollouts.append(tf.stack(per_step))

            # rollout_num x rollout_step x batch x seq_length
            self.rollout_results = tf.stack(all_rollouts)

    def get_reward(self, sess, input_x, rollout_num, ranker, rank_data_loader):
        """Score all rollouts of ``input_x`` with the ranker in one run.

        Every rollout sample is fed to the ranker in a single
        ``sess.run`` call, so this variant needs a GPU with a large memory.
        A commented-out variant that scores the rollouts in smaller pieces
        is kept below.

        Args:
            sess: session used to run the rollout and ranker graphs.
            input_x: token ids fed to ``self.x``.
            rollout_num: unused here; kept for interface compatibility.
            ranker: object exposing ``input_x``, ``dropout_keep_prob``,
                ``input_ref`` and ``all_rank_score``.
            rank_data_loader: object whose ``get_ref()`` supplies the
                reference fed to ``ranker.input_ref``.

        Returns:
            The transpose of ``ranker.all_rank_score`` averaged over its
            first axis (presumably batch_size x seq_length - TODO confirm
            against the ranker's output shape).
        """
        rollout_results = sess.run(self.rollout_results, {self.x: input_x})
        ref = rank_data_loader.get_ref()

        # Flatten every rollout into one batch of sequences for the ranker.
        samples = np.reshape(rollout_results, [-1, self.sequence_length])
        ranker_feed = {
            ranker.input_x: samples,
            ranker.dropout_keep_prob: 1.0,
            ranker.input_ref: ref,
        }
        scores = sess.run(ranker.all_rank_score, ranker_feed)
        return np.transpose(np.mean(scores, axis=0))

    # Alternative get_reward that scores the rollouts in smaller pieces
    # (one ranker run per rollout and prefix length):
    #
    # def get_reward(self, sess, input_x, rollout_num, ranker, rank_data_loader):
    #
    #     feed = {self.x: input_x}
    #     rollout_results = sess.run(self.rollout_results, feed)
    #     ref = rank_data_loader.get_ref()
    #
    #     rewards = []
    #     for i in range(rollout_num):
    #         for given_num in range(1, 20):
    #             samples = rollout_results[i][given_num-1]
    #             feed = {ranker.input_x: samples, ranker.dropout_keep_prob: 1.0, ranker.input_ref: ref}
    #             scores = sess.run(ranker.rank_score, feed)
    #             ypred = np.array([item for item in scores])
    #             if i == 0:
    #                 rewards.append(ypred)
    #             else:
    #                 rewards[given_num - 1] += ypred
    #
    #         # the last token reward
    #         feed = {ranker.input_x: input_x, ranker.dropout_keep_prob: 1.0, ranker.input_ref: ref}
    #         scores = sess.run(ranker.rank_score, feed)
    #         ypred = np.array([item for item in scores])
    #         if i == 0:
    #             rewards.append(ypred)
    #         else:
    #             rewards[20 - 1] += ypred
    #
    #     rewards = np.transpose(np.array(rewards)) / (1.0 * rollout_num)  # batch_size x seq_length
    #     return rewards

    # ------------------------------------------------------------------
    # basic models for LSTMs
    # ------------------------------------------------------------------
    def _load_params(self, names):
        """Set ``self.<name> = tf.identity(self.lstm.<name>)`` for each name."""
        for name in names:
            setattr(self, name, tf.identity(getattr(self.lstm, name)))

    def _blend_params(self, names):
        """Replace each ``self.<name>`` by a blend with ``self.lstm.<name>``.

        The new tensor is
        ``update_rate * self.<name> + (1 - update_rate) * lstm.<name>``.
        """
        for name in names:
            blended = (
                self.update_rate * getattr(self, name)
                + (1 - self.update_rate) * tf.identity(getattr(self.lstm, name))
            )
            setattr(self, name, blended)

    def _make_recurrent_unit(self):
        """Return the LSTM step function shared by create/update variants.

        The returned function reads the weights from ``self`` when it is
        called, i.e. at the moment the graph ops are created.
        """
        def unit(x, hidden_memory_tm1):
            previous_hidden_state, c_prev = tf.unstack(hidden_memory_tm1)

            def gate(activation, w_in, w_rec, bias):
                return activation(
                    tf.matmul(x, w_in) +
                    tf.matmul(previous_hidden_state, w_rec) + bias
                )

            # Input Gate
            i = gate(tf.sigmoid, self.Wi, self.Ui, self.bi)
            # Forget Gate
            f = gate(tf.sigmoid, self.Wf, self.Uf, self.bf)
            # Output Gate
            o = gate(tf.sigmoid, self.Wog, self.Uog, self.bog)
            # New Memory Cell
            c_ = gate(tf.nn.tanh, self.Wc, self.Uc, self.bc)

            # Final Memory cell
            c = f * c_prev + i * c_

            # Current Hidden state
            current_hidden_state = o * tf.nn.tanh(c)

            return tf.stack([current_hidden_state, c])

        return unit

    def _make_output_unit(self):
        """Return the state -> logits function shared by create/update."""
        def unit(hidden_memory_tuple):
            hidden_state, c_prev = tf.unstack(hidden_memory_tuple)
            # hidden_state : batch x hidden_dim
            logits = tf.matmul(hidden_state, self.Wo) + self.bo
            return logits

        return unit

    def create_recurrent_unit(self):
        """Copy the LSTM cell weights from ``self.lstm``; return the step fn.

        Returns:
            ``unit(x, hidden_memory_tm1)`` mapping an input and the stacked
            ``[hidden_state, memory_cell]`` to the next stacked state.
        """
        # Weights and Bias for input and hidden tensor
        self._load_params(self._RECURRENT_PARAM_NAMES)
        return self._make_recurrent_unit()

    def update_recurrent_unit(self):
        """Blend the LSTM cell weights with ``self.lstm``'s; return a step fn.

        Returns:
            A step function with the same signature as the one returned by
            ``create_recurrent_unit``.
        """
        # Weights and Bias for input and hidden tensor
        self._blend_params(self._RECURRENT_PARAM_NAMES)
        return self._make_recurrent_unit()

    def create_output_unit(self):
        """Copy the output projection from ``self.lstm``; return logits fn.

        Returns:
            ``unit(hidden_memory_tuple)`` mapping a stacked state to the
            output token logits.
        """
        self._load_params(self._OUTPUT_PARAM_NAMES)
        return self._make_output_unit()

    def update_output_unit(self):
        """Blend the output projection with ``self.lstm``'s; return logits fn.

        Returns:
            A function with the same signature as the one returned by
            ``create_output_unit``.
        """
        self._blend_params(self._OUTPUT_PARAM_NAMES)
        return self._make_output_unit()

    def update_params(self):
        """Refresh the embedding, recurrent unit and output unit attributes.

        NOTE(review): ``state_list`` and ``rollout_results`` are built once
        in ``__init__`` and are not rebuilt here, so they keep referencing
        the tensors that existed at construction time. Each call also adds
        new ops to the graph. On the first call both terms of every blend
        derive from the same ``self.lstm`` tensor. Confirm whether that is
        the intended behavior.
        """
        self.g_embeddings = tf.identity(self.lstm.g_embeddings)
        self.g_recurrent_unit = self.update_recurrent_unit()
        self.g_output_unit = self.update_output_unit()