From 6aa4e2dcf00fb48bb0c183d0f703b1e49d6c6619 Mon Sep 17 00:00:00 2001 From: husein zolkepli Date: Wed, 16 Jan 2019 20:14:29 +0800 Subject: [PATCH] added session for summarization agents --- session/summary/residual-freeze.ipynb | 746 ++++++++++++++++++++++ session/summary/skip-news.py | 256 ++++++++ session/summary/skip-thought-freeze.ipynb | 676 ++++++++++++++++++++ session/summary/skip-wiki.py | 319 +++++++++ session/word2vec/wiki-256.py | 103 +++ session/word2vec/word2vec.py | 173 +++-- 6 files changed, 2204 insertions(+), 69 deletions(-) create mode 100644 session/summary/residual-freeze.ipynb create mode 100644 session/summary/skip-news.py create mode 100644 session/summary/skip-thought-freeze.ipynb create mode 100644 session/summary/skip-wiki.py create mode 100644 session/word2vec/wiki-256.py diff --git a/session/summary/residual-freeze.ipynb b/session/summary/residual-freeze.ipynb new file mode 100644 index 00000000..c103cefe --- /dev/null +++ b/session/summary/residual-freeze.ipynb @@ -0,0 +1,746 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "class Attention:\n", + " def __init__(self,hidden_size):\n", + " self.hidden_size = hidden_size\n", + " self.dense_layer = tf.layers.Dense(hidden_size)\n", + " self.v = tf.random_normal([hidden_size],mean=0,stddev=1/np.sqrt(hidden_size))\n", + " \n", + " def score(self, hidden_tensor, encoder_outputs):\n", + " energy = tf.nn.tanh(self.dense_layer(tf.concat([hidden_tensor,encoder_outputs],2)))\n", + " energy = tf.transpose(energy,[0,2,1])\n", + " batch_size = tf.shape(encoder_outputs)[0]\n", + " v = tf.expand_dims(tf.tile(tf.expand_dims(self.v,0),[batch_size,1]),1)\n", + " energy = tf.matmul(v,energy)\n", + " return tf.squeeze(energy,1)\n", + " \n", + " def __call__(self, hidden, encoder_outputs):\n", + " seq_len = tf.shape(encoder_outputs)[1]\n", + " batch_size = tf.shape(encoder_outputs)[0]\n", + " H = tf.tile(tf.expand_dims(hidden, 1),[1,seq_len,1])\n", + " attn_energies = self.score(H,encoder_outputs)\n", + " return tf.expand_dims(tf.nn.softmax(attn_energies),1)\n", + "\n", + "class Model:\n", + " def __init__(\n", + " self,\n", + " dict_size,\n", + " size_layers,\n", + " learning_rate,\n", + " maxlen,\n", + " num_blocks = 3,\n", + " ):\n", + " block_size = size_layers\n", + " self.BEFORE = tf.placeholder(tf.int32,[None,maxlen])\n", + " self.INPUT = tf.placeholder(tf.int32,[None,maxlen])\n", + " self.AFTER = tf.placeholder(tf.int32,[None,maxlen])\n", + " self.batch_size = tf.shape(self.INPUT)[0]\n", + " self.output_layer = tf.layers.Dense(dict_size, name=\"output_layer\")\n", + " self.output_layer.build(size_layers)\n", + " self.embeddings = tf.Variable(tf.random_uniform([dict_size, size_layers], -1, 1))\n", + " embedded = tf.nn.embedding_lookup(self.embeddings, self.INPUT)\n", + " self.attention = Attention(size_layers)\n", + "\n", + " def residual_block(x, size, rate, block, reuse = False):\n", + " with tf.variable_scope(\n", + " 'block_%d_%d' % (block, rate), reuse = reuse\n", + " ):\n", + " attn_weights = self.attention(tf.reduce_sum(x,axis=1), x)\n", + " conv_filter = tf.layers.conv1d(\n", + " attn_weights,\n", + " x.shape[2] // 4,\n", + " kernel_size = size,\n", + " strides = 1,\n", + " padding = 'same',\n", + " dilation_rate = rate,\n", + " activation = tf.nn.tanh,\n", + " )\n", + " conv_gate = tf.layers.conv1d(\n", + " x,\n", + " x.shape[2] // 4,\n", + " kernel_size = size,\n", + " strides = 1,\n", + " padding = 'same',\n", + " dilation_rate = rate,\n", + " activation = tf.nn.sigmoid,\n", + " )\n", + " out = tf.multiply(conv_filter, conv_gate)\n", + " out = tf.layers.conv1d(\n", + " out,\n", + " block_size,\n", + " kernel_size = 1,\n", + " strides = 1,\n", + " padding = 'same',\n", + " activation = tf.nn.tanh,\n", + " )\n", + " return tf.add(x, out), out\n", + "\n", + " forward = tf.layers.conv1d(\n", + " embedded, block_size, kernel_size = 1, strides = 1, padding = 'SAME'\n", + " )\n", + " zeros = tf.zeros_like(forward)\n", + " for i in range(num_blocks):\n", + " for r in [1, 2, 4, 8, 16]:\n", + " forward, s = residual_block(\n", + " forward, size = 7, rate = r, block = i\n", + " )\n", + " zeros = tf.add(zeros, s)\n", + " forward = tf.layers.conv1d(\n", + " zeros,\n", + " block_size,\n", + " kernel_size = 1,\n", + " strides = 1,\n", + " padding = 'SAME',\n", + " activation = tf.nn.tanh,\n", + " )\n", + " self.get_thought = tf.reduce_sum(forward,axis=1, name = 'logits')\n", + " \n", + " def decoder(labels, reuse):\n", + " decoder_in = tf.nn.embedding_lookup(self.embeddings, labels)\n", + " forward = tf.layers.conv1d(\n", + " decoder_in, block_size, kernel_size = 1, strides = 1, padding = 'SAME'\n", + " )\n", + " zeros = tf.zeros_like(forward)\n", + " for r in [8, 16, 24]:\n", + " forward, s = residual_block(forward, size = 7, rate = r, block = 10, reuse = reuse)\n", + " zeros = tf.add(zeros, s)\n", + " return tf.layers.conv1d(\n", + " zeros,\n", + " block_size,\n", + " kernel_size = 1,\n", + " strides = 1,\n", + " padding = 'SAME',\n", + " activation = tf.nn.tanh,\n", + " )\n", + " \n", + " fw_logits = decoder(self.AFTER, False)\n", + " bw_logits = decoder(self.BEFORE, True)\n", + " self.attention = tf.matmul(\n", + " self.get_thought, tf.transpose(self.embeddings), name = 'attention'\n", + " )\n", + " self.loss = self.calculate_loss(fw_logits, self.AFTER) + self.calculate_loss(bw_logits, self.BEFORE)\n", + " self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)\n", + " \n", + " def calculate_loss(self, outputs, labels):\n", + " mask = tf.cast(tf.sign(labels), tf.float32)\n", + " logits = self.output_layer(outputs)\n", + " return tf.contrib.seq2seq.sequence_loss(logits, labels, mask)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200004" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "with open('skip-wiki-dict.json') as fopen:\n", + " dictionary = json.load(fopen)\n", + "len(dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def rename(checkpoint_dir, replace_from, replace_to, add_prefix, dry_run=False):\n", + " checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)\n", + " with tf.Session() as sess:\n", + " for var_name, _ in tf.contrib.framework.list_variables(checkpoint_dir):\n", + " var = tf.contrib.framework.load_variable(checkpoint_dir, var_name)\n", + " new_name = var_name\n", + " if None not in [replace_from, replace_to]:\n", + " new_name = new_name.replace(replace_from, replace_to)\n", + " if add_prefix:\n", + " new_name = add_prefix + new_name\n", + "\n", + " if dry_run:\n", + " print('%s would be renamed to %s.' % (var_name, new_name))\n", + " else:\n", + " print('Renaming %s to %s.' % (var_name, new_name))\n", + " # Rename the variable\n", + " var = tf.Variable(var, name=new_name)\n", + "\n", + " if not dry_run:\n", + " # Save the variables\n", + " saver = tf.train.Saver()\n", + " sess.run(tf.global_variables_initializer())\n", + " saver.save(sess, 'skip-rename/model.ckpt')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# rename('skip/model.ckpt','thought_scope_e1d42da4-5ae4-4898-b0f1-f52f687a4e28',\n", + "# 'thought_scope',None)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py:1711: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", + " warnings.warn('An interactive session is already active. This can '\n" + ] + } + ], + "source": [ + "tf.reset_default_graph()\n", + "sess = tf.InteractiveSession()\n", + "model = Model(len(dictionary), 64, 1e-3, 50)\n", + "sess.run(tf.global_variables_initializer())" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Restoring parameters from skip-wiki/model.ckpt\n" + ] + } + ], + "source": [ + "saver=tf.train.Saver(tf.global_variables())\n", + "saver.restore(sess, 'skip-wiki/model.ckpt')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "def sequence(s, w2v_model, maxlen, vocabulary_size):\n", + " words = s.split()\n", + " np_array = np.zeros((maxlen),dtype=np.int32)\n", + " current_no = 0\n", + " for no, word in enumerate(words[:maxlen - 2]):\n", + " id_to_append = 1\n", + " if word in w2v_model:\n", + " word_id = w2v_model[word]\n", + " if word_id < vocabulary_size:\n", + " id_to_append = word_id\n", + " np_array[no] = id_to_append\n", + " current_no = no\n", + " np_array[current_no + 1] = 3\n", + " return np_array\n", + "\n", + "def generate_batch(sentences,batch_size,w2v_model,maxlen,vocabulary_size):\n", + " window_size = batch_size + 2\n", + " first_index = 1000\n", + " batch_sentences = sentences[first_index:first_index+window_size]\n", + " print(batch_sentences)\n", + " batch_sequences = np.array([sequence(sentence,w2v_model,maxlen,vocabulary_size) for sentence in batch_sentences])\n", + " window_shape = []\n", + " for i in range(batch_size):\n", + " window_shape.append(batch_sequences[i:i+3])\n", + " window_shape = np.array(window_shape)\n", + " return window_shape[:,0], window_shape[:,1], window_shape[:,2]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('news-bm.json','r') as fopen:\n", + " sentences = json.loads(fopen.read())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['pahang diwakili pemangku raja pahang tengku abdullah sultan ahmad shah manakala kelantan diwakili pemangku raja kelantan dr', 'tengku muhammad faiz petra', 'pada hari kedua mesyuarat yang bermula kira pukul pagi itu raja-raja melayu diiringi menteri besar masing-masing manakala yang dipertua negeri pulau pinang sabah dan melaka diiringi ketua menteri masing-masing']\n" + ] + } + ], + "source": [ + "bw_input, current_input, fw_input = generate_batch(sentences,1,dictionary,50,len(dictionary))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "encoded = sess.run(model.get_thought,feed_dict={model.INPUT:fw_input})" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0.07066324, 0.13310698, -0.62426007, -0.4613824 , -0.17707539,\n", + " -0.3925364 , 1.1155262 , 1.1873002 , 0.48969495, 0.81452906,\n", + " -0.1577659 , -0.17734857, -0.37914753, -0.7942437 , 0.56107384,\n", + " 0.29675886, -0.7340232 , -0.07755096, 0.29897642, -0.0737358 ,\n", + " 0.6024291 , 0.95485014, -0.95064414, -0.63884234, 0.03552189,\n", + " -0.40762448, -0.25227717, -0.24423571, 0.37850273, -0.11428429,\n", + " -0.8386208 , -0.2072649 , -0.9640392 , -0.63121736, -0.5339436 ,\n", + " 0.96501446, -0.12163527, 0.31738836, 0.9421329 , -0.51436657,\n", + " 0.6444553 , -0.2436821 , -0.4731561 , -0.00128211, -0.05046922,\n", + " 0.5482205 , 0.85903156, 0.681826 , 0.02734087, 0.5048841 ,\n", + " 0.08036114, 0.00166782, 0.5863657 , 0.37902188, -0.14853519,\n", + " 0.11486635, 0.03344561, 1.1854374 , -0.07733421, -0.8486209 ,\n", + " 0.9942196 , 0.9136265 , -0.10116772, -0.21602613]],\n", + " dtype=float32)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "encoded" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "strings = ','.join(\n", + " [\n", + " n.name\n", + " for n in tf.get_default_graph().as_graph_def().node\n", + " if (\n", + " 'Variable' in n.op\n", + " or n.name.find('Placeholder') >= 0\n", + " or 'add_1' in n.name\n", + " or 'attention' in n.name\n", + " or 'logits' in n.name\n", + " )\n", + " and 'Adam' not in n.name\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Placeholder',\n", + " 'Placeholder_1',\n", + " 'Placeholder_2',\n", + " 'output_layer/kernel',\n", + " 'output_layer/bias',\n", + " 'Variable',\n", + " 'conv1d/kernel',\n", + " 'conv1d/bias',\n", + " 'block_0_1/dense/kernel',\n", + " 'block_0_1/dense/bias',\n", + " 'block_0_1/dense/Tensordot/add_1',\n", + " 'block_0_1/conv1d/kernel',\n", + " 'block_0_1/conv1d/bias',\n", + " 'block_0_1/conv1d_1/kernel',\n", + " 'block_0_1/conv1d_1/bias',\n", + " 'block_0_1/conv1d_2/kernel',\n", + " 'block_0_1/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_1/add_1',\n", + " 'block_0_2/conv1d/kernel',\n", + " 'block_0_2/conv1d/bias',\n", + " 'block_0_2/conv1d_1/kernel',\n", + " 'block_0_2/conv1d_1/bias',\n", + " 'block_0_2/conv1d_2/kernel',\n", + " 'block_0_2/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_2/add_1',\n", + " 'block_0_4/conv1d/kernel',\n", + " 'block_0_4/conv1d/bias',\n", + " 'block_0_4/conv1d_1/kernel',\n", + " 'block_0_4/conv1d_1/bias',\n", + " 'block_0_4/conv1d_2/kernel',\n", + " 'block_0_4/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_3/add_1',\n", + " 'block_0_8/conv1d/kernel',\n", + " 'block_0_8/conv1d/bias',\n", + " 'block_0_8/conv1d_1/kernel',\n", + " 'block_0_8/conv1d_1/bias',\n", + " 'block_0_8/conv1d_2/kernel',\n", + " 'block_0_8/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_4/add_1',\n", + " 'block_0_16/conv1d/kernel',\n", + " 'block_0_16/conv1d/bias',\n", + " 'block_0_16/conv1d_1/kernel',\n", + " 'block_0_16/conv1d_1/bias',\n", + " 'block_0_16/conv1d_2/kernel',\n", + " 'block_0_16/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_5/add_1',\n", + " 'block_1_1/conv1d/kernel',\n", + " 'block_1_1/conv1d/bias',\n", + " 'block_1_1/conv1d_1/kernel',\n", + " 'block_1_1/conv1d_1/bias',\n", + " 'block_1_1/conv1d_2/kernel',\n", + " 'block_1_1/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_6/add_1',\n", + " 'block_1_2/conv1d/kernel',\n", + " 'block_1_2/conv1d/bias',\n", + " 'block_1_2/conv1d_1/kernel',\n", + " 'block_1_2/conv1d_1/bias',\n", + " 'block_1_2/conv1d_2/kernel',\n", + " 'block_1_2/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_7/add_1',\n", + " 'block_1_4/conv1d/kernel',\n", + " 'block_1_4/conv1d/bias',\n", + " 'block_1_4/conv1d_1/kernel',\n", + " 'block_1_4/conv1d_1/bias',\n", + " 'block_1_4/conv1d_2/kernel',\n", + " 'block_1_4/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_8/add_1',\n", + " 'block_1_8/conv1d/kernel',\n", + " 'block_1_8/conv1d/bias',\n", + " 'block_1_8/conv1d_1/kernel',\n", + " 'block_1_8/conv1d_1/bias',\n", + " 'block_1_8/conv1d_2/kernel',\n", + " 'block_1_8/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_9/add_1',\n", + " 'block_1_16/conv1d/kernel',\n", + " 'block_1_16/conv1d/bias',\n", + " 'block_1_16/conv1d_1/kernel',\n", + " 'block_1_16/conv1d_1/bias',\n", + " 'block_1_16/conv1d_2/kernel',\n", + " 'block_1_16/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_10/add_1',\n", + " 'block_2_1/conv1d/kernel',\n", + " 'block_2_1/conv1d/bias',\n", + " 'block_2_1/conv1d_1/kernel',\n", + " 'block_2_1/conv1d_1/bias',\n", + " 'block_2_1/conv1d_2/kernel',\n", + " 'block_2_1/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_11/add_1',\n", + " 'block_2_2/conv1d/kernel',\n", + " 'block_2_2/conv1d/bias',\n", + " 'block_2_2/conv1d_1/kernel',\n", + " 'block_2_2/conv1d_1/bias',\n", + " 'block_2_2/conv1d_2/kernel',\n", + " 'block_2_2/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_12/add_1',\n", + " 'block_2_4/conv1d/kernel',\n", + " 'block_2_4/conv1d/bias',\n", + " 'block_2_4/conv1d_1/kernel',\n", + " 'block_2_4/conv1d_1/bias',\n", + " 'block_2_4/conv1d_2/kernel',\n", + " 'block_2_4/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_13/add_1',\n", + " 'block_2_8/conv1d/kernel',\n", + " 'block_2_8/conv1d/bias',\n", + " 'block_2_8/conv1d_1/kernel',\n", + " 'block_2_8/conv1d_1/bias',\n", + " 'block_2_8/conv1d_2/kernel',\n", + " 'block_2_8/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_14/add_1',\n", + " 'block_2_16/conv1d/kernel',\n", + " 'block_2_16/conv1d/bias',\n", + " 'block_2_16/conv1d_1/kernel',\n", + " 'block_2_16/conv1d_1/bias',\n", + " 'block_2_16/conv1d_2/kernel',\n", + " 'block_2_16/conv1d_2/bias',\n", + " 'conv1d_1/kernel',\n", + " 'conv1d_1/bias',\n", + " 'logits/reduction_indices',\n", + " 'logits',\n", + " 'conv1d_2/kernel',\n", + " 'conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_15/add_1',\n", + " 'block_10_8/conv1d/kernel',\n", + " 'block_10_8/conv1d/bias',\n", + " 'block_10_8/conv1d_1/kernel',\n", + " 'block_10_8/conv1d_1/bias',\n", + " 'block_10_8/conv1d_2/kernel',\n", + " 'block_10_8/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_16/add_1',\n", + " 'block_10_16/conv1d/kernel',\n", + " 'block_10_16/conv1d/bias',\n", + " 'block_10_16/conv1d_1/kernel',\n", + " 'block_10_16/conv1d_1/bias',\n", + " 'block_10_16/conv1d_2/kernel',\n", + " 'block_10_16/conv1d_2/bias',\n", + " 'block_0_1/dense/Tensordot_17/add_1',\n", + " 'block_10_24/conv1d/kernel',\n", + " 'block_10_24/conv1d/bias',\n", + " 'block_10_24/conv1d_1/kernel',\n", + " 'block_10_24/conv1d_1/bias',\n", + " 'block_10_24/conv1d_2/kernel',\n", + " 'block_10_24/conv1d_2/bias',\n", + " 'conv1d_3/kernel',\n", + " 'conv1d_3/bias',\n", + " 'conv1d_4/kernel',\n", + " 'conv1d_4/bias',\n", + " 'block_0_1/dense/Tensordot_18/add_1',\n", + " 'block_0_1/dense/Tensordot_19/add_1',\n", + " 'block_0_1/dense/Tensordot_20/add_1',\n", + " 'conv1d_5/kernel',\n", + " 'conv1d_5/bias',\n", + " 'attention',\n", + " 'output_layer/Tensordot/add_1',\n", + " 'output_layer/Tensordot_1/add_1',\n", + " 'beta1_power',\n", + " 'beta2_power']" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "strings.split(',')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "def freeze_graph(model_dir, output_node_names):\n", + "\n", + " if not tf.gfile.Exists(model_dir):\n", + " raise AssertionError(\n", + " \"Export directory doesn't exists. Please specify an export \"\n", + " \"directory: %s\" % model_dir)\n", + "\n", + " checkpoint = tf.train.get_checkpoint_state(model_dir)\n", + " input_checkpoint = checkpoint.model_checkpoint_path\n", + " \n", + " absolute_model_dir = \"/\".join(input_checkpoint.split('/')[:-1])\n", + " output_graph = absolute_model_dir + \"/frozen_model.pb\"\n", + " clear_devices = True\n", + " with tf.Session(graph=tf.Graph()) as sess:\n", + " saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)\n", + " saver.restore(sess, input_checkpoint)\n", + " output_graph_def = tf.graph_util.convert_variables_to_constants(\n", + " sess,\n", + " tf.get_default_graph().as_graph_def(),\n", + " output_node_names.split(\",\")\n", + " ) \n", + " with tf.gfile.GFile(output_graph, \"wb\") as f:\n", + " f.write(output_graph_def.SerializeToString())\n", + " print(\"%d ops in the final graph.\" % len(output_graph_def.node))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Restoring parameters from skip-wiki/model.ckpt\n", + "INFO:tensorflow:Froze 127 variables.\n", + "Converted 127 variables to const ops.\n", + "2031 ops in the final graph.\n" + ] + } + ], + "source": [ + "freeze_graph('skip-wiki', strings)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def load_graph(frozen_graph_filename):\n", + " with tf.gfile.GFile(frozen_graph_filename, \"rb\") as f:\n", + " graph_def = tf.GraphDef()\n", + " graph_def.ParseFromString(f.read())\n", + " with tf.Graph().as_default() as graph:\n", + " tf.import_graph_def(graph_def)\n", + " return graph" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "g=load_graph('skip-wiki/frozen_model.pb')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py:1711: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", + " warnings.warn('An interactive session is already active. This can '\n" + ] + } + ], + "source": [ + "x = g.get_tensor_by_name('import/Placeholder_1:0')\n", + "logits = g.get_tensor_by_name('import/logits:0')\n", + "attention = g.get_tensor_by_name('import/attention:0')\n", + "test_sess = tf.InteractiveSession(graph=g)\n", + "out, att = test_sess.run([logits,attention], feed_dict={x:fw_input})" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1, 200004)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "att.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "rev_dict = {v: k for k, v in dictionary.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "38799\n", + "jagaannya\n", + "4035\n", + "zulkifli\n", + "101993\n", + "ferdy\n", + "11445\n", + "hoe\n", + "165827\n", + "sharidake\n", + "325\n", + "televisyen\n", + "1681\n", + "kawan\n", + "124186\n", + "diimbau\n", + "34683\n", + "luteum\n", + "636\n", + "brunei\n" + ] + } + ], + "source": [ + "for i in att[0].argsort()[-10:][::-1]:\n", + " print(i)\n", + " print(rev_dict[i])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/session/summary/skip-news.py b/session/summary/skip-news.py new file mode 100644 index 00000000..8fb22ade --- /dev/null +++ b/session/summary/skip-news.py @@ -0,0 +1,256 @@ + +# coding: utf-8 + +# In[1]: + + +import sys +import warnings + +if not sys.warnoptions: + warnings.simplefilter('ignore') + +import tensorflow as tf +import numpy as np +from tqdm import tqdm +import re +import collections +import json +import os +from tensorflow.contrib import seq2seq + + +def sequence(s, w2v_model, maxlen = 50, vocabulary_size = 500000): + words = s.split() + np_array = np.zeros((maxlen), dtype = np.int32) + current_no = 0 + for no, word in enumerate(words[: maxlen - 2]): + id_to_append = 1 + if word in w2v_model: + word_id = w2v_model[word] + if word_id < vocabulary_size: + id_to_append = word_id + np_array[no] = id_to_append + current_no = no + np_array[current_no + 1] = 3 + return np_array + + +def batch_sequence(sentences, dictionary, maxlen = 50): + np_array = np.zeros((len(sentences), maxlen), dtype = np.int32) + for no_sentence, sentence in enumerate(sentences): + current_no = 0 + for no, word in enumerate(sentence.split()[: maxlen - 2]): + np_array[no_sentence, no] = dictionary.get(word, 1) + current_no = no + np_array[no_sentence, current_no + 1] = 3 + return np_array + + +class Model: + def __init__( + self, + vocabulary_size, + maxlen = 50, + output_size = 512, + learning_rate = 1e-3, + embedding_size = 256, + batch_size = 16, + max_grad_norm = 10, + **kwargs + ): + word_embeddings = tf.Variable( + tf.random_uniform( + [vocabulary_size, embedding_size], -np.sqrt(3), np.sqrt(3) + ) + ) + self.output_size = output_size + self.maxlen = maxlen + self.embeddings = word_embeddings + self.output_layer = tf.layers.Dense(vocabulary_size) + self.output_layer.build(output_size) + + self.BEFORE = tf.placeholder(tf.int32, [None, maxlen]) + self.INPUT = tf.placeholder(tf.int32, [None, maxlen]) + self.AFTER = tf.placeholder(tf.int32, [None, maxlen]) + self.batch_size = tf.shape(self.INPUT)[0] + + self.get_thought = self.thought(self.INPUT) + self.attention = tf.matmul( + self.get_thought, tf.transpose(self.embeddings), name = 'attention' + ) + self.fw_logits = self.decoder(self.get_thought, self.AFTER) + self.bw_logits = self.decoder(self.get_thought, self.BEFORE) + self.loss = self.calculate_loss( + self.fw_logits, self.AFTER + ) + self.calculate_loss(self.bw_logits, self.BEFORE) + tvars = tf.trainable_variables() + grads, _ = tf.clip_by_global_norm( + tf.gradients(self.loss, tvars), max_grad_norm + ) + self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) + + def get_embedding(self, inputs): + return tf.nn.embedding_lookup(self.embeddings, inputs) + + def thought(self, inputs): + encoder_in = self.get_embedding(inputs) + fw_cell = tf.nn.rnn_cell.GRUCell(self.output_size) + bw_cell = tf.nn.rnn_cell.GRUCell(self.output_size) + sequence_length = tf.reduce_sum(tf.sign(inputs), axis = 1) + with tf.variable_scope( + 'thought_scope', reuse = False + ): + rnn_output = tf.nn.bidirectional_dynamic_rnn( + fw_cell, + bw_cell, + encoder_in, + sequence_length = sequence_length, + dtype = tf.float32, + )[1] + return sum(rnn_output) + + def decoder(self, thought, labels): + main = tf.strided_slice(labels, [0, 0], [self.batch_size, -1], [1, 1]) + shifted_labels = tf.concat([tf.fill([self.batch_size, 1], 2), main], 1) + decoder_in = self.get_embedding(shifted_labels) + cell = tf.nn.rnn_cell.GRUCell(self.output_size) + max_seq_lengths = tf.fill([self.batch_size], self.maxlen) + helper = seq2seq.TrainingHelper( + decoder_in, max_seq_lengths, time_major = False + ) + decoder = seq2seq.BasicDecoder(cell, helper, thought) + decoder_out = seq2seq.dynamic_decode(decoder)[0].rnn_output + return decoder_out + + def calculate_loss(self, outputs, labels): + mask = tf.cast(tf.sign(labels), tf.float32) + logits = self.output_layer(outputs) + return seq2seq.sequence_loss(logits, labels, mask) + + +def counter_words(sentences): + word_counter = collections.Counter() + word_list = [] + num_lines, num_words = (0, 0) + for i in sentences: + words = re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', i) + word_counter.update(words) + word_list.extend(words) + num_lines += 1 + num_words += len(words) + return word_counter, word_list, num_lines, num_words + + +def build_dict(word_counter, vocab_size = 500000): + count = [['PAD', 0], ['UNK', 1], ['START', 2], ['END', 3]] + count.extend(word_counter.most_common(vocab_size)) + dictionary = dict() + for word, _ in count: + dictionary[word] = len(dictionary) + return dictionary, {word: idx for idx, word in dictionary.items()} + + +def train_model( + train_X, + train_Y_before, + train_Y_after, + epoch = 10, + batch_size = 16, + embedding_size = 128, + maxlen = 100, + **kwargs +): + word_counter, _, _, _ = counter_words(train_X) + dictionary, _ = build_dict(word_counter) + print(len(dictionary)) + _graph = tf.Graph() + with _graph.as_default(): + model = Model( + len(dictionary), + embedding_size = embedding_size, + output_size = embedding_size, + batch_size = batch_size, + maxlen = maxlen, + **kwargs + ) + sess = tf.InteractiveSession() + saver = tf.train.Saver(tf.global_variables()) + sess.run(tf.global_variables_initializer()) + saver.save(sess, 'skip/model.ckpt') + + for e in range(epoch): + pbar = tqdm(range(0, len(train_X), batch_size), desc = 'minibatch loop') + for i in pbar: + batch_x = batch_sequence( + train_X[i : min(i + batch_size, len(train_X))], + dictionary, + maxlen = maxlen, + ) + batch_y_before = batch_sequence( + train_Y_before[i : min(i + batch_size, len(train_X))], + dictionary, + maxlen = maxlen, + ) + batch_y_after = batch_sequence( + train_Y_after[i : min(i + batch_size, len(train_X))], + dictionary, + maxlen = maxlen, + ) + loss, _ = sess.run( + [model.loss, model.optimizer], + feed_dict = { + model.BEFORE: batch_y_before, + model.INPUT: batch_x, + model.AFTER: batch_y_after, + }, + ) + pbar.set_postfix(cost = loss) + saver.save(sess, 'skip/model.ckpt') + return sess, model, dictionary + + +# In[2]: + + +import json +with open('news-bm.json','r') as fopen: + corpus = json.loads(fopen.read()) + +print(len(corpus)) +corpus = [sentence for sentence in corpus if len(sentence) > 10] +print(len(corpus)) + + +# In[3]: + + +stride = 1 +t_range = int((len(corpus) - 3) / stride + 1) +left, middle, right = [], [], [] +for i in range(t_range): + slices = corpus[i * stride : i * stride + 3] + left.append(slices[0]) + middle.append(slices[1]) + right.append(slices[2]) + + +# In[5]: + + +len(left) == len(middle) == len(right) + + +# In[6]: + + +from sklearn.utils import shuffle +left, middle, right = shuffle(left, middle, right) + + +# In[ ]: + + +_,_,dictionary = train_model(middle,left,right) +with open('skip-news-dict.json', 'w') as fopen: + fopen.write(json.dumps(dictionary)) \ No newline at end of file diff --git a/session/summary/skip-thought-freeze.ipynb b/session/summary/skip-thought-freeze.ipynb new file mode 100644 index 00000000..debe7454 --- /dev/null +++ b/session/summary/skip-thought-freeze.ipynb @@ -0,0 +1,676 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "from tensorflow.contrib import seq2seq\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "class Model:\n", + " def __init__(\n", + " self,\n", + " vocabulary_size,\n", + " maxlen = 50,\n", + " output_size = 512,\n", + " learning_rate = 1e-3,\n", + " embedding_size = 256,\n", + " batch_size = 16,\n", + " max_grad_norm = 10,\n", + " **kwargs\n", + " ):\n", + " word_embeddings = tf.Variable(\n", + " tf.random_uniform(\n", + " [vocabulary_size, embedding_size], -np.sqrt(3), np.sqrt(3)\n", + " )\n", + " )\n", + " self.output_size = output_size\n", + " self.maxlen = maxlen\n", + " self.embeddings = word_embeddings\n", + " self.output_layer = tf.layers.Dense(vocabulary_size)\n", + " self.output_layer.build(output_size)\n", + "\n", + " self.BEFORE = tf.placeholder(tf.int32, [None, maxlen])\n", + " self.INPUT = tf.placeholder(tf.int32, [None, maxlen])\n", + " self.AFTER = tf.placeholder(tf.int32, [None, maxlen])\n", + " self.batch_size = tf.shape(self.INPUT)[0]\n", + "\n", + " self.get_thought = self.thought(self.INPUT)\n", + " self.attention = tf.matmul(\n", + " self.get_thought, tf.transpose(self.embeddings), name = 'attention'\n", + " )\n", + " self.fw_logits = self.decoder(self.get_thought, self.AFTER)\n", + " self.bw_logits = self.decoder(self.get_thought, self.BEFORE)\n", + " self.loss = self.calculate_loss(\n", + " self.fw_logits, self.AFTER\n", + " ) + self.calculate_loss(self.bw_logits, self.BEFORE)\n", + " tvars = tf.trainable_variables()\n", + " grads, _ = tf.clip_by_global_norm(\n", + " tf.gradients(self.loss, tvars), max_grad_norm\n", + " )\n", + " self.optimizer = tf.train.AdamOptimizer(learning_rate).apply_gradients(\n", + " zip(grads, tvars)\n", + " )\n", + "\n", + " def get_embedding(self, inputs):\n", + " return tf.nn.embedding_lookup(self.embeddings, inputs)\n", + "\n", + " def thought(self, inputs):\n", + " encoder_in = self.get_embedding(inputs)\n", + " fw_cell = tf.nn.rnn_cell.GRUCell(self.output_size)\n", + " bw_cell = tf.nn.rnn_cell.GRUCell(self.output_size)\n", + " sequence_length = tf.reduce_sum(tf.sign(inputs), axis = 1)\n", + " with tf.variable_scope(\n", + " 'thought_scope', reuse = False\n", + " ):\n", + " rnn_output = tf.nn.bidirectional_dynamic_rnn(\n", + " fw_cell,\n", + " bw_cell,\n", + " encoder_in,\n", + " sequence_length = sequence_length,\n", + " dtype = tf.float32,\n", + " )[1]\n", + " return sum(rnn_output)\n", + "\n", + " def decoder(self, thought, labels):\n", + " main = tf.strided_slice(labels, [0, 0], [self.batch_size, -1], [1, 1])\n", + " shifted_labels = tf.concat([tf.fill([self.batch_size, 1], 2), main], 1)\n", + " decoder_in = self.get_embedding(shifted_labels)\n", + " cell = tf.nn.rnn_cell.GRUCell(self.output_size)\n", + " max_seq_lengths = tf.fill([self.batch_size], self.maxlen)\n", + " helper = seq2seq.TrainingHelper(\n", + " decoder_in, max_seq_lengths, time_major = False\n", + " )\n", + " decoder = seq2seq.BasicDecoder(cell, helper, thought)\n", + " decoder_out = seq2seq.dynamic_decode(decoder)[0].rnn_output\n", + " return decoder_out\n", + "\n", + " def calculate_loss(self, outputs, labels):\n", + " mask = tf.cast(tf.sign(labels), tf.float32)\n", + " logits = self.output_layer(outputs)\n", + " return seq2seq.sequence_loss(logits, labels, mask)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "54718" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "with open('skip-news-dict.json') as fopen:\n", + " dictionary = json.load(fopen)\n", + "len(dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def rename(checkpoint_dir, replace_from, replace_to, add_prefix, dry_run=False):\n", + " checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)\n", + " with tf.Session() as sess:\n", + " for var_name, _ in tf.contrib.framework.list_variables(checkpoint_dir):\n", + " var = tf.contrib.framework.load_variable(checkpoint_dir, var_name)\n", + " new_name = var_name\n", + " if None not in [replace_from, replace_to]:\n", + " new_name = new_name.replace(replace_from, replace_to)\n", + " if add_prefix:\n", + " new_name = add_prefix + new_name\n", + "\n", + " if dry_run:\n", + " print('%s would be renamed to %s.' % (var_name, new_name))\n", + " else:\n", + " print('Renaming %s to %s.' % (var_name, new_name))\n", + " # Rename the variable\n", + " var = tf.Variable(var, name=new_name)\n", + "\n", + " if not dry_run:\n", + " # Save the variables\n", + " saver = tf.train.Saver()\n", + " sess.run(tf.global_variables_initializer())\n", + " saver.save(sess, 'skip-rename/model.ckpt')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# rename('skip/model.ckpt','thought_scope_e1d42da4-5ae4-4898-b0f1-f52f687a4e28',\n", + "# 'thought_scope',None)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "sess = tf.InteractiveSession()\n", + "model = Model(len(dictionary), embedding_size = 128, output_size = 128, batch_size=16,maxlen=100)\n", + "sess.run(tf.global_variables_initializer())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Variable:0',\n", + " 'dense/kernel:0',\n", + " 'dense/bias:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/gates/kernel:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/gates/bias:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/candidate/kernel:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/candidate/bias:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/gates/kernel:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/gates/bias:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/candidate/kernel:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/candidate/bias:0',\n", + " 'decoder/gru_cell/gates/kernel:0',\n", + " 'decoder/gru_cell/gates/bias:0',\n", + " 'decoder/gru_cell/candidate/kernel:0',\n", + " 'decoder/gru_cell/candidate/bias:0',\n", + " 'decoder_1/gru_cell/gates/kernel:0',\n", + " 'decoder_1/gru_cell/gates/bias:0',\n", + " 'decoder_1/gru_cell/candidate/kernel:0',\n", + " 'decoder_1/gru_cell/candidate/bias:0',\n", + " 'beta1_power:0',\n", + " 'beta2_power:0',\n", + " 'Variable/Adam:0',\n", + " 'Variable/Adam_1:0',\n", + " 'dense/kernel/Adam:0',\n", + " 'dense/kernel/Adam_1:0',\n", + " 'dense/bias/Adam:0',\n", + " 'dense/bias/Adam_1:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/gates/kernel/Adam:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/gates/kernel/Adam_1:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/gates/bias/Adam:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/gates/bias/Adam_1:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/candidate/kernel/Adam:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/candidate/kernel/Adam_1:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/candidate/bias/Adam:0',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/candidate/bias/Adam_1:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/gates/kernel/Adam:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/gates/kernel/Adam_1:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/gates/bias/Adam:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/gates/bias/Adam_1:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/candidate/kernel/Adam:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/candidate/kernel/Adam_1:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/candidate/bias/Adam:0',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/candidate/bias/Adam_1:0',\n", + " 'decoder/gru_cell/gates/kernel/Adam:0',\n", + " 'decoder/gru_cell/gates/kernel/Adam_1:0',\n", + " 'decoder/gru_cell/gates/bias/Adam:0',\n", + " 'decoder/gru_cell/gates/bias/Adam_1:0',\n", + " 'decoder/gru_cell/candidate/kernel/Adam:0',\n", + " 'decoder/gru_cell/candidate/kernel/Adam_1:0',\n", + " 'decoder/gru_cell/candidate/bias/Adam:0',\n", + " 'decoder/gru_cell/candidate/bias/Adam_1:0',\n", + " 'decoder_1/gru_cell/gates/kernel/Adam:0',\n", + " 'decoder_1/gru_cell/gates/kernel/Adam_1:0',\n", + " 'decoder_1/gru_cell/gates/bias/Adam:0',\n", + " 'decoder_1/gru_cell/gates/bias/Adam_1:0',\n", + " 'decoder_1/gru_cell/candidate/kernel/Adam:0',\n", + " 'decoder_1/gru_cell/candidate/kernel/Adam_1:0',\n", + " 'decoder_1/gru_cell/candidate/bias/Adam:0',\n", + " 'decoder_1/gru_cell/candidate/bias/Adam_1:0']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[i.name for i in tf.global_variables()]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Restoring parameters from skip/model.ckpt\n" + ] + } + ], + "source": [ + "saver=tf.train.Saver(tf.global_variables())\n", + "saver.restore(sess, 'skip/model.ckpt')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "def sequence(s, w2v_model, maxlen, vocabulary_size):\n", + " words = s.split()\n", + " np_array = np.zeros((maxlen),dtype=np.int32)\n", + " current_no = 0\n", + " for no, word in enumerate(words[:maxlen - 2]):\n", + " id_to_append = 1\n", + " if word in w2v_model:\n", + " word_id = w2v_model[word]\n", + " if word_id < vocabulary_size:\n", + " id_to_append = word_id\n", + " np_array[no] = id_to_append\n", + " current_no = no\n", + " np_array[current_no + 1] = 3\n", + " return np_array\n", + "\n", + "def generate_batch(sentences,batch_size,w2v_model,maxlen,vocabulary_size):\n", + " window_size = batch_size + 2\n", + " first_index = 1000\n", + " batch_sentences = sentences[first_index:first_index+window_size]\n", + " print(batch_sentences)\n", + " batch_sequences = np.array([sequence(sentence,w2v_model,maxlen,vocabulary_size) for sentence in batch_sentences])\n", + " window_shape = []\n", + " for i in range(batch_size):\n", + " window_shape.append(batch_sequences[i:i+3])\n", + " window_shape = np.array(window_shape)\n", + " return window_shape[:,0], window_shape[:,1], window_shape[:,2]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "with open('news-bm.json','r') as fopen:\n", + " sentences = json.loads(fopen.read())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['pahang diwakili pemangku raja pahang tengku abdullah sultan ahmad shah manakala kelantan diwakili pemangku raja kelantan dr', 'tengku muhammad faiz petra', 'pada hari kedua mesyuarat yang bermula kira pukul pagi itu raja-raja melayu diiringi menteri besar masing-masing manakala yang dipertua negeri pulau pinang sabah dan melaka diiringi ketua menteri masing-masing']\n" + ] + } + ], + "source": [ + "bw_input, current_input, fw_input = generate_batch(sentences,1,dictionary,100,len(dictionary))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "encoded = sess.run(model.get_thought,feed_dict={model.INPUT:fw_input})" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 1.0416520e-01, -5.5048943e-01, -8.6489022e-01, -4.7374249e-02,\n", + " 1.1276997e+00, 1.8109307e+00, 6.9022512e-01, 1.3390839e-02,\n", + " 2.2568166e-01, -1.2908951e+00, 1.8937750e+00, -6.6073686e-01,\n", + " 8.8402975e-01, -1.9575896e+00, -1.3369490e+00, 8.7181759e-01,\n", + " 6.0808134e-01, -1.3946321e+00, 1.2038462e-01, 1.2153907e+00,\n", + " 5.5231041e-01, -1.6721604e+00, -1.9526482e-04, -6.4797735e-01,\n", + " 1.9013047e-02, 1.6876624e+00, -1.7706637e+00, 3.0935839e-01,\n", + " 2.3643266e-01, -7.0005804e-01, -7.6473856e-01, -6.4990938e-01,\n", + " 8.5101128e-02, 1.9995425e+00, -1.3742411e+00, 1.4046657e+00,\n", + " 1.2373401e+00, 1.3037590e+00, 5.5078387e-01, -1.6784103e+00,\n", + " -1.5637214e+00, 1.4834172e-01, -1.0372441e+00, -2.6549307e-01,\n", + " -1.8813536e+00, 1.2753011e-01, 1.6532394e+00, -5.8884758e-01,\n", + " -2.4680305e-01, -1.9865925e+00, 7.4487889e-01, -2.9214048e-01,\n", + " 7.9541242e-01, -7.1536422e-01, 9.7346407e-01, -2.9780412e-01,\n", + " -1.4487034e+00, 1.0695006e+00, 7.1344101e-01, -1.7302066e-01,\n", + " 1.3620573e-01, 1.3157678e-01, 4.6292901e-02, -6.6628301e-01,\n", + " -9.3853849e-01, -2.3844108e-02, -2.4575531e-02, 1.0214790e+00,\n", + " -1.6275005e+00, 1.0081427e+00, 1.0262668e-02, 1.8486687e+00,\n", + " 1.1360471e+00, -8.4355950e-02, -2.7205276e-01, -3.5243776e-01,\n", + " -8.7074924e-01, 9.2197478e-01, -1.6891556e+00, -1.2980952e+00,\n", + " -5.3385198e-02, -6.4494354e-01, 6.6960633e-02, 4.6848938e-01,\n", + " -6.9672108e-01, -1.6785400e+00, 7.6200837e-01, -5.0406647e-01,\n", + " -1.4501936e+00, 1.3387250e+00, -5.6099737e-01, -2.6650232e-01,\n", + " -3.4384909e-01, 1.5968245e+00, -1.7252556e+00, -2.8877589e-01,\n", + " 2.3671919e-01, -1.7661674e+00, 1.1558040e+00, 8.8561887e-01,\n", + " 5.6536603e-01, 1.6616430e+00, 1.5410352e-01, -1.9581079e-02,\n", + " -1.4912158e+00, 1.4021204e+00, 9.7034663e-01, 1.5269648e+00,\n", + " -6.9160253e-02, -1.2739227e+00, -2.5241894e-01, -1.5882177e+00,\n", + " -1.1387055e+00, -1.7391834e+00, 1.9862680e+00, 8.7520087e-01,\n", + " -1.0236690e+00, 9.9145275e-01, 1.8478736e-01, -5.5831087e-01,\n", + " -8.1992823e-01, 6.1038101e-01, 4.4993043e-02, 1.4730409e+00,\n", + " 3.2682568e-01, 1.8637949e-01, 1.8340302e-01, -4.0022135e-01]],\n", + " dtype=float32)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "encoded" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "strings = ','.join(\n", + " [\n", + " n.name\n", + " for n in tf.get_default_graph().as_graph_def().node\n", + " if (\n", + " 'Variable' in n.op\n", + " or n.name.find('Placeholder') >= 0\n", + " or 'add_1' in n.name\n", + " or 'attention' in n.name\n", + " )\n", + " and 'Adam' not in n.name\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Variable',\n", + " 'dense/kernel',\n", + " 'dense/bias',\n", + " 'Placeholder',\n", + " 'Placeholder_1',\n", + " 'Placeholder_2',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/gates/kernel',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/gates/bias',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/candidate/kernel',\n", + " 'thought_scope/bidirectional_rnn/fw/gru_cell/candidate/bias',\n", + " 'thought_scope/bidirectional_rnn/fw/fw/while/add_1/y',\n", + " 'thought_scope/bidirectional_rnn/fw/fw/while/add_1',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/gates/kernel',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/gates/bias',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/candidate/kernel',\n", + " 'thought_scope/bidirectional_rnn/bw/gru_cell/candidate/bias',\n", + " 'thought_scope/bidirectional_rnn/bw/bw/while/add_1/y',\n", + " 'thought_scope/bidirectional_rnn/bw/bw/while/add_1',\n", + " 'thought_scope/add_1',\n", + " 'attention',\n", + " 'decoder/gru_cell/gates/kernel',\n", + " 'decoder/gru_cell/gates/bias',\n", + " 'decoder/gru_cell/candidate/kernel',\n", + " 'decoder/gru_cell/candidate/bias',\n", + " 'decoder/while/add_1/y',\n", + " 'decoder/while/add_1',\n", + " 'decoder_1/gru_cell/gates/kernel',\n", + " 'decoder_1/gru_cell/gates/bias',\n", + " 'decoder_1/gru_cell/candidate/kernel',\n", + " 'decoder_1/gru_cell/candidate/bias',\n", + " 'decoder_1/while/add_1/y',\n", + " 'decoder_1/while/add_1',\n", + " 'gradients/thought_scope/add_1_grad/Shape',\n", + " 'gradients/thought_scope/add_1_grad/Shape_1',\n", + " 'gradients/thought_scope/add_1_grad/BroadcastGradientArgs',\n", + " 'gradients/thought_scope/add_1_grad/Sum',\n", + " 'gradients/thought_scope/add_1_grad/Reshape',\n", + " 'gradients/thought_scope/add_1_grad/Sum_1',\n", + " 'gradients/thought_scope/add_1_grad/Reshape_1',\n", + " 'beta1_power',\n", + " 'beta2_power']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "strings.split(',')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def freeze_graph(model_dir, output_node_names):\n", + "\n", + " if not tf.gfile.Exists(model_dir):\n", + " raise AssertionError(\n", + " \"Export directory doesn't exists. Please specify an export \"\n", + " \"directory: %s\" % model_dir)\n", + "\n", + " checkpoint = tf.train.get_checkpoint_state(model_dir)\n", + " input_checkpoint = checkpoint.model_checkpoint_path\n", + " \n", + " absolute_model_dir = \"/\".join(input_checkpoint.split('/')[:-1])\n", + " output_graph = absolute_model_dir + \"/frozen_model.pb\"\n", + " clear_devices = True\n", + " with tf.Session(graph=tf.Graph()) as sess:\n", + " saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)\n", + " saver.restore(sess, input_checkpoint)\n", + " output_graph_def = tf.graph_util.convert_variables_to_constants(\n", + " sess,\n", + " tf.get_default_graph().as_graph_def(),\n", + " output_node_names.split(\",\")\n", + " ) \n", + " with tf.gfile.GFile(output_graph, \"wb\") as f:\n", + " f.write(output_graph_def.SerializeToString())\n", + " print(\"%d ops in the final graph.\" % len(output_graph_def.node))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Restoring parameters from skip/model.ckpt\n", + "INFO:tensorflow:Froze 21 variables.\n", + "INFO:tensorflow:Converted 21 variables to const ops.\n", + "1224 ops in the final graph.\n" + ] + } + ], + "source": [ + "freeze_graph('skip', strings)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def load_graph(frozen_graph_filename):\n", + " with tf.gfile.GFile(frozen_graph_filename, \"rb\") as f:\n", + " graph_def = tf.GraphDef()\n", + " graph_def.ParseFromString(f.read())\n", + " with tf.Graph().as_default() as graph:\n", + " tf.import_graph_def(graph_def)\n", + " return graph" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "g=load_graph('skip/frozen_model.pb')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py:1702: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", + " warnings.warn('An interactive session is already active. This can '\n" + ] + } + ], + "source": [ + "x = g.get_tensor_by_name('import/Placeholder_1:0')\n", + "logits = g.get_tensor_by_name('import/thought_scope/add_1:0')\n", + "attention = g.get_tensor_by_name('import/attention:0')\n", + "test_sess = tf.InteractiveSession(graph=g)\n", + "out, att = test_sess.run([logits,attention], feed_dict={x:fw_input})" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1, 54718)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "att.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "rev_dict = {v: k for k, v in dictionary.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "49104\n", + "menjebaknya\n", + "54\n", + "seperti\n", + "5951\n", + "gunanya\n", + "41221\n", + "hawar\n", + "6333\n", + "ganjaran\n", + "27612\n", + "dayangku\n", + "33504\n", + "pijak\n", + "44119\n", + "parol\n", + "43996\n", + "poupart\n", + "22753\n", + "scb\n" + ] + } + ], + "source": [ + "for i in att[0].argsort()[-10:][::-1]:\n", + " print(i)\n", + " print(rev_dict[i])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/session/summary/skip-wiki.py b/session/summary/skip-wiki.py new file mode 100644 index 00000000..50cff96d --- /dev/null +++ b/session/summary/skip-wiki.py @@ -0,0 +1,319 @@ + +# coding: utf-8 + +# In[1]: + + +import sys +import warnings + +if not sys.warnoptions: + warnings.simplefilter('ignore') + +import tensorflow as tf +import numpy as np +from tqdm import tqdm +import re +import collections +import json +import os +from unidecode import unidecode + + +def batch_sequence(sentences, dictionary, maxlen = 50): + np_array = np.zeros((len(sentences), maxlen), dtype = np.int32) + for no_sentence, sentence in enumerate(sentences): + current_no = 0 + for no, word in enumerate(sentence.split()[: maxlen - 2]): + np_array[no_sentence, no] = dictionary.get(word, 1) + current_no = no + np_array[no_sentence, current_no + 1] = 3 + return np_array + + +class Attention: + def __init__(self,hidden_size): + self.hidden_size = hidden_size + self.dense_layer = tf.layers.Dense(hidden_size) + self.v = tf.random_normal([hidden_size],mean=0,stddev=1/np.sqrt(hidden_size)) + + def score(self, hidden_tensor, encoder_outputs): + energy = tf.nn.tanh(self.dense_layer(tf.concat([hidden_tensor,encoder_outputs],2))) + energy = tf.transpose(energy,[0,2,1]) + batch_size = tf.shape(encoder_outputs)[0] + v = tf.expand_dims(tf.tile(tf.expand_dims(self.v,0),[batch_size,1]),1) + energy = tf.matmul(v,energy) + return tf.squeeze(energy,1) + + def __call__(self, hidden, encoder_outputs): + seq_len = tf.shape(encoder_outputs)[1] + batch_size = tf.shape(encoder_outputs)[0] + H = tf.tile(tf.expand_dims(hidden, 1),[1,seq_len,1]) + attn_energies = self.score(H,encoder_outputs) + return tf.expand_dims(tf.nn.softmax(attn_energies),1) + +class Model: + def __init__( + self, + dict_size, + size_layers, + learning_rate, + maxlen, + num_blocks = 3, + ): + block_size = size_layers + self.BEFORE = tf.placeholder(tf.int32,[None,maxlen]) + self.INPUT = tf.placeholder(tf.int32,[None,maxlen]) + self.AFTER = tf.placeholder(tf.int32,[None,maxlen]) + self.batch_size = tf.shape(self.INPUT)[0] + self.output_layer = tf.layers.Dense(dict_size, name="output_layer") + self.output_layer.build(size_layers) + self.embeddings = tf.Variable(tf.random_uniform([dict_size, size_layers], -1, 1)) + embedded = tf.nn.embedding_lookup(self.embeddings, self.INPUT) + self.attention = Attention(size_layers) + + def residual_block(x, size, rate, block, reuse = False): + with tf.variable_scope( + 'block_%d_%d' % (block, rate), reuse = reuse + ): + attn_weights = self.attention(tf.reduce_sum(x,axis=1), x) + conv_filter = tf.layers.conv1d( + attn_weights, + x.shape[2] // 4, + kernel_size = size, + strides = 1, + padding = 'same', + dilation_rate = rate, + activation = tf.nn.tanh, + ) + conv_gate = tf.layers.conv1d( + x, + x.shape[2] // 4, + kernel_size = size, + strides = 1, + padding = 'same', + dilation_rate = rate, + activation = tf.nn.sigmoid, + ) + out = tf.multiply(conv_filter, conv_gate) + out = tf.layers.conv1d( + out, + block_size, + kernel_size = 1, + strides = 1, + padding = 'same', + activation = tf.nn.tanh, + ) + return tf.add(x, out), out + + forward = tf.layers.conv1d( + embedded, block_size, kernel_size = 1, strides = 1, padding = 'SAME' + ) + zeros = tf.zeros_like(forward) + for i in range(num_blocks): + for r in [1, 2, 4, 8, 16]: + forward, s = residual_block( + forward, size = 7, rate = r, block = i + ) + zeros = tf.add(zeros, s) + forward = tf.layers.conv1d( + zeros, + block_size, + kernel_size = 1, + strides = 1, + padding = 'SAME', + activation = tf.nn.tanh, + ) + self.get_thought = tf.reduce_sum(forward,axis=1, name = 'logits') + + def decoder(labels, reuse): + decoder_in = tf.nn.embedding_lookup(self.embeddings, labels) + forward = tf.layers.conv1d( + decoder_in, block_size, kernel_size = 1, strides = 1, padding = 'SAME' + ) + zeros = tf.zeros_like(forward) + for r in [8, 16, 24]: + forward, s = residual_block(forward, size = 7, rate = r, block = 10, reuse = reuse) + zeros = tf.add(zeros, s) + return tf.layers.conv1d( + zeros, + block_size, + kernel_size = 1, + strides = 1, + padding = 'SAME', + activation = tf.nn.tanh, + ) + + fw_logits = decoder(self.AFTER, False) + bw_logits = decoder(self.BEFORE, True) + self.attention = tf.matmul( + self.get_thought, tf.transpose(self.embeddings), name = 'attention' + ) + self.loss = self.calculate_loss(fw_logits, self.AFTER) + self.calculate_loss(bw_logits, self.BEFORE) + self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) + + def calculate_loss(self, outputs, labels): + mask = tf.cast(tf.sign(labels), tf.float32) + logits = self.output_layer(outputs) + return tf.contrib.seq2seq.sequence_loss(logits, labels, mask) + + +def counter_words(sentences): + word_counter = collections.Counter() + word_list = [] + num_lines, num_words = (0, 0) + for i in sentences: + words = re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', i) + word_counter.update(words) + word_list.extend(words) + num_lines += 1 + num_words += len(words) + return word_counter, word_list, num_lines, num_words + + +def build_dict(word_counter, vocab_size = 200000): + count = [['PAD', 0], ['UNK', 1], ['START', 2], ['END', 3]] + count.extend(word_counter.most_common(vocab_size)) + dictionary = dict() + for word, _ in count: + dictionary[word] = len(dictionary) + return dictionary, {word: idx for idx, word in dictionary.items()} + + +def train_model( + train_X, + train_Y_before, + train_Y_after, + epoch = 10, + batch_size = 16, + embedding_size = 64, + maxlen = 50, + **kwargs +): + word_counter, _, _, _ = counter_words(train_X) + dictionary, _ = build_dict(word_counter) + print(len(dictionary)) + _graph = tf.Graph() + with _graph.as_default(): + model = Model( + len(dictionary), + embedding_size, + 1e-3, + maxlen, + ) + sess = tf.InteractiveSession() + saver = tf.train.Saver(tf.global_variables()) + sess.run(tf.global_variables_initializer()) + saver.save(sess, 'skip-wiki/model.ckpt') + + for e in range(epoch): + pbar = tqdm(range(0, len(train_X), batch_size), desc = 'minibatch loop') + for i in pbar: + batch_x = batch_sequence( + train_X[i : min(i + batch_size, len(train_X))], + dictionary, + maxlen = maxlen, + ) + batch_y_before = batch_sequence( + train_Y_before[i : min(i + batch_size, len(train_X))], + dictionary, + maxlen = maxlen, + ) + batch_y_after = batch_sequence( + train_Y_after[i : min(i + batch_size, len(train_X))], + dictionary, + maxlen = maxlen, + ) + loss, _ = sess.run( + [model.loss, model.optimizer], + feed_dict = { + model.BEFORE: batch_y_before, + model.INPUT: batch_x, + model.AFTER: batch_y_after, + }, + ) + pbar.set_postfix(cost = loss) + saver.save(sess, 'skip-wiki/model.ckpt') + return sess, model, dictionary + + +# In[2]: + + +def cleaning(string): + string = re.sub( + 'http\S+|www.\S+', + '', + ' '.join( + [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0] + ), + ) + string = unidecode(string).replace('.', '. ').replace(',', ', ') + string = re.sub('[^A-Za-z ]+', ' ', string) + string = re.sub(r'[ ]+', ' ', string).strip() + string = ' '.join( + [ + i + for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) + if len(i) + ] + ) + return string.lower() + +def split_by_dot(string): + string = re.sub( + r'(? 10] +print(len(corpus)) + +# In[3]: + + +stride = 1 +t_range = int((len(corpus) - 3) / stride + 1) +left, middle, right = [], [], [] +for i in range(t_range): + slices = corpus[i * stride : i * stride + 3] + left.append(slices[0]) + middle.append(slices[1]) + right.append(slices[2]) + + +# In[5]: + + +len(left) == len(middle) == len(right) + + +# In[6]: + + +from sklearn.utils import shuffle +left, middle, right = shuffle(left, middle, right) + + +# In[ ]: + + +_,_,dictionary = train_model(middle,left,right) +with open('skip-wiki-dict.json', 'w') as fopen: + fopen.write(json.dumps(dictionary)) \ No newline at end of file diff --git a/session/word2vec/wiki-256.py b/session/word2vec/wiki-256.py new file mode 100644 index 00000000..078f505a --- /dev/null +++ b/session/word2vec/wiki-256.py @@ -0,0 +1,103 @@ + +# coding: utf-8 + +# In[1]: + + +import word2vec +import numpy as np +import tensorflow as tf +import json +import os +import re +from unidecode import unidecode +os.environ['CUDA_VISIBLE_DEVICES'] = '' + + +# In[2]: + + +with open('wiki-ms.txt') as fopen: + sentences = fopen.read() + + +def cleaning(string): + string = re.sub( + 'http\S+|www.\S+', + '', + ' '.join( + [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0] + ), + ) + string = unidecode(string).replace('.', '. ').replace(',', ', ') + string = re.sub('[^A-Za-z ]+', ' ', string) + string = re.sub(r'[ ]+', ' ', string).strip() + string = ' '.join( + [ + i + for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) + if len(i) + ] + ) + return string.lower() +# In[3]: + +sentences = cleaning(sentences).split() + +word_array, dictionary, rev_dictionary, num_lines, num_words = word2vec.build_word_array(sentences,vocab_size=1000000) + + +# In[4]: + + +len(dictionary) + + +# In[5]: + + +X, Y = word2vec.build_training_set(word_array) +graph_params = {'batch_size': 32, + 'vocab_size': np.max(X)+1, + 'embed_size': 256, + 'hid_size': 256, + 'neg_samples': 128, + 'learn_rate': 0.01, + 'momentum': 0.9, + 'embed_noise': 0.1, + 'hid_noise': 0.3, + 'epoch':10, + 'optimizer': 'Momentum'} + + +# In[6]: + + +split = round(X.shape[0]*0.9) +train_X, train_Y = X[:split, :], Y[:split, :] +test_X, test_Y = X[split:, :], Y[split:, :] + + +# In[7]: + + +model = word2vec.Model(graph_params) +print('model built, vocab size %d, document length %d'%(np.max(X)+1, len(word_array))) + + +# In[ ]: + + +embed_weights, nce_weights = model.train(train_X, train_Y, test_X, test_Y, + graph_params['epoch'], + graph_params['batch_size']) + + +# In[ ]: + + +import pickle +with open('word2vec-wiki-256.p', 'wb') as fopen: + pickle.dump({'dictionary':dictionary,'rev_dictionary':rev_dictionary, + 'embed_weights':embed_weights,'nce_weights':nce_weights}, fopen) + diff --git a/session/word2vec/word2vec.py b/session/word2vec/word2vec.py index 7e0c057b..1d1c5154 100644 --- a/session/word2vec/word2vec.py +++ b/session/word2vec/word2vec.py @@ -5,33 +5,38 @@ from sklearn.utils import shuffle from sklearn.manifold import TSNE from scipy.spatial.distance import cdist +from tqdm import tqdm + def counter_words(sentences): word_counter = collections.Counter() word_list = [] num_lines, num_words = (0, 0) for i in sentences: - words = re.findall("[\\w']+|[;:\-\(\)&.,!?\"]", i) + words = re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', i) word_counter.update(words) word_list.extend(words) num_lines += 1 num_words += len(words) return word_counter, word_list, num_lines, num_words -def build_dict(word_counter, vocab_size=50000): - count = [['UNK', 0]] + +def build_dict(word_counter, vocab_size = 50000): + count = [['PAD', 0], ['UNK', 1], ['START', 2], ['END', 3]] count.extend(word_counter.most_common(vocab_size)) dictionary = dict() for word, _ in count: dictionary[word] = len(dictionary) return dictionary, {word: idx for idx, word in dictionary.items()} + def doc2num(word_list, dictionary): word_array = [] unknown_val = len(dictionary) for word in word_list: word_array.append(dictionary.get(word, unknown_val)) - return np.array(word_array, dtype=np.int32) + return np.array(word_array, dtype = np.int32) + def build_word_array(sentences, vocab_size): word_counter, word_list, num_lines, num_words = counter_words(sentences) @@ -39,97 +44,125 @@ def build_word_array(sentences, vocab_size): word_array = doc2num(word_list, dictionary) return word_array, dictionary, rev_dictionary, num_lines, num_words + def build_training_set(word_array): num_words = len(word_array) - x = np.zeros((num_words-4, 4), dtype=np.int32) - y = np.zeros((num_words-4, 1), dtype=np.int32) - shift = np.array([-2, -1, 1, 2], dtype=np.int32) - for idx in range(2, num_words-2): - y[idx-2, 0] = word_array[idx] - x[idx-2, :] = word_array[idx+shift] + x = np.zeros((num_words - 4, 4), dtype = np.int32) + y = np.zeros((num_words - 4, 1), dtype = np.int32) + shift = np.array([-2, -1, 1, 2], dtype = np.int32) + for idx in range(2, num_words - 2): + y[idx - 2, 0] = word_array[idx] + x[idx - 2, :] = word_array[idx + shift] return x, y + class Model: def __init__(self, graph_params): g_params = graph_params tf.reset_default_graph() self.sess = tf.InteractiveSession() - self.X = tf.placeholder(tf.int64, shape=[None, 4]) - self.Y = tf.placeholder(tf.int64, shape=[None, 1]) - w_m2, w_m1, w_p1, w_p2 = tf.unstack(self.X, axis=1) - self.embed_weights = tf.Variable(tf.random_uniform([g_params['vocab_size'],g_params['embed_size']], - -g_params['embed_noise'],g_params['embed_noise'])) + self.X = tf.placeholder(tf.int64, shape = [None, 4]) + self.Y = tf.placeholder(tf.int64, shape = [None, 1]) + w_m2, w_m1, w_p1, w_p2 = tf.unstack(self.X, axis = 1) + self.embed_weights = tf.Variable( + tf.random_uniform( + [g_params['vocab_size'], g_params['embed_size']], + -g_params['embed_noise'], + g_params['embed_noise'], + ) + ) embed_m2 = tf.nn.embedding_lookup(self.embed_weights, w_m2) embed_m1 = tf.nn.embedding_lookup(self.embed_weights, w_m1) embed_p1 = tf.nn.embedding_lookup(self.embed_weights, w_p1) embed_p2 = tf.nn.embedding_lookup(self.embed_weights, w_p2) - embed_stack = tf.concat([embed_m2, embed_m1, embed_p1, embed_p2],1) - hid_weights = tf.Variable(tf.random_normal([g_params['embed_size'] * 4, - g_params['hid_size']], - stddev=g_params['hid_noise']/(g_params['embed_size'] * 4)**0.5)) + embed_stack = tf.concat([embed_m2, embed_m1, embed_p1, embed_p2], 1) + hid_weights = tf.Variable( + tf.random_normal( + [g_params['embed_size'] * 4, g_params['hid_size']], + stddev = g_params['hid_noise'] + / (g_params['embed_size'] * 4) ** 0.5, + ) + ) hid_bias = tf.Variable(tf.zeros([g_params['hid_size']])) hid_out = tf.nn.tanh(tf.matmul(embed_stack, hid_weights) + hid_bias) - self.nce_weights = tf.Variable(tf.random_normal([g_params['vocab_size'], - g_params['hid_size']], - stddev=1.0 / g_params['hid_size'] ** 0.5)) + self.nce_weights = tf.Variable( + tf.random_normal( + [g_params['vocab_size'], g_params['hid_size']], + stddev = 1.0 / g_params['hid_size'] ** 0.5, + ) + ) nce_bias = tf.Variable(tf.zeros([g_params['vocab_size']])) - self.cost = tf.reduce_mean(tf.nn.nce_loss(self.nce_weights, nce_bias, - inputs=hid_out, labels=self.Y, - num_sampled=g_params['neg_samples'], - num_classes=g_params['vocab_size'], - num_true=1, remove_accidental_hits=True)) - self.logits = tf.argmax(tf.matmul(hid_out,self.nce_weights, transpose_b=True) + nce_bias, axis=1) + self.cost = tf.reduce_mean( + tf.nn.nce_loss( + self.nce_weights, + nce_bias, + inputs = hid_out, + labels = self.Y, + num_sampled = g_params['neg_samples'], + num_classes = g_params['vocab_size'], + num_true = 1, + remove_accidental_hits = True, + ) + ) + self.logits = tf.argmax( + tf.matmul(hid_out, self.nce_weights, transpose_b = True) + nce_bias, + axis = 1, + ) if g_params['optimizer'] == 'RMSProp': - self.optimizer = tf.train.RMSPropOptimizer(g_params['learn_rate']).minimize(self.cost) + self.optimizer = tf.train.RMSPropOptimizer( + g_params['learn_rate'] + ).minimize(self.cost) elif g_params['optimizer'] == 'Momentum': - self.optimizer = tf.train.MomentumOptimizer(g_params['learn_rate'], - g_params['momentum']).minimize(self.cost) + self.optimizer = tf.train.MomentumOptimizer( + g_params['learn_rate'], g_params['momentum'] + ).minimize(self.cost) elif g_params['optimizer'] == 'Adam': - self.optimizer = tf.train.AdamOptimizer(g_params['learn_rate']).minimize(self.cost) + self.optimizer = tf.train.AdamOptimizer( + g_params['learn_rate'] + ).minimize(self.cost) else: print('Optimizer not supported,exit.') self.sess.run(tf.global_variables_initializer()) - - def train(self,X, Y, X_val, Y_val,epoch,batch_size): - num_batches = len(X) // batch_size - avg_loss, avg_loss_count, batch_count = (0, 0, 0) - e_train, e_val = ([], []) - for i in range(1, epoch+1): - avg_loss, avg_loss_count = (0, 0) + + def train(self, X, Y, X_val, Y_val, epoch, batch_size): + for i in range(epoch): X, Y = shuffle(X, Y) - for batch in range(num_batches): - bot_idx = batch * batch_size - top_idx = bot_idx + batch_size - feed_dict = {self.X: X[bot_idx:top_idx, :],self.Y: Y[bot_idx:top_idx, :]} - _, loss = self.sess.run([self.optimizer,self.cost],feed_dict=feed_dict) - avg_loss += loss - avg_loss_count += 1 - batch_count += 1 - num_batches = X_val.shape[0] // batch_size - avg_loss = avg_loss / avg_loss_count - e_train.append(avg_loss) - val_loss = 0 - for batch in range(num_batches): - bot_idx = batch * batch_size - top_idx = bot_idx + batch_size - feed_dict = {self.X: X_val[bot_idx:top_idx, :], - self.Y: Y_val[bot_idx:top_idx, :]} - val_loss += self.sess.run(self.cost, feed_dict=feed_dict) - val_loss = val_loss / num_batches - e_val.append(val_loss) - print('epoch %d, total batch %d, train loss %f, val loss %f'%(i,batch_count,avg_loss, val_loss)) + pbar = tqdm( + range(0, len(X), batch_size), desc = 'train minibatch loop' + ) + for batch in pbar: + feed_dict = { + self.X: X[batch : min(batch + batch_size, len(X))], + self.Y: Y[batch : min(batch + batch_size, len(X))], + } + _, loss = self.sess.run( + [self.optimizer, self.cost], feed_dict = feed_dict + ) + pbar.set_postfix(cost = loss) + + pbar = tqdm( + range(0, len(X_val), batch_size), desc = 'test minibatch loop' + ) + for batch in pbar: + feed_dict = { + self.X: X_val[batch : min(batch + batch_size, len(X_val))], + self.Y: Y_val[batch : min(batch + batch_size, len(X_val))], + } + loss = self.sess.run(self.cost, feed_dict = feed_dict) + pbar.set_postfix(cost = loss) return self.embed_weights.eval(), self.nce_weights.eval() + class Word2Vec: - def __init__(self,embed_matrix, dictionary): + def __init__(self, embed_matrix, dictionary): self._embed_matrix = embed_matrix self._dictionary = dictionary self._reverse_dictionary = {v: k for k, v in dictionary.items()} - + def get_vector_by_name(self, word): return np.ravel(self._embed_matrix[self._dictionary[word], :]) - - def n_closest(self, word, num_closest=5, metric='cosine'): + + def n_closest(self, word, num_closest = 5, metric = 'cosine'): wv = self.get_vector_by_name(word) closest_indices = self.closest_row_indices(wv, num_closest + 1, metric) word_list = [] @@ -138,13 +171,15 @@ def n_closest(self, word, num_closest=5, metric='cosine'): if word in word_list: word_list.remove(word) return word_list - + def closest_row_indices(self, wv, num, metric): - dist_array = np.ravel(cdist(self._embed_matrix, wv.reshape((1, -1)),metric=metric)) + dist_array = np.ravel( + cdist(self._embed_matrix, wv.reshape((1, -1)), metric = metric) + ) sorted_indices = np.argsort(dist_array) return sorted_indices[:num] - - def analogy(self, a, b, c, num=1, metric='cosine'): + + def analogy(self, a, b, c, num = 1, metric = 'cosine'): va = self.get_vector_by_name(a) vb = self.get_vector_by_name(b) vc = self.get_vector_by_name(c) @@ -156,7 +191,7 @@ def analogy(self, a, b, c, num=1, metric='cosine'): return d_word_list def project_2d(self, start, end): - tsne = TSNE(n_components=2) + tsne = TSNE(n_components = 2) embed_2d = tsne.fit_transform(self._embed_matrix[start:end, :]) word_list = [] for i in range(start, end):