From e130c0c417bae71915553fc36e6200564b39c504 Mon Sep 17 00:00:00 2001 From: husein zolkepli Date: Mon, 28 Jun 2021 14:18:46 +0800 Subject: [PATCH] release 4.5 --- docs/Api.rst | 6 + docs/load-dependency.ipynb | 5589 ++++++++++++---- ...load-knowledge-graph-from-dependency.ipynb | 400 +- docs/speech-toolkit.rst | 5 + example/dependency/load-dependency.ipynb | 391 +- ...load-knowledge-graph-from-dependency.ipynb | 400 +- load-dependency.ipynb | 5956 +++++++++++++++++ malaya/dependency.py | 113 +- malaya/knowledge_graph.py | 10 +- malaya/model/bert.py | 5 +- malaya/model/xlnet.py | 5 +- malaya/train/__init__.py | 155 - malaya/train/model/__init__.py | 3 - malaya/train/model/alxlnet/__init__.py | 1 - malaya/train/model/bigbird/__init__.py | 1 - malaya/train/model/bigbird/attention.py | 1279 ---- malaya/train/model/bigbird/beam_search.py | 277 - malaya/train/model/bigbird/decoder.py | 681 -- malaya/train/model/bigbird/encoder.py | 515 -- malaya/train/model/bigbird/modeling.py | 516 -- malaya/train/model/bigbird/optimization.py | 182 - malaya/train/model/bigbird/utils.py | 806 --- malaya/train/model/pegasus/__init__.py | 1 - malaya/train/model/pegasus/base.py | 59 - malaya/train/model/pegasus/layers/__init__.py | 13 - .../train/model/pegasus/layers/attention.py | 135 - .../train/model/pegasus/layers/beam_search.py | 301 - malaya/train/model/pegasus/layers/decoding.py | 208 - .../train/model/pegasus/layers/embedding.py | 73 - malaya/train/model/pegasus/layers/timing.py | 68 - .../model/pegasus/layers/transformer_block.py | 123 - malaya/train/model/pegasus/transformer.py | 198 - .../train/model/product_key_memory/layer.py | 22 - .../train/model/product_key_memory/model.py | 33 - pretrained-model/fnet/README.md | 4 +- pretrained-model/fnet/sentiment-base.ipynb | 494 ++ pretrained-model/fnet/test-fnet.ipynb | 561 ++ pretrained-model/performer/fast_attention.py | 529 ++ pretrained-model/performer/model.py | 230 + .../performer/test-performer.ipynb | 33 + 
pretrained-model/performer/util.py | 195 + session/dependency/albert-base.ipynb | 2606 +++++--- session/dependency/albert-tiny.ipynb | 2563 ++++--- session/dependency/alxlnet-base.ipynb | 2639 +++++--- session/dependency/bert-base.ipynb | 2884 ++++---- session/dependency/tiny-bert.ipynb | 2736 ++++---- session/dependency/xlnet-base.ipynb | 2928 ++++---- 47 files changed, 22562 insertions(+), 14370 deletions(-) create mode 100644 load-dependency.ipynb delete mode 100644 malaya/train/__init__.py delete mode 100644 malaya/train/model/__init__.py delete mode 100644 malaya/train/model/alxlnet/__init__.py delete mode 100644 malaya/train/model/bigbird/__init__.py delete mode 100644 malaya/train/model/bigbird/attention.py delete mode 100644 malaya/train/model/bigbird/beam_search.py delete mode 100644 malaya/train/model/bigbird/decoder.py delete mode 100644 malaya/train/model/bigbird/encoder.py delete mode 100644 malaya/train/model/bigbird/modeling.py delete mode 100644 malaya/train/model/bigbird/optimization.py delete mode 100644 malaya/train/model/bigbird/utils.py delete mode 100644 malaya/train/model/pegasus/__init__.py delete mode 100644 malaya/train/model/pegasus/base.py delete mode 100644 malaya/train/model/pegasus/layers/__init__.py delete mode 100644 malaya/train/model/pegasus/layers/attention.py delete mode 100644 malaya/train/model/pegasus/layers/beam_search.py delete mode 100644 malaya/train/model/pegasus/layers/decoding.py delete mode 100644 malaya/train/model/pegasus/layers/embedding.py delete mode 100644 malaya/train/model/pegasus/layers/timing.py delete mode 100644 malaya/train/model/pegasus/layers/transformer_block.py delete mode 100644 malaya/train/model/pegasus/transformer.py delete mode 100644 malaya/train/model/product_key_memory/layer.py delete mode 100644 malaya/train/model/product_key_memory/model.py create mode 100644 pretrained-model/fnet/sentiment-base.ipynb create mode 100644 pretrained-model/fnet/test-fnet.ipynb create mode 100644 
pretrained-model/performer/fast_attention.py create mode 100644 pretrained-model/performer/model.py create mode 100644 pretrained-model/performer/test-performer.ipynb create mode 100644 pretrained-model/performer/util.py diff --git a/docs/Api.rst b/docs/Api.rst index f0931351..16697a09 100644 --- a/docs/Api.rst +++ b/docs/Api.rst @@ -27,6 +27,12 @@ malaya.constituency .. automodule:: malaya.constituency :members: +malaya.coref +--------------------- + +.. automodule:: malaya.coref + :members: + malaya.dependency ------------------ diff --git a/docs/load-dependency.ipynb b/docs/load-dependency.ipynb index 4f3e3f5a..d1a09942 100644 --- a/docs/load-dependency.ipynb +++ b/docs/load-dependency.ipynb @@ -31,15 +31,15 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 4.98 s, sys: 944 ms, total: 5.92 s\n", - "Wall time: 6.56 s\n" + "CPU times: user 5.15 s, sys: 925 ms, total: 6.07 s\n", + "Wall time: 6.8 s\n" ] } ], @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -135,91 +135,86 @@ " \n", " \n", " 8\n", - " advmod\n", - " adverbial modifier\n", - " \n", - " \n", - " 9\n", " compound\n", " compound\n", " \n", " \n", - " 10\n", + " 9\n", " compound:plur\n", " plural compound\n", " \n", " \n", - " 11\n", + " 10\n", " conj\n", " conjunct\n", " \n", " \n", - " 12\n", + " 11\n", " cop\n", " cop\n", " \n", " \n", - " 13\n", + " 12\n", " csubj\n", " clausal subject\n", " \n", " \n", - " 14\n", + " 13\n", " dep\n", " dependent\n", " \n", " \n", - " 15\n", + " 14\n", " det\n", " determiner\n", " \n", " \n", - " 16\n", + " 15\n", " fixed\n", " multi-word expression\n", " \n", " \n", - " 17\n", + " 16\n", " flat\n", " name\n", " \n", " \n", - " 18\n", + " 17\n", " iobj\n", " indirect object\n", " \n", " \n", - " 19\n", + " 18\n", " mark\n", " marker\n", " \n", " \n", - " 20\n", + 
" 19\n", " nmod\n", " nominal modifier\n", " \n", " \n", - " 21\n", + " 20\n", " nsubj\n", " nominal subject\n", " \n", " \n", - " 22\n", + " 21\n", " obj\n", " direct object\n", " \n", " \n", - " 23\n", + " 22\n", " parataxis\n", " parataxis\n", " \n", " \n", - " 24\n", + " 23\n", " root\n", " root\n", " \n", " \n", - " 25\n", + " 24\n", " xcomp\n", " open clausal complement\n", " \n", @@ -237,27 +232,26 @@ "5 aux auxiliary\n", "6 case case marking\n", "7 ccomp clausal complement\n", - "8 advmod adverbial modifier\n", - "9 compound compound\n", - "10 compound:plur plural compound\n", - "11 conj conjunct\n", - "12 cop cop\n", - "13 csubj clausal subject\n", - "14 dep dependent\n", - "15 det determiner\n", - "16 fixed multi-word expression\n", - "17 flat name\n", - "18 iobj indirect object\n", - "19 mark marker\n", - "20 nmod nominal modifier\n", - "21 nsubj nominal subject\n", - "22 obj direct object\n", - "23 parataxis parataxis\n", - "24 root root\n", - "25 xcomp open clausal complement" + "8 compound compound\n", + "9 compound:plur plural compound\n", + "10 conj conjunct\n", + "11 cop cop\n", + "12 csubj clausal subject\n", + "13 dep dependent\n", + "14 det determiner\n", + "15 fixed multi-word expression\n", + "16 flat name\n", + "17 iobj indirect object\n", + "18 mark marker\n", + "19 nmod nominal modifier\n", + "20 nsubj nominal subject\n", + "21 obj direct object\n", + "22 parataxis parataxis\n", + "23 root root\n", + "24 xcomp open clausal complement" ] }, - "execution_count": 5, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -270,12 +264,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### List available transformer Dependency models" + "### List available transformer Dependency models\n", + "\n", + "```python\n", + "def available_transformer(version: str = 'v2'):\n", + " \"\"\"\n", + " List available transformer dependency parsing models.\n", + "\n", + " Parameters\n", + " ----------\n", + " version : str, 
optional (default='v2')\n", + " Version supported. Allowed values:\n", + "\n", + " * ``'v1'`` - version 1, maintain for knowledge graph.\n", + " * ``'v2'`` - Trained on bigger dataset, better version.\n", + "\n", + " \"\"\"\n", + "```" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -316,51 +326,51 @@ " \n", " \n", " bert\n", - " 426.0\n", - " 112.0\n", - " 0.855\n", - " 0.848\n", - " 0.920\n", + " 455.0\n", + " 114.00\n", + " 0.820450\n", + " 0.79970\n", + " 0.98936\n", " \n", " \n", " tiny-bert\n", - " 59.5\n", - " 15.7\n", - " 0.718\n", - " 0.694\n", - " 0.886\n", + " 69.7\n", + " 17.50\n", + " 0.795252\n", + " 0.72470\n", + " 0.98939\n", " \n", " \n", " albert\n", - " 50.0\n", - " 13.2\n", - " 0.811\n", - " 0.793\n", - " 0.879\n", + " 60.8\n", + " 15.30\n", + " 0.821895\n", + " 0.79752\n", + " 1.00000\n", " \n", " \n", " tiny-albert\n", - " 24.8\n", - " 6.6\n", - " 0.708\n", - " 0.673\n", - " 0.817\n", + " 33.4\n", + " 8.51\n", + " 0.786500\n", + " 0.75870\n", + " 1.00000\n", " \n", " \n", " xlnet\n", - " 450.2\n", - " 119.0\n", - " 0.931\n", - " 0.925\n", - " 0.947\n", + " 480.2\n", + " 121.00\n", + " 0.848110\n", + " 0.82741\n", + " 0.92101\n", " \n", " \n", " alxlnet\n", - " 50.0\n", - " 14.3\n", - " 0.894\n", - " 0.886\n", - " 0.942\n", + " 61.2\n", + " 16.40\n", + " 0.849290\n", + " 0.82810\n", + " 0.92099\n", " \n", " \n", "\n", @@ -368,23 +378,23 @@ ], "text/plain": [ " Size (MB) Quantized Size (MB) Arc Accuracy Types Accuracy \\\n", - "bert 426.0 112.0 0.855 0.848 \n", - "tiny-bert 59.5 15.7 0.718 0.694 \n", - "albert 50.0 13.2 0.811 0.793 \n", - "tiny-albert 24.8 6.6 0.708 0.673 \n", - "xlnet 450.2 119.0 0.931 0.925 \n", - "alxlnet 50.0 14.3 0.894 0.886 \n", + "bert 455.0 114.00 0.820450 0.79970 \n", + "tiny-bert 69.7 17.50 0.795252 0.72470 \n", + "albert 60.8 15.30 0.821895 0.79752 \n", + "tiny-albert 33.4 8.51 0.786500 0.75870 \n", + "xlnet 480.2 121.00 0.848110 0.82741 \n", + "alxlnet 
61.2 16.40 0.849290 0.82810 \n", "\n", " Root Accuracy \n", - "bert 0.920 \n", - "tiny-bert 0.886 \n", - "albert 0.879 \n", - "tiny-albert 0.817 \n", - "xlnet 0.947 \n", - "alxlnet 0.942 " + "bert 0.98936 \n", + "tiny-bert 0.98939 \n", + "albert 1.00000 \n", + "tiny-albert 1.00000 \n", + "xlnet 0.92101 \n", + "alxlnet 0.92099 " ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -393,15 +403,6 @@ "malaya.dependency.available_transformer()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure you can check accuracy chart from here first before select a model, https://malaya.readthedocs.io/en/latest/models-accuracy.html#Dependency-parsing\n", - "\n", - "**The best model in term of accuracy is XLNET**." - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -409,13 +410,19 @@ "### Load xlnet dependency model\n", "\n", "```python\n", - "def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):\n", + "def transformer(version: str = 'v2', model: str = 'xlnet', quantized: bool = False, **kwargs):\n", " \"\"\"\n", " Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.\n", "\n", " Parameters\n", " ----------\n", - " model : str, optional (default='bert')\n", + " version : str, optional (default='v2')\n", + " Version supported. Allowed values:\n", + "\n", + " * ``'v1'`` - version 1, maintain for knowledge graph.\n", + " * ``'v2'`` - Trained on bigger dataset, better version.\n", + "\n", + " model : str, optional (default='xlnet')\n", " Model architecture supported. 
Allowed values:\n", "\n", " * ``'bert'`` - Google BERT BASE parameters.\n", @@ -442,11 +449,19 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:running dependency-v2/albert using device /device:CPU:0\n" + ] + } + ], "source": [ - "model = malaya.dependency.transformer(model = 'xlnet')" + "model = malaya.dependency.transformer(model = 'albert')" ] }, { @@ -462,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -470,12 +485,12 @@ "output_type": "stream", "text": [ "WARNING:root:Load quantized model will cause accuracy drop.\n", - "INFO:root:running dependency/xlnet-quantized using device /device:CPU:0\n" + "INFO:root:running dependency-v2/albert-quantized using device /device:CPU:0\n" ] } ], "source": [ - "quantized_model = malaya.dependency.transformer(model = 'xlnet', quantized = True)" + "quantized_model = malaya.dependency.transformer(model = 'albert', quantized = True)" ] }, { @@ -502,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -511,7 +526,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -523,63 +538,75 @@ "\n", "\n", - "\n", + "\n", "\n", "G\n", - "\n", + "\n", "\n", "\n", "0\n", - "0 (None)\n", + "0 (None)\n", "\n", "\n", "\n", "3\n", - "3 (menasihati)\n", + "3 (menasihati)\n", "\n", "\n", "\n", "0->3\n", - "\n", - "\n", - "root\n", + "\n", + "\n", + "root\n", "\n", "\n", "\n", "1\n", - "1 (Dr)\n", + "1 (Dr)\n", "\n", "\n", - "\n", + "\n", "3->1\n", - "\n", - "\n", - "nsubj\n", + "\n", + "\n", + "nsubj\n", "\n", "\n", - "\n", + "\n", "4\n", - "4 (mereka)\n", + "4 (mereka)\n", "\n", "\n", - "\n", + "\n", "3->4\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "obj\n", "\n", "\n", - "\n", + 
"\n", "6\n", - "6 (berhenti)\n", + "6 (berhenti)\n", "\n", "\n", - "\n", + "\n", "3->6\n", - "\n", - "\n", - "xcomp\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "15\n", + "15 (.)\n", + "\n", + "\n", + "\n", + "3->15\n", + "\n", + "\n", + "punct\n", "\n", "\n", "\n", @@ -589,126 +616,126 @@ "\n", "\n", "1->2\n", - "\n", - "\n", - "flat\n", - "\n", - "\n", - "\n", - "14\n", - "14 (memandu.)\n", - "\n", - "\n", - "\n", - "1->14\n", - "\n", - "\n", - "acl\n", - "\n", - "\n", - "\n", - "13\n", - "13 (ketika)\n", - "\n", - "\n", - "\n", - "14->13\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "5\n", - "5 (supaya)\n", + "5 (supaya)\n", "\n", "\n", "\n", "6->5\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "cc\n", "\n", "\n", "\n", "7\n", - "7 (berehat)\n", + "7 (berehat)\n", "\n", "\n", "\n", "6->7\n", - "\n", - "\n", - "xcomp\n", + "\n", + "\n", + "xcomp\n", "\n", "\n", "\n", "9\n", - "9 (tidur)\n", + "9 (tidur)\n", "\n", "\n", "\n", "6->9\n", - "\n", - "\n", - "conj\n", + "\n", + "\n", + "conj\n", "\n", "\n", "\n", "8\n", - "8 (dan)\n", + "8 (dan)\n", "\n", "\n", "\n", "9->8\n", - "\n", - "\n", - "cc\n", + "\n", + "\n", + "cc\n", "\n", - "\n", + "\n", "\n", - "10\n", - "10 (sebentar)\n", + "12\n", + "12 (mengantuk)\n", "\n", - "\n", + "\n", "\n", - "9->10\n", - "\n", - "\n", - "advmod\n", + "9->12\n", + "\n", + "\n", + "xcomp\n", "\n", - "\n", + "\n", "\n", - "12\n", - "12 (mengantuk)\n", + "14\n", + "14 (memandu)\n", "\n", - "\n", + "\n", "\n", - "9->12\n", - "\n", - "\n", - "advcl\n", + "9->14\n", + "\n", + "\n", + "advcl\n", "\n", - "\n", + "\n", "\n", + "10\n", + "10 (sebentar)\n", + "\n", + "\n", + "\n", + "12->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", "11\n", - "11 (sekiranya)\n", + "11 (sekiranya)\n", "\n", "\n", - "\n", + "\n", "12->11\n", - "\n", - "\n", - "mark\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "13\n", + "13 (ketika)\n", + "\n", + "\n", + "\n", + "14->13\n", + "\n", + 
"\n", + "mark\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 13, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -720,7 +747,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -732,192 +759,204 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "G\n", - "\n", + "\n", "\n", "\n", "0\n", - "0 (None)\n", + "0 (None)\n", "\n", "\n", "\n", "3\n", - "3 (menasihati)\n", + "3 (menasihati)\n", "\n", "\n", "\n", "0->3\n", - "\n", - "\n", - "root\n", + "\n", + "\n", + "root\n", "\n", "\n", "\n", "1\n", - "1 (Dr)\n", + "1 (Dr)\n", "\n", "\n", "\n", "3->1\n", - "\n", - "\n", - "nsubj\n", + "\n", + "\n", + "nsubj\n", "\n", "\n", "\n", "4\n", - "4 (mereka)\n", + "4 (mereka)\n", "\n", "\n", "\n", "3->4\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "obj\n", "\n", "\n", "\n", "6\n", - "6 (berhenti)\n", + "6 (berhenti)\n", "\n", "\n", "\n", "3->6\n", - "\n", - "\n", - "ccomp\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "15\n", + "15 (.)\n", + "\n", + "\n", + "\n", + "3->15\n", + "\n", + "\n", + "punct\n", "\n", "\n", "\n", "2\n", - "2 (Mahathir)\n", + "2 (Mahathir)\n", "\n", "\n", "\n", "1->2\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "flat\n", "\n", "\n", - "\n", + "\n", "5\n", - "5 (supaya)\n", + "5 (supaya)\n", "\n", "\n", - "\n", + "\n", "6->5\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "cc\n", "\n", "\n", - "\n", + "\n", "7\n", - "7 (berehat)\n", + "7 (berehat)\n", "\n", "\n", - "\n", + "\n", "6->7\n", - "\n", - "\n", - "xcomp\n", + "\n", + "\n", + "xcomp\n", "\n", "\n", - "\n", + "\n", "9\n", - "9 (tidur)\n", + "9 (tidur)\n", "\n", "\n", - "\n", + "\n", "6->9\n", - "\n", - "\n", - "conj\n", + "\n", + "\n", + "conj\n", "\n", "\n", - "\n", + "\n", "8\n", - "8 (dan)\n", + "8 (dan)\n", "\n", "\n", - "\n", - "9->8\n", - "\n", - "\n", - "cc\n", - "\n", - "\n", - "\n", - "10\n", - "10 (sebentar)\n", - "\n", - "\n", "\n", - "9->10\n", - "\n", 
- "\n", - "advmod\n", + "9->8\n", + "\n", + "\n", + "cc\n", "\n", "\n", "\n", "12\n", - "12 (mengantuk)\n", + "12 (mengantuk)\n", "\n", "\n", "\n", "9->12\n", - "\n", - "\n", - "advcl\n", + "\n", + "\n", + "xcomp\n", "\n", - "\n", + "\n", "\n", - "11\n", - "11 (sekiranya)\n", + "14\n", + "14 (memandu)\n", "\n", - "\n", - "\n", - "12->11\n", - "\n", - "\n", - "mark\n", + "\n", + "\n", + "9->14\n", + "\n", + "\n", + "advcl\n", "\n", - "\n", + "\n", "\n", - "14\n", - "14 (memandu.)\n", + "10\n", + "10 (sebentar)\n", "\n", - "\n", - "\n", - "11->14\n", - "\n", - "\n", - "advcl\n", + "\n", + "\n", + "12->10\n", + "\n", + "\n", + "case\n", "\n", - "\n", + "\n", "\n", + "11\n", + "11 (sekiranya)\n", + "\n", + "\n", + "\n", + "12->11\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", "13\n", - "13 (ketika)\n", + "13 (ketika)\n", "\n", "\n", - "\n", + "\n", "14->13\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "mark\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 14, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -936,14 +975,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "INFO:root:running dependency/alxlnet using device /device:CPU:0\n" + "INFO:root:running dependency-v2/alxlnet using device /device:CPU:0\n" ] }, { @@ -955,199 +994,211 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "G\n", - "\n", + "\n", "\n", "\n", "0\n", - "0 (None)\n", + "0 (None)\n", "\n", "\n", "\n", "3\n", - "3 (menasihati)\n", + "3 (menasihati)\n", "\n", "\n", "\n", "0->3\n", - "\n", - "\n", - "root\n", + "\n", + "\n", + "root\n", "\n", "\n", "\n", "1\n", - "1 (Dr)\n", + "1 (Dr)\n", "\n", "\n", - "\n", - "3->1\n", - "\n", - "\n", - "nsubj\n", - "\n", - "\n", - "\n", - "2\n", - "2 (Mahathir)\n", - "\n", - "\n", "\n", - "3->2\n", - "\n", - "\n", - "flat\n", + "3->1\n", + "\n", + "\n", + "nsubj\n", "\n", "\n", 
"\n", "4\n", - "4 (mereka)\n", + "4 (mereka)\n", "\n", "\n", "\n", "3->4\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "obj\n", "\n", "\n", "\n", "6\n", - "6 (berhenti)\n", + "6 (berhenti)\n", "\n", "\n", "\n", "3->6\n", - "\n", - "\n", - "xcomp\n", + "\n", + "\n", + "conj\n", "\n", - "\n", + "\n", "\n", + "15\n", + "15 (.)\n", + "\n", + "\n", + "\n", + "3->15\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Mahathir)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", "5\n", - "5 (supaya)\n", + "5 (supaya)\n", "\n", "\n", - "\n", + "\n", "6->5\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "cc\n", "\n", "\n", - "\n", + "\n", "7\n", - "7 (berehat)\n", + "7 (berehat)\n", "\n", "\n", - "\n", + "\n", "6->7\n", - "\n", - "\n", - "xcomp\n", + "\n", + "\n", + "xcomp\n", "\n", "\n", - "\n", + "\n", "9\n", - "9 (tidur)\n", + "9 (tidur)\n", "\n", "\n", - "\n", + "\n", "6->9\n", - "\n", - "\n", - "conj\n", + "\n", + "\n", + "conj\n", "\n", "\n", - "\n", + "\n", "8\n", - "8 (dan)\n", + "8 (dan)\n", "\n", "\n", - "\n", - "9->8\n", - "\n", - "\n", - "cc\n", - "\n", - "\n", - "\n", - "10\n", - "10 (sebentar)\n", - "\n", - "\n", "\n", - "9->10\n", - "\n", - "\n", - "advmod\n", + "9->8\n", + "\n", + "\n", + "cc\n", "\n", "\n", "\n", "12\n", - "12 (mengantuk)\n", + "12 (mengantuk)\n", "\n", "\n", "\n", "9->12\n", - "\n", - "\n", - "advcl\n", + "\n", + "\n", + "xcomp\n", "\n", - "\n", + "\n", "\n", - "11\n", - "11 (sekiranya)\n", + "14\n", + "14 (memandu)\n", "\n", - "\n", - "\n", - "12->11\n", - "\n", - "\n", - "mark\n", + "\n", + "\n", + "9->14\n", + "\n", + "\n", + "advcl\n", "\n", - "\n", + "\n", "\n", - "14\n", - "14 (memandu.)\n", + "10\n", + "10 (sebentar)\n", "\n", - "\n", - "\n", - "11->14\n", - "\n", - "\n", - "ccomp\n", + "\n", + "\n", + "12->10\n", + "\n", + "\n", + "case\n", "\n", - "\n", + "\n", "\n", + "11\n", + "11 (sekiranya)\n", + "\n", + "\n", + "\n", + "12->11\n", + "\n", + "\n", + "advmod\n", + 
"\n", + "\n", + "\n", "13\n", - "13 (ketika)\n", + "13 (ketika)\n", "\n", "\n", - "\n", + "\n", "14->13\n", - "\n", - "\n", - "mark\n", + "\n", + "\n", + "mark\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 15, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "alxlnet = malaya.dependency.transformer(model = 'alxlnet')\n", - "tagging, indexing = malaya.stack.voting_stack([model, alxlnet, model], string)\n", + "tagging, indexing = malaya.stack.voting_stack([model, model, alxlnet], string)\n", "malaya.dependency.dependency_graph(tagging, indexing).to_graphvis()" ] }, @@ -1160,18 +1211,20 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# https://www.astroawani.com/berita-malaysia/terbaik-tun-kita-geng-najib-razak-puji-tun-m-297884\n", "\n", - "s = \"Najib yang juga Ahli Parlimen Pekan memuji sikap Ahli Parlimen Langkawi itu yang mengaku bersalah selepas melanggar SOP kerana tidak mengambil suhu badan ketika masuk ke sebuah surau di Langkawi pada Sabtu lalu\"" + "s = \"\"\"\n", + "KUALA LUMPUR: Dalam hal politik, jarang sekali untuk melihat dua figura ini - bekas Perdana Menteri, Datuk Seri Najib Razak dan Tun Dr Mahathir Mohamad mempunyai 'pandangan yang sama' atau sekapal. Namun, situasi itu berbeza apabila melibatkan isu ketidakpatuhan terhadap prosedur operasi standard (SOP). 
Najib, yang juga Ahli Parlimen Pekan memuji sikap Ahli Parlimen Langkawi itu yang mengaku bersalah selepas melanggar SOP kerana tidak mengambil suhu badan ketika masuk ke sebuah surau di Langkawi pada Sabtu lalu.\n", + "\"\"\"" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1183,469 +1236,1104 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "G\n", - "\n", + "\n", "\n", "\n", "0\n", - "0 (None)\n", + "0 (None)\n", "\n", - "\n", + "\n", "\n", - "7\n", - "7 (memuji)\n", + "11\n", + "11 (melihat)\n", "\n", - "\n", + "\n", "\n", - "0->7\n", - "\n", - "\n", - "root\n", + "0->11\n", + "\n", + "\n", + "root\n", "\n", "\n", "\n", "1\n", - "1 (Najib)\n", + "1 (KUALA)\n", "\n", - "\n", - "\n", - "7->1\n", - "\n", - "\n", - "nsubj\n", + "\n", + "\n", + "11->1\n", + "\n", + "\n", + "nsubj\n", "\n", "\n", - "\n", + "\n", "8\n", - "8 (sikap)\n", + "8 (jarang)\n", "\n", - "\n", - "\n", - "7->8\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "11->8\n", + "\n", + "\n", + "advmod\n", "\n", - "\n", + "\n", + "\n", + "9\n", + "9 (sekali)\n", + "\n", + "\n", + "\n", + "11->9\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "10\n", + "10 (untuk)\n", + "\n", + "\n", + "\n", + "11->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "29\n", + "29 (mempunyai)\n", + "\n", + "\n", + "\n", + "11->29\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "42\n", + "42 (berbeza)\n", + "\n", + "\n", + "\n", + "11->42\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", "\n", - "4\n", - "4 (Ahli)\n", + "2\n", + "2 (LUMPUR)\n", "\n", - "\n", + "\n", "\n", - "1->4\n", - "\n", - "\n", - "fixed\n", + "1->2\n", + "\n", + "\n", + "flat\n", "\n", - "\n", + "\n", "\n", - "2\n", - "2 (yang)\n", + "5\n", + "5 (hal)\n", "\n", - "\n", + "\n", "\n", - "4->2\n", - "\n", - "\n", - "nsubj\n", + "1->5\n", + "\n", + "\n", + "obl\n", "\n", - "\n", + "\n", "\n", - "3\n", - "3 (juga)\n", + "7\n", + "7 (,)\n", "\n", - "\n", + "\n", 
"\n", - "4->3\n", - "\n", - "\n", - "advmod\n", + "1->7\n", + "\n", + "\n", + "punct\n", "\n", - "\n", + "\n", "\n", - "5\n", - "5 (Parlimen)\n", + "3\n", + "3 (:)\n", "\n", - "\n", + "\n", "\n", - "4->5\n", - "\n", - "\n", - "flat\n", + "5->3\n", + "\n", + "\n", + "punct\n", "\n", - "\n", + "\n", "\n", + "4\n", + "4 (Dalam)\n", + "\n", + "\n", + "\n", + "5->4\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", "6\n", - "6 (Pekan)\n", + "6 (politik)\n", "\n", "\n", - "\n", + "\n", "5->6\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "compound\n", "\n", - "\n", - "\n", - "9\n", - "9 (Ahli)\n", + "\n", + "\n", + "13\n", + "13 (figura)\n", "\n", - "\n", - "\n", - "8->9\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "29->13\n", + "\n", + "\n", + "obj\n", "\n", - "\n", - "\n", - "14\n", - "14 (mengaku)\n", + "\n", + "\n", + "31\n", + "31 (pandangan)\n", "\n", - "\n", - "\n", - "8->14\n", - "\n", - "\n", - "acl\n", + "\n", + "\n", + "29->31\n", + "\n", + "\n", + "obj\n", "\n", - "\n", - "\n", - "10\n", - "10 (Parlimen)\n", + "\n", + "\n", + "37\n", + "37 (.)\n", "\n", - "\n", - "\n", - "9->10\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "29->37\n", + "\n", + "\n", + "punct\n", "\n", - "\n", - "\n", - "12\n", - "12 (itu)\n", + "\n", + "\n", + "38\n", + "38 (Namun)\n", "\n", - "\n", - "\n", - "9->12\n", - "\n", - "\n", - "det\n", + "\n", + "\n", + "29->38\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "39\n", + "39 (,)\n", + "\n", + "\n", + "\n", + "42->39\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "40\n", + "40 (situasi)\n", + "\n", + "\n", + "\n", + "42->40\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "54\n", + "54 (.)\n", + "\n", + "\n", + "\n", + "42->54\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "89\n", + "89 (.)\n", + "\n", + "\n", + "\n", + "42->89\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "44\n", + "44 (melibatkan)\n", + "\n", + "\n", + "\n", + "42->44\n", + "\n", + "\n", + 
"advcl\n", + "\n", + "\n", + "\n", + "55\n", + "55 (Najib)\n", + "\n", + "\n", + "\n", + "42->55\n", + "\n", + "\n", + "dep\n", "\n", - "\n", + "\n", "\n", - "13\n", - "13 (yang)\n", + "12\n", + "12 (dua)\n", "\n", - "\n", + "\n", "\n", - "14->13\n", - "\n", - "\n", - "nsubj\n", + "13->12\n", + "\n", + "\n", + "nummod\n", "\n", "\n", - "\n", + "\n", "15\n", - "15 (bersalah)\n", + "15 (-)\n", "\n", - "\n", + "\n", "\n", - "14->15\n", - "\n", - "\n", - "ccomp\n", + "13->15\n", + "\n", + "\n", + "punct\n", "\n", - "\n", - "\n", - "17\n", - "17 (melanggar)\n", + "\n", + "\n", + "16\n", + "16 (bekas)\n", "\n", - "\n", + "\n", "\n", - "14->17\n", - "\n", - "\n", - "xcomp\n", + "13->16\n", + "\n", + "\n", + "compound:plur\n", "\n", - "\n", - "\n", - "21\n", - "21 (mengambil)\n", + "\n", + "\n", + "17\n", + "17 (Perdana)\n", "\n", - "\n", + "\n", "\n", - "14->21\n", - "\n", - "\n", - "acl\n", - "\n", - "\n", - "\n", - "11\n", - "11 (Langkawi)\n", - "\n", - "\n", - "\n", - "10->11\n", - "\n", - "\n", - "flat\n", + "13->17\n", + "\n", + "\n", + "flat\n", "\n", - "\n", - "\n", - "16\n", - "16 (selepas)\n", + "\n", + "\n", + "14\n", + "14 (ini)\n", "\n", - "\n", + "\n", "\n", - "17->16\n", - "\n", - "\n", - "case\n", + "17->14\n", + "\n", + "\n", + "det\n", "\n", "\n", - "\n", + "\n", "18\n", - "18 (SOP)\n", + "18 (Menteri)\n", "\n", "\n", "\n", "17->18\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "19\n", + "19 (,)\n", + "\n", + "\n", + "\n", + "17->19\n", + "\n", + "\n", + "punct\n", "\n", "\n", "\n", "20\n", - "20 (tidak)\n", + "20 (Datuk)\n", "\n", - "\n", + "\n", "\n", - "21->20\n", - "\n", - "\n", - "advmod\n", + "17->20\n", + "\n", + "\n", + "appos\n", "\n", "\n", "\n", "25\n", - "25 (masuk)\n", + "25 (Tun)\n", "\n", - "\n", + "\n", "\n", - "21->25\n", - "\n", - "\n", - "advcl\n", - "\n", - "\n", - "\n", - "23\n", - "23 (badan)\n", - "\n", - "\n", - "\n", - "18->23\n", - "\n", - "\n", - "compound\n", + "17->25\n", + "\n", + "\n", + 
"conj\n", "\n", - "\n", - "\n", - "19\n", - "19 (kerana)\n", - "\n", - "\n", - "\n", - "23->19\n", - "\n", - "\n", - "det\n", - "\n", - "\n", + "\n", "\n", - "22\n", - "22 (suhu)\n", + "21\n", + "21 (Seri)\n", "\n", - "\n", - "\n", - "25->22\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "20->21\n", + "\n", + "\n", + "flat\n", "\n", "\n", - "\n", + "\n", "24\n", - "24 (ketika)\n", + "24 (dan)\n", "\n", "\n", - "\n", + "\n", "25->24\n", - "\n", - "\n", - "mark\n", + "\n", + "\n", + "cc\n", "\n", - "\n", - "\n", - "28\n", - "28 (surau)\n", + "\n", + "\n", + "26\n", + "26 (Dr)\n", "\n", - "\n", - "\n", - "25->28\n", - "\n", - "\n", - "obl\n", + "\n", + "\n", + "25->26\n", + "\n", + "\n", + "flat\n", "\n", - "\n", - "\n", - "32\n", - "32 (Sabtu)\n", + "\n", + "\n", + "22\n", + "22 (Najib)\n", "\n", - "\n", - "\n", - "25->32\n", - "\n", - "\n", - "obl\n", + "\n", + "\n", + "21->22\n", + "\n", + "\n", + "flat\n", "\n", - "\n", - "\n", - "26\n", - "26 (ke)\n", + "\n", + "\n", + "23\n", + "23 (Razak)\n", "\n", - "\n", - "\n", - "28->26\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "22->23\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "27\n", - "27 (sebuah)\n", + "27 (Mahathir)\n", "\n", - "\n", - "\n", - "28->27\n", - "\n", - "\n", - "det\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", "\n", - "\n", + "\n", "\n", - "30\n", - "30 (Langkawi)\n", - "\n", - "\n", - "\n", - "28->30\n", - "\n", - "\n", - "nmod\n", - "\n", - "\n", - "\n", - "31\n", - "31 (pada)\n", + "28\n", + "28 (Mohamad)\n", "\n", - "\n", - "\n", - "32->31\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", "\n", - "\n", - "\n", - "29\n", - "29 (di)\n", + "\n", + "\n", + "30\n", + "30 (')\n", "\n", - "\n", - "\n", - "30->29\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "31->30\n", + "\n", + "\n", + "punct\n", "\n", "\n", - "\n", + "\n", "33\n", - "33 (lalu)\n", + "33 (sama)\n", "\n", "\n", - "\n", + "\n", "31->33\n", - "\n", - "\n", - "amod\n", 
+ "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "36\n", + "36 (sekapal)\n", + "\n", + "\n", + "\n", + "33->36\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "32\n", + "32 (yang)\n", + "\n", + "\n", + "\n", + "36->32\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "34\n", + "34 (')\n", + "\n", + "\n", + "\n", + "36->34\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "35\n", + "35 (atau)\n", + "\n", + "\n", + "\n", + "36->35\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "41\n", + "41 (itu)\n", + "\n", + "\n", + "\n", + "40->41\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "43\n", + "43 (apabila)\n", + "\n", + "\n", + "\n", + "44->43\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "45\n", + "45 (isu)\n", + "\n", + "\n", + "\n", + "44->45\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (,)\n", + "\n", + "\n", + "\n", + "55->56\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "59\n", + "59 (Ahli)\n", + "\n", + "\n", + "\n", + "55->59\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (memuji)\n", + "\n", + "\n", + "\n", + "55->62\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "46\n", + "46 (ketidakpatuhan)\n", + "\n", + "\n", + "\n", + "45->46\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "48\n", + "48 (prosedur)\n", + "\n", + "\n", + "\n", + "45->48\n", + "\n", + "\n", + "nmod\n", + "\n", + "\n", + "\n", + "47\n", + "47 (terhadap)\n", + "\n", + "\n", + "\n", + "48->47\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "49\n", + "49 (operasi)\n", + "\n", + "\n", + "\n", + "48->49\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "50\n", + "50 (standard)\n", + "\n", + "\n", + "\n", + "48->50\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "52\n", + "52 (SOP)\n", + "\n", + "\n", + "\n", + "48->52\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "51\n", + "51 (()\n", + "\n", + "\n", + 
"\n", + "52->51\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "53\n", + "53 ())\n", + "\n", + "\n", + "\n", + "52->53\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "57\n", + "57 (yang)\n", + "\n", + "\n", + "\n", + "59->57\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "58\n", + "58 (juga)\n", + "\n", + "\n", + "\n", + "59->58\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "60\n", + "60 (Parlimen)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "63\n", + "63 (sikap)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "61\n", + "61 (Pekan)\n", + "\n", + "\n", + "\n", + "60->61\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "64\n", + "64 (Ahli)\n", + "\n", + "\n", + "\n", + "63->64\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "69\n", + "69 (mengaku)\n", + "\n", + "\n", + "\n", + "63->69\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "65\n", + "65 (Parlimen)\n", + "\n", + "\n", + "\n", + "64->65\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "68\n", + "68 (yang)\n", + "\n", + "\n", + "\n", + "69->68\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "70\n", + "70 (bersalah)\n", + "\n", + "\n", + "\n", + "69->70\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "66\n", + "66 (Langkawi)\n", + "\n", + "\n", + "\n", + "65->66\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "67\n", + "67 (itu)\n", + "\n", + "\n", + "\n", + "66->67\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "72\n", + "72 (melanggar)\n", + "\n", + "\n", + "\n", + "70->72\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "71\n", + "71 (selepas)\n", + "\n", + "\n", + "\n", + "72->71\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "73\n", + "73 (SOP)\n", + "\n", + "\n", + "\n", + "72->73\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "76\n", + "76 
(mengambil)\n", + "\n", + "\n", + "\n", + "72->76\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "74\n", + "74 (kerana)\n", + "\n", + "\n", + "\n", + "76->74\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tidak)\n", + "\n", + "\n", + "\n", + "76->75\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "77\n", + "77 (suhu)\n", + "\n", + "\n", + "\n", + "76->77\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "80\n", + "80 (masuk)\n", + "\n", + "\n", + "\n", + "76->80\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "78\n", + "78 (badan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "79\n", + "79 (ketika)\n", + "\n", + "\n", + "\n", + "80->79\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "83\n", + "83 (surau)\n", + "\n", + "\n", + "\n", + "80->83\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "85\n", + "85 (Langkawi)\n", + "\n", + "\n", + "\n", + "80->85\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "87\n", + "87 (Sabtu)\n", + "\n", + "\n", + "\n", + "80->87\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "81\n", + "81 (ke)\n", + "\n", + "\n", + "\n", + "83->81\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "82\n", + "82 (sebuah)\n", + "\n", + "\n", + "\n", + "83->82\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "84\n", + "84 (di)\n", + "\n", + "\n", + "\n", + "85->84\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "86\n", + "86 (pada)\n", + "\n", + "\n", + "\n", + "87->86\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "88\n", + "88 (lalu)\n", + "\n", + "\n", + "\n", + "87->88\n", + "\n", + "\n", + "amod\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 17, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tagging, indexing = malaya.stack.voting_stack([model, alxlnet, model], s)\n", - 
"malaya.dependency.dependency_graph(tagging, indexing).to_graphvis()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dependency graph object\n", - "\n", - "To initiate a dependency graph from dependency models, you need to call `malaya.dependency.dependency_graph`." + "d_object, tagging, indexing = model.predict(s)\n", + "d_object.to_graphvis()" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "graph = malaya.dependency.dependency_graph(tagging, indexing)\n", - "graph" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### generate graphvis" - ] - }, - { - "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1657,760 +2345,3110 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "G\n", - "\n", + "\n", "\n", "\n", "0\n", - "0 (None)\n", + "0 (None)\n", "\n", - "\n", + "\n", "\n", - "7\n", - "7 (memuji)\n", + "11\n", + "11 (melihat)\n", "\n", - "\n", + "\n", "\n", - "0->7\n", - "\n", - "\n", - "root\n", + "0->11\n", + "\n", + "\n", + "root\n", "\n", "\n", "\n", "1\n", - "1 (Najib)\n", + "1 (KUALA)\n", "\n", - "\n", - "\n", - "7->1\n", - "\n", - "\n", - "nsubj\n", + "\n", + "\n", + "11->1\n", + "\n", + "\n", + "nsubj\n", "\n", "\n", - "\n", + "\n", "8\n", - "8 (sikap)\n", + "8 (jarang)\n", "\n", - "\n", - "\n", - "7->8\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "11->8\n", + "\n", + "\n", + "advmod\n", "\n", - "\n", + "\n", + "\n", + "9\n", + "9 (sekali)\n", + "\n", + "\n", + "\n", + "11->9\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "10\n", + "10 (untuk)\n", + "\n", + "\n", + "\n", + "11->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "29\n", + "29 (mempunyai)\n", + "\n", + "\n", + "\n", + "11->29\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", 
+ "\n", + "42\n", + "42 (berbeza)\n", + "\n", + "\n", + "\n", + "11->42\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", "\n", - "4\n", - "4 (Ahli)\n", + "2\n", + "2 (LUMPUR)\n", "\n", - "\n", + "\n", "\n", - "1->4\n", - "\n", - "\n", - "fixed\n", + "1->2\n", + "\n", + "\n", + "flat\n", "\n", - "\n", + "\n", "\n", - "2\n", - "2 (yang)\n", + "5\n", + "5 (hal)\n", "\n", - "\n", + "\n", "\n", - "4->2\n", - "\n", - "\n", - "nsubj\n", + "1->5\n", + "\n", + "\n", + "obl\n", "\n", - "\n", + "\n", "\n", - "3\n", - "3 (juga)\n", + "7\n", + "7 (,)\n", "\n", - "\n", + "\n", "\n", - "4->3\n", - "\n", - "\n", - "advmod\n", + "1->7\n", + "\n", + "\n", + "punct\n", "\n", - "\n", + "\n", "\n", - "5\n", - "5 (Parlimen)\n", + "3\n", + "3 (:)\n", "\n", - "\n", + "\n", "\n", - "4->5\n", - "\n", - "\n", - "flat\n", + "5->3\n", + "\n", + "\n", + "punct\n", "\n", - "\n", + "\n", "\n", + "4\n", + "4 (Dalam)\n", + "\n", + "\n", + "\n", + "5->4\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", "6\n", - "6 (Pekan)\n", + "6 (politik)\n", "\n", "\n", - "\n", + "\n", "5->6\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "compound\n", "\n", - "\n", - "\n", - "9\n", - "9 (Ahli)\n", + "\n", + "\n", + "13\n", + "13 (figura)\n", "\n", - "\n", - "\n", - "8->9\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "29->13\n", + "\n", + "\n", + "obj\n", "\n", - "\n", - "\n", - "14\n", - "14 (mengaku)\n", + "\n", + "\n", + "31\n", + "31 (pandangan)\n", "\n", - "\n", - "\n", - "8->14\n", - "\n", - "\n", - "acl\n", + "\n", + "\n", + "29->31\n", + "\n", + "\n", + "obj\n", "\n", - "\n", - "\n", - "10\n", - "10 (Parlimen)\n", + "\n", + "\n", + "37\n", + "37 (.)\n", "\n", - "\n", - "\n", - "9->10\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "29->37\n", + "\n", + "\n", + "punct\n", "\n", - "\n", - "\n", - "12\n", - "12 (itu)\n", + "\n", + "\n", + "38\n", + "38 (Namun)\n", "\n", - "\n", - "\n", - "9->12\n", - "\n", - "\n", - "det\n", + "\n", + "\n", + "29->38\n", + "\n", + "\n", + "mark\n", + "\n", + 
"\n", + "\n", + "39\n", + "39 (,)\n", + "\n", + "\n", + "\n", + "42->39\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "40\n", + "40 (situasi)\n", + "\n", + "\n", + "\n", + "42->40\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "54\n", + "54 (.)\n", + "\n", + "\n", + "\n", + "42->54\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "89\n", + "89 (.)\n", + "\n", + "\n", + "\n", + "42->89\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "44\n", + "44 (melibatkan)\n", + "\n", + "\n", + "\n", + "42->44\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "55\n", + "55 (Najib)\n", + "\n", + "\n", + "\n", + "42->55\n", + "\n", + "\n", + "dep\n", "\n", - "\n", + "\n", "\n", - "13\n", - "13 (yang)\n", + "12\n", + "12 (dua)\n", "\n", - "\n", + "\n", "\n", - "14->13\n", - "\n", - "\n", - "nsubj\n", + "13->12\n", + "\n", + "\n", + "nummod\n", "\n", "\n", - "\n", + "\n", "15\n", - "15 (bersalah)\n", + "15 (-)\n", "\n", - "\n", + "\n", "\n", - "14->15\n", - "\n", - "\n", - "ccomp\n", + "13->15\n", + "\n", + "\n", + "punct\n", "\n", - "\n", - "\n", - "17\n", - "17 (melanggar)\n", + "\n", + "\n", + "16\n", + "16 (bekas)\n", "\n", - "\n", + "\n", "\n", - "14->17\n", - "\n", - "\n", - "xcomp\n", + "13->16\n", + "\n", + "\n", + "compound:plur\n", "\n", - "\n", - "\n", - "21\n", - "21 (mengambil)\n", + "\n", + "\n", + "17\n", + "17 (Perdana)\n", "\n", - "\n", + "\n", "\n", - "14->21\n", - "\n", - "\n", - "acl\n", - "\n", - "\n", - "\n", - "11\n", - "11 (Langkawi)\n", - "\n", - "\n", - "\n", - "10->11\n", - "\n", - "\n", - "flat\n", + "13->17\n", + "\n", + "\n", + "flat\n", "\n", - "\n", - "\n", - "16\n", - "16 (selepas)\n", + "\n", + "\n", + "14\n", + "14 (ini)\n", "\n", - "\n", + "\n", "\n", - "17->16\n", - "\n", - "\n", - "case\n", + "17->14\n", + "\n", + "\n", + "det\n", "\n", "\n", - "\n", + "\n", "18\n", - "18 (SOP)\n", + "18 (Menteri)\n", "\n", "\n", "\n", "17->18\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "flat\n", + "\n", 
+ "\n", + "\n", + "19\n", + "19 (,)\n", + "\n", + "\n", + "\n", + "17->19\n", + "\n", + "\n", + "punct\n", "\n", "\n", "\n", "20\n", - "20 (tidak)\n", + "20 (Datuk)\n", "\n", - "\n", + "\n", "\n", - "21->20\n", - "\n", - "\n", - "advmod\n", + "17->20\n", + "\n", + "\n", + "appos\n", "\n", "\n", "\n", "25\n", - "25 (masuk)\n", + "25 (Tun)\n", "\n", - "\n", + "\n", "\n", - "21->25\n", - "\n", - "\n", - "advcl\n", - "\n", - "\n", - "\n", - "23\n", - "23 (badan)\n", - "\n", - "\n", - "\n", - "18->23\n", - "\n", - "\n", - "compound\n", - "\n", - "\n", - "\n", - "19\n", - "19 (kerana)\n", - "\n", - "\n", - "\n", - "23->19\n", - "\n", - "\n", - "det\n", + "17->25\n", + "\n", + "\n", + "conj\n", "\n", - "\n", + "\n", "\n", - "22\n", - "22 (suhu)\n", + "21\n", + "21 (Seri)\n", "\n", - "\n", - "\n", - "25->22\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "20->21\n", + "\n", + "\n", + "flat\n", "\n", "\n", - "\n", + "\n", "24\n", - "24 (ketika)\n", + "24 (dan)\n", "\n", "\n", - "\n", + "\n", "25->24\n", - "\n", - "\n", - "mark\n", + "\n", + "\n", + "cc\n", "\n", - "\n", - "\n", - "28\n", - "28 (surau)\n", + "\n", + "\n", + "26\n", + "26 (Dr)\n", "\n", - "\n", - "\n", - "25->28\n", - "\n", - "\n", - "obl\n", + "\n", + "\n", + "25->26\n", + "\n", + "\n", + "flat\n", "\n", - "\n", - "\n", - "32\n", - "32 (Sabtu)\n", + "\n", + "\n", + "22\n", + "22 (Najib)\n", "\n", - "\n", - "\n", - "25->32\n", - "\n", - "\n", - "obl\n", + "\n", + "\n", + "21->22\n", + "\n", + "\n", + "flat\n", "\n", - "\n", - "\n", - "26\n", - "26 (ke)\n", + "\n", + "\n", + "23\n", + "23 (Razak)\n", "\n", - "\n", - "\n", - "28->26\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "22->23\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "27\n", - "27 (sebuah)\n", + "27 (Mahathir)\n", "\n", - "\n", - "\n", - "28->27\n", - "\n", - "\n", - "det\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", "\n", - "\n", + "\n", "\n", - "30\n", - "30 (Langkawi)\n", - "\n", - "\n", - "\n", - "28->30\n", - "\n", - 
"\n", - "nmod\n", - "\n", - "\n", - "\n", - "31\n", - "31 (pada)\n", + "28\n", + "28 (Mohamad)\n", "\n", - "\n", - "\n", - "32->31\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", "\n", - "\n", - "\n", - "29\n", - "29 (di)\n", + "\n", + "\n", + "30\n", + "30 (')\n", "\n", - "\n", - "\n", - "30->29\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "31->30\n", + "\n", + "\n", + "punct\n", "\n", "\n", - "\n", + "\n", "33\n", - "33 (lalu)\n", + "33 (sama)\n", "\n", "\n", - "\n", + "\n", "31->33\n", - "\n", - "\n", - "amod\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "36\n", + "36 (sekapal)\n", + "\n", + "\n", + "\n", + "33->36\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "32\n", + "32 (yang)\n", + "\n", + "\n", + "\n", + "36->32\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "34\n", + "34 (')\n", + "\n", + "\n", + "\n", + "36->34\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "35\n", + "35 (atau)\n", + "\n", + "\n", + "\n", + "36->35\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "41\n", + "41 (itu)\n", + "\n", + "\n", + "\n", + "40->41\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "43\n", + "43 (apabila)\n", + "\n", + "\n", + "\n", + "44->43\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "45\n", + "45 (isu)\n", + "\n", + "\n", + "\n", + "44->45\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (,)\n", + "\n", + "\n", + "\n", + "55->56\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "59\n", + "59 (Ahli)\n", + "\n", + "\n", + "\n", + "55->59\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (memuji)\n", + "\n", + "\n", + "\n", + "55->62\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "46\n", + "46 (ketidakpatuhan)\n", + "\n", + "\n", + "\n", + "45->46\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "48\n", + "48 (prosedur)\n", + "\n", + "\n", + "\n", + "45->48\n", + "\n", + 
"\n", + "nmod\n", + "\n", + "\n", + "\n", + "47\n", + "47 (terhadap)\n", + "\n", + "\n", + "\n", + "48->47\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "49\n", + "49 (operasi)\n", + "\n", + "\n", + "\n", + "48->49\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "50\n", + "50 (standard)\n", + "\n", + "\n", + "\n", + "48->50\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "52\n", + "52 (SOP)\n", + "\n", + "\n", + "\n", + "48->52\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "51\n", + "51 (()\n", + "\n", + "\n", + "\n", + "52->51\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "53\n", + "53 ())\n", + "\n", + "\n", + "\n", + "52->53\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "57\n", + "57 (yang)\n", + "\n", + "\n", + "\n", + "59->57\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "58\n", + "58 (juga)\n", + "\n", + "\n", + "\n", + "59->58\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "60\n", + "60 (Parlimen)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "63\n", + "63 (sikap)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "61\n", + "61 (Pekan)\n", + "\n", + "\n", + "\n", + "60->61\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "64\n", + "64 (Ahli)\n", + "\n", + "\n", + "\n", + "63->64\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "69\n", + "69 (mengaku)\n", + "\n", + "\n", + "\n", + "63->69\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "65\n", + "65 (Parlimen)\n", + "\n", + "\n", + "\n", + "64->65\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "68\n", + "68 (yang)\n", + "\n", + "\n", + "\n", + "69->68\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "70\n", + "70 (bersalah)\n", + "\n", + "\n", + "\n", + "69->70\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "66\n", + "66 (Langkawi)\n", + "\n", + "\n", + "\n", 
+ "65->66\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "67\n", + "67 (itu)\n", + "\n", + "\n", + "\n", + "66->67\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "72\n", + "72 (melanggar)\n", + "\n", + "\n", + "\n", + "70->72\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "71\n", + "71 (selepas)\n", + "\n", + "\n", + "\n", + "72->71\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "73\n", + "73 (SOP)\n", + "\n", + "\n", + "\n", + "72->73\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "76\n", + "76 (mengambil)\n", + "\n", + "\n", + "\n", + "72->76\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "74\n", + "74 (kerana)\n", + "\n", + "\n", + "\n", + "76->74\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tidak)\n", + "\n", + "\n", + "\n", + "76->75\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "77\n", + "77 (suhu)\n", + "\n", + "\n", + "\n", + "76->77\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "80\n", + "80 (masuk)\n", + "\n", + "\n", + "\n", + "76->80\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "78\n", + "78 (badan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "79\n", + "79 (ketika)\n", + "\n", + "\n", + "\n", + "80->79\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "83\n", + "83 (surau)\n", + "\n", + "\n", + "\n", + "80->83\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "85\n", + "85 (Langkawi)\n", + "\n", + "\n", + "\n", + "80->85\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "87\n", + "87 (Sabtu)\n", + "\n", + "\n", + "\n", + "80->87\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "81\n", + "81 (ke)\n", + "\n", + "\n", + "\n", + "83->81\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "82\n", + "82 (sebuah)\n", + "\n", + "\n", + "\n", + "83->82\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "84\n", + "84 (di)\n", + "\n", + 
"\n", + "\n", + "85->84\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "86\n", + "86 (pada)\n", + "\n", + "\n", + "\n", + "87->86\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "88\n", + "88 (lalu)\n", + "\n", + "\n", + "\n", + "87->88\n", + "\n", + "\n", + "amod\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 20, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "graph.to_graphvis()" + "tagging, indexing = malaya.stack.voting_stack([model, model, alxlnet], s)\n", + "malaya.dependency.dependency_graph(tagging, indexing).to_graphvis()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Get nodes" + "### Dependency graph object\n", + "\n", + "To initiate a dependency graph from dependency models, you need to call `malaya.dependency.dependency_graph`." ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "defaultdict(.()>,\n", - " {0: {'address': 0,\n", - " 'word': None,\n", - " 'lemma': None,\n", - " 'ctag': 'TOP',\n", - " 'tag': 'TOP',\n", - " 'feats': None,\n", - " 'head': None,\n", - " 'deps': defaultdict(list, {'root': [7]}),\n", - " 'rel': None},\n", - " 1: {'address': 1,\n", - " 'word': 'Najib',\n", + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph = malaya.dependency.dependency_graph(tagging, indexing)\n", + "graph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### generate graphvis" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "11\n", + "11 (melihat)\n", + "\n", + "\n", + "\n", + "0->11\n", + "\n", + "\n", + "root\n", + "\n", + "\n", 
+ "\n", + "1\n", + "1 (KUALA)\n", + "\n", + "\n", + "\n", + "11->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "8\n", + "8 (jarang)\n", + "\n", + "\n", + "\n", + "11->8\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "9\n", + "9 (sekali)\n", + "\n", + "\n", + "\n", + "11->9\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "10\n", + "10 (untuk)\n", + "\n", + "\n", + "\n", + "11->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "29\n", + "29 (mempunyai)\n", + "\n", + "\n", + "\n", + "11->29\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "42\n", + "42 (berbeza)\n", + "\n", + "\n", + "\n", + "11->42\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "2\n", + "2 (LUMPUR)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "5\n", + "5 (hal)\n", + "\n", + "\n", + "\n", + "1->5\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "7\n", + "7 (,)\n", + "\n", + "\n", + "\n", + "1->7\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "3\n", + "3 (:)\n", + "\n", + "\n", + "\n", + "5->3\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "4\n", + "4 (Dalam)\n", + "\n", + "\n", + "\n", + "5->4\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "6\n", + "6 (politik)\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "13\n", + "13 (figura)\n", + "\n", + "\n", + "\n", + "29->13\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "31\n", + "31 (pandangan)\n", + "\n", + "\n", + "\n", + "29->31\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "37\n", + "37 (.)\n", + "\n", + "\n", + "\n", + "29->37\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "38\n", + "38 (Namun)\n", + "\n", + "\n", + "\n", + "29->38\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "39\n", + "39 (,)\n", + "\n", + "\n", + "\n", + "42->39\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "40\n", 
+ "40 (situasi)\n", + "\n", + "\n", + "\n", + "42->40\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "54\n", + "54 (.)\n", + "\n", + "\n", + "\n", + "42->54\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "89\n", + "89 (.)\n", + "\n", + "\n", + "\n", + "42->89\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "44\n", + "44 (melibatkan)\n", + "\n", + "\n", + "\n", + "42->44\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "55\n", + "55 (Najib)\n", + "\n", + "\n", + "\n", + "42->55\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "12\n", + "12 (dua)\n", + "\n", + "\n", + "\n", + "13->12\n", + "\n", + "\n", + "nummod\n", + "\n", + "\n", + "\n", + "15\n", + "15 (-)\n", + "\n", + "\n", + "\n", + "13->15\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "16\n", + "16 (bekas)\n", + "\n", + "\n", + "\n", + "13->16\n", + "\n", + "\n", + "compound:plur\n", + "\n", + "\n", + "\n", + "17\n", + "17 (Perdana)\n", + "\n", + "\n", + "\n", + "13->17\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "14\n", + "14 (ini)\n", + "\n", + "\n", + "\n", + "17->14\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "18\n", + "18 (Menteri)\n", + "\n", + "\n", + "\n", + "17->18\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "19\n", + "19 (,)\n", + "\n", + "\n", + "\n", + "17->19\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "20\n", + "20 (Datuk)\n", + "\n", + "\n", + "\n", + "17->20\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "25\n", + "25 (Tun)\n", + "\n", + "\n", + "\n", + "17->25\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "21\n", + "21 (Seri)\n", + "\n", + "\n", + "\n", + "20->21\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "24\n", + "24 (dan)\n", + "\n", + "\n", + "\n", + "25->24\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "26\n", + "26 (Dr)\n", + "\n", + "\n", + "\n", + "25->26\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + 
"22\n", + "22 (Najib)\n", + "\n", + "\n", + "\n", + "21->22\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "23\n", + "23 (Razak)\n", + "\n", + "\n", + "\n", + "22->23\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "27\n", + "27 (Mahathir)\n", + "\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "28\n", + "28 (Mohamad)\n", + "\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "30\n", + "30 (')\n", + "\n", + "\n", + "\n", + "31->30\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "33\n", + "33 (sama)\n", + "\n", + "\n", + "\n", + "31->33\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "36\n", + "36 (sekapal)\n", + "\n", + "\n", + "\n", + "33->36\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "32\n", + "32 (yang)\n", + "\n", + "\n", + "\n", + "36->32\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "34\n", + "34 (')\n", + "\n", + "\n", + "\n", + "36->34\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "35\n", + "35 (atau)\n", + "\n", + "\n", + "\n", + "36->35\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "41\n", + "41 (itu)\n", + "\n", + "\n", + "\n", + "40->41\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "43\n", + "43 (apabila)\n", + "\n", + "\n", + "\n", + "44->43\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "45\n", + "45 (isu)\n", + "\n", + "\n", + "\n", + "44->45\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (,)\n", + "\n", + "\n", + "\n", + "55->56\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "59\n", + "59 (Ahli)\n", + "\n", + "\n", + "\n", + "55->59\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (memuji)\n", + "\n", + "\n", + "\n", + "55->62\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "46\n", + "46 (ketidakpatuhan)\n", + "\n", + "\n", + "\n", + "45->46\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", 
+ "\n", + "48\n", + "48 (prosedur)\n", + "\n", + "\n", + "\n", + "45->48\n", + "\n", + "\n", + "nmod\n", + "\n", + "\n", + "\n", + "47\n", + "47 (terhadap)\n", + "\n", + "\n", + "\n", + "48->47\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "49\n", + "49 (operasi)\n", + "\n", + "\n", + "\n", + "48->49\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "50\n", + "50 (standard)\n", + "\n", + "\n", + "\n", + "48->50\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "52\n", + "52 (SOP)\n", + "\n", + "\n", + "\n", + "48->52\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "51\n", + "51 (()\n", + "\n", + "\n", + "\n", + "52->51\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "53\n", + "53 ())\n", + "\n", + "\n", + "\n", + "52->53\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "57\n", + "57 (yang)\n", + "\n", + "\n", + "\n", + "59->57\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "58\n", + "58 (juga)\n", + "\n", + "\n", + "\n", + "59->58\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "60\n", + "60 (Parlimen)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "63\n", + "63 (sikap)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "61\n", + "61 (Pekan)\n", + "\n", + "\n", + "\n", + "60->61\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "64\n", + "64 (Ahli)\n", + "\n", + "\n", + "\n", + "63->64\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "69\n", + "69 (mengaku)\n", + "\n", + "\n", + "\n", + "63->69\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "65\n", + "65 (Parlimen)\n", + "\n", + "\n", + "\n", + "64->65\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "68\n", + "68 (yang)\n", + "\n", + "\n", + "\n", + "69->68\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "70\n", + "70 (bersalah)\n", + "\n", + "\n", + "\n", + "69->70\n", + "\n", + "\n", + 
"xcomp\n", + "\n", + "\n", + "\n", + "66\n", + "66 (Langkawi)\n", + "\n", + "\n", + "\n", + "65->66\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "67\n", + "67 (itu)\n", + "\n", + "\n", + "\n", + "66->67\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "72\n", + "72 (melanggar)\n", + "\n", + "\n", + "\n", + "70->72\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "71\n", + "71 (selepas)\n", + "\n", + "\n", + "\n", + "72->71\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "73\n", + "73 (SOP)\n", + "\n", + "\n", + "\n", + "72->73\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "76\n", + "76 (mengambil)\n", + "\n", + "\n", + "\n", + "72->76\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "74\n", + "74 (kerana)\n", + "\n", + "\n", + "\n", + "76->74\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tidak)\n", + "\n", + "\n", + "\n", + "76->75\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "77\n", + "77 (suhu)\n", + "\n", + "\n", + "\n", + "76->77\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "80\n", + "80 (masuk)\n", + "\n", + "\n", + "\n", + "76->80\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "78\n", + "78 (badan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "79\n", + "79 (ketika)\n", + "\n", + "\n", + "\n", + "80->79\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "83\n", + "83 (surau)\n", + "\n", + "\n", + "\n", + "80->83\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "85\n", + "85 (Langkawi)\n", + "\n", + "\n", + "\n", + "80->85\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "87\n", + "87 (Sabtu)\n", + "\n", + "\n", + "\n", + "80->87\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "81\n", + "81 (ke)\n", + "\n", + "\n", + "\n", + "83->81\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "82\n", + "82 (sebuah)\n", + "\n", + "\n", + "\n", + 
"83->82\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "84\n", + "84 (di)\n", + "\n", + "\n", + "\n", + "85->84\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "86\n", + "86 (pada)\n", + "\n", + "\n", + "\n", + "87->86\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "88\n", + "88 (lalu)\n", + "\n", + "\n", + "\n", + "87->88\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph.to_graphvis()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "defaultdict(.()>,\n", + " {0: {'address': 0,\n", + " 'word': None,\n", + " 'lemma': None,\n", + " 'ctag': 'TOP',\n", + " 'tag': 'TOP',\n", + " 'feats': None,\n", + " 'head': None,\n", + " 'deps': defaultdict(list, {'root': [11]}),\n", + " 'rel': None},\n", + " 1: {'address': 1,\n", + " 'word': 'KUALA',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': defaultdict(list,\n", + " {'flat': [2], 'obl': [5], 'punct': [7]}),\n", + " 'rel': 'nsubj'},\n", + " 11: {'address': 11,\n", + " 'word': 'melihat',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 0,\n", + " 'deps': defaultdict(list,\n", + " {'nsubj': [1],\n", + " 'advmod': [8, 9],\n", + " 'case': [10],\n", + " 'advcl': [29],\n", + " 'dep': [42]}),\n", + " 'rel': 'root'},\n", + " 2: {'address': 2,\n", + " 'word': 'LUMPUR',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 1,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'flat'},\n", + " 3: {'address': 3,\n", + " 'word': ':',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': 
'_',\n", + " 'head': 5,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 5: {'address': 5,\n", + " 'word': 'hal',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 1,\n", + " 'deps': defaultdict(list,\n", + " {'punct': [3], 'case': [4], 'compound': [6]}),\n", + " 'rel': 'obl'},\n", + " 4: {'address': 4,\n", + " 'word': 'Dalam',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 5,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'case'},\n", + " 6: {'address': 6,\n", + " 'word': 'politik',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 5,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'compound'},\n", + " 7: {'address': 7,\n", + " 'word': ',',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 1,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 8: {'address': 8,\n", + " 'word': 'jarang',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'advmod'},\n", + " 9: {'address': 9,\n", + " 'word': 'sekali',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'advmod'},\n", + " 10: {'address': 10,\n", + " 'word': 'untuk',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'case'},\n", + " 12: {'address': 12,\n", + " 'word': 'dua',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 13,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'nummod'},\n", + " 13: {'address': 13,\n", + " 'word': 'figura',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + 
" 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 29,\n", + " 'deps': defaultdict(list,\n", + " {'nummod': [12],\n", + " 'punct': [15],\n", + " 'compound:plur': [16],\n", + " 'flat': [17]}),\n", + " 'rel': 'obj'},\n", + " 29: {'address': 29,\n", + " 'word': 'mempunyai',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': defaultdict(list,\n", + " {'obj': [13, 31], 'punct': [37], 'mark': [38]}),\n", + " 'rel': 'advcl'},\n", + " 14: {'address': 14,\n", + " 'word': 'ini',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 17,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'det'},\n", + " 17: {'address': 17,\n", + " 'word': 'Perdana',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 13,\n", + " 'deps': defaultdict(list,\n", + " {'det': [14],\n", + " 'flat': [18],\n", + " 'punct': [19],\n", + " 'appos': [20],\n", + " 'conj': [25]}),\n", + " 'rel': 'flat'},\n", + " 15: {'address': 15,\n", + " 'word': '-',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 13,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 16: {'address': 16,\n", + " 'word': 'bekas',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 13,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'compound:plur'},\n", + " 18: {'address': 18,\n", + " 'word': 'Menteri',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 17,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'flat'},\n", + " 19: {'address': 19,\n", + " 'word': ',',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 17,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 20: {'address': 20,\n", + " 'word': 'Datuk',\n", + " 
'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 17,\n", + " 'deps': defaultdict(list, {'flat': [21]}),\n", + " 'rel': 'appos'},\n", + " 21: {'address': 21,\n", + " 'word': 'Seri',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 20,\n", + " 'deps': defaultdict(list, {'flat': [22]}),\n", + " 'rel': 'flat'},\n", + " 22: {'address': 22,\n", + " 'word': 'Najib',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 21,\n", + " 'deps': defaultdict(list, {'flat': [23]}),\n", + " 'rel': 'flat'},\n", + " 23: {'address': 23,\n", + " 'word': 'Razak',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 22,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'flat'},\n", + " 24: {'address': 24,\n", + " 'word': 'dan',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 25,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'cc'},\n", + " 25: {'address': 25,\n", + " 'word': 'Tun',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 17,\n", + " 'deps': defaultdict(list, {'cc': [24], 'flat': [26]}),\n", + " 'rel': 'conj'},\n", + " 26: {'address': 26,\n", + " 'word': 'Dr',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 25,\n", + " 'deps': defaultdict(list, {'flat': [27]}),\n", + " 'rel': 'flat'},\n", + " 27: {'address': 27,\n", + " 'word': 'Mahathir',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 26,\n", + " 'deps': defaultdict(list, {'flat': [28]}),\n", + " 'rel': 'flat'},\n", + " 28: {'address': 28,\n", + " 'word': 'Mohamad',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 27,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 
'flat'},\n", + " 30: {'address': 30,\n", + " 'word': \"'\",\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 31,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 31: {'address': 31,\n", + " 'word': 'pandangan',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 29,\n", + " 'deps': defaultdict(list, {'punct': [30], 'amod': [33]}),\n", + " 'rel': 'obj'},\n", + " 32: {'address': 32,\n", + " 'word': 'yang',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 7,\n", - " 'deps': defaultdict(list, {'fixed': [4]}),\n", + " 'head': 36,\n", + " 'deps': defaultdict(list, {}),\n", " 'rel': 'nsubj'},\n", - " 7: {'address': 7,\n", - " 'word': 'memuji',\n", + " 36: {'address': 36,\n", + " 'word': 'sekapal',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 0,\n", - " 'deps': defaultdict(list, {'nsubj': [1], 'obj': [8]}),\n", - " 'rel': 'root'},\n", - " 2: {'address': 2,\n", + " 'head': 33,\n", + " 'deps': defaultdict(list,\n", + " {'nsubj': [32], 'punct': [34], 'cc': [35]}),\n", + " 'rel': 'conj'},\n", + " 33: {'address': 33,\n", + " 'word': 'sama',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 31,\n", + " 'deps': defaultdict(list, {'conj': [36]}),\n", + " 'rel': 'amod'},\n", + " 34: {'address': 34,\n", + " 'word': \"'\",\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 36,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 35: {'address': 35,\n", + " 'word': 'atau',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 36,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'cc'},\n", + " 37: {'address': 37,\n", + " 'word': '.',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 
'feats': '_',\n", + " 'head': 29,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 38: {'address': 38,\n", + " 'word': 'Namun',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 29,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'mark'},\n", + " 39: {'address': 39,\n", + " 'word': ',',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 42: {'address': 42,\n", + " 'word': 'berbeza',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': defaultdict(list,\n", + " {'punct': [39, 54, 89],\n", + " 'nsubj': [40],\n", + " 'advcl': [44],\n", + " 'dep': [55]}),\n", + " 'rel': 'dep'},\n", + " 40: {'address': 40,\n", + " 'word': 'situasi',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list, {'det': [41]}),\n", + " 'rel': 'nsubj'},\n", + " 41: {'address': 41,\n", + " 'word': 'itu',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 40,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'det'},\n", + " 43: {'address': 43,\n", + " 'word': 'apabila',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 44,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'mark'},\n", + " 44: {'address': 44,\n", + " 'word': 'melibatkan',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list, {'mark': [43], 'obj': [45]}),\n", + " 'rel': 'advcl'},\n", + " 45: {'address': 45,\n", + " 'word': 'isu',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 44,\n", + " 'deps': defaultdict(list, {'compound': [46], 'nmod': 
[48]}),\n", + " 'rel': 'obj'},\n", + " 46: {'address': 46,\n", + " 'word': 'ketidakpatuhan',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 45,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'compound'},\n", + " 47: {'address': 47,\n", + " 'word': 'terhadap',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 48,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'case'},\n", + " 48: {'address': 48,\n", + " 'word': 'prosedur',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 45,\n", + " 'deps': defaultdict(list,\n", + " {'case': [47],\n", + " 'compound': [49],\n", + " 'amod': [50],\n", + " 'appos': [52]}),\n", + " 'rel': 'nmod'},\n", + " 49: {'address': 49,\n", + " 'word': 'operasi',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 48,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'compound'},\n", + " 50: {'address': 50,\n", + " 'word': 'standard',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 48,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'amod'},\n", + " 51: {'address': 51,\n", + " 'word': '(',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 52,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 52: {'address': 52,\n", + " 'word': 'SOP',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 48,\n", + " 'deps': defaultdict(list, {'punct': [51, 53]}),\n", + " 'rel': 'appos'},\n", + " 53: {'address': 53,\n", + " 'word': ')',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 52,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 54: {'address': 54,\n", + " 'word': '.',\n", + " 'lemma': 
'_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 55: {'address': 55,\n", + " 'word': 'Najib',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list,\n", + " {'punct': [56], 'nsubj': [59], 'acl': [62]}),\n", + " 'rel': 'dep'},\n", + " 56: {'address': 56,\n", + " 'word': ',',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 55,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 57: {'address': 57,\n", " 'word': 'yang',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 4,\n", + " 'head': 59,\n", " 'deps': defaultdict(list, {}),\n", " 'rel': 'nsubj'},\n", - " 4: {'address': 4,\n", + " 59: {'address': 59,\n", " 'word': 'Ahli',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 1,\n", + " 'head': 55,\n", " 'deps': defaultdict(list,\n", - " {'nsubj': [2], 'advmod': [3], 'flat': [5]}),\n", - " 'rel': 'fixed'},\n", - " 3: {'address': 3,\n", + " {'nsubj': [57], 'advmod': [58], 'flat': [60]}),\n", + " 'rel': 'nsubj'},\n", + " 58: {'address': 58,\n", " 'word': 'juga',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 4,\n", + " 'head': 59,\n", " 'deps': defaultdict(list, {}),\n", " 'rel': 'advmod'},\n", - " 5: {'address': 5,\n", + " 60: {'address': 60,\n", " 'word': 'Parlimen',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 4,\n", - " 'deps': defaultdict(list, {'flat': [6]}),\n", + " 'head': 59,\n", + " 'deps': defaultdict(list, {'flat': [61]}),\n", " 'rel': 'flat'},\n", - " 6: {'address': 6,\n", + " 61: {'address': 61,\n", " 'word': 'Pekan',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 5,\n", + 
" 'head': 60,\n", " 'deps': defaultdict(list, {}),\n", " 'rel': 'flat'},\n", - " 8: {'address': 8,\n", + " 62: {'address': 62,\n", + " 'word': 'memuji',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 55,\n", + " 'deps': defaultdict(list, {'obj': [63]}),\n", + " 'rel': 'acl'},\n", + " 63: {'address': 63,\n", " 'word': 'sikap',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 7,\n", - " 'deps': defaultdict(list, {'flat': [9], 'acl': [14]}),\n", + " 'head': 62,\n", + " 'deps': defaultdict(list, {'flat': [64], 'acl': [69]}),\n", " 'rel': 'obj'},\n", - " 9: {'address': 9,\n", + " 64: {'address': 64,\n", " 'word': 'Ahli',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 8,\n", - " 'deps': defaultdict(list, {'flat': [10], 'det': [12]}),\n", + " 'head': 63,\n", + " 'deps': defaultdict(list, {'flat': [65]}),\n", " 'rel': 'flat'},\n", - " 10: {'address': 10,\n", + " 65: {'address': 65,\n", " 'word': 'Parlimen',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 9,\n", - " 'deps': defaultdict(list, {'flat': [11]}),\n", + " 'head': 64,\n", + " 'deps': defaultdict(list, {'flat': [66]}),\n", " 'rel': 'flat'},\n", - " 11: {'address': 11,\n", + " 66: {'address': 66,\n", " 'word': 'Langkawi',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 10,\n", - " 'deps': defaultdict(list, {}),\n", + " 'head': 65,\n", + " 'deps': defaultdict(list, {'det': [67]}),\n", " 'rel': 'flat'},\n", - " 12: {'address': 12,\n", + " 67: {'address': 67,\n", " 'word': 'itu',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 9,\n", + " 'head': 66,\n", " 'deps': defaultdict(list, {}),\n", " 'rel': 'det'},\n", - " 13: {'address': 13,\n", + " 68: {'address': 68,\n", " 'word': 'yang',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 
'feats': '_',\n", - " 'head': 14,\n", + " 'head': 69,\n", " 'deps': defaultdict(list, {}),\n", " 'rel': 'nsubj'},\n", - " 14: {'address': 14,\n", + " 69: {'address': 69,\n", " 'word': 'mengaku',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 8,\n", - " 'deps': defaultdict(list,\n", - " {'nsubj': [13],\n", - " 'ccomp': [15],\n", - " 'xcomp': [17],\n", - " 'acl': [21]}),\n", + " 'head': 63,\n", + " 'deps': defaultdict(list, {'nsubj': [68], 'xcomp': [70]}),\n", " 'rel': 'acl'},\n", - " 15: {'address': 15,\n", + " 70: {'address': 70,\n", " 'word': 'bersalah',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 14,\n", - " 'deps': defaultdict(list, {}),\n", - " 'rel': 'ccomp'},\n", - " 16: {'address': 16,\n", + " 'head': 69,\n", + " 'deps': defaultdict(list, {'xcomp': [72]}),\n", + " 'rel': 'xcomp'},\n", + " 71: {'address': 71,\n", " 'word': 'selepas',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 17,\n", + " 'head': 72,\n", " 'deps': defaultdict(list, {}),\n", " 'rel': 'case'},\n", - " 17: {'address': 17,\n", + " 72: {'address': 72,\n", " 'word': 'melanggar',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 14,\n", - " 'deps': defaultdict(list, {'case': [16], 'obj': [18]}),\n", + " 'head': 70,\n", + " 'deps': defaultdict(list,\n", + " {'case': [71], 'obj': [73], 'advcl': [76]}),\n", " 'rel': 'xcomp'},\n", - " 18: {'address': 18,\n", + " 73: {'address': 73,\n", " 'word': 'SOP',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 17,\n", - " 'deps': defaultdict(list, {'compound': [23]}),\n", + " 'head': 72,\n", + " 'deps': defaultdict(list, {}),\n", " 'rel': 'obj'},\n", - " 19: {'address': 19,\n", + " 74: {'address': 74,\n", " 'word': 'kerana',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 23,\n", + " 'head': 
76,\n", " 'deps': defaultdict(list, {}),\n", - " 'rel': 'det'},\n", - " 23: {'address': 23,\n", - " 'word': 'badan',\n", + " 'rel': 'mark'},\n", + " 76: {'address': 76,\n", + " 'word': 'mengambil',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 18,\n", - " 'deps': defaultdict(list, {'det': [19]}),\n", - " 'rel': 'compound'},\n", - " 20: {'address': 20,\n", + " 'head': 72,\n", + " 'deps': defaultdict(list,\n", + " {'mark': [74],\n", + " 'advmod': [75],\n", + " 'obj': [77],\n", + " 'advcl': [80]}),\n", + " 'rel': 'advcl'},\n", + " 75: {'address': 75,\n", " 'word': 'tidak',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 21,\n", + " 'head': 76,\n", " 'deps': defaultdict(list, {}),\n", " 'rel': 'advmod'},\n", - " 21: {'address': 21,\n", - " 'word': 'mengambil',\n", - " 'lemma': '_',\n", - " 'ctag': '_',\n", - " 'tag': '_',\n", - " 'feats': '_',\n", - " 'head': 14,\n", - " 'deps': defaultdict(list, {'advmod': [20], 'advcl': [25]}),\n", - " 'rel': 'acl'},\n", - " 22: {'address': 22,\n", + " 77: {'address': 77,\n", " 'word': 'suhu',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 25,\n", - " 'deps': defaultdict(list, {}),\n", + " 'head': 76,\n", + " 'deps': defaultdict(list, {'compound': [78]}),\n", " 'rel': 'obj'},\n", - " 25: {'address': 25,\n", - " 'word': 'masuk',\n", + " 78: {'address': 78,\n", + " 'word': 'badan',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 21,\n", - " 'deps': defaultdict(list,\n", - " {'obj': [22], 'mark': [24], 'obl': [28, 32]}),\n", - " 'rel': 'advcl'},\n", - " 24: {'address': 24,\n", + " 'head': 77,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'compound'},\n", + " 79: {'address': 79,\n", " 'word': 'ketika',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 25,\n", + " 'head': 80,\n", " 'deps': defaultdict(list, 
{}),\n", " 'rel': 'mark'},\n", - " 26: {'address': 26,\n", + " 80: {'address': 80,\n", + " 'word': 'masuk',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 76,\n", + " 'deps': defaultdict(list, {'mark': [79], 'obl': [83, 85, 87]}),\n", + " 'rel': 'advcl'},\n", + " 81: {'address': 81,\n", " 'word': 'ke',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 28,\n", + " 'head': 83,\n", " 'deps': defaultdict(list, {}),\n", " 'rel': 'case'},\n", - " 28: {'address': 28,\n", + " 83: {'address': 83,\n", " 'word': 'surau',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 25,\n", - " 'deps': defaultdict(list,\n", - " {'case': [26], 'det': [27], 'nmod': [30]}),\n", + " 'head': 80,\n", + " 'deps': defaultdict(list, {'case': [81], 'det': [82]}),\n", " 'rel': 'obl'},\n", - " 27: {'address': 27,\n", + " 82: {'address': 82,\n", " 'word': 'sebuah',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 28,\n", + " 'head': 83,\n", " 'deps': defaultdict(list, {}),\n", " 'rel': 'det'},\n", - " 29: {'address': 29,\n", + " 84: {'address': 84,\n", " 'word': 'di',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 30,\n", + " 'head': 85,\n", " 'deps': defaultdict(list, {}),\n", " 'rel': 'case'},\n", - " 30: {'address': 30,\n", + " 85: {'address': 85,\n", " 'word': 'Langkawi',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 28,\n", - " 'deps': defaultdict(list, {'case': [29]}),\n", - " 'rel': 'nmod'},\n", - " 31: {'address': 31,\n", + " 'head': 80,\n", + " 'deps': defaultdict(list, {'case': [84]}),\n", + " 'rel': 'obl'},\n", + " 86: {'address': 86,\n", " 'word': 'pada',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 32,\n", - " 'deps': defaultdict(list, {'amod': [33]}),\n", + " 'head': 
87,\n", + " 'deps': defaultdict(list, {}),\n", " 'rel': 'case'},\n", - " 32: {'address': 32,\n", + " 87: {'address': 87,\n", " 'word': 'Sabtu',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 25,\n", - " 'deps': defaultdict(list, {'case': [31]}),\n", + " 'head': 80,\n", + " 'deps': defaultdict(list, {'case': [86], 'amod': [88]}),\n", " 'rel': 'obl'},\n", - " 33: {'address': 33,\n", + " 88: {'address': 88,\n", " 'word': 'lalu',\n", " 'lemma': '_',\n", " 'ctag': '_',\n", " 'tag': '_',\n", " 'feats': '_',\n", - " 'head': 31,\n", + " 'head': 87,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'amod'},\n", + " 89: {'address': 89,\n", + " 'word': '.',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", " 'deps': defaultdict(list, {}),\n", - " 'rel': 'amod'}})" + " 'rel': 'punct'}})" ] }, - "execution_count": 21, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -2428,47 +5466,103 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[(('memuji', '_'), 'nsubj', ('Najib', '_')),\n", - " (('Najib', '_'), 'fixed', ('Ahli', '_')),\n", + "[(('melihat', '_'), 'nsubj', ('KUALA', '_')),\n", + " (('KUALA', '_'), 'flat', ('LUMPUR', '_')),\n", + " (('KUALA', '_'), 'obl', ('hal', '_')),\n", + " (('hal', '_'), 'punct', (':', '_')),\n", + " (('hal', '_'), 'case', ('Dalam', '_')),\n", + " (('hal', '_'), 'compound', ('politik', '_')),\n", + " (('KUALA', '_'), 'punct', (',', '_')),\n", + " (('melihat', '_'), 'advmod', ('jarang', '_')),\n", + " (('melihat', '_'), 'advmod', ('sekali', '_')),\n", + " (('melihat', '_'), 'case', ('untuk', '_')),\n", + " (('melihat', '_'), 'advcl', ('mempunyai', '_')),\n", + " (('mempunyai', '_'), 'obj', ('figura', '_')),\n", + " (('figura', '_'), 'nummod', ('dua', '_')),\n", + " (('figura', '_'), 'punct', ('-', '_')),\n", + " (('figura', '_'), 
'compound:plur', ('bekas', '_')),\n", + " (('figura', '_'), 'flat', ('Perdana', '_')),\n", + " (('Perdana', '_'), 'det', ('ini', '_')),\n", + " (('Perdana', '_'), 'flat', ('Menteri', '_')),\n", + " (('Perdana', '_'), 'punct', (',', '_')),\n", + " (('Perdana', '_'), 'appos', ('Datuk', '_')),\n", + " (('Datuk', '_'), 'flat', ('Seri', '_')),\n", + " (('Seri', '_'), 'flat', ('Najib', '_')),\n", + " (('Najib', '_'), 'flat', ('Razak', '_')),\n", + " (('Perdana', '_'), 'conj', ('Tun', '_')),\n", + " (('Tun', '_'), 'cc', ('dan', '_')),\n", + " (('Tun', '_'), 'flat', ('Dr', '_')),\n", + " (('Dr', '_'), 'flat', ('Mahathir', '_')),\n", + " (('Mahathir', '_'), 'flat', ('Mohamad', '_')),\n", + " (('mempunyai', '_'), 'obj', ('pandangan', '_')),\n", + " (('pandangan', '_'), 'punct', (\"'\", '_')),\n", + " (('pandangan', '_'), 'amod', ('sama', '_')),\n", + " (('sama', '_'), 'conj', ('sekapal', '_')),\n", + " (('sekapal', '_'), 'nsubj', ('yang', '_')),\n", + " (('sekapal', '_'), 'punct', (\"'\", '_')),\n", + " (('sekapal', '_'), 'cc', ('atau', '_')),\n", + " (('mempunyai', '_'), 'punct', ('.', '_')),\n", + " (('mempunyai', '_'), 'mark', ('Namun', '_')),\n", + " (('melihat', '_'), 'dep', ('berbeza', '_')),\n", + " (('berbeza', '_'), 'punct', (',', '_')),\n", + " (('berbeza', '_'), 'nsubj', ('situasi', '_')),\n", + " (('situasi', '_'), 'det', ('itu', '_')),\n", + " (('berbeza', '_'), 'advcl', ('melibatkan', '_')),\n", + " (('melibatkan', '_'), 'mark', ('apabila', '_')),\n", + " (('melibatkan', '_'), 'obj', ('isu', '_')),\n", + " (('isu', '_'), 'compound', ('ketidakpatuhan', '_')),\n", + " (('isu', '_'), 'nmod', ('prosedur', '_')),\n", + " (('prosedur', '_'), 'case', ('terhadap', '_')),\n", + " (('prosedur', '_'), 'compound', ('operasi', '_')),\n", + " (('prosedur', '_'), 'amod', ('standard', '_')),\n", + " (('prosedur', '_'), 'appos', ('SOP', '_')),\n", + " (('SOP', '_'), 'punct', ('(', '_')),\n", + " (('SOP', '_'), 'punct', (')', '_')),\n", + " (('berbeza', '_'), 'punct', ('.', 
'_')),\n", + " (('berbeza', '_'), 'dep', ('Najib', '_')),\n", + " (('Najib', '_'), 'punct', (',', '_')),\n", + " (('Najib', '_'), 'nsubj', ('Ahli', '_')),\n", " (('Ahli', '_'), 'nsubj', ('yang', '_')),\n", " (('Ahli', '_'), 'advmod', ('juga', '_')),\n", " (('Ahli', '_'), 'flat', ('Parlimen', '_')),\n", " (('Parlimen', '_'), 'flat', ('Pekan', '_')),\n", + " (('Najib', '_'), 'acl', ('memuji', '_')),\n", " (('memuji', '_'), 'obj', ('sikap', '_')),\n", " (('sikap', '_'), 'flat', ('Ahli', '_')),\n", " (('Ahli', '_'), 'flat', ('Parlimen', '_')),\n", " (('Parlimen', '_'), 'flat', ('Langkawi', '_')),\n", - " (('Ahli', '_'), 'det', ('itu', '_')),\n", + " (('Langkawi', '_'), 'det', ('itu', '_')),\n", " (('sikap', '_'), 'acl', ('mengaku', '_')),\n", " (('mengaku', '_'), 'nsubj', ('yang', '_')),\n", - " (('mengaku', '_'), 'ccomp', ('bersalah', '_')),\n", - " (('mengaku', '_'), 'xcomp', ('melanggar', '_')),\n", + " (('mengaku', '_'), 'xcomp', ('bersalah', '_')),\n", + " (('bersalah', '_'), 'xcomp', ('melanggar', '_')),\n", " (('melanggar', '_'), 'case', ('selepas', '_')),\n", " (('melanggar', '_'), 'obj', ('SOP', '_')),\n", - " (('SOP', '_'), 'compound', ('badan', '_')),\n", - " (('badan', '_'), 'det', ('kerana', '_')),\n", - " (('mengaku', '_'), 'acl', ('mengambil', '_')),\n", + " (('melanggar', '_'), 'advcl', ('mengambil', '_')),\n", + " (('mengambil', '_'), 'mark', ('kerana', '_')),\n", " (('mengambil', '_'), 'advmod', ('tidak', '_')),\n", + " (('mengambil', '_'), 'obj', ('suhu', '_')),\n", + " (('suhu', '_'), 'compound', ('badan', '_')),\n", " (('mengambil', '_'), 'advcl', ('masuk', '_')),\n", - " (('masuk', '_'), 'obj', ('suhu', '_')),\n", " (('masuk', '_'), 'mark', ('ketika', '_')),\n", " (('masuk', '_'), 'obl', ('surau', '_')),\n", " (('surau', '_'), 'case', ('ke', '_')),\n", " (('surau', '_'), 'det', ('sebuah', '_')),\n", - " (('surau', '_'), 'nmod', ('Langkawi', '_')),\n", + " (('masuk', '_'), 'obl', ('Langkawi', '_')),\n", " (('Langkawi', '_'), 'case', ('di', 
'_')),\n", " (('masuk', '_'), 'obl', ('Sabtu', '_')),\n", " (('Sabtu', '_'), 'case', ('pada', '_')),\n", - " (('pada', '_'), 'amod', ('lalu', '_'))]" + " (('Sabtu', '_'), 'amod', ('lalu', '_')),\n", + " (('berbeza', '_'), 'punct', ('.', '_'))]" ] }, - "execution_count": 22, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -2486,7 +5580,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -2495,7 +5589,7 @@ "False" ] }, - "execution_count": 23, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -2519,16 +5613,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 24, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -2540,12 +5634,12 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -2563,16 +5657,16 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "OutMultiEdgeDataView([(1, 7), (2, 4), (3, 4), (4, 1), (5, 4), (6, 5), (8, 7), (9, 8), (10, 9), (11, 10), (12, 9), (13, 14), (14, 8), (15, 14), (16, 17), (17, 14), (18, 17), (19, 23), (20, 21), (21, 14), (22, 25), (23, 18), (24, 25), (25, 21), (26, 28), (27, 28), (28, 25), (29, 30), (30, 28), (31, 32), (32, 25), (33, 31)])" + "OutMultiEdgeDataView([(1, 11), (2, 1), (3, 5), (4, 5), (5, 1), (6, 5), (7, 1), (8, 11), (9, 11), (10, 11), (12, 13), (13, 29), (14, 17), (15, 13), (16, 13), (17, 13), (18, 17), (19, 17), (20, 17), (21, 20), (22, 21), (23, 22), (24, 25), (25, 17), (26, 25), (27, 26), (28, 27), (29, 11), (30, 31), (31, 29), (32, 36), (33, 31), (34, 36), (35, 36), (36, 33), (37, 29), (38, 29), (39, 42), (40, 42), (41, 40), (42, 11), (43, 44), (44, 42), (45, 44), (46, 45), (47, 48), (48, 45), (49, 48), (50, 48), (51, 52), (52, 48), (53, 52), (54, 42), (55, 42), (56, 55), (57, 59), (58, 59), (59, 55), (60, 59), (61, 60), (62, 55), (63, 62), (64, 63), (65, 64), (66, 65), (67, 66), (68, 69), (69, 63), (70, 69), (71, 72), (72, 70), (73, 72), (74, 76), (75, 76), (76, 72), (77, 76), (78, 77), (79, 80), (80, 76), (81, 83), (82, 83), (83, 80), (84, 85), (85, 80), (86, 87), (87, 80), (88, 87), (89, 42)])" ] }, - "execution_count": 26, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -2583,16 +5677,16 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "NodeView((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33))" + "NodeView((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 
54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89))" ] }, - "execution_count": 27, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -2603,48 +5697,104 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{1: 'Najib',\n", - " 2: 'yang',\n", - " 3: 'juga',\n", - " 4: 'Ahli',\n", - " 5: 'Parlimen',\n", - " 6: 'Pekan',\n", - " 7: 'memuji',\n", - " 8: 'sikap',\n", - " 9: 'Ahli',\n", - " 10: 'Parlimen',\n", - " 11: 'Langkawi',\n", - " 12: 'itu',\n", - " 13: 'yang',\n", - " 14: 'mengaku',\n", - " 15: 'bersalah',\n", - " 16: 'selepas',\n", - " 17: 'melanggar',\n", - " 18: 'SOP',\n", - " 19: 'kerana',\n", - " 20: 'tidak',\n", - " 21: 'mengambil',\n", - " 22: 'suhu',\n", - " 23: 'badan',\n", - " 24: 'ketika',\n", - " 25: 'masuk',\n", - " 26: 'ke',\n", - " 27: 'sebuah',\n", - " 28: 'surau',\n", - " 29: 'di',\n", - " 30: 'Langkawi',\n", - " 31: 'pada',\n", - " 32: 'Sabtu',\n", - " 33: 'lalu'}" + "{1: 'KUALA',\n", + " 2: 'LUMPUR',\n", + " 3: ':',\n", + " 4: 'Dalam',\n", + " 5: 'hal',\n", + " 6: 'politik',\n", + " 7: ',',\n", + " 8: 'jarang',\n", + " 9: 'sekali',\n", + " 10: 'untuk',\n", + " 11: 'melihat',\n", + " 12: 'dua',\n", + " 13: 'figura',\n", + " 14: 'ini',\n", + " 15: '-',\n", + " 16: 'bekas',\n", + " 17: 'Perdana',\n", + " 18: 'Menteri',\n", + " 19: ',',\n", + " 20: 'Datuk',\n", + " 21: 'Seri',\n", + " 22: 'Najib',\n", + " 23: 'Razak',\n", + " 24: 'dan',\n", + " 25: 'Tun',\n", + " 26: 'Dr',\n", + " 27: 'Mahathir',\n", + " 28: 'Mohamad',\n", + " 29: 'mempunyai',\n", + " 30: \"'\",\n", + " 31: 'pandangan',\n", + " 32: 'yang',\n", + " 33: 'sama',\n", + " 34: \"'\",\n", + " 35: 'atau',\n", + " 36: 'sekapal',\n", + " 37: '.',\n", + " 38: 'Namun',\n", + " 39: ',',\n", + " 40: 'situasi',\n", + " 41: 'itu',\n", + " 42: 'berbeza',\n", + " 43: 'apabila',\n", + " 44: 
'melibatkan',\n", + " 45: 'isu',\n", + " 46: 'ketidakpatuhan',\n", + " 47: 'terhadap',\n", + " 48: 'prosedur',\n", + " 49: 'operasi',\n", + " 50: 'standard',\n", + " 51: '(',\n", + " 52: 'SOP',\n", + " 53: ')',\n", + " 54: '.',\n", + " 55: 'Najib',\n", + " 56: ',',\n", + " 57: 'yang',\n", + " 58: 'juga',\n", + " 59: 'Ahli',\n", + " 60: 'Parlimen',\n", + " 61: 'Pekan',\n", + " 62: 'memuji',\n", + " 63: 'sikap',\n", + " 64: 'Ahli',\n", + " 65: 'Parlimen',\n", + " 66: 'Langkawi',\n", + " 67: 'itu',\n", + " 68: 'yang',\n", + " 69: 'mengaku',\n", + " 70: 'bersalah',\n", + " 71: 'selepas',\n", + " 72: 'melanggar',\n", + " 73: 'SOP',\n", + " 74: 'kerana',\n", + " 75: 'tidak',\n", + " 76: 'mengambil',\n", + " 77: 'suhu',\n", + " 78: 'badan',\n", + " 79: 'ketika',\n", + " 80: 'masuk',\n", + " 81: 'ke',\n", + " 82: 'sebuah',\n", + " 83: 'surau',\n", + " 84: 'di',\n", + " 85: 'Langkawi',\n", + " 86: 'pada',\n", + " 87: 'Sabtu',\n", + " 88: 'lalu',\n", + " 89: '.'}" ] }, - "execution_count": 28, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -2656,12 +5806,12 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 27, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -2702,7 +5852,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -2711,7 +5861,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -2721,16 +5871,16 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(33, 2)" + "(89, 2)" ] }, - "execution_count": 44, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -2745,12 +5895,12 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -2780,13 +5930,6 @@ " textcoords = 'offset points',\n", " )" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/docs/load-knowledge-graph-from-dependency.ipynb b/docs/load-knowledge-graph-from-dependency.ipynb index 6a08cf06..7621253d 100644 --- a/docs/load-knowledge-graph-from-dependency.ipynb +++ b/docs/load-knowledge-graph-from-dependency.ipynb @@ -44,8 +44,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.11 s, sys: 927 ms, total: 6.04 s\n", - "Wall time: 6.12 s\n" + "CPU times: user 5.14 s, sys: 883 ms, total: 6.03 s\n", + "Wall time: 6.61 s\n" ] } ], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "infrared-discipline", "metadata": {}, "outputs": [ @@ -82,8 +82,8 @@ } ], "source": [ - "quantized_model = malaya.dependency.transformer(model = 'xlnet', quantized = True)\n", - "alxlnet = malaya.dependency.transformer(model = 'alxlnet')" + "quantized_model = malaya.dependency.transformer(version = 'v1', model = 'xlnet', quantized = True)\n", + "alxlnet = malaya.dependency.transformer(version = 'v1', model = 'alxlnet')" ] }, { @@ -120,417 +120,417 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "G\n", - "\n", + "\n", "\n", "\n", "0\n", - "0 (None)\n", + "0 (None)\n", "\n", "\n", "\n", "7\n", - "7 (memuji)\n", + "7 (memuji)\n", "\n", "\n", "\n", "0->7\n", - "\n", - "\n", - "root\n", + "\n", + "\n", + "root\n", "\n", "\n", "\n", "1\n", - "1 (Najib)\n", + "1 (Najib)\n", "\n", "\n", "\n", "7->1\n", - "\n", - "\n", - "nsubj\n", + "\n", + "\n", + "nsubj\n", "\n", "\n", "\n", "8\n", - "8 (sikap)\n", + "8 (sikap)\n", "\n", "\n", "\n", "7->8\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "obj\n", "\n", - "\n", + "\n", "\n", - "4\n", - "4 (Ahli)\n", + "2\n", + "2 (yang)\n", "\n", - "\n", + "\n", "\n", - "1->4\n", - "\n", - "\n", - "nsubj\n", + "1->2\n", + "\n", + "\n", + "nsubj\n", "\n", - "\n", + 
"\n", "\n", - "2\n", - "2 (yang)\n", + "4\n", + "4 (Ahli)\n", "\n", - "\n", + "\n", "\n", - "4->2\n", - "\n", - "\n", - "nsubj\n", + "1->4\n", + "\n", + "\n", + "appos\n", "\n", "\n", "\n", "3\n", - "3 (juga)\n", + "3 (juga)\n", "\n", "\n", "\n", "4->3\n", - "\n", - "\n", - "advmod\n", + "\n", + "\n", + "advmod\n", "\n", "\n", "\n", "5\n", - "5 (Parlimen)\n", + "5 (Parlimen)\n", "\n", "\n", "\n", "4->5\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "6\n", - "6 (Pekan)\n", + "6 (Pekan)\n", "\n", "\n", "\n", "5->6\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "9\n", - "9 (Ahli)\n", + "9 (Ahli)\n", "\n", "\n", "\n", "8->9\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "flat\n", "\n", - "\n", + "\n", "\n", - "14\n", - "14 (mengaku)\n", + "12\n", + "12 (itu)\n", "\n", - "\n", + "\n", "\n", - "8->14\n", - "\n", - "\n", - "acl\n", + "8->12\n", + "\n", + "\n", + "det\n", "\n", - "\n", + "\n", "\n", - "10\n", - "10 (Parlimen)\n", + "14\n", + "14 (mengaku)\n", "\n", - "\n", + "\n", "\n", - "9->10\n", - "\n", - "\n", - "flat\n", + "8->14\n", + "\n", + "\n", + "acl\n", "\n", - "\n", + "\n", "\n", - "12\n", - "12 (itu)\n", + "10\n", + "10 (Parlimen)\n", "\n", - "\n", + "\n", "\n", - "9->12\n", - "\n", - "\n", - "det\n", + "9->10\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "13\n", - "13 (yang)\n", + "13 (yang)\n", "\n", "\n", "\n", "14->13\n", - "\n", - "\n", - "nsubj\n", + "\n", + "\n", + "nsubj\n", "\n", "\n", "\n", "15\n", - "15 (bersalah)\n", + "15 (bersalah)\n", "\n", "\n", "\n", "14->15\n", - "\n", - "\n", - "amod\n", + "\n", + "\n", + "amod\n", "\n", "\n", "\n", "17\n", - "17 (melanggar)\n", + "17 (melanggar)\n", "\n", "\n", "\n", "14->17\n", - "\n", - "\n", - "xcomp\n", + "\n", + "\n", + "xcomp\n", "\n", "\n", "\n", "11\n", - "11 (Langkawi)\n", + "11 (Langkawi)\n", "\n", "\n", "\n", "10->11\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "16\n", - "16 (selepas)\n", + "16 
(selepas)\n", "\n", "\n", "\n", "17->16\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "case\n", "\n", "\n", "\n", "18\n", - "18 (SOP)\n", + "18 (SOP)\n", "\n", "\n", "\n", "17->18\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "obj\n", "\n", "\n", "\n", "21\n", - "21 (mengambil)\n", + "21 (mengambil)\n", "\n", - "\n", + "\n", "\n", - "18->21\n", - "\n", - "\n", - "acl\n", + "17->21\n", + "\n", + "\n", + "acl\n", "\n", "\n", "\n", "23\n", - "23 (badan)\n", + "23 (badan)\n", "\n", "\n", "\n", "18->23\n", - "\n", - "\n", - "compound\n", + "\n", + "\n", + "compound\n", "\n", "\n", "\n", "20\n", - "20 (tidak)\n", + "20 (tidak)\n", "\n", "\n", "\n", "21->20\n", - "\n", - "\n", - "advmod\n", + "\n", + "\n", + "advmod\n", "\n", "\n", "\n", "25\n", - "25 (masuk)\n", + "25 (masuk)\n", "\n", "\n", "\n", "21->25\n", - "\n", - "\n", - "advcl\n", + "\n", + "\n", + "advcl\n", "\n", "\n", "\n", "19\n", - "19 (kerana)\n", + "19 (kerana)\n", + "\n", + "\n", + "\n", + "25->19\n", + "\n", + "\n", + "det\n", "\n", "\n", "\n", "22\n", - "22 (suhu)\n", + "22 (suhu)\n", "\n", "\n", - "\n", + "\n", "25->22\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "obj\n", "\n", "\n", "\n", "24\n", - "24 (ketika)\n", + "24 (ketika)\n", "\n", "\n", - "\n", + "\n", "25->24\n", - "\n", - "\n", - "mark\n", + "\n", + "\n", + "mark\n", "\n", "\n", - "\n", + "\n", "28\n", - "28 (surau)\n", + "28 (surau)\n", "\n", "\n", - "\n", + "\n", "25->28\n", - "\n", - "\n", - "obl\n", - "\n", - "\n", - "\n", - "22->19\n", - "\n", - "\n", - "mark\n", + "\n", + "\n", + "obl\n", "\n", "\n", - "\n", + "\n", "32\n", - "32 (Sabtu)\n", - "\n", - "\n", - "\n", - "24->32\n", - "\n", - "\n", - "obl\n", + "32 (Sabtu)\n", "\n", - "\n", - "\n", - "31\n", - "31 (pada)\n", - "\n", - "\n", - "\n", - "32->31\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "25->32\n", + "\n", + "\n", + "obl\n", "\n", "\n", "\n", "26\n", - "26 (ke)\n", + "26 (ke)\n", "\n", "\n", "\n", "28->26\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + 
"case\n", "\n", "\n", "\n", "27\n", - "27 (sebuah)\n", + "27 (sebuah)\n", "\n", "\n", "\n", "28->27\n", - "\n", - "\n", - "det\n", + "\n", + "\n", + "det\n", "\n", "\n", "\n", "30\n", - "30 (Langkawi)\n", + "30 (Langkawi)\n", "\n", "\n", "\n", "28->30\n", - "\n", - "\n", - "nmod\n", + "\n", + "\n", + "nmod\n", + "\n", + "\n", + "\n", + "31\n", + "31 (pada)\n", + "\n", + "\n", + "\n", + "32->31\n", + "\n", + "\n", + "case\n", "\n", "\n", "\n", "29\n", - "29 (di)\n", + "29 (di)\n", "\n", "\n", "\n", "30->29\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "case\n", "\n", "\n", "\n", "33\n", - "33 (lalu)\n", + "33 (lalu)\n", "\n", "\n", "\n", "31->33\n", - "\n", - "\n", - "amod\n", + "\n", + "\n", + "advmod\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -551,13 +551,14 @@ "### Parse knowledge graph from dependency\n", "\n", "```python\n", - "def parse_from_dependency(tagging, indexing,\n", - " subjects=[['flat', 'subj', 'nsubj', 'csubj']],\n", - " relations=[['acl', 'xcomp', 'ccomp', 'obj', 'conj', 'advcl'], ['obj']],\n", - " objects=[['obj', 'compound', 'flat', 'nmod', 'obl']],\n", - " get_networkx=True):\n", + "def parse_from_dependency(tagging: List[Tuple[str, str]],\n", + " indexing: List[Tuple[str, str]],\n", + " subjects: List[List[str]] = [['flat', 'subj', 'nsubj', 'csubj']],\n", + " relations: List[List[str]] = [['acl', 'xcomp', 'ccomp', 'obj', 'conj', 'advcl'], ['obj']],\n", + " objects: List[List[str]] = [['obj', 'compound', 'flat', 'nmod', 'obl']],\n", + " get_networkx: bool = True):\n", " \"\"\"\n", - " Generate knowledge graphs from dependency parsing.\n", + " Generate knowledge graphs from dependency parsing, we suggest use dependency parsing v1.\n", "\n", " Parameters\n", " ----------\n", @@ -600,13 +601,16 @@ { "data": { "text/plain": [ - "{'result': [{'subject': 'Najib Ahli Parlimen Pekan',\n", - " 'relation': 'memuji sikap mengaku melanggar SOP mengambil masuk',\n", + "{'result': [{'subject': 'Najib',\n", + " 
'relation': 'memuji sikap mengaku melanggar SOP',\n", + " 'object': 'badan'},\n", + " {'subject': 'Najib',\n", + " 'relation': 'memuji sikap mengaku melanggar mengambil masuk',\n", " 'object': 'suhu'},\n", - " {'subject': 'Najib Ahli Parlimen Pekan',\n", + " {'subject': 'Najib',\n", " 'relation': 'memuji sikap',\n", " 'object': 'Ahli Parlimen Langkawi'}],\n", - " 'G': }" + " 'G': }" ] }, "execution_count": 6, @@ -626,7 +630,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -779,7 +783,7 @@ { "data": { "text/plain": [ - "(35, 58)" + "(34, 58)" ] }, "execution_count": 9, @@ -821,7 +825,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] diff --git a/docs/speech-toolkit.rst b/docs/speech-toolkit.rst index 28a64adb..6051d94a 100644 --- a/docs/speech-toolkit.rst +++ b/docs/speech-toolkit.rst @@ -58,6 +58,7 @@ Features - **Speaker overlap**, detect overlap speakers using Finetuned Speaker Vector. - **Speaker Vector**, calculate similarity between speakers using Pretrained Speaker Vector. - **Speech Enhancement**, enhance voice activities using Waveform UNET. +- **SpeechSplit Conversion**, detailed speaking style conversion by disentangling speech into content, timbre, rhythm and pitch using PyWorld and PySPTK. - **Speech-to-Text**, End-to-End Speech to Text for Malay and Mixed (Malay and Singlish) using RNN-Transducer and Wav2Vec2 CTC. - **Super Resolution**, Super Resolution 4x for Waveform. - **Text-to-Speech**, Text to Speech for Malay and Singlish using Tacotron2 and FastSpeech2. @@ -93,6 +94,10 @@ Malaya-Speech also released pretrained models, simply check at `malaya-speech/pr - **FastVC**, Faster and Accurate Voice Conversion using Transformer, no paper produced. - **FastSep**, Faster and Accurate Speech Separation using Transformer, no paper produced. - **wav2vec 2.0**, A Framework for Self-Supervised Learning of Speech Representations, https://arxiv.org/abs/2006.11477 +- **FastSpeechSplit**, Unsupervised Speech Decomposition Via Triple Information Bottleneck using Transformer, no paper produced. +- **Sepformer**, Attention is All You Need in Speech Separation, https://arxiv.org/abs/2010.13154 +- **FastSpeechSplit**, Faster and Accurate Speech Split Conversion using Transformer, no paper produced. 
+- **HuBERT**, Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units, https://arxiv.org/pdf/2106.07447v1.pdf References ----------- diff --git a/example/dependency/load-dependency.ipynb b/example/dependency/load-dependency.ipynb index fdf5d097..d1a09942 100644 --- a/example/dependency/load-dependency.ipynb +++ b/example/dependency/load-dependency.ipynb @@ -38,8 +38,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.07 s, sys: 951 ms, total: 6.02 s\n", - "Wall time: 6.92 s\n" + "CPU times: user 5.15 s, sys: 925 ms, total: 6.07 s\n", + "Wall time: 6.8 s\n" ] } ], @@ -264,12 +264,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### List available transformer Dependency models" + "### List available transformer Dependency models\n", + "\n", + "```python\n", + "def available_transformer(version: str = 'v2'):\n", + " \"\"\"\n", + " List available transformer dependency parsing models.\n", + "\n", + " Parameters\n", + " ----------\n", + " version : str, optional (default='v2')\n", + " Version supported. Allowed values:\n", + "\n", + " * ``'v1'`` - version 1, maintain for knowledge graph.\n", + " * ``'v2'`` - Trained on bigger dataset, better version.\n", + "\n", + " \"\"\"\n", + "```" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -310,19 +326,19 @@ " \n", "
\n", " bert\n", - " 426.0\n", - " 112.00\n", - " 0.855000\n", - " 0.84800\n", - " 0.920\n", + " 455.0\n", + " 114.00\n", + " 0.820450\n", + " 0.79970\n", + " 0.98936\n", "
\n", "
\n", " tiny-bert\n", - " 59.5\n", - " 15.70\n", - " 0.718000\n", - " 0.69400\n", - " 0.886\n", + " 69.7\n", + " 17.50\n", + " 0.795252\n", + " 0.72470\n", + " 0.98939\n", "
\n", "
\n", " albert\n", @@ -330,7 +346,7 @@ " 15.30\n", " 0.821895\n", " 0.79752\n", - " 1.000\n", + " 1.00000\n", "
\n", "
\n", " tiny-albert\n", @@ -338,23 +354,23 @@ " 8.51\n", " 0.786500\n", " 0.75870\n", - " 0.817\n", + " 1.00000\n", "
\n", "
\n", " xlnet\n", - " 450.2\n", - " 119.00\n", - " 0.931000\n", - " 0.92500\n", - " 0.947\n", + " 480.2\n", + " 121.00\n", + " 0.848110\n", + " 0.82741\n", + " 0.92101\n", "
\n", "
\n", " alxlnet\n", - " 50.0\n", - " 14.30\n", - " 0.894000\n", - " 0.88600\n", - " 0.942\n", + " 61.2\n", + " 16.40\n", + " 0.849290\n", + " 0.82810\n", + " 0.92099\n", "
\n", "
\n", "\n", @@ -362,23 +378,23 @@ ], "text/plain": [ " Size (MB) Quantized Size (MB) Arc Accuracy Types Accuracy \\\n", - "bert 426.0 112.00 0.855000 0.84800 \n", - "tiny-bert 59.5 15.70 0.718000 0.69400 \n", + "bert 455.0 114.00 0.820450 0.79970 \n", + "tiny-bert 69.7 17.50 0.795252 0.72470 \n", "albert 60.8 15.30 0.821895 0.79752 \n", "tiny-albert 33.4 8.51 0.786500 0.75870 \n", - "xlnet 450.2 119.00 0.931000 0.92500 \n", - "alxlnet 50.0 14.30 0.894000 0.88600 \n", + "xlnet 480.2 121.00 0.848110 0.82741 \n", + "alxlnet 61.2 16.40 0.849290 0.82810 \n", "\n", " Root Accuracy \n", - "bert 0.920 \n", - "tiny-bert 0.886 \n", - "albert 1.000 \n", - "tiny-albert 0.817 \n", - "xlnet 0.947 \n", - "alxlnet 0.942 " + "bert 0.98936 \n", + "tiny-bert 0.98939 \n", + "albert 1.00000 \n", + "tiny-albert 1.00000 \n", + "xlnet 0.92101 \n", + "alxlnet 0.92099 " ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -394,13 +410,19 @@ "### Load xlnet dependency model\n", "\n", "```python\n", - "def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs):\n", + "def transformer(version: str = 'v2', model: str = 'xlnet', quantized: bool = False, **kwargs):\n", " \"\"\"\n", " Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.\n", "\n", " Parameters\n", " ----------\n", - " model : str, optional (default='bert')\n", + " version : str, optional (default='v2')\n", + " Version supported. Allowed values:\n", + "\n", + " * ``'v1'`` - version 1, maintain for knowledge graph.\n", + " * ``'v2'`` - Trained on bigger dataset, better version.\n", + "\n", + " model : str, optional (default='xlnet')\n", " Model architecture supported. 
Allowed values:\n", "\n", " * ``'bert'`` - Google BERT BASE parameters.\n", @@ -430,46 +452,10 @@ "execution_count": 4, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloading frozen model to /Users/huseinzolkepli/Malaya/dependency-v2/albert/model.pb\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 58.0/57.9 [00:09<00:00, 6.36MB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloading frozen vocab to /Users/huseinzolkepli/Malaya/dependency-v2/albert/sp10m.cased.v10.vocab\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "184%|██████████| 1.00/0.54 [00:00<00:00, 1.08MB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloading frozen tokenizer to /Users/huseinzolkepli/Malaya/dependency-v2/albert/sp10m.cased.v10.model\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "135%|██████████| 1.00/0.74 [00:01<00:00, 1.12s/MB]\n", "INFO:root:running dependency-v2/albert using device /device:CPU:0\n" ] } @@ -498,49 +484,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "WARNING:root:Load quantized model will cause accuracy drop.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloading frozen model to /Users/huseinzolkepli/Malaya/dependency-v2/albert-quantized/model.pb\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "103%|██████████| 15.0/14.6 [00:03<00:00, 4.24MB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloading frozen vocab to /Users/huseinzolkepli/Malaya/dependency-v2/albert-quantized/sp10m.cased.v10.vocab\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "184%|██████████| 1.00/0.54 [00:00<00:00, 1.25MB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "downloading frozen tokenizer to 
/Users/huseinzolkepli/Malaya/dependency-v2/albert-quantized/sp10m.cased.v10.model\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "135%|██████████| 1.00/0.74 [00:01<00:00, 1.14s/MB]\n", + "WARNING:root:Load quantized model will cause accuracy drop.\n", "INFO:root:running dependency-v2/albert-quantized using device /device:CPU:0\n" ] } @@ -573,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -582,7 +526,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -788,10 +732,10 @@ "
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -803,7 +747,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -1009,10 +953,10 @@ "
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 11, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1031,14 +975,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "INFO:root:running dependency-v2/tiny-albert using device /device:CPU:0\n" + "INFO:root:running dependency-v2/alxlnet using device /device:CPU:0\n" ] }, { @@ -1244,17 +1188,17 @@ "
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 12, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "alxlnet = malaya.dependency.transformer(model = 'tiny-albert')\n", - "tagging, indexing = malaya.stack.voting_stack([model, alxlnet, model], string)\n", + "alxlnet = malaya.dependency.transformer(model = 'alxlnet')\n", + "tagging, indexing = malaya.stack.voting_stack([model, model, alxlnet], string)\n", "malaya.dependency.dependency_graph(tagging, indexing).to_graphvis()" ] }, @@ -2374,7 +2318,7 @@ "
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 14, @@ -2389,7 +2333,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -3483,16 +3427,16 @@ "
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tagging, indexing = malaya.stack.voting_stack([model, alxlnet, model], s)\n", + "tagging, indexing = malaya.stack.voting_stack([model, model, alxlnet], s)\n", "malaya.dependency.dependency_graph(tagging, indexing).to_graphvis()" ] }, @@ -3507,16 +3451,16 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -3535,8 +3479,10 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, + "execution_count": 17, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -4629,10 +4575,10 @@ "
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -4650,7 +4596,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -5502,7 +5448,7 @@ " 'rel': 'punct'}})" ] }, - "execution_count": 19, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -5673,7 +5619,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 22, @@ -5693,7 +5639,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -5865,7 +5811,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -5906,142 +5852,16 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "r = quantized_model.vectorize('Kakak saya mempunyai anjing. Dia menyayanginya')" + "r = quantized_model.vectorize(s)" ] }, { "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "G\n", - "\n", - "\n", - "\n", - "0\n", - "0 (None)\n", - "\n", - "\n", - "\n", - "3\n", - "3 (mempunyai)\n", - "\n", - "\n", - "\n", - "0->3\n", - "\n", - "\n", - "root\n", - "\n", - "\n", - "\n", - "1\n", - "1 (Kakak)\n", - "\n", - "\n", - "\n", - "3->1\n", - "\n", - "\n", - "nsubj\n", - "\n", - "\n", - "\n", - "2\n", - "2 (saya)\n", - "\n", - "\n", - "\n", - "3->2\n", - "\n", - "\n", - "nsubj\n", - "\n", - "\n", - "\n", - "4\n", - "4 (anjing)\n", - "\n", - "\n", - "\n", - "3->4\n", - "\n", - "\n", - "obj\n", - "\n", - "\n", - "\n", - "7\n", - "7 (menyayanginya)\n", - "\n", - "\n", - "\n", - "3->7\n", - "\n", - "\n", - "dep\n", - "\n", - "\n", - "\n", - "5\n", - "5 (.)\n", - "\n", - "\n", - "\n", - "1->5\n", - "\n", - "\n", - "punct\n", - "\n", - "\n", - "\n", - "6\n", - "6 (Dia)\n", - "\n", - "\n", - "\n", - "7->6\n", - "\n", - "\n", - "nsubj\n", - "\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "string = 'Husein Zolkepli suka makan ayam. 
Dia pun suka makan daging'\n", - "d_object, tagging, indexing = model.predict(string)\n", - "d_object.to_graphvis()" - ] - }, - { - "cell_type": "code", - "execution_count": 46, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -6051,16 +5871,16 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(7, 2)" + "(89, 2)" ] }, - "execution_count": 47, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -6075,12 +5895,12 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -6110,13 +5930,6 @@ " textcoords = 'offset points',\n", " )" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/example/knowledge-graph-from-dependency/load-knowledge-graph-from-dependency.ipynb b/example/knowledge-graph-from-dependency/load-knowledge-graph-from-dependency.ipynb index 6a08cf06..7621253d 100644 --- a/example/knowledge-graph-from-dependency/load-knowledge-graph-from-dependency.ipynb +++ b/example/knowledge-graph-from-dependency/load-knowledge-graph-from-dependency.ipynb @@ -44,8 +44,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.11 s, sys: 927 ms, total: 6.04 s\n", - "Wall time: 6.12 s\n" + "CPU times: user 5.14 s, sys: 883 ms, total: 6.03 s\n", + "Wall time: 6.61 s\n" ] } ], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "infrared-discipline", "metadata": {}, "outputs": [ @@ -82,8 +82,8 @@ } ], "source": [ - "quantized_model = malaya.dependency.transformer(model = 'xlnet', quantized = True)\n", - "alxlnet = malaya.dependency.transformer(model = 'alxlnet')" + "quantized_model = malaya.dependency.transformer(version = 'v1', model = 'xlnet', quantized = True)\n", + "alxlnet = malaya.dependency.transformer(version = 'v1', model = 'alxlnet')" ] }, { @@ -120,417 +120,417 @@ "\n", "\n", - "\n", - "\n", + "\n", + "\n", "G\n", - "\n", + "\n", "\n", "\n", "0\n", - "0 (None)\n", + "0 (None)\n", "\n", "\n", "\n", "7\n", - "7 (memuji)\n", + "7 (memuji)\n", "\n", "\n", "\n", "0->7\n", - "\n", - "\n", - "root\n", + "\n", + "\n", + "root\n", "\n", "\n", "\n", "1\n", - "1 (Najib)\n", + "1 (Najib)\n", "\n", "\n", "\n", "7->1\n", - "\n", - "\n", - "nsubj\n", + "\n", + "\n", + "nsubj\n", "\n", "\n", "\n", "8\n", - "8 (sikap)\n", + "8 (sikap)\n", "\n", "\n", "\n", "7->8\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "obj\n", "\n", - "\n", + "\n", "\n", - "4\n", - "4 (Ahli)\n", + "2\n", + 
"2 (yang)\n", "\n", - "\n", + "\n", "\n", - "1->4\n", - "\n", - "\n", - "nsubj\n", + "1->2\n", + "\n", + "\n", + "nsubj\n", "\n", - "\n", + "\n", "\n", - "2\n", - "2 (yang)\n", + "4\n", + "4 (Ahli)\n", "\n", - "\n", + "\n", "\n", - "4->2\n", - "\n", - "\n", - "nsubj\n", + "1->4\n", + "\n", + "\n", + "appos\n", "\n", "\n", "\n", "3\n", - "3 (juga)\n", + "3 (juga)\n", "\n", "\n", "\n", "4->3\n", - "\n", - "\n", - "advmod\n", + "\n", + "\n", + "advmod\n", "\n", "\n", "\n", "5\n", - "5 (Parlimen)\n", + "5 (Parlimen)\n", "\n", "\n", "\n", "4->5\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "6\n", - "6 (Pekan)\n", + "6 (Pekan)\n", "\n", "\n", "\n", "5->6\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "9\n", - "9 (Ahli)\n", + "9 (Ahli)\n", "\n", "\n", "\n", "8->9\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "flat\n", "\n", - "\n", + "\n", "\n", - "14\n", - "14 (mengaku)\n", + "12\n", + "12 (itu)\n", "\n", - "\n", + "\n", "\n", - "8->14\n", - "\n", - "\n", - "acl\n", + "8->12\n", + "\n", + "\n", + "det\n", "\n", - "\n", + "\n", "\n", - "10\n", - "10 (Parlimen)\n", + "14\n", + "14 (mengaku)\n", "\n", - "\n", + "\n", "\n", - "9->10\n", - "\n", - "\n", - "flat\n", + "8->14\n", + "\n", + "\n", + "acl\n", "\n", - "\n", + "\n", "\n", - "12\n", - "12 (itu)\n", + "10\n", + "10 (Parlimen)\n", "\n", - "\n", + "\n", "\n", - "9->12\n", - "\n", - "\n", - "det\n", + "9->10\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "13\n", - "13 (yang)\n", + "13 (yang)\n", "\n", "\n", "\n", "14->13\n", - "\n", - "\n", - "nsubj\n", + "\n", + "\n", + "nsubj\n", "\n", "\n", "\n", "15\n", - "15 (bersalah)\n", + "15 (bersalah)\n", "\n", "\n", "\n", "14->15\n", - "\n", - "\n", - "amod\n", + "\n", + "\n", + "amod\n", "\n", "\n", "\n", "17\n", - "17 (melanggar)\n", + "17 (melanggar)\n", "\n", "\n", "\n", "14->17\n", - "\n", - "\n", - "xcomp\n", + "\n", + "\n", + "xcomp\n", "\n", "\n", "\n", "11\n", - "11 (Langkawi)\n", + "11 
(Langkawi)\n", "\n", "\n", "\n", "10->11\n", - "\n", - "\n", - "flat\n", + "\n", + "\n", + "flat\n", "\n", "\n", "\n", "16\n", - "16 (selepas)\n", + "16 (selepas)\n", "\n", "\n", "\n", "17->16\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "case\n", "\n", "\n", "\n", "18\n", - "18 (SOP)\n", + "18 (SOP)\n", "\n", "\n", "\n", "17->18\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "obj\n", "\n", "\n", "\n", "21\n", - "21 (mengambil)\n", + "21 (mengambil)\n", "\n", - "\n", + "\n", "\n", - "18->21\n", - "\n", - "\n", - "acl\n", + "17->21\n", + "\n", + "\n", + "acl\n", "\n", "\n", "\n", "23\n", - "23 (badan)\n", + "23 (badan)\n", "\n", "\n", "\n", "18->23\n", - "\n", - "\n", - "compound\n", + "\n", + "\n", + "compound\n", "\n", "\n", "\n", "20\n", - "20 (tidak)\n", + "20 (tidak)\n", "\n", "\n", "\n", "21->20\n", - "\n", - "\n", - "advmod\n", + "\n", + "\n", + "advmod\n", "\n", "\n", "\n", "25\n", - "25 (masuk)\n", + "25 (masuk)\n", "\n", "\n", "\n", "21->25\n", - "\n", - "\n", - "advcl\n", + "\n", + "\n", + "advcl\n", "\n", "\n", "\n", "19\n", - "19 (kerana)\n", + "19 (kerana)\n", + "\n", + "\n", + "\n", + "25->19\n", + "\n", + "\n", + "det\n", "\n", "\n", "\n", "22\n", - "22 (suhu)\n", + "22 (suhu)\n", "\n", "\n", - "\n", + "\n", "25->22\n", - "\n", - "\n", - "obj\n", + "\n", + "\n", + "obj\n", "\n", "\n", "\n", "24\n", - "24 (ketika)\n", + "24 (ketika)\n", "\n", "\n", - "\n", + "\n", "25->24\n", - "\n", - "\n", - "mark\n", + "\n", + "\n", + "mark\n", "\n", "\n", - "\n", + "\n", "28\n", - "28 (surau)\n", + "28 (surau)\n", "\n", "\n", - "\n", + "\n", "25->28\n", - "\n", - "\n", - "obl\n", - "\n", - "\n", - "\n", - "22->19\n", - "\n", - "\n", - "mark\n", + "\n", + "\n", + "obl\n", "\n", "\n", - "\n", + "\n", "32\n", - "32 (Sabtu)\n", - "\n", - "\n", - "\n", - "24->32\n", - "\n", - "\n", - "obl\n", + "32 (Sabtu)\n", "\n", - "\n", - "\n", - "31\n", - "31 (pada)\n", - "\n", - "\n", - "\n", - "32->31\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "25->32\n", + "\n", + 
"\n", + "obl\n", "\n", "\n", "\n", "26\n", - "26 (ke)\n", + "26 (ke)\n", "\n", "\n", "\n", "28->26\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "case\n", "\n", "\n", "\n", "27\n", - "27 (sebuah)\n", + "27 (sebuah)\n", "\n", "\n", "\n", "28->27\n", - "\n", - "\n", - "det\n", + "\n", + "\n", + "det\n", "\n", "\n", "\n", "30\n", - "30 (Langkawi)\n", + "30 (Langkawi)\n", "\n", "\n", "\n", "28->30\n", - "\n", - "\n", - "nmod\n", + "\n", + "\n", + "nmod\n", + "\n", + "\n", + "\n", + "31\n", + "31 (pada)\n", + "\n", + "\n", + "\n", + "32->31\n", + "\n", + "\n", + "case\n", "\n", "\n", "\n", "29\n", - "29 (di)\n", + "29 (di)\n", "\n", "\n", "\n", "30->29\n", - "\n", - "\n", - "case\n", + "\n", + "\n", + "case\n", "\n", "\n", "\n", "33\n", - "33 (lalu)\n", + "33 (lalu)\n", "\n", "\n", "\n", "31->33\n", - "\n", - "\n", - "amod\n", + "\n", + "\n", + "advmod\n", "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -551,13 +551,14 @@ "### Parse knowledge graph from dependency\n", "\n", "```python\n", - "def parse_from_dependency(tagging, indexing,\n", - " subjects=[['flat', 'subj', 'nsubj', 'csubj']],\n", - " relations=[['acl', 'xcomp', 'ccomp', 'obj', 'conj', 'advcl'], ['obj']],\n", - " objects=[['obj', 'compound', 'flat', 'nmod', 'obl']],\n", - " get_networkx=True):\n", + "def parse_from_dependency(tagging: List[Tuple[str, str]],\n", + " indexing: List[Tuple[str, str]],\n", + " subjects: List[List[str]] = [['flat', 'subj', 'nsubj', 'csubj']],\n", + " relations: List[List[str]] = [['acl', 'xcomp', 'ccomp', 'obj', 'conj', 'advcl'], ['obj']],\n", + " objects: List[List[str]] = [['obj', 'compound', 'flat', 'nmod', 'obl']],\n", + " get_networkx: bool = True):\n", " \"\"\"\n", - " Generate knowledge graphs from dependency parsing.\n", + " Generate knowledge graphs from dependency parsing, we suggest use dependency parsing v1.\n", "\n", " Parameters\n", " ----------\n", @@ -600,13 +601,16 @@ { "data": { "text/plain": [ - "{'result': [{'subject': 'Najib 
Ahli Parlimen Pekan',\n", - " 'relation': 'memuji sikap mengaku melanggar SOP mengambil masuk',\n", + "{'result': [{'subject': 'Najib',\n", + " 'relation': 'memuji sikap mengaku melanggar SOP',\n", + " 'object': 'badan'},\n", + " {'subject': 'Najib',\n", + " 'relation': 'memuji sikap mengaku melanggar mengambil masuk',\n", " 'object': 'suhu'},\n", - " {'subject': 'Najib Ahli Parlimen Pekan',\n", + " {'subject': 'Najib',\n", " 'relation': 'memuji sikap',\n", " 'object': 'Ahli Parlimen Langkawi'}],\n", - " 'G': }" + " 'G': }" ] }, "execution_count": 6, @@ -626,7 +630,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -779,7 +783,7 @@ { "data": { "text/plain": [ - "(35, 58)" + "(34, 58)" ] }, "execution_count": 9, @@ -821,7 +825,7 @@ "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] diff --git a/load-dependency.ipynb b/load-dependency.ipynb new file mode 100644 index 00000000..d1a09942 --- /dev/null +++ b/load-dependency.ipynb @@ -0,0 +1,5956 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dependency Parsing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "This tutorial is available as an IPython notebook at [Malaya/example/dependency](https://github.com/huseinzol05/Malaya/tree/master/example/dependency).\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "This module was trained only on standard language structure, so it is not safe to use it for local language structure.\n", + " \n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.15 s, sys: 925 ms, total: 6.07 s\n", + "Wall time: 6.8 s\n" + ] + } + ], + "source": [ + "%%time\n", + "import malaya" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Describe supported dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:you can read more from https://universaldependencies.org/treebanks/id_pud/index.html\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TagDescription
0aclclausal modifier of noun
1advcladverbial clause modifier
2advmodadverbial modifier
3amodadjectival modifier
4apposappositional modifier
5auxauxiliary
6casecase marking
7ccompclausal complement
8compoundcompound
9compound:plurplural compound
10conjconjunct
11copcop
12csubjclausal subject
13depdependent
14detdeterminer
15fixedmulti-word expression
16flatname
17iobjindirect object
18markmarker
19nmodnominal modifier
20nsubjnominal subject
21objdirect object
22parataxisparataxis
23rootroot
24xcompopen clausal complement
\n", + "
" + ], + "text/plain": [ + " Tag Description\n", + "0 acl clausal modifier of noun\n", + "1 advcl adverbial clause modifier\n", + "2 advmod adverbial modifier\n", + "3 amod adjectival modifier\n", + "4 appos appositional modifier\n", + "5 aux auxiliary\n", + "6 case case marking\n", + "7 ccomp clausal complement\n", + "8 compound compound\n", + "9 compound:plur plural compound\n", + "10 conj conjunct\n", + "11 cop cop\n", + "12 csubj clausal subject\n", + "13 dep dependent\n", + "14 det determiner\n", + "15 fixed multi-word expression\n", + "16 flat name\n", + "17 iobj indirect object\n", + "18 mark marker\n", + "19 nmod nominal modifier\n", + "20 nsubj nominal subject\n", + "21 obj direct object\n", + "22 parataxis parataxis\n", + "23 root root\n", + "24 xcomp open clausal complement" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "malaya.dependency.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### List available transformer Dependency models\n", + "\n", + "```python\n", + "def available_transformer(version: str = 'v2'):\n", + " \"\"\"\n", + " List available transformer dependency parsing models.\n", + "\n", + " Parameters\n", + " ----------\n", + " version : str, optional (default='v2')\n", + " Version supported. Allowed values:\n", + "\n", + " * ``'v1'`` - version 1, maintain for knowledge graph.\n", + " * ``'v2'`` - Trained on bigger dataset, better version.\n", + "\n", + " \"\"\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:tested on 20% test set.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Size (MB)Quantized Size (MB)Arc AccuracyTypes AccuracyRoot Accuracy
bert455.0114.000.8204500.799700.98936
tiny-bert69.717.500.7952520.724700.98939
albert60.815.300.8218950.797521.00000
tiny-albert33.48.510.7865000.758701.00000
xlnet480.2121.000.8481100.827410.92101
alxlnet61.216.400.8492900.828100.92099
\n", + "
" + ], + "text/plain": [ + " Size (MB) Quantized Size (MB) Arc Accuracy Types Accuracy \\\n", + "bert 455.0 114.00 0.820450 0.79970 \n", + "tiny-bert 69.7 17.50 0.795252 0.72470 \n", + "albert 60.8 15.30 0.821895 0.79752 \n", + "tiny-albert 33.4 8.51 0.786500 0.75870 \n", + "xlnet 480.2 121.00 0.848110 0.82741 \n", + "alxlnet 61.2 16.40 0.849290 0.82810 \n", + "\n", + " Root Accuracy \n", + "bert 0.98936 \n", + "tiny-bert 0.98939 \n", + "albert 1.00000 \n", + "tiny-albert 1.00000 \n", + "xlnet 0.92101 \n", + "alxlnet 0.92099 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "malaya.dependency.available_transformer()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load xlnet dependency model\n", + "\n", + "```python\n", + "def transformer(version: str = 'v2', model: str = 'xlnet', quantized: bool = False, **kwargs):\n", + " \"\"\"\n", + " Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention.\n", + "\n", + " Parameters\n", + " ----------\n", + " version : str, optional (default='v2')\n", + " Version supported. Allowed values:\n", + "\n", + " * ``'v1'`` - version 1, maintain for knowledge graph.\n", + " * ``'v2'`` - Trained on bigger dataset, better version.\n", + "\n", + " model : str, optional (default='xlnet')\n", + " Model architecture supported. 
Allowed values:\n", + "\n", + " * ``'bert'`` - Google BERT BASE parameters.\n", + " * ``'tiny-bert'`` - Google BERT TINY parameters.\n", + " * ``'albert'`` - Google ALBERT BASE parameters.\n", + " * ``'tiny-albert'`` - Google ALBERT TINY parameters.\n", + " * ``'xlnet'`` - Google XLNET BASE parameters.\n", + " * ``'alxlnet'`` - Malaya ALXLNET BASE parameters.\n", + "\n", + " quantized : bool, optional (default=False)\n", + " if True, will load 8-bit quantized model.\n", + " Quantized model not necessary faster, totally depends on the machine.\n", + "\n", + " Returns\n", + " -------\n", + " result: model\n", + " List of model classes:\n", + "\n", + " * if `bert` in model, will return `malaya.model.bert.DependencyBERT`.\n", + " * if `xlnet` in model, will return `malaya.model.xlnet.DependencyXLNET`.\n", + " \"\"\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:running dependency-v2/albert using device /device:CPU:0\n" + ] + } + ], + "source": [ + "model = malaya.dependency.transformer(model = 'albert')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load Quantized model\n", + "\n", + "To load 8-bit quantized model, simply pass `quantized = True`, default is `False`.\n", + "\n", + "We can expect a slight accuracy drop from the quantized model, and it is not necessarily faster than the normal 32-bit float model; this depends entirely on the machine."
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Load quantized model will cause accuracy drop.\n", + "INFO:root:running dependency-v2/albert-quantized using device /device:CPU:0\n" + ] + } + ], + "source": [ + "quantized_model = malaya.dependency.transformer(model = 'albert', quantized = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Predict\n", + "\n", + "```python\n", + "def predict(self, string: str):\n", + " \"\"\"\n", + " Tag a string.\n", + "\n", + " Parameters\n", + " ----------\n", + " string: str\n", + "\n", + " Returns\n", + " -------\n", + " result: Tuple\n", + " \"\"\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "string = 'Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar sekiranya mengantuk ketika memandu.'" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "3\n", + "3 (menasihati)\n", + "\n", + "\n", + "\n", + "0->3\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Dr)\n", + "\n", + "\n", + "\n", + "3->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "4\n", + "4 (mereka)\n", + "\n", + "\n", + "\n", + "3->4\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "6\n", + "6 (berhenti)\n", + "\n", + "\n", + "\n", + "3->6\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "15\n", + "15 (.)\n", + "\n", + "\n", + "\n", + "3->15\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Mahathir)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "5\n", + "5 (supaya)\n", + "\n", + "\n", + "\n", 
+ "6->5\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "7\n", + "7 (berehat)\n", + "\n", + "\n", + "\n", + "6->7\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "9\n", + "9 (tidur)\n", + "\n", + "\n", + "\n", + "6->9\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "8\n", + "8 (dan)\n", + "\n", + "\n", + "\n", + "9->8\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "12\n", + "12 (mengantuk)\n", + "\n", + "\n", + "\n", + "9->12\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "14\n", + "14 (memandu)\n", + "\n", + "\n", + "\n", + "9->14\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "10\n", + "10 (sebentar)\n", + "\n", + "\n", + "\n", + "12->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "11\n", + "11 (sekiranya)\n", + "\n", + "\n", + "\n", + "12->11\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "13\n", + "13 (ketika)\n", + "\n", + "\n", + "\n", + "14->13\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d_object, tagging, indexing = model.predict(string)\n", + "d_object.to_graphvis()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "3\n", + "3 (menasihati)\n", + "\n", + "\n", + "\n", + "0->3\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Dr)\n", + "\n", + "\n", + "\n", + "3->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "4\n", + "4 (mereka)\n", + "\n", + "\n", + "\n", + "3->4\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "6\n", + "6 (berhenti)\n", + "\n", + "\n", + "\n", + "3->6\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "15\n", + "15 (.)\n", + "\n", + "\n", + "\n", 
+ "3->15\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Mahathir)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "5\n", + "5 (supaya)\n", + "\n", + "\n", + "\n", + "6->5\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "7\n", + "7 (berehat)\n", + "\n", + "\n", + "\n", + "6->7\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "9\n", + "9 (tidur)\n", + "\n", + "\n", + "\n", + "6->9\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "8\n", + "8 (dan)\n", + "\n", + "\n", + "\n", + "9->8\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "12\n", + "12 (mengantuk)\n", + "\n", + "\n", + "\n", + "9->12\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "14\n", + "14 (memandu)\n", + "\n", + "\n", + "\n", + "9->14\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "10\n", + "10 (sebentar)\n", + "\n", + "\n", + "\n", + "12->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "11\n", + "11 (sekiranya)\n", + "\n", + "\n", + "\n", + "12->11\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "13\n", + "13 (ketika)\n", + "\n", + "\n", + "\n", + "14->13\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d_object, tagging, indexing = quantized_model.predict(string)\n", + "d_object.to_graphvis()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Voting stack model" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:running dependency-v2/alxlnet using device /device:CPU:0\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "3\n", + "3 
(menasihati)\n", + "\n", + "\n", + "\n", + "0->3\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Dr)\n", + "\n", + "\n", + "\n", + "3->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "4\n", + "4 (mereka)\n", + "\n", + "\n", + "\n", + "3->4\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "6\n", + "6 (berhenti)\n", + "\n", + "\n", + "\n", + "3->6\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "15\n", + "15 (.)\n", + "\n", + "\n", + "\n", + "3->15\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Mahathir)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "5\n", + "5 (supaya)\n", + "\n", + "\n", + "\n", + "6->5\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "7\n", + "7 (berehat)\n", + "\n", + "\n", + "\n", + "6->7\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "9\n", + "9 (tidur)\n", + "\n", + "\n", + "\n", + "6->9\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "8\n", + "8 (dan)\n", + "\n", + "\n", + "\n", + "9->8\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "12\n", + "12 (mengantuk)\n", + "\n", + "\n", + "\n", + "9->12\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "14\n", + "14 (memandu)\n", + "\n", + "\n", + "\n", + "9->14\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "10\n", + "10 (sebentar)\n", + "\n", + "\n", + "\n", + "12->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "11\n", + "11 (sekiranya)\n", + "\n", + "\n", + "\n", + "12->11\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "13\n", + "13 (ketika)\n", + "\n", + "\n", + "\n", + "14->13\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alxlnet = malaya.dependency.transformer(model = 'alxlnet')\n", + "tagging, indexing = malaya.stack.voting_stack([model, 
model, alxlnet], string)\n", + "malaya.dependency.dependency_graph(tagging, indexing).to_graphvis()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Harder example" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# https://www.astroawani.com/berita-malaysia/terbaik-tun-kita-geng-najib-razak-puji-tun-m-297884\n", + "\n", + "s = \"\"\"\n", + "KUALA LUMPUR: Dalam hal politik, jarang sekali untuk melihat dua figura ini - bekas Perdana Menteri, Datuk Seri Najib Razak dan Tun Dr Mahathir Mohamad mempunyai 'pandangan yang sama' atau sekapal. Namun, situasi itu berbeza apabila melibatkan isu ketidakpatuhan terhadap prosedur operasi standard (SOP). Najib, yang juga Ahli Parlimen Pekan memuji sikap Ahli Parlimen Langkawi itu yang mengaku bersalah selepas melanggar SOP kerana tidak mengambil suhu badan ketika masuk ke sebuah surau di Langkawi pada Sabtu lalu.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "11\n", + "11 (melihat)\n", + "\n", + "\n", + "\n", + "0->11\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (KUALA)\n", + "\n", + "\n", + "\n", + "11->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "8\n", + "8 (jarang)\n", + "\n", + "\n", + "\n", + "11->8\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "9\n", + "9 (sekali)\n", + "\n", + "\n", + "\n", + "11->9\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "10\n", + "10 (untuk)\n", + "\n", + "\n", + "\n", + "11->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "29\n", + "29 (mempunyai)\n", + "\n", + "\n", + "\n", + "11->29\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "42\n", + "42 (berbeza)\n", + "\n", + "\n", + "\n", 
+ "11->42\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "2\n", + "2 (LUMPUR)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "5\n", + "5 (hal)\n", + "\n", + "\n", + "\n", + "1->5\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "7\n", + "7 (,)\n", + "\n", + "\n", + "\n", + "1->7\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "3\n", + "3 (:)\n", + "\n", + "\n", + "\n", + "5->3\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "4\n", + "4 (Dalam)\n", + "\n", + "\n", + "\n", + "5->4\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "6\n", + "6 (politik)\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "13\n", + "13 (figura)\n", + "\n", + "\n", + "\n", + "29->13\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "31\n", + "31 (pandangan)\n", + "\n", + "\n", + "\n", + "29->31\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "37\n", + "37 (.)\n", + "\n", + "\n", + "\n", + "29->37\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "38\n", + "38 (Namun)\n", + "\n", + "\n", + "\n", + "29->38\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "39\n", + "39 (,)\n", + "\n", + "\n", + "\n", + "42->39\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "40\n", + "40 (situasi)\n", + "\n", + "\n", + "\n", + "42->40\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "54\n", + "54 (.)\n", + "\n", + "\n", + "\n", + "42->54\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "89\n", + "89 (.)\n", + "\n", + "\n", + "\n", + "42->89\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "44\n", + "44 (melibatkan)\n", + "\n", + "\n", + "\n", + "42->44\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "55\n", + "55 (Najib)\n", + "\n", + "\n", + "\n", + "42->55\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "12\n", + "12 (dua)\n", + "\n", + "\n", + "\n", + "13->12\n", + "\n", + 
"\n", + "nummod\n", + "\n", + "\n", + "\n", + "15\n", + "15 (-)\n", + "\n", + "\n", + "\n", + "13->15\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "16\n", + "16 (bekas)\n", + "\n", + "\n", + "\n", + "13->16\n", + "\n", + "\n", + "compound:plur\n", + "\n", + "\n", + "\n", + "17\n", + "17 (Perdana)\n", + "\n", + "\n", + "\n", + "13->17\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "14\n", + "14 (ini)\n", + "\n", + "\n", + "\n", + "17->14\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "18\n", + "18 (Menteri)\n", + "\n", + "\n", + "\n", + "17->18\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "19\n", + "19 (,)\n", + "\n", + "\n", + "\n", + "17->19\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "20\n", + "20 (Datuk)\n", + "\n", + "\n", + "\n", + "17->20\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "25\n", + "25 (Tun)\n", + "\n", + "\n", + "\n", + "17->25\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "21\n", + "21 (Seri)\n", + "\n", + "\n", + "\n", + "20->21\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "24\n", + "24 (dan)\n", + "\n", + "\n", + "\n", + "25->24\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "26\n", + "26 (Dr)\n", + "\n", + "\n", + "\n", + "25->26\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "22\n", + "22 (Najib)\n", + "\n", + "\n", + "\n", + "21->22\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "23\n", + "23 (Razak)\n", + "\n", + "\n", + "\n", + "22->23\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "27\n", + "27 (Mahathir)\n", + "\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "28\n", + "28 (Mohamad)\n", + "\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "30\n", + "30 (')\n", + "\n", + "\n", + "\n", + "31->30\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "33\n", + "33 (sama)\n", + "\n", + "\n", + "\n", + "31->33\n", + 
"\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "36\n", + "36 (sekapal)\n", + "\n", + "\n", + "\n", + "33->36\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "32\n", + "32 (yang)\n", + "\n", + "\n", + "\n", + "36->32\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "34\n", + "34 (')\n", + "\n", + "\n", + "\n", + "36->34\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "35\n", + "35 (atau)\n", + "\n", + "\n", + "\n", + "36->35\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "41\n", + "41 (itu)\n", + "\n", + "\n", + "\n", + "40->41\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "43\n", + "43 (apabila)\n", + "\n", + "\n", + "\n", + "44->43\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "45\n", + "45 (isu)\n", + "\n", + "\n", + "\n", + "44->45\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (,)\n", + "\n", + "\n", + "\n", + "55->56\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "59\n", + "59 (Ahli)\n", + "\n", + "\n", + "\n", + "55->59\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (memuji)\n", + "\n", + "\n", + "\n", + "55->62\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "46\n", + "46 (ketidakpatuhan)\n", + "\n", + "\n", + "\n", + "45->46\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "48\n", + "48 (prosedur)\n", + "\n", + "\n", + "\n", + "45->48\n", + "\n", + "\n", + "nmod\n", + "\n", + "\n", + "\n", + "47\n", + "47 (terhadap)\n", + "\n", + "\n", + "\n", + "48->47\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "49\n", + "49 (operasi)\n", + "\n", + "\n", + "\n", + "48->49\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "50\n", + "50 (standard)\n", + "\n", + "\n", + "\n", + "48->50\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "52\n", + "52 (SOP)\n", + "\n", + "\n", + "\n", + "48->52\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "51\n", + "51 (()\n", + "\n", + "\n", + 
"\n", + "52->51\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "53\n", + "53 ())\n", + "\n", + "\n", + "\n", + "52->53\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "57\n", + "57 (yang)\n", + "\n", + "\n", + "\n", + "59->57\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "58\n", + "58 (juga)\n", + "\n", + "\n", + "\n", + "59->58\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "60\n", + "60 (Parlimen)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "63\n", + "63 (sikap)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "61\n", + "61 (Pekan)\n", + "\n", + "\n", + "\n", + "60->61\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "64\n", + "64 (Ahli)\n", + "\n", + "\n", + "\n", + "63->64\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "69\n", + "69 (mengaku)\n", + "\n", + "\n", + "\n", + "63->69\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "65\n", + "65 (Parlimen)\n", + "\n", + "\n", + "\n", + "64->65\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "68\n", + "68 (yang)\n", + "\n", + "\n", + "\n", + "69->68\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "70\n", + "70 (bersalah)\n", + "\n", + "\n", + "\n", + "69->70\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "66\n", + "66 (Langkawi)\n", + "\n", + "\n", + "\n", + "65->66\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "67\n", + "67 (itu)\n", + "\n", + "\n", + "\n", + "66->67\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "72\n", + "72 (melanggar)\n", + "\n", + "\n", + "\n", + "70->72\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "71\n", + "71 (selepas)\n", + "\n", + "\n", + "\n", + "72->71\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "73\n", + "73 (SOP)\n", + "\n", + "\n", + "\n", + "72->73\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "76\n", + "76 
(mengambil)\n", + "\n", + "\n", + "\n", + "72->76\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "74\n", + "74 (kerana)\n", + "\n", + "\n", + "\n", + "76->74\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tidak)\n", + "\n", + "\n", + "\n", + "76->75\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "77\n", + "77 (suhu)\n", + "\n", + "\n", + "\n", + "76->77\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "80\n", + "80 (masuk)\n", + "\n", + "\n", + "\n", + "76->80\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "78\n", + "78 (badan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "79\n", + "79 (ketika)\n", + "\n", + "\n", + "\n", + "80->79\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "83\n", + "83 (surau)\n", + "\n", + "\n", + "\n", + "80->83\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "85\n", + "85 (Langkawi)\n", + "\n", + "\n", + "\n", + "80->85\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "87\n", + "87 (Sabtu)\n", + "\n", + "\n", + "\n", + "80->87\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "81\n", + "81 (ke)\n", + "\n", + "\n", + "\n", + "83->81\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "82\n", + "82 (sebuah)\n", + "\n", + "\n", + "\n", + "83->82\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "84\n", + "84 (di)\n", + "\n", + "\n", + "\n", + "85->84\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "86\n", + "86 (pada)\n", + "\n", + "\n", + "\n", + "87->86\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "88\n", + "88 (lalu)\n", + "\n", + "\n", + "\n", + "87->88\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d_object, tagging, indexing = model.predict(s)\n", + "d_object.to_graphvis()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "11\n", + "11 (melihat)\n", + "\n", + "\n", + "\n", + "0->11\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (KUALA)\n", + "\n", + "\n", + "\n", + "11->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "8\n", + "8 (jarang)\n", + "\n", + "\n", + "\n", + "11->8\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "9\n", + "9 (sekali)\n", + "\n", + "\n", + "\n", + "11->9\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "10\n", + "10 (untuk)\n", + "\n", + "\n", + "\n", + "11->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "29\n", + "29 (mempunyai)\n", + "\n", + "\n", + "\n", + "11->29\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "42\n", + "42 (berbeza)\n", + "\n", + "\n", + "\n", + "11->42\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "2\n", + "2 (LUMPUR)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "5\n", + "5 (hal)\n", + "\n", + "\n", + "\n", + "1->5\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "7\n", + "7 (,)\n", + "\n", + "\n", + "\n", + "1->7\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "3\n", + "3 (:)\n", + "\n", + "\n", + "\n", + "5->3\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "4\n", + "4 (Dalam)\n", + "\n", + "\n", + "\n", + "5->4\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "6\n", + "6 (politik)\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "13\n", + "13 (figura)\n", + "\n", + "\n", + "\n", + "29->13\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "31\n", + "31 (pandangan)\n", + "\n", + "\n", + "\n", + "29->31\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + 
"37\n", + "37 (.)\n", + "\n", + "\n", + "\n", + "29->37\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "38\n", + "38 (Namun)\n", + "\n", + "\n", + "\n", + "29->38\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "39\n", + "39 (,)\n", + "\n", + "\n", + "\n", + "42->39\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "40\n", + "40 (situasi)\n", + "\n", + "\n", + "\n", + "42->40\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "54\n", + "54 (.)\n", + "\n", + "\n", + "\n", + "42->54\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "89\n", + "89 (.)\n", + "\n", + "\n", + "\n", + "42->89\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "44\n", + "44 (melibatkan)\n", + "\n", + "\n", + "\n", + "42->44\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "55\n", + "55 (Najib)\n", + "\n", + "\n", + "\n", + "42->55\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "12\n", + "12 (dua)\n", + "\n", + "\n", + "\n", + "13->12\n", + "\n", + "\n", + "nummod\n", + "\n", + "\n", + "\n", + "15\n", + "15 (-)\n", + "\n", + "\n", + "\n", + "13->15\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "16\n", + "16 (bekas)\n", + "\n", + "\n", + "\n", + "13->16\n", + "\n", + "\n", + "compound:plur\n", + "\n", + "\n", + "\n", + "17\n", + "17 (Perdana)\n", + "\n", + "\n", + "\n", + "13->17\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "14\n", + "14 (ini)\n", + "\n", + "\n", + "\n", + "17->14\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "18\n", + "18 (Menteri)\n", + "\n", + "\n", + "\n", + "17->18\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "19\n", + "19 (,)\n", + "\n", + "\n", + "\n", + "17->19\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "20\n", + "20 (Datuk)\n", + "\n", + "\n", + "\n", + "17->20\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "25\n", + "25 (Tun)\n", + "\n", + "\n", + "\n", + "17->25\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + 
"\n", + "21\n", + "21 (Seri)\n", + "\n", + "\n", + "\n", + "20->21\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "24\n", + "24 (dan)\n", + "\n", + "\n", + "\n", + "25->24\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "26\n", + "26 (Dr)\n", + "\n", + "\n", + "\n", + "25->26\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "22\n", + "22 (Najib)\n", + "\n", + "\n", + "\n", + "21->22\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "23\n", + "23 (Razak)\n", + "\n", + "\n", + "\n", + "22->23\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "27\n", + "27 (Mahathir)\n", + "\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "28\n", + "28 (Mohamad)\n", + "\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "30\n", + "30 (')\n", + "\n", + "\n", + "\n", + "31->30\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "33\n", + "33 (sama)\n", + "\n", + "\n", + "\n", + "31->33\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "36\n", + "36 (sekapal)\n", + "\n", + "\n", + "\n", + "33->36\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "32\n", + "32 (yang)\n", + "\n", + "\n", + "\n", + "36->32\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "34\n", + "34 (')\n", + "\n", + "\n", + "\n", + "36->34\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "35\n", + "35 (atau)\n", + "\n", + "\n", + "\n", + "36->35\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "41\n", + "41 (itu)\n", + "\n", + "\n", + "\n", + "40->41\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "43\n", + "43 (apabila)\n", + "\n", + "\n", + "\n", + "44->43\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "45\n", + "45 (isu)\n", + "\n", + "\n", + "\n", + "44->45\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (,)\n", + "\n", + "\n", + "\n", + "55->56\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + 
"59\n", + "59 (Ahli)\n", + "\n", + "\n", + "\n", + "55->59\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (memuji)\n", + "\n", + "\n", + "\n", + "55->62\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "46\n", + "46 (ketidakpatuhan)\n", + "\n", + "\n", + "\n", + "45->46\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "48\n", + "48 (prosedur)\n", + "\n", + "\n", + "\n", + "45->48\n", + "\n", + "\n", + "nmod\n", + "\n", + "\n", + "\n", + "47\n", + "47 (terhadap)\n", + "\n", + "\n", + "\n", + "48->47\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "49\n", + "49 (operasi)\n", + "\n", + "\n", + "\n", + "48->49\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "50\n", + "50 (standard)\n", + "\n", + "\n", + "\n", + "48->50\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "52\n", + "52 (SOP)\n", + "\n", + "\n", + "\n", + "48->52\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "51\n", + "51 (()\n", + "\n", + "\n", + "\n", + "52->51\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "53\n", + "53 ())\n", + "\n", + "\n", + "\n", + "52->53\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "57\n", + "57 (yang)\n", + "\n", + "\n", + "\n", + "59->57\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "58\n", + "58 (juga)\n", + "\n", + "\n", + "\n", + "59->58\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "60\n", + "60 (Parlimen)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "63\n", + "63 (sikap)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "61\n", + "61 (Pekan)\n", + "\n", + "\n", + "\n", + "60->61\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "64\n", + "64 (Ahli)\n", + "\n", + "\n", + "\n", + "63->64\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "69\n", + "69 (mengaku)\n", + "\n", + "\n", + "\n", + "63->69\n", + "\n", + "\n", + 
"acl\n", + "\n", + "\n", + "\n", + "65\n", + "65 (Parlimen)\n", + "\n", + "\n", + "\n", + "64->65\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "68\n", + "68 (yang)\n", + "\n", + "\n", + "\n", + "69->68\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "70\n", + "70 (bersalah)\n", + "\n", + "\n", + "\n", + "69->70\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "66\n", + "66 (Langkawi)\n", + "\n", + "\n", + "\n", + "65->66\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "67\n", + "67 (itu)\n", + "\n", + "\n", + "\n", + "66->67\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "72\n", + "72 (melanggar)\n", + "\n", + "\n", + "\n", + "70->72\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "71\n", + "71 (selepas)\n", + "\n", + "\n", + "\n", + "72->71\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "73\n", + "73 (SOP)\n", + "\n", + "\n", + "\n", + "72->73\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "76\n", + "76 (mengambil)\n", + "\n", + "\n", + "\n", + "72->76\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "74\n", + "74 (kerana)\n", + "\n", + "\n", + "\n", + "76->74\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tidak)\n", + "\n", + "\n", + "\n", + "76->75\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "77\n", + "77 (suhu)\n", + "\n", + "\n", + "\n", + "76->77\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "80\n", + "80 (masuk)\n", + "\n", + "\n", + "\n", + "76->80\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "78\n", + "78 (badan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "79\n", + "79 (ketika)\n", + "\n", + "\n", + "\n", + "80->79\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "83\n", + "83 (surau)\n", + "\n", + "\n", + "\n", + "80->83\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "85\n", + "85 (Langkawi)\n", + "\n", + "\n", + "\n", + 
"80->85\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "87\n", + "87 (Sabtu)\n", + "\n", + "\n", + "\n", + "80->87\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "81\n", + "81 (ke)\n", + "\n", + "\n", + "\n", + "83->81\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "82\n", + "82 (sebuah)\n", + "\n", + "\n", + "\n", + "83->82\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "84\n", + "84 (di)\n", + "\n", + "\n", + "\n", + "85->84\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "86\n", + "86 (pada)\n", + "\n", + "\n", + "\n", + "87->86\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "88\n", + "88 (lalu)\n", + "\n", + "\n", + "\n", + "87->88\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tagging, indexing = malaya.stack.voting_stack([model, model, alxlnet], s)\n", + "malaya.dependency.dependency_graph(tagging, indexing).to_graphvis()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dependency graph object\n", + "\n", + "To initiate a dependency graph from dependency models, you need to call `malaya.dependency.dependency_graph`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph = malaya.dependency.dependency_graph(tagging, indexing)\n", + "graph" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### generate graphvis" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "11\n", + "11 (melihat)\n", + "\n", + "\n", + "\n", + "0->11\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (KUALA)\n", + "\n", + "\n", + "\n", + "11->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "8\n", + "8 (jarang)\n", + "\n", + "\n", + "\n", + "11->8\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "9\n", + "9 (sekali)\n", + "\n", + "\n", + "\n", + "11->9\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "10\n", + "10 (untuk)\n", + "\n", + "\n", + "\n", + "11->10\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "29\n", + "29 (mempunyai)\n", + "\n", + "\n", + "\n", + "11->29\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "42\n", + "42 (berbeza)\n", + "\n", + "\n", + "\n", + "11->42\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "2\n", + "2 (LUMPUR)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "5\n", + "5 (hal)\n", + "\n", + "\n", + "\n", + "1->5\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "7\n", + "7 (,)\n", + "\n", + "\n", + "\n", + "1->7\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "3\n", + "3 (:)\n", + "\n", + "\n", + "\n", + "5->3\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "4\n", + "4 
(Dalam)\n", + "\n", + "\n", + "\n", + "5->4\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "6\n", + "6 (politik)\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "13\n", + "13 (figura)\n", + "\n", + "\n", + "\n", + "29->13\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "31\n", + "31 (pandangan)\n", + "\n", + "\n", + "\n", + "29->31\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "37\n", + "37 (.)\n", + "\n", + "\n", + "\n", + "29->37\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "38\n", + "38 (Namun)\n", + "\n", + "\n", + "\n", + "29->38\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "39\n", + "39 (,)\n", + "\n", + "\n", + "\n", + "42->39\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "40\n", + "40 (situasi)\n", + "\n", + "\n", + "\n", + "42->40\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "54\n", + "54 (.)\n", + "\n", + "\n", + "\n", + "42->54\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "89\n", + "89 (.)\n", + "\n", + "\n", + "\n", + "42->89\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "44\n", + "44 (melibatkan)\n", + "\n", + "\n", + "\n", + "42->44\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "55\n", + "55 (Najib)\n", + "\n", + "\n", + "\n", + "42->55\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "12\n", + "12 (dua)\n", + "\n", + "\n", + "\n", + "13->12\n", + "\n", + "\n", + "nummod\n", + "\n", + "\n", + "\n", + "15\n", + "15 (-)\n", + "\n", + "\n", + "\n", + "13->15\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "16\n", + "16 (bekas)\n", + "\n", + "\n", + "\n", + "13->16\n", + "\n", + "\n", + "compound:plur\n", + "\n", + "\n", + "\n", + "17\n", + "17 (Perdana)\n", + "\n", + "\n", + "\n", + "13->17\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "14\n", + "14 (ini)\n", + "\n", + "\n", + "\n", + "17->14\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + 
"18\n", + "18 (Menteri)\n", + "\n", + "\n", + "\n", + "17->18\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "19\n", + "19 (,)\n", + "\n", + "\n", + "\n", + "17->19\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "20\n", + "20 (Datuk)\n", + "\n", + "\n", + "\n", + "17->20\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "25\n", + "25 (Tun)\n", + "\n", + "\n", + "\n", + "17->25\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "21\n", + "21 (Seri)\n", + "\n", + "\n", + "\n", + "20->21\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "24\n", + "24 (dan)\n", + "\n", + "\n", + "\n", + "25->24\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "26\n", + "26 (Dr)\n", + "\n", + "\n", + "\n", + "25->26\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "22\n", + "22 (Najib)\n", + "\n", + "\n", + "\n", + "21->22\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "23\n", + "23 (Razak)\n", + "\n", + "\n", + "\n", + "22->23\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "27\n", + "27 (Mahathir)\n", + "\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "28\n", + "28 (Mohamad)\n", + "\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "30\n", + "30 (')\n", + "\n", + "\n", + "\n", + "31->30\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "33\n", + "33 (sama)\n", + "\n", + "\n", + "\n", + "31->33\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "36\n", + "36 (sekapal)\n", + "\n", + "\n", + "\n", + "33->36\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "32\n", + "32 (yang)\n", + "\n", + "\n", + "\n", + "36->32\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "34\n", + "34 (')\n", + "\n", + "\n", + "\n", + "36->34\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "35\n", + "35 (atau)\n", + "\n", + "\n", + "\n", + "36->35\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + 
"41\n", + "41 (itu)\n", + "\n", + "\n", + "\n", + "40->41\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "43\n", + "43 (apabila)\n", + "\n", + "\n", + "\n", + "44->43\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "45\n", + "45 (isu)\n", + "\n", + "\n", + "\n", + "44->45\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (,)\n", + "\n", + "\n", + "\n", + "55->56\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "59\n", + "59 (Ahli)\n", + "\n", + "\n", + "\n", + "55->59\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (memuji)\n", + "\n", + "\n", + "\n", + "55->62\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "46\n", + "46 (ketidakpatuhan)\n", + "\n", + "\n", + "\n", + "45->46\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "48\n", + "48 (prosedur)\n", + "\n", + "\n", + "\n", + "45->48\n", + "\n", + "\n", + "nmod\n", + "\n", + "\n", + "\n", + "47\n", + "47 (terhadap)\n", + "\n", + "\n", + "\n", + "48->47\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "49\n", + "49 (operasi)\n", + "\n", + "\n", + "\n", + "48->49\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "50\n", + "50 (standard)\n", + "\n", + "\n", + "\n", + "48->50\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "52\n", + "52 (SOP)\n", + "\n", + "\n", + "\n", + "48->52\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "51\n", + "51 (()\n", + "\n", + "\n", + "\n", + "52->51\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "53\n", + "53 ())\n", + "\n", + "\n", + "\n", + "52->53\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "57\n", + "57 (yang)\n", + "\n", + "\n", + "\n", + "59->57\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "58\n", + "58 (juga)\n", + "\n", + "\n", + "\n", + "59->58\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "60\n", + "60 (Parlimen)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "flat\n", + 
"\n", + "\n", + "\n", + "63\n", + "63 (sikap)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "61\n", + "61 (Pekan)\n", + "\n", + "\n", + "\n", + "60->61\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "64\n", + "64 (Ahli)\n", + "\n", + "\n", + "\n", + "63->64\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "69\n", + "69 (mengaku)\n", + "\n", + "\n", + "\n", + "63->69\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "65\n", + "65 (Parlimen)\n", + "\n", + "\n", + "\n", + "64->65\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "68\n", + "68 (yang)\n", + "\n", + "\n", + "\n", + "69->68\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "70\n", + "70 (bersalah)\n", + "\n", + "\n", + "\n", + "69->70\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "66\n", + "66 (Langkawi)\n", + "\n", + "\n", + "\n", + "65->66\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "67\n", + "67 (itu)\n", + "\n", + "\n", + "\n", + "66->67\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "72\n", + "72 (melanggar)\n", + "\n", + "\n", + "\n", + "70->72\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "71\n", + "71 (selepas)\n", + "\n", + "\n", + "\n", + "72->71\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "73\n", + "73 (SOP)\n", + "\n", + "\n", + "\n", + "72->73\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "76\n", + "76 (mengambil)\n", + "\n", + "\n", + "\n", + "72->76\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "74\n", + "74 (kerana)\n", + "\n", + "\n", + "\n", + "76->74\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tidak)\n", + "\n", + "\n", + "\n", + "76->75\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "77\n", + "77 (suhu)\n", + "\n", + "\n", + "\n", + "76->77\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "80\n", + "80 (masuk)\n", + "\n", + "\n", + "\n", + "76->80\n", + "\n", 
+ "\n", + "advcl\n", + "\n", + "\n", + "\n", + "78\n", + "78 (badan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "79\n", + "79 (ketika)\n", + "\n", + "\n", + "\n", + "80->79\n", + "\n", + "\n", + "mark\n", + "\n", + "\n", + "\n", + "83\n", + "83 (surau)\n", + "\n", + "\n", + "\n", + "80->83\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "85\n", + "85 (Langkawi)\n", + "\n", + "\n", + "\n", + "80->85\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "87\n", + "87 (Sabtu)\n", + "\n", + "\n", + "\n", + "80->87\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "81\n", + "81 (ke)\n", + "\n", + "\n", + "\n", + "83->81\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "82\n", + "82 (sebuah)\n", + "\n", + "\n", + "\n", + "83->82\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "84\n", + "84 (di)\n", + "\n", + "\n", + "\n", + "85->84\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "86\n", + "86 (pada)\n", + "\n", + "\n", + "\n", + "87->86\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "88\n", + "88 (lalu)\n", + "\n", + "\n", + "\n", + "87->88\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph.to_graphvis()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "defaultdict(.()>,\n", + " {0: {'address': 0,\n", + " 'word': None,\n", + " 'lemma': None,\n", + " 'ctag': 'TOP',\n", + " 'tag': 'TOP',\n", + " 'feats': None,\n", + " 'head': None,\n", + " 'deps': defaultdict(list, {'root': [11]}),\n", + " 'rel': None},\n", + " 1: {'address': 1,\n", + " 'word': 'KUALA',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + 
" 'head': 11,\n", + " 'deps': defaultdict(list,\n", + " {'flat': [2], 'obl': [5], 'punct': [7]}),\n", + " 'rel': 'nsubj'},\n", + " 11: {'address': 11,\n", + " 'word': 'melihat',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 0,\n", + " 'deps': defaultdict(list,\n", + " {'nsubj': [1],\n", + " 'advmod': [8, 9],\n", + " 'case': [10],\n", + " 'advcl': [29],\n", + " 'dep': [42]}),\n", + " 'rel': 'root'},\n", + " 2: {'address': 2,\n", + " 'word': 'LUMPUR',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 1,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'flat'},\n", + " 3: {'address': 3,\n", + " 'word': ':',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 5,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 5: {'address': 5,\n", + " 'word': 'hal',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 1,\n", + " 'deps': defaultdict(list,\n", + " {'punct': [3], 'case': [4], 'compound': [6]}),\n", + " 'rel': 'obl'},\n", + " 4: {'address': 4,\n", + " 'word': 'Dalam',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 5,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'case'},\n", + " 6: {'address': 6,\n", + " 'word': 'politik',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 5,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'compound'},\n", + " 7: {'address': 7,\n", + " 'word': ',',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 1,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 8: {'address': 8,\n", + " 'word': 'jarang',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': 
defaultdict(list, {}),\n", + " 'rel': 'advmod'},\n", + " 9: {'address': 9,\n", + " 'word': 'sekali',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'advmod'},\n", + " 10: {'address': 10,\n", + " 'word': 'untuk',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'case'},\n", + " 12: {'address': 12,\n", + " 'word': 'dua',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 13,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'nummod'},\n", + " 13: {'address': 13,\n", + " 'word': 'figura',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 29,\n", + " 'deps': defaultdict(list,\n", + " {'nummod': [12],\n", + " 'punct': [15],\n", + " 'compound:plur': [16],\n", + " 'flat': [17]}),\n", + " 'rel': 'obj'},\n", + " 29: {'address': 29,\n", + " 'word': 'mempunyai',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': defaultdict(list,\n", + " {'obj': [13, 31], 'punct': [37], 'mark': [38]}),\n", + " 'rel': 'advcl'},\n", + " 14: {'address': 14,\n", + " 'word': 'ini',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 17,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'det'},\n", + " 17: {'address': 17,\n", + " 'word': 'Perdana',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 13,\n", + " 'deps': defaultdict(list,\n", + " {'det': [14],\n", + " 'flat': [18],\n", + " 'punct': [19],\n", + " 'appos': [20],\n", + " 'conj': [25]}),\n", + " 'rel': 'flat'},\n", + " 15: {'address': 15,\n", + " 'word': '-',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + 
" 'head': 13,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 16: {'address': 16,\n", + " 'word': 'bekas',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 13,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'compound:plur'},\n", + " 18: {'address': 18,\n", + " 'word': 'Menteri',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 17,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'flat'},\n", + " 19: {'address': 19,\n", + " 'word': ',',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 17,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 20: {'address': 20,\n", + " 'word': 'Datuk',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 17,\n", + " 'deps': defaultdict(list, {'flat': [21]}),\n", + " 'rel': 'appos'},\n", + " 21: {'address': 21,\n", + " 'word': 'Seri',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 20,\n", + " 'deps': defaultdict(list, {'flat': [22]}),\n", + " 'rel': 'flat'},\n", + " 22: {'address': 22,\n", + " 'word': 'Najib',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 21,\n", + " 'deps': defaultdict(list, {'flat': [23]}),\n", + " 'rel': 'flat'},\n", + " 23: {'address': 23,\n", + " 'word': 'Razak',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 22,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'flat'},\n", + " 24: {'address': 24,\n", + " 'word': 'dan',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 25,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'cc'},\n", + " 25: {'address': 25,\n", + " 'word': 'Tun',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': 
'_',\n", + " 'feats': '_',\n", + " 'head': 17,\n", + " 'deps': defaultdict(list, {'cc': [24], 'flat': [26]}),\n", + " 'rel': 'conj'},\n", + " 26: {'address': 26,\n", + " 'word': 'Dr',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 25,\n", + " 'deps': defaultdict(list, {'flat': [27]}),\n", + " 'rel': 'flat'},\n", + " 27: {'address': 27,\n", + " 'word': 'Mahathir',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 26,\n", + " 'deps': defaultdict(list, {'flat': [28]}),\n", + " 'rel': 'flat'},\n", + " 28: {'address': 28,\n", + " 'word': 'Mohamad',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 27,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'flat'},\n", + " 30: {'address': 30,\n", + " 'word': \"'\",\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 31,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 31: {'address': 31,\n", + " 'word': 'pandangan',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 29,\n", + " 'deps': defaultdict(list, {'punct': [30], 'amod': [33]}),\n", + " 'rel': 'obj'},\n", + " 32: {'address': 32,\n", + " 'word': 'yang',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 36,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'nsubj'},\n", + " 36: {'address': 36,\n", + " 'word': 'sekapal',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 33,\n", + " 'deps': defaultdict(list,\n", + " {'nsubj': [32], 'punct': [34], 'cc': [35]}),\n", + " 'rel': 'conj'},\n", + " 33: {'address': 33,\n", + " 'word': 'sama',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 31,\n", + " 'deps': defaultdict(list, {'conj': [36]}),\n", + 
" 'rel': 'amod'},\n", + " 34: {'address': 34,\n", + " 'word': \"'\",\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 36,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 35: {'address': 35,\n", + " 'word': 'atau',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 36,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'cc'},\n", + " 37: {'address': 37,\n", + " 'word': '.',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 29,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 38: {'address': 38,\n", + " 'word': 'Namun',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 29,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'mark'},\n", + " 39: {'address': 39,\n", + " 'word': ',',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 42: {'address': 42,\n", + " 'word': 'berbeza',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 11,\n", + " 'deps': defaultdict(list,\n", + " {'punct': [39, 54, 89],\n", + " 'nsubj': [40],\n", + " 'advcl': [44],\n", + " 'dep': [55]}),\n", + " 'rel': 'dep'},\n", + " 40: {'address': 40,\n", + " 'word': 'situasi',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list, {'det': [41]}),\n", + " 'rel': 'nsubj'},\n", + " 41: {'address': 41,\n", + " 'word': 'itu',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 40,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'det'},\n", + " 43: {'address': 43,\n", + " 'word': 'apabila',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': 
'_',\n", + " 'feats': '_',\n", + " 'head': 44,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'mark'},\n", + " 44: {'address': 44,\n", + " 'word': 'melibatkan',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list, {'mark': [43], 'obj': [45]}),\n", + " 'rel': 'advcl'},\n", + " 45: {'address': 45,\n", + " 'word': 'isu',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 44,\n", + " 'deps': defaultdict(list, {'compound': [46], 'nmod': [48]}),\n", + " 'rel': 'obj'},\n", + " 46: {'address': 46,\n", + " 'word': 'ketidakpatuhan',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 45,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'compound'},\n", + " 47: {'address': 47,\n", + " 'word': 'terhadap',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 48,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'case'},\n", + " 48: {'address': 48,\n", + " 'word': 'prosedur',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 45,\n", + " 'deps': defaultdict(list,\n", + " {'case': [47],\n", + " 'compound': [49],\n", + " 'amod': [50],\n", + " 'appos': [52]}),\n", + " 'rel': 'nmod'},\n", + " 49: {'address': 49,\n", + " 'word': 'operasi',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 48,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'compound'},\n", + " 50: {'address': 50,\n", + " 'word': 'standard',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 48,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'amod'},\n", + " 51: {'address': 51,\n", + " 'word': '(',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 52,\n", + " 
'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 52: {'address': 52,\n", + " 'word': 'SOP',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 48,\n", + " 'deps': defaultdict(list, {'punct': [51, 53]}),\n", + " 'rel': 'appos'},\n", + " 53: {'address': 53,\n", + " 'word': ')',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 52,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 54: {'address': 54,\n", + " 'word': '.',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 55: {'address': 55,\n", + " 'word': 'Najib',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list,\n", + " {'punct': [56], 'nsubj': [59], 'acl': [62]}),\n", + " 'rel': 'dep'},\n", + " 56: {'address': 56,\n", + " 'word': ',',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 55,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'},\n", + " 57: {'address': 57,\n", + " 'word': 'yang',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 59,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'nsubj'},\n", + " 59: {'address': 59,\n", + " 'word': 'Ahli',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 55,\n", + " 'deps': defaultdict(list,\n", + " {'nsubj': [57], 'advmod': [58], 'flat': [60]}),\n", + " 'rel': 'nsubj'},\n", + " 58: {'address': 58,\n", + " 'word': 'juga',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 59,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'advmod'},\n", + " 60: {'address': 60,\n", + " 'word': 'Parlimen',\n", + " 
'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 59,\n", + " 'deps': defaultdict(list, {'flat': [61]}),\n", + " 'rel': 'flat'},\n", + " 61: {'address': 61,\n", + " 'word': 'Pekan',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 60,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'flat'},\n", + " 62: {'address': 62,\n", + " 'word': 'memuji',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 55,\n", + " 'deps': defaultdict(list, {'obj': [63]}),\n", + " 'rel': 'acl'},\n", + " 63: {'address': 63,\n", + " 'word': 'sikap',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 62,\n", + " 'deps': defaultdict(list, {'flat': [64], 'acl': [69]}),\n", + " 'rel': 'obj'},\n", + " 64: {'address': 64,\n", + " 'word': 'Ahli',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 63,\n", + " 'deps': defaultdict(list, {'flat': [65]}),\n", + " 'rel': 'flat'},\n", + " 65: {'address': 65,\n", + " 'word': 'Parlimen',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 64,\n", + " 'deps': defaultdict(list, {'flat': [66]}),\n", + " 'rel': 'flat'},\n", + " 66: {'address': 66,\n", + " 'word': 'Langkawi',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 65,\n", + " 'deps': defaultdict(list, {'det': [67]}),\n", + " 'rel': 'flat'},\n", + " 67: {'address': 67,\n", + " 'word': 'itu',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 66,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'det'},\n", + " 68: {'address': 68,\n", + " 'word': 'yang',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 69,\n", + " 'deps': defaultdict(list, {}),\n", + " 
'rel': 'nsubj'},\n", + " 69: {'address': 69,\n", + " 'word': 'mengaku',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 63,\n", + " 'deps': defaultdict(list, {'nsubj': [68], 'xcomp': [70]}),\n", + " 'rel': 'acl'},\n", + " 70: {'address': 70,\n", + " 'word': 'bersalah',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 69,\n", + " 'deps': defaultdict(list, {'xcomp': [72]}),\n", + " 'rel': 'xcomp'},\n", + " 71: {'address': 71,\n", + " 'word': 'selepas',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 72,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'case'},\n", + " 72: {'address': 72,\n", + " 'word': 'melanggar',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 70,\n", + " 'deps': defaultdict(list,\n", + " {'case': [71], 'obj': [73], 'advcl': [76]}),\n", + " 'rel': 'xcomp'},\n", + " 73: {'address': 73,\n", + " 'word': 'SOP',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 72,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'obj'},\n", + " 74: {'address': 74,\n", + " 'word': 'kerana',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 76,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'mark'},\n", + " 76: {'address': 76,\n", + " 'word': 'mengambil',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 72,\n", + " 'deps': defaultdict(list,\n", + " {'mark': [74],\n", + " 'advmod': [75],\n", + " 'obj': [77],\n", + " 'advcl': [80]}),\n", + " 'rel': 'advcl'},\n", + " 75: {'address': 75,\n", + " 'word': 'tidak',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 76,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'advmod'},\n", + " 77: 
{'address': 77,\n", + " 'word': 'suhu',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 76,\n", + " 'deps': defaultdict(list, {'compound': [78]}),\n", + " 'rel': 'obj'},\n", + " 78: {'address': 78,\n", + " 'word': 'badan',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 77,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'compound'},\n", + " 79: {'address': 79,\n", + " 'word': 'ketika',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 80,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'mark'},\n", + " 80: {'address': 80,\n", + " 'word': 'masuk',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 76,\n", + " 'deps': defaultdict(list, {'mark': [79], 'obl': [83, 85, 87]}),\n", + " 'rel': 'advcl'},\n", + " 81: {'address': 81,\n", + " 'word': 'ke',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 83,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'case'},\n", + " 83: {'address': 83,\n", + " 'word': 'surau',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 80,\n", + " 'deps': defaultdict(list, {'case': [81], 'det': [82]}),\n", + " 'rel': 'obl'},\n", + " 82: {'address': 82,\n", + " 'word': 'sebuah',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 83,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'det'},\n", + " 84: {'address': 84,\n", + " 'word': 'di',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 85,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'case'},\n", + " 85: {'address': 85,\n", + " 'word': 'Langkawi',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 80,\n", + " 
'deps': defaultdict(list, {'case': [84]}),\n", + " 'rel': 'obl'},\n", + " 86: {'address': 86,\n", + " 'word': 'pada',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 87,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'case'},\n", + " 87: {'address': 87,\n", + " 'word': 'Sabtu',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 80,\n", + " 'deps': defaultdict(list, {'case': [86], 'amod': [88]}),\n", + " 'rel': 'obl'},\n", + " 88: {'address': 88,\n", + " 'word': 'lalu',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 87,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'amod'},\n", + " 89: {'address': 89,\n", + " 'word': '.',\n", + " 'lemma': '_',\n", + " 'ctag': '_',\n", + " 'tag': '_',\n", + " 'feats': '_',\n", + " 'head': 42,\n", + " 'deps': defaultdict(list, {}),\n", + " 'rel': 'punct'}})" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph.nodes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Flat the graph" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(('melihat', '_'), 'nsubj', ('KUALA', '_')),\n", + " (('KUALA', '_'), 'flat', ('LUMPUR', '_')),\n", + " (('KUALA', '_'), 'obl', ('hal', '_')),\n", + " (('hal', '_'), 'punct', (':', '_')),\n", + " (('hal', '_'), 'case', ('Dalam', '_')),\n", + " (('hal', '_'), 'compound', ('politik', '_')),\n", + " (('KUALA', '_'), 'punct', (',', '_')),\n", + " (('melihat', '_'), 'advmod', ('jarang', '_')),\n", + " (('melihat', '_'), 'advmod', ('sekali', '_')),\n", + " (('melihat', '_'), 'case', ('untuk', '_')),\n", + " (('melihat', '_'), 'advcl', ('mempunyai', '_')),\n", + " (('mempunyai', '_'), 'obj', ('figura', '_')),\n", + " (('figura', '_'), 'nummod', ('dua', '_')),\n", + " 
(('figura', '_'), 'punct', ('-', '_')),\n", + " (('figura', '_'), 'compound:plur', ('bekas', '_')),\n", + " (('figura', '_'), 'flat', ('Perdana', '_')),\n", + " (('Perdana', '_'), 'det', ('ini', '_')),\n", + " (('Perdana', '_'), 'flat', ('Menteri', '_')),\n", + " (('Perdana', '_'), 'punct', (',', '_')),\n", + " (('Perdana', '_'), 'appos', ('Datuk', '_')),\n", + " (('Datuk', '_'), 'flat', ('Seri', '_')),\n", + " (('Seri', '_'), 'flat', ('Najib', '_')),\n", + " (('Najib', '_'), 'flat', ('Razak', '_')),\n", + " (('Perdana', '_'), 'conj', ('Tun', '_')),\n", + " (('Tun', '_'), 'cc', ('dan', '_')),\n", + " (('Tun', '_'), 'flat', ('Dr', '_')),\n", + " (('Dr', '_'), 'flat', ('Mahathir', '_')),\n", + " (('Mahathir', '_'), 'flat', ('Mohamad', '_')),\n", + " (('mempunyai', '_'), 'obj', ('pandangan', '_')),\n", + " (('pandangan', '_'), 'punct', (\"'\", '_')),\n", + " (('pandangan', '_'), 'amod', ('sama', '_')),\n", + " (('sama', '_'), 'conj', ('sekapal', '_')),\n", + " (('sekapal', '_'), 'nsubj', ('yang', '_')),\n", + " (('sekapal', '_'), 'punct', (\"'\", '_')),\n", + " (('sekapal', '_'), 'cc', ('atau', '_')),\n", + " (('mempunyai', '_'), 'punct', ('.', '_')),\n", + " (('mempunyai', '_'), 'mark', ('Namun', '_')),\n", + " (('melihat', '_'), 'dep', ('berbeza', '_')),\n", + " (('berbeza', '_'), 'punct', (',', '_')),\n", + " (('berbeza', '_'), 'nsubj', ('situasi', '_')),\n", + " (('situasi', '_'), 'det', ('itu', '_')),\n", + " (('berbeza', '_'), 'advcl', ('melibatkan', '_')),\n", + " (('melibatkan', '_'), 'mark', ('apabila', '_')),\n", + " (('melibatkan', '_'), 'obj', ('isu', '_')),\n", + " (('isu', '_'), 'compound', ('ketidakpatuhan', '_')),\n", + " (('isu', '_'), 'nmod', ('prosedur', '_')),\n", + " (('prosedur', '_'), 'case', ('terhadap', '_')),\n", + " (('prosedur', '_'), 'compound', ('operasi', '_')),\n", + " (('prosedur', '_'), 'amod', ('standard', '_')),\n", + " (('prosedur', '_'), 'appos', ('SOP', '_')),\n", + " (('SOP', '_'), 'punct', ('(', '_')),\n", + " (('SOP', '_'), 
'punct', (')', '_')),\n", + " (('berbeza', '_'), 'punct', ('.', '_')),\n", + " (('berbeza', '_'), 'dep', ('Najib', '_')),\n", + " (('Najib', '_'), 'punct', (',', '_')),\n", + " (('Najib', '_'), 'nsubj', ('Ahli', '_')),\n", + " (('Ahli', '_'), 'nsubj', ('yang', '_')),\n", + " (('Ahli', '_'), 'advmod', ('juga', '_')),\n", + " (('Ahli', '_'), 'flat', ('Parlimen', '_')),\n", + " (('Parlimen', '_'), 'flat', ('Pekan', '_')),\n", + " (('Najib', '_'), 'acl', ('memuji', '_')),\n", + " (('memuji', '_'), 'obj', ('sikap', '_')),\n", + " (('sikap', '_'), 'flat', ('Ahli', '_')),\n", + " (('Ahli', '_'), 'flat', ('Parlimen', '_')),\n", + " (('Parlimen', '_'), 'flat', ('Langkawi', '_')),\n", + " (('Langkawi', '_'), 'det', ('itu', '_')),\n", + " (('sikap', '_'), 'acl', ('mengaku', '_')),\n", + " (('mengaku', '_'), 'nsubj', ('yang', '_')),\n", + " (('mengaku', '_'), 'xcomp', ('bersalah', '_')),\n", + " (('bersalah', '_'), 'xcomp', ('melanggar', '_')),\n", + " (('melanggar', '_'), 'case', ('selepas', '_')),\n", + " (('melanggar', '_'), 'obj', ('SOP', '_')),\n", + " (('melanggar', '_'), 'advcl', ('mengambil', '_')),\n", + " (('mengambil', '_'), 'mark', ('kerana', '_')),\n", + " (('mengambil', '_'), 'advmod', ('tidak', '_')),\n", + " (('mengambil', '_'), 'obj', ('suhu', '_')),\n", + " (('suhu', '_'), 'compound', ('badan', '_')),\n", + " (('mengambil', '_'), 'advcl', ('masuk', '_')),\n", + " (('masuk', '_'), 'mark', ('ketika', '_')),\n", + " (('masuk', '_'), 'obl', ('surau', '_')),\n", + " (('surau', '_'), 'case', ('ke', '_')),\n", + " (('surau', '_'), 'det', ('sebuah', '_')),\n", + " (('masuk', '_'), 'obl', ('Langkawi', '_')),\n", + " (('Langkawi', '_'), 'case', ('di', '_')),\n", + " (('masuk', '_'), 'obl', ('Sabtu', '_')),\n", + " (('Sabtu', '_'), 'case', ('pada', '_')),\n", + " (('Sabtu', '_'), 'amod', ('lalu', '_')),\n", + " (('berbeza', '_'), 'punct', ('.', '_'))]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"list(graph.triples())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Check the graph contains cycles" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "graph.contains_cycle()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Generate networkx\n", + "\n", + "Make sure you already installed networkx, \n", + "\n", + "```bash\n", + "pip install networkx\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "digraph = graph.to_networkx()\n", + "digraph" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import networkx as nx\n", + "import matplotlib.pyplot as plt\n", + "nx.draw_networkx(digraph)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "OutMultiEdgeDataView([(1, 11), (2, 1), (3, 5), (4, 5), (5, 1), (6, 5), (7, 1), (8, 11), (9, 11), (10, 11), (12, 13), (13, 29), (14, 17), (15, 13), (16, 13), (17, 13), (18, 17), (19, 17), (20, 17), (21, 20), (22, 21), (23, 22), (24, 25), (25, 17), (26, 25), (27, 26), (28, 27), (29, 11), (30, 31), (31, 29), (32, 36), (33, 31), (34, 36), (35, 36), (36, 33), (37, 29), (38, 29), (39, 42), (40, 42), (41, 40), (42, 11), (43, 44), (44, 42), (45, 44), (46, 45), (47, 48), (48, 45), (49, 48), (50, 48), (51, 52), (52, 48), (53, 52), (54, 42), (55, 42), (56, 55), (57, 59), (58, 59), (59, 55), (60, 59), (61, 60), (62, 55), (63, 62), (64, 63), (65, 64), (66, 65), (67, 66), (68, 69), (69, 63), (70, 69), (71, 72), (72, 70), (73, 72), (74, 76), (75, 76), (76, 72), (77, 76), (78, 77), (79, 80), (80, 76), (81, 83), (82, 83), (83, 80), (84, 85), (85, 80), (86, 87), (87, 80), (88, 87), (89, 42)])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "digraph.edges()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NodeView((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89))" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "digraph.nodes()" + ] + }, + { + "cell_type": 
"code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{1: 'KUALA',\n", + " 2: 'LUMPUR',\n", + " 3: ':',\n", + " 4: 'Dalam',\n", + " 5: 'hal',\n", + " 6: 'politik',\n", + " 7: ',',\n", + " 8: 'jarang',\n", + " 9: 'sekali',\n", + " 10: 'untuk',\n", + " 11: 'melihat',\n", + " 12: 'dua',\n", + " 13: 'figura',\n", + " 14: 'ini',\n", + " 15: '-',\n", + " 16: 'bekas',\n", + " 17: 'Perdana',\n", + " 18: 'Menteri',\n", + " 19: ',',\n", + " 20: 'Datuk',\n", + " 21: 'Seri',\n", + " 22: 'Najib',\n", + " 23: 'Razak',\n", + " 24: 'dan',\n", + " 25: 'Tun',\n", + " 26: 'Dr',\n", + " 27: 'Mahathir',\n", + " 28: 'Mohamad',\n", + " 29: 'mempunyai',\n", + " 30: \"'\",\n", + " 31: 'pandangan',\n", + " 32: 'yang',\n", + " 33: 'sama',\n", + " 34: \"'\",\n", + " 35: 'atau',\n", + " 36: 'sekapal',\n", + " 37: '.',\n", + " 38: 'Namun',\n", + " 39: ',',\n", + " 40: 'situasi',\n", + " 41: 'itu',\n", + " 42: 'berbeza',\n", + " 43: 'apabila',\n", + " 44: 'melibatkan',\n", + " 45: 'isu',\n", + " 46: 'ketidakpatuhan',\n", + " 47: 'terhadap',\n", + " 48: 'prosedur',\n", + " 49: 'operasi',\n", + " 50: 'standard',\n", + " 51: '(',\n", + " 52: 'SOP',\n", + " 53: ')',\n", + " 54: '.',\n", + " 55: 'Najib',\n", + " 56: ',',\n", + " 57: 'yang',\n", + " 58: 'juga',\n", + " 59: 'Ahli',\n", + " 60: 'Parlimen',\n", + " 61: 'Pekan',\n", + " 62: 'memuji',\n", + " 63: 'sikap',\n", + " 64: 'Ahli',\n", + " 65: 'Parlimen',\n", + " 66: 'Langkawi',\n", + " 67: 'itu',\n", + " 68: 'yang',\n", + " 69: 'mengaku',\n", + " 70: 'bersalah',\n", + " 71: 'selepas',\n", + " 72: 'melanggar',\n", + " 73: 'SOP',\n", + " 74: 'kerana',\n", + " 75: 'tidak',\n", + " 76: 'mengambil',\n", + " 77: 'suhu',\n", + " 78: 'badan',\n", + " 79: 'ketika',\n", + " 80: 'masuk',\n", + " 81: 'ke',\n", + " 82: 'sebuah',\n", + " 83: 'surau',\n", + " 84: 'di',\n", + " 85: 'Langkawi',\n", + " 86: 'pada',\n", + " 87: 'Sabtu',\n", + " 88: 'lalu',\n", + " 89: '.'}" + ] + }, + "execution_count": 26, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels = {i:graph.get_by_address(i)['word'] for i in digraph.nodes()}\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(15,5))\n", + "nx.draw_networkx(digraph,labels=labels)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Vectorize\n", + "\n", + "Let say you want to visualize word level in lower dimension, you can use `model.vectorize`,\n", + "\n", + "```python\n", + "def vectorize(self, string: str):\n", + " \"\"\"\n", + " vectorize a string.\n", + "\n", + " Parameters\n", + " ----------\n", + " string: List[str]\n", + "\n", + " Returns\n", + " -------\n", + " result: np.array\n", + " \"\"\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "r = quantized_model.vectorize(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "x = [i[0] for i in r]\n", + "y = [i[1] for i in r]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(89, 2)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.manifold import TSNE\n", + "import matplotlib.pyplot as plt\n", + "\n", + "tsne = TSNE().fit_transform(y)\n", + "tsne.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize = (7, 7))\n", + "plt.scatter(tsne[:, 0], tsne[:, 1])\n", + "labels = x\n", + "for label, x, y in zip(\n", + " labels, tsne[:, 0], tsne[:, 1]\n", + "):\n", + " label = (\n", + " '%s, %.3f' % (label[0], label[1])\n", + " if isinstance(label, list)\n", + " else label\n", + " )\n", + " plt.annotate(\n", + " label,\n", + " xy = (x, y),\n", + " xytext = (0, 0),\n", + " textcoords = 'offset points',\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/malaya/dependency.py b/malaya/dependency.py index c4497a92..3c4a362e 100644 --- a/malaya/dependency.py +++ b/malaya/dependency.py @@ -50,7 +50,7 @@ 'csubj:pass': 32, } -_transformer_availability = { +_transformer_availability_v1 = { 'bert': { 'Size (MB)': 426, 'Quantized Size (MB)': 112.0, @@ -66,18 +66,18 @@ 'Root Accuracy': 0.886, }, 'albert': { - 'Size (MB)': 60.8, - 'Quantized Size (MB)': 15.3, - 'Arc Accuracy': 0.821895, - 'Types Accuracy': 0.79752, - 'Root Accuracy': 1.0, + 'Size (MB)': 50, + 'Quantized Size (MB)': 13.2, + 'Arc Accuracy': 0.811, + 'Types Accuracy': 0.793, + 'Root Accuracy': 0.879, }, 'tiny-albert': { - 'Size (MB)': 33.4, - 'Quantized Size (MB)': 8.51, - 'Arc Accuracy': 0.7865, - 'Types Accuracy': 0.7587, - 'Root Accuracy': 1.0, + 'Size (MB)': 24.8, + 'Quantized Size (MB)': 6.6, + 'Arc Accuracy': 0.708, + 'Types Accuracy': 0.673, + 'Root Accuracy': 0.817, }, 'xlnet': { 'Size (MB)': 450.2, @@ -95,6 +95,60 @@ }, } +_transformer_availability_v2 = { + 'bert': { + 'Size (MB)': 455, 
+ 'Quantized Size (MB)': 114.0, + 'Arc Accuracy': 0.82045, + 'Types Accuracy': 0.79970, + 'Root Accuracy': 0.98936, + }, + 'tiny-bert': { + 'Size (MB)': 69.7, + 'Quantized Size (MB)': 17.5, + 'Arc Accuracy': 0.795252, + 'Types Accuracy': 0.72470, + 'Root Accuracy': 0.98939, + }, + 'albert': { + 'Size (MB)': 60.8, + 'Quantized Size (MB)': 15.3, + 'Arc Accuracy': 0.821895, + 'Types Accuracy': 0.79752, + 'Root Accuracy': 1.0, + }, + 'tiny-albert': { + 'Size (MB)': 33.4, + 'Quantized Size (MB)': 8.51, + 'Arc Accuracy': 0.7865, + 'Types Accuracy': 0.7587, + 'Root Accuracy': 1.0, + }, + 'xlnet': { + 'Size (MB)': 480.2, + 'Quantized Size (MB)': 121.0, + 'Arc Accuracy': 0.84811, + 'Types Accuracy': 0.82741, + 'Root Accuracy': 0.92101, + }, + 'alxlnet': { + 'Size (MB)': 61.2, + 'Quantized Size (MB)': 16.4, + 'Arc Accuracy': 0.84929, + 'Types Accuracy': 0.8281, + 'Root Accuracy': 0.92099, + }, +} + +_transformer_availability = {'v1': _transformer_availability_v1, 'v2': _transformer_availability_v2} + + +def _validate_version(version): + version = version.lower() + if version not in _transformer_availability: + raise ValueError('version not supported, only supported `v1` or `v2`.') + return version + def describe(): """ @@ -151,25 +205,40 @@ def dependency_graph(tagging, indexing): return DependencyGraph('\n'.join(result), top_relation_label='root') -def available_transformer(): +def available_transformer(version: str = 'v2'): """ List available transformer dependency parsing models. + + Parameters + ---------- + version : str, optional (default='v2') + Version supported. Allowed values: + + * ``'v1'`` - version 1, maintain for knowledge graph. + * ``'v2'`` - Trained on bigger dataset, better version. + """ from malaya.function import describe_availability return describe_availability( - _transformer_availability, text='tested on 20% test set.' + _transformer_availability[_validate_version(version)], text='tested on 20% test set.' 
) @check_type -def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs): +def transformer(version: str = 'v2', model: str = 'xlnet', quantized: bool = False, **kwargs): """ Load Transformer Dependency Parsing model, transfer learning Transformer + biaffine attention. Parameters ---------- - model : str, optional (default='bert') + version : str, optional (default='v2') + Version supported. Allowed values: + + * ``'v1'`` - version 1, maintain for knowledge graph. + * ``'v2'`` - Trained on bigger dataset, better version. + + model : str, optional (default='xlnet') Model architecture supported. Allowed values: * ``'bert'`` - Google BERT BASE parameters. @@ -192,15 +261,22 @@ def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs): * if `xlnet` in model, will return `malaya.model.xlnet.DependencyXLNET`. """ + version = _validate_version(version) model = model.lower() - if model not in _transformer_availability: + if model not in _transformer_availability[version]: raise ValueError( - 'model not supported, please check supported models from `malaya.dependency.available_transformer()`.' + f"model not supported, please check supported models from `malaya.dependency.available_transformer(version='{version}')`." 
) + module = 'dependency' + minus = 1 + if version != 'v1': + module = f'{module}-{version}' + minus = 2 + path = check_file( file=model, - module='dependency-v2', + module=module, keys={ 'model': 'model.pb', 'vocab': MODEL_VOCAB[model], @@ -240,4 +316,5 @@ def transformer(model: str = 'xlnet', quantized: bool = False, **kwargs): sess=generate_session(graph=g, **kwargs), tokenizer=tokenizer, settings=label, + minus=minus ) diff --git a/malaya/knowledge_graph.py b/malaya/knowledge_graph.py index 6c76e019..c1f78ef9 100644 --- a/malaya/knowledge_graph.py +++ b/malaya/knowledge_graph.py @@ -74,7 +74,7 @@ def parse_from_dependency(tagging: List[Tuple[str, str]], objects: List[List[str]] = [['obj', 'compound', 'flat', 'nmod', 'obl']], get_networkx: bool = True): """ - Generate knowledge graphs from dependency parsing. + Generate knowledge graphs from dependency parsing; we suggest using dependency parsing v1. Parameters ---------- @@ -147,14 +147,6 @@ def parse_from_dependency(tagging: List[Tuple[str, str]], obj = obj[1:] results.append({'subject': subject, 'relation': relation, 'object': obj}) - if d_object.nodes[i]['rel'] == 'appos': - subjects_, relations_ = [], [] - for s in subjects: - s_ = d_object.traverse_ancestor(i, s, initial_label=[d_object.nodes[i]['rel']]) - s_ = _combined(s_) - s_ = [c[1:] for c in s_] - subjects_.extend(s_) - post_results = [] for r in results: r = _postprocess(r) diff --git a/malaya/model/bert.py b/malaya/model/bert.py index 1f69b2b7..5df928de 100644 --- a/malaya/model/bert.py +++ b/malaya/model/bert.py @@ -923,7 +923,7 @@ def predict(self, string: str): class DependencyBERT(Base): - def __init__(self, input_nodes, output_nodes, sess, tokenizer, settings): + def __init__(self, input_nodes, output_nodes, sess, tokenizer, settings, minus): Base.__init__( self, input_nodes=input_nodes, @@ -935,6 +935,7 @@ def __init__(self, input_nodes, output_nodes, sess, tokenizer, settings): self._tag2idx = settings self._idx2tag = {int(v): k for k, v in 
self._tag2idx.items()} + self._minus = minus @check_type def vectorize(self, string: str): @@ -989,7 +990,7 @@ def predict(self, string: str): ) tagging, depend = r['logits'], r['heads_seq'] tagging = [self._idx2tag[i] for i in tagging[0]] - depend = depend[0] - 2 + depend = depend[0] - self._minus for i in range(len(depend)): if depend[i] == 0 and tagging[i] != 'root': diff --git a/malaya/model/xlnet.py b/malaya/model/xlnet.py index 841be6e3..b1cccb67 100644 --- a/malaya/model/xlnet.py +++ b/malaya/model/xlnet.py @@ -928,7 +928,7 @@ def predict(self, string: str): class DependencyXLNET(Base): - def __init__(self, input_nodes, output_nodes, sess, tokenizer, settings): + def __init__(self, input_nodes, output_nodes, sess, tokenizer, settings, minus): Base.__init__( self, input_nodes=input_nodes, @@ -940,6 +940,7 @@ def __init__(self, input_nodes, output_nodes, sess, tokenizer, settings): self._tag2idx = settings self._idx2tag = {int(v): k for k, v in self._tag2idx.items()} + self._minus = minus @check_type def vectorize(self, string: str): @@ -997,7 +998,7 @@ def predict(self, string: str): ) tagging, depend = r['logits'], r['heads_seq'] tagging = [self._idx2tag[i] for i in tagging[0]] - depend = depend[0] - 2 + depend = depend[0] - self._minus for i in range(len(depend)): if depend[i] == 0 and tagging[i] != 'root': diff --git a/malaya/train/__init__.py b/malaya/train/__init__.py deleted file mode 100644 index 437954b6..00000000 --- a/malaya/train/__init__.py +++ /dev/null @@ -1,155 +0,0 @@ -import tensorflow as tf -from tensorflow.python.distribute.cross_device_ops import ( - AllReduceCrossDeviceOps, -) -from tensorflow.python.estimator.run_config import RunConfig -from herpetologist import check_type -from typing import List, Dict -import numpy as np -import collections -import re - - -@check_type -def run_training( - train_fn, - model_fn, - model_dir: str, - num_gpus: int = 1, - gpu_mem_fraction: float = 0.95, - log_step: int = 100, - summary_step: int = 100, - 
save_checkpoint_step: int = 1000, - max_steps: int = 10000, - eval_step: int = 10, - eval_throttle: int = 120, - train_hooks=None, - eval_fn=None, -): - tf.logging.set_verbosity(tf.logging.INFO) - - if num_gpus > 1 and not use_tpu: - dist_strategy = tf.contrib.distribute.MirroredStrategy( - num_gpus=num_gpus, - auto_shard_dataset=True, - cross_device_ops=AllReduceCrossDeviceOps( - 'nccl', num_packs=num_gpus - ), - ) - else: - dist_strategy = None - - gpu_options = tf.GPUOptions( - per_process_gpu_memory_fraction=gpu_mem_fraction - ) - config = tf.ConfigProto( - allow_soft_placement=True, gpu_options=gpu_options - ) - run_config = RunConfig( - train_distribute=dist_strategy, - eval_distribute=dist_strategy, - log_step_count_steps=log_step, - model_dir=model_dir, - save_checkpoints_steps=save_checkpoint_step, - save_summary_steps=summary_step, - session_config=config, - ) - - estimator = tf.estimator.Estimator( - model_fn=model_fn, params={}, config=run_config - ) - - if eval_fn: - train_spec = tf.estimator.TrainSpec( - input_fn=train_fn, max_steps=max_steps, hooks=train_hooks - ) - - eval_spec = tf.estimator.EvalSpec( - input_fn=eval_fn, steps=eval_step, throttle_secs=eval_throttle - ) - tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec) - - else: - estimator.train( - input_fn=train_fn, max_steps=max_steps, hooks=train_hooks - ) - - -@check_type -def prepare_dataset( - generator, - data_dir: str, - shards: List[Dict], - prefix: str = 'dataset', - shuffle: bool = True, - already_shuffled: bool = False, -): - prepare_data.check_shard(shards) - filepath_fns = { - 'train': prepare_data.training_filepaths, - 'dev': prepare_data.dev_filepaths, - 'test': prepare_data.test_filepaths, - } - - split_paths = [ - ( - split['split'], - filepath_fns[split['split']]( - prefix, data_dir, split['shards'], shuffled=already_shuffled - ), - ) - for split in shards - ] - all_paths = [] - for _, paths in split_paths: - all_paths.extend(paths) - - 
prepare_data.generate_files(generator, all_paths) - - if shuffle: - prepare_data.shuffle_dataset(all_paths) - - -def get_assignment_map_from_checkpoint(tvars, init_checkpoint, logging=True): - """Compute the union of the current variables and checkpoint variables.""" - assignment_map = {} - initialized_variable_names = {} - - name_to_variable = collections.OrderedDict() - for var in tvars: - name = var.name - m = re.match('^(.*):\\d+$', name) - if m is not None: - name = m.group(1) - name_to_variable[name] = var - - init_vars = tf.train.list_variables(init_checkpoint) - - assignment_map = collections.OrderedDict() - for x in init_vars: - (name, var) = (x[0], x[1]) - if name not in name_to_variable: - continue - - assignment_map[name] = name - assignment_map[name] = name_to_variable[name] - initialized_variable_names[name] = 1 - initialized_variable_names[name + ':0'] = 1 - - if logging: - tf.logging.info('**** Trainable Variables ****') - for var in tvars: - init_string = '' - if var.name in initialized_variable_names: - init_string = ', *INIT_FROM_CKPT*' - tf.logging.info( - ' name = %s, shape = %s%s', var.name, var.shape, init_string - ) - - return (assignment_map, initialized_variable_names) - - -def calculate_parameters(variables): - return np.sum( - [np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()] - ) diff --git a/malaya/train/model/__init__.py b/malaya/train/model/__init__.py deleted file mode 100644 index fb00b5ac..00000000 --- a/malaya/train/model/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from . import alxlnet -from . import bigbird -from . 
import pegasus diff --git a/malaya/train/model/alxlnet/__init__.py b/malaya/train/model/alxlnet/__init__.py deleted file mode 100644 index 792d6005..00000000 --- a/malaya/train/model/alxlnet/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# diff --git a/malaya/train/model/bigbird/__init__.py b/malaya/train/model/bigbird/__init__.py deleted file mode 100644 index 792d6005..00000000 --- a/malaya/train/model/bigbird/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# diff --git a/malaya/train/model/bigbird/attention.py b/malaya/train/model/bigbird/attention.py deleted file mode 100644 index 48e78be8..00000000 --- a/malaya/train/model/bigbird/attention.py +++ /dev/null @@ -1,1279 +0,0 @@ -# Copyright 2020 The BigBird Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""BigBird Attention Layers.""" - -from absl import logging -from . import utils -import numpy as np -import tensorflow as tf - - -MAX_SEQ_LEN = 4096 - - -def get_single_block_row_attention( - block_id, - to_start_block_id, - to_end_block_id, - num_rand_blocks, - window_block_left=1, - window_block_right=1, - global_block_left=1, - global_block_right=1, -): - """For a single row block get random row attention. - - Args: - block_id: int. block id of row. - to_start_block_id: int. random attention coloum start id. - to_end_block_id: int. random attention coloum end id. - num_rand_blocks: int. number of random blocks to be selected. - window_block_left: int. number of blocks of window to left of a block. 
- window_block_right: int. number of blocks of window to right of a block. - global_block_left: int. Number of blocks globally used to the left. - global_block_right: int. Number of blocks globally used to the right. - - Returns: - row containing the random attention vector of size num_rand_blocks. - """ - - # list of to_blocks from which to choose random attention - to_block_list = np.arange( - to_start_block_id, to_end_block_id, dtype=np.int32 - ) - # permute the blocks - perm_block = np.random.permutation(to_block_list) - # print(perm_block) - - # illegal blocks for the current block id, using window - illegal_blocks = list( - range(block_id - window_block_left, block_id + window_block_right + 1) - ) - - # Add blocks at the start and at the end - illegal_blocks.extend(list(range(global_block_left))) - illegal_blocks.extend( - list(range(to_end_block_id - global_block_right, to_end_block_id)) - ) - - # The second from_block cannot choose random attention on second last to_block - if block_id == 1: - illegal_blocks.append(to_end_block_id - 2) - - # The second last from_block cannot choose random attention on second to_block - if block_id == to_end_block_id - 2: - illegal_blocks.append(1) - - selected_random_blokcs = [] - - for i in range(to_end_block_id - to_start_block_id): - if perm_block[i] not in illegal_blocks: - selected_random_blokcs.append(perm_block[i]) - if len(selected_random_blokcs) == num_rand_blocks: - break - return np.array(selected_random_blokcs, dtype=np.int32) - - -def bigbird_block_rand_mask_with_head( - from_seq_length, - to_seq_length, - from_block_size, - to_block_size, - num_heads, - plan_from_length, - plan_num_rand_blocks, - window_block_left=1, - window_block_right=1, - global_block_top=1, - global_block_bottom=1, - global_block_left=1, - global_block_right=1, -): - """Create adjacency list of random attention. - - Args: - from_seq_length: int. length of from sequence. - to_seq_length: int. length of to sequence. - from_block_size: int. 
size of block in from sequence. - to_block_size: int. size of block in to sequence. - num_heads: int. total number of heads. - plan_from_length: list. plan from lenght where num_rand are choosen from. - plan_num_rand_blocks: list. number of rand blocks within the plan. - window_block_left: int. number of blocks of window to left of a block. - window_block_right: int. number of blocks of window to right of a block. - global_block_top: int. number of blocks at the top. - global_block_bottom: int. number of blocks at the bottom. - global_block_left: int. Number of blocks globally used to the left. - global_block_right: int. Number of blocks globally used to the right. - - Returns: - adjacency list of size num_head where each element is of size - from_seq_length//from_block_size-2 by num_rand_blocks - """ - assert ( - from_seq_length // from_block_size == to_seq_length // to_block_size - ), 'Error the number of blocks needs to be same!' - - assert ( - from_seq_length in plan_from_length - ), 'Error from sequence length not in plan!' 
- - # Total number of blocks in the mmask - num_blocks = from_seq_length // from_block_size - # Number of blocks per plan - plan_block_length = np.array(plan_from_length) // from_block_size - # till when to follow plan - max_plan_idx = plan_from_length.index(from_seq_length) - # Random Attention adjajency list - rand_attn = [ - np.zeros( - (num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), - dtype=np.int32, - ) - for i in range(num_heads) - ] - - # We will go iteratively over the plan blocks and pick random number of - # Attention blocks from the legally allowed blocks - for plan_idx in range(max_plan_idx + 1): - rnd_r_cnt = 0 - if plan_idx > 0: - # set the row for all from_blocks starting from 0 to - # plan_block_length[plan_idx-1] - # column indx start fromm plan_block_length[plan_idx-1] and ends at - # plan_block_length[plan_idx] - if plan_num_rand_blocks[plan_idx] > 0: - rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) - curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) - for blk_rw_idx in range( - global_block_top, plan_block_length[plan_idx - 1] - ): - for h in range(num_heads): - # print("head", h, "blk_rw_idx", blk_rw_idx) - rand_attn[h][ - blk_rw_idx, rnd_r_cnt:curr_r_cnt - ] = get_single_block_row_attention( - block_id=blk_rw_idx, - to_start_block_id=plan_block_length[plan_idx - 1], - to_end_block_id=plan_block_length[plan_idx], - num_rand_blocks=plan_num_rand_blocks[plan_idx], - window_block_left=window_block_left, - window_block_right=window_block_right, - global_block_left=global_block_left, - global_block_right=global_block_right, - ) - - for pl_id in range(plan_idx): - if plan_num_rand_blocks[pl_id] == 0: - continue - for blk_rw_idx in range( - plan_block_length[plan_idx - 1], plan_block_length[plan_idx] - ): - rnd_r_cnt = 0 - to_start_block_id = 0 - if pl_id > 0: - rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:pl_id])) - to_start_block_id = plan_block_length[pl_id - 1] - curr_r_cnt = int(np.sum(plan_num_rand_blocks[: 
pl_id + 1])) - for h in range(num_heads): - # print("head", h, "blk_rw_idx", blk_rw_idx) - rand_attn[h][ - blk_rw_idx, rnd_r_cnt:curr_r_cnt - ] = get_single_block_row_attention( - block_id=blk_rw_idx, - to_start_block_id=to_start_block_id, - to_end_block_id=plan_block_length[pl_id], - num_rand_blocks=plan_num_rand_blocks[pl_id], - window_block_left=window_block_left, - window_block_right=window_block_right, - global_block_left=global_block_left, - global_block_right=global_block_right, - ) - - if plan_num_rand_blocks[plan_idx] == 0: - continue - # print("Start from here") - curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) - from_start_block_id = global_block_top - to_start_block_id = 0 - if plan_idx > 0: - rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) - from_start_block_id = plan_block_length[plan_idx - 1] - to_start_block_id = plan_block_length[plan_idx - 1] - - for blk_rw_idx in range( - from_start_block_id, plan_block_length[plan_idx] - ): - for h in range(num_heads): - # print("head", h, "blk_rw_idx", blk_rw_idx) - rand_attn[h][ - blk_rw_idx, rnd_r_cnt:curr_r_cnt - ] = get_single_block_row_attention( - block_id=blk_rw_idx, - to_start_block_id=to_start_block_id, - to_end_block_id=plan_block_length[plan_idx], - num_rand_blocks=plan_num_rand_blocks[plan_idx], - window_block_left=window_block_left, - window_block_right=window_block_right, - global_block_left=global_block_left, - global_block_right=global_block_right, - ) - - for nh in range(num_heads): - rand_attn[nh] = rand_attn[nh][ - global_block_top: num_blocks - global_block_bottom, : - ] - return rand_attn - - -def get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): - """Gives the plan of where to put random attention. - - Args: - from_seq_length: int. length of from sequence. - from_block_size: int. size of block in from sequence. - num_rand_blocks: int. Number of random chunks per row. 
- - Returns: - plan_from_length: ending location of from block - plan_num_rand_blocks: number of random ending location for each block - """ - # general plan - plan_from_length = [] - plan_num_rand_blocks = [] - if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size): - plan_from_length.append( - int((2 * num_rand_blocks + 5) * from_block_size) - ) - plan_num_rand_blocks.append(num_rand_blocks) - plan_from_length.append(from_seq_length) - plan_num_rand_blocks.append(0) - elif (num_rand_blocks + 5) < (from_seq_length // from_block_size): - plan_from_length.append(int((num_rand_blocks + 5) * from_block_size)) - plan_num_rand_blocks.append(num_rand_blocks // 2) - plan_from_length.append(from_seq_length) - plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2)) - else: - plan_from_length.append(from_seq_length) - plan_num_rand_blocks.append(num_rand_blocks) - - return plan_from_length, plan_num_rand_blocks - - -def bigbird_block_rand_mask( - from_seq_length, - to_seq_length, - from_block_size, - to_block_size, - num_rand_blocks, - last_idx=-1, -): - """Create adjacency list of random attention. - - Args: - from_seq_length: int. length of from sequence. - to_seq_length: int. length of to sequence. - from_block_size: int. size of block in from sequence. - to_block_size: int. size of block in to sequence. - num_rand_blocks: int. Number of random chunks per row. - last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence, - if positive then num_rand_blocks blocks choosen only upto last_idx. - - Returns: - adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks - """ - assert ( - from_seq_length // from_block_size == to_seq_length // to_block_size - ), 'Error the number of blocks needs to be same!' 
- - rand_attn = np.zeros( - (from_seq_length // from_block_size - 2, num_rand_blocks), - dtype=np.int32, - ) - middle_seq = np.arange( - 1, to_seq_length // to_block_size - 1, dtype=np.int32 - ) - last = to_seq_length // to_block_size - 1 - if last_idx > (2 * to_block_size): - last = (last_idx // to_block_size) - 1 - - r = num_rand_blocks # shorthand - for i in range(1, from_seq_length // from_block_size - 1): - start = i - 2 - end = i - if i == 1: - rand_attn[i - 1, :] = np.random.permutation(middle_seq[2:last])[:r] - elif i == 2: - rand_attn[i - 1, :] = np.random.permutation(middle_seq[3:last])[:r] - elif i == from_seq_length // from_block_size - 3: - rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] - # Missing -3: should have been sliced till last-3 - elif i == from_seq_length // from_block_size - 2: - rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] - # Missing -4: should have been sliced till last-4 - else: - if start > last: - start = last - rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[ - :r - ] - elif (end + 1) == last: - rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[ - :r - ] - else: - rand_attn[i - 1, :] = np.random.permutation( - np.concatenate( - (middle_seq[:start], middle_seq[end + 1: last]) - ) - )[:r] - return rand_attn - - -def full_bigbird_mask( - from_seq_length, - to_seq_length, - from_block_size, - to_block_size, - num_rand_blocks, - rand_attn=None, - focus=1024, -): - """Calculate BigBird attention pattern as a full dense matrix. - - Args: - from_seq_length: int. length of from sequence. - to_seq_length: int. length of to sequence. - from_block_size: int. size of block in from sequence. - to_block_size: int. size of block in to sequence. - num_rand_blocks: int. Number of random chunks per row. - rand_attn: adjajency matrix for random attention. 
- focus: pick random mask within focus - - Returns: - attention mask matrix of shape [from_seq_length, to_seq_length] - """ - if rand_attn is None: - rand_attn = bigbird_block_rand_mask( - MAX_SEQ_LEN, - MAX_SEQ_LEN, - from_block_size, - to_block_size, - num_rand_blocks, - focus, - ) - - attn_mask = np.zeros((MAX_SEQ_LEN, MAX_SEQ_LEN), dtype=np.int32) - for i in range(1, (MAX_SEQ_LEN // from_block_size) - 1): - attn_mask[ - (i) * from_block_size: (i + 1) * from_block_size, - (i - 1) * to_block_size: (i + 2) * to_block_size, - ] = 1 - for j in rand_attn[i - 1, :]: - attn_mask[ - i * from_block_size: (i + 1) * from_block_size, - j * to_block_size: (j + 1) * to_block_size, - ] = 1 - - attn_mask[:from_block_size, :] = 1 - attn_mask[:, :to_block_size] = 1 - attn_mask[:, -to_block_size:] = 1 - attn_mask[-from_block_size:, :] = 1 - clipped_attn_mask = attn_mask[:from_seq_length, :to_seq_length] - return np.array(clipped_attn_mask, dtype=bool) - - -def create_rand_mask_from_inputs( - from_blocked_mask, - to_blocked_mask, - rand_attn, - num_attention_heads, - num_rand_blocks, - batch_size, - from_seq_length, - from_block_size, -): - """Create 3D attention mask from a 2D tensor mask. - - Args: - from_blocked_mask: 2D Tensor of shape [batch_size, - from_seq_length//from_block_size, from_block_size]. - to_blocked_mask: int32 Tensor of shape [batch_size, - to_seq_length//to_block_size, to_block_size]. - rand_attn: [batch_size, num_attention_heads, - from_seq_length//from_block_size-2, num_rand_blocks] - num_attention_heads: int. Number of attention heads. - num_rand_blocks: int. Number of random chunks per row. - batch_size: int. Batch size for computation. - from_seq_length: int. length of from sequence. - from_block_size: int. size of block in from sequence. - - Returns: - float Tensor of shape [batch_size, num_attention_heads, - from_seq_length//from_block_size-2, - from_block_size, num_rand_blocks*to_block_size]. 
- """ - num_windows = from_seq_length // from_block_size - 2 - rand_mask = tf.reshape( - tf.gather(to_blocked_mask, rand_attn, batch_dims=1), - [ - batch_size, - num_attention_heads, - num_windows, - num_rand_blocks * from_block_size, - ], - ) - rand_mask = tf.einsum( - 'BLQ,BHLK->BHLQK', from_blocked_mask[:, 1:-1], rand_mask - ) - return rand_mask - - -def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): - """Create 3D attention mask from a 2D tensor mask. - - Args: - from_blocked_mask: 2D Tensor of shape [batch_size, - from_seq_length//from_block_size, from_block_size]. - to_blocked_mask: int32 Tensor of shape [batch_size, - to_seq_length//to_block_size, to_block_size]. - - Returns: - float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, - from_block_size, 3*to_block_size]. - """ - exp_blocked_to_pad = tf.concat( - [ - to_blocked_mask[:, 1:-3], - to_blocked_mask[:, 2:-2], - to_blocked_mask[:, 3:-1], - ], - 2, - ) - band_mask = tf.einsum( - 'BLQ,BLK->BLQK', - tf.cast(from_blocked_mask[:, 2:-2], tf.float32), - tf.cast(exp_blocked_to_pad, tf.float32), - ) - band_mask = tf.expand_dims(band_mask, 1) - return band_mask - - -def create_attention_mask_from_input_mask(from_mask, to_mask): - """Create attention mask from a 2D tensor mask. - - Args: - from_mask: int32 Tensor of shape [batch_size, from_seq_length]. - to_mask: int32 Tensor of shape [batch_size, to_seq_length]. - - Returns: - int32 Tensor of shape [batch_size, 1, from_seq_length, to_seq_length]. - """ - mask = tf.einsum('BF,BT->BFT', from_mask, to_mask) - - # expand to create a slot for heads. - mask = tf.expand_dims(mask, 1) - - return mask - - -def original_full_attention( - query_layer, - key_layer, - value_layer, - attention_mask, - size_per_head, - attention_probs_dropout_prob, -): - """Full quadratic attention calculation. 
- - Args: - query_layer: float Tensor of shape [batch_size, num_attention_heads, - from_seq_length, size_per_head] - key_layer: float Tensor of shape [batch_size, num_attention_heads, - to_seq_length, size_per_head] - value_layer: float Tensor of shape [batch_size, num_attention_heads, - to_seq_length, size_per_head] - attention_mask: (optional) int32 Tensor of shape [batch_size, - from_seq_length, to_seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions in - the mask that are 0, and will be unchanged for positions that are 1. - size_per_head: (optional) int. Size of each attention head. - attention_probs_dropout_prob: (optional) float. Dropout probability of the - attention probabilities. - - Returns: - float Tensor of shape [batch_size, from_seq_length, num_attention_heads, - size_per_head]. - """ - - # Directly take n^2 dot product between "query" and "key". - attention_scores = tf.einsum('BNFH,BNTH->BNFT', query_layer, key_layer) - attention_scores = tf.multiply( - attention_scores, 1.0 / np.sqrt(float(size_per_head)) - ) - - if attention_mask is not None: - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 - - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_scores += adder - - # Normalize the attention scores to probabilities. - # `attention_probs` = [B, N, F, T] - attention_probs = tf.nn.softmax(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = utils.dropout( - attention_probs, attention_probs_dropout_prob - ) - - # `context_layer` = [B, F, N, H] - context_layer = tf.einsum('BNFT,BNTH->BFNH', attention_probs, value_layer) - return context_layer - - -def bigbird_simulated_attention( - query_layer, - key_layer, - value_layer, - attention_mask, - num_attention_heads, - num_rand_blocks, - size_per_head, - from_seq_length, - to_seq_length, - from_block_size, - to_block_size, - seed=None, -): - """BigBird attention calculation using masks in quadratic time. - - Args: - query_layer: float Tensor of shape [batch_size, num_attention_heads, - from_seq_length, size_per_head] - key_layer: float Tensor of shape [batch_size, num_attention_heads, - to_seq_length, size_per_head] - value_layer: float Tensor of shape [batch_size, num_attention_heads, - to_seq_length, size_per_head] - attention_mask: int32 Tensor of shape [batch_size, - from_seq_length, to_seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions in - the mask that are 0, and will be unchanged for positions that are 1. - num_attention_heads: int. Number of attention heads. - num_rand_blocks: int. Number of random chunks per row. - size_per_head: int. Size of each attention head. - from_seq_length: int. length of from sequence. - to_seq_length: int. length of to sequence. - from_block_size: int. size of block in from sequence. - to_block_size: int. size of block in to sequence. - seed: (Optional) int. Reandom seed for generating random mask. - - Returns: - float Tensor of shape [batch_size, from_seq_length, num_attention_heads, - size_per_head]. 
- """ - - if seed: - np.random.seed(seed) - - plan_from_length, plan_num_rand_blocks = get_rand_attn_plan( - from_seq_length, from_block_size, num_rand_blocks - ) - - rand_attn = bigbird_block_rand_mask_with_head( - from_seq_length=from_seq_length, - to_seq_length=to_seq_length, - from_block_size=from_block_size, - to_block_size=to_block_size, - num_heads=num_attention_heads, - plan_from_length=plan_from_length, - plan_num_rand_blocks=plan_num_rand_blocks, - ) - temp_mask = [ - full_bigbird_mask( # pylint: disable=g-complex-comprehension - from_seq_length, - to_seq_length, - from_block_size, - to_block_size, - num_rand_blocks, - rand_attn=rand_attn[i], - focus=1024, - ) - for i in range(num_attention_heads) - ] - temp_mask = np.stack(temp_mask, axis=0) - temp_mask = np.array(temp_mask, dtype=bool) - - rand_block_mask = tf.constant(temp_mask, dtype=tf.bool) # [N, F, T] - rand_block_mask = tf.cast(rand_block_mask, tf.int32) - rand_block_mask = tf.expand_dims(rand_block_mask, 0) # [1, N, F, T] - if attention_mask is not None: - attention_mask = tf.minimum(attention_mask, rand_block_mask) - else: - attention_mask = rand_block_mask - return original_full_attention( - query_layer, - key_layer, - value_layer, - attention_mask, - size_per_head, - attention_probs_dropout_prob=0.0, - ) - - -def bigbird_block_sparse_attention( - query_layer, - key_layer, - value_layer, - band_mask, - from_mask, - to_mask, - from_blocked_mask, - to_blocked_mask, - num_attention_heads, - num_rand_blocks, - size_per_head, - batch_size, - from_seq_length, - to_seq_length, - from_block_size, - to_block_size, - seed=None, - plan_from_length=None, - plan_num_rand_blocks=None, -): - """BigBird attention sparse calculation using blocks in linear time. - - Assumes from_seq_length//from_block_size == to_seq_length//to_block_size. 
- - - Args: - query_layer: float Tensor of shape [batch_size, num_attention_heads, - from_seq_length, size_per_head] - key_layer: float Tensor of shape [batch_size, num_attention_heads, - to_seq_length, size_per_head] - value_layer: float Tensor of shape [batch_size, num_attention_heads, - to_seq_length, size_per_head] - band_mask: (optional) int32 Tensor of shape [batch_size, 1, - from_seq_length//from_block_size-4, from_block_size, 3*to_block_size]. - The values should be 1 or 0. The attention scores will effectively be - set to -infinity for any positions in the mask that are 0, and will be - unchanged for positions that are 1. - from_mask: (optional) int32 Tensor of shape [batch_size, 1, - from_seq_length, 1]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions in - the mask that are 0, and will be unchanged for positions that are 1. - to_mask: (optional) int32 Tensor of shape [batch_size, 1, 1, - to_seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions in - the mask that are 0, and will be unchanged for positions that are 1. - from_blocked_mask: (optional) int32 Tensor of shape [batch_size, - from_seq_length//from_block_size, from_block_size]. - Same as from_mask, just reshaped. - to_blocked_mask: (optional) int32 Tensor of shape [batch_size, - to_seq_length//to_block_size, to_block_size]. - Same as to_mask, just reshaped. - num_attention_heads: int. Number of attention heads. - num_rand_blocks: int. Number of random chunks per row. - size_per_head: int. Size of each attention head. - batch_size: int. Batch size for computation. - from_seq_length: int. length of from sequence. - to_seq_length: int. length of to sequence. - from_block_size: int. size of block in from sequence. - to_block_size: int. size of block in to sequence. - seed: (Optional) int. Reandom seed for generating random mask. - plan_from_length: (Optional) list. 
Plan of where to put random attn. It - divides the block matrix into chuncks, where each chunck will have - some randomm attn. - plan_num_rand_blocks: (Optional) list. Number of random per block given by - plan_from_length. - - Returns: - float Tensor of shape [batch_size, from_seq_length, num_attention_heads, - size_per_head]. - """ - assert from_seq_length // from_block_size == to_seq_length // to_block_size - - # cast masks to float - from_mask = tf.cast(from_mask, tf.float32) - to_mask = tf.cast(to_mask, tf.float32) - band_mask = tf.cast(band_mask, tf.float32) - from_blocked_mask = tf.cast(from_blocked_mask, tf.float32) - to_blocked_mask = tf.cast(to_blocked_mask, tf.float32) - - # generate random attention and corresponding masks - np.random.seed(seed) - if from_seq_length in [1024, 3072, 4096]: # old plans used in paper - rand_attn = [ - bigbird_block_rand_mask( # pylint: disable=g-complex-comprehension - MAX_SEQ_LEN, - MAX_SEQ_LEN, - from_block_size, - to_block_size, - num_rand_blocks, - last_idx=1024, - )[: (from_seq_length // from_block_size - 2)] - for _ in range(num_attention_heads) - ] - else: - if plan_from_length is None: - plan_from_length, plan_num_rand_blocks = get_rand_attn_plan( - from_seq_length, from_block_size, num_rand_blocks - ) - - rand_attn = bigbird_block_rand_mask_with_head( - from_seq_length=from_seq_length, - to_seq_length=to_seq_length, - from_block_size=from_block_size, - to_block_size=to_block_size, - num_heads=num_attention_heads, - plan_from_length=plan_from_length, - plan_num_rand_blocks=plan_num_rand_blocks, - ) - rand_attn = np.stack(rand_attn, axis=0) - rand_attn = tf.constant(rand_attn, dtype=tf.int32) - rand_attn = tf.expand_dims(rand_attn, 0) - rand_attn = tf.repeat(rand_attn, batch_size, 0) - - rand_mask = create_rand_mask_from_inputs( - from_blocked_mask, - to_blocked_mask, - rand_attn, - num_attention_heads, - num_rand_blocks, - batch_size, - from_seq_length, - from_block_size, - ) - - # Define shorthands - h = 
num_attention_heads - r = num_rand_blocks - d = size_per_head - b = batch_size - m = from_seq_length - n = to_seq_length - wm = from_block_size - wn = to_block_size - - blocked_query_matrix = tf.reshape(query_layer, (b, h, m // wm, wm, -1)) - blocked_key_matrix = tf.reshape(key_layer, (b, h, n // wn, wn, -1)) - blocked_value_matrix = tf.reshape(value_layer, (b, h, n // wn, wn, -1)) - gathered_key = tf.reshape( - tf.gather( - blocked_key_matrix, rand_attn, batch_dims=2, name='gather_key' - ), - (b, h, m // wm - 2, r * wn, -1), - ) # [b, h, n//wn-2, r, wn, -1] - gathered_value = tf.reshape( - tf.gather( - blocked_value_matrix, - rand_attn, - batch_dims=2, - name='gather_value', - ), - (b, h, m // wm - 2, r * wn, -1), - ) # [b, h, n//wn-2, r, wn, -1] - - first_product = tf.einsum( - 'BHQD,BHKD->BHQK', blocked_query_matrix[:, :, 0], key_layer - ) # [b, h, wm, -1] x [b, h, n, -1] ==> [b, h, wm, n] - first_product = tf.multiply(first_product, 1.0 / np.sqrt(d)) - first_product += (1.0 - to_mask) * -10000.0 - first_attn_weights = tf.nn.softmax(first_product) # [b, h, wm, n] - first_context_layer = tf.einsum( - 'BHQK,BHKD->BHQD', first_attn_weights, value_layer - ) # [b, h, wm, n] x [b, h, n, -1] ==> [b, h, wm, -1] - first_context_layer = tf.expand_dims(first_context_layer, 2) - - second_key_mat = tf.concat( - [ - blocked_key_matrix[:, :, 0], - blocked_key_matrix[:, :, 1], - blocked_key_matrix[:, :, 2], - blocked_key_matrix[:, :, -1], - gathered_key[:, :, 0], - ], - 2, - ) # [b, h, (4+r)*wn, -1] - second_value_mat = tf.concat( - [ - blocked_value_matrix[:, :, 0], - blocked_value_matrix[:, :, 1], - blocked_value_matrix[:, :, 2], - blocked_value_matrix[:, :, -1], - gathered_value[:, :, 0], - ], - 2, - ) # [b, h, (4+r)*wn, -1] - second_product = tf.einsum( - 'BHQD,BHKD->BHQK', blocked_query_matrix[:, :, 1], second_key_mat - ) # [b, h, wm, -1] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, (4+r)*wn] - second_seq_pad = tf.concat( - [ - to_mask[:, :, :, : 3 * wn], - to_mask[:, :, :, 
-wn:], - tf.ones([b, 1, 1, r * wn], dtype=tf.float32), - ], - 3, - ) - second_rand_pad = tf.concat( - [tf.ones([b, h, wm, 4 * wn], dtype=tf.float32), rand_mask[:, :, 0]], 3 - ) - second_product = tf.multiply(second_product, 1.0 / np.sqrt(d)) - second_product += ( - 1.0 - tf.minimum(second_seq_pad, second_rand_pad) - ) * -10000.0 - second_attn_weights = tf.nn.softmax(second_product) # [b , h, wm, (4+r)*wn] - second_context_layer = tf.einsum( - 'BHQK,BHKD->BHQD', second_attn_weights, second_value_mat - ) # [b, h, wm, (4+r)*wn] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, -1] - second_context_layer = tf.expand_dims(second_context_layer, 2) - - exp_blocked_key_matrix = tf.concat( - [ - blocked_key_matrix[:, :, 1:-3], - blocked_key_matrix[:, :, 2:-2], - blocked_key_matrix[:, :, 3:-1], - ], - 3, - ) # [b, h, m//wm-4, 3*wn, -1] - exp_blocked_value_matrix = tf.concat( - [ - blocked_value_matrix[:, :, 1:-3], - blocked_value_matrix[:, :, 2:-2], - blocked_value_matrix[:, :, 3:-1], - ], - 3, - ) # [b, h, m//wm-4, 3*wn, -1] - middle_query_matrix = blocked_query_matrix[:, :, 2:-2] - inner_band_product = tf.einsum( - 'BHLQD,BHLKD->BHLQK', middle_query_matrix, exp_blocked_key_matrix - ) # [b, h, m//wm-4, wm, -1] x [b, h, m//wm-4, 3*wn, -1] - # ==> [b, h, m//wm-4, wm, 3*wn] - inner_band_product = tf.multiply(inner_band_product, 1.0 / np.sqrt(d)) - rand_band_product = tf.einsum( - 'BHLQD,BHLKD->BHLQK', middle_query_matrix, gathered_key[:, :, 1:-1] - ) # [b, h, m//wm-4, wm, -1] x [b, h, m//wm-4, r*wn, -1] - # ==> [b, h, m//wm-4, wm, r*wn] - rand_band_product = tf.multiply(rand_band_product, 1.0 / np.sqrt(d)) - first_band_product = tf.einsum( - 'BHLQD,BHKD->BHLQK', middle_query_matrix, blocked_key_matrix[:, :, 0] - ) # [b, h, m//wm-4, wm, -1] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, wn] - first_band_product = tf.multiply(first_band_product, 1.0 / np.sqrt(d)) - last_band_product = tf.einsum( - 'BHLQD,BHKD->BHLQK', middle_query_matrix, blocked_key_matrix[:, :, -1] - ) # [b, h, m//wm-4, wm, -1] 
x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, wn] - last_band_product = tf.multiply(last_band_product, 1.0 / np.sqrt(d)) - inner_band_product += (1.0 - band_mask) * -10000.0 - first_band_product += ( - 1.0 - tf.expand_dims(to_mask[:, :, :, :wn], 3) - ) * -10000.0 - last_band_product += ( - 1.0 - tf.expand_dims(to_mask[:, :, :, -wn:], 3) - ) * -10000.0 - rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * -10000.0 - band_product = tf.concat( - [ - first_band_product, - inner_band_product, - rand_band_product, - last_band_product, - ], - -1, - ) # [b, h, m//wm-4, wm, (5+r)*wn] - attn_weights = tf.nn.softmax(band_product) # [b, h, m//wm-4, wm, (5+r)*wn] - context_layer = tf.einsum( - 'BHLQK,BHLKD->BHLQD', - attn_weights[:, :, :, :, wn: 4 * wn], - exp_blocked_value_matrix, - ) # [b, h, m//wm-4, wm, 3*wn] x [b, h, m//wm-4, 3*wn, -1] - # ==> [b, h, m//wm-4, wm, -1] - context_layer += tf.einsum( - 'BHLQK,BHLKD->BHLQD', - attn_weights[:, :, :, :, 4 * wn: -wn], - gathered_value[:, :, 1:-1], - ) # [b, h, m//wm-4, wm, r*wn] x [b, h, m//wm-4, r*wn, -1] - # ==> [b, h, m//wm-4, wm, -1] - context_layer += tf.einsum( - 'BHLQK,BHKD->BHLQD', - attn_weights[:, :, :, :, :wn], - blocked_value_matrix[:, :, 0], - ) # [b, h, m//wm-4, wm, wn] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, -1] - context_layer += tf.einsum( - 'BHLQK,BHKD->BHLQD', - attn_weights[:, :, :, :, -wn:], - blocked_value_matrix[:, :, -1], - ) # [b, h, m//wm-4, wm, wn] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, -1] - - second_last_key_mat = tf.concat( - [ - blocked_key_matrix[:, :, 0], - blocked_key_matrix[:, :, -3], - blocked_key_matrix[:, :, -2], - blocked_key_matrix[:, :, -1], - gathered_key[:, :, -1], - ], - 2, - ) # [b, h, (4+r)*wn, -1] - second_last_value_mat = tf.concat( - [ - blocked_value_matrix[:, :, 0], - blocked_value_matrix[:, :, -3], - blocked_value_matrix[:, :, -2], - blocked_value_matrix[:, :, -1], - gathered_value[:, :, -1], - ], - 2, - ) # [b, h, (4+r)*wn, -1] - second_last_product = tf.einsum( - 
'BHQD,BHKD->BHQK', blocked_query_matrix[:, :, -2], second_last_key_mat - ) # [b, h, wm, -1] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, (4+r)*wn] - second_last_seq_pad = tf.concat( - [ - to_mask[:, :, :, :wn], - to_mask[:, :, :, -3 * wn:], - tf.ones([b, 1, 1, r * wn], dtype=tf.float32), - ], - 3, - ) - second_last_rand_pad = tf.concat( - [tf.ones([b, h, wm, 4 * wn], dtype=tf.float32), rand_mask[:, :, -1]], - 3, - ) - second_last_product = tf.multiply(second_last_product, 1.0 / np.sqrt(d)) - second_last_product += ( - 1.0 - tf.minimum(second_last_seq_pad, second_last_rand_pad) - ) * -10000.0 - second_last_attn_weights = tf.nn.softmax( - second_last_product - ) # [b, h, wm, (4+r)*wn] - second_last_context_layer = tf.einsum( - 'BHQK,BHKD->BHQD', second_last_attn_weights, second_last_value_mat - ) # [b, h, wm, (4+r)*wn] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, -1] - second_last_context_layer = tf.expand_dims(second_last_context_layer, 2) - - last_product = tf.einsum( - 'BHQD,BHKD->BHQK', blocked_query_matrix[:, :, -1], key_layer - ) # [b, h, wm, -1] x [b, h, n, -1] ==> [b, h, wm, n] - last_product = tf.multiply(last_product, 1.0 / np.sqrt(d)) - last_product += (1.0 - to_mask) * -10000.0 - last_attn_weights = tf.nn.softmax(last_product) # [b, h, wm, n] - last_context_layer = tf.einsum( - 'BHQK,BHKD->BHQD', last_attn_weights, value_layer - ) # [b, h, wm, n] x [b, h, n, -1] ==> [b, h, wm, -1] - last_context_layer = tf.expand_dims(last_context_layer, 2) - - context_layer = tf.concat( - [ - first_context_layer, - second_context_layer, - context_layer, - second_last_context_layer, - last_context_layer, - ], - 2, - ) - context_layer = tf.reshape(context_layer, (b, h, m, -1)) * from_mask - context_layer = tf.transpose(context_layer, (0, 2, 1, 3)) - return context_layer - - -class MultiHeadedAttentionLayer(tf.compat.v1.layers.Layer): - """A multi-headed attention layer. 
- - It implements following types of multi-headed attention: - - original_full attention from "Attention is all you Need". - - simulated_sparse attention from BigBird with full quadratic implemention. - - block_sparse attention from BigBird with memory efficient linear impl. - """ - - def __init__( - self, - attention_type, - num_attention_heads=1, - num_rand_blocks=3, - size_per_head=512, - initializer_range=0.02, - from_block_size=64, - to_block_size=64, - attention_probs_dropout_prob=0.0, - use_bias=True, - seed=None, - query_act=None, - key_act=None, - value_act=None, - name=None, - **kwargs - ): - """Constructor for a multi-headed attention layer. - - Args: - attention_type: Type of attention, needs to be one of ['original_full', - 'simulated_sparse', 'block_sparse']. - num_attention_heads: (optional) int. Number of attention heads. - num_rand_blocks: (optional) int. Number of random chunks per row. - size_per_head: (optional) int. Size of each attention head. - initializer_range: (optional) float. Range of the weight initializer. - from_block_size: (optional) int. size of block in from sequence. - to_block_size: (optional) int. size of block in to sequence. - attention_probs_dropout_prob: (optional) float. Dropout probability of the - attention probabilities. - use_bias: Whether the layer uses a bias vector. - seed: (Optional) int. Reandom seed for generating random mask. - query_act: (optional) Activation function for the query transform. - key_act: (optional) Activation function for the key transform. - value_act: (optional) Activation function for the value transform. - name: The name scope of this layer. 
- **kwargs: others - """ - super(MultiHeadedAttentionLayer, self).__init__(name=name, **kwargs) - self.query_layer = utils.Dense3dLayer( - num_attention_heads, - size_per_head, - utils.create_initializer(initializer_range), - query_act, - 'query', - head_first=True, - use_bias=use_bias, - ) - - self.key_layer = utils.Dense3dLayer( - num_attention_heads, - size_per_head, - utils.create_initializer(initializer_range), - key_act, - 'key', - head_first=True, - use_bias=use_bias, - ) - - self.value_layer = utils.Dense3dLayer( - num_attention_heads, - size_per_head, - utils.create_initializer(initializer_range), - value_act, - 'value', - head_first=True, - use_bias=use_bias, - ) - - def attn_impl( - query, - key, - value, - attention_mask, - band_mask, - from_mask, - to_mask, - from_blocked_mask, - to_blocked_mask, - batch_size, - from_seq_length, - to_seq_length, - training, - ): - if attention_type == 'original_full': - logging.info('**** Using original full attention ****') - attn_fn = original_full_attention( - query, - key, - value, - attention_mask, - size_per_head, - attention_probs_dropout_prob if training else 0.0, - ) - elif attention_type == 'simulated_sparse': - logging.info('**** Using simulated sparse attention ****') - attn_fn = bigbird_simulated_attention( - query, - key, - value, - attention_mask, - num_attention_heads, - num_rand_blocks, - size_per_head, - from_seq_length, - to_seq_length, - from_block_size, - to_block_size, - seed, - ) - elif attention_type == 'block_sparse': - logging.info('**** Using block sparse attention ****') - attn_fn = bigbird_block_sparse_attention( - query, - key, - value, - band_mask, - from_mask, - to_mask, - from_blocked_mask, - to_blocked_mask, - num_attention_heads, - num_rand_blocks, - size_per_head, - batch_size, - from_seq_length, - to_seq_length, - from_block_size, - to_block_size, - seed, - ) - else: - raise NotImplementedError( - 'Attention type {} is not implemented'.format( - attention_type - ) - ) - return 
attn_fn - - self.attn_impl = attn_impl - - @property - def trainable_weights(self): - tvar_list = ( - self.query_layer.trainable_weights - + self.key_layer.trainable_weights - + self.value_layer.trainable_weights - ) - self._trainable_weights = list({v.name: v for v in tvar_list}.values()) - return self._trainable_weights - - def call( - self, - from_tensor, - to_tensor, - attention_mask=None, - band_mask=None, - from_mask=None, - to_mask=None, - from_blocked_mask=None, - to_blocked_mask=None, - cache=None, - decode_i=None, - training=None, - ): - """Implements a multi-headed attention layer from from_tensor to to_tensor. - - Args: - from_tensor: float Tensor of shape [batch_size, from_seq_length, - from_width] - to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. - attention_mask: (optional) int32 Tensor of shape [batch_size, - from_seq_length, to_seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions - in the mask that are 0, and will be unchanged for positions that are 1. - band_mask: (optional) int32 Tensor of shape [batch_size, 1, - from_seq_length//from_block_size-4, from_block_size, 3*to_block_size]. - The values should be 1 or 0. The attention scores will effectively be - set to -infinity for any positions in the mask that are 0, and will be - unchanged for positions that are 1. - from_mask: (optional) int32 Tensor of shape [batch_size, 1, - from_seq_length, 1]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions - in the mask that are 0, and will be unchanged for positions that are 1. - to_mask: (optional) int32 Tensor of shape [batch_size, 1, 1, - to_seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions - in the mask that are 0, and will be unchanged for positions that are 1. 
- from_blocked_mask: (optional) int32 Tensor of shape [batch_size, - from_seq_length//from_block_size, from_block_size]. - Same as from_mask, just reshaped. - to_blocked_mask: (optional) int32 Tensor of shape [batch_size, - to_seq_length//to_block_size, to_block_size]. - Same as to_mask, just reshaped. - cache: (Used during prediction) A dictionary with tensors containing - results of previous attentions. The dictionary must have the items: - {"k": tensor with shape - [batch_size, max_len, num_attention_heads, size_per_head], - "v": tensor with shape - [batch_size, max_len, num_attention_heads, size_per_head]} - decode_i: (Used during prediction) current location of decoding - training: Boolean indicating whether the call is training or inference. - - Returns: - float Tensor of shape [batch_size, from_seq_length, num_attention_heads, - size_per_head]. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. - NotImplementedError: For unknown attention type. - """ - from_shape = utils.get_shape_list(from_tensor, expected_rank=3) - to_shape = utils.get_shape_list(to_tensor, expected_rank=3) - - if len(from_shape) != len(to_shape): - raise ValueError( - 'The rank of `from_tensor` must match the rank of `to_tensor`.' 
- ) - - if len(from_shape) == 3: - batch_size = from_shape[0] - from_seq_length = from_shape[1] - to_seq_length = to_shape[1] - else: - raise ValueError('Need rank 3 tensors to attention_layer.') - - # Scalar dimensions referenced here: - # b = batch size (number of sequences) - # m = `from_tensor` sequence length - # n = `to_tensor` sequence length - # h = `num_attention_heads` - # d = `size_per_head` - - # `query` = [b, h, m, d] - query = self.query_layer(from_tensor) - - # `key` = [b, h, n, d] - key = self.key_layer(to_tensor) - - # `value_layer` = [b, h, n, d] - value = self.value_layer(to_tensor) - - if cache is not None and decode_i is not None: - max_len = utils.get_shape_list(cache['k'])[2] - indices_select = tf.reshape( - tf.one_hot(decode_i, max_len, dtype=to_tensor.dtype), - [1, 1, max_len, 1], - ) - key = cache['k'] + key * indices_select - value = cache['v'] + value * indices_select - cache['k'] = key - cache['v'] = value - - contextual_output = self.attn_impl( - query, - key, - value, - attention_mask, - band_mask, - from_mask, - to_mask, - from_blocked_mask, - to_blocked_mask, - batch_size, - from_seq_length, - to_seq_length, - training, - ) - - return contextual_output diff --git a/malaya/train/model/bigbird/beam_search.py b/malaya/train/model/bigbird/beam_search.py deleted file mode 100644 index ee1d50a6..00000000 --- a/malaya/train/model/bigbird/beam_search.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright 2020 The BigBird Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -"""Beam search branched from Pegasus. - -Original source: -https://github.com/google-research/pegasus/blob/master/pegasus/layers/beam_search.py - -This beam search implementation is designed for TPU usage only and prefers -flexibility over efficiency. Transformer attention caching is not enabled yet. - -Mostly follows implementation in T2T. Several difference to pure beamsearch: -1. has finished and alive seqs, use 2 * beam_size to grow alive seqs, - which makes beam_size=1 doesn't equal greedy. -2. prefers finished seq over alive seqs. -3. prefers lower indices when equal probability (though unlikely). -4. with custom length normalization and constraint. - -Notations: - B: batch_size, M: beam_size, T: max_decode_len, V: vocab_size, U: undefined -""" -# pylint: disable=invalid-name - -import tensorflow as tf - - -def length_normalization(start, alpha, min_len, max_len, out_of_range_penalty): - r"""Create length normalization function. - - Combines length penalty from https://arxiv.org/abs/1609.08144, - and length constraint from https://www.aclweb.org/anthology/W18-2706.pdf. - - scores = \sum_j log(P_j) / ((start + lengths)/(1 + start))**alpha - + out_of_range_penalty * (length > max_len or length < min_len) - - Args: - start: int, length normalization start offset. - alpha: float, [0, 1.0], length normalization power. - min_len: int, minimum decode length. - max_len: int, maximum decode lengths. - out_of_range_penalty: float, penalty for lengths outside min len and max - len. Use a negative number that penalize out of range decodes, does hard - constraint if set to -inf. - - Returns: - fn(log_probs_BxM, length)->scores_BxM: a function to normalize sum log - probabilities of sequence with current decoding lengths. 
- """ - - def length_norm_fn(log_probs_BxM, length_int): - """Normalize sum log probabilities given a sequence length.""" - dtype = log_probs_BxM.dtype - norm_flt = tf.pow( - ((start + tf.cast(length_int, dtype)) / (1.0 + start)), alpha - ) - log_probs_BxM /= norm_flt - too_short_bool = tf.less(length_int, min_len) - too_long_bool = tf.logical_and( - tf.greater(length_int, max_len), max_len > 0 - ) - out_of_range_bool = tf.logical_or(too_long_bool, too_short_bool) - log_probs_BxM += out_of_range_penalty * tf.cast( - out_of_range_bool, dtype - ) - return log_probs_BxM - - return length_norm_fn - - -def beam_search( - symbols_to_logits_fn, - init_seq_BxT, - initial_cache_BxU, - vocab_size, - beam_size, - length_norm_fn, - eos_id=1, -): - """Beam search. - - Args: - symbols_to_logits_fn: fn(seq_BxT, cache_BxU, i) -> (logits_BxV, cache_BxU) - init_seq_BxT: initial sequence ids. - initial_cache_BxU: dictionary of tensors with shape BxU. - vocab_size: vocabulary size. - beam_size: beam size. - length_norm_fn: length normalization function. - eos_id: end of sequence. - - Returns: - Tuple of (beams_BxMxT, scores_BxM). Beam searched sequences and scores. - """ - B, T = init_seq_BxT.shape - M, V = beam_size, vocab_size - dtype = tf.float32 - int_dtype = init_seq_BxT.dtype - - def _loop_body( - i, - alive_seq_BxMxT, - alive_log_probs_BxM, - alive_cache_BxMxU, - finished_seq_BxMxT, - finished_scores_BxM, - ): - """Beam search loop body.""" - # Decode one step with beam - logits_BMxV, cache_BMxU = symbols_to_logits_fn( - _flatten_beam_dim(alive_seq_BxMxT), - tf.nest.map_structure(_flatten_beam_dim, alive_cache_BxMxU), - i, - ) - logits_BxMxV = _unflatten_beam_dim(logits_BMxV, M) - new_cache_BxMxU = tf.nest.map_structure( - lambda t: _unflatten_beam_dim(t, M), cache_BMxU - ) - - # select top 2 * beam_size and fill alive and finished. 
- log_probs_BxMxV = logits_BxMxV - tf.reduce_logsumexp( - logits_BxMxV, axis=2, keepdims=True - ) - log_probs_BxMxV += tf.expand_dims(alive_log_probs_BxM, axis=2) - log_probs_BxMV = tf.reshape(log_probs_BxMxV, [B, -1]) - new_log_probs_Bx2M, topk_indices_Bx2M = tf.nn.top_k( - log_probs_BxMV, k=2 * M - ) - topk_beam_Bx2M = topk_indices_Bx2M // V - topk_seq_Bx2MxT, new_cache_Bx2MxU = _gather_nested( - [alive_seq_BxMxT, new_cache_BxMxU], topk_beam_Bx2M - ) - topk_ids_Bx2M = topk_indices_Bx2M % V - new_seq_Bx2MxT = _update_i(topk_seq_Bx2MxT, topk_ids_Bx2M, i) - new_finished_flags_Bx2M = tf.cast( - tf.reduce_any(tf.equal(new_seq_Bx2MxT, eos_id), axis=-1), dtype - ) - - # get new alive - _, topk_alive_indices_BxM = tf.nn.top_k( - new_log_probs_Bx2M + new_finished_flags_Bx2M * dtype.min, k=M - ) - ( - alive_seq_BxMxT, - alive_log_probs_BxM, - alive_cache_BxMxU, - ) = _gather_nested( - [new_seq_Bx2MxT, new_log_probs_Bx2M, new_cache_Bx2MxU], - topk_alive_indices_BxM, - ) - - # get new finished - new_scores_Bx2M = length_norm_fn(new_log_probs_Bx2M, i + 1) - new_scores_Bx2M += (1 - new_finished_flags_Bx2M) * dtype.min - finished_seq_Bx3MxT = tf.concat( - [finished_seq_BxMxT, new_seq_Bx2MxT], axis=1 - ) - finished_scores_Bx3M = tf.concat( - [finished_scores_BxM, new_scores_Bx2M], axis=1 - ) - _, topk_finished_indices_BxM = tf.nn.top_k(finished_scores_Bx3M, k=M) - (finished_seq_BxMxT, finished_scores_BxM) = _gather_nested( - [finished_seq_Bx3MxT, finished_scores_Bx3M], - topk_finished_indices_BxM, - ) - - return [ - i + 1, - alive_seq_BxMxT, - alive_log_probs_BxM, - alive_cache_BxMxU, - finished_seq_BxMxT, - finished_scores_BxM, - ] - - # initialize. 
- init_i = tf.constant(0, dtype=int_dtype) - init_alive_seq_BxMxT = _expand_to_beam_size(init_seq_BxT, M) - log_probs_1xM = tf.constant([[0.0] + [dtype.min] * (M - 1)], dtype=dtype) - init_alive_log_probs_BxM = tf.tile(log_probs_1xM, [B, 1]) - init_alive_cache_BxMxU = tf.nest.map_structure( - lambda t: _expand_to_beam_size(t, M), initial_cache_BxU - ) - init_finished_seq_BxMxT = tf.zeros( - tf.shape(init_alive_seq_BxMxT), int_dtype - ) - init_finished_scores_BxM = tf.zeros([B, M], dtype=dtype) + dtype.min - - # run loop. - ( - _, - final_alive_seq_BxMxT, - final_alive_scores_BxM, - _, - final_finished_seq_BxMxT, - final_finished_scores_BxM, - ) = tf.while_loop( - lambda *args: True, # Always do T iterations - _loop_body, - loop_vars=[ - init_i, - init_alive_seq_BxMxT, - init_alive_log_probs_BxM, - init_alive_cache_BxMxU, - init_finished_seq_BxMxT, - init_finished_scores_BxM, - ], - parallel_iterations=1, - back_prop=False, - maximum_iterations=T, - ) - - # process finished. - final_finished_flag_BxMx1 = tf.reduce_any( - tf.equal(final_finished_seq_BxMxT, eos_id), axis=-1, keepdims=True - ) - final_seq_BxMxT = tf.where( - tf.tile(final_finished_flag_BxMx1, [1, 1, T]), - final_finished_seq_BxMxT, - final_alive_seq_BxMxT, - ) - final_scores_BxM = tf.where( - tf.squeeze(final_finished_flag_BxMx1, axis=-1), - final_finished_scores_BxM, - final_alive_scores_BxM, - ) - return final_seq_BxMxT, final_scores_BxM - - -def _update_i(tensor_BxNxT, updates_BxN, i): - B, N, T = tensor_BxNxT.shape - tensor_BNxT = tf.reshape(tensor_BxNxT, [-1, T]) - updates_BN = tf.reshape(updates_BxN, [-1]) - batch_BN = tf.range(B * N, dtype=tf.int32) - i_BN = tf.fill([B * N], i) - ind_BNx2 = tf.stack([batch_BN, i_BN], axis=-1) - tensor_BNxT = tf.tensor_scatter_nd_update(tensor_BNxT, ind_BNx2, updates_BN) - return tf.reshape(tensor_BNxT, [B, N, T]) - - -def _expand_to_beam_size(tensor_BxU, beam_size): - tensor_Bx1xU = tf.expand_dims(tensor_BxU, axis=1) - tile_dims = [1] * tensor_Bx1xU.shape.ndims 
- tile_dims[1] = beam_size - tensor_BxMxU = tf.tile(tensor_Bx1xU, tile_dims) - return tensor_BxMxU - - -def _flatten_beam_dim(tensor_BxMxU): - shape = tensor_BxMxU.shape.as_list() - tensor_BMxU = tf.reshape(tensor_BxMxU, [shape[0] * shape[1]] + shape[2:]) - return tensor_BMxU - - -def _unflatten_beam_dim(tensor_BMxU, M): - shape = tensor_BMxU.shape.as_list() - tensor_BxMxU = tf.reshape(tensor_BMxU, [shape[0] // M, M] + shape[1:]) - return tensor_BxMxU - - -def _gather_nested(nested_BxMxU, indices_BxN): - def _gather_beam(tensor_BxMxU): - tensor_BxNxU = tf.gather( - tensor_BxMxU, indices_BxN, batch_dims=1, axis=1 - ) - return tensor_BxNxU - - return tf.nest.map_structure(_gather_beam, nested_BxMxU) diff --git a/malaya/train/model/bigbird/decoder.py b/malaya/train/model/bigbird/decoder.py deleted file mode 100644 index a15dd559..00000000 --- a/malaya/train/model/bigbird/decoder.py +++ /dev/null @@ -1,681 +0,0 @@ -# Copyright 2020 The BigBird Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""BigBird Decoder Layers.""" - -from . import attention -from . import beam_search -from . import utils -import tensorflow as tf - - -class PrenormDecoderLayer(tf.compat.v1.layers.Layer): - """Decoder layer of a transformer in Pegasus style. - - The layer_norm is taken before self-attention. 
- """ - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - intermediate_act_fn=utils.gelu, - attention_probs_dropout_prob=0.0, - hidden_dropout_prob=0.1, - initializer_range=0.02, - num_attention_heads=12, - use_bias=True, - name=None, - ): - """Constructor of a decoder layer of a transformer in Pegasus style. - - Args: - hidden_size: (optional) int. Size of hidden dimension. - intermediate_size: (optional) int. Size of intermediate dimension. - intermediate_act_fn: optional) Activation function for intermediate layer. - attention_probs_dropout_prob: (optional) float. Dropout probability of the - attention probabilities. - hidden_dropout_prob: (optional) float. Dropout probability of the - attention. - initializer_range: (optional) float. Range of the weight initializer. - num_attention_heads: (optional) int. Number of attention heads. - use_bias: (optional) bool. Whether key/query/value uses a bias vector. - name: The name scope of this layer. - """ - super(PrenormDecoderLayer, self).__init__(name=name) - self.hidden_dropout_prob = hidden_dropout_prob - - # Attention layers - attention_head_size = hidden_size // num_attention_heads - self.self_attn_layer = attention.MultiHeadedAttentionLayer( - 'original_full', - use_bias=use_bias, - name='self', - num_attention_heads=num_attention_heads, - size_per_head=attention_head_size, - initializer_range=initializer_range, - attention_probs_dropout_prob=attention_probs_dropout_prob, - ) - self.cross_attn_layer = attention.MultiHeadedAttentionLayer( - 'original_full', - use_bias=use_bias, - name='encdec', - num_attention_heads=num_attention_heads, - size_per_head=attention_head_size, - initializer_range=initializer_range, - attention_probs_dropout_prob=attention_probs_dropout_prob, - ) - - # Dense layers - self.self_proj_layer = utils.Dense3dProjLayer( - num_attention_heads, - attention_head_size, - utils.create_initializer(initializer_range), - None, - 'dense', - use_bias, - ) - self.cross_proj_layer = 
utils.Dense3dProjLayer( - num_attention_heads, - attention_head_size, - utils.create_initializer(initializer_range), - None, - 'dense', - use_bias, - ) - self.expand_layer = utils.Dense2dLayer( - intermediate_size, - utils.create_initializer(initializer_range), - intermediate_act_fn, - 'dense', - ) - self.contract_layer = utils.Dense2dLayer( - hidden_size, - utils.create_initializer(initializer_range), - None, - 'dense', - ) - - # Normalization layer - self.first_layer_norm = utils.NormLayer() - self.second_layer_norm = utils.NormLayer() - self.third_layer_norm = utils.NormLayer() - - @property - def trainable_weights(self): - tvar_list = ( - self.self_attn_layer.trainable_weights - + self.cross_attn_layer.trainable_weights - + self.self_proj_layer.trainable_weights - + self.cross_proj_layer.trainable_weights - + self.expand_layer.trainable_weights - + self.contract_layer.trainable_weights - + self.first_layer_norm.trainable_weights - + self.second_layer_norm.trainable_weights - + self.third_layer_norm.trainable_weights - ) - self._trainable_weights = list({v.name: v for v in tvar_list}.values()) - return self._trainable_weights - - def call( - self, - layer_input, - encoder_outputs, - self_attention_mask, - attention_mask, - cache=None, - decode_i=None, - training=None, - ): - """Implements a decoder layer of a transformer in Pegasus style. - - The layer_norm is taken after self-attention. - - Args: - layer_input: float Tensor of shape [batch_size, seq_length, hidden_size]. - encoder_outputs: tensors with shape [batch_size, input_length, - num_hidden_layers, hidden_size] - self_attention_mask: bias for decoder self-attention layer. [1, 1, - target_length, target_length] - attention_mask: bias for encoder-decoder attention layer. [batch_size, 1, - 1, input_length] - cache: (Used during prediction) A dictionary with tensors containing - results of previous attentions. 
The dictionary must have the items: - {"k": tensor with shape - [batch_size, max_len, num_attention_heads, size_per_head], - "v": tensor with shape - [batch_size, max_len, num_attention_heads, size_per_head]} - decode_i: (Used during prediction) current location of decoding - training: Boolean indicating whether the call is training or inference. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size]. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. - NotImplementedError: For unknown attention type. - """ - with tf.compat.v1.variable_scope('attention'): - with tf.compat.v1.variable_scope('self') as sc: - normalized_layer_input = self.first_layer_norm(layer_input) - self_attention_output = self.self_attn_layer( - normalized_layer_input, - normalized_layer_input, - self_attention_mask, - cache=cache, - decode_i=decode_i, - training=training, - scope=sc, - ) - - # Run a linear projection of `hidden_size` then add a residual - # with `layer_input`. - with tf.compat.v1.variable_scope('output'): - self_attention_output = self.self_proj_layer( - self_attention_output - ) - self_attention_output = utils.dropout( - self_attention_output, self.hidden_dropout_prob, training - ) - self_attention_output = self_attention_output + layer_input - - with tf.compat.v1.variable_scope('encdec') as sc: - normalized_self_attention_output = self.second_layer_norm( - self_attention_output - ) - attention_output = self.cross_attn_layer( - normalized_self_attention_output, - encoder_outputs, - attention_mask, - training=training, - scope=sc, - ) - - # Run a linear projection of `hidden_size` then add a residual - # with `layer_input`. 
- with tf.compat.v1.variable_scope('encdec_output'): - attention_output = self.cross_proj_layer(attention_output) - attention_output = utils.dropout( - attention_output, self.hidden_dropout_prob, training - ) - attention_output = attention_output + self_attention_output - - # The activation is only applied to the "intermediate" hidden layer. - with tf.compat.v1.variable_scope('intermediate'): - normalized_attention_output = self.third_layer_norm( - attention_output - ) - intermediate_output = self.expand_layer(normalized_attention_output) - - # Down-project back to `hidden_size` then add the residual. - with tf.compat.v1.variable_scope('output'): - layer_output = self.contract_layer(intermediate_output) - layer_output = utils.dropout( - layer_output, self.hidden_dropout_prob, training - ) - layer_output = layer_output + attention_output - return layer_output - - -class PostnormDecoderLayer(tf.compat.v1.layers.Layer): - """Decoder layer of a transformer in BERT style. - - The layer_norm is taken before self-attention. - """ - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - intermediate_act_fn=utils.gelu, - attention_probs_dropout_prob=0.0, - hidden_dropout_prob=0.1, - initializer_range=0.02, - num_attention_heads=12, - use_bias=True, - name=None, - ): - """Constructor of a decoder layer of a transformer in BERT style. - - Args: - hidden_size: (optional) int. Size of hidden dimension. - intermediate_size: (optional) int. Size of intermediate dimension. - intermediate_act_fn: optional) Activation function for intermediate layer. - attention_probs_dropout_prob: (optional) float. Dropout probability of the - attention probabilities. - hidden_dropout_prob: (optional) float. Dropout probability of the - attention. - initializer_range: (optional) float. Range of the weight initializer. - num_attention_heads: (optional) int. Number of attention heads. - use_bias: (optional) bool. Whether key/query/value uses a bias vector. 
- name: The name scope of this layer. - """ - super(PostnormDecoderLayer, self).__init__(name=name) - self.hidden_dropout_prob = hidden_dropout_prob - - # Attention layers - attention_head_size = hidden_size // num_attention_heads - self.self_attn_layer = attention.MultiHeadedAttentionLayer( - 'original_full', - use_bias=use_bias, - name='self', - num_attention_heads=num_attention_heads, - size_per_head=attention_head_size, - initializer_range=initializer_range, - attention_probs_dropout_prob=attention_probs_dropout_prob, - ) - self.cross_attn_layer = attention.MultiHeadedAttentionLayer( - 'original_full', - use_bias=use_bias, - name='encdec', - num_attention_heads=num_attention_heads, - size_per_head=attention_head_size, - initializer_range=initializer_range, - attention_probs_dropout_prob=attention_probs_dropout_prob, - ) - - # Dense layers - self.self_proj_layer = utils.Dense3dProjLayer( - num_attention_heads, - attention_head_size, - utils.create_initializer(initializer_range), - None, - 'dense', - use_bias, - ) - self.cross_proj_layer = utils.Dense3dProjLayer( - num_attention_heads, - attention_head_size, - utils.create_initializer(initializer_range), - None, - 'dense', - use_bias, - ) - self.expand_layer = utils.Dense2dLayer( - intermediate_size, - utils.create_initializer(initializer_range), - intermediate_act_fn, - 'dense', - ) - self.contract_layer = utils.Dense2dLayer( - hidden_size, - utils.create_initializer(initializer_range), - None, - 'dense', - ) - - # Normalization layer - self.first_layer_norm = utils.NormLayer() - self.second_layer_norm = utils.NormLayer() - self.third_layer_norm = utils.NormLayer() - - @property - def trainable_weights(self): - tvar_list = ( - self.self_attn_layer.trainable_weights - + self.cross_attn_layer.trainable_weights - + self.self_proj_layer.trainable_weights - + self.cross_proj_layer.trainable_weights - + self.expand_layer.trainable_weights - + self.contract_layer.trainable_weights - + 
self.first_layer_norm.trainable_weights - + self.second_layer_norm.trainable_weights - + self.third_layer_norm.trainable_weights - ) - self._trainable_weights = list({v.name: v for v in tvar_list}.values()) - return self._trainable_weights - - def call( - self, - layer_input, - encoder_outputs, - self_attention_mask, - attention_mask, - cache=None, - decode_i=None, - training=None, - ): - """Implements a decoder layer of a transformer in BERT style. - - The layer_norm is taken after self-attention. - - Args: - layer_input: float Tensor of shape [batch_size, seq_length, hidden_size]. - encoder_outputs: tensors with shape [batch_size, input_length, - num_hidden_layers, hidden_size] - self_attention_mask: bias for decoder self-attention layer. [1, 1, - target_length, target_length] - attention_mask: bias for encoder-decoder attention layer. [batch_size, 1, - 1, input_length] - cache: (Used during prediction) A dictionary with tensors containing - results of previous attentions. The dictionary must have the items: - {"k": tensor with shape - [batch_size, max_len, num_attention_heads, size_per_head], - "v": tensor with shape - [batch_size, max_len, num_attention_heads, size_per_head]} - decode_i: (Used during prediction) current location of decoding - training: Boolean indicating whether the call is training or inference. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size]. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. - NotImplementedError: For unknown attention type. - """ - with tf.compat.v1.variable_scope('attention'): - with tf.compat.v1.variable_scope('self') as sc: - self_attention_output = self.self_attn_layer( - layer_input, - layer_input, - self_attention_mask, - cache=cache, - decode_i=decode_i, - training=training, - scope=sc, - ) - - # Run a linear projection of `hidden_size` then add a residual - # with `layer_input`. 
- with tf.compat.v1.variable_scope('output'): - self_attention_output = self.self_proj_layer( - self_attention_output - ) - self_attention_output = utils.dropout( - self_attention_output, self.hidden_dropout_prob, training - ) - self_attention_output = self.first_layer_norm( - self_attention_output + layer_input - ) - - with tf.compat.v1.variable_scope('encdec') as sc: - attention_output = self.cross_attn_layer( - self_attention_output, - encoder_outputs, - attention_mask, - training=training, - scope=sc, - ) - - # Run a linear projection of `hidden_size` then add a residual - # with `layer_input`. - with tf.compat.v1.variable_scope('encdec_output'): - attention_output = self.cross_proj_layer(attention_output) - attention_output = utils.dropout( - attention_output, self.hidden_dropout_prob, training - ) - attention_output = self.second_layer_norm( - attention_output + self_attention_output - ) - - # The activation is only applied to the "intermediate" hidden layer. - with tf.compat.v1.variable_scope('intermediate'): - intermediate_output = self.expand_layer(attention_output) - - # Down-project back to `hidden_size` then add the residual. 
- with tf.compat.v1.variable_scope('output'): - layer_output = self.contract_layer(intermediate_output) - layer_output = utils.dropout( - layer_output, self.hidden_dropout_prob, training - ) - layer_output = self.third_layer_norm( - layer_output + attention_output - ) - return layer_output - - -class DecoderStack(tf.compat.v1.layers.Layer): - """Transformer decoder stack.""" - - def __init__(self, params): - if params['couple_encoder_decoder']: - name = 'encoder' - with tf.compat.v1.variable_scope( - name, reuse=tf.compat.v1.AUTO_REUSE - ) as scope: - super(DecoderStack, self).__init__(name=name, _scope=scope) - else: - name = 'decoder' - super(DecoderStack, self).__init__(name=name) - - self.params = params - - if params['norm_type'] == 'prenorm': - decoder_class = PrenormDecoderLayer - elif params['norm_type'] == 'postnorm': - decoder_class = PostnormDecoderLayer - else: - raise NotImplementedError( - 'Norm type {} is not implemented'.format(params['norm_type']) - ) - - if self.params.get('num_decoder_layers', None) is not None: - num_hidden_layers = self.params['num_decoder_layers'] - else: - num_hidden_layers = self.params['num_hidden_layers'] - - # Decoder layers - self.decoder_layers = [ - decoder_class( # pylint: disable=g-complex-comprehension - self.params['hidden_size'], - self.params['intermediate_size'], - utils.get_activation(self.params['hidden_act']), - self.params['attention_probs_dropout_prob'], - self.params['hidden_dropout_prob'], - self.params['initializer_range'], - self.params['num_attention_heads'], - self.params['use_bias'], - name='layer_%d' % layer_idx, - ) - for layer_idx in range(num_hidden_layers) - ] - - # Normalization layer - self.layer_norm = utils.NormLayer() - - @property - def trainable_weights(self): - tvar_list = ( - sum([layer.trainable_weights for layer in self.decoder_layers], []) - + self.layer_norm.trainable_weights - ) - self._trainable_weights = list({v.name: v for v in tvar_list}.values()) - return 
self._trainable_weights - - def call( - self, - decoder_inputs, - self_attention_mask, - encoder_outputs, - encoder_mask, - cache=None, - decode_i=None, - training=None, - ): - """Return the output of the decoder layer stacks. - - Args: - decoder_inputs: tensor with shape - [batch_size, target_length, hidden_size] - self_attention_mask: bias for decoder self-attention layer. [1, 1, - target_length, target_length] - encoder_outputs: tensors with shape [batch_size, input_length, - hidden_size] - encoder_mask: bias for encoder-decoder attention layer. [batch_size, - input_length] - cache: (Used during prediction) A dictionary with tensors containing - results of previous attentions. The dictionary must have the items: - {"k": tensor with shape - [batch_size, max_len, num_attention_heads, size_per_head], - "v": tensor with shape - [batch_size, max_len, num_attention_heads, size_per_head]} - decode_i: (Used during prediction) current location of decoding. - training: Boolean indicating whether the call is training or inference. - - Returns: - Output of decoder layer stack. 
A float32 tensor with shape [batch_size, - target_length, hidden_size] - """ - # Expand encoder mask to broadcast over num heads and from_seq axis - attention_mask = tf.expand_dims(tf.expand_dims(encoder_mask, 1), 1) - - # if self.params["use_gradient_checkpointing"]:: - # decoder_layer = recompute_gradient(decoder_layer) - - if self.params['norm_type'] == 'postnorm': - decoder_inputs = self.layer_norm(decoder_inputs) - - layer_output = decoder_inputs - for layer in self.decoder_layers: - layer_cache = cache[layer.name] if cache is not None else None - layer_output = layer( - layer_output, - encoder_outputs, - self_attention_mask, - attention_mask, - layer_cache, - decode_i, - training, - ) - - if self.params['norm_type'] == 'prenorm': - layer_output = self.layer_norm(layer_output) - - return layer_output - - -def create_self_attention_mask(length): - with tf.name_scope('decoder_self_attention_mask'): - valid_locs = tf.linalg.band_part(tf.ones([length, length]), -1, 0) - valid_locs = tf.reshape(valid_locs, [1, 1, length, length]) - return valid_locs - - -def inplace_update_i(inp_tensor, updates, i): - """Inplace update a tensor. B: batch_size, L: tensor length.""" - batch_size = tf.shape(inp_tensor)[0] - indices = tf.stack( - [ - tf.range(batch_size, dtype=tf.int32), - tf.fill([batch_size], tf.cast(i, tf.int32)), - ], - axis=-1, - ) - return tf.tensor_scatter_nd_update(inp_tensor, indices, updates) - - -# pylint: disable=invalid-name -def left2right_decode( - symbols_to_logits_fn, - start_symbols, - context_BxU_dict, - batch_size, - max_decode_len, - vocab_size, - beam_size=1, - beam_start=5, - beam_alpha=0.6, - beam_min=0, - beam_max=-1, - eos_id=1, -): - """left to right decode. - - Notations: - B: batch_size, V: vocab_size, T: decode_len, U: undefined dimensions - - Args: - symbols_to_logits_fn: logits = fn(decodes, context, i). Shoud take - [batch_size, decoded_ids] and return [batch_size, vocab_size]. 
- start_symbols: starting ids [batch_size] - context_BxU_dict: dict of Tensors. - batch_size: int, decode batch size. - max_decode_len: int, maximum number of steps to decode. - vocab_size: int, output vocab size. - beam_size: Number of beams to decode. - beam_start: start length for scaling, default to 5. - beam_alpha: Length penalty for decoding. Should be between 0 (shorter) and 1 - (longer), default to 0.6. - beam_min: Minimum beam search lengths. - beam_max: Maximum beam search lengths. Set -1 to use unlimited. - eos_id: end of token id, default to 1. - - Returns: - decodes: Tensor[batch, decode_len] - """ - dtype = tf.int32 - start_symbols = tf.expand_dims(start_symbols, 1) - # When beam_size=1, beam_search does not behave exactly like greedy. - # This is due to using 2 * beam_size in grow_topk, and keep the top beam_size - # ones that haven't reached EOS into alive. - # In this case, alpha value for length penalty will take effect. - if beam_size == 1: - - def decode_loop(i, decodes_BxT, cache_BxU_dict): - logits_BxV = symbols_to_logits_fn(decodes_BxT, cache_BxU_dict, i) - decodes_BxT = inplace_update_i( - decodes_BxT, - tf.argmax(logits_BxV, -1, output_type=tf.int32), - i, - ) - return i + 1, decodes_BxT, cache_BxU_dict - - def loop_cond(i, decodes_BxT, unused_cache_BxU_dict): - finished_B = tf.reduce_any(tf.equal(decodes_BxT, eos_id), axis=1) - return tf.logical_and( - i < max_decode_len, tf.logical_not(tf.reduce_all(finished_B)) - ) - - init_dec_BxT = tf.concat( - [ - tf.cast(start_symbols, dtype=dtype), - tf.zeros([batch_size, max_decode_len - 1], dtype=dtype), - ], - axis=1, - ) - _, decodes, _ = tf.while_loop( - loop_cond, - decode_loop, - [tf.constant(0, dtype=dtype), init_dec_BxT, context_BxU_dict], - ) - return decodes - - else: - - def symbols_to_logits_fn_with_sampling(decodes_BxT, states_BxU_dict, i): - logits_BxV = symbols_to_logits_fn(decodes_BxT, states_BxU_dict, i) - return logits_BxV, states_BxU_dict - - length_norm_fn = 
beam_search.length_normalization( - beam_start, beam_alpha, beam_min, beam_max, -1e3 - ) - - init_dec_BxT = tf.concat( - [ - tf.cast(start_symbols, dtype=tf.int32), - tf.zeros([batch_size, max_decode_len - 1], dtype=tf.int32), - ], - axis=1, - ) - - beams, _ = beam_search.beam_search( - symbols_to_logits_fn_with_sampling, - init_dec_BxT, - context_BxU_dict, - vocab_size, - beam_size, - length_norm_fn, - eos_id, - ) - return beams[:, 0, :] diff --git a/malaya/train/model/bigbird/encoder.py b/malaya/train/model/bigbird/encoder.py deleted file mode 100644 index 5a7f5934..00000000 --- a/malaya/train/model/bigbird/encoder.py +++ /dev/null @@ -1,515 +0,0 @@ -# Copyright 2020 The BigBird Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""BigBird Encoder Layers.""" - -from . import attention -from . import utils -import tensorflow as tf - - -class PrenormEncoderLayer(tf.compat.v1.layers.Layer): - """Encoder layer of a transformer in Pegasus style. - - The layer_norm is taken before self-attention. - """ - - def __init__( - self, - attention_type, - hidden_size=768, - intermediate_size=3072, - intermediate_act_fn=utils.gelu, - attention_probs_dropout_prob=0.0, - hidden_dropout_prob=0.1, - initializer_range=0.02, - num_attention_heads=12, - num_rand_blocks=3, - block_size=64, - use_bias=True, - seed=None, - name=None, - ): - """Constructor of an encoder layer of a transformer in Pegasus style. 
- - Args: - attention_type: Type of attention, needs to be one of ['original_full', - 'simulated_sparse', 'block_sparse']. - hidden_size: (optional) int. Size of hidden dimension. - intermediate_size: (optional) int. Size of intermediate dimension. - intermediate_act_fn: optional) Activation function for intermediate layer. - attention_probs_dropout_prob: (optional) float. Dropout probability of the - attention probabilities. - hidden_dropout_prob: (optional) float. Dropout probability of the - attention. - initializer_range: (optional) float. Range of the weight initializer. - num_attention_heads: (optional) int. Number of attention heads. - num_rand_blocks: (optional) int. Number of random chunks per row. - block_size: (optional) int. size of block in sequence. - use_bias: (optional) bool. Whether key/query/value uses a bias vector. - seed: (Optional) int. Reandom seed for generating random mask. - name: The name scope of this layer. - """ - super(PrenormEncoderLayer, self).__init__(name=name) - self.hidden_dropout_prob = hidden_dropout_prob - - # Attention layer - attention_head_size = hidden_size // num_attention_heads - self.attn_layer = attention.MultiHeadedAttentionLayer( - attention_type, - num_attention_heads, - num_rand_blocks, - attention_head_size, - initializer_range, - block_size, - block_size, - attention_probs_dropout_prob, - use_bias, - seed, - name='self', - ) - - # Dense layers - self.projection_layer = utils.Dense3dProjLayer( - num_attention_heads, - attention_head_size, - utils.create_initializer(initializer_range), - None, - 'dense', - use_bias, - ) - self.expand_layer = utils.Dense2dLayer( - intermediate_size, - utils.create_initializer(initializer_range), - intermediate_act_fn, - 'dense', - ) - self.contract_layer = utils.Dense2dLayer( - hidden_size, - utils.create_initializer(initializer_range), - None, - 'dense', - ) - - # Normalization layer - self.first_layer_norm = utils.NormLayer() - self.second_layer_norm = utils.NormLayer() - - 
@property - def trainable_weights(self): - tvar_list = ( - self.attn_layer.trainable_weights - + self.projection_layer.trainable_weights - + self.expand_layer.trainable_weights - + self.contract_layer.trainable_weights - + self.first_layer_norm.trainable_weights - + self.second_layer_norm.trainable_weights - ) - self._trainable_weights = list({v.name: v for v in tvar_list}.values()) - return self._trainable_weights - - def call( - self, - layer_input, - attention_mask=None, - band_mask=None, - from_mask=None, - to_mask=None, - input_blocked_mask=None, - training=None, - ): - """Implements a encoder layer of a transformer in Pegasus style. - - Args: - layer_input: float Tensor of shape [batch_size, seq_length, hidden_size]. - attention_mask: (optional) int32 Tensor of shape [batch_size, - seq_length, seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions - in the mask that are 0, and will be unchanged for positions that are 1. - band_mask: (optional) int32 Tensor of shape [batch_size, 1, - seq_length//block_size-4, block_size, 3*block_size]. - The values should be 1 or 0. The attention scores will effectively be - set to -infinity for any positions in the mask that are 0, and will be - unchanged for positions that are 1. - from_mask: (optional) int32 Tensor of shape [batch_size, 1, - seq_length, 1]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions - in the mask that are 0, and will be unchanged for positions that are 1. - to_mask: (optional) int32 Tensor of shape [batch_size, 1, 1, - seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions - in the mask that are 0, and will be unchanged for positions that are 1. - input_blocked_mask: (optional) int32 Tensor of shape [batch_size, - seq_length//block_size, block_size]. Same as from/to_mask, just - reshaped. 
- training: Boolean indicating whether the call is training or inference. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size]. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. - NotImplementedError: For unknown attention type. - """ - - with tf.compat.v1.variable_scope('attention'): - with tf.compat.v1.variable_scope('self') as sc: - normalized_layer_input = self.first_layer_norm(layer_input) - attention_output = self.attn_layer( - normalized_layer_input, - normalized_layer_input, - attention_mask, - band_mask, - from_mask, - to_mask, - input_blocked_mask, - input_blocked_mask, - training, - scope=sc, - ) - - # Run a linear projection of `hidden_size` then add a residual - # with `layer_input`. - with tf.compat.v1.variable_scope('output'): - attention_output = self.projection_layer(attention_output) - attention_output = utils.dropout( - attention_output, self.hidden_dropout_prob, training - ) - attention_output = attention_output + layer_input - - # The activation is only applied to the "intermediate" hidden layer. - with tf.compat.v1.variable_scope('intermediate'): - normalized_attention_output = self.second_layer_norm( - attention_output - ) - intermediate_output = self.expand_layer(normalized_attention_output) - - # Down-project back to `hidden_size` then add the residual. - with tf.compat.v1.variable_scope('output'): - layer_output = self.contract_layer(intermediate_output) - layer_output = utils.dropout( - layer_output, self.hidden_dropout_prob, training - ) - layer_output = layer_output + attention_output - return layer_output - - -class PostnormEncoderLayer(tf.compat.v1.layers.Layer): - """Encoder layer of a transformer in BERT style. - - The layer_norm is taken after self-attention. 
- """ - - def __init__( - self, - attention_type, - hidden_size=768, - intermediate_size=3072, - intermediate_act_fn=utils.gelu, - attention_probs_dropout_prob=0.0, - hidden_dropout_prob=0.1, - initializer_range=0.02, - num_attention_heads=12, - num_rand_blocks=3, - block_size=64, - use_bias=True, - seed=None, - name=None, - ): - """Constructor of an encoder layer of a transformer in BERT style. - - Args: - attention_type: Type of attention, needs to be one of ['original_full', - 'simulated_sparse', 'block_sparse']. - hidden_size: (optional) int. Size of hidden dimension. - intermediate_size: (optional) int. Size of intermediate dimension. - intermediate_act_fn: optional) Activation function for intermediate layer. - attention_probs_dropout_prob: (optional) float. Dropout probability of the - attention probabilities. - hidden_dropout_prob: (optional) float. Dropout probability of the - attention. - initializer_range: (optional) float. Range of the weight initializer. - num_attention_heads: (optional) int. Number of attention heads. - num_rand_blocks: (optional) int. Number of random chunks per row. - block_size: (optional) int. size of block in sequence. - use_bias: (optional) bool. Whether key/query/value uses a bias vector. - seed: (Optional) int. Reandom seed for generating random mask. - name: The name scope of this layer. 
- """ - super(PostnormEncoderLayer, self).__init__(name=name) - self.hidden_dropout_prob = hidden_dropout_prob - - # Attention layer - attention_head_size = hidden_size // num_attention_heads - self.attn_layer = attention.MultiHeadedAttentionLayer( - attention_type, - num_attention_heads, - num_rand_blocks, - attention_head_size, - initializer_range, - block_size, - block_size, - attention_probs_dropout_prob, - use_bias, - seed, - name='self', - ) - - # Dense layers - self.projection_layer = utils.Dense3dProjLayer( - num_attention_heads, - attention_head_size, - utils.create_initializer(initializer_range), - None, - 'dense', - use_bias, - ) - self.expand_layer = utils.Dense2dLayer( - intermediate_size, - utils.create_initializer(initializer_range), - intermediate_act_fn, - 'dense', - ) - self.contract_layer = utils.Dense2dLayer( - hidden_size, - utils.create_initializer(initializer_range), - None, - 'dense', - ) - - # Normalization layer - self.first_layer_norm = utils.NormLayer() - self.second_layer_norm = utils.NormLayer() - - @property - def trainable_weights(self): - tvar_list = ( - self.attn_layer.trainable_weights - + self.projection_layer.trainable_weights - + self.expand_layer.trainable_weights - + self.contract_layer.trainable_weights - + self.first_layer_norm.trainable_weights - + self.second_layer_norm.trainable_weights - ) - self._trainable_weights = list({v.name: v for v in tvar_list}.values()) - return self._trainable_weights - - def call( - self, - layer_input, - attention_mask=None, - band_mask=None, - from_mask=None, - to_mask=None, - input_blocked_mask=None, - training=None, - ): - """Implements a encoder layer of a transformer in BERT style. - - Args: - layer_input: float Tensor of shape [batch_size, seq_length, hidden_size]. - attention_mask: (optional) int32 Tensor of shape [batch_size, - seq_length, seq_length]. The values should be 1 or 0. 
The - attention scores will effectively be set to -infinity for any positions - in the mask that are 0, and will be unchanged for positions that are 1. - band_mask: (optional) int32 Tensor of shape [batch_size, 1, - seq_length//block_size-4, block_size, 3*block_size]. - The values should be 1 or 0. The attention scores will effectively be - set to -infinity for any positions in the mask that are 0, and will be - unchanged for positions that are 1. - from_mask: (optional) int32 Tensor of shape [batch_size, 1, - seq_length, 1]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions - in the mask that are 0, and will be unchanged for positions that are 1. - to_mask: (optional) int32 Tensor of shape [batch_size, 1, 1, - seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions - in the mask that are 0, and will be unchanged for positions that are 1. - input_blocked_mask: (optional) int32 Tensor of shape [batch_size, - seq_length//block_size, block_size]. Same as from/to_mask, just - reshaped. - training: Boolean indicating whether the call is training or inference. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size]. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. - NotImplementedError: For unknown attention type. - """ - - with tf.compat.v1.variable_scope('attention'): - with tf.compat.v1.variable_scope('self') as sc: - attention_output = self.attn_layer( - layer_input, - layer_input, - attention_mask, - band_mask, - from_mask, - to_mask, - input_blocked_mask, - input_blocked_mask, - training, - scope=sc, - ) - - # Run a linear projection of `hidden_size` then add a residual - # with `layer_input`. 
- with tf.compat.v1.variable_scope('output'): - attention_output = self.projection_layer(attention_output) - attention_output = utils.dropout( - attention_output, self.hidden_dropout_prob, training - ) - attention_output = self.first_layer_norm( - attention_output + layer_input - ) - - # The activation is only applied to the "intermediate" hidden layer. - with tf.compat.v1.variable_scope('intermediate'): - intermediate_output = self.expand_layer(attention_output) - - # Down-project back to `hidden_size` then add the residual. - with tf.compat.v1.variable_scope('output'): - layer_output = self.contract_layer(intermediate_output) - layer_output = utils.dropout( - layer_output, self.hidden_dropout_prob, training - ) - layer_output = self.second_layer_norm( - layer_output + attention_output - ) - return layer_output - - -class EncoderStack(tf.compat.v1.layers.Layer): - """Transformer encoder stack.""" - - def __init__(self, params): - name = 'encoder' - super(EncoderStack, self).__init__(name=name) - self.params = params - - if params['norm_type'] == 'prenorm': - encoder_class = PrenormEncoderLayer - elif params['norm_type'] == 'postnorm': - encoder_class = PostnormEncoderLayer - else: - raise NotImplementedError( - 'Norm type {} is not implemented'.format(params['norm_type']) - ) - - # Encoder layers - self.encoder_layers = [ - encoder_class( # pylint: disable=g-complex-comprehension - self.params['attention_type'], - self.params['hidden_size'], - self.params['intermediate_size'], - utils.get_activation(self.params['hidden_act']), - self.params['attention_probs_dropout_prob'], - self.params['hidden_dropout_prob'], - self.params['initializer_range'], - self.params['num_attention_heads'], - self.params['num_rand_blocks'], - self.params['block_size'], - self.params['use_bias'], - seed=layer_idx, - name='layer_%d' % layer_idx, - ) - for layer_idx in range(self.params['num_hidden_layers']) - ] - - # Normalization layer - self.layer_norm = utils.NormLayer() - - @property - 
def trainable_weights(self): - tvar_list = ( - sum([layer.trainable_weights for layer in self.encoder_layers], []) - + self.layer_norm.trainable_weights - ) - self._trainable_weights = list({v.name: v for v in tvar_list}.values()) - return self._trainable_weights - - def call(self, encoder_inputs, encoder_inputs_mask, training=None): - """Return the output of the decoder layer stacks. - - Args: - encoder_inputs: tensor with shape - [batch_size, input_length, hidden_size] - encoder_inputs_mask: Mask for enccoder input. [batch_size, input_length] - training: Boolean indicating whether the call is training or inference. - - Returns: - Finaly layer encoder output. float tensor with shape - [batch_size, input_length, hidden_size] - """ - encoder_shape = utils.get_shape_list(encoder_inputs, expected_rank=3) - batch_size = encoder_shape[0] - encoder_length = encoder_shape[1] - - if self.params['attention_type'] == 'block_sparse': - # reshape and cast for blocking - encoder_block_size = self.params['block_size'] - blocked_encoder_mask = tf.reshape( - encoder_inputs_mask, - ( - batch_size, - encoder_length // encoder_block_size, - encoder_block_size, - ), - ) - encoder_from_mask = tf.reshape( - encoder_inputs_mask, (batch_size, 1, encoder_length, 1) - ) - encoder_to_mask = tf.reshape( - encoder_inputs_mask, (batch_size, 1, 1, encoder_length) - ) - - # create band padding - attention_mask = None - band_mask = attention.create_band_mask_from_inputs( - blocked_encoder_mask, blocked_encoder_mask - ) - - else: - blocked_encoder_mask = None - encoder_to_mask = None - encoder_from_mask = None - - attention_mask = attention.create_attention_mask_from_input_mask( - encoder_inputs_mask, encoder_inputs_mask - ) - band_mask = None - - # if self.params["use_gradient_checkpointing"]: - # encoder_layer = recompute_gradient(encoder_layer) - - if self.params['norm_type'] == 'postnorm': - encoder_inputs = self.layer_norm(encoder_inputs) - - layer_output = encoder_inputs - for layer in 
self.encoder_layers: - layer_output = layer( - layer_output, - attention_mask, - band_mask, - encoder_from_mask, - encoder_to_mask, - blocked_encoder_mask, - training, - ) - - if self.params['norm_type'] == 'prenorm': - layer_output = self.layer_norm(layer_output) - - return layer_output diff --git a/malaya/train/model/bigbird/modeling.py b/malaya/train/model/bigbird/modeling.py deleted file mode 100644 index 64d11ed9..00000000 --- a/malaya/train/model/bigbird/modeling.py +++ /dev/null @@ -1,516 +0,0 @@ -# Copyright 2020 The BigBird Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""The main BigBird model and related functions.""" - -import copy - -from . import decoder -from . import encoder -from . import utils -import tensorflow as tf - - -class BertModel(tf.compat.v1.layers.Layer): - """BERT model ("Bidirectional Encoder Representations from Transformers"). - - Example usage: - - ```python - # Already been converted into SentencePiece token ids - input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) - token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) - - params = utils.BigBirdConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) - - model = modeling.BertModel(params, train=True) - - _, pooled_output = model(input_ids=input_ids, token_type_ids=token_type_ids) - - label_embeddings = tf.get_variable(...) - logits = tf.matmul(pooled_output, label_embeddings) - ... 
- ``` - """ - - def __init__(self, params): - """Constructor for BertModel. - - Args: - params: `BigBirdConfig` dictionary. - """ - self.params = copy.deepcopy(params) - self.scope = params['scope'] - - with tf.compat.v1.variable_scope( - self.scope, reuse=tf.compat.v1.AUTO_REUSE - ) as vs: - self.embeder = utils.EmbeddingLayer( - vocab_size=self.params['vocab_size'], - emb_dim=self.params['hidden_size'], - initializer=utils.create_initializer( - self.params['initializer_range'] - ), - scale_emb=self.params['rescale_embedding'], - use_token_type=True, - num_token_types=self.params['type_vocab_size'], - use_position_embeddings=True, - max_position_embeddings=self.params[ - 'max_position_embeddings' - ], - dropout_prob=self.params['hidden_dropout_prob'], - ) - self.encoder = encoder.EncoderStack(self.params) - self.pooler = tf.compat.v1.layers.Dense( - units=self.params['hidden_size'], - activation=tf.tanh, - kernel_initializer=utils.create_initializer( - self.params['initializer_range'] - ), - name='pooler/dense', - ) - super(BertModel, self).__init__(name=self.scope, _scope=vs) - - @property - def trainable_weights(self): - tvar_list = ( - self.embeder.trainable_weights - + self.encoder.trainable_weights - + self.pooler.trainable_weights - ) - self._trainable_weights = list({v.name: v for v in tvar_list}.values()) - return self._trainable_weights - - def call(self, input_ids, token_type_ids=None, training=None): - """Constructor for BertModel. - - Args: - input_ids: int32 Tensor of shape [batch_size, seq_length]. - token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. - training: Boolean indicating whether the call is training or inference. - - Returns: - sequence_output: Tensor of shape [batch_size, seq_length, hidden_size] - pooled_output: Tensor of shape [batch_size, hidden_size] - - Raises: - ValueError: The config is invalid or one of the input tensor shapes - is invalid. 
- """ - if token_type_ids is None: - token_type_ids = tf.zeros_like(input_ids, dtype=tf.int32) - - # Perform embedding lookup on the word ids. - embedding_output = self.embeder( - input_ids, - self.params['max_encoder_length'], - token_type_ids=token_type_ids, - training=training, - ) - - # Generate mask. - input_mask = tf.where( - input_ids > 0, tf.ones_like(input_ids), tf.zeros_like(input_ids) - ) - - # Run the stacked transformer. - sequence_output = self.encoder(embedding_output, input_mask, training) - - # The "pooler" converts the encoded sequence tensor of shape - # [batch_size, seq_length, hidden_size] to a tensor of shape - # [batch_size, hidden_size]. This is necessary for segment-level - # (or segment-pair-level) classification tasks where we need a fixed - # dimensional representation of the segment. - first_token_tensor = sequence_output[:, 0, :] - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. We assume that this has been pre-trained - pooled_output = self.pooler(first_token_tensor) - - return sequence_output, pooled_output - - -class TransformerModel(tf.compat.v1.layers.Layer): - """Encoder-Decoder transformer model. - - Example usage: - - ```python - # Already been converted into SentencePiece token ids - input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) - target_ids = tf.constant([[43, 76, 38], [56, 8, 0]]) - - params = utils.BigBirdConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) - - model = modeling.TransformerModel(params, train=True) - - predictions, _ = model(input_ids=input_ids, target_ids=target_ids) - - log_probs, logits, pred_ids = predictions - ... - ``` - """ - - def __init__(self, params): - """Constructor for TransformerModel. - - Args: - params: `BigBirdConfig` dictionary. 
- """ - self.params = copy.deepcopy(params) - self.scope = params['scope'] - - with tf.compat.v1.variable_scope( - self.scope, reuse=tf.compat.v1.AUTO_REUSE - ) as vs: - self.embeder = utils.EmbeddingLayer( - vocab_size=self.params['vocab_size'], - emb_dim=self.params['hidden_size'], - initializer=utils.create_initializer( - self.params['initializer_range'] - ), - scale_emb=self.params['rescale_embedding'], - use_token_type=False, - num_token_types=None, - use_position_embeddings=True, - max_position_embeddings=self.params[ - 'max_position_embeddings' - ], - dropout_prob=self.params['hidden_dropout_prob'], - ) - self.encoder = encoder.EncoderStack(self.params) - self.decoder = decoder.DecoderStack(self.params) - super(TransformerModel, self).__init__( - name=self.scope, _scope=vs - ) - - @property - def trainable_weights(self): - tvar_list = ( - self.embeder.trainable_weights - + self.encoder.trainable_weights - + self.decoder.trainable_weights - ) - self._trainable_weights = list({v.name: v for v in tvar_list}.values()) - return self._trainable_weights - - def _encode(self, input_ids, training=None): - """Generate continuous representation for ids. - - Args: - input_ids: Int tensor with shape [batch_size, input_length]. - training: Boolean indicating whether the call is training or inference. - - Returns: - A float tensors of shape - [batch_size, input_length, hidden_size]. - """ - # Perform embedding lookup on the word ids. - input_embs = self.embeder( - input_ids, self.params['max_encoder_length'], training=training - ) - - # Generate mask. - input_mask = tf.where( - input_ids > 0, tf.ones_like(input_ids), tf.zeros_like(input_ids) - ) - - # Run the stacked transformer. 
- encoder_output = self.encoder(input_embs, input_mask, training) - - return encoder_output, input_mask - - def _get_start_token_ids(self, tensor_for_shape): - start_token_id = 2 - batch_size = utils.get_shape_list(tensor_for_shape)[0] - return tf.ones([batch_size], dtype=tf.int32) * start_token_id - - def get_inputs_from_targets(self, targets, start_token_ids): - """Converts target ids to input ids, i.e. adds and removes last.""" - length = tf.math.count_nonzero(targets, axis=1, dtype=tf.int32) - # Add start token ids. - inputs = tf.concat( - [tf.expand_dims(start_token_ids, axis=1), targets], 1 - ) - # Remove from the input. - mask = tf.sequence_mask( - length, self.params['max_decoder_length'] + 1, dtype=tf.int32 - ) - inputs = (mask * inputs)[:, :-1] - return inputs - - def _decode( - self, - target_ids, - target_mask, - start_token_ids, - encoder_output, - encoder_mask, - training, - ): - """Compute likelihood of target tokens under the model. - - Args: - target_ids: tensor with shape [batch_size, target_length, hidden_size] - target_mask: self-attention bias for decoder attention layer. [batch_size, - input_length] - start_token_ids: int32 tensor of shape [batch_size] for first decoder - input. - encoder_output: Continuous representation of input sequence. Float tensor - with shape [batch_size, input_length, hidden_size]. - encoder_mask: Float tensor with shape [batch_size, input_length]. - training: Boolean indicating whether the call is training or inference. - - Returns: - A dict containing the output ids, the output log-probs, the output logits. - """ - - # Prepare inputs to decoder layers by shifting targets, embedding ids, - # adding positional encoding and applying dropout. 
- input_ids = self.get_inputs_from_targets(target_ids, start_token_ids) - - input_embs = self.embeder( - input_ids, self.params['max_decoder_length'], training=training - ) - - outputs = self.decoder( - input_embs, - target_mask, - encoder_output, - encoder_mask, - training=training, - ) - - logits = self.embeder.linear(outputs) - output_ids = tf.cast(tf.argmax(logits, axis=-1), tf.int32) - - log_probs = -tf.nn.sparse_softmax_cross_entropy_with_logits( - labels=target_ids, logits=logits - ) - log_probs = tf.where( - target_ids > 0, log_probs, tf.zeros_like(log_probs, tf.float32) - ) - - return ( - tf.identity(log_probs, name='log_probs'), - tf.identity(logits, name='logits'), - tf.cast(output_ids, tf.int32, name='pred_ids'), - ) - - def _init_cache(self, batch_size): - """Initialize cache for decoding.""" - - max_decode_len = self.params['max_decoder_length'] - num_heads = self.params['num_attention_heads'] - head_size = int(self.params['hidden_size'] / num_heads) - - cache = {} - for layer in range(self.params['num_hidden_layers']): - cache['layer_%d' % layer] = { - 'k': tf.zeros( - [batch_size, num_heads, max_decode_len, head_size] - ), - 'v': tf.zeros( - [batch_size, num_heads, max_decode_len, head_size] - ), - } - return cache - - def _get_symbols_to_logits_fn(self, decoder_self_attention_mask): - """Returns a decoding function that calculates logits of the next tokens.""" - - max_decode_len = self.params['max_decoder_length'] - - def _symbols_to_logits_fn(target_ids, cache, i): - """Generate logits for next candidate IDs. - - Args: - target_ids: Current decoded sequences. int tensor with shape - [batch_size, i + 1] - cache: dictionary of values storing the encoder output, encoder-decoder - attention bias, and previous decoder attention values. 
- i: Loop index - - Returns: - Tuple of - (logits with shape [batch_size * beam_size, vocab_size], - updated cache values) - """ - decoder_input = tf.slice( - target_ids, - [0, tf.maximum(tf.cast(0, i.dtype), i - 1)], - [tf.shape(target_ids)[0], 1], - ) - self_attention_mask = tf.slice( - decoder_self_attention_mask, - [0, 0, i, 0], - [1, 1, 1, max_decode_len], - ) - - # Preprocess decoder input by getting embeddings and adding timing signal. - decoder_input = self.embeder( - decoder_input, 1, start_pos=i, training=False - ) - - decoder_output = self.decoder( - decoder_input, - self_attention_mask, - cache.get('encoder_output'), - cache.get('encoder_mask'), - cache=cache, - decode_i=i, - training=False, - ) - - logits = self.embeder.linear(decoder_output) - logits = tf.squeeze(logits, axis=[1]) - - return logits - - return _symbols_to_logits_fn - - def _predict( - self, - target_ids, - target_mask, - start_token_ids, - encoder_output, - encoder_mask, - ): - """Beam decode output tokens and probabilities. - - Args: - target_ids: tensor with shape [batch_size, target_length, hidden_size] - target_mask: self-attention bias for decoder attention layer. [batch_size, - input_length] - start_token_ids: int32 tensor of shape [batch_size] for first decoder - input. - encoder_output: Continuous representation of input sequence. Float - tensor with shape [batch_size, target_length, num_hidden_layers, - hidden_size] - encoder_mask: bias for encoder-decoder attention layer. [batch_size, - input_length] - - Returns: - A tuple of: - `log_probs`: Log-probs of output tokens. - `logits`: Logits of output tokens. - `pred_ids`: Predicted output sequence. - """ - batch_size = utils.get_shape_list(start_token_ids)[0] - end_token_id = 1 - - # One step logit function. - symbols_to_logits_fn = self._get_symbols_to_logits_fn(target_mask) - - # Create cache storing decoder attention values for each layer. 
- cache = self._init_cache(batch_size) - - if encoder_output is not None: - # Add encoder output and attention bias to the cache. - cache['encoder_output'] = encoder_output - cache['encoder_mask'] = encoder_mask - - decoded_ids = decoder.left2right_decode( - symbols_to_logits_fn, - start_token_ids, - cache, - batch_size, - self.params['max_decoder_length'], - vocab_size=self.params['vocab_size'], - beam_size=self.params['beam_size'], - beam_start=5, - beam_alpha=self.params['alpha'], - beam_min=0, - beam_max=-1, - eos_id=end_token_id, - ) - - # Get the top sequence for each batch element - output_ids = tf.cast(decoded_ids, tf.int32, name='pred_ids') - - # Calculate log probs for given sequence if available. - calc_ids = output_ids if target_ids is None else target_ids - output_log_probs, output_logits, _ = self._decode( - calc_ids, - target_mask, - start_token_ids, - encoder_output, - encoder_mask, - training=False, - ) - - return (output_log_probs, output_logits, output_ids) - - def _decode_and_predict( - self, target_ids, encoder_output, encoder_mask, training - ): - """Decodes a sequence given the input and the encoder. - - Args: - target_ids: tensor with shape [batch_size, target_length, hidden_size] - encoder_output: Continuous representation of input sequence. Float - tensor with shape [batch_size, target_length, num_hidden_layers, - hidden_size] - encoder_mask: bias for encoder-decoder attention layer. [batch_size, - input_length] - training: Boolean indicating whether the call is training or inference. - - Returns: - A tuple of: - `log_probs`: Log-probs of output tokens. - `logits`: Logits of output tokens. - `pred_ids`: Predicted output sequence. - """ - # Create initial set of IDs that will be passed into symbols_to_logits_fn. - start_token_ids = self._get_start_token_ids(encoder_output) - - # Create causal self-attention mask for decoder. 
- target_mask = decoder.create_self_attention_mask( - self.params['max_decoder_length'] - ) - - predictions = {} - if training: - predictions = self._decode( - target_ids, - target_mask, - start_token_ids, - encoder_output, - encoder_mask, - training=True, - ) - else: - predictions = self._predict( - target_ids, - target_mask, - start_token_ids, - encoder_output, - encoder_mask, - ) - - return predictions - - def call(self, input_ids, target_ids=None, training=None): - # Run the inputs through the encoder layer to map the symbol - # representations to continuous representations. - encoder_output, encoder_mask = self._encode(input_ids, training) - - # Decode. - predictions = self._decode_and_predict( - target_ids, encoder_output, encoder_mask, training - ) - - return predictions, encoder_output diff --git a/malaya/train/model/bigbird/optimization.py b/malaya/train/model/bigbird/optimization.py deleted file mode 100644 index c48b222c..00000000 --- a/malaya/train/model/bigbird/optimization.py +++ /dev/null @@ -1,182 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Functions and classes related to optimization (weight updates).""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import re -import tensorflow as tf - - -def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): - """Creates an optimizer training op.""" - global_step = tf.train.get_or_create_global_step() - - learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) - - # Implements linear decay of the learning rate. - learning_rate = tf.train.polynomial_decay( - learning_rate, - global_step, - num_train_steps, - end_learning_rate=0.0, - power=1.0, - cycle=False, - ) - - # Implements linear warmup. I.e., if global_step < num_warmup_steps, the - # learning rate will be `global_step/num_warmup_steps * init_lr`. - if num_warmup_steps: - global_steps_int = tf.cast(global_step, tf.int32) - warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) - - global_steps_float = tf.cast(global_steps_int, tf.float32) - warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) - - warmup_percent_done = global_steps_float / warmup_steps_float - warmup_learning_rate = init_lr * warmup_percent_done - - is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) - learning_rate = ( - 1.0 - is_warmup - ) * learning_rate + is_warmup * warmup_learning_rate - - # It is recommended that you use this optimizer for fine tuning, since this - # is how the model was trained (note that the Adam m/v variables are NOT - # loaded from init_checkpoint.) - optimizer = AdamWeightDecayOptimizer( - learning_rate=learning_rate, - weight_decay_rate=0.01, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'], - ) - - if use_tpu: - optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) - - tvars = tf.trainable_variables() - grads = tf.gradients(loss, tvars) - - # This is how the model was pre-trained. 
- (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) - - train_op = optimizer.apply_gradients( - zip(grads, tvars), global_step=global_step - ) - - # Normally the global step update is done inside of `apply_gradients`. - # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use - # a different optimizer, you should probably take this line out. - new_global_step = global_step + 1 - train_op = tf.group(train_op, [global_step.assign(new_global_step)]) - return train_op - - -class AdamWeightDecayOptimizer(tf.train.Optimizer): - """A basic Adam optimizer that includes "correct" L2 weight decay.""" - - def __init__( - self, - learning_rate, - weight_decay_rate=0.0, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=None, - name='AdamWeightDecayOptimizer', - ): - """Constructs a AdamWeightDecayOptimizer.""" - super(AdamWeightDecayOptimizer, self).__init__(False, name) - - self.learning_rate = learning_rate - self.weight_decay_rate = weight_decay_rate - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - self.exclude_from_weight_decay = exclude_from_weight_decay - - def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """See base class.""" - assignments = [] - for (grad, param) in grads_and_vars: - if grad is None or param is None: - continue - - param_name = self._get_variable_name(param.name) - - m = tf.get_variable( - name=param_name + '/adam_m', - shape=param.shape.as_list(), - dtype=tf.float32, - trainable=False, - initializer=tf.zeros_initializer(), - ) - v = tf.get_variable( - name=param_name + '/adam_v', - shape=param.shape.as_list(), - dtype=tf.float32, - trainable=False, - initializer=tf.zeros_initializer(), - ) - - # Standard Adam update. 
- next_m = tf.multiply(self.beta_1, m) + tf.multiply( - 1.0 - self.beta_1, grad - ) - next_v = tf.multiply(self.beta_2, v) + tf.multiply( - 1.0 - self.beta_2, tf.square(grad) - ) - - update = next_m / (tf.sqrt(next_v) + self.epsilon) - - # Just adding the square of the weights to the loss function is *not* - # the correct way of using L2 regularization/weight decay with Adam, - # since that will interact with the m and v parameters in strange ways. - # - # Instead we want ot decay the weights in a manner that doesn't interact - # with the m/v parameters. This is equivalent to adding the square - # of the weights to the loss with plain (non-momentum) SGD. - if self._do_use_weight_decay(param_name): - update += self.weight_decay_rate * param - - update_with_lr = self.learning_rate * update - - next_param = param - update_with_lr - - assignments.extend( - [param.assign(next_param), m.assign(next_m), v.assign(next_v)] - ) - return tf.group(*assignments, name=name) - - def _do_use_weight_decay(self, param_name): - """Whether to use L2 weight decay for `param_name`.""" - if not self.weight_decay_rate: - return False - if self.exclude_from_weight_decay: - for r in self.exclude_from_weight_decay: - if re.search(r, param_name) is not None: - return False - return True - - def _get_variable_name(self, param_name): - """Get the variable name from the tensor name.""" - m = re.match('^(.*):\\d+$', param_name) - if m is not None: - param_name = m.group(1) - return param_name diff --git a/malaya/train/model/bigbird/utils.py b/malaya/train/model/bigbird/utils.py deleted file mode 100644 index c41d7c97..00000000 --- a/malaya/train/model/bigbird/utils.py +++ /dev/null @@ -1,806 +0,0 @@ -# Copyright 2020 The BigBird Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Helper and utility functions.""" - -import re - -from absl import logging -import numpy as np -import tensorflow as tf - - -############################### SHAPE UTILS #################################### - - -def get_shape_list(tensor, expected_rank=None, name=None): - """Returns a list of the shape of tensor, preferring static dimensions. - - Args: - tensor: A tf.Tensor object to find the shape of. - expected_rank: (optional) int. The expected rank of `tensor`. If this is - specified and the `tensor` has a different rank, and exception will be - thrown. - name: Optional name of the tensor for the error message. - - Returns: - A list of dimensions of the shape of tensor. All static dimensions will - be returned as python integers, and dynamic dimensions will be returned - as tf.Tensor scalars. 
- """ - if not tf.executing_eagerly() and name is None: - name = tensor.name - - if expected_rank is not None: - assert_rank(tensor, expected_rank, name) - - shape = tensor.shape.as_list() - - non_static_indexes = [] - for (index, dim) in enumerate(shape): - if dim is None: - non_static_indexes.append(index) - - if not non_static_indexes: - return shape - - # assert False, 'Static shape not available for {}'.format(tensor) - - dyn_shape = tf.shape(tensor) - for index in non_static_indexes: - shape[index] = dyn_shape[index] - return shape - - -def reshape_to_matrix(input_tensor): - """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" - ndims = input_tensor.shape.ndims - if ndims < 2: - raise ValueError( - 'Input tensor must have at least rank 2. Shape = %s' - % (input_tensor.shape) - ) - if ndims == 2: - return input_tensor - - width = input_tensor.shape[-1] - output_tensor = tf.reshape(input_tensor, [-1, width]) - return output_tensor - - -def reshape_from_matrix(output_tensor, orig_shape_list): - """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" - if len(orig_shape_list) == 2: - return output_tensor - - output_shape = get_shape_list(output_tensor) - - orig_dims = orig_shape_list[0:-1] - width = output_shape[-1] - - return tf.reshape(output_tensor, orig_dims + [width]) - - -def assert_rank(tensor, expected_rank, name=None): - """Raises an exception if the tensor rank is not of the expected rank. - - Args: - tensor: A tf.Tensor to check the rank of. - expected_rank: Python integer or list of integers, expected rank. - name: Optional name of the tensor for the error message. - - Raises: - ValueError: If the expected shape doesn't match the actual shape. 
- """ - if not tf.executing_eagerly() and name is None: - name = tensor.name - - expected_rank_dict = {} - if isinstance(expected_rank, int): - expected_rank_dict[expected_rank] = True - else: - for x in expected_rank: - expected_rank_dict[x] = True - - actual_rank = tensor.shape.ndims - if actual_rank not in expected_rank_dict: - scope_name = tf.compat.v1.get_variable_scope().name - raise ValueError( - 'For the tensor `{}` in scope `{}`, the actual rank ' - '`{}` (shape = {}) is not equal to the expected rank `{}`'.format( - name, - scope_name, - actual_rank, - str(tensor.shape), - str(expected_rank), - ) - ) - - -############################### DENSE LAYERS ################################### - - -def create_initializer(initializer_range=0.02): - """Creates a `truncated_normal_initializer` with the given range.""" - return tf.compat.v1.truncated_normal_initializer(stddev=initializer_range) - - -class Dense3dLayer(tf.compat.v1.layers.Layer): - """A dense layer with 3D kernel.""" - - def __init__( - self, - num_attention_heads, - size_per_head, - initializer, - activation, - name=None, - head_first=False, - use_bias=True, - ): - """Constructor for dense layer with 3D kernel. - - Args: - num_attention_heads: The size of output dimension. - size_per_head: The size per attention head. - initializer: Kernel initializer. - activation: Actication function. - name: The name scope of this layer. - head_first: Whether to output head dimension before or after sequence dim. - use_bias: Whether the layer uses a bias vector. - """ - super(Dense3dLayer, self).__init__(name=name) - self.num_attention_heads = num_attention_heads - self.size_per_head = size_per_head - self.initializer = initializer - self.activation = activation - self.head_first = head_first - self.use_bias = use_bias - - self.w = None - self.b = None - - def call(self, input_tensor): - """Constructor for dense layer with 3D kernel. - - Args: - input_tensor: float Tensor of shape [batch, seq_length, hidden_size]. 
- - Returns: - float logits Tensor. - """ - last_dim = get_shape_list(input_tensor)[-1] - if self.w is None: - self.w = tf.compat.v1.get_variable( - name='kernel', - shape=[ - last_dim, - self.num_attention_heads * self.size_per_head, - ], - initializer=self.initializer, - ) - self.initializer = None - self._trainable_weights.append(self.w) - reshape_w = tf.reshape( - self.w, [last_dim, self.num_attention_heads, self.size_per_head] - ) - if self.head_first: - ret = tf.einsum('abc,cde->adbe', input_tensor, reshape_w) - else: - ret = tf.einsum('abc,cde->abde', input_tensor, reshape_w) - - if self.use_bias: - if self.b is None: - self.b = tf.compat.v1.get_variable( - name='bias', - shape=[self.num_attention_heads * self.size_per_head], - initializer=tf.zeros_initializer, - ) - self._trainable_weights.append(self.b) - if self.head_first: - reshape_b = tf.reshape( - self.b, [1, self.num_attention_heads, 1, self.size_per_head] - ) - else: - reshape_b = tf.reshape( - self.b, [self.num_attention_heads, self.size_per_head] - ) - ret += reshape_b - - if self.activation is not None: - return self.activation(ret) - else: - return ret - - -class Dense3dProjLayer(tf.compat.v1.layers.Layer): - """A dense layer with 3D kernel for projection.""" - - def __init__( - self, - num_attention_heads, - size_per_head, - initializer, - activation, - name=None, - use_bias=True, - ): - """Constructor for dense layer with 3D kernel for projection. - - Args: - num_attention_heads: The size of output dimension. - size_per_head: The size per attention head. - initializer: Kernel initializer. - activation: Actication function. - name: The name scope of this layer. - use_bias: Whether the layer uses a bias vector. 
- """ - super(Dense3dProjLayer, self).__init__(name=name) - self.num_attention_heads = num_attention_heads - self.size_per_head = size_per_head - self.initializer = initializer - self.activation = activation - self.use_bias = use_bias - - self.w = None - self.b = None - - def call(self, input_tensor): - """Constructor for dense layer with 3D kernel for projection. - - Args: - input_tensor: float Tensor of shape [batch,from_seq_length, - num_attention_heads, size_per_head]. - - Returns: - float logits Tensor. - """ - hidden_size = self.num_attention_heads * self.size_per_head - if self.w is None: - self.w = tf.compat.v1.get_variable( - name='kernel', - shape=[hidden_size, hidden_size], - initializer=self.initializer, - ) - self.initializer = None - self._trainable_weights.append(self.w) - reshape_w = tf.reshape( - self.w, [self.num_attention_heads, self.size_per_head, hidden_size] - ) - ret = tf.einsum('BFNH,NHD->BFD', input_tensor, reshape_w) - - if self.use_bias: - if self.b is None: - self.b = tf.compat.v1.get_variable( - name='bias', - shape=[hidden_size], - initializer=tf.zeros_initializer, - ) - self._trainable_weights.append(self.b) - ret += self.b - - if self.activation is not None: - return self.activation(ret) - else: - return ret - - -class Dense2dLayer(tf.compat.v1.layers.Layer): - """A dense layer with 2D kernel.""" - - def __init__( - self, output_size, initializer, activation, name=None, use_bias=True - ): - """Constructor for dense layer with 2D kernel. - - Args: - output_size: The size of output dimension. - initializer: Kernel initializer. - activation: Actication function. - name: The name scope of this layer. - use_bias: Whether the layer uses a bias vector. 
- """ - super(Dense2dLayer, self).__init__(name=name) - self.output_size = output_size - self.initializer = initializer - self.activation = activation - self.use_bias = use_bias - - self.w = None - self.b = None - - def call(self, input_tensor): - """Forward pass for dense layer with 2D kernel. - - Args: - input_tensor: Float tensor with rank 3. - - Returns: - float logits Tensor. - """ - if self.w is None: - last_dim = get_shape_list(input_tensor)[-1] - self.w = tf.compat.v1.get_variable( - name='kernel', - shape=[last_dim, self.output_size], - initializer=self.initializer, - ) - self.initializer = None - self._trainable_weights.append(self.w) - ret = tf.einsum('abc,cd->abd', input_tensor, self.w) - - if self.use_bias: - if self.b is None: - self.b = tf.compat.v1.get_variable( - name='bias', - shape=[self.output_size], - initializer=tf.zeros_initializer, - ) - self._trainable_weights.append(self.b) - ret += self.b - - if self.activation is not None: - return self.activation(ret) - else: - return ret - - -def gelu(x): - """Gaussian Error Linear Unit. - - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - Args: - x: float Tensor to perform activation. - - Returns: - `x` with the GELU activation applied. - """ - cdf = 0.5 * ( - 1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))) - ) - return x * cdf - - -def get_activation(activation_string): - """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. - - Args: - activation_string: String name of the activation function. - - Returns: - A Python function corresponding to the activation function. If - `activation_string` is None, empty, or "linear", this will return None. - If `activation_string` is not a string, it will return `activation_string`. - - Raises: - ValueError: The `activation_string` does not correspond to a known - activation. 
- """ - - # We assume that anything that"s not a string is already an activation - # function, so we just return it. - if not isinstance(activation_string, str): - return activation_string - - if not activation_string: - return None - - act = activation_string.lower() - if act == 'linear': - return None - elif act == 'relu': - return tf.nn.relu - elif act == 'gelu': - return gelu - elif act == 'tanh': - return tf.tanh - else: - raise ValueError('Unsupported activation: %s' % act) - - -########################## NORM & DROPOUT LAYERS ############################### - - -def dropout(input_tensor, dropout_prob, training=True): - """Perform dropout. - - Args: - input_tensor: float Tensor. - dropout_prob: Python float. The probability of dropping out a value (NOT of - *keeping* a dimension as in `tf.nn.dropout`). - training: Boolean indicating whether the call is training or inference. - - Returns: - A version of `input_tensor` with dropout applied. - """ - - if not training or dropout_prob is None or dropout_prob == 0.0: - return input_tensor - - output = tf.nn.dropout(input_tensor, rate=dropout_prob) - return output - - -class NormLayer(tf.compat.v1.layers.Layer): - """Replacement for contrib_layers.layer_norm.""" - - def __init__(self, name='LayerNorm'): - super(NormLayer, self).__init__(name=name) - self.beta = None - self.gamma = None - - def call(self, input_tensor): - inputs = tf.convert_to_tensor(input_tensor) - inputs_shape = get_shape_list(inputs) - inputs_rank = len(inputs_shape) - dtype = inputs.dtype.base_dtype - norm_axis = inputs_rank - 1 - params_shape = [inputs_shape[norm_axis]] - - # Allocate parameters for the beta and gamma of the normalization. 
- if self.beta is None: - self.beta = tf.compat.v1.get_variable( - 'beta', - shape=params_shape, - dtype=dtype, - initializer=tf.zeros_initializer(), - trainable=True, - ) - self._trainable_weights.append(self.beta) - if self.gamma is None: - self.gamma = tf.compat.v1.get_variable( - 'gamma', - shape=params_shape, - dtype=dtype, - initializer=tf.ones_initializer(), - trainable=True, - ) - self._trainable_weights.append(self.gamma) - # Compute norm along last axis - mean, variance = tf.nn.moments(inputs, [norm_axis], keepdims=True) - # Compute layer normalization using the batch_normalization function. - # Note that epsilon must be increased for float16 due to the limited - # representable range. - variance_epsilon = 1e-12 if dtype != tf.float16 else 1e-3 - outputs = tf.nn.batch_normalization( - inputs, - mean, - variance, - offset=self.beta, - scale=self.gamma, - variance_epsilon=variance_epsilon, - ) - tf.reshape(outputs, inputs_shape) - # outputs.set_shape(inputs_shape) - return outputs - - -############################# EMBEDDING LAYER ################################## - - -class EmbeddingLayer(tf.compat.v1.layers.Layer): - """An embedding layer.""" - - def __init__( - self, - vocab_size, - emb_dim, - initializer, - scale_emb=False, - use_token_type=False, - num_token_types=16, - use_position_embeddings=True, - max_position_embeddings=4096, - dropout_prob=0.0, - name='embeddings', - ): - super(EmbeddingLayer, self).__init__(name=name) - self.vocab_size = vocab_size - self.emb_dim = emb_dim - self.scale_emb = scale_emb - self.num_token_types = num_token_types - self.max_position_embeddings = max_position_embeddings - self.dropout_prob = dropout_prob - - with tf.compat.v1.variable_scope(name): - self.word_embeddings = tf.compat.v1.get_variable( - 'word_embeddings', - [vocab_size, emb_dim], - dtype=tf.float32, - initializer=initializer, - ) - self._trainable_weights.append(self.word_embeddings) - - if use_token_type: - self.token_type_table = 
tf.compat.v1.get_variable( - 'token_type_embeddings', - [num_token_types, emb_dim], - dtype=tf.float32, - initializer=initializer, - ) - self._trainable_weights.append(self.token_type_table) - else: - self.token_type_table = None - - if use_position_embeddings: - self.position_embeddings = tf.compat.v1.get_variable( - 'position_embeddings', - [max_position_embeddings, emb_dim], - dtype=tf.float32, - initializer=initializer, - ) - self._trainable_weights.append(self.position_embeddings) - else: - self.position_embeddings = None - - def call( - self, - input_ids, - seq_length, - start_pos=0, - token_type_ids=None, - training=None, - ): - if input_ids is None: - return None - - # subtoken embedding - output = tf.nn.embedding_lookup( - params=self.word_embeddings, ids=input_ids - ) - - if self.scale_emb: - output = output * self.emb_dim ** 0.5 - - if self.token_type_table is not None: - # This vocab will be small so we always do one-hot here, since it is - # always faster for a small vocabulary. - one_hot_ids = tf.one_hot( - token_type_ids, depth=self.num_token_types - ) - token_type_embeddings = tf.tensordot( - one_hot_ids, self.token_type_table, 1 - ) - output += token_type_embeddings - - if self.position_embeddings is not None: - # assert_op = tf.compat.v1.assert_less_equal( - # start_pos + seq_length, self.max_position_embeddings) - # with tf.control_dependencies([assert_op]): - # So `position_embeddings` is effectively an embedding table for - # position [0, 1, 2, ..., max_position_embeddings-1], and the current - # sequence has positions [0, 1, 2, ... seq_length-1], so we can just - # perform a slice. - position_embeddings = tf.slice( - self.position_embeddings, - [start_pos, 0], - [seq_length, self.emb_dim], - ) - output += tf.expand_dims(position_embeddings, axis=0) - - if training and self.dropout_prob > 0: - output = tf.nn.dropout(output, rate=self.dropout_prob) - return output - - def linear(self, x): - """Computes logits by running x through a linear layer. 
- - Args: - x: A float32 tensor with shape [..., hidden_size] - Returns: - float32 tensor with shape [..., vocab_size]. - """ - with tf.compat.v1.name_scope('presoftmax_linear'): - logits = tf.tensordot(x, self.word_embeddings, [[-1], [1]]) - return logits - - -########################## TPU/CHECKPOINT UTILS ################################ - - -def get_estimator(config, model_fn, keep_checkpoint_max=10): - """Create TPUEstimator object for given config and model_fn.""" - tpu_cluster_resolver = None - if config['use_tpu'] and config['tpu_name']: - tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( - config['tpu_name'], - zone=config['tpu_zone'], - project=config['gcp_project'], - ) - - # Batch size book-keeping - # Estimators handle batch sizes differently among GPUs and TPUs - # GPU: Estimator needs per core batch size - # TPU: Estimator needs total batch size, i.e. num_cores * per core batch size - config_train_batch_size = config['train_batch_size'] # For estimator - config_eval_batch_size = config['eval_batch_size'] # For estimator - effective_train_batch_size = config['train_batch_size'] # For human - effective_eval_batch_size = config['eval_batch_size'] # For human - if config['use_tpu']: - sliced_eval_mode = tf.compat.v1.estimator.tpu.InputPipelineConfig.SLICED - distribute_strategy = None - config_train_batch_size *= config['num_tpu_cores'] - config_eval_batch_size *= config['num_tpu_cores'] - effective_train_batch_size = config_train_batch_size - effective_eval_batch_size = config_eval_batch_size - else: - sliced_eval_mode = ( - tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V1 - ) - distribute_strategy = tf.distribute.MirroredStrategy(devices=None) - effective_train_batch_size *= distribute_strategy.num_replicas_in_sync - # effective_eval_batch_size *= distribute_strategy.num_replicas_in_sync - - is_per_host = tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2 - run_config = tf.compat.v1.estimator.tpu.RunConfig( - 
cluster=tpu_cluster_resolver, - master=config['master'], - model_dir=config['output_dir'], - save_checkpoints_steps=config['save_checkpoints_steps'], - keep_checkpoint_max=keep_checkpoint_max, - train_distribute=distribute_strategy, - tpu_config=tf.compat.v1.estimator.tpu.TPUConfig( - tpu_job_name=config['tpu_job_name'], - iterations_per_loop=config['iterations_per_loop'], - num_shards=config['num_tpu_cores'], - per_host_input_for_training=is_per_host, - eval_training_input_configuration=sliced_eval_mode, - ), - ) - - if config['init_checkpoint']: - ckpt_var_list = tf.compat.v1.train.list_variables( - config['init_checkpoint'] - ) - ckpt_var_list = { - name: shape - for name, shape in ckpt_var_list - if not re.findall('(Adam|Adafactor|global_step)', name) - } - vars_to_warm_start = '({})'.format('|'.join(ckpt_var_list.keys())) - warm_start_settings = tf.estimator.WarmStartSettings( - ckpt_to_initialize_from=config['init_checkpoint'], - vars_to_warm_start=vars_to_warm_start, - ) - else: - ckpt_var_list = {} - warm_start_settings = None - config['ckpt_var_list'] = ckpt_var_list - - # If no TPU, this will fall back to normal Estimator on CPU or GPU. 
- estimator = tf.compat.v1.estimator.tpu.TPUEstimator( - use_tpu=config['use_tpu'], - model_fn=model_fn, - config=run_config, - train_batch_size=config_train_batch_size, - eval_batch_size=config_eval_batch_size, - warm_start_from=warm_start_settings, - ) - - # assign batch sizes - estimator.train_batch_size = effective_train_batch_size - estimator.eval_batch_size = effective_eval_batch_size - - return estimator - - -def log_variables(variables, ckpt_var_list): - """Log trainable variables.""" - logging.info('**** Trainable Variables ****') - - model_var_list = {var.name: var.get_shape().as_list() for var in variables} - num_params = sum(np.prod(shape) for shape in model_var_list.values()) - length = max(len(name) for name in model_var_list) + 2 - line = '{{:<{}}}{{:<13}}{{}}'.format(length) - - logging.info( - 'The model has {} trainable variables ' - '({:,} parameters):\n'.format(len(model_var_list), num_params) - ) - logging.info(line.format('Name', 'Initialized', 'Shape')) - logging.info(line.format('----', '-----------', '-----')) - - ckpt_var_list = ckpt_var_list.copy() - for name, shape in model_var_list.items(): - name = name.split(':')[0] - if name in ckpt_var_list: - warm_started = 'from ckpt' - del ckpt_var_list[name] - else: - warm_started = 'random' - logging.info(line.format(name, warm_started, shape)) - - if ckpt_var_list: - logging.warning( - 'The warm start checkpoint contained %d variables that were not used ' - 'for the model:\n', - len(ckpt_var_list), - ) - for name, shape in ckpt_var_list.items(): - logging.warning(line.format(name, 'not used', shape)) - - -def add_scalars_to_summary(summary_dir, scalar_tensors_dict): - """Creates a host_call function that writes summaries on TPU.""" - - # All tensors outfed from TPU should preserve batch size dimension. 
- scalar_tensors_dict = { - k: tf.reshape(v, [1]) for k, v in scalar_tensors_dict.items() - } - - def host_call_fn(**kwargs): - writer = tf.summary.create_file_writer(summary_dir, max_queue=1000) - always_record = tf.summary.record_if(True) - with writer.as_default(), always_record: - for name, scalar in kwargs.items(): - tf.summary.scalar( - name, - tf.reduce_mean(scalar), - tf.compat.v1.train.get_or_create_global_step(), - ) - return tf.compat.v1.summary.all_v2_summary_ops() - - return host_call_fn, scalar_tensors_dict - - -########################## DEFAULT CONFIG UTILS ################################ - - -def get_default_config(): - """Default values for BigBird.""" - - default_config = { - # transformer basic configs - 'attention_probs_dropout_prob': 0.1, - 'hidden_act': 'gelu', - 'hidden_dropout_prob': 0.1, - 'hidden_size': 768, - 'initializer_range': 0.02, - 'intermediate_size': 3072, - 'max_position_embeddings': 4096, - 'num_attention_heads': 12, - 'num_hidden_layers': 12, - 'type_vocab_size': 2, - 'use_bias': True, - 'rescale_embedding': False, - 'scope': 'bert', - # sparse mask configs - 'attention_type': 'block_sparse', - 'norm_type': 'postnorm', - 'block_size': 16, - 'num_rand_blocks': 3, - # common bert configs - 'max_encoder_length': 1024, - 'max_decoder_length': 64, - 'couple_encoder_decoder': False, - 'beam_size': 5, - 'alpha': 0.7, - 'label_smoothing': 0.1, - 'weight_decay_rate': 0.01, - 'optimizer_beta1': 0.9, - 'optimizer_beta2': 0.999, - 'optimizer_epsilon': 1e-6, - # TPU settings - 'use_tpu': True, - 'tpu_name': None, - 'tpu_zone': None, - 'tpu_job_name': None, - 'gcp_project': None, - 'master': None, - 'num_tpu_cores': 8, - 'iterations_per_loop': '1000', - } - - return default_config diff --git a/malaya/train/model/pegasus/__init__.py b/malaya/train/model/pegasus/__init__.py deleted file mode 100644 index 792d6005..00000000 --- a/malaya/train/model/pegasus/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# diff --git 
a/malaya/train/model/pegasus/base.py b/malaya/train/model/pegasus/base.py deleted file mode 100644 index a1c6a2a3..00000000 --- a/malaya/train/model/pegasus/base.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2020 The PEGASUS Authors.. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Base model definition.""" - -import abc - - -class BaseModel(object): # pytype: disable=ignored-metaclass - """Base Abstract Class of All Models.""" - - __metaclass__ = abc.ABCMeta - - @abc.abstractmethod - def __init__(self, *args): - """Construct model class with parameters.""" - pass - - @abc.abstractmethod - def __call__(self, features, training): - """Build the class graph in training/evaluation mode. - - Args: - features: dictionary of tensors. - training: python boolean indicate of whether model is training - - Returns: - tuple of loss and outputs. loss is a scalar tensor and outputs is a - dictionary of tensors. - """ - loss = 0 - outputs = {} - return loss, outputs - - def predict(self, features, *args, **kwargs): - """Build the class graph in prediction model. - - Args: - features: dictionary of tensors. - *args: additional args. - **kwargs: additional keyword args. - - Returns: - dictionary of tensors. 
- """ - del args, kwargs - _, outputs = self.__call__(features, False) - return outputs diff --git a/malaya/train/model/pegasus/layers/__init__.py b/malaya/train/model/pegasus/layers/__init__.py deleted file mode 100644 index 779c85e7..00000000 --- a/malaya/train/model/pegasus/layers/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2020 The PEGASUS Authors.. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/malaya/train/model/pegasus/layers/attention.py b/malaya/train/model/pegasus/layers/attention.py deleted file mode 100644 index 79aaca3f..00000000 --- a/malaya/train/model/pegasus/layers/attention.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2020 The PEGASUS Authors.. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Attention layers. - -Notations: - B: batch_size, I: max_input_len, M: max_memory_len, D: hidden_size, - H: number of heads, Dh: hidden_size per head, Di: input dimension. 
-""" -# -# pylint: disable=invalid-name - -import tensorflow as tf - - -def split_heads(tensor_BxIxD, num_heads): - B = tf.shape(tensor_BxIxD)[0] - I = tf.shape(tensor_BxIxD)[1] - D = tf.shape(tensor_BxIxD)[2] - - tensor_BxIxHxD = tf.reshape(tensor_BxIxD, [B, I, num_heads, D // num_heads]) - tensor_BxHxIxD = tf.transpose(tensor_BxIxHxD, [0, 2, 1, 3]) - return tensor_BxHxIxD - - -class Attention(object): - """Multihead scaled dot product attention.""" - - def __init__(self, hidden_size, num_heads, attention_dropout): - if hidden_size % num_heads != 0: - raise ValueError( - 'Number of attention heads must divide hidden size' - ) - - self._q_layer = tf.layers.Dense( - hidden_size, use_bias=False, name='query' - ) - self._k_layer = tf.layers.Dense( - hidden_size, use_bias=False, name='key' - ) - self._v_layer = tf.layers.Dense( - hidden_size, use_bias=False, name='value' - ) - self._output_layer = tf.layers.Dense( - hidden_size, use_bias=False, name='output/dense' - ) - self._num_heads = num_heads - self._hidden_size = hidden_size - self._attention_dropout = attention_dropout - - def __call__( - self, - input_BxIxDi, - memory_BxMxDi, - bias_BxIxM, - training, - cache=None, - decode_i=None, - ): - - B = tf.shape(input_BxIxDi)[0] - I = tf.shape(input_BxIxDi)[1] - - M = tf.shape(memory_BxMxDi)[1] - H, D = self._num_heads, self._hidden_size - dtype = memory_BxMxDi.dtype - - q_BxHxIxDh = split_heads(self._q_layer(input_BxIxDi), H) - q_BxHxIxDh *= (D // H) ** -0.5 - k_BxHxMxDh = split_heads(self._k_layer(memory_BxMxDi), H) - v_BxHxMxDh = split_heads(self._v_layer(memory_BxMxDi), H) - - # cache saves previous activations before time decode_i during TPU decoding. 
- if cache is not None and decode_i is not None: - M = tf.shape(cache['k'])[2] - indices_1x1xMx1 = tf.reshape( - tf.one_hot(decode_i, M, dtype=dtype), [1, 1, M, 1] - ) - k_BxHxMxDh = cache['k'] + k_BxHxMxDh * indices_1x1xMx1 - v_BxHxMxDh = cache['v'] + v_BxHxMxDh * indices_1x1xMx1 - cache['k'] = k_BxHxMxDh - cache['v'] = v_BxHxMxDh - bias_BxHxIxM = tf.expand_dims(bias_BxIxM, axis=1) - logits_BxHxIxM = ( - tf.matmul(q_BxHxIxDh, k_BxHxMxDh, transpose_b=True) + bias_BxHxIxM - ) - alignment_BxHxIxM = tf.nn.softmax(logits_BxHxIxM) - if training: - alignment_BxHxIxM = tf.compat.v2.nn.dropout( - alignment_BxHxIxM, - rate=self._attention_dropout, - noise_shape=[1, 1, I, M], - ) - outputs_BxHxIxDh = tf.matmul(alignment_BxHxIxM, v_BxHxMxDh) - outputs_BxIxD = tf.reshape( - tf.transpose(outputs_BxHxIxDh, [0, 2, 1, 3]), [B, I, D] - ) - outputs_BxIxD = self._output_layer(outputs_BxIxD) - return outputs_BxIxD - - -class SelfAttention(Attention): - """Multihead scaled dot product self-attention.""" - - def __call__(self, x, bias, training, cache=None, decode_i=None): - return super(SelfAttention, self).__call__( - x, x, bias, training, cache=cache, decode_i=decode_i - ) - - -def ids_to_bias(ids_BxI, dtype=tf.float32, padding_id=0): - """Convert ids to attention bias for attention.""" - pad_BxI = tf.cast(tf.equal(ids_BxI, padding_id), dtype) - bias_Bx1xI = tf.expand_dims(pad_BxI * dtype.min, axis=1) - return bias_Bx1xI - - -def upper_triangle_bias(D, dtype=tf.float32): - """Create a upper triangle matrix for decoding bias.""" - upper_triangle_DxD = 1 - tf.matrix_band_part( - tf.ones([D, D], dtype=dtype), -1, 0 - ) - tensor_1xDxD = tf.expand_dims(upper_triangle_DxD * dtype.min, axis=0) - return tensor_1xDxD diff --git a/malaya/train/model/pegasus/layers/beam_search.py b/malaya/train/model/pegasus/layers/beam_search.py deleted file mode 100644 index b502bcdf..00000000 --- a/malaya/train/model/pegasus/layers/beam_search.py +++ /dev/null @@ -1,301 +0,0 @@ -# Copyright 2020 The PEGASUS 
Authors.. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Beam search. - -This beam search implementation is designed for TPU usage only and prefers -flexibility over efficiency. Transformer attention caching is not enabled yet. - -Mostly follows implementation in T2T. Several difference to pure beamsearch: -1. has finished and alive seqs, use 2 * beam_size to grow alive seqs, - which makes beam_size=1 doesn't equal greedy. -2. prefers finished seq over alive seqs. -3. prefers lower indices when equal probability (though unlikely). -4. with custom length normalization and constraint. - -Notations: - B: batch_size, M: beam_size, T: max_decode_len, V: vocab_size, U: undefined -""" -# -# pylint: disable=invalid-name - -import tensorflow as tf - - -def length_normalization(start, alpha, min_len, max_len, out_of_range_penalty): - r"""Create length normalization function. - - Combines length penalty from https://arxiv.org/abs/1609.08144, - and length constraint from https://www.aclweb.org/anthology/W18-2706.pdf. - - scores = \sum_j log(P_j) / ((start + lengths)/(1 + start))**alpha - + out_of_range_penalty * (length > max_len or length < min_len) - - Args: - start: int, length normalization start offset. - alpha: float, [0, 1.0], length normalization power. - min_len: int, minimum decode length. - max_len: int, maximum decode lengths. - out_of_range_penalty: float, penalty for lengths outside min len and max - len. 
Use a negative number that penalize out of range decodes, does hard - constraint if set to -inf. - - Returns: - fn(log_probs_BxM, length)->scores_BxM: a function to normalize sum log - probabilities of sequence with current decoding lengths. - """ - - def length_norm_fn(log_probs_BxM, length_int): - """Normalize sum log probabilities given a sequence length.""" - dtype = log_probs_BxM.dtype - norm_flt = tf.pow( - ((start + tf.cast(length_int, dtype)) / (1.0 + start)), alpha - ) - log_probs_BxM /= norm_flt - too_short_bool = tf.less(length_int, min_len) - too_long_bool = tf.logical_and( - tf.greater(length_int, max_len), max_len > 0 - ) - out_of_range_bool = tf.logical_or(too_long_bool, too_short_bool) - log_probs_BxM += out_of_range_penalty * tf.cast( - out_of_range_bool, dtype - ) - return log_probs_BxM - - return length_norm_fn - - -def beam_search( - symbols_to_logits_fn, - init_seq_BxT, - initial_cache_BxU, - vocab_size, - beam_size, - length_norm_fn, - eos_id=1, -): - """Beam search. - - Args: - symbols_to_logits_fn: fn(seq_BxT, cache_BxU, i) -> (logits_BxV, cache_BxU) - init_seq_BxT: initial sequence ids. - initial_cache_BxU: dictionary of tensors with shape BxU. - vocab_size: vocabulary size. - beam_size: beam size. - length_norm_fn: length normalization function. - eos_id: end of sequence. - - Returns: - Tuple of (beams_BxMxT, scores_BxM). Beam searched sequences and scores. 
- """ - B = tf.shape(init_seq_BxT)[0] - T = tf.shape(init_seq_BxT)[1] - M, V = beam_size, vocab_size - dtype = tf.float32 - int_dtype = init_seq_BxT.dtype - - def _loop_body( - i, - alive_seq_BxMxT, - alive_log_probs_BxM, - alive_cache_BxMxU, - finished_seq_BxMxT, - finished_scores_BxM, - ): - print('i', i) - """Beam search loop body.""" - # Decode one step with beam - logits_BMxV, cache_BMxU = symbols_to_logits_fn( - _flatten_beam_dim(alive_seq_BxMxT), - tf.nest.map_structure(_flatten_beam_dim, alive_cache_BxMxU), - i, - ) - logits_BxMxV = _unflatten_beam_dim(logits_BMxV, M) - new_cache_BxMxU = tf.nest.map_structure( - lambda t: _unflatten_beam_dim(t, M), cache_BMxU - ) - - # select top 2 * beam_size and fill alive and finished. - log_probs_BxMxV = logits_BxMxV - tf.reduce_logsumexp( - logits_BxMxV, axis=2, keepdims=True - ) - log_probs_BxMxV += tf.expand_dims(alive_log_probs_BxM, axis=2) - log_probs_BxMV = tf.reshape(log_probs_BxMxV, [B, -1]) - new_log_probs_Bx2M, topk_indices_Bx2M = tf.nn.top_k( - log_probs_BxMV, k=2 * M - ) - topk_beam_Bx2M = topk_indices_Bx2M // V - topk_seq_Bx2MxT, new_cache_Bx2MxU = _gather_nested( - [alive_seq_BxMxT, new_cache_BxMxU], topk_beam_Bx2M - ) - topk_ids_Bx2M = topk_indices_Bx2M % V - new_seq_Bx2MxT = _update_i(topk_seq_Bx2MxT, topk_ids_Bx2M, i) - new_finished_flags_Bx2M = tf.cast( - tf.reduce_any(tf.equal(new_seq_Bx2MxT, eos_id), axis=-1), dtype - ) - - # get new alive - _, topk_alive_indices_BxM = tf.nn.top_k( - new_log_probs_Bx2M + new_finished_flags_Bx2M * dtype.min, k=M - ) - ( - alive_seq_BxMxT, - alive_log_probs_BxM, - alive_cache_BxMxU, - ) = _gather_nested( - [new_seq_Bx2MxT, new_log_probs_Bx2M, new_cache_Bx2MxU], - topk_alive_indices_BxM, - ) - - # get new finished - new_scores_Bx2M = length_norm_fn(new_log_probs_Bx2M, i + 1) - new_scores_Bx2M += (1 - new_finished_flags_Bx2M) * dtype.min - finished_seq_Bx3MxT = tf.concat( - [finished_seq_BxMxT, new_seq_Bx2MxT], axis=1 - ) - finished_scores_Bx3M = tf.concat( - 
[finished_scores_BxM, new_scores_Bx2M], axis=1 - ) - _, topk_finished_indices_BxM = tf.nn.top_k(finished_scores_Bx3M, k=M) - (finished_seq_BxMxT, finished_scores_BxM) = _gather_nested( - [finished_seq_Bx3MxT, finished_scores_Bx3M], - topk_finished_indices_BxM, - ) - - return [ - i + 1, - alive_seq_BxMxT, - alive_log_probs_BxM, - alive_cache_BxMxU, - finished_seq_BxMxT, - finished_scores_BxM, - ] - - # initialize. - init_i = tf.constant(0, dtype=int_dtype) - init_alive_seq_BxMxT = _expand_to_beam_size(init_seq_BxT, M) - log_probs_1xM = tf.constant([[0.0] + [dtype.min] * (M - 1)], dtype=dtype) - init_alive_log_probs_BxM = tf.tile(log_probs_1xM, [B, 1]) - init_alive_cache_BxMxU = tf.nest.map_structure( - lambda t: _expand_to_beam_size(t, M), initial_cache_BxU - ) - init_finished_seq_BxMxT = tf.zeros( - tf.shape(init_alive_seq_BxMxT), int_dtype - ) - init_finished_scores_BxM = tf.zeros([B, M], dtype=dtype) + dtype.min - - T_shape = init_seq_BxT.shape[1] - - # run loop. - ( - _, - final_alive_seq_BxMxT, - final_alive_scores_BxM, - _, - final_finished_seq_BxMxT, - final_finished_scores_BxM, - ) = tf.while_loop( - lambda *args: True, # Always do T iterations - _loop_body, - loop_vars=[ - init_i, - init_alive_seq_BxMxT, - init_alive_log_probs_BxM, - init_alive_cache_BxMxU, - init_finished_seq_BxMxT, - init_finished_scores_BxM, - ], - parallel_iterations=1, - back_prop=False, - maximum_iterations=T_shape, - ) - - # process finished. 
- final_finished_flag_BxMx1 = tf.reduce_any( - tf.equal(final_finished_seq_BxMxT, eos_id), axis=-1, keepdims=True - ) - final_seq_BxMxT = tf.where( - tf.tile(final_finished_flag_BxMx1, [1, 1, T]), - final_finished_seq_BxMxT, - final_alive_seq_BxMxT, - ) - final_scores_BxM = tf.where( - tf.squeeze(final_finished_flag_BxMx1, axis=-1), - final_finished_scores_BxM, - final_alive_scores_BxM, - ) - return final_seq_BxMxT, final_scores_BxM - - -def _update_i(tensor_BxNxT, updates_BxN, i): - B = tf.shape(tensor_BxNxT)[0] - N = tf.shape(tensor_BxNxT)[1] - T = tf.shape(tensor_BxNxT)[2] - B = tf.cast(B, tf.int64) - N = tf.cast(N, tf.int64) - T = tf.cast(T, tf.int64) - tensor_BNxT = tf.reshape(tensor_BxNxT, [-1, T]) - updates_BN = tf.reshape(updates_BxN, [-1]) - batch_BN = tf.range(B * N, dtype=tf.int64) - i_BN = tf.fill([B * N], tf.cast(i, tf.int64)) - ind_BNx2 = tf.stack([batch_BN, i_BN], axis=-1) - tensor_BNxT = tf.tensor_scatter_nd_update(tensor_BNxT, ind_BNx2, updates_BN) - return tf.reshape(tensor_BNxT, [B, N, T]) - - -def _expand_to_beam_size(tensor_BxU, beam_size): - tensor_Bx1xU = tf.expand_dims(tensor_BxU, axis=1) - tile_dims = [1] * tensor_Bx1xU.shape.ndims - tile_dims[1] = beam_size - tensor_BxMxU = tf.tile(tensor_Bx1xU, tile_dims) - return tensor_BxMxU - - -def _shape_list(tensor): - """Return a list of the tensor's shape, and ensure no None values in list.""" - # Get statically known shape (may contain None's for unknown dimensions) - shape = tensor.get_shape().as_list() - - # Ensure that the shape values are not None - dynamic_shape = tf.shape(tensor) - for i in range(len(shape)): # pylint: disable=consider-using-enumerate - if shape[i] is None: - shape[i] = dynamic_shape[i] - return shape - - -def _flatten_beam_dim(tensor_BxMxU): - # shape = tensor_BxMxU.shape.as_list() - - shape = _shape_list(tensor_BxMxU) - - tensor_BMxU = tf.reshape(tensor_BxMxU, [shape[0] * shape[1]] + shape[2:]) - return tensor_BMxU - - -def _unflatten_beam_dim(tensor_BMxU, M): - # shape = 
tensor_BMxU.shape.as_list() - shape = _shape_list(tensor_BMxU) - tensor_BxMxU = tf.reshape(tensor_BMxU, [shape[0] // M, M] + shape[1:]) - return tensor_BxMxU - - -def _gather_nested(nested_BxMxU, indices_BxN): - def _gather_beam(tensor_BxMxU): - tensor_BxNxU = tf.gather( - tensor_BxMxU, indices_BxN, batch_dims=1, axis=1 - ) - return tensor_BxNxU - - return tf.nest.map_structure(_gather_beam, nested_BxMxU) diff --git a/malaya/train/model/pegasus/layers/decoding.py b/malaya/train/model/pegasus/layers/decoding.py deleted file mode 100644 index 89533cc1..00000000 --- a/malaya/train/model/pegasus/layers/decoding.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2020 The PEGASUS Authors.. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Library for generative model decoding.""" -# -# pylint: disable=invalid-name - -import tensorflow as tf - -from . import beam_search - -EOS_ID = 1 - - -def process_logits(logits_BxN, top_k=0, top_p=0.0, temperature=0.0): - """Process logits using gumbel noise and mask top_k or top_p. - - The downstream task can perform probability sampling using gumbel-max trick - (taking the argmax of processed logits) (Statistical theory of extreme values - and some practical applications: a series of lectures. 1954). 
- Use cases: - greedy: top_k=0, top_p=0.0, temperature=0.0 - random sampling: top_k=0, top_p=0.0, temperature=1.0 - topk sampling: top_k=k, top_p=0.0, temperature=1.0 - nucleus sampling: top_k=0, top_p=p, temperature=1.0 - random sampling biased toward greedy: top_k=0, top_p=0.0, temperature=0.5 - Notations: - B: batch_size, N: number of logits, K: topk value. - Args: - logits_BxN: tensor of [batch_size vocab_size] - top_k: k in top_k sampling. - top_p: probability in necleus sampling. - temperature: gumbel noise sampling temperature. - - Returns: - logits: processed logits which is original logits add gumbel noise and - values outside top_k and top_p set to -inf. - """ - if top_k > 0 and top_p > 0: - raise ValueError( - 'Only one of the top_k and nucleus sampling should be specified.' - ) - - if top_k > 0: - top_values_BxK, _ = tf.math.top_k(logits_BxN, k=top_k, sorted=False) - min_value_Bx1 = tf.reduce_min( - top_values_BxK, axis=-1, keepdims=True - ) - mask_BxN = tf.cast(tf.less(logits_BxN, min_value_Bx1), logits_BxN.dtype) - logits_BxN -= mask_BxN * logits_BxN.dtype.max - - if top_p > 0: - sort_indices_BxN = tf.argsort( - logits_BxN, axis=-1, direction='DESCENDING' - ) - probs_BxN = tf.gather( - tf.nn.softmax(logits_BxN), sort_indices_BxN, batch_dims=1 - ) - cumprobs_BxN = tf.cumsum(probs_BxN, axis=-1, exclusive=True) - # The top 1 candidate always will not be masked. - # This way ensures at least 1 indices will be selected. 
- sort_mask_BxN = tf.cast( - tf.greater(cumprobs_BxN, top_p), logits_BxN.dtype - ) - batch_indices_BxN = tf.tile( - tf.expand_dims(tf.range(tf.shape(logits_BxN)[0]), axis=-1), - [1, tf.shape(logits_BxN)[1]], - ) - top_p_mask_BxN = tf.scatter_nd( - tf.stack([batch_indices_BxN, sort_indices_BxN], axis=-1), - sort_mask_BxN, - tf.shape(logits_BxN), - ) - logits_BxN -= top_p_mask_BxN * logits_BxN.dtype.max - - if temperature > 0: - logits_shape = tf.shape(logits_BxN) - uniform_noise_BxN = tf.random_uniform(logits_shape) - logits_BxN += -tf.log(-tf.log(uniform_noise_BxN)) * temperature - return logits_BxN - - -def inplace_update_i(tensor_BxL, updates_B, i): - """Inplace update a tensor. B: batch_size, L: tensor length.""" - batch_size = tf.shape(tensor_BxL)[0] - batch_size = tf.cast(batch_size, tf.int64) - - indices_Bx2 = tf.stack( - [ - tf.range(batch_size, dtype=tf.int64), - tf.fill([batch_size], tf.cast(i, tf.int64)), - ], - axis=-1, - ) - return tf.tensor_scatter_nd_update(tensor_BxL, indices_Bx2, updates_B) - - -def left2right_decode( - symbols_to_logits_fn, - context_BxU_dict, - batch_size, - max_decode_len, - vocab_size, - beam_size=1, - beam_start=5, - beam_alpha=0.6, - beam_min=0, - beam_max=-1, - temperature=0.0, - top_k=0, - top_p=0.0, - eos_id=EOS_ID, -): - """left to right decode. - - Notations: - B: batch_size, V: vocab_size, T: decode_len, U: undefined dimensions - - Args: - symbols_to_logits_fn: logits = fn(decodes, context, i). Shoud take - [batch_size, decoded_ids] and return [batch_size, vocab_size]. - context_BxU_dict: dict of Tensors. - batch_size: int, decode batch size. - max_decode_len: int, maximum number of steps to decode. - vocab_size: int, output vocab size. - beam_size: Number of beams to decode. - beam_start: start length for scaling, default to 5. - beam_alpha: Length penalty for decoding. Should be between 0 (shorter) and 1 - (longer), default to 0.6. - beam_min: Minimum beam search lengths. - beam_max: Maximum beam search lengths. 
Set -1 to use unlimited. - temperature: Sampling temp for next token (0 for argmax), default to 0.0. - top_k: Number of top symbols to consider at each time step, default to 0 - (consider all symbols). - top_p: Nucleus sampling probability. - eos_id: end of token id, default to 1. - - Returns: - decodes: Tensor[batch, decode_len] - """ - batch_size = tf.cast(batch_size, tf.int64) - max_decode_len = tf.cast(max_decode_len, tf.int64) - dtype = tf.int64 - # When beam_size=1, beam_search does not behave exactly like greedy. - # This is due to using 2 * beam_size in grow_topk, and keep the top beam_size - # ones that haven't reached EOS into alive. - # In this case, alpha value for length penalty will take effect. - if beam_size == 1: - - def decode_loop(i, decodes_BxT, cache_BxU_dict): - logits_BxV = symbols_to_logits_fn(decodes_BxT, cache_BxU_dict, i) - logits_BxV = process_logits(logits_BxV, top_k, top_p, temperature) - decodes_BxT = inplace_update_i( - decodes_BxT, tf.argmax(logits_BxV, -1), i - ) - return i + 1, decodes_BxT, cache_BxU_dict - - def loop_cond(i, decodes_BxT, unused_cache_BxU_dict): - finished_B = tf.reduce_any(tf.equal(decodes_BxT, EOS_ID), axis=1) - return tf.logical_and( - i < max_decode_len, tf.logical_not(tf.reduce_all(finished_B)) - ) - - zeros_dims = tf.stack([batch_size, max_decode_len]) - y = tf.fill(zeros_dims, 0) - init_dec_BxT = tf.cast(y, dtype) - - # init_dec_BxT = tf.zeros([batch_size, max_decode_len], dtype = dtype) - _, decodes, _ = tf.while_loop( - loop_cond, - decode_loop, - [tf.constant(0, dtype=dtype), init_dec_BxT, context_BxU_dict], - ) - return decodes - - else: - - raise Exception('beam decoder not supported.') - - def symbols_to_logits_fn_with_sampling(decodes_BxT, states_BxU_dict, i): - logits_BxV = symbols_to_logits_fn(decodes_BxT, states_BxU_dict, i) - logits_BxV = process_logits(logits_BxV, top_k, top_p, temperature) - return logits_BxV, states_BxU_dict - - length_norm_fn = beam_search.length_normalization( - beam_start, 
beam_alpha, beam_min, beam_max, -1e3 - ) - beams, _ = beam_search.beam_search( - symbols_to_logits_fn_with_sampling, - tf.zeros([batch_size, max_decode_len], dtype=tf.int32), - context_BxU_dict, - vocab_size, - beam_size, - length_norm_fn, - eos_id, - ) - return tf.cast(beams[:, 0, :], dtype) diff --git a/malaya/train/model/pegasus/layers/embedding.py b/malaya/train/model/pegasus/layers/embedding.py deleted file mode 100644 index 9fdf961c..00000000 --- a/malaya/train/model/pegasus/layers/embedding.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2020 The PEGASUS Authors.. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Embedding layers. 
- -Notations: - B: batch_size, I: max_input_len, D: hidden_size, V: vocab_size -""" -# -# pylint: disable=invalid-name - -import tensorflow as tf - - -class Embedding(object): - """Embedding layer supporting shared input/output weights.""" - - def __init__(self, vocab_size, hidden_size, name, dtype): - self._vocab_size = vocab_size - self._hidden_size = hidden_size - self._name = name - self._dtype = dtype - - def __call__(self, tensor, is_input_layer): - if is_input_layer: - return self._ids_to_weights(tensor) - else: - return self._weights_to_logits(tensor) - - def _ids_to_weights(self, ids_BxI): - """Maps IDs to embedding weights.""" - weights_BxIxD = tf.nn.embedding_lookup(self.weights_VxD, ids_BxI) - weights_BxIxD *= self._hidden_size ** 0.5 - return weights_BxIxD - - def _weights_to_logits(self, states_BxIxD): - B = tf.shape(states_BxIxD)[0] - I = tf.shape(states_BxIxD)[1] - D = tf.shape(states_BxIxD)[2] - states_BIxD = tf.reshape(states_BxIxD, [-1, D]) - states_BIxV = tf.matmul( - states_BIxD, self.weights_VxD, transpose_b=True - ) - states_BxIxV = tf.reshape(states_BIxV, [B, I, self._vocab_size]) - return states_BxIxV - - @property - def weights_VxD(self): - """Gets embedding weights.""" - with tf.variable_scope('embeddings', reuse=tf.AUTO_REUSE): - # Initialization is important here, and a normal distribution with stdev - # equal to rsqrt hidden_size is significantly better than the default - # initialization used for other layers (fan in / out avg). 
- embeddings_VxD = tf.get_variable( - self._name, - [self._vocab_size, self._hidden_size], - initializer=tf.random_normal_initializer( - stddev=self._hidden_size ** -0.5, dtype=self._dtype - ), - dtype=self._dtype, - ) - return embeddings_VxD diff --git a/malaya/train/model/pegasus/layers/timing.py b/malaya/train/model/pegasus/layers/timing.py deleted file mode 100644 index 986a9448..00000000 --- a/malaya/train/model/pegasus/layers/timing.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2020 The PEGASUS Authors.. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Timing layers. - -Notations: -B: batch_size, I: input_length, D: hidden_size, N: num_timescales -""" -# -# pylint: disable=invalid-name - -import math - -import tensorflow as tf - -_MIN_TIMESCALE = 1.0 -_MAX_TIMESCALE = 1.0e4 - - -def add_time_signal(inputs_BxIxD, start_index=None): - """Adds a transformer-style timing signal to inputs. - - Using periodic signals as in https://arxiv.org/abs/1706.03762. - Generalized to allow each example in a batch to begin at a different index. - - Args: - inputs_BxIxD: input representation. - start_index: tensor of starting pos. [batch_size] - - Returns: - output: representation with time signal added, same shape as input. 
- """ - - dtype = inputs_BxIxD.dtype - B = tf.shape(inputs_BxIxD)[0] - I = tf.shape(inputs_BxIxD)[1] - D = tf.shape(inputs_BxIxD)[2] - start_Bx1 = ( - tf.zeros((B, 1), tf.int32) if start_index is None else start_index - ) - - pos_1xI = tf.expand_dims(tf.range(I), 0) - pos_BxI = tf.tile(pos_1xI, [B, 1]) + tf.cast(start_Bx1, tf.int32) - pos_BxI = tf.cast(pos_BxI, dtype) - N = D // 2 - log_time_incr = math.log(_MAX_TIMESCALE / _MIN_TIMESCALE) / tf.maximum( - tf.cast(N, dtype) - 1, 1 - ) - inv_scale_N = _MIN_TIMESCALE * tf.exp( - tf.cast(tf.range(N), dtype) * -log_time_incr - ) - time_BxIxN = tf.expand_dims(pos_BxI, 2) * tf.reshape( - inv_scale_N, [1, 1, -1] - ) - signal_BxIxD = tf.concat([tf.sin(time_BxIxN), tf.cos(time_BxIxN)], axis=2) - signal_BxIxD = tf.reshape(signal_BxIxD, [B, I, D]) - return inputs_BxIxD + signal_BxIxD diff --git a/malaya/train/model/pegasus/layers/transformer_block.py b/malaya/train/model/pegasus/layers/transformer_block.py deleted file mode 100644 index 52765186..00000000 --- a/malaya/train/model/pegasus/layers/transformer_block.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2020 The PEGASUS Authors.. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Transformer block. - -From "Attention Is All You Need", https://arxiv.org/abs/1706.03762. - -Notations: - B: batch_size, I: max_input_len, M: max_memory_len, D: hidden_size -""" -# -# pylint: disable=invalid-name -# pylint: disable=g-long-lambda - -from . 
import attention -import tensorflow as tf -from tensorflow.contrib import layers as contrib_layers - - -class TransformerBlock(object): - """Transformer block. - - Attention block of self-attention, attention over external memory, and - feedforward network. - Initialize the block with - block = TransformerBlock(hidden_size, filter_size, num_heads, dropout) - To create an encoder self attention layer, use - x = block(x, x_bias, None, None) - To create a decoder attention layer, use - y = block(y, upper_triangle_bias, x, x_bias) - """ - - def __init__(self, hidden_size, filter_size, num_heads, dropout): - self._self_attn_layer = attention.SelfAttention( - hidden_size, num_heads, dropout - ) - self._attn_layer = attention.Attention(hidden_size, num_heads, dropout) - self._relu_layer = tf.layers.Dense(filter_size, activation=tf.nn.relu) - self._output_layer = tf.layers.Dense(hidden_size) - self._dropout_fn = ( - lambda x, training: tf.compat.v2.nn.dropout( - x, - rate=dropout, - noise_shape=[tf.shape(x)[0], 1, tf.shape(x)[2]], - ) - if training - else x - ) - - def __call__( - self, - training, - inputs_BxIxD, - bias_BxIxI, - memory_BxMxD, - bias_BxIxM, - cache=None, - decode_i=None, - ): - s_BxIxD = inputs_BxIxD - with tf.variable_scope('attention/self'): - y_BxIxD = contrib_layers.layer_norm(s_BxIxD, begin_norm_axis=2) - y_BxIxD = self._self_attn_layer( - y_BxIxD, - bias_BxIxI, - training, - cache=cache, - decode_i=decode_i, - ) - s_BxIxD += self._dropout_fn(y_BxIxD, training) - if memory_BxMxD is not None: - with tf.variable_scope('memory_attention'): - y_BxIxD = contrib_layers.layer_norm( - s_BxIxD, begin_norm_axis=2 - ) - y_BxIxD = self._attn_layer( - y_BxIxD, memory_BxMxD, bias_BxIxM, training - ) - s_BxIxD += self._dropout_fn(y_BxIxD, training) - with tf.variable_scope('ffn'): - y_BxIxD = contrib_layers.layer_norm(s_BxIxD, begin_norm_axis=2) - y_BxIxD = self._dropout_fn(self._relu_layer(y_BxIxD), training) - s_BxIxD += 
self._dropout_fn(self._output_layer(y_BxIxD), training) - return s_BxIxD - - -def stack( - layers, - training, - inputs_BxIxD, - bias_BxIxI, - memory_BxMxD, - bias_BxIxM, - cache=None, - decode_i=None, -): - """Stack AttentionBlock layers.""" - if (memory_BxMxD is None) != (bias_BxIxM is None): - raise ValueError('memory and memory_bias need to be provided together.') - s_BxIxD = inputs_BxIxD - for i, layer in enumerate(layers): - with tf.variable_scope('layer_%d' % i): - s_BxIxD = layer( - training, - s_BxIxD, - bias_BxIxI, - memory_BxMxD, - bias_BxIxM, - cache=cache[str(i)] if cache is not None else None, - decode_i=decode_i, - ) - return s_BxIxD diff --git a/malaya/train/model/pegasus/transformer.py b/malaya/train/model/pegasus/transformer.py deleted file mode 100644 index 4032e85d..00000000 --- a/malaya/train/model/pegasus/transformer.py +++ /dev/null @@ -1,198 +0,0 @@ -# Copyright 2020 The PEGASUS Authors.. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Standard Transformer models. - -Models contain embedding, encoding, and loss functions, and expect text ids as -inputs. All models have same format as below: - model = TransformerModel(...) - loss, output = model(features, training) -Features and outputs are dictionary of tensors. Features usually inlucdes inputs -and targets ids. 
-""" -# -# pylint: disable=invalid-name -# pylint: disable=g-long-lambda - -from .layers import attention -from .layers import decoding -from .layers import embedding -from .layers import timing -from .layers import transformer_block -from . import base -import tensorflow as tf -from tensorflow.contrib import layers as contrib_layers - - -class TransformerEncoderDecoderModel(base.BaseModel): - """Transformer encoder+decoder. - - Notations: - B: batch_size, I: max_input_len, T: max_target/decode_len, D: hidden_size - V: vocab_size - """ - - def __init__( - self, - vocab_size, - hidden_size, - filter_size, - num_heads, - num_encoder_layers, - num_decoder_layers, - label_smoothing, - dropout, - ): - self._dtype = tf.float32 - self._embedding_layer = embedding.Embedding( - vocab_size, hidden_size, 'weights', self._dtype - ) - - def block_fn(): return transformer_block.TransformerBlock( - hidden_size, filter_size, num_heads, dropout - ) - self._encoder_layers = [block_fn() for _ in range(num_encoder_layers)] - self._decoder_layers = [block_fn() for _ in range(num_decoder_layers)] - self._dropout_fn = ( - lambda x, training: tf.compat.v2.nn.dropout( - x, - rate=dropout, - noise_shape=[tf.shape(x)[0], 1, tf.shape(x)[2]], - ) - if training - else x - ) - self._vocab_size = vocab_size - self._num_heads = num_heads - self._label_smoothing = label_smoothing - self._decoder_scope_name = 'decoder' - - def _encode(self, features, training): - inputs_BxI = features['inputs'] - inputs_bias_Bx1xI = attention.ids_to_bias(inputs_BxI, self._dtype) - states_BxIxD = self._embedding_layer(inputs_BxI, True) - states_BxIxD = self._dropout_fn( - timing.add_time_signal(states_BxIxD), training - ) - with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE): - states_BxIxD = transformer_block.stack( - self._encoder_layers, - training, - states_BxIxD, - inputs_bias_Bx1xI, - None, - None, - ) - states_BxIxD = contrib_layers.layer_norm( - states_BxIxD, begin_norm_axis=2 - ) - return {'memory': 
states_BxIxD, 'memory_bias': inputs_bias_Bx1xI} - - def __call__(self, features, training): - """Create model. - - Args: - features: dictionary of tensors including "inputs" [batch, input_len] and - "targets" [batch, output_len] - training: bool of whether the mode is training. - - Returns: - Tuple of (loss, outputs): Loss is a scalar. Output is a dictionary of - tensors, containing model's output logits. - """ - if 'inputs' not in features or 'targets' not in features: - raise ValueError('Require inputs and targets keys in features.') - - context = self._encode(features, training) - self._context = context - targets_BxT = features['targets'] - bias_1xTxT = attention.upper_triangle_bias( - tf.shape(targets_BxT)[1], self._dtype - ) - states_BxTxD = self._embedding_layer(targets_BxT, True) - states_BxTxD = tf.pad(states_BxTxD, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] - states_BxTxD = timing.add_time_signal(states_BxTxD) - states_BxTxD = self._dropout_fn(states_BxTxD, training) - with tf.variable_scope(self._decoder_scope_name, reuse=tf.AUTO_REUSE): - states_BxTxD = transformer_block.stack( - self._decoder_layers, - training, - states_BxTxD, - bias_1xTxT, - context['memory'], - context['memory_bias'], - ) - states_BxTxD = contrib_layers.layer_norm( - states_BxTxD, begin_norm_axis=2 - ) - logits_BxTxV = self._embedding_layer(states_BxTxD, False) - targets_mask_BxT = tf.cast(tf.greater(targets_BxT, 0), self._dtype) - loss = tf.losses.softmax_cross_entropy( - tf.one_hot(targets_BxT, self._vocab_size), - logits_BxTxV, - label_smoothing=self._label_smoothing, - weights=targets_mask_BxT, - ) - return loss, {'logits': logits_BxTxV} - - def predict(self, features, max_decode_len, beam_size, **beam_kwargs): - """Predict.""" - cache = self._encode(features, False) - B = tf.shape(cache['memory'])[0] - D = tf.shape(cache['memory'])[2] - T, V, H = max_decode_len, self._vocab_size, self._num_heads - - bias_1xTxT = attention.upper_triangle_bias(T, self._dtype) - for i in 
range(len(self._decoder_layers)): - cache[str(i)] = { - 'k': tf.zeros([B, H, T, D // H], self._dtype), - 'v': tf.zeros([B, H, T, D // H], self._dtype), - } - - def symbols_to_logits_fn(dec_BxT, context, i): - """Decode loop.""" - dec_Bx1 = tf.slice( - dec_BxT, - [0, tf.maximum(tf.cast(0, i.dtype), i - 1)], - [tf.shape(dec_BxT)[0], 1], - ) - - bias_1x1xT = tf.slice(bias_1xTxT, [0, i, 0], [1, 1, T]) - dec_Bx1xD = self._embedding_layer(dec_Bx1, True) - dec_Bx1xD *= tf.cast(tf.greater(i, 0), self._dtype) - dec_Bx1xD = timing.add_time_signal(dec_Bx1xD, start_index=i) - with tf.variable_scope( - self._decoder_scope_name, reuse=tf.AUTO_REUSE - ): - dec_Bx1xD = transformer_block.stack( - self._decoder_layers, - False, - dec_Bx1xD, - bias_1x1xT, - context['memory'], - context['memory_bias'], - context, - i, - ) - dec_Bx1xD = contrib_layers.layer_norm( - dec_Bx1xD, begin_norm_axis=2 - ) - logits_Bx1xV = self._embedding_layer(dec_Bx1xD, False) - logits_BxV = tf.squeeze(logits_Bx1xV, axis=1) - return logits_BxV - - decodes_BxT = decoding.left2right_decode( - symbols_to_logits_fn, cache, B, T, V, beam_size, **beam_kwargs - ) - return {'outputs': decodes_BxT} diff --git a/malaya/train/model/product_key_memory/layer.py b/malaya/train/model/product_key_memory/layer.py deleted file mode 100644 index fc4d49d7..00000000 --- a/malaya/train/model/product_key_memory/layer.py +++ /dev/null @@ -1,22 +0,0 @@ -import tensorflow as tf -from tensorflow.python.ops.init_ops import Initializer - - -class Scaling(Initializer): - def __init__(self, seed=None, dtype=tf.float32): - self.seed = seed - self.dtype = dtype - - def __call__(self, shape, dtype=None, partition_info=None): - stdv = 1.0 / (shape[0] * shape[1]) - w = tf.random.uniform( - shape, - minval=-stdv, - maxval=stdv, - dtype=self.dtype, - seed=self.seed, - ) - std = tf.math.reduce_std(w) - scale = (std / self.reference) ** 0.5 - w = w / scale - return w diff --git a/malaya/train/model/product_key_memory/model.py 
b/malaya/train/model/product_key_memory/model.py deleted file mode 100644 index 28613362..00000000 --- a/malaya/train/model/product_key_memory/model.py +++ /dev/null @@ -1,33 +0,0 @@ -import tensorflow as tf -import math - - -def init_(t, dim=None): - dim = dim if dim is not None else t.shape[-1] - std = 1.0 / math.sqrt(dim) - return nn.init.normal_(t, mean=0, std=std) - - -class Model(tf.keras.Model): - def __init__( - self, - dim, - heads=4, - num_keys=128, - topk=32, - dim_head=256, - input_dropout=0.0, - query_dropout=0.0, - value_dropout=0.0, - **kwargs - ): - super().__init__(self, **kwargs) - assert ( - dim % heads == 0 - ), 'dimension must be divisible by number of heads' - self.topk = topk - self.heads = heads - self.num_keys = num_keys - dim_query = dim_head * heads - self.to_queries = tf.keras.layers.Dense(dim_query, use_bias=False) - self.keys = tf.zeros(shape=(heads, num_keys, 2, dim_head // 2)) diff --git a/pretrained-model/fnet/README.md b/pretrained-model/fnet/README.md index b8e6e0a9..d24fac36 100644 --- a/pretrained-model/fnet/README.md +++ b/pretrained-model/fnet/README.md @@ -64,10 +64,10 @@ python3 run_pretraining_large_tpu.py \ ## Downloads -1. **BASE**, last update 2nd June 2021, [fnet-base.tar.gz](https://f000.backblazeb2.com/file/malaya-model/pretrained/fnet-base.tar.gz) +1. **BASE**, last update 28th June 2021, [fnet-base.tar.gz](https://f000.backblazeb2.com/file/malaya-model/pretrained/fnet-base.tar.gz) - Vocab size 32k. - - 500k steps, V3-8 TPU. + - 475k steps, 1 Tesla V100 32GB VRAM. 
## Citation diff --git a/pretrained-model/fnet/sentiment-base.ipynb b/pretrained-model/fnet/sentiment-base.ipynb new file mode 100644 index 00000000..551e0c69 --- /dev/null +++ b/pretrained-model/fnet/sentiment-base.ipynb @@ -0,0 +1,494 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ['CUDA_VISIBLE_DEVICES'] = ''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import model as modeling\n", + "import tensorflow as tf\n", + "import tokenization\n", + "import optimization\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = tokenization.FullTokenizer(\n", + " vocab_file = 'BERT.wordpiece', do_lower_case = False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from rules import normalized_chars\n", + "import random\n", + "\n", + "laughing = {\n", + " 'huhu',\n", + " 'haha',\n", + " 'gagaga',\n", + " 'hihi',\n", + " 'wkawka',\n", + " 'wkwk',\n", + " 'kiki',\n", + " 'keke',\n", + " 'huehue',\n", + " 'hshs',\n", + " 'hoho',\n", + " 'hewhew',\n", + " 'uwu',\n", + " 'sksk',\n", + " 'ksks',\n", + " 'gituu',\n", + " 'gitu',\n", + " 'mmeeooww',\n", + " 'meow',\n", + " 'alhamdulillah',\n", + " 'muah',\n", + " 'mmuahh',\n", + " 'hehe',\n", + " 'salamramadhan',\n", + " 'happywomensday',\n", + " 'jahagaha',\n", + " 'ahakss',\n", + " 'ahksk'\n", + "}\n", + "\n", + "def make_cleaning(s, c_dict):\n", + " s = s.translate(c_dict)\n", + " return s\n", + "\n", + "def cleaning(string):\n", + " \"\"\"\n", + " use by any transformer model before tokenization\n", + " \"\"\"\n", + " string = unidecode(string)\n", + " \n", + " string = ' '.join(\n", + " [make_cleaning(w, normalized_chars) for w in string.split()]\n", + " )\n", + " string = 
re.sub('\\(dot\\)', '.', string)\n", + " string = (\n", + " re.sub(re.findall(r'\\', string)[0], '', string)\n", + " if (len(re.findall(r'\\', string)) > 0)\n", + " and ('href' in re.findall(r'\\', string)[0])\n", + " else string\n", + " )\n", + " string = re.sub(\n", + " r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n", + " )\n", + " \n", + " chars = '.,/'\n", + " for c in chars:\n", + " string = string.replace(c, f' {c} ')\n", + " \n", + " string = re.sub(r'[ ]+', ' ', string).strip().split()\n", + " string = [w for w in string if w[0] != '@']\n", + " x = []\n", + " for word in string:\n", + " word = word.lower()\n", + " if any([laugh in word for laugh in laughing]):\n", + " if random.random() >= 0.5:\n", + " x.append(word)\n", + " else:\n", + " x.append(word)\n", + " string = [w.title() if w[0].isupper() else w for w in x]\n", + " return ' '.join(string)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !wget https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/news-sentiment/sentiment-data-v2.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from unidecode import unidecode\n", + "import re\n", + "\n", + "df = pd.read_csv('sentiment-data-v2.csv')\n", + "Y = LabelEncoder().fit_transform(df.label)\n", + "\n", + "texts = df.iloc[:,1].tolist()\n", + "labels = Y.tolist()\n", + "\n", + "assert len(labels) == len(texts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import json\n", + "\n", + "# with open('/home/husein/sentiment/strong-positives.json') as fopen:\n", + "# positives = json.load(fopen)\n", + "# positives = random.sample(positives, 500000)\n", + " \n", + "# len(positives)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# with open('/home/husein/sentiment/strong-negatives.json') as fopen:\n", + "# negatives = json.load(fopen)\n", + "# negatives = random.sample(negatives, 500000)\n", + " \n", + "# len(negatives)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# texts += negatives\n", + "# labels += [0] * len(negatives)\n", + "# texts += positives\n", + "# labels += [1] * len(positives)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "\n", + "for i in tqdm(range(len(texts))):\n", + " texts[i] = cleaning(texts[i])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "actual_t, actual_l = [], []\n", + "\n", + "for i in tqdm(range(len(texts))):\n", + " if len(texts[i]) > 2:\n", + " actual_t.append(texts[i])\n", + " actual_l.append(labels[i])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "\n", + "input_ids, input_masks = [], []\n", + "\n", + "for text in tqdm(actual_t):\n", + " tokens_a = tokenizer.tokenize(text)\n", + " tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n", + " input_id = tokenizer.convert_tokens_to_ids(tokens)\n", + " input_mask = [1] * len(input_id)\n", + " \n", + " input_ids.append(input_id)\n", + " input_masks.append(input_mask)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epoch = 2\n", + "batch_size = 60\n", + "warmup_proportion = 0.1\n", + "num_train_steps = int(len(texts) / batch_size * epoch)\n", + "num_warmup_steps = int(num_train_steps * warmup_proportion)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def 
create_initializer(initializer_range=0.02):\n", + " return tf.truncated_normal_initializer(stddev=initializer_range)\n", + "\n", + "class Model:\n", + " def __init__(\n", + " self,\n", + " dimension_output,\n", + " learning_rate = 2e-5,\n", + " training = True,\n", + " ):\n", + " self.X = tf.placeholder(tf.int32, [None, None])\n", + " self.MASK = tf.placeholder(tf.int32, [None, None])\n", + " self.Y = tf.placeholder(tf.int32, [None])\n", + " \n", + " model = modeling.Model(\n", + " dim = 512, vocab_size = 32000, depth = 12, mlp_dim = 3072\n", + " )\n", + " sequence_output = model(\n", + " self.X, input_mask = self.MASK, training = training\n", + " )\n", + " \n", + " output_layer = sequence_output\n", + " output_layer = tf.layers.dense(\n", + " output_layer,\n", + " model.hidden_size,\n", + " activation=tf.tanh,\n", + " kernel_initializer=create_initializer())\n", + " self.logits_seq = tf.layers.dense(output_layer, dimension_output,\n", + " kernel_initializer=create_initializer())\n", + " self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')\n", + " self.logits = self.logits_seq[:, 0]\n", + " self.logits = tf.identity(self.logits, name = 'logits')\n", + " \n", + " self.cost = tf.reduce_mean(\n", + " tf.nn.sparse_softmax_cross_entropy_with_logits(\n", + " logits = self.logits, labels = self.Y\n", + " )\n", + " )\n", + " \n", + " self.optimizer = optimization.create_optimizer(self.cost, learning_rate, \n", + " num_train_steps, num_warmup_steps, False)\n", + " correct_pred = tf.equal(\n", + " tf.argmax(self.logits, 1, output_type = tf.int32), self.Y\n", + " )\n", + " self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "INIT_CHKPNT = 'fnet-base/model.ckpt-475000'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dimension_output = 2\n", + "learning_rate = 2e-5\n", 
+ "\n", + "tf.reset_default_graph()\n", + "sess = tf.InteractiveSession()\n", + "model = Model(\n", + " dimension_output,\n", + " learning_rate\n", + ")\n", + "\n", + "sess.run(tf.global_variables_initializer())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import collections\n", + "import re\n", + "\n", + "def get_assignment_map_from_checkpoint(tvars, init_checkpoint):\n", + " \"\"\"Compute the union of the current variables and checkpoint variables.\"\"\"\n", + " assignment_map = {}\n", + " initialized_variable_names = {}\n", + "\n", + " name_to_variable = collections.OrderedDict()\n", + " for var in tvars:\n", + " name = var.name\n", + " m = re.match('^(.*):\\\\d+$', name)\n", + " if m is not None:\n", + " name = m.group(1)\n", + " name_to_variable[name] = var\n", + "\n", + " init_vars = tf.train.list_variables(init_checkpoint)\n", + "\n", + " assignment_map = collections.OrderedDict()\n", + " for x in init_vars:\n", + " (name, var) = (x[0], x[1])\n", + " if name not in name_to_variable:\n", + " continue\n", + " assignment_map[name] = name_to_variable[name]\n", + " initialized_variable_names[name] = 1\n", + " initialized_variable_names[name + ':0'] = 1\n", + "\n", + " return (assignment_map, initialized_variable_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tvars = tf.trainable_variables()\n", + "assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, \n", + " INIT_CHKPNT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "saver = tf.train.Saver(var_list = assignment_map)\n", + "saver.restore(sess, INIT_CHKPNT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "train_input_ids, test_input_ids, 
train_Y, test_Y, train_mask, test_mask = train_test_split(\n", + " input_ids, actual_l[:len(input_ids)], input_masks, test_size = 0.2\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pad_sequences = tf.keras.preprocessing.sequence.pad_sequences" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "import time\n", + "\n", + "for EPOCH in range(10):\n", + "\n", + " train_acc, train_loss, test_acc, test_loss = [], [], [], []\n", + " pbar = tqdm(\n", + " range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'\n", + " )\n", + " for i in pbar:\n", + " index = min(i + batch_size, len(train_input_ids))\n", + " batch_x = train_input_ids[i: index]\n", + " batch_x = pad_sequences(batch_x, padding='post')\n", + " batch_mask = train_mask[i: index]\n", + " batch_mask = pad_sequences(batch_mask, padding='post')\n", + " batch_y = train_Y[i: index]\n", + " acc, cost, _ = sess.run(\n", + " [model.accuracy, model.cost, model.optimizer],\n", + " feed_dict = {\n", + " model.Y: batch_y,\n", + " model.X: batch_x,\n", + " model.MASK: batch_mask\n", + " },\n", + " )\n", + " train_loss.append(cost)\n", + " train_acc.append(acc)\n", + " pbar.set_postfix(cost = cost, accuracy = acc)\n", + " \n", + " pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')\n", + " for i in pbar:\n", + " index = min(i + batch_size, len(test_input_ids))\n", + " batch_x = test_input_ids[i: index]\n", + " batch_x = pad_sequences(batch_x, padding='post')\n", + " batch_mask = test_mask[i: index]\n", + " batch_mask = pad_sequences(batch_mask, padding='post')\n", + " batch_y = test_Y[i: index]\n", + " acc, cost = sess.run(\n", + " [model.accuracy, model.cost],\n", + " feed_dict = {\n", + " model.Y: batch_y,\n", + " model.X: batch_x,\n", + " model.MASK: batch_mask\n", + " },\n", + " )\n", + " 
test_loss.append(cost)\n", + " test_acc.append(acc)\n", + " pbar.set_postfix(cost = cost, accuracy = acc)\n", + " \n", + " train_loss = np.mean(train_loss)\n", + " train_acc = np.mean(train_acc)\n", + " test_loss = np.mean(test_loss)\n", + " test_acc = np.mean(test_acc)\n", + " \n", + " print(\n", + " 'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\\n'\n", + " % (EPOCH, train_loss, train_acc, test_loss, test_acc)\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pretrained-model/fnet/test-fnet.ipynb b/pretrained-model/fnet/test-fnet.ipynb new file mode 100644 index 00000000..a370dc4e --- /dev/null +++ b/pretrained-model/fnet/test-fnet.ipynb @@ -0,0 +1,561 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import tensorflow as tf\n", + "import malaya\n", + "tf.compat.v1.set_random_seed(1234)\n", + "import numpy as np\n", + "import logging\n", + "logging.basicConfig(level = logging.INFO)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip3 install einops" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import numpy as np\n", + "\n", + "\n", + "def get_shape_list(tensor, expected_rank = None, name = None):\n", + " \"\"\"Returns a list of the shape of tensor, preferring static dimensions.\n", + " Args:\n", + " tensor: A tf.Tensor object to find the shape of.\n", + " 
expected_rank: (optional) int. The expected rank of `tensor`. If this is\n", + " specified and the `tensor` has a different rank, and exception will be\n", + " thrown.\n", + " name: Optional name of the tensor for the error message.\n", + " Returns:\n", + " A list of dimensions of the shape of tensor. All static dimensions will\n", + " be returned as python integers, and dynamic dimensions will be returned\n", + " as tf.Tensor scalars.\n", + " \"\"\"\n", + " if name is None:\n", + " name = tensor.name\n", + "\n", + " shape = tensor.shape.as_list()\n", + "\n", + " non_static_indexes = []\n", + " for (index, dim) in enumerate(shape):\n", + " if dim is None:\n", + " non_static_indexes.append(index)\n", + "\n", + " if not non_static_indexes:\n", + " return shape\n", + "\n", + " dyn_shape = tf.shape(tensor)\n", + " for index in non_static_indexes:\n", + " shape[index] = dyn_shape[index]\n", + " return shape\n", + "\n", + "\n", + "def create_initializer(initializer_range = 0.02):\n", + " \"\"\"Creates a `truncated_normal_initializer` with the given range.\"\"\"\n", + " return tf.truncated_normal_initializer(stddev = initializer_range)\n", + "\n", + "\n", + "def layer_norm(input_tensor, name = None):\n", + " return tf.contrib.layers.layer_norm(\n", + " inputs = input_tensor,\n", + " begin_norm_axis = -1,\n", + " begin_params_axis = -1,\n", + " scope = name,\n", + " )\n", + "\n", + "\n", + "def embedding_lookup(\n", + " input_ids,\n", + " vocab_size,\n", + " embedding_size = 128,\n", + " initializer_range = 0.02,\n", + " word_embedding_name = 'word_embeddings',\n", + " use_one_hot_embeddings = False,\n", + "):\n", + " \"\"\"Looks up words embeddings for id tensor.\n", + " Args:\n", + " input_ids: int32 Tensor of shape [batch_size, seq_length] containing word\n", + " ids.\n", + " vocab_size: int. Size of the embedding vocabulary.\n", + " embedding_size: int. Width of the word embeddings.\n", + " initializer_range: float. 
Embedding initialization range.\n", + " word_embedding_name: string. Name of the embedding table.\n", + " use_one_hot_embeddings: bool. If True, use one-hot method for word\n", + " embeddings. If False, use `tf.gather()`.\n", + " Returns:\n", + " float Tensor of shape [batch_size, seq_length, embedding_size].\n", + " \"\"\"\n", + " # This function assumes that the input is of shape [batch_size, seq_length,\n", + " # num_inputs].\n", + " #\n", + " # If the input is a 2D tensor of shape [batch_size, seq_length], we\n", + " # reshape to [batch_size, seq_length, 1].\n", + " if input_ids.shape.ndims == 2:\n", + " input_ids = tf.expand_dims(input_ids, axis = [-1])\n", + "\n", + " embedding_table = tf.get_variable(\n", + " name = word_embedding_name,\n", + " shape = [vocab_size, embedding_size],\n", + " initializer = create_initializer(initializer_range),\n", + " )\n", + "\n", + " flat_input_ids = tf.reshape(input_ids, [-1])\n", + " if use_one_hot_embeddings:\n", + " one_hot_input_ids = tf.one_hot(flat_input_ids, depth = vocab_size)\n", + " output = tf.matmul(one_hot_input_ids, embedding_table)\n", + " else:\n", + " output = tf.gather(embedding_table, flat_input_ids)\n", + "\n", + " input_shape = get_shape_list(input_ids)\n", + "\n", + " output = tf.reshape(\n", + " output, input_shape[0:-1] + [input_shape[-1] * embedding_size]\n", + " )\n", + " return (output, embedding_table)\n", + "\n", + "\n", + "def embedding_postprocessor(\n", + " input_tensor,\n", + " use_token_type = False,\n", + " token_type_ids = None,\n", + " token_type_vocab_size = 2,\n", + " token_type_embedding_name = 'token_type_embeddings',\n", + " use_position_embeddings = True,\n", + " position_embedding_name = 'position_embeddings',\n", + " initializer_range = 0.02,\n", + " max_position_embeddings = 512,\n", + "):\n", + " \"\"\"Performs various post-processing on a word embedding tensor.\n", + " Args:\n", + " input_tensor: float Tensor of shape [batch_size, seq_length,\n", + " embedding_size].\n", + " 
use_token_type: bool. Whether to add embeddings for `token_type_ids`.\n", + " token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].\n", + " Must be specified if `use_token_type` is True.\n", + " token_type_vocab_size: int. The vocabulary size of `token_type_ids`.\n", + " token_type_embedding_name: string. The name of the embedding table variable\n", + " for token type ids.\n", + " use_position_embeddings: bool. Whether to add position embeddings for the\n", + " position of each token in the sequence.\n", + " position_embedding_name: string. The name of the embedding table variable\n", + " for positional embeddings.\n", + " initializer_range: float. Range of the weight initialization.\n", + " max_position_embeddings: int. Maximum sequence length that might ever be\n", + " used with this model. This can be longer than the sequence length of\n", + " input_tensor, but cannot be shorter.\n", + " dropout_prob: float. Dropout probability applied to the final output tensor.\n", + " Returns:\n", + " float tensor with same shape as `input_tensor`.\n", + " Raises:\n", + " ValueError: One of the tensor shapes or input values is invalid.\n", + " \"\"\"\n", + " input_shape = get_shape_list(input_tensor, expected_rank = 3)\n", + " batch_size = input_shape[0]\n", + " seq_length = input_shape[1]\n", + " width = input_shape[2]\n", + "\n", + " output = input_tensor\n", + "\n", + " if use_token_type:\n", + " if token_type_ids is None:\n", + " raise ValueError(\n", + " '`token_type_ids` must be specified if'\n", + " '`use_token_type` is True.'\n", + " )\n", + " token_type_table = tf.get_variable(\n", + " name = token_type_embedding_name,\n", + " shape = [token_type_vocab_size, width],\n", + " initializer = create_initializer(initializer_range),\n", + " )\n", + " flat_token_type_ids = tf.reshape(token_type_ids, [-1])\n", + " one_hot_ids = tf.one_hot(\n", + " flat_token_type_ids, depth = token_type_vocab_size\n", + " )\n", + " token_type_embeddings = 
tf.matmul(one_hot_ids, token_type_table)\n", + " token_type_embeddings = tf.reshape(\n", + " token_type_embeddings, [batch_size, seq_length, width]\n", + " )\n", + " output += token_type_embeddings\n", + "\n", + " if use_position_embeddings:\n", + " assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)\n", + " with tf.control_dependencies([assert_op]):\n", + " full_position_embeddings = tf.get_variable(\n", + " name = position_embedding_name,\n", + " shape = [max_position_embeddings, width],\n", + " initializer = create_initializer(initializer_range),\n", + " )\n", + " position_embeddings = tf.slice(\n", + " full_position_embeddings, [0, 0], [seq_length, -1]\n", + " )\n", + " num_dims = len(output.shape.as_list())\n", + " position_broadcast_shape = []\n", + " for _ in range(num_dims - 2):\n", + " position_broadcast_shape.append(1)\n", + " position_broadcast_shape.extend([seq_length, width])\n", + " position_embeddings = tf.reshape(\n", + " position_embeddings, position_broadcast_shape\n", + " )\n", + " output += position_embeddings\n", + "\n", + " return output\n", + "\n", + "\n", + "def gelu(x):\n", + " cdf = 0.5 * (\n", + " 1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044_715 * tf.pow(x, 3))))\n", + " )\n", + " return x * cdf\n", + "\n", + "\n", + "class Forward(tf.keras.layers.Layer):\n", + " def __init__(self, dim, mlp_dim, dropout, **kwargs):\n", + " super(Forward, self).__init__(**kwargs)\n", + " self.rate = dropout\n", + " self.dense1 = tf.keras.layers.Dense(mlp_dim, activation = gelu)\n", + " self.dense2 = tf.keras.layers.Dense(dim)\n", + " self.dropout = tf.keras.layers.Dropout(self.rate)\n", + "\n", + " def call(self, inputs, training = True):\n", + " X = self.dense1(inputs)\n", + " X = self.dropout(X, training = training)\n", + " X = self.dense2(X)\n", + " X = self.dropout(X, training = training)\n", + " return X\n", + "\n", + "\n", + "class FNetBlock(tf.keras.layers.Layer):\n", + " def __init__(self, dim, mlp_dim, dropout = 0.1, 
**kwargs):\n", + " super(FNetBlock, self).__init__(name = 'FNetBlock', **kwargs)\n", + " self.norm_fourier = tf.keras.layers.LayerNormalization()\n", + " self.norm_ffn = tf.keras.layers.LayerNormalization()\n", + " self.ffn = Forward(dim, mlp_dim, dropout = dropout)\n", + "\n", + " def call(self, inputs, training = True):\n", + " X_complex = tf.cast(inputs, tf.complex64)\n", + " X_fft = tf.math.real(tf.signal.fft2d(X_complex))\n", + " X_norm1 = self.norm_fourier(X_fft + inputs, training = training)\n", + " X_dense = self.ffn(X_norm1, training = training)\n", + " X_norm2 = self.norm_ffn(X_dense + X_norm1, training = training)\n", + " return X_norm2\n", + "\n", + "\n", + "class Model(tf.keras.Model):\n", + " def __init__(\n", + " self,\n", + " dim,\n", + " vocab_size,\n", + " depth,\n", + " mlp_dim,\n", + " dropout = 0.1,\n", + " dropout_embedding = 0.1,\n", + " max_position_embeddings = 1024,\n", + " **kwargs,\n", + " ):\n", + " super(Model, self).__init__(name = 'Model', **kwargs)\n", + " self.dim = dim\n", + " self.hidden_size = dim\n", + " self.vocab_size = vocab_size\n", + " self.dropout_embedding = dropout_embedding\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.attn = []\n", + " for _ in range(depth):\n", + " self.attn.append(\n", + " FNetBlock(dim = dim, mlp_dim = mlp_dim, dropout = dropout)\n", + " )\n", + " self.layernorm_dropout = tf.keras.Sequential()\n", + " self.layernorm_dropout.add(tf.keras.layers.LayerNormalization())\n", + " self.layernorm_dropout.add(tf.keras.layers.Dropout(dropout_embedding))\n", + "\n", + " def call(\n", + " self, x, input_mask = None, token_type_ids = None, training = True\n", + " ):\n", + "\n", + " if input_mask is None:\n", + " input_mask = tf.ones(\n", + " shape = [tf.shape(x)[0], tf.shape(x)[1]], dtype = tf.int32\n", + " )\n", + "\n", + " input_mask = tf.expand_dims(tf.cast(input_mask, tf.float32), -1)\n", + "\n", + " if token_type_ids is None:\n", + " token_type_ids = tf.zeros(\n", + " shape = 
[tf.shape(x)[0], tf.shape(x)[1]], dtype = tf.int32\n", + " )\n", + " (self.embedding_output, self.embedding_table) = embedding_lookup(\n", + " input_ids = x,\n", + " vocab_size = self.vocab_size,\n", + " embedding_size = self.dim,\n", + " initializer_range = 0.02,\n", + " word_embedding_name = 'word_embeddings',\n", + " use_one_hot_embeddings = False,\n", + " )\n", + " self.embedding_output = embedding_postprocessor(\n", + " input_tensor = self.embedding_output,\n", + " use_token_type = True,\n", + " token_type_ids = token_type_ids,\n", + " token_type_vocab_size = 2,\n", + " token_type_embedding_name = 'token_type_embeddings',\n", + " use_position_embeddings = True,\n", + " position_embedding_name = 'position_embeddings',\n", + " initializer_range = 0.02,\n", + " max_position_embeddings = self.max_position_embeddings,\n", + " )\n", + " x = self.layernorm_dropout(self.embedding_output, training = training)\n", + " for no, attn in enumerate(self.attn):\n", + " x = attn(x, training = training)\n", + " x = x * input_mask\n", + "\n", + " with tf.variable_scope('pooler'):\n", + " first_token_tensor = tf.squeeze(x[:, 0:1, :], axis = 1)\n", + " self.pooled_output = tf.layers.dense(\n", + " first_token_tensor,\n", + " self.hidden_size,\n", + " activation = tf.tanh,\n", + " kernel_initializer = create_initializer(0.02),\n", + " )\n", + " return x\n", + "\n", + "\n", + "# x = tf.placeholder(tf.int32, (None, None))\n", + "# model = Model(768, 32000, 12, 768)\n", + "# o = model(x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "x = tf.placeholder(tf.int32, (None, None))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "model = Model(768, 32000, 12, 768)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From 
/Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead.\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/tensorflow_core/python/autograph/converters/directives.py:119: The name tf.assert_less_equal is deprecated. 
Please use tf.compat.v1.assert_less_equal instead.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "If using Keras pass *_constraint arguments to layers.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "If using Keras pass *_constraint arguments to layers.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From :295: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use keras.layers.Dense instead.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From :295: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use keras.layers.Dense instead.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Please use `layer.__call__` method 
instead.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /Users/huseinzolkepli/Documents/tf-1.15/env/lib/python3.7/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Please use `layer.__call__` method instead.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "o = model(x)\n", + "o" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "sess = tf.Session()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "sess.run(tf.global_variables_initializer())" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 433 ms, sys: 23.6 ms, total: 456 ms\n", + "Wall time: 492 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[[-0.14676172, 0.12887064, -0.6372858 , ..., 0.37827602,\n", + " 0.5740778 , -0.84948176],\n", + " [ 0.4179115 , -0.3492962 , 0.99186915, ..., -0.82482225,\n", + " 0.7381474 , -0.86820143],\n", + " [ 0.7566453 , -0.1262211 , 0.04796262, ..., 1.421154 ,\n", + " 0.7783395 , -0.01940406],\n", + " [ 0.12364741, 0.39592683, 2.253871 , ..., -0.11123513,\n", + " 0.922476 , -0.94825745]]], dtype=float32)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "sess.run(o, feed_dict = {x: [[1,2,3,4]]})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pretrained-model/performer/fast_attention.py b/pretrained-model/performer/fast_attention.py new file mode 100644 index 00000000..1be24cb6 --- /dev/null +++ b/pretrained-model/performer/fast_attention.py @@ -0,0 +1,529 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implementation of multiheaded FAVOR-attention & FAVOR-self-attention layers. + +Prefix Sum Tensorflow implementation by Valerii Likhosherstov. +""" +import math +import numpy as np +import tensorflow as tf +import util + +BIG_CONSTANT = 1e8 + + +def create_projection_matrix(m, d, seed=0, scaling=0, struct_mode=False): + r"""Constructs the matrix of random projections. + + Constructs a matrix of random orthogonal projections. Each projection vector + has direction chosen uniformly at random and either deterministic length + \sqrt{d} or length taken from the \chi(d) distribution (in the latter case + marginal distributions of the projections are d-dimensional Gaussian vectors + with associated identity covariance matrix). + + Args: + m: number of random projections. + d: dimensionality of each random projection. + seed: random seed used to construct projections. 
+ scaling: 1 if all the random projections need to be renormalized to have + length \sqrt{d}, 0 if the lengths of random projections should follow + \chi(d) distribution. + struct_mode: if True then products of Givens rotations will be used to + construct random orthogonal matrix. This bypasses Gram-Schmidt + orthogonalization. + + Returns: + The matrix of random projections of the shape [m, d]. + """ + nb_full_blocks = int(m / d) + block_list = [] + current_seed = seed + for _ in range(nb_full_blocks): + if struct_mode: + q = create_products_of_givens_rotations(d, seed) + else: + unstructured_block = tf.random.normal((d, d), seed=current_seed) + q, _ = tf.linalg.qr(unstructured_block) + q = tf.transpose(q) + block_list.append(q) + current_seed += 1 + remaining_rows = m - nb_full_blocks * d + if remaining_rows > 0: + if struct_mode: + q = create_products_of_givens_rotations(d, seed) + else: + unstructured_block = tf.random.normal((d, d), seed=current_seed) + q, _ = tf.linalg.qr(unstructured_block) + q = tf.transpose(q) + block_list.append(q[0:remaining_rows]) + final_matrix = tf.experimental.numpy.vstack(block_list) + current_seed += 1 + + if scaling == 0: + multiplier = tf.norm(tf.random.normal((m, d), seed=current_seed), axis=1) + elif scaling == 1: + multiplier = tf.math.sqrt(float(d)) * tf.ones((m)) + else: + raise ValueError("Scaling must be one of {0, 1}. Was %s" % scaling) + + return tf.linalg.matmul(tf.linalg.diag(multiplier), final_matrix) + + +def create_products_of_givens_rotations(dim, seed): + r"""Constructs a 2D-tensor which is a product of Givens random rotations. + + Constructs a 2D-tensor of the form G_1 * ... * G_k, where G_i is a Givens + random rotation. The resulting tensor mimics a matrix taken uniformly at + random form the orthogonal group. + + Args: + dim: number of rows/columns of the resulting 2D-tensor. + seed: random seed. + + Returns: + The product of Givens random rotations. 
+ """ + nb_givens_rotations = dim * int(math.ceil(math.log(float(dim)))) + q = np.eye(dim, dim) + np.random.seed(seed) + for _ in range(nb_givens_rotations): + random_angle = math.pi * np.random.uniform() + random_indices = np.random.choice(dim, 2) + index_i = min(random_indices[0], random_indices[1]) + index_j = max(random_indices[0], random_indices[1]) + slice_i = q[index_i] + slice_j = q[index_j] + new_slice_i = math.cos(random_angle) * slice_i + math.sin( + random_angle) * slice_j + new_slice_j = -math.sin(random_angle) * slice_i + math.cos( + random_angle) * slice_j + q[index_i] = new_slice_i + q[index_j] = new_slice_j + return tf.cast(tf.constant(q), dtype=tf.float32) + + +def relu_kernel_transformation(data, + is_query, + projection_matrix=None, + numerical_stabilizer=0.001): + """Computes features for the ReLU-kernel. + + Computes random features for the ReLU kernel from + https://arxiv.org/pdf/2009.14794.pdf. + + Args: + data: input data tensor of the shape [B, L, H, D], where: B - batch + dimension, L - attention dimensions, H - heads, D - features. + is_query: indicates whether input data is a query oor key tensor. + projection_matrix: random Gaussian matrix of shape [M, D], where M stands + for the number of random features and each D x D sub-block has pairwise + orthogonal rows. + numerical_stabilizer: small positive constant for numerical stability. + + Returns: + Corresponding kernel feature map. + """ + del is_query + if projection_matrix is None: + return tf.nn.relu(data) + numerical_stabilizer + else: + ratio = 1.0 / tf.math.sqrt( + tf.dtypes.cast(projection_matrix.shape[0], tf.float32)) + data_dash = ratio * tf.einsum("blhd,md->blhm", data, projection_matrix) + return tf.nn.relu(data_dash) + numerical_stabilizer + + +def softmax_kernel_transformation(data, + is_query, + projection_matrix=None, + numerical_stabilizer=0.000001): + """Computes random features for the softmax kernel using FAVOR+ mechanism. 
+ + Computes random features for the softmax kernel using FAVOR+ mechanism from + https://arxiv.org/pdf/2009.14794.pdf. + + Args: + data: input data tensor of the shape [B, L, H, D], where: B - batch + dimension, L - attention dimensions, H - heads, D - features. + is_query: indicates whether input data is a query oor key tensor. + projection_matrix: random Gaussian matrix of shape [M, D], where M stands + for the number of random features and each D x D sub-block has pairwise + orthogonal rows. + numerical_stabilizer: small positive constant for numerical stability. + + Returns: + Corresponding kernel feature map. + """ + data_normalizer = 1.0 / ( + tf.math.sqrt(tf.math.sqrt(tf.dtypes.cast(data.shape[-1], tf.float32)))) + data = data_normalizer * data + ratio = 1.0 / tf.math.sqrt( + tf.dtypes.cast(projection_matrix.shape[0], tf.float32)) + data_dash = tf.einsum("blhd,md->blhm", data, projection_matrix) + diag_data = tf.math.square(data) + diag_data = tf.math.reduce_sum( + diag_data, axis=tf.keras.backend.ndim(data) - 1) + diag_data = diag_data / 2.0 + diag_data = tf.expand_dims(diag_data, axis=tf.keras.backend.ndim(data) - 1) + last_dims_t = (len(data_dash.shape) - 1,) + attention_dims_t = (len(data_dash.shape) - 3,) + if is_query: + data_dash = ratio * ( + tf.math.exp(data_dash - diag_data - tf.math.reduce_max( + data_dash, axis=last_dims_t, keepdims=True)) + numerical_stabilizer) + else: + data_dash = ratio * ( + tf.math.exp(data_dash - diag_data - tf.math.reduce_max( + data_dash, axis=last_dims_t + attention_dims_t, keepdims=True)) + + numerical_stabilizer) + + return data_dash + + +def noncausal_numerator(qs, ks, vs): + """Computes not-normalized FAVOR noncausal attention AV. + + Args: + qs: query_prime tensor of the shape [L,B,H,M]. + ks: key_prime tensor of the shape [L,B,H,M]. + vs: value tensor of the shape [L,B,H,D]. + + Returns: + Not-normalized FAVOR noncausal attention AV. 
+ """ + kvs = tf.einsum("lbhm,lbhd->bhmd", ks, vs) + return tf.einsum("lbhm,bhmd->lbhd", qs, kvs) + + +def noncausal_denominator(qs, ks): + """Computes FAVOR normalizer in noncausal attention. + + Args: + qs: query_prime tensor of the shape [L,B,H,M]. + ks: key_prime tensor of the shape [L,B,H,M]. + + Returns: + FAVOR normalizer in noncausal attention. + """ + all_ones = tf.ones([ks.shape[0]]) + ks_sum = tf.einsum("lbhm,l->bhm", ks, all_ones) + return tf.einsum("lbhm,bhm->lbh", qs, ks_sum) + + +@tf.custom_gradient +def causal_numerator(qs, ks, vs): + """Computes not-normalized FAVOR causal attention A_{masked}V. + + Args: + qs: query_prime tensor of the shape [L,B,H,M]. + ks: key_prime tensor of the shape [L,B,H,M]. + vs: value tensor of the shape [L,B,H,D]. + + Returns: + Not-normalized FAVOR causal attention A_{masked}V. + """ + + result = [] + sums = tf.zeros_like(tf.einsum("ijk,ijl->ijkl", ks[0], vs[0])) + + for index in range(qs.shape[0]): + sums = sums + tf.einsum("ijk,ijl->ijkl", ks[index], vs[index]) + result.append(tf.einsum("ijkl,ijk->ijl", sums, qs[index])[None, Ellipsis]) + + result = tf.concat(result, axis=0) + + def grad(res_grad): + + grads = tf.zeros_like(tf.einsum("ijk,ijl->ijkl", ks[0], vs[0])) + + gr_sums = sums + + q_grads = [] + k_grads = [] + v_grads = [] + + for index in range(qs.shape[0] - 1, -1, -1): + + q_grads.append( + tf.einsum("ijkl,ijl->ijk", gr_sums, res_grad[index])[None, Ellipsis]) + grads = grads + tf.einsum("ijk,ijl->ijkl", qs[index], res_grad[index]) + k_grads.append(tf.einsum("ijkl,ijl->ijk", grads, vs[index])[None, Ellipsis]) + v_grads.append(tf.einsum("ijkl,ijk->ijl", grads, ks[index])[None, Ellipsis]) + gr_sums = gr_sums - tf.einsum("ijk,ijl->ijkl", ks[index], vs[index]) + + q_grads = tf.concat(q_grads[::-1], axis=0) + k_grads = tf.concat(k_grads[::-1], axis=0) + v_grads = tf.concat(v_grads[::-1], axis=0) + + return q_grads, k_grads, v_grads + + return result, grad + + +@tf.custom_gradient +def causal_denominator(qs, ks): + 
"""Computes FAVOR normalizer in causal attention. + + Args: + qs: query_prime tensor of the shape [L,B,H,M]. + ks: key_prime tensor of the shape [L,B,H,M]. + + Returns: + FAVOR normalizer in causal attention. + """ + + result = [] + sums = tf.zeros_like(ks[0]) + + for index in range(qs.shape[0]): + sums = sums + ks[index] + result.append(tf.reduce_sum(qs[index] * sums, axis=2)[None, Ellipsis]) + + result = tf.concat(result, axis=0) + + def grad(res_grad): + + k_grad = tf.zeros_like(ks[0]) + + gr_sums = sums + + q_grads = [] + k_grads = [] + + for index in range(qs.shape[0] - 1, -1, -1): + + q_grads.append( + tf.einsum("ijk,ij->ijk", gr_sums, res_grad[index])[None, Ellipsis]) + k_grad = k_grad + tf.einsum("ijk,ij->ijk", qs[index], res_grad[index]) + k_grads.append(k_grad[None, Ellipsis]) + gr_sums = gr_sums - ks[index] + + q_grads = tf.concat(q_grads[::-1], axis=0) + k_grads = tf.concat(k_grads[::-1], axis=0) + + return q_grads, k_grads + + return result, grad + + +def favor_attention(query, + key, + value, + kernel_transformation, + causal, + projection_matrix=None): + """Computes FAVOR normalized attention. + + Args: + query: query tensor. + key: key tensor. + value: value tensor. + kernel_transformation: transformation used to get finite kernel features. + causal: whether attention is causal or not. + projection_matrix: projection matrix to be used. + + Returns: + FAVOR normalized attention. 
+ """ + query_prime = kernel_transformation(query, True, + projection_matrix) # [B,L,H,M] + key_prime = kernel_transformation(key, False, projection_matrix) # [B,L,H,M] + query_prime = tf.transpose(query_prime, [1, 0, 2, 3]) # [L,B,H,M] + key_prime = tf.transpose(key_prime, [1, 0, 2, 3]) # [L,B,H,M] + value = tf.transpose(value, [1, 0, 2, 3]) # [L,B,H,D] + + if causal: + av_attention = causal_numerator(query_prime, key_prime, value) + attention_normalizer = causal_denominator(query_prime, key_prime) + else: + av_attention = noncausal_numerator(query_prime, key_prime, value) + attention_normalizer = noncausal_denominator(query_prime, key_prime) + # TODO(kchoro): Add more comments. + av_attention = tf.transpose(av_attention, [1, 0, 2, 3]) + attention_normalizer = tf.transpose(attention_normalizer, [1, 0, 2]) + attention_normalizer = tf.expand_dims(attention_normalizer, + len(attention_normalizer.shape)) + return av_attention / attention_normalizer + + +class Attention(tf.keras.layers.Layer): + """Multi-headed attention layer.""" + + def __init__(self, + hidden_size, + num_heads, + attention_dropout, + kernel_transformation=relu_kernel_transformation, + numerical_stabilizer=0.001, + causal=False, + projection_matrix_type=None, + nb_random_features=0): + """Initialize Attention. + + Args: + hidden_size: int, output dim of hidden layer. + num_heads: int, number of heads to repeat the same attention structure. + attention_dropout: float, dropout rate inside attention for training. + kernel_transformation: transformation used to produce kernel features for + attention. + numerical_stabilizer: used to bound away from zero kernel values. + causal: whether attention is causal or not. + projection_matrix_type: None if Identity should be used, otherwise random + projection matrix will be applied. + nb_random_features: number of random features to be used (relevant only if + projection_matrix is not None). 
+ """ + if hidden_size % num_heads: + raise ValueError( + "Hidden size ({}) must be divisible by the number of heads ({})." + .format(hidden_size, num_heads)) + + super(Attention, self).__init__() + self.hidden_size = hidden_size + self.num_heads = num_heads + self.attention_dropout = attention_dropout + self.kernel_transformation = kernel_transformation + self.numerical_stabilizer = numerical_stabilizer + self.causal = causal + self.projection_matrix_type = projection_matrix_type + self.nb_random_features = nb_random_features + + def build(self, input_shape): + """Builds the layer.""" + # Layers for linearly projecting the queries, keys, and values. + size_per_head = self.hidden_size // self.num_heads + + def _glorot_initializer(fan_in, fan_out): + limit = math.sqrt(6.0 / (fan_in + fan_out)) + return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit) + + attention_initializer = _glorot_initializer(input_shape.as_list()[-1], + self.hidden_size) + self.query_dense_layer = util.DenseEinsum( + output_shape=(self.num_heads, size_per_head), + kernel_initializer=attention_initializer, + use_bias=False, + name="query") + self.key_dense_layer = util.DenseEinsum( + output_shape=(self.num_heads, size_per_head), + kernel_initializer=attention_initializer, + use_bias=False, + name="key") + self.value_dense_layer = util.DenseEinsum( + output_shape=(self.num_heads, size_per_head), + kernel_initializer=attention_initializer, + use_bias=False, + name="value") + + output_initializer = _glorot_initializer(self.hidden_size, self.hidden_size) + self.output_dense_layer = util.DenseEinsum( + output_shape=self.hidden_size, + num_summed_dimensions=2, + kernel_initializer=output_initializer, + use_bias=False, + name="output_transform") + super(Attention, self).build(input_shape) + + def get_config(self): + return { + "hidden_size": self.hidden_size, + "num_heads": self.num_heads, + "attention_dropout": self.attention_dropout, + } + + def call(self, + query_input, + 
source_input, + bias, + training, + cache=None, + decode_loop_step=None): + """Apply attention mechanism to query_input and source_input. + + Args: + query_input: A tensor with shape [batch_size, length_query, hidden_size]. + source_input: A tensor with shape [batch_size, length_source, + hidden_size]. + bias: A tensor with shape [batch_size, 1, length_query, length_source], + the attention bias that will be added to the result of the dot product. + training: A bool, whether in training mode or not. + cache: (Used during prediction) A dictionary with tensors containing + results of previous attentions. The dictionary must have the items: + {"k": tensor with shape [batch_size, i, heads, dim_per_head], + "v": tensor with shape [batch_size, i, heads, dim_per_head]} where + i is the current decoded length for non-padded decode, or max + sequence length for padded decode. + decode_loop_step: An integer, step number of the decoding loop. Used only + for autoregressive inference on TPU. + + Returns: + Attention layer output with shape [batch_size, length_query, hidden_size] + """ + # Linearly project the query, key and value using different learned + # projections. Splitting heads is automatically done during the linear + # projections --> [batch_size, length, num_heads, dim_per_head]. + query = self.query_dense_layer(query_input) + key = self.key_dense_layer(source_input) + value = self.value_dense_layer(source_input) + + if self.projection_matrix_type is None: + projection_matrix = None + else: + dim = query.shape[-1] + seed = tf.math.ceil(tf.math.abs(tf.math.reduce_sum(query) * BIG_CONSTANT)) + seed = tf.dtypes.cast(seed, tf.int32) + projection_matrix = create_projection_matrix( + self.nb_random_features, dim, seed=seed) + + if cache is not None: + # Combine cached keys and values with new keys and values. 
+ if decode_loop_step is not None: + cache_k_shape = cache["k"].shape.as_list() + indices = tf.reshape( + tf.one_hot(decode_loop_step, cache_k_shape[1], dtype=key.dtype), + [1, cache_k_shape[1], 1, 1]) + key = cache["k"] + key * indices + cache_v_shape = cache["v"].shape.as_list() + indices = tf.reshape( + tf.one_hot(decode_loop_step, cache_v_shape[1], dtype=value.dtype), + [1, cache_v_shape[1], 1, 1]) + value = cache["v"] + value * indices + else: + key = tf.concat([tf.cast(cache["k"], key.dtype), key], axis=1) + value = tf.concat([tf.cast(cache["v"], value.dtype), value], axis=1) + + # Update cache + cache["k"] = key + cache["v"] = value + + attention_output = favor_attention(query, key, value, + self.kernel_transformation, self.causal, + projection_matrix) + attention_output = self.output_dense_layer(attention_output) + return attention_output + + +class SelfAttention(Attention): + """Multiheaded self-attention layer.""" + + def call(self, + query_input, + bias, + training, + cache=None, + decode_loop_step=None): + return super(SelfAttention, self).call(query_input, query_input, bias, + training, cache, decode_loop_step) diff --git a/pretrained-model/performer/model.py b/pretrained-model/performer/model.py new file mode 100644 index 00000000..ef61db37 --- /dev/null +++ b/pretrained-model/performer/model.py @@ -0,0 +1,230 @@ +import tensorflow as tf +import numpy as np + + +def gelu(x): + cdf = 0.5 * ( + 1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044_715 * tf.pow(x, 3)))) + ) + return x * cdf + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + Returns: + A list of dimensions of the shape of tensor. 
All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def embedding_lookup( + input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name='word_embeddings', + use_one_hot_embeddings=False, +): + """Looks up words embeddings for id tensor. + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. 
+ if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) + + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range), + ) + + flat_input_ids = tf.reshape(input_ids, [-1]) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) + output = tf.matmul(one_hot_input_ids, embedding_table) + else: + output = tf.gather(embedding_table, flat_input_ids) + + input_shape = get_shape_list(input_ids) + + output = tf.reshape( + output, input_shape[0:-1] + [input_shape[-1] * embedding_size] + ) + return (output, embedding_table) + + +def embedding_postprocessor( + input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=2, + token_type_embedding_name='token_type_embeddings', + use_position_embeddings=True, + position_embedding_name='position_embeddings', + initializer_range=0.02, + max_position_embeddings=512, +): + """Performs various post-processing on a word embedding tensor. + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. + initializer_range: float. Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. 
+ Note: this function takes no `dropout_prob` argument and applies no dropout. + Returns: + float tensor with same shape as `input_tensor`. + Raises: + ValueError: One of the tensor shapes or input values is invalid. + """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + + if use_token_type: + if token_type_ids is None: + raise ValueError( + '`token_type_ids` must be specified if' + '`use_token_type` is True.' + ) + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range), + ) + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot( + flat_token_type_ids, depth=token_type_vocab_size + ) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape( + token_type_embeddings, [batch_size, seq_length, width] + ) + output += token_type_embeddings + + if use_position_embeddings: + assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + with tf.control_dependencies([assert_op]): + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range), + ) + position_embeddings = tf.slice( + full_position_embeddings, [0, 0], [seq_length, -1] + ) + num_dims = len(output.shape.as_list()) + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape( + position_embeddings, position_broadcast_shape + ) + output += position_embeddings + + return output + + +class Forward(tf.keras.layers.Layer): + def __init__(self, dim, mlp_dim, dropout, **kwargs): + super(Forward, self).__init__(**kwargs) + self.rate = dropout + self.dense1 = 
tf.keras.layers.Dense(mlp_dim, activation=gelu) + self.dense2 = tf.keras.layers.Dense(dim) + self.dropout = tf.keras.layers.Dropout(self.rate) + + def call(self, inputs, training=True): + X = self.dense1(inputs) + X = self.dropout(X, training=training) + X = self.dense2(X) + X = self.dropout(X, training=training) + return X + + +class FNetBlock(tf.keras.layers.Layer): + def __init__(self, dim, mlp_dim, dropout=0.1, **kwargs): + super(FNetBlock, self).__init__(name='FNetBlock', **kwargs) + self.norm_fourier = tf.keras.layers.LayerNormalization() + self.norm_ffn = tf.keras.layers.LayerNormalization() + self.ffn = Forward(dim, mlp_dim, dropout=dropout) + + def call(self, inputs, training=True): + X_complex = tf.cast(inputs, tf.complex64) + X_fft = tf.math.real(tf.signal.fft2d(X_complex)) + X_norm1 = self.norm_fourier(X_fft + inputs, training=training) + X_dense = self.ffn(X_norm1, training=training) + X_norm2 = self.norm_ffn(X_dense + X_norm1, training=training) + return X_norm2 + + +class Model(tf.keras.Model): + def __init__( + self, + hidden_size, + vocab_size, + nlayer, + head_size, + intermediate_size, + dropout=0.1, + dropout_embedding=0.1, + max_position_embeddings=512, + **kwargs, + ): + super(Model, self).__init__(name='Model', **kwargs) + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.dropout_embedding = dropout_embedding + self.max_position_embeddings = max_position_embeddings diff --git a/pretrained-model/performer/test-performer.ipynb b/pretrained-model/performer/test-performer.ipynb new file mode 100644 index 00000000..1091d43d --- /dev/null +++ b/pretrained-model/performer/test-performer.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "black-italian", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": 
"ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pretrained-model/performer/util.py b/pretrained-model/performer/util.py new file mode 100644 index 00000000..f461ef79 --- /dev/null +++ b/pretrained-model/performer/util.py @@ -0,0 +1,195 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras-based einsum layer. + +Copied from +https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/dense_einsum.py. 
+""" +# pylint: disable=g-classes-have-attributes + +import tensorflow as tf + +_CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"] + + +@tf.keras.utils.register_keras_serializable(package="Text") +class DenseEinsum(tf.keras.layers.Layer): + """A densely connected layer that uses tf.einsum as the backing computation. + + This layer can perform einsum calculations of arbitrary dimensionality. + + Arguments: + output_shape: Positive integer or tuple, dimensionality of the output space. + num_summed_dimensions: The number of dimensions to sum over. Standard 2D + matmul should use 1, 3D matmul should use 2, and so forth. + activation: Activation function to use. If you don't specify anything, no + activation is applied + (i.e. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation"). NOTE: `__init__` currently ignores this argument. + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. + bias_constraint: Constraint function applied to the bias vector. + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. The most common + situation would be a 2D input with shape `(batch_size, input_dim)`. + Output shape: + N-D tensor with shape: `(batch_size, ..., units)`. For instance, for a 2D + input with shape `(batch_size, input_dim)`, the output would have shape + `(batch_size, units)`. 
+ """ + + def __init__(self, + output_shape, + num_summed_dimensions=1, + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs): + super(DenseEinsum, self).__init__(**kwargs) + self._output_shape = output_shape if isinstance( + output_shape, (list, tuple)) else (output_shape,) + self._activation = tf.keras.activations.get(activation) + self._use_bias = use_bias + self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self._bias_initializer = tf.keras.initializers.get(bias_initializer) + self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._bias_constraint = tf.keras.constraints.get(bias_constraint) + self._num_summed_dimensions = num_summed_dimensions + self._einsum_string = None + + def _build_einsum_string(self, free_input_dims, bound_dims, output_dims): + input_str = "" + kernel_str = "" + output_str = "" + letter_offset = 0 + for i in range(free_input_dims): + char = _CHR_IDX[i + letter_offset] + input_str += char + output_str += char + + letter_offset += free_input_dims + for i in range(bound_dims): + char = _CHR_IDX[i + letter_offset] + input_str += char + kernel_str += char + + letter_offset += bound_dims + for i in range(output_dims): + char = _CHR_IDX[i + letter_offset] + kernel_str += char + output_str += char + + return input_str + "," + kernel_str + "->" + output_str + + def build(self, input_shape): + input_shape = tf.TensorShape(input_shape) + input_rank = input_shape.rank + free_input_dims = input_rank - self._num_summed_dimensions + output_dims = len(self._output_shape) + + self._einsum_string = self._build_einsum_string(free_input_dims, + self._num_summed_dimensions, + 
output_dims) + + # This is only saved for testing purposes. + self._kernel_shape = ( + input_shape[free_input_dims:].concatenate(self._output_shape)) + + self._kernel = self.add_weight( + "kernel", + shape=self._kernel_shape, + initializer=self._kernel_initializer, + regularizer=self._kernel_regularizer, + constraint=self._kernel_constraint, + dtype=self.dtype, + trainable=True) + if self._use_bias: + self._bias = self.add_weight( + "bias", + shape=self._output_shape, + initializer=self._bias_initializer, + regularizer=self._bias_regularizer, + constraint=self._bias_constraint, + dtype=self.dtype, + trainable=True) + else: + self._bias = None + super(DenseEinsum, self).build(input_shape) + + def get_config(self): + config = { + "output_shape": + self._output_shape, + "num_summed_dimensions": + self._num_summed_dimensions, + "activation": + tf.keras.activations.serialize(self._activation), + "use_bias": + self._use_bias, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint) + } + base_config = super(DenseEinsum, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, inputs): + ret = tf.einsum(self._einsum_string, inputs, self._kernel) + if self._use_bias: + ret += self._bias + if self._activation is not None: + ret = self._activation(ret) + return ret diff --git a/session/dependency/albert-base.ipynb b/session/dependency/albert-base.ipynb index e2309c4e..c03eaced 100644 --- 
a/session/dependency/albert-base.ipynb +++ b/session/dependency/albert-base.ipynb @@ -7,29 +7,13 @@ "outputs": [], "source": [ "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES'] = '3'" + "os.environ['CUDA_VISIBLE_DEVICES'] = '1'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [ - "with open('../Malaya-Dataset/dependency/gsd-ud-train.conllu.txt') as fopen:\n", - " corpus = fopen.read().split('\\n')\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-test.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-dev.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [ { "name": "stdout", @@ -45,12 +29,13 @@ "from albert import optimization\n", "from albert import tokenization\n", "import tensorflow as tf\n", - "import numpy as np" + "import numpy as np\n", + "import json" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -66,235 +51,82 @@ "source": [ "tokenizer = tokenization.FullTokenizer(\n", " vocab_file='albert-base-2020-04-10/sp10m.cased.v10.vocab', do_lower_case=False,\n", - " spm_model_file='albert-base-2020-04-10/sp10m.cased.v10.model')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "tag2idx = {'PAD': 0, 'X': 1}\n", - "tag_idx = 2\n", - "\n", - "def process_corpus(corpus, until = None):\n", - " global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx\n", - " sentences, words, depends, labels, pos, sequences = [], [], [], [], [], []\n", - " temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []\n", - " first_time = True\n", - " for sentence in corpus:\n", - " try:\n", - " if len(sentence):\n", - " if sentence[0] == '#':\n", - " continue\n", - " if 
first_time:\n", - " print(sentence)\n", - " first_time = False\n", - " sentence = sentence.split('\\t')\n", - " if sentence[7] not in tag2idx:\n", - " tag2idx[sentence[7]] = tag_idx\n", - " tag_idx += 1\n", - " temp_word.append(sentence[1])\n", - " temp_depend.append(int(sentence[6]) + 1)\n", - " temp_label.append(tag2idx[sentence[7]])\n", - " temp_sentence.append(sentence[1])\n", - " temp_pos.append(sentence[3])\n", - " else:\n", - " if len(temp_sentence) < 2 or len(temp_word) != len(temp_label):\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " continue\n", - " bert_tokens = ['[CLS]']\n", - " labels_ = [0]\n", - " depends_ = [0]\n", - " seq_ = []\n", - " for no, orig_token in enumerate(temp_word):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " seq_.append(no + 1)\n", - " bert_tokens.append('[SEP]')\n", - " labels_.append(0)\n", - " depends_.append(0)\n", - " words.append(tokenizer.convert_tokens_to_ids(bert_tokens))\n", - " depends.append(depends_)\n", - " labels.append(labels_)\n", - " sentences.append(bert_tokens)\n", - " pos.append(temp_pos)\n", - " sequences.append(seq_)\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " except Exception as e:\n", - " print(e, sentence)\n", - " return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1], sequences[:-1]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\tSembungan\tsembungan\tPROPN\tX--\t_\t4\tnsubj\t_\tMorphInd=^sembungan_X--$\n" - ] - } - ], - "source": [ - "sentences, words, depends, labels, _, _ = process_corpus(corpus)" - ] - }, - { - 
"cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open('../Malaya-Dataset/dependency/augmented-dependency.json') as fopen:\n", - " augmented = json.load(fopen)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "text_augmented, depends_augmented, labels_augmented = [], [], []\n", - "\n", - "for a in augmented:\n", - " text_augmented.extend(a[0])\n", - " depends_augmented.extend(a[1])\n", - " labels_augmented.extend((np.array(a[2]) + 1).tolist())" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def parse_XY(texts, depends, labels):\n", - " outside, sentences, outside_depends, outside_labels = [], [], [], []\n", - " for no, text in enumerate(texts):\n", - " temp_depend = depends[no]\n", - " temp_label = labels[no]\n", - " s = text.split()\n", - " sentences.append(s)\n", - " bert_tokens = ['[CLS]']\n", - " labels_ = [0]\n", - " depends_ = [0]\n", - " for no, orig_token in enumerate(s):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " bert_tokens.append('[SEP]')\n", - " labels_.append(0)\n", - " depends_.append(0)\n", - " outside.append(tokenizer.convert_tokens_to_ids(bert_tokens))\n", - " outside_depends.append(depends_)\n", - " outside_labels.append(labels_)\n", - " return outside, sentences, outside_depends, outside_labels" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "outside, _, outside_depends, outside_labels = parse_XY(text_augmented, \n", - " depends_augmented, \n", - " labels_augmented)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - 
"words.extend(outside)\n", - "depends.extend(outside_depends)\n", - "labels.extend(outside_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "idx2tag = {v:k for k, v in tag2idx.items()}" + " spm_model_file='albert-base-2020-04-10/sp10m.cased.v10.model')\n" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "from sklearn.model_selection import train_test_split\n", + "import pickle\n", "\n", - "words_train, words_test, depends_train, depends_test, labels_train, labels_test \\\n", - "= train_test_split(words, depends, labels, test_size = 0.2)" + "with open('train_X.pkl', 'rb') as fopen:\n", + " train_X, train_Y, train_depends = pickle.load(fopen)\n", + " \n", + "with open('test_X.pkl', 'rb') as fopen:\n", + " test_X, test_Y, test_depends = pickle.load(fopen)\n", + " \n", + "with open('tags.pkl', 'rb') as fopen:\n", + " idx2tag, tag2idx = pickle.load(fopen)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(40289, 10073)" + "{'PAD': 0,\n", + " 'X': 1,\n", + " 'nsubj': 2,\n", + " 'cop': 3,\n", + " 'det': 4,\n", + " 'root': 5,\n", + " 'nsubj:pass': 6,\n", + " 'acl': 7,\n", + " 'case': 8,\n", + " 'obl': 9,\n", + " 'flat': 10,\n", + " 'punct': 11,\n", + " 'appos': 12,\n", + " 'amod': 13,\n", + " 'compound': 14,\n", + " 'advmod': 15,\n", + " 'cc': 16,\n", + " 'obj': 17,\n", + " 'conj': 18,\n", + " 'mark': 19,\n", + " 'advcl': 20,\n", + " 'nmod': 21,\n", + " 'nummod': 22,\n", + " 'dep': 23,\n", + " 'xcomp': 24,\n", + " 'ccomp': 25,\n", + " 'parataxis': 26,\n", + " 'compound:plur': 27,\n", + " 'fixed': 28,\n", + " 'aux': 29,\n", + " 'csubj': 30,\n", + " 'iobj': 31,\n", + " 'csubj:pass': 32}" ] }, - "execution_count": 14, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(words_train), len(words_test)" - 
] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "train_X = words_train\n", - "train_Y = labels_train\n", - "train_depends = depends_train\n", - "\n", - "test_X = words_test\n", - "test_Y = labels_test\n", - "test_depends = depends_test" + "tag2idx" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -308,10 +140,10 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 16, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -323,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -332,11 +164,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "epoch = 30\n", + "epoch = 3\n", "batch_size = 32\n", "warmup_proportion = 0.1\n", "num_train_steps = int(len(train_X) / batch_size * epoch)\n", @@ -345,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -378,8 +210,15 @@ " e = tf.expand_dims(tf.expand_dims(mask_e, 1), 2)\n", " output = output * d * e\n", " \n", - " return output\n", - " \n", + " return output" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ "class BiLinear:\n", " def __init__(self, left_features, right_features, out_features):\n", " self.left_features = left_features\n", @@ -404,8 +243,17 @@ " output = output + tf.matmul(input_left, tf.transpose(self.W_l))\\\n", " + tf.matmul(input_right, tf.transpose(self.W_r))\n", " \n", - " return tf.reshape(output, output_shape)\n", - " \n", + " return tf.reshape(output, output_shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "_NEG_INF = -1e9\n", + "\n", "class Model:\n", " def __init__(\n", " self,\n", 
@@ -437,14 +285,20 @@ " config=albert_config,\n", " is_training=training,\n", " input_ids=self.words,\n", + " input_mask=self.mask,\n", " use_one_hot_embeddings=False)\n", + " \n", " output_layer = model.get_sequence_output()\n", " \n", " arc_h = tf.nn.elu(self.arc_h(output_layer))\n", " arc_c = tf.nn.elu(self.arc_c(output_layer))\n", + " self._arc_h = arc_h\n", + " self._arc_c = arc_c\n", " \n", " type_h = tf.nn.elu(self.type_h(output_layer))\n", " type_c = tf.nn.elu(self.type_c(output_layer))\n", + " self._type_h = type_h\n", + " self._type_c = type_c\n", " \n", " out_arc = tf.squeeze(self.attention.forward(arc_h, arc_c, mask_d=self.mask, \n", " mask_e=self.mask), axis = 1)\n", @@ -463,6 +317,11 @@ " self.heads_seq = tf.argmax(decode_arc, axis = 1)\n", " self.heads_seq = tf.identity(self.heads_seq, name = 'heads_seq')\n", " \n", + "# self.decode_arc_t = tf.transpose(decode_arc, (0, 2, 1))\n", + "# sequence_loss_depends = tf.contrib.seq2seq.sequence_loss(logits = self.decode_arc_t,\n", + "# targets = self.heads,\n", + "# weights = mask)\n", + " \n", " t = tf.cast(tf.transpose(self.heads_seq), tf.int32)\n", " broadcasted = tf.broadcast_to(batch_index, tf.shape(t))\n", " concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), \n", @@ -537,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -567,20 +426,20 @@ "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use `layer.__call__` method instead.\n", - "WARNING:tensorflow:From :110: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :61: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future 
version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/contrib/crf/python/ops/crf.py:213: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use `keras.layers.RNN(cell)`, which is equivalent to this API\n", - "WARNING:tensorflow:From :145: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :101: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "dim is deprecated, use axis instead\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/albert/optimization.py:36: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.\n", "\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/albert/optimization.py:41: The name tf.train.polynomial_decay is deprecated. Please use tf.compat.v1.train.polynomial_decay instead.\n", "\n", - "INFO:tensorflow:++++++ warmup starts at step 0, for 3777 steps ++++++\n", + "INFO:tensorflow:++++++ warmup starts at step 0, for 29336 steps ++++++\n", "INFO:tensorflow:using adamw\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/albert/optimization.py:101: The name tf.trainable_variables is deprecated. 
Please use tf.compat.v1.trainable_variables instead.\n", "\n" @@ -592,7 +451,7 @@ "sess = tf.InteractiveSession()\n", "\n", "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "model = Model(learning_rate, hidden_size_word)\n", "sess.run(tf.global_variables_initializer())" @@ -600,7 +459,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -619,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -635,16 +494,16 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.03448276, 0.00862069, 35.837]" + "[0.015625, 0.15625, 35.86845]" ] }, - "execution_count": 23, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -659,16 +518,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.03448276, 0.00862069, 334.12787]" + "[0.015625, 0.15625, 156.66893]" ] }, - "execution_count": 24, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -683,531 +542,120 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [10:06<00:00, 2.08it/s, accuracy=0.484, accuracy_depends=0.516, cost=2.22]\n", - "test minibatch loop: 100%|██████████| 315/315 [01:21<00:00, 3.89it/s, accuracy=0.566, accuracy_depends=0.537, cost=1.86]\n", - "train minibatch loop: 0%| | 0/1260 [00:00?@[\\]^_`{|}~'\n", + "\n", + "def transformer_textcleaning(string):\n", + " \"\"\"\n", + " use by any transformer model before tokenization\n", + " \"\"\"\n", + " string = unidecode(string)\n", + " string = re.sub('\\\\(dot\\\\)', '.', string)\n", + " string = (\n", 
+ " re.sub(re.findall(r'\\', string)[0], '', string)\n", + " if (len(re.findall(r'\\', string)) > 0)\n", + " and ('href' in re.findall(r'\\', string)[0])\n", + " else string\n", + " )\n", + " string = re.sub(\n", + " r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n", + " )\n", + " string = re.sub(r'[ ]+', ' ', string).strip().split()\n", + " string = [w for w in string if w[0] != '@']\n", + " string = ' '.join(string)\n", + " string = re.sub(f'([{PUNCTUATION}])', r' \\1 ', string)\n", + " string = re.sub('\\s{2,}', ' ', string)\n", + " original_string = string.split()\n", + " string = [\n", + " (original_string[no], word.title() if word.isupper() else word)\n", + " for no, word in enumerate(string.split())\n", + " if len(word)\n", + " ]\n", + " return [s[0] for s in string], [s[1] for s in string]\n", + "\n", + "def parse_X(left):\n", + " bert_tokens = ['[CLS]']\n", + " for no, orig_token in enumerate(left):\n", + " t = tokenizer.tokenize(orig_token)\n", + " bert_tokens.extend(t)\n", + " bert_tokens.append(\"[SEP]\")\n", + " t = tokenizer.convert_tokens_to_ids(bert_tokens)\n", + " return t, bert_tokens, [1] * len(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "def dependency_graph(tagging, indexing):\n", + " \"\"\"\n", + " Return helper object for dependency parser results. 
Only accept tagging and indexing outputs from dependency models.\n", + " \"\"\"\n", + " result = []\n", + " for i in range(len(tagging)):\n", + " result.append(\n", + " '%d\\t%s\\t_\\t_\\t_\\t_\\t%d\\t%s\\t_\\t_'\n", + " % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])\n", + " )\n", + " return DependencyGraph('\\n'.join(result), top_relation_label='root')" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "2\n", + "2 (makan)\n", + "\n", + "\n", + "\n", + "0->2\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (husein)\n", + "\n", + "\n", + "\n", + "2->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "3\n", + "3 (ayam)\n", + "\n", + "\n", + "\n", + "2->3\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "string = 'husein makan ayam'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, bert_sequence, mask = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", 
+ "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Kuala)\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Lumpur)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "13\n", + "13 (membidas)\n", + "\n", + "\n", + "\n", + "1->13\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "13->1\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "4\n", + "4 (Ketua)\n", + "\n", + "\n", + "\n", + "13->4\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "14\n", + "14 (kenyataan)\n", + "\n", + "\n", + "\n", + "13->14\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "33\n", + "33 (melaksanakan)\n", + "\n", + "\n", + "\n", + "13->33\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "37\n", + "37 (.)\n", + "\n", + "\n", + "\n", + "13->37\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "39\n", + "39 (berkata)\n", + "\n", + "\n", + "\n", + "13->39\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "3\n", + "3 (:)\n", + "\n", + "\n", + "\n", + "4->3\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "7\n", + "7 (,)\n", + "\n", + "\n", + "\n", + "4->7\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "5\n", + "5 (Penerangan)\n", + "\n", + "\n", + "\n", + "4->5\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "8\n", + "8 (Datuk)\n", + "\n", + "\n", + "\n", + "4->8\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "6\n", + "6 (Bersatu)\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "9\n", + "9 (Wan)\n", + "\n", + "\n", + "\n", + "8->9\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "10\n", + "10 (Saiful)\n", + "\n", + "\n", + "\n", + "9->10\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "11\n", + "11 (Wan)\n", + "\n", + "\n", + "\n", + "10->11\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "12\n", + "12 (Jan)\n", + "\n", + "\n", + "\n", + "11->12\n", + 
"\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "34\n", + "34 (sekatan)\n", + "\n", + "\n", + "\n", + "33->34\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "38\n", + "38 (Beliau)\n", + "\n", + "\n", + "\n", + "39->38\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "41\n", + "41 (Najib)\n", + "\n", + "\n", + "\n", + "39->41\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "46\n", + "46 (memetik)\n", + "\n", + "\n", + "\n", + "39->46\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "61\n", + "61 (.)\n", + "\n", + "\n", + "\n", + "39->61\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "64\n", + "64 (berkata)\n", + "\n", + "\n", + "\n", + "39->64\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "15\n", + "15 (Datuk)\n", + "\n", + "\n", + "\n", + "16\n", + "16 (Seri)\n", + "\n", + "\n", + "\n", + "15->16\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "20\n", + "20 (Ketua)\n", + "\n", + "\n", + "\n", + "15->20\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "16->15\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "17\n", + "17 (Najib)\n", + "\n", + "\n", + "\n", + "16->17\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "24\n", + "24 (Datuk)\n", + "\n", + "\n", + "\n", + "16->24\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "19\n", + "19 (dan)\n", + "\n", + "\n", + "\n", + "20->19\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "21\n", + "21 (Pemuda)\n", + "\n", + "\n", + "\n", + "20->21\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "25\n", + "25 (Dr)\n", + "\n", + "\n", + "\n", + "20->25\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "23\n", + "23 (,)\n", + "\n", + "\n", + "\n", + "20->23\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "18\n", + "18 (Razak)\n", + "\n", + "\n", + "\n", + "17->18\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "30\n", + "30 (mempertikaikan)\n", + "\n", + "\n", + "\n", + 
"24->30\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "22\n", + "22 (Umno)\n", + "\n", + "\n", + "\n", + "21->22\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "26\n", + "26 (Asyraf)\n", + "\n", + "\n", + "\n", + "25->26\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "29\n", + "29 (yang)\n", + "\n", + "\n", + "\n", + "30->29\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "31\n", + "31 (tindakan)\n", + "\n", + "\n", + "\n", + "30->31\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "27\n", + "27 (Wajdi)\n", + "\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "28\n", + "28 (Dusuki)\n", + "\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "32\n", + "32 (kerajaan)\n", + "\n", + "\n", + "\n", + "31->32\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "35\n", + "35 (pergerakan)\n", + "\n", + "\n", + "\n", + "34->35\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "36\n", + "36 (penuh)\n", + "\n", + "\n", + "\n", + "35->36\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "40\n", + "40 (,)\n", + "\n", + "\n", + "\n", + "41->40\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "43\n", + "43 (Asyraf)\n", + "\n", + "\n", + "\n", + "41->43\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "45\n", + "45 (sengaja)\n", + "\n", + "\n", + "\n", + "46->45\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "47\n", + "47 (kenyataan)\n", + "\n", + "\n", + "\n", + "46->47\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (Wan)\n", + "\n", + "\n", + "\n", + "64->62\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "65\n", + "65 (,)\n", + "\n", + "\n", + "\n", + "64->65\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "83\n", + "83 (.)\n", + "\n", + "\n", + "\n", + "64->83\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "68\n", + "68 (menjangka)\n", 
+ "\n", + "\n", + "\n", + "64->68\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "42\n", + "42 (dan)\n", + "\n", + "\n", + "\n", + "43->42\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "44\n", + "44 (Wajdi)\n", + "\n", + "\n", + "\n", + "43->44\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "48\n", + "48 (Perdana)\n", + "\n", + "\n", + "\n", + "47->48\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "49\n", + "49 (Menteri)\n", + "\n", + "\n", + "\n", + "48->49\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "50\n", + "50 (,)\n", + "\n", + "\n", + "\n", + "48->50\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "51\n", + "51 (Tan)\n", + "\n", + "\n", + "\n", + "48->51\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "57\n", + "57 (lengkap)\n", + "\n", + "\n", + "\n", + "48->57\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "52\n", + "52 (Sri)\n", + "\n", + "\n", + "\n", + "51->52\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "55\n", + "55 (yang)\n", + "\n", + "\n", + "\n", + "57->55\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (tidak)\n", + "\n", + "\n", + "\n", + "57->56\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "59\n", + "59 (mengelirukan)\n", + "\n", + "\n", + "\n", + "57->59\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "53\n", + "53 (Muhyiddin)\n", + "\n", + "\n", + "\n", + "52->53\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "54\n", + "54 (Yassin)\n", + "\n", + "\n", + "\n", + "53->54\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "58\n", + "58 (untuk)\n", + "\n", + "\n", + "\n", + "59->58\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "60\n", + "60 (rakyat)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "63\n", + "63 (Saiful)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + 
"66\n", + "66 (beliau)\n", + "\n", + "\n", + "\n", + "68->66\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "67\n", + "67 (sudah)\n", + "\n", + "\n", + "\n", + "68->67\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "69\n", + "69 (ada)\n", + "\n", + "\n", + "\n", + "68->69\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "70\n", + "70 (kenyataan)\n", + "\n", + "\n", + "\n", + "69->70\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "73\n", + "73 (Najib)\n", + "\n", + "\n", + "\n", + "69->73\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tulisan)\n", + "\n", + "\n", + "\n", + "69->75\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "71\n", + "71 (balas)\n", + "\n", + "\n", + "\n", + "70->71\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "72\n", + "72 (daripada)\n", + "\n", + "\n", + "\n", + "73->72\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "74\n", + "74 (mengenai)\n", + "\n", + "\n", + "\n", + "75->74\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "77\n", + "77 (berhubung)\n", + "\n", + "\n", + "\n", + "75->77\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "76\n", + "76 (beliau)\n", + "\n", + "\n", + "\n", + "77->76\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "78\n", + "78 (kesan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "79\n", + "79 (positif)\n", + "\n", + "\n", + "\n", + "78->79\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "80\n", + "80 (sekatan)\n", + "\n", + "\n", + "\n", + "78->80\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "81\n", + "81 (pergerakan)\n", + "\n", + "\n", + "\n", + "80->81\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "82\n", + "82 (penuh)\n", + "\n", + "\n", + "\n", + "81->82\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 69, + "metadata": 
{}, + "output_type": "execute_result" + } + ], + "source": [ + "string = 'KUALA LUMPUR: Ketua Penerangan BERSATU, Datuk Wan Saiful Wan Jan membidas kenyataan Datuk Seri Najib Razak dan Ketua Pemuda UMNO, Datuk Dr Asyraf Wajdi Dusuki yang mempertikaikan tindakan kerajaan melaksanakan sekatan pergerakan penuh. Beliau berkata, Najib dan Asyraf Wajdi sengaja memetik kenyataan Perdana Menteri, Tan Sri Muhyiddin Yassin yang tidak lengkap untuk mengelirukan rakyat. Wan Saiful berkata, beliau sudah menjangka ada kenyataan balas daripada Najib mengenai tulisan beliau berhubung kesan positif sekatan pergerakan penuh.'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, bert_sequence, mask = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -1289,7 +1999,7 @@ "'albert-base-dependency/model.ckpt'" ] }, - "execution_count": 27, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } @@ -1301,22 +2011,14 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 72, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. 
You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", - " warnings.warn('An interactive session is already active. This can '\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:++++++ warmup starts at step 0, for 3777 steps ++++++\n", + "INFO:tensorflow:++++++ warmup starts at step 0, for 29336 steps ++++++\n", "INFO:tensorflow:using adamw\n", "INFO:tensorflow:Restoring parameters from albert-base-dependency/model.ckpt\n" ] @@ -1327,7 +2029,7 @@ "sess = tf.InteractiveSession()\n", "\n", "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "model = Model(learning_rate, hidden_size_word, training = False)\n", "\n", @@ -1338,7 +2040,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1354,7 +2056,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1399,14 +2101,14 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 75, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 315/315 [01:27<00:00, 3.61it/s]\n" + "100%|██████████| 313/313 [00:51<00:00, 6.09it/s]\n" ] } ], @@ -1443,7 +2145,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1458,7 +2160,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 77, "metadata": {}, "outputs": [ { @@ -1467,43 +2169,41 @@ "text": [ " precision recall f1-score support\n", "\n", - " PAD 1.00000 1.00000 1.00000 905035\n", - " X 0.99997 0.99998 0.99998 159607\n", - " acl 0.89111 0.88994 0.89052 6051\n", - " advcl 0.75213 0.78003 0.76583 2373\n", - " advmod 0.89975 0.92642 0.91289 9378\n", - " amod 0.86607 0.87808 0.87204 8145\n", - " appos 0.87914 0.89496 0.88698 4779\n", - " aux 1.00000 
0.37500 0.54545 8\n", - " case 0.96890 0.97142 0.97016 21521\n", - " cc 0.96049 0.96393 0.96221 6405\n", - " ccomp 0.70574 0.67583 0.69046 873\n", - " compound 0.88800 0.89660 0.89228 13530\n", - "compound:plur 0.93381 0.93981 0.93680 1246\n", - " conj 0.94147 0.93436 0.93790 8608\n", - " cop 0.94652 0.96651 0.95641 1941\n", - " csubj 0.75000 0.39623 0.51852 53\n", - " csubj:pass 0.77778 0.77778 0.77778 9\n", - " dep 0.81778 0.72871 0.77068 1010\n", - " det 0.91665 0.90606 0.91132 8314\n", - " fixed 0.87862 0.80565 0.84055 1168\n", - " flat 0.96177 0.93608 0.94875 20400\n", - " iobj 0.71429 0.42857 0.53571 35\n", - " mark 0.88640 0.88577 0.88608 2854\n", - " nmod 0.86857 0.90150 0.88473 8020\n", - " nsubj 0.89466 0.93382 0.91382 12633\n", - " nsubj:pass 0.91977 0.81904 0.86648 4045\n", - " nummod 0.95316 0.95864 0.95589 8003\n", - " obj 0.90795 0.92092 0.91439 10357\n", - " obl 0.93016 0.90607 0.91796 11466\n", - " parataxis 0.72669 0.62953 0.67463 718\n", - " punct 0.99482 0.99724 0.99603 33312\n", - " root 0.93869 0.94093 0.93981 10073\n", - " xcomp 0.85300 0.80468 0.82813 2524\n", + " PAD 1.00000 1.00000 1.00000 627830\n", + " X 1.00000 1.00000 1.00000 60741\n", + " acl 0.83673 0.81078 0.82355 3192\n", + " advcl 0.65863 0.62082 0.63917 1585\n", + " advmod 0.94333 0.93545 0.93938 6460\n", + " amod 0.89656 0.89594 0.89625 4363\n", + " appos 0.80183 0.74289 0.77124 3061\n", + " case 0.98074 0.98002 0.98038 10862\n", + " cc 0.98250 0.97005 0.97624 3473\n", + " ccomp 0.51095 0.40698 0.45307 344\n", + " compound 0.88936 0.93235 0.91035 11027\n", + "compound:plur 0.61538 0.55172 0.58182 29\n", + " conj 0.89128 0.87559 0.88336 5112\n", + " cop 0.97607 0.96453 0.97026 592\n", + " csubj 0.33333 0.12500 0.18182 8\n", + " dep 0.63277 0.63456 0.63366 353\n", + " det 0.94329 0.91823 0.93059 3840\n", + " fixed 0.90071 0.73837 0.81150 172\n", + " flat 0.95078 0.96312 0.95691 18492\n", + " iobj 0.00000 0.00000 0.00000 1\n", + " mark 0.91609 0.92954 0.92276 1703\n", + " nmod 
0.84561 0.82701 0.83621 4457\n", + " nsubj 0.84280 0.84806 0.84542 6891\n", + " nsubj:pass 0.81498 0.77772 0.79591 1903\n", + " nummod 0.95994 0.95256 0.95624 4427\n", + " obj 0.89881 0.91319 0.90594 6128\n", + " obl 0.86886 0.84068 0.85454 4965\n", + " parataxis 0.51012 0.35097 0.41584 359\n", + " punct 0.99626 0.99768 0.99697 20300\n", + " root 0.88585 0.91420 0.89980 10000\n", + " xcomp 0.71181 0.72865 0.72013 1522\n", "\n", - " accuracy 0.98785 1284494\n", - " macro avg 0.88860 0.84152 0.85761 1284494\n", - " weighted avg 0.98786 0.98785 0.98782 1284494\n", + " accuracy 0.98614 824192\n", + " macro avg 0.80630 0.77893 0.78998 824192\n", + " weighted avg 0.98598 0.98614 0.98603 824192\n", "\n" ] } @@ -1515,16 +2215,16 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "arc accuracy: 0.8118309576064845\n", - "types accuracy: 0.7931625589721538\n", - "root accuracy: 0.879281746031746\n" + "arc accuracy: 0.821895729831751\n", + "types accuracy: 0.797527251552637\n", + "root accuracy: 1.0\n" ] } ], @@ -1536,63 +2236,9 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 79, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Placeholder',\n", - " 'Placeholder_1',\n", - " 'Placeholder_2',\n", - " 'Placeholder_3',\n", - " 'W_d',\n", - " 'W_e',\n", - " 'U',\n", - " 'U-bi',\n", - " 'Wl',\n", - " 'Wr',\n", - " 'bert/embeddings/word_embeddings',\n", - " 'bert/embeddings/token_type_embeddings',\n", - " 'bert/embeddings/position_embeddings',\n", - " 'bert/embeddings/LayerNorm/gamma',\n", - " 'bert/encoder/embedding_hidden_mapping_in/kernel',\n", - " 'bert/encoder/embedding_hidden_mapping_in/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/bias',\n", - " 
'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/LayerNorm/gamma',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/LayerNorm_1/gamma',\n", - " 'bert/pooler/dense/kernel',\n", - " 'bert/pooler/dense/bias',\n", - " 'dense/kernel',\n", - " 'dense/bias',\n", - " 'dense_1/kernel',\n", - " 'dense_1/bias',\n", - " 'dense_2/kernel',\n", - " 'dense_2/bias',\n", - " 'dense_3/kernel',\n", - " 'dense_3/bias',\n", - " 'heads_seq',\n", - " 'tags_seq',\n", - " 'transitions',\n", - " 'logits']" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "strings = ','.join(\n", " [\n", @@ -1610,13 +2256,12 @@ " and 'adam' not in n.name\n", " and 'gradients/bert' not in n.name\n", " ]\n", - ")\n", - "strings.split(',')" + ")" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1651,7 +2296,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 81, "metadata": {}, "outputs": [ { @@ -1659,7 +2304,7 @@ "output_type": "stream", "text": [ 
"INFO:tensorflow:Restoring parameters from albert-base-dependency/model.ckpt\n", - "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use `tf.compat.v1.graph_util.convert_variables_to_constants`\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/graph_util_impl.py:277: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", @@ -1667,7 +2312,7 @@ "Use `tf.compat.v1.graph_util.extract_sub_graph`\n", "INFO:tensorflow:Froze 40 variables.\n", "INFO:tensorflow:Converted 40 variables to const ops.\n", - "3746 ops in the final graph.\n" + "3731 ops in the final graph.\n" ] } ], @@ -1677,165 +2322,46 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ - "string = 'husein makan ayam'\n", - "\n", - "import re\n", - "\n", - "def entities_textcleaning(string, lowering = False):\n", - " \"\"\"\n", - " use by entities recognition, pos recognition and dependency parsing\n", - " \"\"\"\n", - " string = re.sub('[^A-Za-z0-9\\-\\/() ]+', ' ', string)\n", - " string = re.sub(r'[ ]+', ' ', string).strip()\n", - " original_string = string.split()\n", - " if lowering:\n", - " string = string.lower()\n", - " string = [\n", - " (original_string[no], word.title() if word.isupper() else word)\n", - " for no, word in enumerate(string.split())\n", - " if len(word)\n", - " ]\n", - " return [s[0] for s in string], [s[1] for s in string]\n", - "\n", - "def parse_X(left):\n", - " bert_tokens = ['[CLS]']\n", - " for no, orig_token in enumerate(left):\n", - " t = 
tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " bert_tokens.append(\"[SEP]\")\n", - " return tokenizer.convert_tokens_to_ids(bert_tokens), bert_tokens\n", - "\n", - "sequence = entities_textcleaning(string)[1]\n", - "parsed_sequence, bert_sequence = parse_X(sequence)" + "transforms = ['add_default_attributes',\n", + " 'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n", + " 'fold_batch_norms',\n", + " 'fold_old_batch_norms',\n", + " 'quantize_weights(fallback_min=-10, fallback_max=10)',\n", + " 'strip_unused_nodes',\n", + " 'sort_by_execution_order']" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ - "def merge_sentencepiece_tokens_tagging(x, y):\n", - " new_paired_tokens = []\n", - " n_tokens = len(x)\n", - " rejected = ['[CLS]', '[SEP]']\n", - "\n", - " i = 0\n", - "\n", - " while i < n_tokens:\n", - "\n", - " current_token, current_label = x[i], y[i]\n", - " if not current_token.startswith('▁') and current_token not in rejected:\n", - " previous_token, previous_label = new_paired_tokens.pop()\n", - " merged_token = previous_token\n", - " merged_label = [previous_label]\n", - " while (\n", - " not current_token.startswith('▁')\n", - " and current_token not in rejected\n", - " ):\n", - " merged_token = merged_token + current_token.replace('▁', '')\n", - " merged_label.append(current_label)\n", - " i = i + 1\n", - " current_token, current_label = x[i], y[i]\n", - " merged_label = merged_label[0]\n", - " new_paired_tokens.append((merged_token, merged_label))\n", - "\n", - " else:\n", - " new_paired_tokens.append((current_token, current_label))\n", - " i = i + 1\n", - "\n", - " words = [\n", - " i[0].replace('▁', '')\n", - " for i in new_paired_tokens\n", - " if i[0] not in rejected\n", - " ]\n", - " labels = [i[1] for i in new_paired_tokens if i[0] not in rejected]\n", - " return words, labels" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - 
"metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", - " warnings.warn('An interactive session is already active. This can '\n" - ] - } - ], - "source": [ - "def load_graph(frozen_graph_filename):\n", - " with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n", - " graph_def = tf.GraphDef()\n", - " graph_def.ParseFromString(f.read())\n", - " with tf.Graph().as_default() as graph:\n", - " tf.import_graph_def(graph_def)\n", - " return graph\n", + "from tensorflow.tools.graph_transforms import TransformGraph\n", + "tf.set_random_seed(0)\n", "\n", - "g = load_graph('albert-base-dependency/frozen_model.pb')\n", - "x = g.get_tensor_by_name('import/Placeholder:0')\n", - "heads_seq = g.get_tensor_by_name('import/heads_seq:0')\n", - "tags_seq = g.get_tensor_by_name('import/logits:0')\n", - "test_sess = tf.InteractiveSession(graph = g)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "h, t = test_sess.run([heads_seq, tags_seq],\n", - " feed_dict = {\n", - " x: [parsed_sequence],\n", - " },\n", - ")\n", - "h = h[0] - 1\n", - "t = [idx2tag[d] for d in t[0]]\n", - "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", - "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('husein', 2), ('makan', 0), ('ayam', 0)]\n" - ] - } - ], - "source": [ - "print(list(zip(merged_h[0], merged_h[1])))" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": 
[], - "source": [ - "import boto3\n", + "pb = 'albert-base-dependency/frozen_model.pb'\n", + "input_graph_def = tf.GraphDef()\n", + "with tf.gfile.FastGFile(pb, 'rb') as f:\n", + " input_graph_def.ParseFromString(f.read())\n", "\n", - "bucketName = 'huseinhouse-storage'\n", - "Key = 'albert-base-dependency/frozen_model.pb'\n", - "outPutname = \"v34/dependency/albert-base-dependency.pb\"\n", + "if 'bert' in pb:\n", + " inputs = ['Placeholder']\n", + " a = ['dense/BiasAdd']\n", + "if 'xlnet' in pb:\n", + " inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n", + " a = ['transpose_3']\n", "\n", - "s3 = boto3.client('s3')\n", + "transformed_graph_def = TransformGraph(input_graph_def, \n", + " inputs,\n", + " ['logits', 'heads_seq'] + a, transforms)\n", "\n", - "s3.upload_file(Key,bucketName,outPutname)" + "with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n", + " f.write(transformed_graph_def.SerializeToString())" ] } ], @@ -1855,7 +2381,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/session/dependency/albert-tiny.ipynb b/session/dependency/albert-tiny.ipynb index 2feac1fc..4761c1a2 100644 --- a/session/dependency/albert-tiny.ipynb +++ b/session/dependency/albert-tiny.ipynb @@ -2,17 +2,17 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES'] = '2'" + "os.environ['CUDA_VISIBLE_DEVICES'] = '1'" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -29,12 +29,13 @@ "from albert import optimization\n", "from albert import tokenization\n", "import tensorflow as tf\n", - "import numpy as np" + "import numpy as np\n", + "import json" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -49,8 +50,26 @@ ], "source": [ 
"tokenizer = tokenization.FullTokenizer(\n", - " vocab_file='albert-tiny-2020-04-17/sp10m.cased.v10.vocab', do_lower_case=False,\n", - " spm_model_file='albert-tiny-2020-04-17/sp10m.cased.v10.model')" + " vocab_file='albert-base-2020-04-10/sp10m.cased.v10.vocab', do_lower_case=False,\n", + " spm_model_file='albert-base-2020-04-10/sp10m.cased.v10.model')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "\n", + "with open('train_X.pkl', 'rb') as fopen:\n", + " train_X, train_Y, train_depends = pickle.load(fopen)\n", + " \n", + "with open('test_X.pkl', 'rb') as fopen:\n", + " test_X, test_Y, test_depends = pickle.load(fopen)\n", + " \n", + "with open('tags.pkl', 'rb') as fopen:\n", + " idx2tag, tag2idx = pickle.load(fopen)" ] }, { @@ -69,7 +88,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -87,236 +106,17 @@ "execution_count": 6, "metadata": {}, "outputs": [], - "source": [ - "with open('../Malaya-Dataset/dependency/gsd-ud-train.conllu.txt') as fopen:\n", - " corpus = fopen.read().split('\\n')\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-test.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-dev.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "tag2idx = {'PAD': 0, 'X': 1}\n", - "tag_idx = 2\n", - "\n", - "def process_corpus(corpus, until = None):\n", - " global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx\n", - " sentences, words, depends, labels, pos, sequences = [], [], [], [], [], []\n", - " temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []\n", - " first_time = True\n", - " for sentence in corpus:\n", - " try:\n", - " if len(sentence):\n", - " if sentence[0] == '#':\n", - " 
continue\n", - " if first_time:\n", - " print(sentence)\n", - " first_time = False\n", - " sentence = sentence.split('\\t')\n", - " if sentence[7] not in tag2idx:\n", - " tag2idx[sentence[7]] = tag_idx\n", - " tag_idx += 1\n", - " temp_word.append(sentence[1])\n", - " temp_depend.append(int(sentence[6]) + 1)\n", - " temp_label.append(tag2idx[sentence[7]])\n", - " temp_sentence.append(sentence[1])\n", - " temp_pos.append(sentence[3])\n", - " else:\n", - " if len(temp_sentence) < 2 or len(temp_word) != len(temp_label):\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " continue\n", - " bert_tokens = ['[CLS]']\n", - " labels_ = [0]\n", - " depends_ = [0]\n", - " seq_ = []\n", - " for no, orig_token in enumerate(temp_word):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " seq_.append(no + 1)\n", - " bert_tokens.append('[SEP]')\n", - " labels_.append(0)\n", - " depends_.append(0)\n", - " words.append(tokenizer.convert_tokens_to_ids(bert_tokens))\n", - " depends.append(depends_)\n", - " labels.append(labels_)\n", - " sentences.append(bert_tokens)\n", - " pos.append(temp_pos)\n", - " sequences.append(seq_)\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " except Exception as e:\n", - " print(e, sentence)\n", - " return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1], sequences[:-1]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\tSembungan\tsembungan\tPROPN\tX--\t_\t4\tnsubj\t_\tMorphInd=^sembungan_X--$\n" - ] - } - ], - "source": [ - "sentences, words, depends, labels, _, _ = 
process_corpus(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open('../Malaya-Dataset/dependency/augmented-dependency.json') as fopen:\n", - " augmented = json.load(fopen)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "text_augmented, depends_augmented, labels_augmented = [], [], []\n", - "\n", - "for a in augmented:\n", - " text_augmented.extend(a[0])\n", - " depends_augmented.extend(a[1])\n", - " labels_augmented.extend((np.array(a[2]) + 1).tolist())" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def parse_XY(texts, depends, labels):\n", - " outside, sentences, outside_depends, outside_labels = [], [], [], []\n", - " for no, text in enumerate(texts):\n", - " temp_depend = depends[no]\n", - " temp_label = labels[no]\n", - " s = text.split()\n", - " sentences.append(s)\n", - " bert_tokens = ['[CLS]']\n", - " labels_ = [0]\n", - " depends_ = [0]\n", - " for no, orig_token in enumerate(s):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " bert_tokens.append('[SEP]')\n", - " labels_.append(0)\n", - " depends_.append(0)\n", - " outside.append(tokenizer.convert_tokens_to_ids(bert_tokens))\n", - " outside_depends.append(depends_)\n", - " outside_labels.append(labels_)\n", - " return outside, sentences, outside_depends, outside_labels" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "outside, _, outside_depends, outside_labels = parse_XY(text_augmented, \n", - " depends_augmented, \n", - " labels_augmented)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - 
"metadata": {}, - "outputs": [], - "source": [ - "words.extend(outside)\n", - "depends.extend(outside_depends)\n", - "labels.extend(outside_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "idx2tag = {v:k for k, v in tag2idx.items()}" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "words_train, words_test, depends_train, depends_test, labels_train, labels_test \\\n", - "= train_test_split(words, depends, labels, test_size = 0.2)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "train_X = words_train\n", - "train_Y = labels_train\n", - "train_depends = depends_train\n", - "\n", - "test_X = words_test\n", - "test_Y = labels_test\n", - "test_depends = depends_test" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], "source": [ "BERT_INIT_CHKPNT = 'albert-tiny-2020-04-17/model.ckpt-1000000'" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "epoch = 30\n", + "epoch = 3\n", "batch_size = 32\n", "warmup_proportion = 0.1\n", "num_train_steps = int(len(train_X) / batch_size * epoch)\n", @@ -325,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -358,8 +158,15 @@ " e = tf.expand_dims(tf.expand_dims(mask_e, 1), 2)\n", " output = output * d * e\n", " \n", - " return output\n", - " \n", + " return output" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ "class BiLinear:\n", " def __init__(self, left_features, right_features, out_features):\n", " self.left_features = left_features\n", @@ -384,8 +191,17 @@ " output = output + tf.matmul(input_left, 
tf.transpose(self.W_l))\\\n", " + tf.matmul(input_right, tf.transpose(self.W_r))\n", " \n", - " return tf.reshape(output, output_shape)\n", - " \n", + " return tf.reshape(output, output_shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "_NEG_INF = -1e9\n", + "\n", "class Model:\n", " def __init__(\n", " self,\n", @@ -417,14 +233,20 @@ " config=albert_config,\n", " is_training=training,\n", " input_ids=self.words,\n", + " input_mask=self.mask,\n", " use_one_hot_embeddings=False)\n", + " \n", " output_layer = model.get_sequence_output()\n", " \n", " arc_h = tf.nn.elu(self.arc_h(output_layer))\n", " arc_c = tf.nn.elu(self.arc_c(output_layer))\n", + " self._arc_h = arc_h\n", + " self._arc_c = arc_c\n", " \n", " type_h = tf.nn.elu(self.type_h(output_layer))\n", " type_c = tf.nn.elu(self.type_c(output_layer))\n", + " self._type_h = type_h\n", + " self._type_c = type_c\n", " \n", " out_arc = tf.squeeze(self.attention.forward(arc_h, arc_c, mask_d=self.mask, \n", " mask_e=self.mask), axis = 1)\n", @@ -443,6 +265,11 @@ " self.heads_seq = tf.argmax(decode_arc, axis = 1)\n", " self.heads_seq = tf.identity(self.heads_seq, name = 'heads_seq')\n", " \n", + "# self.decode_arc_t = tf.transpose(decode_arc, (0, 2, 1))\n", + "# sequence_loss_depends = tf.contrib.seq2seq.sequence_loss(logits = self.decode_arc_t,\n", + "# targets = self.heads,\n", + "# weights = mask)\n", + " \n", " t = tf.cast(tf.transpose(self.heads_seq), tf.int32)\n", " broadcasted = tf.broadcast_to(batch_index, tf.shape(t))\n", " concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), \n", @@ -517,7 +344,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -547,20 +374,20 @@ "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated 
and will be removed in a future version.\n", "Instructions for updating:\n", "Please use `layer.__call__` method instead.\n", - "WARNING:tensorflow:From :110: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :61: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/contrib/crf/python/ops/crf.py:213: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use `keras.layers.RNN(cell)`, which is equivalent to this API\n", - "WARNING:tensorflow:From :145: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :101: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "dim is deprecated, use axis instead\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/albert/optimization.py:36: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.\n", "\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/albert/optimization.py:41: The name tf.train.polynomial_decay is deprecated. 
Please use tf.compat.v1.train.polynomial_decay instead.\n", "\n", - "INFO:tensorflow:++++++ warmup starts at step 0, for 3777 steps ++++++\n", + "INFO:tensorflow:++++++ warmup starts at step 0, for 29336 steps ++++++\n", "INFO:tensorflow:using adamw\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/albert/optimization.py:101: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead.\n", "\n" @@ -572,7 +399,7 @@ "sess = tf.InteractiveSession()\n", "\n", "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "model = Model(learning_rate, hidden_size_word)\n", "sess.run(tf.global_variables_initializer())" @@ -580,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -599,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -615,74 +442,90 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 14, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:52<00:00, 4.31it/s, accuracy=0.438, accuracy_depends=0.75, cost=1.91] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:54<00:00, 5.83it/s, accuracy=0.257, accuracy_depends=0.502, cost=2.65]\n", - "train minibatch loop: 0%| | 0/1260 [00:00?@[\\]^_`{|}~'\n", + "\n", + "def transformer_textcleaning(string):\n", + " \"\"\"\n", + " use by any transformer model before tokenization\n", + " \"\"\"\n", + " string = unidecode(string)\n", + " string = re.sub('\\\\(dot\\\\)', '.', string)\n", + " string = (\n", + " re.sub(re.findall(r'\\', string)[0], '', string)\n", + " if (len(re.findall(r'\\', string)) > 0)\n", + " and ('href' in re.findall(r'\\', string)[0])\n", + " else string\n", + " )\n", + " string = re.sub(\n", + " 
r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n", + " )\n", + " string = re.sub(r'[ ]+', ' ', string).strip().split()\n", + " string = [w for w in string if w[0] != '@']\n", + " string = ' '.join(string)\n", + " string = re.sub(f'([{PUNCTUATION}])', r' \\1 ', string)\n", + " string = re.sub('\\s{2,}', ' ', string)\n", + " original_string = string.split()\n", + " string = [\n", + " (original_string[no], word.title() if word.isupper() else word)\n", + " for no, word in enumerate(string.split())\n", + " if len(word)\n", + " ]\n", + " return [s[0] for s in string], [s[1] for s in string]\n", + "\n", + "def parse_X(left):\n", + " bert_tokens = ['[CLS]']\n", + " for no, orig_token in enumerate(left):\n", + " t = tokenizer.tokenize(orig_token)\n", + " bert_tokens.extend(t)\n", + " bert_tokens.append(\"[SEP]\")\n", + " t = tokenizer.convert_tokens_to_ids(bert_tokens)\n", + " return t, bert_tokens, [1] * len(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def dependency_graph(tagging, indexing):\n", + " \"\"\"\n", + " Return helper object for dependency parser results. 
Only accept tagging and indexing outputs from dependency models.\n", + " \"\"\"\n", + " result = []\n", + " for i in range(len(tagging)):\n", + " result.append(\n", + " '%d\\t%s\\t_\\t_\\t_\\t_\\t%d\\t%s\\t_\\t_'\n", + " % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])\n", + " )\n", + " return DependencyGraph('\\n'.join(result), top_relation_label='root')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "2\n", + "2 (makan)\n", + "\n", + "\n", + "\n", + "0->2\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (husein)\n", + "\n", + "\n", + "\n", + "2->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "3\n", + "3 (ayam)\n", + "\n", + "\n", + "\n", + "2->3\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "string = 'husein makan ayam'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, bert_sequence, mask = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", 
+ "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Kuala)\n", + "\n", + "\n", + "\n", + "0->1\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "13\n", + "13 (membidas)\n", + "\n", + "\n", + "\n", + "0->13\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Lumpur)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "4\n", + "4 (Ketua)\n", + "\n", + "\n", + "\n", + "1->4\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "37\n", + "37 (.)\n", + "\n", + "\n", + "\n", + "1->37\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "14\n", + "14 (kenyataan)\n", + "\n", + "\n", + "\n", + "13->14\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "23\n", + "23 (,)\n", + "\n", + "\n", + "\n", + "13->23\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "39\n", + "39 (berkata)\n", + "\n", + "\n", + "\n", + "13->39\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "46\n", + "46 (memetik)\n", + "\n", + "\n", + "\n", + "13->46\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "3\n", + "3 (:)\n", + "\n", + "\n", + "\n", + "2->3\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "5\n", + "5 (Penerangan)\n", + "\n", + "\n", + "\n", + "4->5\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "9\n", + "9 (Wan)\n", + "\n", + "\n", + "\n", + "4->9\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "7\n", + "7 (,)\n", + "\n", + "\n", + "\n", + "4->7\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "6\n", + "6 (Bersatu)\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "10\n", + "10 (Saiful)\n", + "\n", + "\n", + "\n", + "9->10\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "8\n", + "8 (Datuk)\n", + "\n", + "\n", + "\n", + "11\n", + "11 (Wan)\n", + "\n", + "\n", + "\n", + "10->11\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "12\n", + "12 (Jan)\n", + 
"\n", + "\n", + "\n", + "11->12\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "12->8\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "61\n", + "61 (.)\n", + "\n", + "\n", + "\n", + "12->61\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "64\n", + "64 (berkata)\n", + "\n", + "\n", + "\n", + "12->64\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "15\n", + "15 (Datuk)\n", + "\n", + "\n", + "\n", + "14->15\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "20\n", + "20 (Ketua)\n", + "\n", + "\n", + "\n", + "14->20\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "38\n", + "38 (Beliau)\n", + "\n", + "\n", + "\n", + "39->38\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "40\n", + "40 (,)\n", + "\n", + "\n", + "\n", + "39->40\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "43\n", + "43 (Asyraf)\n", + "\n", + "\n", + "\n", + "46->43\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "45\n", + "45 (sengaja)\n", + "\n", + "\n", + "\n", + "46->45\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "47\n", + "47 (kenyataan)\n", + "\n", + "\n", + "\n", + "46->47\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "51\n", + "51 (Tan)\n", + "\n", + "\n", + "\n", + "46->51\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "16\n", + "16 (Seri)\n", + "\n", + "\n", + "\n", + "15->16\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "19\n", + "19 (dan)\n", + "\n", + "\n", + "\n", + "20->19\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "21\n", + "21 (Pemuda)\n", + "\n", + "\n", + "\n", + "20->21\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "22\n", + "22 (Umno)\n", + "\n", + "\n", + "\n", + "20->22\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "24\n", + "24 (Datuk)\n", + "\n", + "\n", + "\n", + "20->24\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "17\n", + "17 (Najib)\n", + "\n", + "\n", + "\n", + "16->17\n", + "\n", + 
"\n", + "flat\n", + "\n", + "\n", + "\n", + "18\n", + "18 (Razak)\n", + "\n", + "\n", + "\n", + "16->18\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "25\n", + "25 (Dr)\n", + "\n", + "\n", + "\n", + "24->25\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "30\n", + "30 (mempertikaikan)\n", + "\n", + "\n", + "\n", + "24->30\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "26\n", + "26 (Asyraf)\n", + "\n", + "\n", + "\n", + "25->26\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "29\n", + "29 (yang)\n", + "\n", + "\n", + "\n", + "30->29\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "31\n", + "31 (tindakan)\n", + "\n", + "\n", + "\n", + "30->31\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "33\n", + "33 (melaksanakan)\n", + "\n", + "\n", + "\n", + "30->33\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "27\n", + "27 (Wajdi)\n", + "\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "28\n", + "28 (Dusuki)\n", + "\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "32\n", + "32 (kerajaan)\n", + "\n", + "\n", + "\n", + "31->32\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "34\n", + "34 (sekatan)\n", + "\n", + "\n", + "\n", + "33->34\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "35\n", + "35 (pergerakan)\n", + "\n", + "\n", + "\n", + "34->35\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "36\n", + "36 (penuh)\n", + "\n", + "\n", + "\n", + "35->36\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "41\n", + "41 (Najib)\n", + "\n", + "\n", + "\n", + "42\n", + "42 (dan)\n", + "\n", + "\n", + "\n", + "43->42\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "44\n", + "44 (Wajdi)\n", + "\n", + "\n", + "\n", + "43->44\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "45->41\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "48\n", + "48 (Perdana)\n", + "\n", + 
"\n", + "\n", + "47->48\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "49\n", + "49 (Menteri)\n", + "\n", + "\n", + "\n", + "47->49\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "50\n", + "50 (,)\n", + "\n", + "\n", + "\n", + "47->50\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "57\n", + "57 (lengkap)\n", + "\n", + "\n", + "\n", + "47->57\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "53\n", + "53 (Muhyiddin)\n", + "\n", + "\n", + "\n", + "51->53\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "55\n", + "55 (yang)\n", + "\n", + "\n", + "\n", + "57->55\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (tidak)\n", + "\n", + "\n", + "\n", + "57->56\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "58\n", + "58 (untuk)\n", + "\n", + "\n", + "\n", + "57->58\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "52\n", + "52 (Sri)\n", + "\n", + "\n", + "\n", + "52->52\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "54\n", + "54 (Yassin)\n", + "\n", + "\n", + "\n", + "52->54\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "59\n", + "59 (mengelirukan)\n", + "\n", + "\n", + "\n", + "59->59\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "60\n", + "60 (rakyat)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (Wan)\n", + "\n", + "\n", + "\n", + "63\n", + "63 (Saiful)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "65\n", + "65 (,)\n", + "\n", + "\n", + "\n", + "66\n", + "66 (beliau)\n", + "\n", + "\n", + "\n", + "67\n", + "67 (sudah)\n", + "\n", + "\n", + "\n", + "67->62\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "67->65\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "67->66\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "67->67\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "68\n", + "68 
(menjangka)\n", + "\n", + "\n", + "\n", + "67->68\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "69\n", + "69 (ada)\n", + "\n", + "\n", + "\n", + "67->69\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "70\n", + "70 (kenyataan)\n", + "\n", + "\n", + "\n", + "68->70\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "77\n", + "77 (berhubung)\n", + "\n", + "\n", + "\n", + "68->77\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "83\n", + "83 (.)\n", + "\n", + "\n", + "\n", + "68->83\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "71\n", + "71 (balas)\n", + "\n", + "\n", + "\n", + "70->71\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "73\n", + "73 (Najib)\n", + "\n", + "\n", + "\n", + "70->73\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tulisan)\n", + "\n", + "\n", + "\n", + "70->75\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "78\n", + "78 (kesan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "79\n", + "79 (positif)\n", + "\n", + "\n", + "\n", + "77->79\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "80\n", + "80 (sekatan)\n", + "\n", + "\n", + "\n", + "77->80\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "81\n", + "81 (pergerakan)\n", + "\n", + "\n", + "\n", + "77->81\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "72\n", + "72 (daripada)\n", + "\n", + "\n", + "\n", + "73->72\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "74\n", + "74 (mengenai)\n", + "\n", + "\n", + "\n", + "75->74\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "76\n", + "76 (beliau)\n", + "\n", + "\n", + "\n", + "75->76\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "82\n", + "82 (penuh)\n", + "\n", + "\n", + "\n", + "81->82\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "string = 'KUALA LUMPUR: Ketua Penerangan BERSATU, Datuk Wan Saiful Wan Jan membidas kenyataan Datuk Seri Najib Razak dan Ketua Pemuda UMNO, Datuk Dr Asyraf Wajdi Dusuki yang mempertikaikan tindakan kerajaan melaksanakan sekatan pergerakan penuh. Beliau berkata, Najib dan Asyraf Wajdi sengaja memetik kenyataan Perdana Menteri, Tan Sri Muhyiddin Yassin yang tidak lengkap untuk mengelirukan rakyat. Wan Saiful berkata, beliau sudah menjangka ada kenyataan balas daripada Najib mengenai tulisan beliau berhubung kesan positif sekatan pergerakan penuh.'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, bert_sequence, mask = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1221,7 +2082,7 @@ "'albert-tiny-dependency/model.ckpt'" ] }, - "execution_count": 25, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1233,22 +2094,14 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. 
You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", - " warnings.warn('An interactive session is already active. This can '\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:++++++ warmup starts at step 0, for 3777 steps ++++++\n", + "INFO:tensorflow:++++++ warmup starts at step 0, for 29336 steps ++++++\n", "INFO:tensorflow:using adamw\n", "INFO:tensorflow:Restoring parameters from albert-tiny-dependency/model.ckpt\n" ] @@ -1259,7 +2112,7 @@ "sess = tf.InteractiveSession()\n", "\n", "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "model = Model(learning_rate, hidden_size_word, training = False)\n", "\n", @@ -1270,7 +2123,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -1286,7 +2139,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1331,14 +2184,14 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 315/315 [00:54<00:00, 5.80it/s]\n" + "100%|██████████| 313/313 [00:37<00:00, 8.27it/s]\n" ] } ], @@ -1375,7 +2228,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1390,60 +2243,50 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 29, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", - " PAD 1.00000 1.00000 1.00000 901404\n", - " X 0.99997 0.99998 0.99997 158217\n", - " acl 0.74523 0.72259 0.73374 6056\n", - " advcl 0.44763 0.44416 0.44589 2319\n", - " advmod 0.80839 0.80245 0.80541 9537\n", - " amod 0.74481 0.69167 0.71726 8144\n", - " appos 0.71137 0.68084 0.69577 4963\n", - " aux 0.00000 0.00000 0.00000 9\n", - " case 0.90625 0.93745 0.92159 21056\n", - " cc 0.92435 0.90888 0.91655 6453\n", - " ccomp 0.32162 0.13918 0.19429 855\n", - " compound 0.76535 0.75323 0.75924 13008\n", - "compound:plur 0.76103 0.77066 0.76581 1186\n", - " conj 0.79454 0.78507 0.78978 8640\n", - " cop 0.87581 0.90736 0.89130 1943\n", - " csubj 0.66667 0.04082 0.07692 49\n", - " csubj:pass 0.00000 0.00000 0.00000 18\n", - " dep 0.41637 0.38321 0.39910 929\n", - " det 0.81424 0.77924 0.79636 7909\n", - " fixed 0.63932 0.41054 0.50000 1101\n", - " flat 0.85963 0.91321 0.88561 20856\n", - " iobj 1.00000 0.03333 0.06452 30\n", - " mark 0.69997 0.72039 0.71003 2879\n", - " nmod 0.71129 0.68985 0.70041 7964\n", - " nsubj 0.74144 0.81233 0.77527 12719\n", - " nsubj:pass 0.68649 0.56466 0.61964 3905\n", - " nummod 0.84427 0.87244 0.85813 7581\n", - " obj 0.79591 0.78073 0.78825 10380\n", - " obl 0.75820 0.78392 0.77085 11144\n", - " parataxis 0.25150 0.06231 0.09988 674\n", - " punct 0.98207 0.98323 0.98265 33034\n", - " root 0.84186 0.87362 0.85745 10073\n", - " xcomp 0.62652 0.63961 0.63300 2489\n", + " PAD 1.00000 1.00000 1.00000 627830\n", + " X 1.00000 1.00000 1.00000 60741\n", + " acl 0.80688 0.76441 0.78507 3192\n", + " advcl 0.60052 0.58044 0.59031 1585\n", + " advmod 0.92613 0.92570 0.92591 6460\n", + " amod 0.86782 0.85927 0.86353 4363\n", + " appos 0.76876 0.70271 0.73425 3061\n", + " case 0.97508 0.97247 0.97377 10862\n", + " cc 0.98150 
0.96228 0.97179 3473\n", + " ccomp 0.52577 0.29651 0.37918 344\n", + " compound 0.85386 0.91240 0.88216 11027\n", + "compound:plur 0.47826 0.37931 0.42308 29\n", + " conj 0.87337 0.85133 0.86221 5112\n", + " cop 0.98953 0.95777 0.97339 592\n", + " csubj 0.00000 0.00000 0.00000 8\n", + " dep 0.59312 0.58640 0.58974 353\n", + " det 0.93306 0.90026 0.91637 3840\n", + " fixed 0.91270 0.66860 0.77181 172\n", + " flat 0.93104 0.95138 0.94110 18492\n", + " iobj 0.00000 0.00000 0.00000 1\n", + " mark 0.89771 0.92249 0.90993 1703\n", + " nmod 0.81300 0.79987 0.80638 4457\n", + " nsubj 0.80429 0.82122 0.81267 6891\n", + " nsubj:pass 0.77960 0.69890 0.73705 1903\n", + " nummod 0.93708 0.93517 0.93612 4427\n", + " obj 0.87808 0.88969 0.88385 6128\n", + " obl 0.84346 0.79980 0.82105 4965\n", + " parataxis 0.43750 0.21448 0.28785 359\n", + " punct 0.99444 0.99635 0.99540 20300\n", + " root 0.86422 0.89430 0.87901 10000\n", + " xcomp 0.67055 0.68068 0.67558 1522\n", "\n", - " accuracy 0.96997 1277524\n", - " macro avg 0.70128 0.63294 0.64105 1277524\n", - " weighted avg 0.96929 0.96997 0.96946 1277524\n", + " accuracy 0.98296 824192\n", + " macro avg 0.77217 0.73949 0.75253 824192\n", + " weighted avg 0.98272 0.98296 0.98277 824192\n", "\n" ] } @@ -1455,16 +2298,16 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "arc accuracy: 0.7087220659183397\n", - "types accuracy: 0.6735055899028873\n", - "root accuracy: 0.8178452380952382\n" + "arc accuracy: 0.7865049894214071\n", + "types accuracy: 0.7587000482179278\n", + "root accuracy: 1.0\n" ] } ], @@ -1476,63 +2319,9 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 31, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Placeholder',\n", - " 'Placeholder_1',\n", - " 'Placeholder_2',\n", - " 'Placeholder_3',\n", - " 'W_d',\n", - " 'W_e',\n", - " 'U',\n", - " 'U-bi',\n", - " 
'Wl',\n", - " 'Wr',\n", - " 'bert/embeddings/word_embeddings',\n", - " 'bert/embeddings/token_type_embeddings',\n", - " 'bert/embeddings/position_embeddings',\n", - " 'bert/embeddings/LayerNorm/gamma',\n", - " 'bert/encoder/embedding_hidden_mapping_in/kernel',\n", - " 'bert/encoder/embedding_hidden_mapping_in/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/key/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/value/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/attention_1/output/dense/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/LayerNorm/gamma',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/dense/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/kernel',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/ffn_1/intermediate/output/dense/bias',\n", - " 'bert/encoder/transformer/group_0/inner_group_0/LayerNorm_1/gamma',\n", - " 'bert/pooler/dense/kernel',\n", - " 'bert/pooler/dense/bias',\n", - " 'dense/kernel',\n", - " 'dense/bias',\n", - " 'dense_1/kernel',\n", - " 'dense_1/bias',\n", - " 'dense_2/kernel',\n", - " 'dense_2/bias',\n", - " 'dense_3/kernel',\n", - " 'dense_3/bias',\n", - " 'heads_seq',\n", - " 'tags_seq',\n", - " 'transitions',\n", - " 'logits']" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], 
"source": [ "strings = ','.join(\n", " [\n", @@ -1550,13 +2339,12 @@ " and 'adam' not in n.name\n", " and 'gradients/bert' not in n.name\n", " ]\n", - ")\n", - "strings.split(',')" + ")" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1591,7 +2379,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1599,7 +2387,7 @@ "output_type": "stream", "text": [ "INFO:tensorflow:Restoring parameters from albert-tiny-dependency/model.ckpt\n", - "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use `tf.compat.v1.graph_util.convert_variables_to_constants`\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/graph_util_impl.py:277: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", @@ -1607,7 +2395,7 @@ "Use `tf.compat.v1.graph_util.extract_sub_graph`\n", "INFO:tensorflow:Froze 40 variables.\n", "INFO:tensorflow:Converted 40 variables to const ops.\n", - "1730 ops in the final graph.\n" + "1723 ops in the final graph.\n" ] } ], @@ -1617,101 +2405,56 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "string = 'husein makan ayam'\n", - "\n", - "import re\n", - "\n", - "def entities_textcleaning(string, lowering = False):\n", - " \"\"\"\n", - " use by entities recognition, pos recognition and dependency parsing\n", - " \"\"\"\n", - " string = re.sub('[^A-Za-z0-9\\-\\/() ]+', ' ', string)\n", - " string = re.sub(r'[ ]+', 
' ', string).strip()\n", - " original_string = string.split()\n", - " if lowering:\n", - " string = string.lower()\n", - " string = [\n", - " (original_string[no], word.title() if word.isupper() else word)\n", - " for no, word in enumerate(string.split())\n", - " if len(word)\n", - " ]\n", - " return [s[0] for s in string], [s[1] for s in string]\n", - "\n", - "def parse_X(left):\n", - " bert_tokens = ['[CLS]']\n", - " for no, orig_token in enumerate(left):\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " bert_tokens.append(\"[SEP]\")\n", - " return tokenizer.convert_tokens_to_ids(bert_tokens), bert_tokens\n", - "\n", - "sequence = entities_textcleaning(string)[1]\n", - "parsed_sequence, bert_sequence = parse_X(sequence)" + "transforms = ['add_default_attributes',\n", + " 'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n", + " 'fold_batch_norms',\n", + " 'fold_old_batch_norms',\n", + " 'quantize_weights(fallback_min=-10, fallback_max=10)',\n", + " 'strip_unused_nodes',\n", + " 'sort_by_execution_order']" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From :6: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.gfile.GFile.\n" + ] + } + ], "source": [ - "def merge_sentencepiece_tokens_tagging(x, y):\n", - " new_paired_tokens = []\n", - " n_tokens = len(x)\n", - " rejected = ['[CLS]', '[SEP]']\n", - "\n", - " i = 0\n", - "\n", - " while i < n_tokens:\n", - "\n", - " current_token, current_label = x[i], y[i]\n", - " if not current_token.startswith('▁') and current_token not in rejected:\n", - " previous_token, previous_label = new_paired_tokens.pop()\n", - " merged_token = previous_token\n", - " merged_label = [previous_label]\n", - " while (\n", - " not 
current_token.startswith('▁')\n", - " and current_token not in rejected\n", - " ):\n", - " merged_token = merged_token + current_token.replace('▁', '')\n", - " merged_label.append(current_label)\n", - " i = i + 1\n", - " current_token, current_label = x[i], y[i]\n", - " merged_label = merged_label[0]\n", - " new_paired_tokens.append((merged_token, merged_label))\n", - "\n", - " else:\n", - " new_paired_tokens.append((current_token, current_label))\n", - " i = i + 1\n", + "from tensorflow.tools.graph_transforms import TransformGraph\n", + "tf.set_random_seed(0)\n", "\n", - " words = [\n", - " i[0].replace('▁', '')\n", - " for i in new_paired_tokens\n", - " if i[0] not in rejected\n", - " ]\n", - " labels = [i[1] for i in new_paired_tokens if i[0] not in rejected]\n", - " return words, labels" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", + "pb = 'albert-tiny-dependency/frozen_model.pb'\n", + "input_graph_def = tf.GraphDef()\n", + "with tf.gfile.FastGFile(pb, 'rb') as f:\n", + " input_graph_def.ParseFromString(f.read())\n", "\n", - "bucketName = 'huseinhouse-storage'\n", - "Key = 'albert-tiny-dependency/frozen_model.pb'\n", - "outPutname = \"v34/dependency/albert-tiny-dependency.pb\"\n", + "if 'bert' in pb:\n", + " inputs = ['Placeholder']\n", + " a = ['dense/BiasAdd']\n", + "if 'xlnet' in pb:\n", + " inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n", + " a = ['transpose_3']\n", "\n", - "s3 = boto3.client('s3')\n", + "transformed_graph_def = TransformGraph(input_graph_def, \n", + " inputs,\n", + " ['logits', 'heads_seq'] + a, transforms)\n", "\n", - "s3.upload_file(Key,bucketName,outPutname)" + "with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n", + " f.write(transformed_graph_def.SerializeToString())" ] } ], @@ -1731,7 +2474,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, 
diff --git a/session/dependency/alxlnet-base.ipynb b/session/dependency/alxlnet-base.ipynb index 09268dd2..6e4c45e2 100644 --- a/session/dependency/alxlnet-base.ipynb +++ b/session/dependency/alxlnet-base.ipynb @@ -7,35 +7,19 @@ "outputs": [], "source": [ "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES'] = '2'" + "os.environ['CUDA_VISIBLE_DEVICES'] = '1'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [ - "with open('../Malaya-Dataset/dependency/gsd-ud-train.conllu.txt') as fopen:\n", - " corpus = fopen.read().split('\\n')\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-test.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-dev.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:From /home/husein/alxnet/model_utils.py:334: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", + "WARNING:tensorflow:From /home/husein/alxlnet/model_utils.py:334: The name tf.train.Optimizer is deprecated. 
Please use tf.compat.v1.train.Optimizer instead.\n", "\n" ] } @@ -53,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +45,7 @@ "from prepro_utils import preprocess_text, encode_ids\n", "\n", "sp_model = spm.SentencePieceProcessor()\n", - "sp_model.Load('alxlnet-base/sp10m.cased.v9.model')\n", + "sp_model.Load('sp10m.cased.v9.model')\n", "\n", "def tokenize_fn(text):\n", " text = preprocess_text(text, lower= False)\n", @@ -70,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -102,274 +86,37 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "tag2idx = {'PAD': 0, 'X': 1}\n", - "tag_idx = 2\n", + "import pickle\n", "\n", - "def process_corpus(corpus, until = None):\n", - " global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx\n", - " sentences, words, depends, labels, pos, sequences = [], [], [], [], [], []\n", - " temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []\n", - " segments, masks = [], []\n", - " first_time = True\n", - " for sentence in corpus:\n", - " try:\n", - " if len(sentence):\n", - " if sentence[0] == '#':\n", - " continue\n", - " if first_time:\n", - " print(sentence)\n", - " first_time = False\n", - " sentence = sentence.split('\\t')\n", - " if sentence[7] not in tag2idx:\n", - " tag2idx[sentence[7]] = tag_idx\n", - " tag_idx += 1\n", - " temp_word.append(sentence[1])\n", - " temp_depend.append(int(sentence[6]) + 1)\n", - " temp_label.append(tag2idx[sentence[7]])\n", - " temp_sentence.append(sentence[1])\n", - " temp_pos.append(sentence[3])\n", - " else:\n", - " if len(temp_sentence) < 2 or len(temp_word) != len(temp_label):\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " continue\n", - " 
bert_tokens = []\n", - " labels_ = []\n", - " depends_ = []\n", - " seq_ = []\n", - " for no, orig_token in enumerate(temp_word):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenize_fn(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " seq_.append(no + 1)\n", - " bert_tokens.extend([4, 3])\n", - " labels_.extend([0, 0])\n", - " depends_.extend([0, 0])\n", - " segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]\n", - " input_mask = [0] * len(segment)\n", - " words.append(bert_tokens)\n", - " depends.append(depends_)\n", - " labels.append(labels_)\n", - " sentences.append(bert_tokens)\n", - " pos.append(temp_pos)\n", - " sequences.append(seq_)\n", - " segments.append(segment)\n", - " masks.append(input_mask)\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " except Exception as e:\n", - " print(e, sentence)\n", - " return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1], sequences[:-1], segments[:-1], masks[:-1]" + "with open('/home/husein/xlnet/train_X.pkl', 'rb') as fopen:\n", + " train_X, train_Y, train_depends, train_segments, train_masks = pickle.load(fopen)\n", + " \n", + "with open('/home/husein/xlnet/test_X.pkl', 'rb') as fopen:\n", + " test_X, test_Y, test_depends, test_segments, test_masks = pickle.load(fopen)\n", + " \n", + "with open('/home/husein/xlnet/tags.pkl', 'rb') as fopen:\n", + " idx2tag, tag2idx = pickle.load(fopen)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1\tSembungan\tsembungan\tPROPN\tX--\t_\t4\tnsubj\t_\tMorphInd=^sembungan_X--$\n" + "WARNING:tensorflow:From /home/husein/alxlnet/xlnet.py:70: The name tf.gfile.Open is deprecated. 
Please use tf.io.gfile.GFile instead.\n", + "\n" ] } ], "source": [ - "sentences, words, depends, labels, _, _, segments, masks = process_corpus(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open('../Malaya-Dataset/dependency/augmented-dependency.json') as fopen:\n", - " augmented = json.load(fopen)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "text_augmented, depends_augmented, labels_augmented = [], [], []\n", - "\n", - "for a in augmented:\n", - " text_augmented.extend(a[0])\n", - " depends_augmented.extend(a[1])\n", - " labels_augmented.extend((np.array(a[2]) + 1).tolist())" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def parse_XY(texts, depends, labels):\n", - " outside, sentences, outside_depends, outside_labels = [], [], [], []\n", - " segments, masks = [], []\n", - " for no, text in enumerate(texts):\n", - " temp_depend = depends[no]\n", - " temp_label = labels[no]\n", - " s = text.split()\n", - " sentences.append(s)\n", - " bert_tokens = []\n", - " labels_ = []\n", - " depends_ = []\n", - " for no, orig_token in enumerate(s):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenize_fn(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " bert_tokens.extend([4, 3])\n", - " labels_.extend([0, 0])\n", - " depends_.extend([0, 0])\n", - " segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]\n", - " input_mask = [0] * len(segment)\n", - " outside.append(bert_tokens)\n", - " outside_depends.append(depends_)\n", - " outside_labels.append(labels_)\n", - " segments.append(segment)\n", - " masks.append(input_mask)\n", - " return outside, sentences, outside_depends, outside_labels, segments, masks" - ] 
- }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "outside, _, outside_depends, outside_labels, outside_segments, outside_masks = parse_XY(text_augmented, \n", - " depends_augmented, \n", - " labels_augmented)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "words.extend(outside)\n", - "depends.extend(outside_depends)\n", - "labels.extend(outside_labels)\n", - "segments.extend(outside_segments)\n", - "masks.extend(outside_masks)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{0: 'PAD',\n", - " 1: 'X',\n", - " 2: 'nsubj',\n", - " 3: 'cop',\n", - " 4: 'det',\n", - " 5: 'root',\n", - " 6: 'nsubj:pass',\n", - " 7: 'acl',\n", - " 8: 'case',\n", - " 9: 'obl',\n", - " 10: 'flat',\n", - " 11: 'punct',\n", - " 12: 'appos',\n", - " 13: 'amod',\n", - " 14: 'compound',\n", - " 15: 'advmod',\n", - " 16: 'cc',\n", - " 17: 'obj',\n", - " 18: 'conj',\n", - " 19: 'mark',\n", - " 20: 'advcl',\n", - " 21: 'nmod',\n", - " 22: 'nummod',\n", - " 23: 'dep',\n", - " 24: 'xcomp',\n", - " 25: 'ccomp',\n", - " 26: 'parataxis',\n", - " 27: 'compound:plur',\n", - " 28: 'fixed',\n", - " 29: 'aux',\n", - " 30: 'csubj',\n", - " 31: 'iobj',\n", - " 32: 'csubj:pass'}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "idx2tag = {v:k for k, v in tag2idx.items()}\n", - "idx2tag" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "words_train, words_test, depends_train, depends_test, labels_train, labels_test, \\\n", - "segments_train, segments_test, masks_train, masks_test \\\n", - "= train_test_split(words, depends, labels, segments, masks, test_size = 0.2)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "train_X = words_train\n", - "train_Y = labels_train\n", - "train_depends = depends_train\n", - "\n", - "test_X = words_test\n", - "test_Y = labels_test\n", - "test_depends = depends_test" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "import xlnet\n", - "import tensorflow as tf\n", - "import numpy as np\n", - "\n", "kwargs = dict(\n", " is_training=True,\n", " use_tpu=False,\n", @@ -382,25 +129,27 @@ " clamp_len=-1)\n", "\n", "xlnet_parameters = xlnet.RunConfig(**kwargs)\n", - "xlnet_config = xlnet.XLNetConfig(json_path='alxlnet-base/config.json')" + "xlnet_config = xlnet.XLNetConfig(\n", + " json_path = 'alxlnet-base-2020-04-10/config.json'\n", + ")" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "37770 3777\n" + "1173456 117345\n" ] } ], "source": [ - "epoch = 15\n", - "batch_size = 16\n", + "epoch = 3\n", + "batch_size = 8\n", "warmup_proportion = 0.1\n", "num_train_steps = int(len(train_X) / batch_size * epoch)\n", "num_warmup_steps = int(num_train_steps * warmup_proportion)\n", @@ -429,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -454,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -674,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -692,30 +441,30 @@ " * https://github.com/tensorflow/io (for I/O related ops)\n", "If you depend on functionality not listed there, please file an issue.\n", "\n", - "WARNING:tensorflow:From /home/husein/alxnet/xlnet.py:253: The name tf.variable_scope is deprecated. 
Please use tf.compat.v1.variable_scope instead.\n", + "WARNING:tensorflow:From /home/husein/alxlnet/xlnet.py:253: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n", "\n", - "WARNING:tensorflow:From /home/husein/alxnet/xlnet.py:253: The name tf.AUTO_REUSE is deprecated. Please use tf.compat.v1.AUTO_REUSE instead.\n", + "WARNING:tensorflow:From /home/husein/alxlnet/xlnet.py:253: The name tf.AUTO_REUSE is deprecated. Please use tf.compat.v1.AUTO_REUSE instead.\n", "\n", - "WARNING:tensorflow:From /home/husein/alxnet/custom_modeling.py:696: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.\n", + "WARNING:tensorflow:From /home/husein/alxlnet/custom_modeling.py:697: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.\n", "\n", "INFO:tensorflow:memory input None\n", "INFO:tensorflow:Use float type \n", - "WARNING:tensorflow:From /home/husein/alxnet/custom_modeling.py:808: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From /home/husein/alxlnet/custom_modeling.py:809: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use keras.layers.dropout instead.\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/layers/core.py:271: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use `layer.__call__` method instead.\n", - "WARNING:tensorflow:From /home/husein/alxnet/custom_modeling.py:109: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From /home/husein/alxlnet/custom_modeling.py:109: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future 
version.\n", "Instructions for updating:\n", "Use keras.layers.Dense instead.\n", - "WARNING:tensorflow:From :138: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :138: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/contrib/crf/python/ops/crf.py:213: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use `keras.layers.RNN(cell)`, which is equivalent to this API\n", - "WARNING:tensorflow:From :172: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :172: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "dim is deprecated, use axis instead\n" ] @@ -726,7 +475,7 @@ "sess = tf.InteractiveSession()\n", "\n", "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "model = Model(learning_rate, hidden_size_word)\n", "sess.run(tf.global_variables_initializer())" @@ -734,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -770,26 +519,26 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "tvars = tf.trainable_variables()\n", - "checkpoint = 'alxlnet-base/model.ckpt'\n", + "checkpoint = 'alxlnet-base-2020-04-10/model.ckpt-300000'\n", "assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, \n", " checkpoint)" ] }, { "cell_type": "code", - 
"execution_count": 28, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:Restoring parameters from alxlnet-base/model.ckpt\n" + "INFO:tensorflow:Restoring parameters from alxlnet-base-2020-04-10/model.ckpt-300000\n" ] } ], @@ -800,7 +549,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -812,24 +561,24 @@ "batch_y = pad_sequences(batch_y,padding='post')\n", "batch_depends = train_depends[:5]\n", "batch_depends = pad_sequences(batch_depends,padding='post')\n", - "batch_segments = segments_train[:5]\n", + "batch_segments = train_segments[:5]\n", "batch_segments = pad_sequences(batch_segments, padding='post', value = 4)\n", - "batch_masks = masks_train[:5]\n", + "batch_masks = train_masks[:5]\n", "batch_masks = pad_sequences(batch_masks, padding='post', value = 1)" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.028846154, 0.014423077, 44.482525]" + "[0.08235294, 0.105882354, 40.901375]" ] }, - "execution_count": 30, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -846,99 +595,48 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(array([ 2, 4, 22, 7, 26, 12, 22, 3, 28, 8, 23, 14, 13, 28, 29, 8, 22,\n", - " 17, 29, 22, 19, 22, 28, 16, 8, 28, 3, 1, 16, 3, 30, 23, 22, 16,\n", - " 16, 8, 8, 22, 32, 32, 12, 16, 16, 16, 22, 23, 22, 7, 23, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0], dtype=int32),\n", - " array([11, 22, 11, 19, 34, 35, 47, 19, 39, 10, 34, 24, 34, 43, 10, 10, 11,\n", - " 18, 34, 22, 23, 22, 43, 10, 37, 41, 19, 34, 10, 19, 10, 34, 11, 2,\n", - " 27, 10, 37, 11, 7, 11, 19, 25, 31, 25, 42, 34, 22, 34, 34, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0,\n", - " 0, 0, 0]),\n", - " array([ 9, 0, 0, 2, 0, 3, 0, 0, 0, 3, 0, 9, 9, 7, 1, 11, 9,\n", - " 11, 12, 0, 11, 0, 0, 11, 15, 0, 0, 16, 15, 0, 15, 19, 20, 0,\n", - " 0, 0, 0, 0, 19, 0, 19, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0], dtype=int32))" + "[0.03529412, 0.07058824, 185.67154]" ] }, - "execution_count": 32, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tags_seq, heads = sess.run(\n", - " [model.logits, model.heads_seq],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.segment_ids: batch_segments,\n", - " model.input_masks: batch_masks\n", - " },\n", - ")\n", - "tags_seq[0], heads[0], batch_depends[0]" + "sess.run([model.accuracy, model.accuracy_depends, model.cost],\n", + " feed_dict = {model.words: batch_x,\n", + " model.types: batch_y,\n", + " model.heads: batch_depends,\n", + " model.segment_ids: batch_segments,\n", + " model.input_masks: batch_masks,\n", + " model.switch: True})" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "train minibatch loop: 100%|██████████| 2519/2519 [14:07<00:00, 2.97it/s, accuracy=0.875, accuracy_depends=0.75, cost=0.65] \n", - "test minibatch loop: 100%|██████████| 630/630 [01:55<00:00, 5.45it/s, accuracy=0.844, accuracy_depends=0.543, cost=2] \n", - "train minibatch loop: 0%| | 0/2519 [00:00', '']\n", + "\n", + " i = 0\n", + "\n", + " while i < n_tokens:\n", + "\n", + " current_token, current_label = x[i], y[i]\n", + " if not current_token.startswith('▁') and current_token not in rejected:\n", + " previous_token, previous_label = new_paired_tokens.pop()\n", + " merged_token = previous_token\n", + " merged_label = [previous_label]\n", + " while (\n", + " not current_token.startswith('▁')\n", + " and current_token not in rejected\n", + " ):\n", + " merged_token = merged_token + 
current_token.replace('▁', '')\n", + " merged_label.append(current_label)\n", + " i = i + 1\n", + " current_token, current_label = x[i], y[i]\n", + " merged_label = merged_label[0]\n", + " new_paired_tokens.append((merged_token, merged_label))\n", + "\n", + " else:\n", + " new_paired_tokens.append((current_token, current_label))\n", + " i = i + 1\n", + "\n", + " words = [\n", + " i[0].replace('▁', '')\n", + " for i in new_paired_tokens\n", + " if i[0] not in ['', '']\n", + " ]\n", + " labels = [i[1] for i in new_paired_tokens if i[0] not in ['', '']]\n", + " return words, labels" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from unidecode import unidecode\n", + "from malaya.function.parse_dependency import DependencyGraph\n", + "\n", + "PUNCTUATION = '!\"#$%&\\'()*+,./:;<=>?@[\\]^_`{|}~'\n", + "\n", + "def transformer_textcleaning(string):\n", + " \"\"\"\n", + " use by any transformer model before tokenization\n", + " \"\"\"\n", + " string = unidecode(string)\n", + " string = re.sub('\\\\(dot\\\\)', '.', string)\n", + " string = (\n", + " re.sub(re.findall(r'\\', string)[0], '', string)\n", + " if (len(re.findall(r'\\', string)) > 0)\n", + " and ('href' in re.findall(r'\\', string)[0])\n", + " else string\n", + " )\n", + " string = re.sub(\n", + " r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n", + " )\n", + " string = re.sub(r'[ ]+', ' ', string).strip().split()\n", + " string = [w for w in string if w[0] != '@']\n", + " string = ' '.join(string)\n", + " string = re.sub(f'([{PUNCTUATION}])', r' \\1 ', string)\n", + " string = re.sub('\\s{2,}', ' ', string)\n", + " original_string = string.split()\n", + " string = [\n", + " (original_string[no], word.title() if word.isupper() else word)\n", + " for no, word in enumerate(string.split())\n", + " if len(word)\n", + " ]\n", + " return [s[0] for s in string], [s[1] for s in string]\n", + "\n", + "def 
parse_X(left):\n", + " left = ' '.join(left)\n", + " bert_tokens = tokenize_fn(left)\n", + " bert_tokens.extend([3, 4])\n", + " segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]\n", + " input_mask = [0] * len(segment)\n", + " s_tokens = [sp_model.IdToPiece(i) for i in bert_tokens]\n", + " return bert_tokens, segment, input_mask, s_tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def dependency_graph(tagging, indexing):\n", + " \"\"\"\n", + " Return helper object for dependency parser results. Only accept tagging and indexing outputs from dependency models.\n", + " \"\"\"\n", + " result = []\n", + " for i in range(len(tagging)):\n", + " result.append(\n", + " '%d\\t%s\\t_\\t_\\t_\\t_\\t%d\\t%s\\t_\\t_'\n", + " % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])\n", + " )\n", + " return DependencyGraph('\\n'.join(result), top_relation_label='root')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 2519/2519 [14:37<00:00, 2.87it/s, accuracy=1, accuracy_depends=1, cost=0.00701] \n", - "test minibatch loop: 100%|██████████| 630/630 [02:02<00:00, 5.15it/s, accuracy=0.962, accuracy_depends=0.853, cost=4.4] \n", - "train minibatch loop: 0%| | 0/2519 [00:00\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "2\n", + "2 (makan)\n", + "\n", + "\n", + "\n", + "0->2\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "3\n", + "3 (ayam)\n", + "\n", + "\n", + "\n", + "0->3\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "1\n", + "1 (husein)\n", + "\n", + "\n", + "\n", + "2->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ 
+ "string = 'husein makan ayam'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, segment_sequence, mask_sequence, xlnet_sequence = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " model.segment_ids: [segment_sequence],\n", + " model.input_masks: [mask_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(xlnet_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(xlnet_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Kuala)\n", + "\n", + "\n", + "\n", + "0->1\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "13\n", + "13 (membidas)\n", + "\n", + "\n", + "\n", + "0->13\n", + "\n", + "\n", + "parataxis\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Lumpur)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "3\n", + "3 (:)\n", + "\n", + "\n", + "\n", + "1->3\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "7\n", + "7 (,)\n", + "\n", + "\n", + "\n", + "1->7\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "4\n", + "4 (Ketua)\n", + "\n", + "\n", + "\n", + "13->4\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "8\n", + "8 (Datuk)\n", + "\n", + "\n", + "\n", + "13->8\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "33\n", + "33 (melaksanakan)\n", + "\n", + "\n", + "\n", + 
"13->33\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "37\n", + "37 (.)\n", + "\n", + "\n", + "\n", + "13->37\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "5\n", + "5 (Penerangan)\n", + "\n", + "\n", + "\n", + "4->5\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "6\n", + "6 (Bersatu)\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "9\n", + "9 (Wan)\n", + "\n", + "\n", + "\n", + "8->9\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "10\n", + "10 (Saiful)\n", + "\n", + "\n", + "\n", + "9->10\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "11\n", + "11 (Wan)\n", + "\n", + "\n", + "\n", + "10->11\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "12\n", + "12 (Jan)\n", + "\n", + "\n", + "\n", + "11->12\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "34\n", + "34 (sekatan)\n", + "\n", + "\n", + "\n", + "33->34\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "39\n", + "39 (berkata)\n", + "\n", + "\n", + "\n", + "33->39\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "14\n", + "14 (kenyataan)\n", + "\n", + "\n", + "\n", + "16\n", + "16 (Seri)\n", + "\n", + "\n", + "\n", + "14->16\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "15\n", + "15 (Datuk)\n", + "\n", + "\n", + "\n", + "15->15\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "17\n", + "17 (Najib)\n", + "\n", + "\n", + "\n", + "15->17\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "20\n", + "20 (Ketua)\n", + "\n", + "\n", + "\n", + "15->20\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "24\n", + "24 (Datuk)\n", + "\n", + "\n", + "\n", + "15->24\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "19\n", + "19 (dan)\n", + "\n", + "\n", + "\n", + "20->19\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "22\n", + "22 (Umno)\n", + "\n", + "\n", + "\n", + "20->22\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + 
"25\n", + "25 (Dr)\n", + "\n", + "\n", + "\n", + "20->25\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "21\n", + "21 (Pemuda)\n", + "\n", + "\n", + "\n", + "24->21\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "23\n", + "23 (,)\n", + "\n", + "\n", + "\n", + "24->23\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "30\n", + "30 (mempertikaikan)\n", + "\n", + "\n", + "\n", + "24->30\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "18\n", + "18 (Razak)\n", + "\n", + "\n", + "\n", + "21->18\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "29\n", + "29 (yang)\n", + "\n", + "\n", + "\n", + "30->29\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "31\n", + "31 (tindakan)\n", + "\n", + "\n", + "\n", + "30->31\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "26\n", + "26 (Asyraf)\n", + "\n", + "\n", + "\n", + "26->26\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "27\n", + "27 (Wajdi)\n", + "\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "28\n", + "28 (Dusuki)\n", + "\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "32\n", + "32 (kerajaan)\n", + "\n", + "\n", + "\n", + "31->32\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "35\n", + "35 (pergerakan)\n", + "\n", + "\n", + "\n", + "34->35\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "46\n", + "46 (memetik)\n", + "\n", + "\n", + "\n", + "39->46\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "36\n", + "36 (penuh)\n", + "\n", + "\n", + "\n", + "35->36\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "38\n", + "38 (Beliau)\n", + "\n", + "\n", + "\n", + "46->38\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "40\n", + "40 (,)\n", + "\n", + "\n", + "\n", + "46->40\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "41\n", + "41 (Najib)\n", + "\n", + "\n", + "\n", + "46->41\n", + "\n", + "\n", + 
"nsubj\n", + "\n", + "\n", + "\n", + "45\n", + "45 (sengaja)\n", + "\n", + "\n", + "\n", + "46->45\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "61\n", + "61 (.)\n", + "\n", + "\n", + "\n", + "46->61\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "47\n", + "47 (kenyataan)\n", + "\n", + "\n", + "\n", + "46->47\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "64\n", + "64 (berkata)\n", + "\n", + "\n", + "\n", + "46->64\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "43\n", + "43 (Asyraf)\n", + "\n", + "\n", + "\n", + "41->43\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "42\n", + "42 (dan)\n", + "\n", + "\n", + "\n", + "43->42\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "44\n", + "44 (Wajdi)\n", + "\n", + "\n", + "\n", + "43->44\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "48\n", + "48 (Perdana)\n", + "\n", + "\n", + "\n", + "47->48\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "64->14\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (Wan)\n", + "\n", + "\n", + "\n", + "64->62\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "65\n", + "65 (,)\n", + "\n", + "\n", + "\n", + "64->65\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "83\n", + "83 (.)\n", + "\n", + "\n", + "\n", + "64->83\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "68\n", + "68 (menjangka)\n", + "\n", + "\n", + "\n", + "64->68\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "49\n", + "49 (Menteri)\n", + "\n", + "\n", + "\n", + "48->49\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "50\n", + "50 (,)\n", + "\n", + "\n", + "\n", + "48->50\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "51\n", + "51 (Tan)\n", + "\n", + "\n", + "\n", + "48->51\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "57\n", + "57 (lengkap)\n", + "\n", + "\n", + "\n", + "48->57\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "52\n", 
+ "52 (Sri)\n", + "\n", + "\n", + "\n", + "51->52\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "55\n", + "55 (yang)\n", + "\n", + "\n", + "\n", + "57->55\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (tidak)\n", + "\n", + "\n", + "\n", + "57->56\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "59\n", + "59 (mengelirukan)\n", + "\n", + "\n", + "\n", + "57->59\n", + "\n", + "\n", + "ccomp\n", + "\n", + "\n", + "\n", + "53\n", + "53 (Muhyiddin)\n", + "\n", + "\n", + "\n", + "52->53\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "54\n", + "54 (Yassin)\n", + "\n", + "\n", + "\n", + "53->54\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "58\n", + "58 (untuk)\n", + "\n", + "\n", + "\n", + "59->58\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "60\n", + "60 (rakyat)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "63\n", + "63 (Saiful)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "66\n", + "66 (beliau)\n", + "\n", + "\n", + "\n", + "68->66\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "67\n", + "67 (sudah)\n", + "\n", + "\n", + "\n", + "68->67\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "69\n", + "69 (ada)\n", + "\n", + "\n", + "\n", + "68->69\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "70\n", + "70 (kenyataan)\n", + "\n", + "\n", + "\n", + "69->70\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "73\n", + "73 (Najib)\n", + "\n", + "\n", + "\n", + "69->73\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tulisan)\n", + "\n", + "\n", + "\n", + "69->75\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "71\n", + "71 (balas)\n", + "\n", + "\n", + "\n", + "70->71\n", + "\n", + "\n", + "ccomp\n", + "\n", + "\n", + "\n", + "72\n", + "72 (daripada)\n", + "\n", + "\n", + "\n", + "73->72\n", + "\n", + "\n", + "case\n", + 
"\n", + "\n", + "\n", + "74\n", + "74 (mengenai)\n", + "\n", + "\n", + "\n", + "75->74\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "77\n", + "77 (berhubung)\n", + "\n", + "\n", + "\n", + "75->77\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "76\n", + "76 (beliau)\n", + "\n", + "\n", + "\n", + "77->76\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "78\n", + "78 (kesan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "79\n", + "79 (positif)\n", + "\n", + "\n", + "\n", + "78->79\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "80\n", + "80 (sekatan)\n", + "\n", + "\n", + "\n", + "78->80\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "81\n", + "81 (pergerakan)\n", + "\n", + "\n", + "\n", + "80->81\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "82\n", + "82 (penuh)\n", + "\n", + "\n", + "\n", + "81->82\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "from tqdm import tqdm\n", - "\n", - "epoch = 5\n", - "for e in range(epoch):\n", - " train_acc, train_loss = [], []\n", - " test_acc, test_loss = [], []\n", - " train_acc_depends, test_acc_depends = [], []\n", - " \n", - " pbar = tqdm(\n", - " range(0, len(train_X), batch_size), desc = 'train minibatch loop'\n", - " )\n", - " for i in pbar:\n", - " index = min(i + batch_size, len(train_X))\n", - " batch_x = train_X[i: index]\n", - " batch_x = pad_sequences(batch_x,padding='post')\n", - " batch_y = train_Y[i: index]\n", - " batch_y = pad_sequences(batch_y,padding='post')\n", - " batch_depends = train_depends[i: index]\n", - " batch_depends = pad_sequences(batch_depends,padding='post')\n", - " batch_segments = segments_train[i: index]\n", - " batch_segments = pad_sequences(batch_segments, padding='post', value = 4)\n", - " batch_masks = masks_train[i: index]\n", - " 
batch_masks = pad_sequences(batch_masks, padding='post', value = 1)\n", - " \n", - " acc_depends, acc, cost, _ = sess.run(\n", - " [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.types: batch_y,\n", - " model.heads: batch_depends,\n", - " model.segment_ids: batch_segments,\n", - " model.input_masks: batch_masks,\n", - " model.switch: True\n", - " },\n", - " )\n", - " train_loss.append(cost)\n", - " train_acc.append(acc)\n", - " train_acc_depends.append(acc_depends)\n", - " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", - " \n", - " pbar = tqdm(\n", - " range(0, len(test_X), batch_size), desc = 'test minibatch loop'\n", - " )\n", - " for i in pbar:\n", - " index = min(i + batch_size, len(test_X))\n", - " batch_x = test_X[i: index]\n", - " batch_x = pad_sequences(batch_x,padding='post')\n", - " batch_y = test_Y[i: index]\n", - " batch_y = pad_sequences(batch_y,padding='post')\n", - " batch_depends = test_depends[i: index]\n", - " batch_depends = pad_sequences(batch_depends,padding='post')\n", - " batch_segments = segments_test[i: index]\n", - " batch_segments = pad_sequences(batch_segments, padding='post', value = 4)\n", - " batch_masks = masks_test[i: index]\n", - " batch_masks = pad_sequences(batch_masks, padding='post', value = 1)\n", - " \n", - " acc_depends, acc, cost = sess.run(\n", - " [model.accuracy_depends, model.accuracy, model.cost],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.types: batch_y,\n", - " model.heads: batch_depends,\n", - " model.segment_ids: batch_segments,\n", - " model.input_masks: batch_masks,\n", - " model.switch: True\n", - " },\n", - " )\n", - " test_loss.append(cost)\n", - " test_acc.append(acc)\n", - " test_acc_depends.append(acc_depends)\n", - " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", - " \n", - " \n", - " print(\n", - " 'epoch: %d, training loss: 
%f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\\n'\n", - " % (e, np.mean(train_loss), \n", - " np.mean(train_acc), \n", - " np.mean(train_acc_depends), \n", - " np.mean(test_loss), \n", - " np.mean(test_acc), \n", - " np.mean(test_acc_depends)\n", - " ))" + "string = 'KUALA LUMPUR: Ketua Penerangan BERSATU, Datuk Wan Saiful Wan Jan membidas kenyataan Datuk Seri Najib Razak dan Ketua Pemuda UMNO, Datuk Dr Asyraf Wajdi Dusuki yang mempertikaikan tindakan kerajaan melaksanakan sekatan pergerakan penuh. Beliau berkata, Najib dan Asyraf Wajdi sengaja memetik kenyataan Perdana Menteri, Tan Sri Muhyiddin Yassin yang tidak lengkap untuk mengelirukan rakyat. Wan Saiful berkata, beliau sudah menjangka ada kenyataan balas daripada Najib mengenai tulisan beliau berhubung kesan positif sekatan pergerakan penuh.'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, segment_sequence, mask_sequence, xlnet_sequence = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " model.segment_ids: [segment_sequence],\n", + " model.input_masks: [mask_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(xlnet_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(xlnet_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1515,7 +2233,7 @@ "'alxlnet-base-dependency/model.ckpt'" ] }, - "execution_count": 35, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1527,7 +2245,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 25, 
"metadata": {}, "outputs": [], "source": [ @@ -1543,12 +2261,14 @@ " clamp_len=-1)\n", "\n", "xlnet_parameters = xlnet.RunConfig(**kwargs)\n", - "xlnet_config = xlnet.XLNetConfig(json_path='alxlnet-base/config.json')" + "xlnet_config = xlnet.XLNetConfig(\n", + " json_path = 'alxlnet-base-2020-04-10/config.json'\n", + ")" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1558,19 +2278,11 @@ "INFO:tensorflow:memory input None\n", "INFO:tensorflow:Use float type \n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", - " warnings.warn('An interactive session is already active. This can '\n" - ] } ], "source": [ "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "tf.reset_default_graph()\n", "sess = tf.InteractiveSession()\n", @@ -1580,7 +2292,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1598,7 +2310,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1614,7 +2326,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -1659,14 +2371,14 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 630/630 [01:59<00:00, 5.27it/s]\n" + "100%|██████████| 1250/1250 [02:00<00:00, 10.39it/s]\n" ] } ], @@ -1682,9 +2394,9 @@ " batch_y = pad_sequences(batch_y,padding='post')\n", " batch_depends = test_depends[i: 
index]\n", " batch_depends = pad_sequences(batch_depends,padding='post')\n", - " batch_segments = segments_test[i: index]\n", + " batch_segments = test_segments[i: index]\n", " batch_segments = pad_sequences(batch_segments, padding='post', value = 4)\n", - " batch_masks = masks_test[i: index]\n", + " batch_masks = test_masks[i: index]\n", " batch_masks = pad_sequences(batch_masks, padding='post', value = 1)\n", " \n", " tags_seq, heads = sess.run(\n", @@ -1709,7 +2421,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -1719,12 +2431,12 @@ " \n", "temp_predict_Y = []\n", "for r in predict_Y:\n", - " temp_predict_Y.extend(r)" + " temp_predict_Y.extend(r)\n" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1733,43 +2445,42 @@ "text": [ " precision recall f1-score support\n", "\n", - " PAD 0.99999 1.00000 0.99999 644667\n", - " X 0.99998 0.99999 0.99998 144988\n", - " acl 0.95995 0.96137 0.96066 6058\n", - " advcl 0.91687 0.93839 0.92751 2386\n", - " advmod 0.97160 0.97620 0.97389 9496\n", - " amod 0.95264 0.94761 0.95012 8342\n", - " appos 0.97560 0.97638 0.97599 4995\n", - " aux 1.00000 1.00000 1.00000 6\n", - " case 0.99147 0.98685 0.98916 21680\n", - " cc 0.97523 0.99377 0.98441 6418\n", - " ccomp 0.95249 0.90112 0.92610 890\n", - " compound 0.95478 0.95656 0.95567 13399\n", - "compound:plur 0.97575 0.98067 0.97821 1190\n", - " conj 0.96575 0.98929 0.97738 8494\n", - " cop 0.98201 0.98708 0.98454 1935\n", - " csubj 1.00000 0.90476 0.95000 42\n", - " csubj:pass 0.91667 0.91667 0.91667 12\n", - " dep 0.96490 0.94781 0.95628 1073\n", - " det 0.96461 0.97375 0.96916 8230\n", - " fixed 0.95762 0.92188 0.93941 1152\n", - " flat 0.98208 0.98030 0.98119 20967\n", - " iobj 1.00000 0.82927 0.90667 41\n", - " mark 0.96463 0.95609 0.96034 2824\n", - " nmod 0.96933 0.95492 0.96207 8207\n", - " nsubj 0.97533 0.97086 0.97309 12867\n", - " 
nsubj:pass 0.95811 0.94145 0.94970 3911\n", - " nummod 0.98952 0.98590 0.98770 7659\n", - " obj 0.97249 0.96839 0.97044 10440\n", - " obl 0.97129 0.97222 0.97175 11483\n", - " parataxis 0.95691 0.91348 0.93469 705\n", - " punct 0.99883 0.99955 0.99919 33252\n", - " root 0.98284 0.98372 0.98328 10073\n", - " xcomp 0.92520 0.94988 0.93738 2474\n", + " PAD 0.99976 1.00000 0.99988 339805\n", + " X 1.00000 0.99936 0.99968 62631\n", + " acl 0.83797 0.82854 0.83323 3202\n", + " advcl 0.63223 0.66865 0.64993 1684\n", + " advmod 0.95967 0.93403 0.94668 6700\n", + " amod 0.90195 0.90054 0.90124 4464\n", + " appos 0.85624 0.74838 0.79869 3088\n", + " case 0.98172 0.98093 0.98133 11117\n", + " cc 0.98209 0.97993 0.98101 3637\n", + " ccomp 0.50143 0.49157 0.49645 356\n", + " compound 0.91041 0.92329 0.91681 11381\n", + "compound:plur 0.55000 0.66667 0.60274 33\n", + " conj 0.89766 0.88735 0.89248 5140\n", + " cop 0.96975 0.97302 0.97138 593\n", + " csubj 0.25000 0.16667 0.20000 6\n", + " csubj:pass 0.00000 0.00000 0.00000 1\n", + " dep 0.65012 0.72576 0.68586 361\n", + " det 0.94992 0.91641 0.93286 3912\n", + " fixed 0.87597 0.77397 0.82182 146\n", + " flat 0.95129 0.97231 0.96169 18638\n", + " iobj 1.00000 0.25000 0.40000 4\n", + " mark 0.91882 0.92439 0.92160 1812\n", + " nmod 0.85028 0.84376 0.84701 4429\n", + " nsubj 0.83595 0.87600 0.85551 6992\n", + " nsubj:pass 0.82935 0.78216 0.80506 1951\n", + " nummod 0.97242 0.95072 0.96145 4302\n", + " obj 0.90211 0.91702 0.90950 6351\n", + " obl 0.87606 0.85241 0.86408 5075\n", + " parataxis 0.46886 0.31762 0.37870 403\n", + " punct 0.99703 0.99631 0.99667 20881\n", + " root 0.89346 0.91070 0.90200 10000\n", + " xcomp 0.75430 0.74016 0.74716 1601\n", "\n", - " accuracy 0.99475 1010356\n", - " macro avg 0.97044 0.95958 0.96462 1010356\n", - " weighted avg 0.99476 0.99475 0.99475 1010356\n", + " accuracy 0.97964 540696\n", + " macro avg 0.81115 0.77808 0.78633 540696\n", + " weighted avg 0.97957 0.97964 0.97954 540696\n", "\n" ] } @@ 
-1781,16 +2492,16 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "arc accuracy: 0.8943757029483008\n", - "types accuracy: 0.88690168487317\n", - "root accuracy: 0.9425595238095238\n" + "arc accuracy: 0.8492916621693761\n", + "types accuracy: 0.8281206797454291\n", + "root accuracy: 0.9209962611210157\n" ] } ], @@ -1802,60 +2513,9 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 34, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Placeholder',\n", - " 'Placeholder_1',\n", - " 'Placeholder_2',\n", - " 'Placeholder_3',\n", - " 'Placeholder_4',\n", - " 'Placeholder_5',\n", - " 'W_d',\n", - " 'W_e',\n", - " 'U',\n", - " 'U-bi',\n", - " 'Wl',\n", - " 'Wr',\n", - " 'model/transformer/r_w_bias',\n", - " 'model/transformer/r_r_bias',\n", - " 'model/transformer/word_embedding/lookup_table',\n", - " 'model/transformer/word_embedding/lookup_table_2',\n", - " 'model/transformer/r_s_bias',\n", - " 'model/transformer/seg_embed',\n", - " 'model/transformer/layer_shared/rel_attn/q/kernel',\n", - " 'model/transformer/layer_shared/rel_attn/k/kernel',\n", - " 'model/transformer/layer_shared/rel_attn/v/kernel',\n", - " 'model/transformer/layer_shared/rel_attn/r/kernel',\n", - " 'model/transformer/layer_shared/rel_attn/o/kernel',\n", - " 'model/transformer/layer_shared/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_shared/ff/layer_1/kernel',\n", - " 'model/transformer/layer_shared/ff/layer_1/bias',\n", - " 'model/transformer/layer_shared/ff/layer_2/kernel',\n", - " 'model/transformer/layer_shared/ff/layer_2/bias',\n", - " 'model/transformer/layer_shared/ff/LayerNorm/gamma',\n", - " 'dense/kernel',\n", - " 'dense/bias',\n", - " 'dense_1/kernel',\n", - " 'dense_1/bias',\n", - " 'dense_2/kernel',\n", - " 'dense_2/bias',\n", - " 'dense_3/kernel',\n", - " 'dense_3/bias',\n", - " 'heads_seq',\n", - " 'tags_seq',\n", 
- " 'transitions',\n", - " 'logits']" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "strings = ','.join(\n", " [\n", @@ -1873,13 +2533,12 @@ " and 'adam' not in n.name\n", " and 'gradients/bert' not in n.name\n", " ]\n", - ")\n", - "strings.split(',')" + ")" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -1914,7 +2573,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -1922,7 +2581,7 @@ "output_type": "stream", "text": [ "INFO:tensorflow:Restoring parameters from alxlnet-base-dependency/model.ckpt\n", - "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use `tf.compat.v1.graph_util.convert_variables_to_constants`\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/graph_util_impl.py:277: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", @@ -1940,27 +2599,57 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "import boto3\n", - "\n", - "bucketName = 'huseinhouse-storage'\n", - "Key = 'alxlnet-base-dependency/frozen_model.pb'\n", - "outPutname = \"v34/dependency/alxlnet-base-dependency.pb\"\n", - "\n", - "s3 = boto3.client('s3')\n", - "\n", - "s3.upload_file(Key,bucketName,outPutname)" + "transforms = ['add_default_attributes',\n", + " 'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n", + " 'fold_batch_norms',\n", + 
" 'fold_old_batch_norms',\n", + " 'quantize_weights(fallback_min=-10, fallback_max=10)',\n", + " 'strip_unused_nodes',\n", + " 'sort_by_execution_order']" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From :6: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.gfile.GFile.\n" + ] + } + ], + "source": [ + "from tensorflow.tools.graph_transforms import TransformGraph\n", + "tf.set_random_seed(0)\n", + "\n", + "pb = 'alxlnet-base-dependency/frozen_model.pb'\n", + "input_graph_def = tf.GraphDef()\n", + "with tf.gfile.FastGFile(pb, 'rb') as f:\n", + " input_graph_def.ParseFromString(f.read())\n", + "\n", + "if 'bert' in pb:\n", + " inputs = ['Placeholder']\n", + " a = ['dense/BiasAdd']\n", + "if 'xlnet' in pb:\n", + " inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n", + " a = ['transpose_3']\n", + "\n", + "transformed_graph_def = TransformGraph(input_graph_def, \n", + " inputs,\n", + " ['logits', 'heads_seq'] + a, transforms)\n", + "\n", + "with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n", + " f.write(transformed_graph_def.SerializeToString())" + ] } ], "metadata": { @@ -1979,7 +2668,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/session/dependency/bert-base.ipynb b/session/dependency/bert-base.ipynb index 23afbdf6..36167fa2 100644 --- a/session/dependency/bert-base.ipynb +++ b/session/dependency/bert-base.ipynb @@ -7,132 +7,49 @@ "outputs": [], "source": [ "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES'] = '3'" + "os.environ['CUDA_VISIBLE_DEVICES'] = '2'" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [ 
- "with open('../Malaya-Dataset/dependency/gsd-ud-train.conllu.txt') as fopen:\n", - " corpus = fopen.read().split('\\n')\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-test.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-dev.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", "\n" ] } ], "source": [ "import bert\n", - "from bert import run_classifier\n", "from bert import optimization\n", "from bert import tokenization\n", "from bert import modeling\n", + "import numpy as np\n", + "import json\n", "import tensorflow as tf\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import unicodedata\n", - "import six\n", - "from functools import partial\n", - "\n", - "SPIECE_UNDERLINE = '▁'\n", - "\n", - "def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):\n", - " if remove_space:\n", - " outputs = ' '.join(inputs.strip().split())\n", - " else:\n", - " outputs = inputs\n", - " outputs = outputs.replace(\"``\", '\"').replace(\"''\", '\"')\n", - "\n", - " if six.PY2 and isinstance(outputs, str):\n", - " outputs = outputs.decode('utf-8')\n", - "\n", - " if not keep_accents:\n", - " outputs = unicodedata.normalize('NFKD', outputs)\n", - " outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])\n", - " if lower:\n", - 
" outputs = outputs.lower()\n", - "\n", - " return outputs\n", - "\n", - "\n", - "def encode_pieces(sp_model, text, return_unicode=True, sample=False):\n", - " # return_unicode is used only for py2\n", - "\n", - " # note(zhiliny): in some systems, sentencepiece only accepts str for py2\n", - " if six.PY2 and isinstance(text, unicode):\n", - " text = text.encode('utf-8')\n", - "\n", - " if not sample:\n", - " pieces = sp_model.EncodeAsPieces(text)\n", - " else:\n", - " pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1)\n", - " new_pieces = []\n", - " for piece in pieces:\n", - " if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():\n", - " cur_pieces = sp_model.EncodeAsPieces(\n", - " piece[:-1].replace(SPIECE_UNDERLINE, ''))\n", - " if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:\n", - " if len(cur_pieces[0]) == 1:\n", - " cur_pieces = cur_pieces[1:]\n", - " else:\n", - " cur_pieces[0] = cur_pieces[0][1:]\n", - " cur_pieces.append(piece[-1])\n", - " new_pieces.extend(cur_pieces)\n", - " else:\n", - " new_pieces.append(piece)\n", - "\n", - " # note(zhiliny): convert back to unicode for py2\n", - " if six.PY2 and return_unicode:\n", - " ret_pieces = []\n", - " for piece in new_pieces:\n", - " if isinstance(piece, str):\n", - " piece = piece.decode('utf-8')\n", - " ret_pieces.append(piece)\n", - " new_pieces = ret_pieces\n", - "\n", - " return new_pieces\n", - "\n", - "\n", - "def encode_ids(sp_model, text, sample=False):\n", - " pieces = encode_pieces(sp_model, text, return_unicode=False, sample=sample)\n", - " ids = [sp_model.PieceToId(piece) for piece in pieces]\n", - " return ids" + "import itertools\n", + "import collections\n", + "import re\n", + "import random\n", + "import sentencepiece as spm\n", + "from unidecode import unidecode\n", + "from sklearn.utils import shuffle\n", + "from tqdm import tqdm\n", + "from prepro_utils import preprocess_text, encode_ids, encode_pieces\n", + "from malaya.text.function import 
transformer_textcleaning as cleaning" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "import sentencepiece as spm\n", - "\n", "sp_model = spm.SentencePieceProcessor()\n", "sp_model.Load('sp10m.cased.bert.model')\n", "\n", @@ -141,282 +58,90 @@ "v = [i.split('\\t') for i in v]\n", "v = {i[0]: i[1] for i in v}\n", "\n", + "\n", "class Tokenizer:\n", - " def __init__(self, v):\n", + " def __init__(self, v, sp_model):\n", " self.vocab = v\n", - " pass\n", - " \n", + " self.sp_model = sp_model\n", + "\n", " def tokenize(self, string):\n", - " return encode_pieces(sp_model, string, return_unicode=False, sample=False)\n", - " \n", + " return encode_pieces(\n", + " self.sp_model, string, return_unicode = False, sample = False\n", + " )\n", + "\n", " def convert_tokens_to_ids(self, tokens):\n", - " return [sp_model.PieceToId(piece) for piece in tokens]\n", - " \n", + " return [self.sp_model.PieceToId(piece) for piece in tokens]\n", + "\n", " def convert_ids_to_tokens(self, ids):\n", - " return [sp_model.IdToPiece(i) for i in ids]\n", - " \n", - "tokenizer = Tokenizer(v)" + " return [self.sp_model.IdToPiece(i) for i in ids]\n", + "\n", + "\n", + "tokenizer = Tokenizer(v, sp_model)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "tag2idx = {'PAD': 0, 'X': 1}\n", - "tag_idx = 2\n", + "import pickle\n", "\n", - "def process_corpus(corpus, until = None):\n", - " global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx\n", - " sentences, words, depends, labels, pos, sequences = [], [], [], [], [], []\n", - " temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []\n", - " first_time = True\n", - " for sentence in corpus:\n", - " try:\n", - " if len(sentence):\n", - " if sentence[0] == '#':\n", - " continue\n", - " if first_time:\n", - " print(sentence)\n", - " first_time = False\n", - " sentence = 
sentence.split('\\t')\n", - " if sentence[7] not in tag2idx:\n", - " tag2idx[sentence[7]] = tag_idx\n", - " tag_idx += 1\n", - " temp_word.append(sentence[1])\n", - " temp_depend.append(int(sentence[6]) + 1)\n", - " temp_label.append(tag2idx[sentence[7]])\n", - " temp_sentence.append(sentence[1])\n", - " temp_pos.append(sentence[3])\n", - " else:\n", - " if len(temp_sentence) < 2 or len(temp_word) != len(temp_label):\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " continue\n", - " bert_tokens = ['[CLS]']\n", - " labels_ = [0]\n", - " depends_ = [0]\n", - " seq_ = []\n", - " for no, orig_token in enumerate(temp_word):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " seq_.append(no + 1)\n", - " bert_tokens.append('[SEP]')\n", - " labels_.append(0)\n", - " depends_.append(0)\n", - " words.append(tokenizer.convert_tokens_to_ids(bert_tokens))\n", - " depends.append(depends_)\n", - " labels.append(labels_)\n", - " sentences.append(bert_tokens)\n", - " pos.append(temp_pos)\n", - " sequences.append(seq_)\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " except Exception as e:\n", - " print(e, sentence)\n", - " return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1], sequences[:-1]" + "with open('train_X.pkl', 'rb') as fopen:\n", + " train_X, train_Y, train_depends = pickle.load(fopen)\n", + " \n", + "with open('test_X.pkl', 'rb') as fopen:\n", + " test_X, test_Y, test_depends = pickle.load(fopen)\n", + " \n", + "with open('tags.pkl', 'rb') as fopen:\n", + " idx2tag, tag2idx = pickle.load(fopen)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ - "1\tSembungan\tsembungan\tPROPN\tX--\t_\t4\tnsubj\t_\tMorphInd=^sembungan_X--$\n" + "WARNING:tensorflow:From /home/husein/bert-standard/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.\n", + "\n" ] } ], "source": [ - "sentences, words, depends, labels, _, _ = process_corpus(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open('../Malaya-Dataset/dependency/augmented-dependency.json') as fopen:\n", - " augmented = json.load(fopen)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "text_augmented, depends_augmented, labels_augmented = [], [], []\n", - "\n", - "for a in augmented:\n", - " text_augmented.extend(a[0])\n", - " depends_augmented.extend(a[1])\n", - " labels_augmented.extend((np.array(a[2]) + 1).tolist())" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def parse_XY(texts, depends, labels):\n", - " outside, sentences, outside_depends, outside_labels = [], [], [], []\n", - " for no, text in enumerate(texts):\n", - " temp_depend = depends[no]\n", - " temp_label = labels[no]\n", - " s = text.split()\n", - " sentences.append(s)\n", - " bert_tokens = ['[CLS]']\n", - " labels_ = [0]\n", - " depends_ = [0]\n", - " for no, orig_token in enumerate(s):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " bert_tokens.append('[SEP]')\n", - " labels_.append(0)\n", - " depends_.append(0)\n", - " outside.append(tokenizer.convert_tokens_to_ids(bert_tokens))\n", - " outside_depends.append(depends_)\n", - " outside_labels.append(labels_)\n", - " return 
outside, sentences, outside_depends, outside_labels" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "outside, _, outside_depends, outside_labels = parse_XY(text_augmented, \n", - " depends_augmented, \n", - " labels_augmented)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "words.extend(outside)\n", - "depends.extend(outside_depends)\n", - "labels.extend(outside_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "idx2tag = {v:k for k, v in tag2idx.items()}" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "words_train, words_test, depends_train, depends_test, labels_train, labels_test \\\n", - "= train_test_split(words, depends, labels, test_size = 0.2)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(40289, 10073)" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(words_train), len(words_test)" + "bert_config = modeling.BertConfig.from_json_file(\n", + " 'bert-base-2020-03-19/bert_config.json'\n", + ")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "train_X = words_train\n", - "train_Y = labels_train\n", - "train_depends = depends_train\n", - "\n", - "test_X = words_test\n", - "test_Y = labels_test\n", - "test_depends = depends_test" + "BERT_INIT_CHKPNT = 'bert-base-2020-03-19/model.ckpt-2000002'" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "BERT_INIT_CHKPNT = 'bert-base-v3/model.ckpt'\n", - "BERT_CONFIG = 'bert-base-v3/config.json'" - ] - 
}, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.\n", - "\n" - ] - } - ], - "source": [ - "epoch = 30\n", - "batch_size = 32\n", + "epoch = 3\n", + "batch_size = 16\n", "warmup_proportion = 0.1\n", "num_train_steps = int(len(train_X) / batch_size * epoch)\n", - "num_warmup_steps = int(num_train_steps * warmup_proportion)\n", - "bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)" + "num_warmup_steps = int(num_train_steps * warmup_proportion)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -449,8 +174,15 @@ " e = tf.expand_dims(tf.expand_dims(mask_e, 1), 2)\n", " output = output * d * e\n", " \n", - " return output\n", - " \n", + " return output" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ "class BiLinear:\n", " def __init__(self, left_features, right_features, out_features):\n", " self.left_features = left_features\n", @@ -475,8 +207,17 @@ " output = output + tf.matmul(input_left, tf.transpose(self.W_l))\\\n", " + tf.matmul(input_right, tf.transpose(self.W_r))\n", " \n", - " return tf.reshape(output, output_shape)\n", - " \n", + " return tf.reshape(output, output_shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "_NEG_INF = -1e9\n", + "\n", "class Model:\n", " def __init__(\n", " self,\n", @@ -508,14 +249,20 @@ " config=bert_config,\n", " is_training=training,\n", " input_ids=self.words,\n", + " input_mask=self.mask,\n", " use_one_hot_embeddings=False)\n", + " \n", " output_layer = model.get_sequence_output()\n", " \n", " arc_h = tf.nn.elu(self.arc_h(output_layer))\n", " arc_c = 
tf.nn.elu(self.arc_c(output_layer))\n", + " self._arc_h = arc_h\n", + " self._arc_c = arc_c\n", " \n", " type_h = tf.nn.elu(self.type_h(output_layer))\n", " type_c = tf.nn.elu(self.type_c(output_layer))\n", + " self._type_h = type_h\n", + " self._type_c = type_c\n", " \n", " out_arc = tf.squeeze(self.attention.forward(arc_h, arc_c, mask_d=self.mask, \n", " mask_e=self.mask), axis = 1)\n", @@ -534,6 +281,11 @@ " self.heads_seq = tf.argmax(decode_arc, axis = 1)\n", " self.heads_seq = tf.identity(self.heads_seq, name = 'heads_seq')\n", " \n", + "# self.decode_arc_t = tf.transpose(decode_arc, (0, 2, 1))\n", + "# sequence_loss_depends = tf.contrib.seq2seq.sequence_loss(logits = self.decode_arc_t,\n", + "# targets = self.heads,\n", + "# weights = mask)\n", + " \n", " t = tf.cast(tf.transpose(self.heads_seq), tf.int32)\n", " broadcasted = tf.broadcast_to(batch_index, tf.shape(t))\n", " concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), \n", @@ -608,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -626,33 +378,33 @@ " * https://github.com/tensorflow/io (for I/O related ops)\n", "If you depend on functionality not listed there, please file an issue.\n", "\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:171: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/modeling.py:171: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n", "\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:490: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/modeling.py:490: The name tf.assert_less_equal is deprecated. 
Please use tf.compat.v1.assert_less_equal instead.\n", "\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:358: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/modeling.py:358: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use keras.layers.Dense instead.\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use `layer.__call__` method instead.\n", - "WARNING:tensorflow:From :110: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :61: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/contrib/crf/python/ops/crf.py:213: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n", "Instructions for 
updating:\n", "Please use `keras.layers.RNN(cell)`, which is equivalent to this API\n", - "WARNING:tensorflow:From :145: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :101: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "dim is deprecated, use axis instead\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/optimization.py:27: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/optimization.py:27: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.\n", "\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/optimization.py:32: The name tf.train.polynomial_decay is deprecated. Please use tf.compat.v1.train.polynomial_decay instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/optimization.py:32: The name tf.train.polynomial_decay is deprecated. Please use tf.compat.v1.train.polynomial_decay instead.\n", "\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/optimization.py:70: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/optimization.py:70: The name tf.trainable_variables is deprecated. 
Please use tf.compat.v1.trainable_variables instead.\n", "\n" ] } @@ -662,7 +414,7 @@ "sess = tf.InteractiveSession()\n", "\n", "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "model = Model(learning_rate, hidden_size_word)\n", "sess.run(tf.global_variables_initializer())" @@ -670,14 +422,14 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:Restoring parameters from bert-base-v3/model.ckpt\n" + "INFO:tensorflow:Restoring parameters from bert-base-2020-03-19/model.ckpt-2000002\n" ] } ], @@ -689,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -705,16 +457,16 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.017699115, 0.026548672, 27.245255]" + "[0.01369863, 0.06849315, 32.680145]" ] }, - "execution_count": 23, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -729,16 +481,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.026548672, 0.03539823, 172.58093]" + "[0.02739726, 0.02739726, 152.1837]" ] }, - "execution_count": 24, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -753,106 +505,22 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([ 5, 5, 11, 5, 5, 5, 5, 5, 5, 5, 5, 17, 5, 5, 17, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", - " dtype=int32),\n", - " array([ 8, 8, 0, 12, 8, 3, 12, 12, 12, 3, 12, 9, 14, 12, 7, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),\n", - " array([0, 1, 2, 0, 2, 4, 0, 4, 0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0], dtype=int32))" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tags_seq, heads = sess.run(\n", - " [model.logits, model.heads_seq],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " },\n", - ")\n", - "tags_seq[0], heads[0], batch_depends[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 26, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [10:31<00:00, 2.00it/s, accuracy=0.971, accuracy_depends=0.6, cost=1.25] \n", - "test minibatch loop: 100%|██████████| 315/315 [01:23<00:00, 3.75it/s, accuracy=0.838, accuracy_depends=0.556, cost=1.77]\n", - "train minibatch loop: 0%| | 0/1260 [00:00?@[\\]^_`{|}~'\n", + "\n", + "def transformer_textcleaning(string):\n", + " \"\"\"\n", + " use by any transformer model before tokenization\n", + " \"\"\"\n", + " string = unidecode(string)\n", + " string = re.sub('\\\\(dot\\\\)', '.', string)\n", + " string = (\n", + " re.sub(re.findall(r'\\', string)[0], '', string)\n", + " if (len(re.findall(r'\\', string)) > 0)\n", + " and ('href' in re.findall(r'\\', string)[0])\n", + " else string\n", + " )\n", + " string = re.sub(\n", + " r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n", + " )\n", + " string = re.sub(r'[ ]+', ' ', string).strip().split()\n", + " string = [w for w in string if w[0] != '@']\n", + " string = ' '.join(string)\n", + " string = re.sub(f'([{PUNCTUATION}])', r' \\1 ', string)\n", + " string = re.sub('\\s{2,}', ' ', string)\n", + " original_string = string.split()\n", + " string = [\n", + " (original_string[no], word.title() if word.isupper() else word)\n", + " for no, word in enumerate(string.split())\n", + " if len(word)\n", + " ]\n", + " return [s[0] for s in string], [s[1] for s in string]\n", + "\n", + "def parse_X(left):\n", + " bert_tokens = ['[CLS]']\n", + " for no, 
orig_token in enumerate(left):\n", + " t = tokenizer.tokenize(orig_token)\n", + " bert_tokens.extend(t)\n", + " bert_tokens.append(\"[SEP]\")\n", + " t = tokenizer.convert_tokens_to_ids(bert_tokens)\n", + " return t, bert_tokens, [1] * len(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def dependency_graph(tagging, indexing):\n", + " \"\"\"\n", + " Return helper object for dependency parser results. Only accept tagging and indexing outputs from dependency models.\n", + " \"\"\"\n", + " result = []\n", + " for i in range(len(tagging)):\n", + " result.append(\n", + " '%d\\t%s\\t_\\t_\\t_\\t_\\t%d\\t%s\\t_\\t_'\n", + " % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])\n", + " )\n", + " return DependencyGraph('\\n'.join(result), top_relation_label='root')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 4, training loss: 0.638962, training acc: 0.996465, training depends: 0.879061, valid loss: 4.045756, valid acc: 0.978286, valid depends: 0.823873\n", - "\n" - ] - }, + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "2\n", + "2 (makan)\n", + "\n", + "\n", + "\n", + "0->2\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (husein)\n", + "\n", + "\n", + "\n", + "2->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "3\n", + "3 (ayam)\n", + "\n", + "\n", + "\n", + "2->3\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "string = 'husein makan ayam'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, bert_sequence, mask = parse_X(sequence)\n", + "h, t = 
sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Kuala)\n", + "\n", + "\n", + "\n", + "0->1\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Lumpur)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "13\n", + "13 (membidas)\n", + "\n", + "\n", + "\n", + "1->13\n", + "\n", + "\n", + "parataxis\n", + "\n", + "\n", + "\n", + "3\n", + "3 (:)\n", + "\n", + "\n", + "\n", + "2->3\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "4\n", + "4 (Ketua)\n", + "\n", + "\n", + "\n", + "13->4\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "14\n", + "14 (kenyataan)\n", + "\n", + "\n", + "\n", + "13->14\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "33\n", + "33 (melaksanakan)\n", + "\n", + "\n", + "\n", + "13->33\n", + "\n", + "\n", + "ccomp\n", + "\n", + "\n", + "\n", + "37\n", + "37 (.)\n", + "\n", + "\n", + "\n", + "13->37\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "39\n", + "39 (berkata)\n", + "\n", + "\n", + "\n", + "13->39\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "5\n", + "5 (Penerangan)\n", + "\n", + "\n", + "\n", + "4->5\n", + "\n", + "\n", + "flat\n", + 
"\n", + "\n", + "\n", + "7\n", + "7 (,)\n", + "\n", + "\n", + "\n", + "4->7\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "8\n", + "8 (Datuk)\n", + "\n", + "\n", + "\n", + "4->8\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "6\n", + "6 (Bersatu)\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "9\n", + "9 (Wan)\n", + "\n", + "\n", + "\n", + "8->9\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "10\n", + "10 (Saiful)\n", + "\n", + "\n", + "\n", + "9->10\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "11\n", + "11 (Wan)\n", + "\n", + "\n", + "\n", + "10->11\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "12\n", + "12 (Jan)\n", + "\n", + "\n", + "\n", + "11->12\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "15\n", + "15 (Datuk)\n", + "\n", + "\n", + "\n", + "14->15\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "20\n", + "20 (Ketua)\n", + "\n", + "\n", + "\n", + "14->20\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "34\n", + "34 (sekatan)\n", + "\n", + "\n", + "\n", + "33->34\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "38\n", + "38 (Beliau)\n", + "\n", + "\n", + "\n", + "39->38\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "41\n", + "41 (Najib)\n", + "\n", + "\n", + "\n", + "39->41\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "40\n", + "40 (,)\n", + "\n", + "\n", + "\n", + "39->40\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "61\n", + "61 (.)\n", + "\n", + "\n", + "\n", + "39->61\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "46\n", + "46 (memetik)\n", + "\n", + "\n", + "\n", + "39->46\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "64\n", + "64 (berkata)\n", + "\n", + "\n", + "\n", + "39->64\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "16\n", + "16 (Seri)\n", + "\n", + "\n", + "\n", + "15->16\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", 
+ "\n", + "19\n", + "19 (dan)\n", + "\n", + "\n", + "\n", + "20->19\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "21\n", + "21 (Pemuda)\n", + "\n", + "\n", + "\n", + "20->21\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "24\n", + "24 (Datuk)\n", + "\n", + "\n", + "\n", + "20->24\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "17\n", + "17 (Najib)\n", + "\n", + "\n", + "\n", + "16->17\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "18\n", + "18 (Razak)\n", + "\n", + "\n", + "\n", + "17->18\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "22\n", + "22 (Umno)\n", + "\n", + "\n", + "\n", + "21->22\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "23\n", + "23 (,)\n", + "\n", + "\n", + "\n", + "24->23\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "25\n", + "25 (Dr)\n", + "\n", + "\n", + "\n", + "24->25\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "30\n", + "30 (mempertikaikan)\n", + "\n", + "\n", + "\n", + "24->30\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "26\n", + "26 (Asyraf)\n", + "\n", + "\n", + "\n", + "25->26\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "29\n", + "29 (yang)\n", + "\n", + "\n", + "\n", + "30->29\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "31\n", + "31 (tindakan)\n", + "\n", + "\n", + "\n", + "30->31\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "27\n", + "27 (Wajdi)\n", + "\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "28\n", + "28 (Dusuki)\n", + "\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "32\n", + "32 (kerajaan)\n", + "\n", + "\n", + "\n", + "31->32\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "35\n", + "35 (pergerakan)\n", + "\n", + "\n", + "\n", + "34->35\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "36\n", + "36 (penuh)\n", + "\n", + "\n", + "\n", + "35->36\n", + "\n", + "\n", + 
"amod\n", + "\n", + "\n", + "\n", + "43\n", + "43 (Asyraf)\n", + "\n", + "\n", + "\n", + "41->43\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "63\n", + "63 (Saiful)\n", + "\n", + "\n", + "\n", + "61->63\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "45\n", + "45 (sengaja)\n", + "\n", + "\n", + "\n", + "46->45\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "47\n", + "47 (kenyataan)\n", + "\n", + "\n", + "\n", + "46->47\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (Wan)\n", + "\n", + "\n", + "\n", + "64->62\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "65\n", + "65 (,)\n", + "\n", + "\n", + "\n", + "64->65\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "83\n", + "83 (.)\n", + "\n", + "\n", + "\n", + "64->83\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "68\n", + "68 (menjangka)\n", + "\n", + "\n", + "\n", + "64->68\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "42\n", + "42 (dan)\n", + "\n", + "\n", + "\n", + "43->42\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "44\n", + "44 (Wajdi)\n", + "\n", + "\n", + "\n", + "43->44\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "48\n", + "48 (Perdana)\n", + "\n", + "\n", + "\n", + "47->48\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "49\n", + "49 (Menteri)\n", + "\n", + "\n", + "\n", + "48->49\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "50\n", + "50 (,)\n", + "\n", + "\n", + "\n", + "48->50\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "51\n", + "51 (Tan)\n", + "\n", + "\n", + "\n", + "48->51\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "57\n", + "57 (lengkap)\n", + "\n", + "\n", + "\n", + "48->57\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "52\n", + "52 (Sri)\n", + "\n", + "\n", + "\n", + "51->52\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "55\n", + "55 (yang)\n", + "\n", + "\n", + "\n", + "57->55\n", + "\n", + 
"\n", + "nsubj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (tidak)\n", + "\n", + "\n", + "\n", + "57->56\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "59\n", + "59 (mengelirukan)\n", + "\n", + "\n", + "\n", + "57->59\n", + "\n", + "\n", + "ccomp\n", + "\n", + "\n", + "\n", + "53\n", + "53 (Muhyiddin)\n", + "\n", + "\n", + "\n", + "52->53\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "54\n", + "54 (Yassin)\n", + "\n", + "\n", + "\n", + "54->54\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "58\n", + "58 (untuk)\n", + "\n", + "\n", + "\n", + "59->58\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "60\n", + "60 (rakyat)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "66\n", + "66 (beliau)\n", + "\n", + "\n", + "\n", + "68->66\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "69\n", + "69 (ada)\n", + "\n", + "\n", + "\n", + "68->69\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "67\n", + "67 (sudah)\n", + "\n", + "\n", + "\n", + "67->67\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "70\n", + "70 (kenyataan)\n", + "\n", + "\n", + "\n", + "69->70\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tulisan)\n", + "\n", + "\n", + "\n", + "69->75\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "71\n", + "71 (balas)\n", + "\n", + "\n", + "\n", + "70->71\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "73\n", + "73 (Najib)\n", + "\n", + "\n", + "\n", + "70->73\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "74\n", + "74 (mengenai)\n", + "\n", + "\n", + "\n", + "75->74\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "77\n", + "77 (berhubung)\n", + "\n", + "\n", + "\n", + "75->77\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "72\n", + "72 (daripada)\n", + "\n", + "\n", + "\n", + "73->72\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "76\n", + "76 (beliau)\n", + 
"\n", + "\n", + "\n", + "77->76\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "78\n", + "78 (kesan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "80\n", + "80 (sekatan)\n", + "\n", + "\n", + "\n", + "77->80\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "79\n", + "79 (positif)\n", + "\n", + "\n", + "\n", + "78->79\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "81\n", + "81 (pergerakan)\n", + "\n", + "\n", + "\n", + "80->81\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "82\n", + "82 (penuh)\n", + "\n", + "\n", + "\n", + "80->82\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "from tqdm import tqdm\n", - "\n", - "epoch = 5\n", - "\n", - "for e in range(epoch):\n", - " train_acc, train_loss = [], []\n", - " test_acc, test_loss = [], []\n", - " train_acc_depends, test_acc_depends = [], []\n", - " \n", - " pbar = tqdm(\n", - " range(0, len(train_X), batch_size), desc = 'train minibatch loop'\n", - " )\n", - " for i in pbar:\n", - " index = min(i + batch_size, len(train_X))\n", - " batch_x = train_X[i: index]\n", - " batch_x = pad_sequences(batch_x,padding='post')\n", - " batch_y = train_Y[i: index]\n", - " batch_y = pad_sequences(batch_y,padding='post')\n", - " batch_depends = train_depends[i: index]\n", - " batch_depends = pad_sequences(batch_depends,padding='post')\n", - " \n", - " acc_depends, acc, cost, _ = sess.run(\n", - " [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.types: batch_y,\n", - " model.heads: batch_depends,\n", - " model.switch: True\n", - " },\n", - " )\n", - " train_loss.append(cost)\n", - " train_acc.append(acc)\n", - " train_acc_depends.append(acc_depends)\n", - " pbar.set_postfix(cost = cost, accuracy = acc, 
accuracy_depends = acc_depends)\n", - " \n", - " pbar = tqdm(\n", - " range(0, len(test_X), batch_size), desc = 'test minibatch loop'\n", - " )\n", - " for i in pbar:\n", - " index = min(i + batch_size, len(test_X))\n", - " batch_x = test_X[i: index]\n", - " batch_x = pad_sequences(batch_x,padding='post')\n", - " batch_y = test_Y[i: index]\n", - " batch_y = pad_sequences(batch_y,padding='post')\n", - " batch_depends = test_depends[i: index]\n", - " batch_depends = pad_sequences(batch_depends,padding='post')\n", - " \n", - " acc_depends, acc, cost = sess.run(\n", - " [model.accuracy_depends, model.accuracy, model.cost],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.types: batch_y,\n", - " model.heads: batch_depends,\n", - " model.switch: True\n", - " },\n", - " )\n", - " test_loss.append(cost)\n", - " test_acc.append(acc)\n", - " test_acc_depends.append(acc_depends)\n", - " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", - " \n", - " \n", - " print(\n", - " 'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\\n'\n", - " % (e, np.mean(train_loss), \n", - " np.mean(train_acc), \n", - " np.mean(train_acc_depends), \n", - " np.mean(test_loss), \n", - " np.mean(test_acc), \n", - " np.mean(test_acc_depends)\n", - " ))" + "string = 'KUALA LUMPUR: Ketua Penerangan BERSATU, Datuk Wan Saiful Wan Jan membidas kenyataan Datuk Seri Najib Razak dan Ketua Pemuda UMNO, Datuk Dr Asyraf Wajdi Dusuki yang mempertikaikan tindakan kerajaan melaksanakan sekatan pergerakan penuh. Beliau berkata, Najib dan Asyraf Wajdi sengaja memetik kenyataan Perdana Menteri, Tan Sri Muhyiddin Yassin yang tidak lengkap untuk mengelirukan rakyat. 
Wan Saiful berkata, beliau sudah menjangka ada kenyataan balas daripada Najib mengenai tulisan beliau berhubung kesan positif sekatan pergerakan penuh.'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, bert_sequence, mask = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1391,7 +1987,7 @@ "'bert-base-dependency/model.ckpt'" ] }, - "execution_count": 28, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1403,17 +1999,9 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 24, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", - " warnings.warn('An interactive session is already active. 
This can '\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -1427,7 +2015,7 @@ "sess = tf.InteractiveSession()\n", "\n", "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "model = Model(learning_rate, hidden_size_word, training = False)\n", "\n", @@ -1438,7 +2026,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -1454,7 +2042,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1499,14 +2087,14 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 315/315 [01:18<00:00, 4.01it/s]\n" + "100%|██████████| 625/625 [01:08<00:00, 9.15it/s]\n" ] } ], @@ -1543,7 +2131,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1558,7 +2146,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1567,43 +2155,42 @@ "text": [ " precision recall f1-score support\n", "\n", - " PAD 0.99996 1.00000 0.99998 877864\n", - " X 1.00000 0.99986 0.99993 145204\n", - " acl 0.96111 0.96190 0.96150 6037\n", - " advcl 0.94287 0.93895 0.94091 2408\n", - " advmod 0.97171 0.96904 0.97037 9464\n", - " amod 0.96283 0.94008 0.95132 8128\n", - " appos 0.97426 0.95940 0.96677 4852\n", - " aux 1.00000 0.50000 0.66667 4\n", - " case 0.98907 0.98834 0.98870 21519\n", - " cc 0.98089 0.98708 0.98397 6500\n", - " ccomp 0.95515 0.92164 0.93810 855\n", - " compound 0.95432 0.96565 0.95995 13479\n", - "compound:plur 0.96507 0.97778 0.97138 1215\n", - " conj 0.96943 0.98036 0.97486 8604\n", - " cop 0.96407 0.98531 0.97457 1906\n", - " csubj 0.92157 0.85455 0.88679 55\n", - " csubj:pass 0.93750 0.78947 0.85714 19\n", - " dep 0.95199 0.93574 0.94380 996\n", 
- " det 0.97043 0.96678 0.96860 8248\n", - " fixed 0.94176 0.93672 0.93923 1122\n", - " flat 0.98010 0.98217 0.98113 20755\n", - " iobj 0.87500 0.80000 0.83582 35\n", - " mark 0.94507 0.97448 0.95955 2860\n", - " nmod 0.96363 0.95912 0.96137 8121\n", - " nsubj 0.97076 0.97091 0.97083 12788\n", - " nsubj:pass 0.95192 0.96362 0.95774 3986\n", - " nummod 0.98563 0.97942 0.98251 7773\n", - " obj 0.96915 0.97071 0.96993 10551\n", - " obl 0.97549 0.97164 0.97356 11389\n", - " parataxis 0.95038 0.90415 0.92669 699\n", - " punct 0.99752 0.99773 0.99762 33438\n", - " root 0.98046 0.98124 0.98085 10073\n", - " xcomp 0.95153 0.94749 0.94951 2590\n", + " PAD 0.99944 1.00000 0.99972 481710\n", + " X 1.00000 0.99690 0.99845 61945\n", + " acl 0.84049 0.83081 0.83562 3298\n", + " advcl 0.66176 0.62500 0.64286 1656\n", + " advmod 0.95277 0.94836 0.95056 6700\n", + " amod 0.91014 0.90522 0.90767 4621\n", + " appos 0.81969 0.77457 0.79650 3052\n", + " case 0.98062 0.98199 0.98130 11492\n", + " cc 0.98549 0.97787 0.98166 3750\n", + " ccomp 0.56966 0.48042 0.52125 383\n", + " compound 0.90827 0.93299 0.92047 11133\n", + "compound:plur 0.55882 0.63333 0.59375 30\n", + " conj 0.89626 0.88883 0.89253 5424\n", + " cop 0.96823 0.96661 0.96742 599\n", + " csubj 0.66667 0.40000 0.50000 10\n", + " csubj:pass 0.00000 0.00000 0.00000 1\n", + " dep 0.65768 0.67218 0.66485 363\n", + " det 0.94642 0.92567 0.93593 3969\n", + " fixed 0.93103 0.78947 0.85443 171\n", + " flat 0.96042 0.96700 0.96370 18393\n", + " iobj 0.00000 0.00000 0.00000 4\n", + " mark 0.91283 0.93645 0.92449 1778\n", + " nmod 0.86204 0.85472 0.85836 4591\n", + " nsubj 0.86476 0.86536 0.86506 7145\n", + " nsubj:pass 0.81694 0.79646 0.80657 2034\n", + " nummod 0.96463 0.95920 0.96191 4436\n", + " obj 0.90765 0.91968 0.91363 6412\n", + " obl 0.88052 0.86034 0.87031 5191\n", + " parataxis 0.43636 0.38710 0.41026 372\n", + " punct 0.99723 0.99574 0.99649 20643\n", + " root 0.89938 0.91530 0.90727 10000\n", + " xcomp 0.74105 0.77125 
0.75585 1718\n", "\n", - " accuracy 0.99562 1243537\n", - " macro avg 0.96396 0.93822 0.94823 1243537\n", - " weighted avg 0.99562 0.99562 0.99562 1243537\n", + " accuracy 0.98402 683024\n", + " macro avg 0.79679 0.77996 0.78684 683024\n", + " weighted avg 0.98390 0.98402 0.98394 683024\n", "\n" ] } @@ -1615,16 +2202,16 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "arc accuracy: 0.8554239102233114\n", - "types accuracy: 0.8481064607232274\n", - "root accuracy: 0.9203253968253969\n" + "arc accuracy: 0.8204592062639473\n", + "types accuracy: 0.7997014021904779\n", + "root accuracy: 0.9893670748261997\n" ] } ], @@ -1636,227 +2223,9 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 31, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Placeholder',\n", - " 'Placeholder_1',\n", - " 'Placeholder_2',\n", - " 'Placeholder_3',\n", - " 'W_d',\n", - " 'W_e',\n", - " 'U',\n", - " 'U-bi',\n", - " 'Wl',\n", - " 'Wr',\n", - " 'bert/embeddings/word_embeddings',\n", - " 'bert/embeddings/token_type_embeddings',\n", - " 'bert/embeddings/position_embeddings',\n", - " 'bert/embeddings/LayerNorm/gamma',\n", - " 'bert/encoder/layer_0/attention/self/query/kernel',\n", - " 'bert/encoder/layer_0/attention/self/query/bias',\n", - " 'bert/encoder/layer_0/attention/self/key/kernel',\n", - " 'bert/encoder/layer_0/attention/self/key/bias',\n", - " 'bert/encoder/layer_0/attention/self/value/kernel',\n", - " 'bert/encoder/layer_0/attention/self/value/bias',\n", - " 'bert/encoder/layer_0/attention/self/Softmax',\n", - " 'bert/encoder/layer_0/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_0/attention/output/dense/bias',\n", - " 'bert/encoder/layer_0/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_0/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_0/intermediate/dense/bias',\n", - " 
'bert/encoder/layer_0/output/dense/kernel',\n", - " 'bert/encoder/layer_0/output/dense/bias',\n", - " 'bert/encoder/layer_0/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_1/attention/self/query/kernel',\n", - " 'bert/encoder/layer_1/attention/self/query/bias',\n", - " 'bert/encoder/layer_1/attention/self/key/kernel',\n", - " 'bert/encoder/layer_1/attention/self/key/bias',\n", - " 'bert/encoder/layer_1/attention/self/value/kernel',\n", - " 'bert/encoder/layer_1/attention/self/value/bias',\n", - " 'bert/encoder/layer_1/attention/self/Softmax',\n", - " 'bert/encoder/layer_1/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_1/attention/output/dense/bias',\n", - " 'bert/encoder/layer_1/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_1/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_1/intermediate/dense/bias',\n", - " 'bert/encoder/layer_1/output/dense/kernel',\n", - " 'bert/encoder/layer_1/output/dense/bias',\n", - " 'bert/encoder/layer_1/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_2/attention/self/query/kernel',\n", - " 'bert/encoder/layer_2/attention/self/query/bias',\n", - " 'bert/encoder/layer_2/attention/self/key/kernel',\n", - " 'bert/encoder/layer_2/attention/self/key/bias',\n", - " 'bert/encoder/layer_2/attention/self/value/kernel',\n", - " 'bert/encoder/layer_2/attention/self/value/bias',\n", - " 'bert/encoder/layer_2/attention/self/Softmax',\n", - " 'bert/encoder/layer_2/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_2/attention/output/dense/bias',\n", - " 'bert/encoder/layer_2/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_2/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_2/intermediate/dense/bias',\n", - " 'bert/encoder/layer_2/output/dense/kernel',\n", - " 'bert/encoder/layer_2/output/dense/bias',\n", - " 'bert/encoder/layer_2/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_3/attention/self/query/kernel',\n", - " 'bert/encoder/layer_3/attention/self/query/bias',\n", - 
" 'bert/encoder/layer_3/attention/self/key/kernel',\n", - " 'bert/encoder/layer_3/attention/self/key/bias',\n", - " 'bert/encoder/layer_3/attention/self/value/kernel',\n", - " 'bert/encoder/layer_3/attention/self/value/bias',\n", - " 'bert/encoder/layer_3/attention/self/Softmax',\n", - " 'bert/encoder/layer_3/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_3/attention/output/dense/bias',\n", - " 'bert/encoder/layer_3/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_3/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_3/intermediate/dense/bias',\n", - " 'bert/encoder/layer_3/output/dense/kernel',\n", - " 'bert/encoder/layer_3/output/dense/bias',\n", - " 'bert/encoder/layer_3/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_4/attention/self/query/kernel',\n", - " 'bert/encoder/layer_4/attention/self/query/bias',\n", - " 'bert/encoder/layer_4/attention/self/key/kernel',\n", - " 'bert/encoder/layer_4/attention/self/key/bias',\n", - " 'bert/encoder/layer_4/attention/self/value/kernel',\n", - " 'bert/encoder/layer_4/attention/self/value/bias',\n", - " 'bert/encoder/layer_4/attention/self/Softmax',\n", - " 'bert/encoder/layer_4/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_4/attention/output/dense/bias',\n", - " 'bert/encoder/layer_4/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_4/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_4/intermediate/dense/bias',\n", - " 'bert/encoder/layer_4/output/dense/kernel',\n", - " 'bert/encoder/layer_4/output/dense/bias',\n", - " 'bert/encoder/layer_4/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_5/attention/self/query/kernel',\n", - " 'bert/encoder/layer_5/attention/self/query/bias',\n", - " 'bert/encoder/layer_5/attention/self/key/kernel',\n", - " 'bert/encoder/layer_5/attention/self/key/bias',\n", - " 'bert/encoder/layer_5/attention/self/value/kernel',\n", - " 'bert/encoder/layer_5/attention/self/value/bias',\n", - " 
'bert/encoder/layer_5/attention/self/Softmax',\n", - " 'bert/encoder/layer_5/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_5/attention/output/dense/bias',\n", - " 'bert/encoder/layer_5/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_5/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_5/intermediate/dense/bias',\n", - " 'bert/encoder/layer_5/output/dense/kernel',\n", - " 'bert/encoder/layer_5/output/dense/bias',\n", - " 'bert/encoder/layer_5/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_6/attention/self/query/kernel',\n", - " 'bert/encoder/layer_6/attention/self/query/bias',\n", - " 'bert/encoder/layer_6/attention/self/key/kernel',\n", - " 'bert/encoder/layer_6/attention/self/key/bias',\n", - " 'bert/encoder/layer_6/attention/self/value/kernel',\n", - " 'bert/encoder/layer_6/attention/self/value/bias',\n", - " 'bert/encoder/layer_6/attention/self/Softmax',\n", - " 'bert/encoder/layer_6/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_6/attention/output/dense/bias',\n", - " 'bert/encoder/layer_6/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_6/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_6/intermediate/dense/bias',\n", - " 'bert/encoder/layer_6/output/dense/kernel',\n", - " 'bert/encoder/layer_6/output/dense/bias',\n", - " 'bert/encoder/layer_6/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_7/attention/self/query/kernel',\n", - " 'bert/encoder/layer_7/attention/self/query/bias',\n", - " 'bert/encoder/layer_7/attention/self/key/kernel',\n", - " 'bert/encoder/layer_7/attention/self/key/bias',\n", - " 'bert/encoder/layer_7/attention/self/value/kernel',\n", - " 'bert/encoder/layer_7/attention/self/value/bias',\n", - " 'bert/encoder/layer_7/attention/self/Softmax',\n", - " 'bert/encoder/layer_7/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_7/attention/output/dense/bias',\n", - " 'bert/encoder/layer_7/attention/output/LayerNorm/gamma',\n", - " 
'bert/encoder/layer_7/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_7/intermediate/dense/bias',\n", - " 'bert/encoder/layer_7/output/dense/kernel',\n", - " 'bert/encoder/layer_7/output/dense/bias',\n", - " 'bert/encoder/layer_7/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_8/attention/self/query/kernel',\n", - " 'bert/encoder/layer_8/attention/self/query/bias',\n", - " 'bert/encoder/layer_8/attention/self/key/kernel',\n", - " 'bert/encoder/layer_8/attention/self/key/bias',\n", - " 'bert/encoder/layer_8/attention/self/value/kernel',\n", - " 'bert/encoder/layer_8/attention/self/value/bias',\n", - " 'bert/encoder/layer_8/attention/self/Softmax',\n", - " 'bert/encoder/layer_8/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_8/attention/output/dense/bias',\n", - " 'bert/encoder/layer_8/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_8/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_8/intermediate/dense/bias',\n", - " 'bert/encoder/layer_8/output/dense/kernel',\n", - " 'bert/encoder/layer_8/output/dense/bias',\n", - " 'bert/encoder/layer_8/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_9/attention/self/query/kernel',\n", - " 'bert/encoder/layer_9/attention/self/query/bias',\n", - " 'bert/encoder/layer_9/attention/self/key/kernel',\n", - " 'bert/encoder/layer_9/attention/self/key/bias',\n", - " 'bert/encoder/layer_9/attention/self/value/kernel',\n", - " 'bert/encoder/layer_9/attention/self/value/bias',\n", - " 'bert/encoder/layer_9/attention/self/Softmax',\n", - " 'bert/encoder/layer_9/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_9/attention/output/dense/bias',\n", - " 'bert/encoder/layer_9/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_9/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_9/intermediate/dense/bias',\n", - " 'bert/encoder/layer_9/output/dense/kernel',\n", - " 'bert/encoder/layer_9/output/dense/bias',\n", - " 'bert/encoder/layer_9/output/LayerNorm/gamma',\n", - " 
'bert/encoder/layer_10/attention/self/query/kernel',\n", - " 'bert/encoder/layer_10/attention/self/query/bias',\n", - " 'bert/encoder/layer_10/attention/self/key/kernel',\n", - " 'bert/encoder/layer_10/attention/self/key/bias',\n", - " 'bert/encoder/layer_10/attention/self/value/kernel',\n", - " 'bert/encoder/layer_10/attention/self/value/bias',\n", - " 'bert/encoder/layer_10/attention/self/Softmax',\n", - " 'bert/encoder/layer_10/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_10/attention/output/dense/bias',\n", - " 'bert/encoder/layer_10/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_10/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_10/intermediate/dense/bias',\n", - " 'bert/encoder/layer_10/output/dense/kernel',\n", - " 'bert/encoder/layer_10/output/dense/bias',\n", - " 'bert/encoder/layer_10/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_11/attention/self/query/kernel',\n", - " 'bert/encoder/layer_11/attention/self/query/bias',\n", - " 'bert/encoder/layer_11/attention/self/key/kernel',\n", - " 'bert/encoder/layer_11/attention/self/key/bias',\n", - " 'bert/encoder/layer_11/attention/self/value/kernel',\n", - " 'bert/encoder/layer_11/attention/self/value/bias',\n", - " 'bert/encoder/layer_11/attention/self/Softmax',\n", - " 'bert/encoder/layer_11/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_11/attention/output/dense/bias',\n", - " 'bert/encoder/layer_11/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_11/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_11/intermediate/dense/bias',\n", - " 'bert/encoder/layer_11/output/dense/kernel',\n", - " 'bert/encoder/layer_11/output/dense/bias',\n", - " 'bert/encoder/layer_11/output/LayerNorm/gamma',\n", - " 'bert/pooler/dense/kernel',\n", - " 'bert/pooler/dense/bias',\n", - " 'dense/kernel',\n", - " 'dense/bias',\n", - " 'dense_1/kernel',\n", - " 'dense_1/bias',\n", - " 'dense_2/kernel',\n", - " 'dense_2/bias',\n", - " 'dense_3/kernel',\n", - " 
'dense_3/bias',\n", - " 'heads_seq',\n", - " 'tags_seq',\n", - " 'transitions',\n", - " 'logits']" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "strings = ','.join(\n", " [\n", @@ -1874,13 +2243,12 @@ " and 'adam' not in n.name\n", " and 'gradients/bert' not in n.name\n", " ]\n", - ")\n", - "strings.split(',')" + ")" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1915,7 +2283,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1923,7 +2291,7 @@ "output_type": "stream", "text": [ "INFO:tensorflow:Restoring parameters from bert-base-dependency/model.ckpt\n", - "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use `tf.compat.v1.graph_util.convert_variables_to_constants`\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/graph_util_impl.py:277: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", @@ -1931,7 +2299,7 @@ "Use `tf.compat.v1.graph_util.extract_sub_graph`\n", "INFO:tensorflow:Froze 214 variables.\n", "INFO:tensorflow:Converted 214 variables to const ops.\n", - "2771 ops in the final graph.\n" + "2768 ops in the final graph.\n" ] } ], @@ -1941,182 +2309,56 @@ }, { "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "string = 'husein makan ayam'\n", - "\n", - "import re\n", - "\n", - "def entities_textcleaning(string, lowering = False):\n", - " 
\"\"\"\n", - " use by entities recognition, pos recognition and dependency parsing\n", - " \"\"\"\n", - " string = re.sub('[^A-Za-z0-9\\-\\/() ]+', ' ', string)\n", - " string = re.sub(r'[ ]+', ' ', string).strip()\n", - " original_string = string.split()\n", - " if lowering:\n", - " string = string.lower()\n", - " string = [\n", - " (original_string[no], word.title() if word.isupper() else word)\n", - " for no, word in enumerate(string.split())\n", - " if len(word)\n", - " ]\n", - " return [s[0] for s in string], [s[1] for s in string]\n", - "\n", - "def parse_X(left):\n", - " bert_tokens = ['[CLS]']\n", - " for no, orig_token in enumerate(left):\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " bert_tokens.append(\"[SEP]\")\n", - " return tokenizer.convert_tokens_to_ids(bert_tokens), bert_tokens\n", - "\n", - "sequence = entities_textcleaning(string)[1]\n", - "parsed_sequence, bert_sequence = parse_X(sequence)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "def merge_sentencepiece_tokens_tagging(x, y):\n", - " new_paired_tokens = []\n", - " n_tokens = len(x)\n", - " rejected = ['[CLS]', '[SEP]']\n", - "\n", - " i = 0\n", - "\n", - " while i < n_tokens:\n", - "\n", - " current_token, current_label = x[i], y[i]\n", - " if not current_token.startswith('▁') and current_token not in rejected:\n", - " previous_token, previous_label = new_paired_tokens.pop()\n", - " merged_token = previous_token\n", - " merged_label = [previous_label]\n", - " while (\n", - " not current_token.startswith('▁')\n", - " and current_token not in rejected\n", - " ):\n", - " merged_token = merged_token + current_token.replace('▁', '')\n", - " merged_label.append(current_label)\n", - " i = i + 1\n", - " current_token, current_label = x[i], y[i]\n", - " merged_label = merged_label[0]\n", - " new_paired_tokens.append((merged_token, merged_label))\n", - "\n", - " else:\n", - " 
new_paired_tokens.append((current_token, current_label))\n", - " i = i + 1\n", - "\n", - " words = [\n", - " i[0].replace('▁', '')\n", - " for i in new_paired_tokens\n", - " if i[0] not in rejected\n", - " ]\n", - " labels = [i[1] for i in new_paired_tokens if i[0] not in rejected]\n", - " return words, labels" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", - " warnings.warn('An interactive session is already active. This can '\n" - ] - } - ], - "source": [ - "def load_graph(frozen_graph_filename):\n", - " with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n", - " graph_def = tf.GraphDef()\n", - " graph_def.ParseFromString(f.read())\n", - " with tf.Graph().as_default() as graph:\n", - " tf.import_graph_def(graph_def)\n", - " return graph\n", - "\n", - "g = load_graph('bert-base-dependency/frozen_model.pb')\n", - "x = g.get_tensor_by_name('import/Placeholder:0')\n", - "heads_seq = g.get_tensor_by_name('import/heads_seq:0')\n", - "tags_seq = g.get_tensor_by_name('import/logits:0')\n", - "test_sess = tf.InteractiveSession(graph = g)" - ] - }, - { - "cell_type": "code", - "execution_count": 42, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "h, t = test_sess.run([heads_seq, tags_seq],\n", - " feed_dict = {\n", - " x: [parsed_sequence],\n", - " },\n", - ")\n", - "h = h[0] - 1\n", - "t = [idx2tag[d] for d in t[0]]\n", - "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", - "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": 
{}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('husein', 2), ('makan', 0), ('ayam', 2)]\n" - ] - } - ], - "source": [ - "print(list(zip(merged_h[0], merged_h[1])))" + "transforms = ['add_default_attributes',\n", + " 'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n", + " 'fold_batch_norms',\n", + " 'fold_old_batch_norms',\n", + " 'quantize_weights(fallback_min=-10, fallback_max=10)',\n", + " 'strip_unused_nodes',\n", + " 'sort_by_execution_order']" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[('husein', 'amod'), ('makan', 'root'), ('ayam', 'obj')]\n" + "WARNING:tensorflow:From :6: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.gfile.GFile.\n" ] } ], "source": [ - "print(list(zip(merged_t[0], merged_t[1])))" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", + "from tensorflow.tools.graph_transforms import TransformGraph\n", + "tf.set_random_seed(0)\n", + "\n", + "pb = 'bert-base-dependency/frozen_model.pb'\n", + "input_graph_def = tf.GraphDef()\n", + "with tf.gfile.FastGFile(pb, 'rb') as f:\n", + " input_graph_def.ParseFromString(f.read())\n", "\n", - "bucketName = 'huseinhouse-storage'\n", - "Key = 'bert-base-dependency/frozen_model.pb'\n", - "outPutname = \"v34/dependency/bert-base-dependency.pb\"\n", + "if 'bert' in pb:\n", + " inputs = ['Placeholder']\n", + " a = ['dense/BiasAdd']\n", + "if 'xlnet' in pb:\n", + " inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n", + " a = ['transpose_3']\n", "\n", - "s3 = boto3.client('s3')\n", + "transformed_graph_def = TransformGraph(input_graph_def, \n", + " inputs,\n", + " ['logits', 'heads_seq'] + a, transforms)\n", "\n", - 
"s3.upload_file(Key,bucketName,outPutname)" + "with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n", + " f.write(transformed_graph_def.SerializeToString())" ] } ], @@ -2136,7 +2378,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/session/dependency/tiny-bert.ipynb b/session/dependency/tiny-bert.ipynb index 2e5db223..57226da6 100644 --- a/session/dependency/tiny-bert.ipynb +++ b/session/dependency/tiny-bert.ipynb @@ -14,389 +14,134 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [ - "with open('../Malaya-Dataset/dependency/gsd-ud-train.conllu.txt') as fopen:\n", - " corpus = fopen.read().split('\\n')\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-test.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-dev.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. 
Please use tf.compat.v1.train.Optimizer instead.\n", "\n" ] } ], "source": [ "import bert\n", - "from bert import run_classifier\n", "from bert import optimization\n", "from bert import tokenization\n", "from bert import modeling\n", + "import numpy as np\n", + "import json\n", "import tensorflow as tf\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import unicodedata\n", - "import six\n", - "from functools import partial\n", - "\n", - "SPIECE_UNDERLINE = '▁'\n", - "\n", - "def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):\n", - " if remove_space:\n", - " outputs = ' '.join(inputs.strip().split())\n", - " else:\n", - " outputs = inputs\n", - " outputs = outputs.replace(\"``\", '\"').replace(\"''\", '\"')\n", - "\n", - " if six.PY2 and isinstance(outputs, str):\n", - " outputs = outputs.decode('utf-8')\n", - "\n", - " if not keep_accents:\n", - " outputs = unicodedata.normalize('NFKD', outputs)\n", - " outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])\n", - " if lower:\n", - " outputs = outputs.lower()\n", - "\n", - " return outputs\n", - "\n", - "\n", - "def encode_pieces(sp_model, text, return_unicode=True, sample=False):\n", - " # return_unicode is used only for py2\n", - "\n", - " # note(zhiliny): in some systems, sentencepiece only accepts str for py2\n", - " if six.PY2 and isinstance(text, unicode):\n", - " text = text.encode('utf-8')\n", - "\n", - " if not sample:\n", - " pieces = sp_model.EncodeAsPieces(text)\n", - " else:\n", - " pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1)\n", - " new_pieces = []\n", - " for piece in pieces:\n", - " if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():\n", - " cur_pieces = sp_model.EncodeAsPieces(\n", - " piece[:-1].replace(SPIECE_UNDERLINE, ''))\n", - " if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:\n", - " if len(cur_pieces[0]) == 
1:\n", - " cur_pieces = cur_pieces[1:]\n", - " else:\n", - " cur_pieces[0] = cur_pieces[0][1:]\n", - " cur_pieces.append(piece[-1])\n", - " new_pieces.extend(cur_pieces)\n", - " else:\n", - " new_pieces.append(piece)\n", - "\n", - " # note(zhiliny): convert back to unicode for py2\n", - " if six.PY2 and return_unicode:\n", - " ret_pieces = []\n", - " for piece in new_pieces:\n", - " if isinstance(piece, str):\n", - " piece = piece.decode('utf-8')\n", - " ret_pieces.append(piece)\n", - " new_pieces = ret_pieces\n", - "\n", - " return new_pieces\n", - "\n", - "\n", - "def encode_ids(sp_model, text, sample=False):\n", - " pieces = encode_pieces(sp_model, text, return_unicode=False, sample=sample)\n", - " ids = [sp_model.PieceToId(piece) for piece in pieces]\n", - " return ids" + "import itertools\n", + "import collections\n", + "import re\n", + "import random\n", + "import sentencepiece as spm\n", + "from unidecode import unidecode\n", + "from sklearn.utils import shuffle\n", + "from tqdm import tqdm\n", + "from prepro_utils import preprocess_text, encode_ids, encode_pieces\n", + "from malaya.text.function import transformer_textcleaning as cleaning" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "import sentencepiece as spm\n", - "\n", "sp_model = spm.SentencePieceProcessor()\n", - "sp_model.Load('tiny-bert-v1/sp10m.cased.bert.model')\n", + "sp_model.Load('sp10m.cased.bert.model')\n", "\n", - "with open('tiny-bert-v1/sp10m.cased.bert.vocab') as fopen:\n", + "with open('sp10m.cased.bert.vocab') as fopen:\n", " v = fopen.read().split('\\n')[:-1]\n", "v = [i.split('\\t') for i in v]\n", "v = {i[0]: i[1] for i in v}\n", "\n", + "\n", "class Tokenizer:\n", - " def __init__(self, v):\n", + " def __init__(self, v, sp_model):\n", " self.vocab = v\n", - " pass\n", - " \n", + " self.sp_model = sp_model\n", + "\n", " def tokenize(self, string):\n", - " return encode_pieces(sp_model, string, 
return_unicode=False, sample=False)\n", - " \n", + " return encode_pieces(\n", + " self.sp_model, string, return_unicode = False, sample = False\n", + " )\n", + "\n", " def convert_tokens_to_ids(self, tokens):\n", - " return [sp_model.PieceToId(piece) for piece in tokens]\n", - " \n", + " return [self.sp_model.PieceToId(piece) for piece in tokens]\n", + "\n", " def convert_ids_to_tokens(self, ids):\n", - " return [sp_model.IdToPiece(i) for i in ids]\n", - " \n", - "tokenizer = Tokenizer(v)" + " return [self.sp_model.IdToPiece(i) for i in ids]\n", + "\n", + "\n", + "tokenizer = Tokenizer(v, sp_model)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "tag2idx = {'PAD': 0, 'X': 1}\n", - "tag_idx = 2\n", + "import pickle\n", "\n", - "def process_corpus(corpus, until = None):\n", - " global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx\n", - " sentences, words, depends, labels, pos, sequences = [], [], [], [], [], []\n", - " temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []\n", - " first_time = True\n", - " for sentence in corpus:\n", - " try:\n", - " if len(sentence):\n", - " if sentence[0] == '#':\n", - " continue\n", - " if first_time:\n", - " print(sentence)\n", - " first_time = False\n", - " sentence = sentence.split('\\t')\n", - " if sentence[7] not in tag2idx:\n", - " tag2idx[sentence[7]] = tag_idx\n", - " tag_idx += 1\n", - " temp_word.append(sentence[1])\n", - " temp_depend.append(int(sentence[6]) + 1)\n", - " temp_label.append(tag2idx[sentence[7]])\n", - " temp_sentence.append(sentence[1])\n", - " temp_pos.append(sentence[3])\n", - " else:\n", - " if len(temp_sentence) < 2 or len(temp_word) != len(temp_label):\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " continue\n", - " bert_tokens = ['[CLS]']\n", - " labels_ = [0]\n", - " depends_ = [0]\n", - " seq_ = []\n", - " 
for no, orig_token in enumerate(temp_word):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " seq_.append(no + 1)\n", - " bert_tokens.append('[SEP]')\n", - " labels_.append(0)\n", - " depends_.append(0)\n", - " words.append(tokenizer.convert_tokens_to_ids(bert_tokens))\n", - " depends.append(depends_)\n", - " labels.append(labels_)\n", - " sentences.append(bert_tokens)\n", - " pos.append(temp_pos)\n", - " sequences.append(seq_)\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " except Exception as e:\n", - " print(e, sentence)\n", - " return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1], sequences[:-1]" + "with open('train_X.pkl', 'rb') as fopen:\n", + " train_X, train_Y, train_depends = pickle.load(fopen)\n", + " \n", + "with open('test_X.pkl', 'rb') as fopen:\n", + " test_X, test_Y, test_depends = pickle.load(fopen)\n", + " \n", + "with open('tags.pkl', 'rb') as fopen:\n", + " idx2tag, tag2idx = pickle.load(fopen)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "1\tSembungan\tsembungan\tPROPN\tX--\t_\t4\tnsubj\t_\tMorphInd=^sembungan_X--$\n" + "WARNING:tensorflow:From /home/husein/bert-standard/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. 
Please use tf.io.gfile.GFile instead.\n", + "\n" ] } ], "source": [ - "sentences, words, depends, labels, _, _ = process_corpus(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open('../Malaya-Dataset/dependency/augmented-dependency.json') as fopen:\n", - " augmented = json.load(fopen)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "text_augmented, depends_augmented, labels_augmented = [], [], []\n", - "\n", - "for a in augmented:\n", - " text_augmented.extend(a[0])\n", - " depends_augmented.extend(a[1])\n", - " labels_augmented.extend((np.array(a[2]) + 1).tolist())" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def parse_XY(texts, depends, labels):\n", - " outside, sentences, outside_depends, outside_labels = [], [], [], []\n", - " for no, text in enumerate(texts):\n", - " temp_depend = depends[no]\n", - " temp_label = labels[no]\n", - " s = text.split()\n", - " sentences.append(s)\n", - " bert_tokens = ['[CLS]']\n", - " labels_ = [0]\n", - " depends_ = [0]\n", - " for no, orig_token in enumerate(s):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " bert_tokens.append('[SEP]')\n", - " labels_.append(0)\n", - " depends_.append(0)\n", - " outside.append(tokenizer.convert_tokens_to_ids(bert_tokens))\n", - " outside_depends.append(depends_)\n", - " outside_labels.append(labels_)\n", - " return outside, sentences, outside_depends, outside_labels" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "outside, _, outside_depends, outside_labels = parse_XY(text_augmented, \n", - " 
depends_augmented, \n", - " labels_augmented)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "words.extend(outside)\n", - "depends.extend(outside_depends)\n", - "labels.extend(outside_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "idx2tag = {v:k for k, v in tag2idx.items()}" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "words_train, words_test, depends_train, depends_test, labels_train, labels_test \\\n", - "= train_test_split(words, depends, labels, test_size = 0.2)" + "bert_config = modeling.BertConfig.from_json_file(\n", + " 'tiny-bert-v1/config.json'\n", + ")" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "train_X = words_train\n", - "train_Y = labels_train\n", - "train_depends = depends_train\n", - "\n", - "test_X = words_test\n", - "test_Y = labels_test\n", - "test_depends = depends_test" + "BERT_INIT_CHKPNT = 'tiny-bert-v1/model.ckpt'" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "BERT_INIT_CHKPNT = 'tiny-bert-v1/model.ckpt'\n", - "BERT_CONFIG = 'tiny-bert-v1/config.json'" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:93: The name tf.gfile.GFile is deprecated. 
Please use tf.io.gfile.GFile instead.\n", - "\n" - ] - } - ], - "source": [ - "epoch = 30\n", + "epoch = 3\n", "batch_size = 32\n", "warmup_proportion = 0.1\n", "num_train_steps = int(len(train_X) / batch_size * epoch)\n", - "num_warmup_steps = int(num_train_steps * warmup_proportion)\n", - "bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)" + "num_warmup_steps = int(num_train_steps * warmup_proportion)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -429,8 +174,15 @@ " e = tf.expand_dims(tf.expand_dims(mask_e, 1), 2)\n", " output = output * d * e\n", " \n", - " return output\n", - " \n", + " return output" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ "class BiLinear:\n", " def __init__(self, left_features, right_features, out_features):\n", " self.left_features = left_features\n", @@ -455,8 +207,17 @@ " output = output + tf.matmul(input_left, tf.transpose(self.W_l))\\\n", " + tf.matmul(input_right, tf.transpose(self.W_r))\n", " \n", - " return tf.reshape(output, output_shape)\n", - " \n", + " return tf.reshape(output, output_shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "_NEG_INF = -1e9\n", + "\n", "class Model:\n", " def __init__(\n", " self,\n", @@ -488,14 +249,20 @@ " config=bert_config,\n", " is_training=training,\n", " input_ids=self.words,\n", + " input_mask=self.mask,\n", " use_one_hot_embeddings=False)\n", + " \n", " output_layer = model.get_sequence_output()\n", " \n", " arc_h = tf.nn.elu(self.arc_h(output_layer))\n", " arc_c = tf.nn.elu(self.arc_c(output_layer))\n", + " self._arc_h = arc_h\n", + " self._arc_c = arc_c\n", " \n", " type_h = tf.nn.elu(self.type_h(output_layer))\n", " type_c = tf.nn.elu(self.type_c(output_layer))\n", + " self._type_h = type_h\n", + " self._type_c = type_c\n", " \n", " out_arc = 
tf.squeeze(self.attention.forward(arc_h, arc_c, mask_d=self.mask, \n", " mask_e=self.mask), axis = 1)\n", @@ -514,6 +281,11 @@ " self.heads_seq = tf.argmax(decode_arc, axis = 1)\n", " self.heads_seq = tf.identity(self.heads_seq, name = 'heads_seq')\n", " \n", + "# self.decode_arc_t = tf.transpose(decode_arc, (0, 2, 1))\n", + "# sequence_loss_depends = tf.contrib.seq2seq.sequence_loss(logits = self.decode_arc_t,\n", + "# targets = self.heads,\n", + "# weights = mask)\n", + " \n", " t = tf.cast(tf.transpose(self.heads_seq), tf.int32)\n", " broadcasted = tf.broadcast_to(batch_index, tf.shape(t))\n", " concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), \n", @@ -588,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -606,33 +378,33 @@ " * https://github.com/tensorflow/io (for I/O related ops)\n", "If you depend on functionality not listed there, please file an issue.\n", "\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:171: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/modeling.py:171: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n", "\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:490: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/modeling.py:490: The name tf.assert_less_equal is deprecated. 
Please use tf.compat.v1.assert_less_equal instead.\n", "\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:358: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/modeling.py:358: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use keras.layers.Dense instead.\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use `layer.__call__` method instead.\n", - "WARNING:tensorflow:From :110: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :61: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/contrib/crf/python/ops/crf.py:213: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n", "Instructions for 
updating:\n", "Please use `keras.layers.RNN(cell)`, which is equivalent to this API\n", - "WARNING:tensorflow:From :145: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :101: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "dim is deprecated, use axis instead\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/optimization.py:27: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/optimization.py:27: The name tf.train.get_or_create_global_step is deprecated. Please use tf.compat.v1.train.get_or_create_global_step instead.\n", "\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/optimization.py:32: The name tf.train.polynomial_decay is deprecated. Please use tf.compat.v1.train.polynomial_decay instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/optimization.py:32: The name tf.train.polynomial_decay is deprecated. Please use tf.compat.v1.train.polynomial_decay instead.\n", "\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/bert/optimization.py:70: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead.\n", + "WARNING:tensorflow:From /home/husein/bert-standard/bert/optimization.py:70: The name tf.trainable_variables is deprecated. 
Please use tf.compat.v1.trainable_variables instead.\n", "\n" ] } @@ -642,7 +414,7 @@ "sess = tf.InteractiveSession()\n", "\n", "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "model = Model(learning_rate, hidden_size_word)\n", "sess.run(tf.global_variables_initializer())" @@ -650,7 +422,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -669,7 +441,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -685,16 +457,16 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.026415095, 0.00754717, 39.253395]" + "[0.01369863, 0.09589041, 30.746227]" ] }, - "execution_count": 22, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -709,16 +481,16 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.01509434, 0.01509434, 528.8345]" + "[0.01369863, 0.09589041, 157.3611]" ] }, - "execution_count": 23, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -733,63 +505,32 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([29, 19, 19, 13, 19, 13, 12, 13, 19, 19, 12, 19, 19, 12, 12, 19, 12,\n", - " 19, 19, 12, 13, 19, 13, 19, 12, 19, 29, 29, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),\n", - " array([ 1, 26, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0]),\n", - " array([ 0, 4, 0, 2, 0, 1, 0, 4, 0, 0, 5, 0, 0, 0, 5, 0, 0,\n", - " 7, 0, 10, 7, 0, 10, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32))" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tags_seq, heads = sess.run(\n", - " [model.logits, model.heads_seq],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " },\n", - ")\n", - "tags_seq[0], heads[0], batch_depends[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 25, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:54<00:00, 4.27it/s, accuracy=0.659, accuracy_depends=0.549, cost=3.14]\n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.38it/s, accuracy=0.725, accuracy_depends=0.499, cost=2.45]\n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:33, 5.89it/s, accuracy=0.716, accuracy_depends=0.475, cost=2.77]" + "train minibatch loop: 8%|▊ | 8228/97788 [21:47<3:47:18, 6.57it/s, accuracy=0.815, accuracy_depends=0.494, cost=1.85] IOPub message rate exceeded.\n", + "The notebook server will temporarily stop sending output\n", + "to the client in order to avoid crashing it.\n", + "To change this limit, set the config variable\n", + "`--NotebookApp.iopub_msg_rate_limit`.\n", + "\n", + "Current values:\n", + "NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n", + "NotebookApp.rate_limit_window=3.0 (secs)\n", + "\n", + "train minibatch loop: 100%|██████████| 97788/97788 [4:20:01<00:00, 6.27it/s, accuracy=0.884, accuracy_depends=0.755, cost=0.702] \n", + "test minibatch loop: 100%|██████████| 313/313 [00:36<00:00, 8.65it/s, accuracy=0.89, accuracy_depends=0.741, cost=0.799] \n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ - "epoch: 0, training loss: 5.789861, training acc: 0.452741, training depends: 0.414746, valid loss: 2.697248, valid acc: 0.692594, valid depends: 0.480903\n", + "epoch: 0, training loss: 1.250804, training acc: 0.847605, training depends: 0.668640, valid loss: 0.735416, valid acc: 0.884954, valid depends: 0.744215\n", "\n" ] }, @@ -797,33 +538,116 @@ "name": "stderr", "output_type": "stream", "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:53<00:00, 4.29it/s, accuracy=0.805, accuracy_depends=0.549, cost=2.5] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.37it/s, accuracy=0.816, accuracy_depends=0.559, cost=1.86]\n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:44, 5.60it/s, accuracy=0.816, accuracy_depends=0.503, cost=2.11]" + "train minibatch loop: 100%|██████████| 97788/97788 [4:17:28<00:00, 6.33it/s, accuracy=0.886, accuracy_depends=0.771, cost=0.613] \n", + "test minibatch loop: 100%|██████████| 313/313 [00:35<00:00, 8.70it/s, accuracy=0.89, accuracy_depends=0.743, cost=0.689] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "epoch: 1, training loss: 2.288419, training acc: 0.767721, training depends: 0.503435, valid loss: 2.034172, valid acc: 0.806141, valid depends: 0.522205\n", + "epoch: 1, training loss: 0.664987, training acc: 0.888118, training depends: 0.757840, valid loss: 0.641807, valid acc: 0.889845, valid depends: 0.765465\n", "\n" ] - }, + } + ], + "source": [ + "from tqdm import tqdm\n", + "\n", + "epoch = 2\n", + "\n", + "for e in range(epoch):\n", + " train_acc, train_loss = [], []\n", + " test_acc, test_loss = [], []\n", + " train_acc_depends, test_acc_depends = [], []\n", + " \n", + " pbar = tqdm(\n", + " range(0, len(train_X), batch_size), desc = 'train minibatch loop'\n", + " )\n", + " for i in pbar:\n", + " index = min(i + batch_size, len(train_X))\n", + " batch_x = train_X[i: index]\n", + " batch_x = pad_sequences(batch_x,padding='post')\n", + " batch_y = 
train_Y[i: index]\n", + " batch_y = pad_sequences(batch_y,padding='post')\n", + " batch_depends = train_depends[i: index]\n", + " batch_depends = pad_sequences(batch_depends,padding='post')\n", + " \n", + " if batch_x.shape == batch_y.shape:\n", + " acc_depends, acc, cost, _ = sess.run(\n", + " [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],\n", + " feed_dict = {\n", + " model.words: batch_x,\n", + " model.types: batch_y,\n", + " model.heads: batch_depends,\n", + " model.switch: False\n", + " },\n", + " )\n", + " train_loss.append(cost)\n", + " train_acc.append(acc)\n", + " train_acc_depends.append(acc_depends)\n", + " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", + " \n", + " pbar = tqdm(\n", + " range(0, len(test_X), batch_size), desc = 'test minibatch loop'\n", + " )\n", + " for i in pbar:\n", + " index = min(i + batch_size, len(test_X))\n", + " batch_x = test_X[i: index]\n", + " batch_x = pad_sequences(batch_x,padding='post')\n", + " batch_y = test_Y[i: index]\n", + " batch_y = pad_sequences(batch_y,padding='post')\n", + " batch_depends = test_depends[i: index]\n", + " batch_depends = pad_sequences(batch_depends,padding='post')\n", + " \n", + " if batch_x.shape == batch_y.shape:\n", + " acc_depends, acc, cost = sess.run(\n", + " [model.accuracy_depends, model.accuracy, model.cost],\n", + " feed_dict = {\n", + " model.words: batch_x,\n", + " model.types: batch_y,\n", + " model.heads: batch_depends,\n", + " model.switch: False\n", + " },\n", + " )\n", + " test_loss.append(cost)\n", + " test_acc.append(acc)\n", + " test_acc_depends.append(acc_depends)\n", + " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", + " \n", + " \n", + " print(\n", + " 'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\\n'\n", + " % (e, np.mean(train_loss), \n", + " np.mean(train_acc), \n", + " np.mean(train_acc_depends), \n", + " 
np.mean(test_loss), \n", + " np.mean(test_acc), \n", + " np.mean(test_acc_depends)\n", + " ))\n", + " \n", + " saver = tf.train.Saver(tf.trainable_variables())\n", + " saver.save(sess, 'tiny-bert-dependency/model.ckpt')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:53<00:00, 4.29it/s, accuracy=0.854, accuracy_depends=0.646, cost=2.1] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.38it/s, accuracy=0.846, accuracy_depends=0.585, cost=1.6] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:26, 6.09it/s, accuracy=0.852, accuracy_depends=0.539, cost=1.78]" + "train minibatch loop: 100%|██████████| 97788/97788 [4:22:12<00:00, 6.22it/s, accuracy=0.886, accuracy_depends=0.788, cost=0.576] \n", + "test minibatch loop: 100%|██████████| 313/313 [00:36<00:00, 8.61it/s, accuracy=0.9, accuracy_depends=0.778, cost=0.629] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "epoch: 2, training loss: 1.842319, training acc: 0.825247, training depends: 0.546368, valid loss: 1.727046, valid acc: 0.836843, valid depends: 0.564716\n", + "epoch: 0, training loss: 0.602583, training acc: 0.890296, training depends: 0.772322, valid loss: 0.611718, valid acc: 0.891331, valid depends: 0.773028\n", "\n" ] }, @@ -831,294 +655,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:53<00:00, 4.29it/s, accuracy=0.854, accuracy_depends=0.561, cost=2.07]\n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.36it/s, accuracy=0.867, accuracy_depends=0.595, cost=1.45]\n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:37, 5.79it/s, accuracy=0.849, accuracy_depends=0.562, cost=1.62]" + "train minibatch loop: 100%|██████████| 97788/97788 [4:07:31<00:00, 6.58it/s, accuracy=0.891, accuracy_depends=0.783, cost=0.613] \n", + "test 
minibatch loop: 100%|██████████| 313/313 [00:34<00:00, 9.09it/s, accuracy=0.885, accuracy_depends=0.788, cost=0.621]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "epoch: 3, training loss: 1.606321, training acc: 0.843694, training depends: 0.584820, valid loss: 1.566533, valid acc: 0.847156, valid depends: 0.592860\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:54<00:00, 4.28it/s, accuracy=0.841, accuracy_depends=0.585, cost=1.95]\n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.37it/s, accuracy=0.864, accuracy_depends=0.632, cost=1.38]\n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:33, 5.89it/s, accuracy=0.861, accuracy_depends=0.587, cost=1.53]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 4, training loss: 1.464989, training acc: 0.853639, training depends: 0.611301, valid loss: 1.459764, valid acc: 0.855135, valid depends: 0.613328\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:53<00:00, 4.29it/s, accuracy=0.878, accuracy_depends=0.622, cost=1.7] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.36it/s, accuracy=0.87, accuracy_depends=0.634, cost=1.33] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:32, 5.92it/s, accuracy=0.868, accuracy_depends=0.596, cost=1.45]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 5, training loss: 1.367900, training acc: 0.860362, training depends: 0.629246, valid loss: 1.385906, valid acc: 0.860289, valid depends: 0.627439\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:53<00:00, 4.29it/s, accuracy=0.866, accuracy_depends=0.61, cost=1.68] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.37it/s, 
accuracy=0.868, accuracy_depends=0.653, cost=1.27]\n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:40, 5.72it/s, accuracy=0.878, accuracy_depends=0.616, cost=1.34]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 6, training loss: 1.294124, training acc: 0.865166, training depends: 0.643810, valid loss: 1.331457, valid acc: 0.863615, valid depends: 0.637117\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:53<00:00, 4.29it/s, accuracy=0.866, accuracy_depends=0.585, cost=1.66] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.38it/s, accuracy=0.874, accuracy_depends=0.666, cost=1.2] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:35, 5.83it/s, accuracy=0.879, accuracy_depends=0.63, cost=1.31]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 7, training loss: 1.232699, training acc: 0.868785, training depends: 0.655266, valid loss: 1.282819, valid acc: 0.866522, valid depends: 0.646933\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:53<00:00, 4.30it/s, accuracy=0.866, accuracy_depends=0.72, cost=1.47] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.37it/s, accuracy=0.885, accuracy_depends=0.676, cost=1.12] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:33, 5.91it/s, accuracy=0.876, accuracy_depends=0.63, cost=1.3]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 8, training loss: 1.182210, training acc: 0.872922, training depends: 0.665300, valid loss: 1.252104, valid acc: 0.869890, valid depends: 0.652689\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:49<00:00, 4.36it/s, accuracy=0.878, accuracy_depends=0.61, cost=1.43] \n", - "test minibatch 
loop: 100%|██████████| 315/315 [00:58<00:00, 5.42it/s, accuracy=0.882, accuracy_depends=0.66, cost=1.15] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:35, 5.84it/s, accuracy=0.874, accuracy_depends=0.63, cost=1.28]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 9, training loss: 1.138188, training acc: 0.875656, training depends: 0.674430, valid loss: 1.213842, valid acc: 0.872358, valid depends: 0.660089\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:50<00:00, 4.33it/s, accuracy=0.89, accuracy_depends=0.671, cost=1.43] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:57<00:00, 5.44it/s, accuracy=0.887, accuracy_depends=0.681, cost=1.06] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:31, 5.96it/s, accuracy=0.883, accuracy_depends=0.647, cost=1.2]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 10, training loss: 1.100374, training acc: 0.878828, training depends: 0.682079, valid loss: 1.187184, valid acc: 0.874205, valid depends: 0.667161\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:50<00:00, 4.34it/s, accuracy=0.878, accuracy_depends=0.659, cost=1.42] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:57<00:00, 5.46it/s, accuracy=0.878, accuracy_depends=0.686, cost=1.08] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:40, 5.72it/s, accuracy=0.889, accuracy_depends=0.662, cost=1.19]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 11, training loss: 1.066078, training acc: 0.881652, training depends: 0.688804, valid loss: 1.156008, valid acc: 0.875828, valid depends: 0.674163\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:50<00:00, 4.34it/s, accuracy=0.89, 
accuracy_depends=0.659, cost=1.42] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:57<00:00, 5.45it/s, accuracy=0.89, accuracy_depends=0.693, cost=1.07] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:26, 6.09it/s, accuracy=0.886, accuracy_depends=0.651, cost=1.18]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 12, training loss: 1.036247, training acc: 0.883852, training depends: 0.695145, valid loss: 1.138222, valid acc: 0.877338, valid depends: 0.678166\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:51<00:00, 4.33it/s, accuracy=0.915, accuracy_depends=0.659, cost=1.32] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:57<00:00, 5.44it/s, accuracy=0.892, accuracy_depends=0.709, cost=1.03] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:48, 5.51it/s, accuracy=0.884, accuracy_depends=0.666, cost=1.12]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 13, training loss: 1.007638, training acc: 0.886176, training depends: 0.701007, valid loss: 1.117662, valid acc: 0.879050, valid depends: 0.680728\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:50<00:00, 4.33it/s, accuracy=0.866, accuracy_depends=0.695, cost=1.23] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.41it/s, accuracy=0.9, accuracy_depends=0.7, cost=0.974] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:44, 5.61it/s, accuracy=0.889, accuracy_depends=0.67, cost=1.13]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 14, training loss: 0.983746, training acc: 0.887991, training depends: 0.706029, valid loss: 1.111054, valid acc: 0.881190, valid depends: 0.681359\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 
100%|██████████| 1260/1260 [04:51<00:00, 4.33it/s, accuracy=0.915, accuracy_depends=0.695, cost=1.31] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:58<00:00, 5.42it/s, accuracy=0.889, accuracy_depends=0.7, cost=1.02] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:29, 6.02it/s, accuracy=0.892, accuracy_depends=0.673, cost=1.07]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 15, training loss: 0.960521, training acc: 0.889907, training depends: 0.710786, valid loss: 1.088889, valid acc: 0.882030, valid depends: 0.688486\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:39<00:00, 4.51it/s, accuracy=0.902, accuracy_depends=0.695, cost=1.3] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:54<00:00, 5.83it/s, accuracy=0.908, accuracy_depends=0.71, cost=0.902] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:26, 6.09it/s, accuracy=0.893, accuracy_depends=0.674, cost=1.07]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 16, training loss: 0.941127, training acc: 0.891492, training depends: 0.714844, valid loss: 1.078972, valid acc: 0.882973, valid depends: 0.689849\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:41<00:00, 4.48it/s, accuracy=0.89, accuracy_depends=0.659, cost=1.21] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:56<00:00, 5.56it/s, accuracy=0.897, accuracy_depends=0.704, cost=0.948]\n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:41, 5.69it/s, accuracy=0.886, accuracy_depends=0.692, cost=1.02]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 17, training loss: 0.922439, training acc: 0.892776, training depends: 0.718796, valid loss: 1.060917, valid acc: 0.884413, valid depends: 0.695307\n", - "\n" - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:34<00:00, 4.60it/s, accuracy=0.915, accuracy_depends=0.646, cost=1.28] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:53<00:00, 5.84it/s, accuracy=0.892, accuracy_depends=0.718, cost=0.977]\n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:17, 6.37it/s, accuracy=0.896, accuracy_depends=0.667, cost=1.05]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 18, training loss: 0.906757, training acc: 0.894139, training depends: 0.722021, valid loss: 1.048187, valid acc: 0.885442, valid depends: 0.698038\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:38<00:00, 4.53it/s, accuracy=0.89, accuracy_depends=0.659, cost=1.21] \n", - "test minibatch loop: 100%|██████████| 315/315 [00:53<00:00, 5.85it/s, accuracy=0.895, accuracy_depends=0.722, cost=0.961]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 19, training loss: 0.891351, training acc: 0.895233, training depends: 0.725029, valid loss: 1.037661, valid acc: 0.885347, valid depends: 0.699040\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ + "epoch: 1, training loss: 0.587275, training acc: 0.891074, training depends: 0.775791, valid loss: 0.609368, valid acc: 0.890627, valid depends: 0.773412\n", "\n" ] } @@ -1126,7 +671,7 @@ "source": [ "from tqdm import tqdm\n", "\n", - "epoch = 20\n", + "epoch = 1\n", "\n", "for e in range(epoch):\n", " train_acc, train_loss = [], []\n", @@ -1145,19 +690,20 @@ " batch_depends = train_depends[i: index]\n", " batch_depends = pad_sequences(batch_depends,padding='post')\n", " \n", - " acc_depends, acc, cost, _ = sess.run(\n", - " [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.types: batch_y,\n", - " 
model.heads: batch_depends,\n", - " model.switch: False\n", - " },\n", - " )\n", - " train_loss.append(cost)\n", - " train_acc.append(acc)\n", - " train_acc_depends.append(acc_depends)\n", - " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", + " if batch_x.shape == batch_y.shape:\n", + " acc_depends, acc, cost, _ = sess.run(\n", + " [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],\n", + " feed_dict = {\n", + " model.words: batch_x,\n", + " model.types: batch_y,\n", + " model.heads: batch_depends,\n", + " model.switch: True\n", + " },\n", + " )\n", + " train_loss.append(cost)\n", + " train_acc.append(acc)\n", + " train_acc_depends.append(acc_depends)\n", + " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", " \n", " pbar = tqdm(\n", " range(0, len(test_X), batch_size), desc = 'test minibatch loop'\n", @@ -1171,19 +717,20 @@ " batch_depends = test_depends[i: index]\n", " batch_depends = pad_sequences(batch_depends,padding='post')\n", " \n", - " acc_depends, acc, cost = sess.run(\n", - " [model.accuracy_depends, model.accuracy, model.cost],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.types: batch_y,\n", - " model.heads: batch_depends,\n", - " model.switch: False\n", - " },\n", - " )\n", - " test_loss.append(cost)\n", - " test_acc.append(acc)\n", - " test_acc_depends.append(acc_depends)\n", - " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", + " if batch_x.shape == batch_y.shape:\n", + " acc_depends, acc, cost = sess.run(\n", + " [model.accuracy_depends, model.accuracy, model.cost],\n", + " feed_dict = {\n", + " model.words: batch_x,\n", + " model.types: batch_y,\n", + " model.heads: batch_depends,\n", + " model.switch: True\n", + " },\n", + " )\n", + " test_loss.append(cost)\n", + " test_acc.append(acc)\n", + " test_acc_depends.append(acc_depends)\n", + " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = 
acc_depends)\n", " \n", " \n", " print(\n", @@ -1194,219 +741,1298 @@ " np.mean(test_loss), \n", " np.mean(test_acc), \n", " np.mean(test_acc_depends)\n", - " ))" + " ))\n", + " \n", + " saver = tf.train.Saver(tf.trainable_variables())\n", + " saver.save(sess, 'tiny-bert-dependency/model.ckpt')" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 18, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:38<00:00, 4.52it/s, accuracy=0.915, accuracy_depends=0.659, cost=22.5]\n", - "test minibatch loop: 100%|██████████| 315/315 [00:53<00:00, 5.84it/s, accuracy=0.926, accuracy_depends=0.716, cost=9.6] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:09, 6.63it/s, accuracy=0.932, accuracy_depends=0.671, cost=11.2]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 0, training loss: 10.320434, training acc: 0.924881, training depends: 0.717967, valid loss: 12.149561, valid acc: 0.915361, valid depends: 0.686787\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:37<00:00, 4.54it/s, accuracy=0.927, accuracy_depends=0.646, cost=17.4]\n", - "test minibatch loop: 100%|██████████| 315/315 [00:54<00:00, 5.83it/s, accuracy=0.925, accuracy_depends=0.691, cost=10.2]\n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:20, 6.27it/s, accuracy=0.924, accuracy_depends=0.648, cost=11.3]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 1, training loss: 9.671070, training acc: 0.929382, training depends: 0.712335, valid loss: 11.713265, valid acc: 0.917885, valid depends: 0.683404\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:38<00:00, 4.53it/s, accuracy=0.951, accuracy_depends=0.646, cost=16.5]\n", - "test minibatch loop: 
100%|██████████| 315/315 [00:53<00:00, 5.86it/s, accuracy=0.938, accuracy_depends=0.68, cost=8.52] \n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:27, 6.07it/s, accuracy=0.929, accuracy_depends=0.646, cost=10.6]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 2, training loss: 9.260620, training acc: 0.932450, training depends: 0.708046, valid loss: 11.539114, valid acc: 0.919409, valid depends: 0.678840\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:37<00:00, 4.54it/s, accuracy=0.963, accuracy_depends=0.646, cost=14.3]\n", - "test minibatch loop: 100%|██████████| 315/315 [00:54<00:00, 5.83it/s, accuracy=0.937, accuracy_depends=0.691, cost=8.97]\n", - "train minibatch loop: 0%| | 1/1260 [00:00<03:22, 6.22it/s, accuracy=0.929, accuracy_depends=0.654, cost=10.7]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 3, training loss: 8.946695, training acc: 0.934782, training depends: 0.705428, valid loss: 11.285476, valid acc: 0.921092, valid depends: 0.676844\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "train minibatch loop: 100%|██████████| 1260/1260 [04:38<00:00, 4.52it/s, accuracy=0.939, accuracy_depends=0.659, cost=21.7]\n", - "test minibatch loop: 100%|██████████| 315/315 [00:53<00:00, 5.87it/s, accuracy=0.941, accuracy_depends=0.691, cost=8.72]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 4, training loss: 8.691786, training acc: 0.936757, training depends: 0.703728, valid loss: 11.110484, valid acc: 0.922271, valid depends: 0.675406\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], + "source": [ + "def merge_sentencepiece_tokens_tagging(x, y):\n", + " new_paired_tokens = []\n", + " n_tokens = len(x)\n", + " rejected = ['[CLS]', '[SEP]']\n", + "\n", + " i 
= 0\n", + "\n", + " while i < n_tokens:\n", + "\n", + " current_token, current_label = x[i], y[i]\n", + " if not current_token.startswith('▁') and current_token not in rejected:\n", + " previous_token, previous_label = new_paired_tokens.pop()\n", + " merged_token = previous_token\n", + " merged_label = [previous_label]\n", + " while (\n", + " not current_token.startswith('▁')\n", + " and current_token not in rejected\n", + " ):\n", + " merged_token = merged_token + current_token.replace('▁', '')\n", + " merged_label.append(current_label)\n", + " i = i + 1\n", + " current_token, current_label = x[i], y[i]\n", + " merged_label = merged_label[0]\n", + " new_paired_tokens.append((merged_token, merged_label))\n", + "\n", + " else:\n", + " new_paired_tokens.append((current_token, current_label))\n", + " i = i + 1\n", + "\n", + " words = [\n", + " i[0].replace('▁', '')\n", + " for i in new_paired_tokens\n", + " if i[0] not in rejected\n", + " ]\n", + " labels = [i[1] for i in new_paired_tokens if i[0] not in rejected]\n", + " return words, labels" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], "source": [ - "from tqdm import tqdm\n", + "import re\n", + "from unidecode import unidecode\n", + "from malaya.function.parse_dependency import DependencyGraph\n", "\n", - "epoch = 5\n", + "PUNCTUATION = '!\"#$%&\\'()*+,./:;<=>?@[\\]^_`{|}~'\n", "\n", - "for e in range(epoch):\n", - " train_acc, train_loss = [], []\n", - " test_acc, test_loss = [], []\n", - " train_acc_depends, test_acc_depends = [], []\n", - " \n", - " pbar = tqdm(\n", - " range(0, len(train_X), batch_size), desc = 'train minibatch loop'\n", + "def transformer_textcleaning(string):\n", + " \"\"\"\n", + " use by any transformer model before tokenization\n", + " \"\"\"\n", + " string = unidecode(string)\n", + " string = re.sub('\\\\(dot\\\\)', '.', string)\n", + " string = (\n", + " re.sub(re.findall(r'\\', string)[0], '', string)\n", + " if (len(re.findall(r'\\', 
string)) > 0)\n", + " and ('href' in re.findall(r'\\', string)[0])\n", + " else string\n", " )\n", - " for i in pbar:\n", - " index = min(i + batch_size, len(train_X))\n", - " batch_x = train_X[i: index]\n", - " batch_x = pad_sequences(batch_x,padding='post')\n", - " batch_y = train_Y[i: index]\n", - " batch_y = pad_sequences(batch_y,padding='post')\n", - " batch_depends = train_depends[i: index]\n", - " batch_depends = pad_sequences(batch_depends,padding='post')\n", - " \n", - " acc_depends, acc, cost, _ = sess.run(\n", - " [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.types: batch_y,\n", - " model.heads: batch_depends,\n", - " model.switch: True\n", - " },\n", - " )\n", - " train_loss.append(cost)\n", - " train_acc.append(acc)\n", - " train_acc_depends.append(acc_depends)\n", - " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", - " \n", - " pbar = tqdm(\n", - " range(0, len(test_X), batch_size), desc = 'test minibatch loop'\n", + " string = re.sub(\n", + " r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n", " )\n", - " for i in pbar:\n", - " index = min(i + batch_size, len(test_X))\n", - " batch_x = test_X[i: index]\n", - " batch_x = pad_sequences(batch_x,padding='post')\n", - " batch_y = test_Y[i: index]\n", - " batch_y = pad_sequences(batch_y,padding='post')\n", - " batch_depends = test_depends[i: index]\n", - " batch_depends = pad_sequences(batch_depends,padding='post')\n", - " \n", - " acc_depends, acc, cost = sess.run(\n", - " [model.accuracy_depends, model.accuracy, model.cost],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.types: batch_y,\n", - " model.heads: batch_depends,\n", - " model.switch: True\n", - " },\n", + " string = re.sub(r'[ ]+', ' ', string).strip().split()\n", + " string = [w for w in string if w[0] != '@']\n", + " string = ' '.join(string)\n", + " string = 
re.sub(f'([{PUNCTUATION}])', r' \\1 ', string)\n", + " string = re.sub('\\s{2,}', ' ', string)\n", + " original_string = string.split()\n", + " string = [\n", + " (original_string[no], word.title() if word.isupper() else word)\n", + " for no, word in enumerate(string.split())\n", + " if len(word)\n", + " ]\n", + " return [s[0] for s in string], [s[1] for s in string]\n", + "\n", + "def parse_X(left):\n", + " bert_tokens = ['[CLS]']\n", + " for no, orig_token in enumerate(left):\n", + " t = tokenizer.tokenize(orig_token)\n", + " bert_tokens.extend(t)\n", + " bert_tokens.append(\"[SEP]\")\n", + " t = tokenizer.convert_tokens_to_ids(bert_tokens)\n", + " return t, bert_tokens, [1] * len(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def dependency_graph(tagging, indexing):\n", + " \"\"\"\n", + " Return helper object for dependency parser results. Only accept tagging and indexing outputs from dependency models.\n", + " \"\"\"\n", + " result = []\n", + " for i in range(len(tagging)):\n", + " result.append(\n", + " '%d\\t%s\\t_\\t_\\t_\\t_\\t%d\\t%s\\t_\\t_'\n", + " % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])\n", " )\n", - " test_loss.append(cost)\n", - " test_acc.append(acc)\n", - " test_acc_depends.append(acc_depends)\n", - " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", - " \n", - " \n", - " print(\n", - " 'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\\n'\n", - " % (e, np.mean(train_loss), \n", - " np.mean(train_acc), \n", - " np.mean(train_acc_depends), \n", - " np.mean(test_loss), \n", - " np.mean(test_acc), \n", - " np.mean(test_acc_depends)\n", - " ))" + " return DependencyGraph('\\n'.join(result), top_relation_label='root')" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { + "image/svg+xml": [ + "\n", 
+ "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "2\n", + "2 (makan)\n", + "\n", + "\n", + "\n", + "0->2\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "1\n", + "1 (husein)\n", + "\n", + "\n", + "\n", + "2->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "3\n", + "3 (ayam)\n", + "\n", + "\n", + "\n", + "2->3\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n" + ], "text/plain": [ - "'bert-base-dependency/model.ckpt'" + "" ] }, - "execution_count": 27, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "saver = tf.train.Saver(tf.trainable_variables())\n", - "saver.save(sess, 'bert-base-dependency/model.ckpt')" + "string = 'husein makan ayam'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, bert_sequence, mask = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 22, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", - " warnings.warn('An interactive session is already active. 
This can '\n" - ] - }, + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "13\n", + "13 (membidas)\n", + "\n", + "\n", + "\n", + "0->13\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Kuala)\n", + "\n", + "\n", + "\n", + "13->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "14\n", + "14 (kenyataan)\n", + "\n", + "\n", + "\n", + "13->14\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "20\n", + "20 (Ketua)\n", + "\n", + "\n", + "\n", + "13->20\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "33\n", + "33 (melaksanakan)\n", + "\n", + "\n", + "\n", + "13->33\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "37\n", + "37 (.)\n", + "\n", + "\n", + "\n", + "13->37\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "39\n", + "39 (berkata)\n", + "\n", + "\n", + "\n", + "13->39\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Lumpur)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "4\n", + "4 (Ketua)\n", + "\n", + "\n", + "\n", + "1->4\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "3\n", + "3 (:)\n", + "\n", + "\n", + "\n", + "2->3\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "5\n", + "5 (Penerangan)\n", + "\n", + "\n", + "\n", + "4->5\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "7\n", + "7 (,)\n", + "\n", + "\n", + "\n", + "4->7\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "8\n", + "8 (Datuk)\n", + "\n", + "\n", + "\n", + "4->8\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "6\n", + "6 (Bersatu)\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "9\n", + "9 (Wan)\n", + "\n", + "\n", + "\n", + "8->9\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "10\n", + "10 (Saiful)\n", + "\n", + "\n", + "\n", + 
"9->10\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "11\n", + "11 (Wan)\n", + "\n", + "\n", + "\n", + "10->11\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "12\n", + "12 (Jan)\n", + "\n", + "\n", + "\n", + "11->12\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "15\n", + "15 (Datuk)\n", + "\n", + "\n", + "\n", + "14->15\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "19\n", + "19 (dan)\n", + "\n", + "\n", + "\n", + "20->19\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "21\n", + "21 (Pemuda)\n", + "\n", + "\n", + "\n", + "20->21\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "23\n", + "23 (,)\n", + "\n", + "\n", + "\n", + "20->23\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "24\n", + "24 (Datuk)\n", + "\n", + "\n", + "\n", + "20->24\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "32\n", + "32 (kerajaan)\n", + "\n", + "\n", + "\n", + "33->32\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "34\n", + "34 (sekatan)\n", + "\n", + "\n", + "\n", + "33->34\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "38\n", + "38 (Beliau)\n", + "\n", + "\n", + "\n", + "39->38\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "41\n", + "41 (Najib)\n", + "\n", + "\n", + "\n", + "39->41\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "40\n", + "40 (,)\n", + "\n", + "\n", + "\n", + "39->40\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "64\n", + "64 (berkata)\n", + "\n", + "\n", + "\n", + "39->64\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "16\n", + "16 (Seri)\n", + "\n", + "\n", + "\n", + "15->16\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "17\n", + "17 (Najib)\n", + "\n", + "\n", + "\n", + "16->17\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "18\n", + "18 (Razak)\n", + "\n", + "\n", + "\n", + "17->18\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "22\n", + "22 (Umno)\n", + "\n", + "\n", + "\n", + 
"21->22\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "25\n", + "25 (Dr)\n", + "\n", + "\n", + "\n", + "24->25\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "26\n", + "26 (Asyraf)\n", + "\n", + "\n", + "\n", + "25->26\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "30\n", + "30 (mempertikaikan)\n", + "\n", + "\n", + "\n", + "25->30\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "27\n", + "27 (Wajdi)\n", + "\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "29\n", + "29 (yang)\n", + "\n", + "\n", + "\n", + "30->29\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "31\n", + "31 (tindakan)\n", + "\n", + "\n", + "\n", + "30->31\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "28\n", + "28 (Dusuki)\n", + "\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "35\n", + "35 (pergerakan)\n", + "\n", + "\n", + "\n", + "34->35\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "36\n", + "36 (penuh)\n", + "\n", + "\n", + "\n", + "34->36\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "46\n", + "46 (memetik)\n", + "\n", + "\n", + "\n", + "38->46\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "61\n", + "61 (.)\n", + "\n", + "\n", + "\n", + "38->61\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "45\n", + "45 (sengaja)\n", + "\n", + "\n", + "\n", + "46->45\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "47\n", + "47 (kenyataan)\n", + "\n", + "\n", + "\n", + "46->47\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "50\n", + "50 (,)\n", + "\n", + "\n", + "\n", + "46->50\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "43\n", + "43 (Asyraf)\n", + "\n", + "\n", + "\n", + "41->43\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (Wan)\n", + "\n", + "\n", + "\n", + "64->62\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "83\n", + "83 (.)\n", + 
"\n", + "\n", + "\n", + "64->83\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "42\n", + "42 (dan)\n", + "\n", + "\n", + "\n", + "43->42\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "44\n", + "44 (Wajdi)\n", + "\n", + "\n", + "\n", + "43->44\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "48\n", + "48 (Perdana)\n", + "\n", + "\n", + "\n", + "47->48\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "51\n", + "51 (Tan)\n", + "\n", + "\n", + "\n", + "47->51\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "49\n", + "49 (Menteri)\n", + "\n", + "\n", + "\n", + "49->49\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "52\n", + "52 (Sri)\n", + "\n", + "\n", + "\n", + "52->52\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "57\n", + "57 (lengkap)\n", + "\n", + "\n", + "\n", + "52->57\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "55\n", + "55 (yang)\n", + "\n", + "\n", + "\n", + "57->55\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "53\n", + "53 (Muhyiddin)\n", + "\n", + "\n", + "\n", + "53->53\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "54\n", + "54 (Yassin)\n", + "\n", + "\n", + "\n", + "54->54\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "56\n", + "56 (tidak)\n", + "\n", + "\n", + "\n", + "58\n", + "58 (untuk)\n", + "\n", + "\n", + "\n", + "58->56\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "58->58\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "59\n", + "59 (mengelirukan)\n", + "\n", + "\n", + "\n", + "58->59\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "60\n", + "60 (rakyat)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "63\n", + "63 (Saiful)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "65\n", + "65 (,)\n", + "\n", + "\n", + "\n", + "65->65\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "68\n", 
+ "68 (menjangka)\n", + "\n", + "\n", + "\n", + "65->68\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "69\n", + "69 (ada)\n", + "\n", + "\n", + "\n", + "65->69\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "66\n", + "66 (beliau)\n", + "\n", + "\n", + "\n", + "68->66\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "67\n", + "67 (sudah)\n", + "\n", + "\n", + "\n", + "68->67\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "70\n", + "70 (kenyataan)\n", + "\n", + "\n", + "\n", + "70->70\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "71\n", + "71 (balas)\n", + "\n", + "\n", + "\n", + "70->71\n", + "\n", + "\n", + "ccomp\n", + "\n", + "\n", + "\n", + "73\n", + "73 (Najib)\n", + "\n", + "\n", + "\n", + "71->73\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tulisan)\n", + "\n", + "\n", + "\n", + "71->75\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "72\n", + "72 (daripada)\n", + "\n", + "\n", + "\n", + "73->72\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "74\n", + "74 (mengenai)\n", + "\n", + "\n", + "\n", + "73->74\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "77\n", + "77 (berhubung)\n", + "\n", + "\n", + "\n", + "75->77\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "76\n", + "76 (beliau)\n", + "\n", + "\n", + "\n", + "76->76\n", + "\n", + "\n", + "det\n", + "\n", + "\n", + "\n", + "78\n", + "78 (kesan)\n", + "\n", + "\n", + "\n", + "76->78\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "79\n", + "79 (positif)\n", + "\n", + "\n", + "\n", + "78->79\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "80\n", + "80 (sekatan)\n", + "\n", + "\n", + "\n", + "78->80\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "81\n", + "81 (pergerakan)\n", + "\n", + "\n", + "\n", + "78->81\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "82\n", + "82 (penuh)\n", + "\n", + "\n", + "\n", + "79->82\n", + "\n", + "\n", + 
"amod\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "string = 'KUALA LUMPUR: Ketua Penerangan BERSATU, Datuk Wan Saiful Wan Jan membidas kenyataan Datuk Seri Najib Razak dan Ketua Pemuda UMNO, Datuk Dr Asyraf Wajdi Dusuki yang mempertikaikan tindakan kerajaan melaksanakan sekatan pergerakan penuh. Beliau berkata, Najib dan Asyraf Wajdi sengaja memetik kenyataan Perdana Menteri, Tan Sri Muhyiddin Yassin yang tidak lengkap untuk mengelirukan rakyat. Wan Saiful berkata, beliau sudah menjangka ada kenyataan balas daripada Najib mengenai tulisan beliau berhubung kesan positif sekatan pergerakan penuh.'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, bert_sequence, mask = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'tiny-bert-dependency/model.ckpt'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "saver = tf.train.Saver(tf.trainable_variables())\n", + "saver.save(sess, 'tiny-bert-dependency/model.ckpt')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:Restoring parameters from bert-base-dependency/model.ckpt\n" + 
"INFO:tensorflow:Restoring parameters from tiny-bert-dependency/model.ckpt\n" ] } ], @@ -1415,18 +2041,18 @@ "sess = tf.InteractiveSession()\n", "\n", "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "model = Model(learning_rate, hidden_size_word, training = False)\n", "\n", "sess.run(tf.global_variables_initializer())\n", "saver = tf.train.Saver(tf.trainable_variables())\n", - "saver.restore(sess, 'bert-base-dependency/model.ckpt')" + "saver.restore(sess, 'tiny-bert-dependency/model.ckpt')" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -1442,7 +2068,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1487,14 +2113,14 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 315/315 [00:57<00:00, 5.52it/s]\n" + "100%|██████████| 313/313 [00:38<00:00, 8.12it/s]\n" ] } ], @@ -1531,7 +2157,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1546,60 +2172,51 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 29, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/husein/.local/lib/python3.6/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", - " PAD 0.99996 1.00000 0.99998 943088\n", - " X 0.99999 0.99981 0.99990 145797\n", - " acl 0.85006 0.80040 0.82448 6042\n", - " advcl 0.61783 0.60566 0.61169 2437\n", - " advmod 0.86865 0.86755 0.86810 9513\n", - " amod 0.82596 0.78837 0.80672 8217\n", - " appos 0.84113 0.79100 0.81530 5000\n", - " aux 0.80000 0.50000 0.61538 8\n", - " case 0.94714 0.95046 0.94879 21376\n", - " cc 0.92151 0.94487 0.93304 6349\n", - " ccomp 0.59326 0.26201 0.36349 874\n", - " compound 0.85764 0.83530 0.84632 13667\n", - "compound:plur 0.83743 0.91349 0.87381 1156\n", - " conj 0.87306 0.90624 0.88934 8500\n", - " cop 0.90592 0.93670 0.92105 1943\n", - " csubj 0.75000 0.05263 0.09836 57\n", - " csubj:pass 0.00000 0.00000 0.00000 16\n", - " dep 0.66704 0.55176 0.60395 1082\n", - " det 0.89147 0.84818 0.86929 7970\n", - " fixed 0.80819 0.61696 0.69975 1120\n", - " flat 0.90396 0.93947 0.92137 21129\n", - " iobj 0.00000 0.00000 0.00000 25\n", - " mark 0.74718 0.83845 0.79019 2767\n", - " nmod 0.86083 0.78159 0.81930 8017\n", - " nsubj 0.85174 0.89750 0.87402 12712\n", - " nsubj:pass 0.78514 0.82246 0.80337 4061\n", - " nummod 0.88943 0.93509 0.91169 8026\n", - " obj 0.89982 0.84423 0.87114 10618\n", - " obl 0.84081 0.88283 0.86131 11385\n", - " parataxis 0.48635 0.26667 0.34446 735\n", - " punct 0.98350 0.99126 0.98736 33736\n", - " root 0.91085 0.93726 0.92387 10073\n", - " xcomp 0.69305 0.76415 0.72686 2544\n", + " PAD 0.99958 1.00000 0.99979 656062\n", + " X 0.99998 0.99232 0.99613 61945\n", + " acl 0.64541 0.81625 0.72085 3298\n", + " advcl 0.27776 0.56039 0.37142 1656\n", + " advmod 0.91310 0.92373 0.91839 6700\n", + " amod 0.82278 0.86302 0.84242 4621\n", + " appos 0.76714 0.69299 0.72818 3052\n", + " case 0.96439 0.97337 0.96886 11492\n", + " cc 0.98187 
0.96747 0.97461 3750\n", + " ccomp 0.45118 0.34987 0.39412 383\n", + " compound 0.83453 0.89652 0.86442 11133\n", + "compound:plur 0.43478 0.33333 0.37736 30\n", + " conj 0.84660 0.85878 0.85265 5424\n", + " cop 0.97119 0.95659 0.96384 599\n", + " csubj 0.25000 0.10000 0.14286 10\n", + " csubj:pass 0.00000 0.00000 0.00000 1\n", + " dep 0.45412 0.53168 0.48985 363\n", + " det 0.92230 0.89720 0.90958 3969\n", + " fixed 0.88889 0.70175 0.78431 171\n", + " flat 0.91953 0.94618 0.93266 18393\n", + " iobj 0.00000 0.00000 0.00000 4\n", + " mark 0.88054 0.91620 0.89802 1778\n", + " nmod 0.77113 0.77499 0.77306 4591\n", + " nsubj 0.59707 0.82001 0.69100 7145\n", + " nsubj:pass 0.74934 0.69813 0.72283 2034\n", + " nummod 0.89096 0.94680 0.91803 4436\n", + " obj 0.88715 0.87539 0.88123 6412\n", + " obl 0.83020 0.74976 0.78793 5191\n", + " parataxis 0.09302 0.19355 0.12565 372\n", + " punct 0.99385 0.99501 0.99443 20643\n", + " root 0.45797 0.14820 0.22393 10000\n", + " xcomp 0.56806 0.68743 0.62207 1718\n", "\n", - " accuracy 0.98102 1310040\n", - " macro avg 0.77906 0.72946 0.74011 1310040\n", - " weighted avg 0.98076 0.98102 0.98073 1310040\n", + " accuracy 0.97313 857376\n", + " macro avg 0.68951 0.69272 0.68345 857376\n", + " weighted avg 0.97255 0.97313 0.97156 857376\n", "\n" ] } @@ -1611,16 +2228,16 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "arc accuracy: 0.7189048051328787\n", - "types accuracy: 0.6942783162846734\n", - "root accuracy: 0.8860992063492065\n" + "arc accuracy: 0.795252499643322\n", + "types accuracy: 0.7247015428088897\n", + "root accuracy: 0.9893913291696776\n" ] } ], @@ -1632,107 +2249,9 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 31, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Placeholder',\n", - " 'Placeholder_1',\n", - " 'Placeholder_2',\n", - " 'Placeholder_3',\n", - " 
'W_d',\n", - " 'W_e',\n", - " 'U',\n", - " 'U-bi',\n", - " 'Wl',\n", - " 'Wr',\n", - " 'bert/embeddings/word_embeddings',\n", - " 'bert/embeddings/token_type_embeddings',\n", - " 'bert/embeddings/position_embeddings',\n", - " 'bert/embeddings/LayerNorm/gamma',\n", - " 'bert/encoder/layer_0/attention/self/query/kernel',\n", - " 'bert/encoder/layer_0/attention/self/query/bias',\n", - " 'bert/encoder/layer_0/attention/self/key/kernel',\n", - " 'bert/encoder/layer_0/attention/self/key/bias',\n", - " 'bert/encoder/layer_0/attention/self/value/kernel',\n", - " 'bert/encoder/layer_0/attention/self/value/bias',\n", - " 'bert/encoder/layer_0/attention/self/Softmax',\n", - " 'bert/encoder/layer_0/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_0/attention/output/dense/bias',\n", - " 'bert/encoder/layer_0/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_0/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_0/intermediate/dense/bias',\n", - " 'bert/encoder/layer_0/output/dense/kernel',\n", - " 'bert/encoder/layer_0/output/dense/bias',\n", - " 'bert/encoder/layer_0/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_1/attention/self/query/kernel',\n", - " 'bert/encoder/layer_1/attention/self/query/bias',\n", - " 'bert/encoder/layer_1/attention/self/key/kernel',\n", - " 'bert/encoder/layer_1/attention/self/key/bias',\n", - " 'bert/encoder/layer_1/attention/self/value/kernel',\n", - " 'bert/encoder/layer_1/attention/self/value/bias',\n", - " 'bert/encoder/layer_1/attention/self/Softmax',\n", - " 'bert/encoder/layer_1/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_1/attention/output/dense/bias',\n", - " 'bert/encoder/layer_1/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_1/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_1/intermediate/dense/bias',\n", - " 'bert/encoder/layer_1/output/dense/kernel',\n", - " 'bert/encoder/layer_1/output/dense/bias',\n", - " 'bert/encoder/layer_1/output/LayerNorm/gamma',\n", - " 
'bert/encoder/layer_2/attention/self/query/kernel',\n", - " 'bert/encoder/layer_2/attention/self/query/bias',\n", - " 'bert/encoder/layer_2/attention/self/key/kernel',\n", - " 'bert/encoder/layer_2/attention/self/key/bias',\n", - " 'bert/encoder/layer_2/attention/self/value/kernel',\n", - " 'bert/encoder/layer_2/attention/self/value/bias',\n", - " 'bert/encoder/layer_2/attention/self/Softmax',\n", - " 'bert/encoder/layer_2/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_2/attention/output/dense/bias',\n", - " 'bert/encoder/layer_2/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_2/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_2/intermediate/dense/bias',\n", - " 'bert/encoder/layer_2/output/dense/kernel',\n", - " 'bert/encoder/layer_2/output/dense/bias',\n", - " 'bert/encoder/layer_2/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_3/attention/self/query/kernel',\n", - " 'bert/encoder/layer_3/attention/self/query/bias',\n", - " 'bert/encoder/layer_3/attention/self/key/kernel',\n", - " 'bert/encoder/layer_3/attention/self/key/bias',\n", - " 'bert/encoder/layer_3/attention/self/value/kernel',\n", - " 'bert/encoder/layer_3/attention/self/value/bias',\n", - " 'bert/encoder/layer_3/attention/self/Softmax',\n", - " 'bert/encoder/layer_3/attention/output/dense/kernel',\n", - " 'bert/encoder/layer_3/attention/output/dense/bias',\n", - " 'bert/encoder/layer_3/attention/output/LayerNorm/gamma',\n", - " 'bert/encoder/layer_3/intermediate/dense/kernel',\n", - " 'bert/encoder/layer_3/intermediate/dense/bias',\n", - " 'bert/encoder/layer_3/output/dense/kernel',\n", - " 'bert/encoder/layer_3/output/dense/bias',\n", - " 'bert/encoder/layer_3/output/LayerNorm/gamma',\n", - " 'bert/pooler/dense/kernel',\n", - " 'bert/pooler/dense/bias',\n", - " 'dense/kernel',\n", - " 'dense/bias',\n", - " 'dense_1/kernel',\n", - " 'dense_1/bias',\n", - " 'dense_2/kernel',\n", - " 'dense_2/bias',\n", - " 'dense_3/kernel',\n", - " 'dense_3/bias',\n", - " 
'heads_seq',\n", - " 'tags_seq',\n", - " 'transitions',\n", - " 'logits']" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "strings = ','.join(\n", " [\n", @@ -1750,13 +2269,12 @@ " and 'adam' not in n.name\n", " and 'gradients/bert' not in n.name\n", " ]\n", - ")\n", - "strings.split(',')" + ")" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1791,15 +2309,15 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:Restoring parameters from bert-base-dependency/model.ckpt\n", - "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", + "INFO:tensorflow:Restoring parameters from tiny-bert-dependency/model.ckpt\n", + "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use `tf.compat.v1.graph_util.convert_variables_to_constants`\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/graph_util_impl.py:277: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", @@ -1807,158 +2325,66 @@ "Use `tf.compat.v1.graph_util.extract_sub_graph`\n", "INFO:tensorflow:Froze 86 variables.\n", "INFO:tensorflow:Converted 86 variables to const ops.\n", - "1403 ops in the final graph.\n" + "1400 ops in the final graph.\n" ] } ], "source": [ - "freeze_graph('bert-base-dependency', strings)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "string = 'husein makan 
ayam'\n", - "\n", - "import re\n", - "\n", - "def entities_textcleaning(string, lowering = False):\n", - " \"\"\"\n", - " use by entities recognition, pos recognition and dependency parsing\n", - " \"\"\"\n", - " string = re.sub('[^A-Za-z0-9\\-\\/() ]+', ' ', string)\n", - " string = re.sub(r'[ ]+', ' ', string).strip()\n", - " original_string = string.split()\n", - " if lowering:\n", - " string = string.lower()\n", - " string = [\n", - " (original_string[no], word.title() if word.isupper() else word)\n", - " for no, word in enumerate(string.split())\n", - " if len(word)\n", - " ]\n", - " return [s[0] for s in string], [s[1] for s in string]\n", - "\n", - "def parse_X(left):\n", - " bert_tokens = ['[CLS]']\n", - " for no, orig_token in enumerate(left):\n", - " t = tokenizer.tokenize(orig_token)\n", - " bert_tokens.extend(t)\n", - " bert_tokens.append(\"[SEP]\")\n", - " return tokenizer.convert_tokens_to_ids(bert_tokens), bert_tokens\n", - "\n", - "sequence = entities_textcleaning(string)[1]\n", - "parsed_sequence, bert_sequence = parse_X(sequence)" + "freeze_graph('tiny-bert-dependency', strings)" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "def merge_sentencepiece_tokens_tagging(x, y):\n", - " new_paired_tokens = []\n", - " n_tokens = len(x)\n", - " rejected = ['[CLS]', '[SEP]']\n", - "\n", - " i = 0\n", - "\n", - " while i < n_tokens:\n", - "\n", - " current_token, current_label = x[i], y[i]\n", - " if not current_token.startswith('▁') and current_token not in rejected:\n", - " previous_token, previous_label = new_paired_tokens.pop()\n", - " merged_token = previous_token\n", - " merged_label = [previous_label]\n", - " while (\n", - " not current_token.startswith('▁')\n", - " and current_token not in rejected\n", - " ):\n", - " merged_token = merged_token + current_token.replace('▁', '')\n", - " merged_label.append(current_label)\n", - " i = i + 1\n", - " current_token, current_label = 
x[i], y[i]\n", - " merged_label = merged_label[0]\n", - " new_paired_tokens.append((merged_token, merged_label))\n", - "\n", - " else:\n", - " new_paired_tokens.append((current_token, current_label))\n", - " i = i + 1\n", - "\n", - " words = [\n", - " i[0].replace('▁', '')\n", - " for i in new_paired_tokens\n", - " if i[0] not in rejected\n", - " ]\n", - " labels = [i[1] for i in new_paired_tokens if i[0] not in rejected]\n", - " return words, labels" + "transforms = ['add_default_attributes',\n", + " 'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n", + " 'fold_batch_norms',\n", + " 'fold_old_batch_norms',\n", + " 'quantize_weights(fallback_min=-10, fallback_max=10)',\n", + " 'strip_unused_nodes',\n", + " 'sort_by_execution_order']" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 35, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", - " warnings.warn('An interactive session is already active. 
This can '\n" + "WARNING:tensorflow:From :6: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.gfile.GFile.\n" ] } ], "source": [ - "def load_graph(frozen_graph_filename):\n", - " with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n", - " graph_def = tf.GraphDef()\n", - " graph_def.ParseFromString(f.read())\n", - " with tf.Graph().as_default() as graph:\n", - " tf.import_graph_def(graph_def)\n", - " return graph\n", + "from tensorflow.tools.graph_transforms import TransformGraph\n", + "tf.set_random_seed(0)\n", "\n", - "g = load_graph('bert-base-dependency/frozen_model.pb')\n", - "x = g.get_tensor_by_name('import/Placeholder:0')\n", - "heads_seq = g.get_tensor_by_name('import/heads_seq:0')\n", - "tags_seq = g.get_tensor_by_name('import/logits:0')\n", - "test_sess = tf.InteractiveSession(graph = g)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "h, t = test_sess.run([heads_seq, tags_seq],\n", - " feed_dict = {\n", - " x: [parsed_sequence],\n", - " },\n", - ")\n", - "h = h[0] - 1\n", - "t = [idx2tag[d] for d in t[0]]\n", - "merged_h = merge_sentencepiece_tokens_tagging(bert_sequence, h)\n", - "merged_t = merge_sentencepiece_tokens_tagging(bert_sequence, t)" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", + "pb = 'tiny-bert-dependency/frozen_model.pb'\n", + "input_graph_def = tf.GraphDef()\n", + "with tf.gfile.FastGFile(pb, 'rb') as f:\n", + " input_graph_def.ParseFromString(f.read())\n", "\n", - "bucketName = 'huseinhouse-storage'\n", - "Key = 'bert-base-dependency/frozen_model.pb'\n", - "outPutname = \"v34/dependency/tiny-bert-dependency.pb\"\n", + "if 'bert' in pb:\n", + " inputs = ['Placeholder']\n", + " a = ['dense/BiasAdd']\n", + "if 'xlnet' in pb:\n", + " inputs = ['Placeholder', 'Placeholder_1', 
'Placeholder_2']\n", + " a = ['transpose_3']\n", "\n", - "s3 = boto3.client('s3')\n", + "transformed_graph_def = TransformGraph(input_graph_def, \n", + " inputs,\n", + " ['logits', 'heads_seq'] + a, transforms)\n", "\n", - "s3.upload_file(Key,bucketName,outPutname)" + "with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n", + " f.write(transformed_graph_def.SerializeToString())" ] } ], @@ -1978,7 +2404,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/session/dependency/xlnet-base.ipynb b/session/dependency/xlnet-base.ipynb index 17c6084a..65217d98 100644 --- a/session/dependency/xlnet-base.ipynb +++ b/session/dependency/xlnet-base.ipynb @@ -14,22 +14,6 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [ - "with open('../Malaya-Dataset/dependency/gsd-ud-train.conllu.txt') as fopen:\n", - " corpus = fopen.read().split('\\n')\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-test.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))\n", - " \n", - "with open('../Malaya-Dataset/dependency/gsd-ud-dev.conllu.txt') as fopen:\n", - " corpus.extend(fopen.read().split('\\n'))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [ { "name": "stdout", @@ -53,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +45,7 @@ "from prepro_utils import preprocess_text, encode_ids\n", "\n", "sp_model = spm.SentencePieceProcessor()\n", - "sp_model.Load('xlnet-base/sp10m.cased.v9.model')\n", + "sp_model.Load('xlnet-base-29-03-2020/sp10m.cased.v9.model')\n", "\n", "def tokenize_fn(text):\n", " text = preprocess_text(text, lower= False)\n", @@ -70,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -102,318 +86,27 
@@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "tag2idx = {'PAD': 0, 'X': 1}\n", - "tag_idx = 2\n", - "\n", - "def process_corpus(corpus, until = None):\n", - " global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx\n", - " sentences, words, depends, labels, pos, sequences = [], [], [], [], [], []\n", - " temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []\n", - " segments, masks = [], []\n", - " first_time = True\n", - " for sentence in corpus:\n", - " try:\n", - " if len(sentence):\n", - " if sentence[0] == '#':\n", - " continue\n", - " if first_time:\n", - " print(sentence)\n", - " first_time = False\n", - " sentence = sentence.split('\\t')\n", - " if sentence[7] not in tag2idx:\n", - " tag2idx[sentence[7]] = tag_idx\n", - " tag_idx += 1\n", - " temp_word.append(sentence[1])\n", - " temp_depend.append(int(sentence[6]) + 1)\n", - " temp_label.append(tag2idx[sentence[7]])\n", - " temp_sentence.append(sentence[1])\n", - " temp_pos.append(sentence[3])\n", - " else:\n", - " if len(temp_sentence) < 2 or len(temp_word) != len(temp_label):\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " continue\n", - " bert_tokens = []\n", - " labels_ = []\n", - " depends_ = []\n", - " seq_ = []\n", - " for no, orig_token in enumerate(temp_word):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenize_fn(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " seq_.append(no + 1)\n", - " bert_tokens.extend([4, 3])\n", - " labels_.extend([0, 0])\n", - " depends_.extend([0, 0])\n", - " segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]\n", - " input_mask = [0] * len(segment)\n", - " words.append(bert_tokens)\n", - " depends.append(depends_)\n", - " labels.append(labels_)\n", - " 
sentences.append(bert_tokens)\n", - " pos.append(temp_pos)\n", - " sequences.append(seq_)\n", - " segments.append(segment)\n", - " masks.append(input_mask)\n", - " temp_word = []\n", - " temp_depend = []\n", - " temp_label = []\n", - " temp_sentence = []\n", - " temp_pos = []\n", - " except Exception as e:\n", - " print(e, sentence)\n", - " return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1], sequences[:-1], segments[:-1], masks[:-1]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\tSembungan\tsembungan\tPROPN\tX--\t_\t4\tnsubj\t_\tMorphInd=^sembungan_X--$\n" - ] - } - ], - "source": [ - "sentences, words, depends, labels, _, _, segments, masks = process_corpus(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(26, 26, 26)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(words[0]), len(depends[0]), len(labels[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "with open('../Malaya-Dataset/dependency/augmented-dependency.json') as fopen:\n", - " augmented = json.load(fopen)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "text_augmented, depends_augmented, labels_augmented = [], [], []\n", - "\n", - "for a in augmented:\n", - " text_augmented.extend(a[0])\n", - " depends_augmented.extend(a[1])\n", - " labels_augmented.extend((np.array(a[2]) + 1).tolist())" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def parse_XY(texts, depends, labels):\n", - " outside, sentences, outside_depends, outside_labels = [], [], [], []\n", - " segments, masks = [], []\n", - " for no, 
text in enumerate(texts):\n", - " temp_depend = depends[no]\n", - " temp_label = labels[no]\n", - " s = text.split()\n", - " sentences.append(s)\n", - " bert_tokens = []\n", - " labels_ = []\n", - " depends_ = []\n", - " for no, orig_token in enumerate(s):\n", - " labels_.append(temp_label[no])\n", - " depends_.append(temp_depend[no])\n", - " t = tokenize_fn(orig_token)\n", - " bert_tokens.extend(t)\n", - " labels_.extend([1] * (len(t) - 1))\n", - " depends_.extend([0] * (len(t) - 1))\n", - " bert_tokens.extend([4, 3])\n", - " labels_.extend([0, 0])\n", - " depends_.extend([0, 0])\n", - " segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]\n", - " input_mask = [0] * len(segment)\n", - " outside.append(bert_tokens)\n", - " outside_depends.append(depends_)\n", - " outside_labels.append(labels_)\n", - " segments.append(segment)\n", - " masks.append(input_mask)\n", - " return outside, sentences, outside_depends, outside_labels, segments, masks" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "outside, _, outside_depends, outside_labels, outside_segments, outside_masks = parse_XY(text_augmented, \n", - " depends_augmented, \n", - " labels_augmented)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "words.extend(outside)\n", - "depends.extend(outside_depends)\n", - "labels.extend(outside_labels)\n", - "segments.extend(outside_segments)\n", - "masks.extend(outside_masks)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{0: 'PAD',\n", - " 1: 'X',\n", - " 2: 'nsubj',\n", - " 3: 'cop',\n", - " 4: 'det',\n", - " 5: 'root',\n", - " 6: 'nsubj:pass',\n", - " 7: 'acl',\n", - " 8: 'case',\n", - " 9: 'obl',\n", - " 10: 'flat',\n", - " 11: 'punct',\n", - " 12: 'appos',\n", - " 13: 'amod',\n", - " 14: 'compound',\n", - " 15: 'advmod',\n", - " 16: 'cc',\n", - " 17: 'obj',\n", 
- " 18: 'conj',\n", - " 19: 'mark',\n", - " 20: 'advcl',\n", - " 21: 'nmod',\n", - " 22: 'nummod',\n", - " 23: 'dep',\n", - " 24: 'xcomp',\n", - " 25: 'ccomp',\n", - " 26: 'parataxis',\n", - " 27: 'compound:plur',\n", - " 28: 'fixed',\n", - " 29: 'aux',\n", - " 30: 'csubj',\n", - " 31: 'iobj',\n", - " 32: 'csubj:pass'}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "idx2tag = {v:k for k, v in tag2idx.items()}\n", - "idx2tag" - ] - }, - { - "cell_type": "code", - "execution_count": 15, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "from sklearn.model_selection import train_test_split\n", + "import pickle\n", "\n", - "words_train, words_test, depends_train, depends_test, labels_train, labels_test, \\\n", - "segments_train, segments_test, masks_train, masks_test \\\n", - "= train_test_split(words, depends, labels, segments, masks, test_size = 0.2)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(40289, 10073)" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(words_train), len(words_test)" + "with open('train_X.pkl', 'rb') as fopen:\n", + " train_X, train_Y, train_depends, train_segments, train_masks = pickle.load(fopen)\n", + " \n", + "with open('test_X.pkl', 'rb') as fopen:\n", + " test_X, test_Y, test_depends, test_segments, test_masks = pickle.load(fopen)\n", + " \n", + "with open('tags.pkl', 'rb') as fopen:\n", + " idx2tag, tag2idx = pickle.load(fopen)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 66, "metadata": {}, "outputs": [], - "source": [ - "train_X = words_train\n", - "train_Y = labels_train\n", - "train_depends = depends_train\n", - "\n", - "test_X = words_test\n", - "test_Y = labels_test\n", - "test_depends = depends_test" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/husein/xlnet/xlnet.py:63: The name tf.gfile.Open is deprecated. Please use tf.io.gfile.GFile instead.\n", - "\n" - ] - } - ], "source": [ "import xlnet\n", "import model_utils\n", @@ -432,25 +125,25 @@ " clamp_len=-1)\n", "\n", "xlnet_parameters = xlnet.RunConfig(**kwargs)\n", - "xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base/config.json')" + "xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base-29-03-2020/config.json')" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "37770 3777\n" + "1173456 117345\n" ] } ], "source": [ - "epoch = 15\n", - "batch_size = 16\n", + "epoch = 3\n", + "batch_size = 8\n", "warmup_proportion = 0.1\n", "num_train_steps = int(len(train_X) / batch_size * epoch)\n", "num_warmup_steps = int(num_train_steps * warmup_proportion)\n", @@ -479,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -504,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -724,50 +417,15 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 67, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "reduction_indices is deprecated, use axis instead\n", - "WARNING:tensorflow:\n", - "The TensorFlow contrib module will not be included in TensorFlow 2.0.\n", - "For more information, please see:\n", - " * 
https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n", - " * https://github.com/tensorflow/addons\n", - " * https://github.com/tensorflow/io (for I/O related ops)\n", - "If you depend on functionality not listed there, please file an issue.\n", - "\n", - "WARNING:tensorflow:From /home/husein/xlnet/xlnet.py:220: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.\n", - "\n", - "WARNING:tensorflow:From /home/husein/xlnet/xlnet.py:220: The name tf.AUTO_REUSE is deprecated. Please use tf.compat.v1.AUTO_REUSE instead.\n", - "\n", - "WARNING:tensorflow:From /home/husein/xlnet/modeling.py:453: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.\n", - "\n", "INFO:tensorflow:memory input None\n", - "INFO:tensorflow:Use float type \n", - "WARNING:tensorflow:From /home/husein/xlnet/modeling.py:535: dropout (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use keras.layers.dropout instead.\n", - "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/layers/core.py:271: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Please use `layer.__call__` method instead.\n", - "WARNING:tensorflow:From /home/husein/xlnet/modeling.py:67: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use keras.layers.Dense instead.\n", - "WARNING:tensorflow:From :138: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", - "WARNING:tensorflow:From 
/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/contrib/crf/python/ops/crf.py:213: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Please use `keras.layers.RNN(cell)`, which is equivalent to this API\n", - "WARNING:tensorflow:From :172: calling log_softmax (from tensorflow.python.ops.nn_ops) with dim is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "dim is deprecated, use axis instead\n" + "INFO:tensorflow:Use float type \n" ] } ], @@ -776,7 +434,7 @@ "sess = tf.InteractiveSession()\n", "\n", "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "model = Model(learning_rate, hidden_size_word)\n", "sess.run(tf.global_variables_initializer())" @@ -784,7 +442,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -820,26 +478,26 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "tvars = tf.trainable_variables()\n", - "checkpoint = 'xlnet-base/model.ckpt'\n", + "checkpoint = 'xlnet-base-29-03-2020/model.ckpt-300000'\n", "assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, \n", " checkpoint)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:Restoring parameters from xlnet-base/model.ckpt\n" + "INFO:tensorflow:Restoring parameters from xlnet-base-29-03-2020/model.ckpt-300000\n" ] } ], @@ -850,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -862,24 +520,24 @@ "batch_y = pad_sequences(batch_y,padding='post')\n", "batch_depends = train_depends[:5]\n", "batch_depends = 
pad_sequences(batch_depends,padding='post')\n", - "batch_segments = segments_train[:5]\n", + "batch_segments = train_segments[:5]\n", "batch_segments = pad_sequences(batch_segments, padding='post', value = 4)\n", - "batch_masks = masks_train[:5]\n", + "batch_masks = train_masks[:5]\n", "batch_masks = pad_sequences(batch_masks, padding='post', value = 1)" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.04255319, 0.014184397, 128.52495]" + "[0.03529412, 0.07058824, 140.91621]" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -896,16 +554,16 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[0.0070921984, 0.04964539, 544.50476]" + "[0.047058824, 0.023529412, 352.61438]" ] }, - "execution_count": 28, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -922,391 +580,138 @@ }, { "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([26, 6, 6, 28, 26, 18, 19, 18, 28, 6, 26, 32, 27, 28, 27, 6, 28,\n", - " 19, 19, 28, 6, 28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0], dtype=int32),\n", - " array([21, 3, 15, 18, 10, 21, 10, 21, 7, 10, 1, 21, 1, 7, 3, 8, 7,\n", - " 9, 10, 18, 8, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0]),\n", - " array([ 3, 1, 3, 3, 3, 6, 3, 0, 3, 0, 9, 13, 11, 9, 15, 13, 15,\n", - " 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0], dtype=int32))" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tags_seq, heads = sess.run(\n", - " [model.logits, 
model.heads_seq],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.segment_ids: batch_segments,\n", - " model.input_masks: batch_masks\n", - " },\n", - ")\n", - "tags_seq[0], heads[0], batch_depends[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, + "execution_count": 48, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "train minibatch loop: 100%|██████████| 2519/2519 [15:04<00:00, 2.78it/s, accuracy=0.8, accuracy_depends=0.5, cost=1.95] \n", - "test minibatch loop: 100%|██████████| 630/630 [02:03<00:00, 5.12it/s, accuracy=0.843, accuracy_depends=0.545, cost=2.06]\n", - "train minibatch loop: 0%| | 0/2519 [00:00', '']\n", + "\n", + " i = 0\n", + "\n", + " while i < n_tokens:\n", + "\n", + " current_token, current_label = x[i], y[i]\n", + " if not current_token.startswith('▁') and current_token not in rejected:\n", + " previous_token, previous_label = new_paired_tokens.pop()\n", + " merged_token = previous_token\n", + " merged_label = [previous_label]\n", + " while (\n", + " not current_token.startswith('▁')\n", + " and current_token not in rejected\n", + " ):\n", + " merged_token = merged_token + current_token.replace('▁', '')\n", + " merged_label.append(current_label)\n", + " i = i + 1\n", + " current_token, current_label = x[i], y[i]\n", + " merged_label = merged_label[0]\n", + " new_paired_tokens.append((merged_token, merged_label))\n", + "\n", + " else:\n", + " new_paired_tokens.append((current_token, current_label))\n", + " i = i + 1\n", + "\n", + " words = [\n", + " i[0].replace('▁', '')\n", + " for i in new_paired_tokens\n", + " if i[0] not in ['', '']\n", + " ]\n", + " labels = [i[1] for i in new_paired_tokens if i[0] not in ['', '']]\n", + " return words, labels" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from unidecode import unidecode\n", + "from 
malaya.function.parse_dependency import DependencyGraph\n", + "\n", + "PUNCTUATION = '!\"#$%&\\'()*+,./:;<=>?@[\\]^_`{|}~'\n", + "\n", + "def transformer_textcleaning(string):\n", + " \"\"\"\n", + " use by any transformer model before tokenization\n", + " \"\"\"\n", + " string = unidecode(string)\n", + " string = re.sub('\\\\(dot\\\\)', '.', string)\n", + " string = (\n", + " re.sub(re.findall(r'\\', string)[0], '', string)\n", + " if (len(re.findall(r'\\', string)) > 0)\n", + " and ('href' in re.findall(r'\\', string)[0])\n", + " else string\n", + " )\n", + " string = re.sub(\n", + " r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n", + " )\n", + " string = re.sub(r'[ ]+', ' ', string).strip().split()\n", + " string = [w for w in string if w[0] != '@']\n", + " string = ' '.join(string)\n", + " string = re.sub(f'([{PUNCTUATION}])', r' \\1 ', string)\n", + " string = re.sub('\\s{2,}', ' ', string)\n", + " original_string = string.split()\n", + " string = [\n", + " (original_string[no], word.title() if word.isupper() else word)\n", + " for no, word in enumerate(string.split())\n", + " if len(word)\n", + " ]\n", + " return [s[0] for s in string], [s[1] for s in string]\n", + "\n", + "def parse_X(left):\n", + " left = ' '.join(left)\n", + " bert_tokens = tokenize_fn(left)\n", + " bert_tokens.extend([3, 4])\n", + " segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]\n", + " input_mask = [0] * len(segment)\n", + " s_tokens = [sp_model.IdToPiece(i) for i in bert_tokens]\n", + " return bert_tokens, segment, input_mask, s_tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "def dependency_graph(tagging, indexing):\n", + " \"\"\"\n", + " Return helper object for dependency parser results. 
Only accept tagging and indexing outputs from dependency models.\n", + " \"\"\"\n", + " result = []\n", + " for i in range(len(tagging)):\n", + " result.append(\n", + " '%d\\t%s\\t_\\t_\\t_\\t_\\t%d\\t%s\\t_\\t_'\n", + " % (i + 1, tagging[i][0], int(indexing[i][1]), tagging[i][1])\n", + " )\n", + " return DependencyGraph('\\n'.join(result), top_relation_label='root')" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "epoch: 4, training loss: 0.545647, training acc: 0.996865, training depends: 0.917185, valid loss: 3.373970, valid acc: 0.984332, valid depends: 0.899482\n", - "\n" - ] - }, + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "2\n", + "2 (makan)\n", + "\n", + "\n", + "\n", + "0->2\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "1\n", + "1 (husein)\n", + "\n", + "\n", + "\n", + "2->1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "3\n", + "3 (ayam)\n", + "\n", + "\n", + "\n", + "1->3\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "string = 'husein makan ayam'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, segment_sequence, mask_sequence, xlnet_sequence = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " model.segment_ids: [segment_sequence],\n", + " model.input_masks: [mask_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(xlnet_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(xlnet_sequence, t)\n", + "tagging = list(zip(merged_t[0], 
merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "scrolled": true + }, + "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "G\n", + "\n", + "\n", + "\n", + "0\n", + "0 (None)\n", + "\n", + "\n", + "\n", + "1\n", + "1 (Kuala)\n", + "\n", + "\n", + "\n", + "0->1\n", + "\n", + "\n", + "root\n", + "\n", + "\n", + "\n", + "2\n", + "2 (Lumpur)\n", + "\n", + "\n", + "\n", + "1->2\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "3\n", + "3 (:)\n", + "\n", + "\n", + "\n", + "1->3\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "13\n", + "13 (membidas)\n", + "\n", + "\n", + "\n", + "1->13\n", + "\n", + "\n", + "parataxis\n", + "\n", + "\n", + "\n", + "14\n", + "14 (kenyataan)\n", + "\n", + "\n", + "\n", + "1->14\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "39\n", + "39 (berkata)\n", + "\n", + "\n", + "\n", + "1->39\n", + "\n", + "\n", + "dep\n", + "\n", + "\n", + "\n", + "4\n", + "4 (Ketua)\n", + "\n", + "\n", + "\n", + "13->4\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "33\n", + "33 (melaksanakan)\n", + "\n", + "\n", + "\n", + "13->33\n", + "\n", + "\n", + "ccomp\n", + "\n", + "\n", + "\n", + "37\n", + "37 (.)\n", + "\n", + "\n", + "\n", + "13->37\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "16\n", + "16 (Seri)\n", + "\n", + "\n", + "\n", + "14->16\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "38\n", + "38 (Beliau)\n", + "\n", + "\n", + "\n", + "39->38\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "46\n", + "46 (memetik)\n", + "\n", + "\n", + "\n", + "39->46\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "64\n", + "64 (berkata)\n", + "\n", + "\n", + "\n", + "39->64\n", + "\n", + "\n", + 
"dep\n", + "\n", + "\n", + "\n", + "5\n", + "5 (Penerangan)\n", + "\n", + "\n", + "\n", + "4->5\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "8\n", + "8 (Datuk)\n", + "\n", + "\n", + "\n", + "4->8\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "6\n", + "6 (Bersatu)\n", + "\n", + "\n", + "\n", + "5->6\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "7\n", + "7 (,)\n", + "\n", + "\n", + "\n", + "8->7\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "9\n", + "9 (Wan)\n", + "\n", + "\n", + "\n", + "8->9\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "10\n", + "10 (Saiful)\n", + "\n", + "\n", + "\n", + "9->10\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "17\n", + "17 (Najib)\n", + "\n", + "\n", + "\n", + "9->17\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "22\n", + "22 (Umno)\n", + "\n", + "\n", + "\n", + "17->22\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "11\n", + "11 (Wan)\n", + "\n", + "\n", + "\n", + "11->11\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "12\n", + "12 (Jan)\n", + "\n", + "\n", + "\n", + "11->12\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "18\n", + "18 (Razak)\n", + "\n", + "\n", + "\n", + "11->18\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "24\n", + "24 (Datuk)\n", + "\n", + "\n", + "\n", + "33->24\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "34\n", + "34 (sekatan)\n", + "\n", + "\n", + "\n", + "33->34\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "15\n", + "15 (Datuk)\n", + "\n", + "\n", + "\n", + "16->15\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "20\n", + "20 (Ketua)\n", + "\n", + "\n", + "\n", + "16->20\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "19\n", + "19 (dan)\n", + "\n", + "\n", + "\n", + "20->19\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "23\n", + "23 (,)\n", + "\n", + "\n", + "\n", + "20->23\n", + "\n", + "\n", + "punct\n", + "\n", + 
"\n", + "\n", + "25\n", + "25 (Dr)\n", + "\n", + "\n", + "\n", + "20->25\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "21\n", + "21 (Pemuda)\n", + "\n", + "\n", + "\n", + "26\n", + "26 (Asyraf)\n", + "\n", + "\n", + "\n", + "21->26\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "27\n", + "27 (Wajdi)\n", + "\n", + "\n", + "\n", + "26->27\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "24->21\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "30\n", + "30 (mempertikaikan)\n", + "\n", + "\n", + "\n", + "24->30\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "29\n", + "29 (yang)\n", + "\n", + "\n", + "\n", + "30->29\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "31\n", + "31 (tindakan)\n", + "\n", + "\n", + "\n", + "30->31\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "28\n", + "28 (Dusuki)\n", + "\n", + "\n", + "\n", + "27->28\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "32\n", + "32 (kerajaan)\n", + "\n", + "\n", + "\n", + "31->32\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "36\n", + "36 (penuh)\n", + "\n", + "\n", + "\n", + "31->36\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "35\n", + "35 (pergerakan)\n", + "\n", + "\n", + "\n", + "34->35\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "40\n", + "40 (,)\n", + "\n", + "\n", + "\n", + "46->40\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "41\n", + "41 (Najib)\n", + "\n", + "\n", + "\n", + "46->41\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "45\n", + "45 (sengaja)\n", + "\n", + "\n", + "\n", + "46->45\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "61\n", + "61 (.)\n", + "\n", + "\n", + "\n", + "46->61\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "47\n", + "47 (kenyataan)\n", + "\n", + "\n", + "\n", + "46->47\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "62\n", + "62 (Wan)\n", + "\n", + "\n", + "\n", + "64->62\n", + 
"\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "65\n", + "65 (,)\n", + "\n", + "\n", + "\n", + "64->65\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "83\n", + "83 (.)\n", + "\n", + "\n", + "\n", + "64->83\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "68\n", + "68 (menjangka)\n", + "\n", + "\n", + "\n", + "64->68\n", + "\n", + "\n", + "advcl\n", + "\n", + "\n", + "\n", + "43\n", + "43 (Asyraf)\n", + "\n", + "\n", + "\n", + "41->43\n", + "\n", + "\n", + "conj\n", + "\n", + "\n", + "\n", + "44\n", + "44 (Wajdi)\n", + "\n", + "\n", + "\n", + "41->44\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "42\n", + "42 (dan)\n", + "\n", + "\n", + "\n", + "43->42\n", + "\n", + "\n", + "cc\n", + "\n", + "\n", + "\n", + "48\n", + "48 (Perdana)\n", + "\n", + "\n", + "\n", + "47->48\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "57\n", + "57 (lengkap)\n", + "\n", + "\n", + "\n", + "47->57\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "49\n", + "49 (Menteri)\n", + "\n", + "\n", + "\n", + "48->49\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "50\n", + "50 (,)\n", + "\n", + "\n", + "\n", + "48->50\n", + "\n", + "\n", + "punct\n", + "\n", + "\n", + "\n", + "51\n", + "51 (Tan)\n", + "\n", + "\n", + "\n", + "48->51\n", + "\n", + "\n", + "appos\n", + "\n", + "\n", + "\n", + "55\n", + "55 (yang)\n", + "\n", + "\n", + "\n", + "57->55\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "56\n", + "56 (tidak)\n", + "\n", + "\n", + "\n", + "57->56\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "59\n", + "59 (mengelirukan)\n", + "\n", + "\n", + "\n", + "57->59\n", + "\n", + "\n", + "ccomp\n", + "\n", + "\n", + "\n", + "52\n", + "52 (Sri)\n", + "\n", + "\n", + "\n", + "51->52\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "53\n", + "53 (Muhyiddin)\n", + "\n", + "\n", + "\n", + "52->53\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "54\n", + "54 (Yassin)\n", + "\n", + "\n", + 
"\n", + "53->54\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "58\n", + "58 (untuk)\n", + "\n", + "\n", + "\n", + "59->58\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "60\n", + "60 (rakyat)\n", + "\n", + "\n", + "\n", + "59->60\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "63\n", + "63 (Saiful)\n", + "\n", + "\n", + "\n", + "62->63\n", + "\n", + "\n", + "flat\n", + "\n", + "\n", + "\n", + "66\n", + "66 (beliau)\n", + "\n", + "\n", + "\n", + "68->66\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "67\n", + "67 (sudah)\n", + "\n", + "\n", + "\n", + "68->67\n", + "\n", + "\n", + "advmod\n", + "\n", + "\n", + "\n", + "69\n", + "69 (ada)\n", + "\n", + "\n", + "\n", + "68->69\n", + "\n", + "\n", + "xcomp\n", + "\n", + "\n", + "\n", + "70\n", + "70 (kenyataan)\n", + "\n", + "\n", + "\n", + "69->70\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "71\n", + "71 (balas)\n", + "\n", + "\n", + "\n", + "69->71\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "73\n", + "73 (Najib)\n", + "\n", + "\n", + "\n", + "69->73\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "75\n", + "75 (tulisan)\n", + "\n", + "\n", + "\n", + "71->75\n", + "\n", + "\n", + "obl\n", + "\n", + "\n", + "\n", + "74\n", + "74 (mengenai)\n", + "\n", + "\n", + "\n", + "75->74\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "77\n", + "77 (berhubung)\n", + "\n", + "\n", + "\n", + "75->77\n", + "\n", + "\n", + "acl\n", + "\n", + "\n", + "\n", + "72\n", + "72 (daripada)\n", + "\n", + "\n", + "\n", + "72->72\n", + "\n", + "\n", + "case\n", + "\n", + "\n", + "\n", + "76\n", + "76 (beliau)\n", + "\n", + "\n", + "\n", + "77->76\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "78\n", + "78 (kesan)\n", + "\n", + "\n", + "\n", + "77->78\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n", + "79\n", + "79 (positif)\n", + "\n", + "\n", + "\n", + "77->79\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n", + "80\n", + "80 
(sekatan)\n", + "\n", + "\n", + "\n", + "78->80\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "81\n", + "81 (pergerakan)\n", + "\n", + "\n", + "\n", + "79->81\n", + "\n", + "\n", + "compound\n", + "\n", + "\n", + "\n", + "82\n", + "82 (penuh)\n", + "\n", + "\n", + "\n", + "80->82\n", + "\n", + "\n", + "amod\n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "from tqdm import tqdm\n", - "\n", - "epoch = 5\n", - "for e in range(epoch):\n", - " train_acc, train_loss = [], []\n", - " test_acc, test_loss = [], []\n", - " train_acc_depends, test_acc_depends = [], []\n", - " \n", - " pbar = tqdm(\n", - " range(0, len(train_X), batch_size), desc = 'train minibatch loop'\n", - " )\n", - " for i in pbar:\n", - " index = min(i + batch_size, len(train_X))\n", - " batch_x = train_X[i: index]\n", - " batch_x = pad_sequences(batch_x,padding='post')\n", - " batch_y = train_Y[i: index]\n", - " batch_y = pad_sequences(batch_y,padding='post')\n", - " batch_depends = train_depends[i: index]\n", - " batch_depends = pad_sequences(batch_depends,padding='post')\n", - " batch_segments = segments_train[i: index]\n", - " batch_segments = pad_sequences(batch_segments, padding='post', value = 4)\n", - " batch_masks = masks_train[i: index]\n", - " batch_masks = pad_sequences(batch_masks, padding='post', value = 1)\n", - " \n", - " acc_depends, acc, cost, _ = sess.run(\n", - " [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.types: batch_y,\n", - " model.heads: batch_depends,\n", - " model.segment_ids: batch_segments,\n", - " model.input_masks: batch_masks,\n", - " model.switch: True\n", - " },\n", - " )\n", - " train_loss.append(cost)\n", - " train_acc.append(acc)\n", - " train_acc_depends.append(acc_depends)\n", - " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = 
acc_depends)\n", - " \n", - " pbar = tqdm(\n", - " range(0, len(test_X), batch_size), desc = 'test minibatch loop'\n", - " )\n", - " for i in pbar:\n", - " index = min(i + batch_size, len(test_X))\n", - " batch_x = test_X[i: index]\n", - " batch_x = pad_sequences(batch_x,padding='post')\n", - " batch_y = test_Y[i: index]\n", - " batch_y = pad_sequences(batch_y,padding='post')\n", - " batch_depends = test_depends[i: index]\n", - " batch_depends = pad_sequences(batch_depends,padding='post')\n", - " batch_segments = segments_test[i: index]\n", - " batch_segments = pad_sequences(batch_segments, padding='post', value = 4)\n", - " batch_masks = masks_test[i: index]\n", - " batch_masks = pad_sequences(batch_masks, padding='post', value = 1)\n", - " \n", - " acc_depends, acc, cost = sess.run(\n", - " [model.accuracy_depends, model.accuracy, model.cost],\n", - " feed_dict = {\n", - " model.words: batch_x,\n", - " model.types: batch_y,\n", - " model.heads: batch_depends,\n", - " model.segment_ids: batch_segments,\n", - " model.input_masks: batch_masks,\n", - " model.switch: True\n", - " },\n", - " )\n", - " test_loss.append(cost)\n", - " test_acc.append(acc)\n", - " test_acc_depends.append(acc_depends)\n", - " pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)\n", - " \n", - " \n", - " print(\n", - " 'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\\n'\n", - " % (e, np.mean(train_loss), \n", - " np.mean(train_acc), \n", - " np.mean(train_acc_depends), \n", - " np.mean(test_loss), \n", - " np.mean(test_acc), \n", - " np.mean(test_acc_depends)\n", - " ))" + "string = 'KUALA LUMPUR: Ketua Penerangan BERSATU, Datuk Wan Saiful Wan Jan membidas kenyataan Datuk Seri Najib Razak dan Ketua Pemuda UMNO, Datuk Dr Asyraf Wajdi Dusuki yang mempertikaikan tindakan kerajaan melaksanakan sekatan pergerakan penuh. 
Beliau berkata, Najib dan Asyraf Wajdi sengaja memetik kenyataan Perdana Menteri, Tan Sri Muhyiddin Yassin yang tidak lengkap untuk mengelirukan rakyat. Wan Saiful berkata, beliau sudah menjangka ada kenyataan balas daripada Najib mengenai tulisan beliau berhubung kesan positif sekatan pergerakan penuh.'\n", + "sequence = transformer_textcleaning(string)[1]\n", + "parsed_sequence, segment_sequence, mask_sequence, xlnet_sequence = parse_X(sequence)\n", + "h, t = sess.run([model.heads_seq, model.tags_seq],\n", + " feed_dict = {\n", + " model.words: [parsed_sequence],\n", + " model.segment_ids: [segment_sequence],\n", + " model.input_masks: [mask_sequence],\n", + " },\n", + ")\n", + "h = h[0] - 2\n", + "t = [idx2tag[d] for d in t[0]]\n", + "merged_h = merge_sentencepiece_tokens_tagging(xlnet_sequence, h)\n", + "merged_t = merge_sentencepiece_tokens_tagging(xlnet_sequence, t)\n", + "tagging = list(zip(merged_t[0], merged_t[1]))\n", + "indexing = list(zip(merged_h[0], merged_h[1]))\n", + "dep = dependency_graph(tagging, indexing)\n", + "dep.to_graphvis()" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 72, "metadata": {}, "outputs": [ { @@ -1589,7 +2100,7 @@ "'xlnet-base-dependency/model.ckpt'" ] }, - "execution_count": 35, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } @@ -1601,7 +2112,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1617,12 +2128,12 @@ " clamp_len=-1)\n", "\n", "xlnet_parameters = xlnet.RunConfig(**kwargs)\n", - "xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base/config.json')" + "xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base-29-03-2020/config.json')" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 74, "metadata": {}, "outputs": [ { @@ -1632,19 +2143,11 @@ "INFO:tensorflow:memory input None\n", "INFO:tensorflow:Use float type \n" ] - }, - { - "name": "stderr", - "output_type": 
"stream", - "text": [ - "/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", - " warnings.warn('An interactive session is already active. This can '\n" - ] } ], "source": [ "learning_rate = 2e-5\n", - "hidden_size_word = 128\n", + "hidden_size_word = 256\n", "\n", "tf.reset_default_graph()\n", "sess = tf.InteractiveSession()\n", @@ -1654,7 +2157,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 75, "metadata": {}, "outputs": [ { @@ -1672,7 +2175,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1688,7 +2191,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1733,14 +2236,14 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 630/630 [01:56<00:00, 5.39it/s]\n" + "100%|██████████| 1250/1250 [02:03<00:00, 10.15it/s]\n" ] } ], @@ -1756,9 +2259,9 @@ " batch_y = pad_sequences(batch_y,padding='post')\n", " batch_depends = test_depends[i: index]\n", " batch_depends = pad_sequences(batch_depends,padding='post')\n", - " batch_segments = segments_test[i: index]\n", + " batch_segments = test_segments[i: index]\n", " batch_segments = pad_sequences(batch_segments, padding='post', value = 4)\n", - " batch_masks = masks_test[i: index]\n", + " batch_masks = test_masks[i: index]\n", " batch_masks = pad_sequences(batch_masks, padding='post', value = 1)\n", " \n", " tags_seq, heads = sess.run(\n", @@ -1783,7 +2286,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 79, "metadata": {}, "outputs": [], 
"source": [ @@ -1793,12 +2296,12 @@ " \n", "temp_predict_Y = []\n", "for r in predict_Y:\n", - " temp_predict_Y.extend(r)" + " temp_predict_Y.extend(r)\n" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 80, "metadata": {}, "outputs": [ { @@ -1807,43 +2310,42 @@ "text": [ " precision recall f1-score support\n", "\n", - " PAD 0.99998 1.00000 0.99999 632972\n", - " X 1.00000 0.99997 0.99999 143586\n", - " acl 0.98091 0.98226 0.98158 5806\n", - " advcl 0.97098 0.95161 0.96120 2356\n", - " advmod 0.98802 0.97806 0.98302 9527\n", - " amod 0.95966 0.97100 0.96530 8208\n", - " appos 0.98846 0.98947 0.98896 4936\n", - " aux 1.00000 1.00000 1.00000 10\n", - " case 0.99454 0.99110 0.99282 21128\n", - " cc 0.98704 0.99518 0.99109 6429\n", - " ccomp 0.89091 0.97313 0.93021 856\n", - " compound 0.98091 0.96643 0.97362 13079\n", - "compound:plur 0.99068 0.98401 0.98733 1188\n", - " conj 0.98303 0.99214 0.98756 8524\n", - " cop 0.98664 0.99071 0.98867 1938\n", - " csubj 0.96000 0.96000 0.96000 50\n", - " csubj:pass 0.95652 0.91667 0.93617 24\n", - " dep 0.98182 0.96716 0.97444 1005\n", - " det 0.98698 0.97756 0.98225 8065\n", - " fixed 0.96071 0.97162 0.96613 1057\n", - " flat 0.98389 0.99064 0.98726 20411\n", - " iobj 0.96154 0.80645 0.87719 31\n", - " mark 0.96611 0.98539 0.97565 2806\n", - " nmod 0.97956 0.97285 0.97619 8030\n", - " nsubj 0.98317 0.98402 0.98359 12701\n", - " nsubj:pass 0.96930 0.97858 0.97392 3969\n", - " nummod 0.99113 0.99327 0.99220 7879\n", - " obj 0.98266 0.98076 0.98171 10342\n", - " obl 0.98468 0.98256 0.98362 11183\n", - " parataxis 0.95595 0.95455 0.95525 682\n", - " punct 0.99952 0.99949 0.99950 33107\n", - " root 0.98888 0.98888 0.98888 10073\n", - " xcomp 0.95951 0.96027 0.95989 2517\n", + " PAD 0.99976 1.00000 0.99988 339805\n", + " X 1.00000 0.99938 0.99969 62631\n", + " acl 0.84425 0.83292 0.83855 3202\n", + " advcl 0.64532 0.68824 0.66609 1684\n", + " advmod 0.95594 0.94239 0.94912 6700\n", + " amod 0.90791 0.90995 
0.90893 4464\n", + " appos 0.84555 0.77299 0.80765 3088\n", + " case 0.98213 0.98372 0.98292 11117\n", + " cc 0.97966 0.97993 0.97979 3637\n", + " ccomp 0.48588 0.48315 0.48451 356\n", + " compound 0.91807 0.92646 0.92224 11381\n", + "compound:plur 0.51163 0.66667 0.57895 33\n", + " conj 0.90245 0.89455 0.89849 5140\n", + " cop 0.97639 0.97639 0.97639 593\n", + " csubj 0.33333 0.16667 0.22222 6\n", + " csubj:pass 0.00000 0.00000 0.00000 1\n", + " dep 0.66500 0.73684 0.69908 361\n", + " det 0.94574 0.92229 0.93387 3912\n", + " fixed 0.82857 0.79452 0.81119 146\n", + " flat 0.95545 0.97113 0.96323 18638\n", + " iobj 0.00000 0.00000 0.00000 4\n", + " mark 0.91407 0.92163 0.91783 1812\n", + " nmod 0.87013 0.83202 0.85065 4429\n", + " nsubj 0.84932 0.87142 0.86023 6992\n", + " nsubj:pass 0.80982 0.80318 0.80648 1951\n", + " nummod 0.97935 0.94793 0.96338 4302\n", + " obj 0.90603 0.92001 0.91297 6351\n", + " obl 0.87106 0.87054 0.87080 5075\n", + " parataxis 0.46512 0.34739 0.39773 403\n", + " punct 0.99665 0.99622 0.99643 20881\n", + " root 0.90154 0.90740 0.90446 10000\n", + " xcomp 0.74568 0.75453 0.75008 1601\n", "\n", - " accuracy 0.99678 994475\n", - " macro avg 0.97738 0.97381 0.97531 994475\n", - " weighted avg 0.99679 0.99678 0.99678 994475\n", + " accuracy 0.98035 540696\n", + " macro avg 0.78099 0.77564 0.77668 540696\n", + " weighted avg 0.98030 0.98035 0.98029 540696\n", "\n" ] } @@ -1855,16 +2357,16 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "arc accuracy: 0.9310084738376598\n", - "types accuracy: 0.9258795751889828\n", - "root accuracy: 0.9474206349206349\n" + "arc accuracy: 0.8481110435316473\n", + "types accuracy: 0.8274148274750857\n", + "root accuracy: 0.9210116457364005\n" ] } ], @@ -1876,180 +2378,9 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 83, "metadata": {}, - "outputs": [ - { - "data": { - 
"text/plain": [ - "['Placeholder',\n", - " 'Placeholder_1',\n", - " 'Placeholder_2',\n", - " 'Placeholder_3',\n", - " 'Placeholder_4',\n", - " 'Placeholder_5',\n", - " 'W_d',\n", - " 'W_e',\n", - " 'U',\n", - " 'U-bi',\n", - " 'Wl',\n", - " 'Wr',\n", - " 'model/transformer/r_w_bias',\n", - " 'model/transformer/r_r_bias',\n", - " 'model/transformer/word_embedding/lookup_table',\n", - " 'model/transformer/r_s_bias',\n", - " 'model/transformer/seg_embed',\n", - " 'model/transformer/layer_0/rel_attn/q/kernel',\n", - " 'model/transformer/layer_0/rel_attn/k/kernel',\n", - " 'model/transformer/layer_0/rel_attn/v/kernel',\n", - " 'model/transformer/layer_0/rel_attn/r/kernel',\n", - " 'model/transformer/layer_0/rel_attn/o/kernel',\n", - " 'model/transformer/layer_0/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_0/ff/layer_1/kernel',\n", - " 'model/transformer/layer_0/ff/layer_1/bias',\n", - " 'model/transformer/layer_0/ff/layer_2/kernel',\n", - " 'model/transformer/layer_0/ff/layer_2/bias',\n", - " 'model/transformer/layer_0/ff/LayerNorm/gamma',\n", - " 'model/transformer/layer_1/rel_attn/q/kernel',\n", - " 'model/transformer/layer_1/rel_attn/k/kernel',\n", - " 'model/transformer/layer_1/rel_attn/v/kernel',\n", - " 'model/transformer/layer_1/rel_attn/r/kernel',\n", - " 'model/transformer/layer_1/rel_attn/o/kernel',\n", - " 'model/transformer/layer_1/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_1/ff/layer_1/kernel',\n", - " 'model/transformer/layer_1/ff/layer_1/bias',\n", - " 'model/transformer/layer_1/ff/layer_2/kernel',\n", - " 'model/transformer/layer_1/ff/layer_2/bias',\n", - " 'model/transformer/layer_1/ff/LayerNorm/gamma',\n", - " 'model/transformer/layer_2/rel_attn/q/kernel',\n", - " 'model/transformer/layer_2/rel_attn/k/kernel',\n", - " 'model/transformer/layer_2/rel_attn/v/kernel',\n", - " 'model/transformer/layer_2/rel_attn/r/kernel',\n", - " 'model/transformer/layer_2/rel_attn/o/kernel',\n", - " 
'model/transformer/layer_2/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_2/ff/layer_1/kernel',\n", - " 'model/transformer/layer_2/ff/layer_1/bias',\n", - " 'model/transformer/layer_2/ff/layer_2/kernel',\n", - " 'model/transformer/layer_2/ff/layer_2/bias',\n", - " 'model/transformer/layer_2/ff/LayerNorm/gamma',\n", - " 'model/transformer/layer_3/rel_attn/q/kernel',\n", - " 'model/transformer/layer_3/rel_attn/k/kernel',\n", - " 'model/transformer/layer_3/rel_attn/v/kernel',\n", - " 'model/transformer/layer_3/rel_attn/r/kernel',\n", - " 'model/transformer/layer_3/rel_attn/o/kernel',\n", - " 'model/transformer/layer_3/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_3/ff/layer_1/kernel',\n", - " 'model/transformer/layer_3/ff/layer_1/bias',\n", - " 'model/transformer/layer_3/ff/layer_2/kernel',\n", - " 'model/transformer/layer_3/ff/layer_2/bias',\n", - " 'model/transformer/layer_3/ff/LayerNorm/gamma',\n", - " 'model/transformer/layer_4/rel_attn/q/kernel',\n", - " 'model/transformer/layer_4/rel_attn/k/kernel',\n", - " 'model/transformer/layer_4/rel_attn/v/kernel',\n", - " 'model/transformer/layer_4/rel_attn/r/kernel',\n", - " 'model/transformer/layer_4/rel_attn/o/kernel',\n", - " 'model/transformer/layer_4/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_4/ff/layer_1/kernel',\n", - " 'model/transformer/layer_4/ff/layer_1/bias',\n", - " 'model/transformer/layer_4/ff/layer_2/kernel',\n", - " 'model/transformer/layer_4/ff/layer_2/bias',\n", - " 'model/transformer/layer_4/ff/LayerNorm/gamma',\n", - " 'model/transformer/layer_5/rel_attn/q/kernel',\n", - " 'model/transformer/layer_5/rel_attn/k/kernel',\n", - " 'model/transformer/layer_5/rel_attn/v/kernel',\n", - " 'model/transformer/layer_5/rel_attn/r/kernel',\n", - " 'model/transformer/layer_5/rel_attn/o/kernel',\n", - " 'model/transformer/layer_5/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_5/ff/layer_1/kernel',\n", - " 'model/transformer/layer_5/ff/layer_1/bias',\n", - " 
'model/transformer/layer_5/ff/layer_2/kernel',\n", - " 'model/transformer/layer_5/ff/layer_2/bias',\n", - " 'model/transformer/layer_5/ff/LayerNorm/gamma',\n", - " 'model/transformer/layer_6/rel_attn/q/kernel',\n", - " 'model/transformer/layer_6/rel_attn/k/kernel',\n", - " 'model/transformer/layer_6/rel_attn/v/kernel',\n", - " 'model/transformer/layer_6/rel_attn/r/kernel',\n", - " 'model/transformer/layer_6/rel_attn/o/kernel',\n", - " 'model/transformer/layer_6/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_6/ff/layer_1/kernel',\n", - " 'model/transformer/layer_6/ff/layer_1/bias',\n", - " 'model/transformer/layer_6/ff/layer_2/kernel',\n", - " 'model/transformer/layer_6/ff/layer_2/bias',\n", - " 'model/transformer/layer_6/ff/LayerNorm/gamma',\n", - " 'model/transformer/layer_7/rel_attn/q/kernel',\n", - " 'model/transformer/layer_7/rel_attn/k/kernel',\n", - " 'model/transformer/layer_7/rel_attn/v/kernel',\n", - " 'model/transformer/layer_7/rel_attn/r/kernel',\n", - " 'model/transformer/layer_7/rel_attn/o/kernel',\n", - " 'model/transformer/layer_7/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_7/ff/layer_1/kernel',\n", - " 'model/transformer/layer_7/ff/layer_1/bias',\n", - " 'model/transformer/layer_7/ff/layer_2/kernel',\n", - " 'model/transformer/layer_7/ff/layer_2/bias',\n", - " 'model/transformer/layer_7/ff/LayerNorm/gamma',\n", - " 'model/transformer/layer_8/rel_attn/q/kernel',\n", - " 'model/transformer/layer_8/rel_attn/k/kernel',\n", - " 'model/transformer/layer_8/rel_attn/v/kernel',\n", - " 'model/transformer/layer_8/rel_attn/r/kernel',\n", - " 'model/transformer/layer_8/rel_attn/o/kernel',\n", - " 'model/transformer/layer_8/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_8/ff/layer_1/kernel',\n", - " 'model/transformer/layer_8/ff/layer_1/bias',\n", - " 'model/transformer/layer_8/ff/layer_2/kernel',\n", - " 'model/transformer/layer_8/ff/layer_2/bias',\n", - " 'model/transformer/layer_8/ff/LayerNorm/gamma',\n", - " 
'model/transformer/layer_9/rel_attn/q/kernel',\n", - " 'model/transformer/layer_9/rel_attn/k/kernel',\n", - " 'model/transformer/layer_9/rel_attn/v/kernel',\n", - " 'model/transformer/layer_9/rel_attn/r/kernel',\n", - " 'model/transformer/layer_9/rel_attn/o/kernel',\n", - " 'model/transformer/layer_9/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_9/ff/layer_1/kernel',\n", - " 'model/transformer/layer_9/ff/layer_1/bias',\n", - " 'model/transformer/layer_9/ff/layer_2/kernel',\n", - " 'model/transformer/layer_9/ff/layer_2/bias',\n", - " 'model/transformer/layer_9/ff/LayerNorm/gamma',\n", - " 'model/transformer/layer_10/rel_attn/q/kernel',\n", - " 'model/transformer/layer_10/rel_attn/k/kernel',\n", - " 'model/transformer/layer_10/rel_attn/v/kernel',\n", - " 'model/transformer/layer_10/rel_attn/r/kernel',\n", - " 'model/transformer/layer_10/rel_attn/o/kernel',\n", - " 'model/transformer/layer_10/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_10/ff/layer_1/kernel',\n", - " 'model/transformer/layer_10/ff/layer_1/bias',\n", - " 'model/transformer/layer_10/ff/layer_2/kernel',\n", - " 'model/transformer/layer_10/ff/layer_2/bias',\n", - " 'model/transformer/layer_10/ff/LayerNorm/gamma',\n", - " 'model/transformer/layer_11/rel_attn/q/kernel',\n", - " 'model/transformer/layer_11/rel_attn/k/kernel',\n", - " 'model/transformer/layer_11/rel_attn/v/kernel',\n", - " 'model/transformer/layer_11/rel_attn/r/kernel',\n", - " 'model/transformer/layer_11/rel_attn/o/kernel',\n", - " 'model/transformer/layer_11/rel_attn/LayerNorm/gamma',\n", - " 'model/transformer/layer_11/ff/layer_1/kernel',\n", - " 'model/transformer/layer_11/ff/layer_1/bias',\n", - " 'model/transformer/layer_11/ff/layer_2/kernel',\n", - " 'model/transformer/layer_11/ff/layer_2/bias',\n", - " 'model/transformer/layer_11/ff/LayerNorm/gamma',\n", - " 'dense/kernel',\n", - " 'dense/bias',\n", - " 'dense_1/kernel',\n", - " 'dense_1/bias',\n", - " 'dense_2/kernel',\n", - " 'dense_2/bias',\n", - " 
'dense_3/kernel',\n", - " 'dense_3/bias',\n", - " 'heads_seq',\n", - " 'tags_seq',\n", - " 'transitions',\n", - " 'logits']" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "strings = ','.join(\n", " [\n", @@ -2067,13 +2398,12 @@ " and 'adam' not in n.name\n", " and 'gradients/bert' not in n.name\n", " ]\n", - ")\n", - "strings.split(',')" + ")" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -2108,7 +2438,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 85, "metadata": {}, "outputs": [ { @@ -2116,7 +2446,7 @@ "output_type": "stream", "text": [ "INFO:tensorflow:Restoring parameters from xlnet-base-dependency/model.ckpt\n", - "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", + "WARNING:tensorflow:From :23: convert_variables_to_constants (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use `tf.compat.v1.graph_util.convert_variables_to_constants`\n", "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/graph_util_impl.py:277: extract_sub_graph (from tensorflow.python.framework.graph_util_impl) is deprecated and will be removed in a future version.\n", @@ -2134,215 +2464,57 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ - "def merge_sentencepiece_tokens_tagging(x, y):\n", - " new_paired_tokens = []\n", - " n_tokens = len(x)\n", - " rejected = ['', '']\n", - "\n", - " i = 0\n", - "\n", - " while i < n_tokens:\n", - "\n", - " current_token, current_label = x[i], y[i]\n", - " if not current_token.startswith('▁') and current_token not in rejected:\n", - " 
previous_token, previous_label = new_paired_tokens.pop()\n", - " merged_token = previous_token\n", - " merged_label = [previous_label]\n", - " while (\n", - " not current_token.startswith('▁')\n", - " and current_token not in rejected\n", - " ):\n", - " merged_token = merged_token + current_token.replace('▁', '')\n", - " merged_label.append(current_label)\n", - " i = i + 1\n", - " current_token, current_label = x[i], y[i]\n", - " merged_label = merged_label[0]\n", - " new_paired_tokens.append((merged_token, merged_label))\n", - "\n", - " else:\n", - " new_paired_tokens.append((current_token, current_label))\n", - " i = i + 1\n", - "\n", - " words = [\n", - " i[0].replace('▁', '')\n", - " for i in new_paired_tokens\n", - " if i[0] not in ['', '']\n", - " ]\n", - " labels = [i[1] for i in new_paired_tokens if i[0] not in ['', '']]\n", - " return words, labels" + "transforms = ['add_default_attributes',\n", + " 'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',\n", + " 'fold_batch_norms',\n", + " 'fold_old_batch_norms',\n", + " 'quantize_weights(fallback_min=-10, fallback_max=10)',\n", + " 'strip_unused_nodes',\n", + " 'sort_by_execution_order']" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 87, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['Kuala', 'Lumpur:', 'Sempena', 'sambutan', 'Aidilfitri', 'minggu', 'depan,', 'Perdana', 'Menteri', 'Tun', 'Dr', 'Mahathir', 'Mohamad', 'dan', 'Menteri', 'Pengangkutan', 'Anthony', 'Loke', 'Siew', 'Fook', 'menitipkan', 'pesanan', 'khas', 'kepada', 'orang', 'ramai', 'yang', 'mahu', 'pulang', 'ke', 'kampung', 'halaman', 'masing-masing.', 'Dalam', 'video', 'pendek', 'terbitan', 'Jabatan', 'Keselamatan', 'Jalan', 'Raya', '(Jkjr)', 'itu,', 'Dr', 'Mahathir', 'menasihati', 'mereka', 'supaya', 'berhenti', 'berehat', 'dan', 'tidur', 'sebentar', 'sekiranya', 'mengantuk', 'ketika', 'memandu.'] 57\n" - ] - }, - { - "data": { - "text/plain": [ - "73" - ] - }, - 
"execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "string = 'KUALA LUMPUR: Sempena sambutan Aidilfitri minggu depan, Perdana Menteri Tun Dr Mahathir Mohamad dan Menteri Pengangkutan Anthony Loke Siew Fook menitipkan pesanan khas kepada orang ramai yang mahu pulang ke kampung halaman masing-masing. Dalam video pendek terbitan Jabatan Keselamatan Jalan Raya (JKJR) itu, Dr Mahathir menasihati mereka supaya berhenti berehat dan tidur sebentar sekiranya mengantuk ketika memandu.'\n", - "\n", - "import re\n", - "\n", - "def entities_textcleaning(string, lowering = False):\n", - " \"\"\"\n", - " use by entities recognition, pos recognition and dependency parsing\n", - " \"\"\"\n", - " string = re.sub('[^A-Za-z0-9\\-\\/():,. ]+', ' ', string)\n", - " string = re.sub(r'[ ]+', ' ', string).strip()\n", - " original_string = string.split()\n", - " if lowering:\n", - " string = string.lower()\n", - " string = [\n", - " (original_string[no], word.title() if word.isupper() else word)\n", - " for no, word in enumerate(string.split())\n", - " if len(word)\n", - " ]\n", - " return [s[0] for s in string], [s[1] for s in string]\n", - "\n", - "def parse_X(left):\n", - " left = ' '.join(left)\n", - " bert_tokens = tokenize_fn(left)\n", - " bert_tokens.extend([4, 3])\n", - " segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]\n", - " input_mask = [0] * len(segment)\n", - " s_tokens = [sp_model.IdToPiece(i) for i in bert_tokens]\n", - " return bert_tokens, segment, input_mask, s_tokens\n", - "\n", - "sequence = entities_textcleaning(string)[1]\n", - "print(sequence, len(sequence))\n", - "parsed_sequence, segment_sequence, mask_sequence, xlnet_sequence = parse_X(sequence)\n", - "len(parsed_sequence)" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py:1750: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n", - " warnings.warn('An interactive session is already active. This can '\n" + "WARNING:tensorflow:From :6: FastGFile.__init__ (from tensorflow.python.platform.gfile) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.gfile.GFile.\n" ] } ], "source": [ - "def load_graph(frozen_graph_filename):\n", - " with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:\n", - " graph_def = tf.GraphDef()\n", - " graph_def.ParseFromString(f.read())\n", - " with tf.Graph().as_default() as graph:\n", - " tf.import_graph_def(graph_def)\n", - " return graph\n", + "from tensorflow.tools.graph_transforms import TransformGraph\n", + "tf.set_random_seed(0)\n", "\n", - "g = load_graph('xlnet-base-dependency/frozen_model.pb')\n", - "x = g.get_tensor_by_name('import/Placeholder:0')\n", - "seg = g.get_tensor_by_name('import/Placeholder_1:0')\n", - "m = g.get_tensor_by_name('import/Placeholder_2:0')\n", - "heads_seq = g.get_tensor_by_name('import/heads_seq:0')\n", - "tags_seq = g.get_tensor_by_name('import/logits:0')\n", - "test_sess = tf.InteractiveSession(graph = g)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [], - "source": [ - "h, t = test_sess.run([heads_seq, tags_seq],\n", - " feed_dict = {\n", - " x: [parsed_sequence],\n", - " seg: [segment_sequence],\n", - " m: [mask_sequence],\n", - " },\n", - ")\n", - "h = h[0] - 1\n", - "t = [idx2tag[d] for d in t[0]]\n", - "merged_h = merge_sentencepiece_tokens_tagging(xlnet_sequence, h)\n", - "merged_t = merge_sentencepiece_tokens_tagging(xlnet_sequence, t)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('Kuala', 23), ('Lumpur:', 1), ('Sempena', 5), ('sambutan', 23), ('Aidilfitri', 5), ('minggu', 6), ('depan,', 7), ('Perdana', 23), ('Menteri', 10), ('Tun', 11), ('Dr', 12), ('Mahathir', 21), ('Mohamad', 21), ('dan', 16), ('Menteri', 10), ('Pengangkutan', 17), ('Anthony', 24), ('Loke', 19), ('Siew', 21), ('Fook', 21), ('menitipkan', 0), ('pesanan', 23), ('khas', 24), ('kepada', 27), ('orang', 24), ('ramai', 27), ('yang', 31), ('mahu', 31), ('pulang', 27), ('ke', 33), ('kampung', 31), ('halaman', 33), ('masing-masing.', 33), ('Dalam', 38), ('video', 50), ('pendek', 38), ('terbitan', 38), ('Jabatan', 39), ('Keselamatan', 40), ('Jalan', 41), ('Raya', 41), ('(Jkjr)', 50), ('itu,', 38), ('Dr', 50), ('Mahathir', 50), ('menasihati', 23), ('mereka', 50), ('supaya', 52), ('berhenti', 50), ('berehat', 52), ('dan', 54), ('tidur', 52), ('sebentar', 56), ('sekiranya', 58), ('mengantuk', 54), ('ketika', 59), ('memandu.', 58)]\n" - ] - } - ], - "source": [ - "print(list(zip(merged_h[0], merged_h[1])))" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('Kuala', 'nsubj'), ('Lumpur:', 'flat'), ('Sempena', 'case'), ('sambutan', 'obl'), ('Aidilfitri', 'compound'), ('minggu', 'compound'), ('depan,', 'compound'), ('Perdana', 'nsubj'), ('Menteri', 'flat'), ('Tun', 'flat'), ('Dr', 'flat'), ('Mahathir', 'flat'), ('Mohamad', 'flat'), ('dan', 'cc'), ('Menteri', 'conj'), ('Pengangkutan', 'flat'), ('Anthony', 'flat'), ('Loke', 'flat'), ('Siew', 'flat'), ('Fook', 'flat'), ('menitipkan', 'root'), ('pesanan', 'obj'), ('khas', 'amod'), ('kepada', 'case'), ('orang', 'nmod'), ('ramai', 'compound'), ('yang', 'nsubj'), ('mahu', 'advmod'), ('pulang', 'acl'), ('ke', 'case'), ('kampung', 'obl'), ('halaman', 'compound'), ('masing-masing.', 'det'), ('Dalam', 'case'), ('video', 'obl'), ('pendek', 'amod'), ('terbitan', 
'compound'), ('Jabatan', 'flat'), ('Keselamatan', 'flat'), ('Jalan', 'flat'), ('Raya', 'flat'), ('(Jkjr)', 'punct'), ('itu,', 'det'), ('Dr', 'nsubj'), ('Mahathir', 'flat'), ('menasihati', 'parataxis'), ('mereka', 'obj'), ('supaya', 'case'), ('berhenti', 'xcomp'), ('berehat', 'xcomp'), ('dan', 'cc'), ('tidur', 'conj'), ('sebentar', 'advmod'), ('sekiranya', 'mark'), ('mengantuk', 'ccomp'), ('ketika', 'mark'), ('memandu.', 'ccomp')]\n" - ] - } - ], - "source": [ - "print(list(zip(merged_t[0], merged_t[1])))" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", + "pb = 'xlnet-base-dependency/frozen_model.pb'\n", + "input_graph_def = tf.GraphDef()\n", + "with tf.gfile.FastGFile(pb, 'rb') as f:\n", + " input_graph_def.ParseFromString(f.read())\n", "\n", - "bucketName = 'huseinhouse-storage'\n", - "Key = 'xlnet-base-dependency/frozen_model.pb'\n", - "outPutname = \"v34/dependency/xlnet-base-dependency.pb\"\n", + "if 'bert' in pb:\n", + " inputs = ['Placeholder']\n", + " a = ['dense/BiasAdd']\n", + "if 'xlnet' in pb:\n", + " inputs = ['Placeholder', 'Placeholder_1', 'Placeholder_2']\n", + " a = ['transpose_3']\n", "\n", - "s3 = boto3.client('s3')\n", + "transformed_graph_def = TransformGraph(input_graph_def, \n", + " inputs,\n", + " ['logits', 'heads_seq'] + a, transforms)\n", "\n", - "s3.upload_file(Key,bucketName,outPutname)" + "with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:\n", + " f.write(transformed_graph_def.SerializeToString())" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -2361,7 +2533,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4,