From aae77d9feb1a597b926e738ef02f6d8a8c97787a Mon Sep 17 00:00:00 2001 From: huseinzol05 Date: Thu, 17 Mar 2022 19:59:38 +0800 Subject: [PATCH] fix sentiment docs --- docs/load-sentiment.ipynb | 31 +- example/sentiment/load-sentiment.ipynb | 31 +- load-tokenizer.ipynb | 1527 ------------------------ malaya/__init__.py | 2 +- setup.py | 2 +- 5 files changed, 62 insertions(+), 1531 deletions(-) delete mode 100644 load-tokenizer.ipynb diff --git a/docs/load-sentiment.ipynb b/docs/load-sentiment.ipynb index d3bdce75..53ccaaf2 100644 --- a/docs/load-sentiment.ipynb +++ b/docs/load-sentiment.ipynb @@ -661,7 +661,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Open emotion visualization dashboard\n", + "#### Open sentiment visualization dashboard\n", "\n", "Default when you call `predict_words` it will open a browser with visualization dashboard, you can disable by `visualization=False`.\n", "\n", @@ -1186,6 +1186,35 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } }, "nbformat": 4, diff --git a/example/sentiment/load-sentiment.ipynb b/example/sentiment/load-sentiment.ipynb index d3bdce75..53ccaaf2 100644 --- a/example/sentiment/load-sentiment.ipynb +++ b/example/sentiment/load-sentiment.ipynb @@ -661,7 +661,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Open emotion visualization dashboard\n", + "#### Open sentiment visualization dashboard\n", "\n", "Default when you call `predict_words` it will open a browser with visualization dashboard, you can disable by `visualization=False`.\n", "\n", @@ -1186,6 +1186,35 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } }, "nbformat": 4, diff --git a/load-tokenizer.ipynb b/load-tokenizer.ipynb deleted file mode 100644 index 70d6a7e5..00000000 --- a/load-tokenizer.ipynb +++ /dev/null @@ -1,1527 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Word and sentence tokenizer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "\n", - "This tutorial is available as an IPython notebook at [Malaya/example/tokenizer](https://github.com/huseinzol05/Malaya/tree/master/example/tokenizer).\n", - " \n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 6.52 s, sys: 1.42 s, total: 7.94 s\n", - "Wall time: 9.94 s\n" - ] - } - ], - "source": [ - "%%time\n", - "import malaya" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle, pada'\n", - "string2 = 'i mmg2 xske mknn HUSEIN kampng tmpat, i love them. pelikle saye'\n", - "string3 = 'perdana menteri ke11 sgt suka makn ayam, harganya cuma rm15.50'\n", - "string4 = 'pada 10/4, kementerian mengumumkan, 1/100'\n", - "string5 = 'Husein Zolkepli dapat tempat ke-12 lumba lari hari ni'\n", - "string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'\n", - "string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'\n", - "string8 = 'online & desktop: regexr.com or download the desktop version for Mac'\n", - "string9 = 'belajaq unity di google.us.edi?34535/534534?dfg=g&fg unity'\n", - "string10 = 'Gambar ni membantu. Gambar tutorial >>. facebook. com/story. story_fbid=10206183032200965&id=1418962070'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load word tokenizer\n", - "\n", - "```python\n", - "class Tokenizer:\n", - " def __init__(self, lowercase: bool = False, **kwargs):\n", - " \"\"\"\n", - " Load Tokenizer object.\n", - " Check supported regex pattern at \n", - " https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85\n", - "\n", - " Parameters\n", - " ----------\n", - " lowercase: bool, optional (default=False)\n", - " lowercase tokens.\n", - " emojis: bool, optional (default=True)\n", - " True to keep emojis.\n", - " urls: bool, optional (default=True)\n", - " True to keep urls.\n", - " urls_improved: bool, optional (default=True)\n", - " True to keep urls, better version.\n", - " tags: bool, optional (default=True)\n", - " True to keep tags: .\n", - " emails: bool, optional (default=True)\n", - " True to keep emails.\n", - " users: bool, optional (default=True)\n", - " True to keep users handles: @cbaziotis.\n", - " hashtags: bool, optional (default=True)\n", - " True to keep hashtags.\n", - " phones: bool, optional (default=True)\n", - " True to keep phones.\n", - " percents: bool, optional (default=True)\n", - " True to keep percents.\n", - " money: bool, optional (default=True)\n", - " True to keep money expressions.\n", - " date: bool, optional (default=True)\n", - " True to keep date expressions.\n", - " time: bool, optional (default=True)\n", - " True to keep time expressions.\n", - " acronyms: bool, optional (default=True)\n", - " True to keep acronyms.\n", - " emoticons: bool, optional (default=True)\n", - " True to keep emoticons.\n", - " censored: bool, optional (default=True)\n", - " True to keep censored words: f**k.\n", - " emphasis: bool, optional (default=True)\n", - " True to keep words with emphasis: *very* good.\n", - " numbers: bool, optional (default=True)\n", - " True to keep numbers.\n", - " temperature: bool, optional (default=True)\n", - " True to keep temperatures\n", - " distance: bool, optional (default=True)\n", - " True to keep distances.\n", - " volume: bool, optional (default=True)\n", - " True to keep volumes.\n", - " duration: bool, optional (default=True)\n", - " True to keep durations.\n", - " weight: bool, optional (default=True)\n", - " True to keep weights.\n", - " hypen: bool, optional (default=True)\n", - " True to keep hypens.\n", - " \"\"\"\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "tokenizer = malaya.tokenizer.Tokenizer()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['xjdi',\n", - " 'ke',\n", - " ',',\n", - " 'y',\n", - " 'u',\n", - " 'xsuke',\n", - " 'makan',\n", - " 'HUSEIN',\n", - " 'kt',\n", - " 'situ',\n", - " 'tmpt',\n", - " ',',\n", - " 'i',\n", - " 'hate',\n", - " 'it',\n", - " '.',\n", - " 'pelikle',\n", - " ',',\n", - " 'pada']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize(string1)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['i',\n", - " 'mmg2',\n", - " 'xske',\n", - " 'mknn',\n", - " 'HUSEIN',\n", - " 'kampng',\n", - " 'tmpat',\n", - " ',',\n", - " 'i',\n", - " 'love',\n", - " 'them',\n", - " '.',\n", - " 'pelikle',\n", - " 'saye']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize(string2)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['perdana',\n", - " 'menteri',\n", - " 'ke11',\n", - " 'sgt',\n", - " 'suka',\n", - " 'makn',\n", - " 'ayam',\n", - " ',',\n", - " 'harganya',\n", - " 'cuma',\n", - " 'rm15.50']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize(string3)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['pada',\n", - " '10',\n", - " '/',\n", - " '4',\n", - " ',',\n", - " 'kementerian',\n", - " 'mengumumkan',\n", - " ',',\n", - " '1',\n", - " '/',\n", - " '100']" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize(string4)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Husein',\n", - " 'Zolkepli',\n", - " '(',\n", - " '2011',\n", - " '-',\n", - " '2019',\n", - " ')',\n", - " 'adalah',\n", - " 'ketua',\n", - " 'kampng',\n", - " 'di',\n", - " 'kedah',\n", - " 'sekolah',\n", - " 'King',\n", - " 'Edward',\n", - " 'ke-IV']" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize(string6)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['2jam',\n", - " '30 minit',\n", - " 'aku',\n", - " 'tunggu',\n", - " 'kau',\n", - " ',',\n", - " '60.1 kg',\n", - " 'kau',\n", - " 'ni',\n", - " ',',\n", - " 'suhu',\n", - " 'harini',\n", - " '31.2c',\n", - " ',',\n", - " 'aku',\n", - " 'dahaga',\n", - " 'minum',\n", - " '600ml']" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize(string7)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['online',\n", - " '&',\n", - " 'desktop',\n", - " ':',\n", - " 'regexr.com',\n", - " 'or',\n", - " 'download',\n", - " 'the',\n", - " 'desktop',\n", - " 'version',\n", - " 'for',\n", - " 'Mac']" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize(string8)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['belajaq', 'unity', 'di', 'google.us.edi?34535/534534?dfg=g&fg', 'unity']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize(string9)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### url" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['website', 'saya', 'http://huseinhouse.com']" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('website saya http://huseinhouse.com')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['website', 'saya', 'huseinhouse.com']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('website saya huseinhouse.com')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['website', 'saya', 'huseinhouse.com/pelik?a=1']" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('website saya huseinhouse.com/pelik?a=1')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### tags" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['panggil', 'saya', '']" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('panggil saya ')" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['panggil', 'saya', '<', 'husein', '>']" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('panggil saya ')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### emails" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['email', 'saya', 'husein@rumah.com']" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('email saya husein@rumah.com')" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['email', 'saya', 'husein@rumah.com.my']" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('email saya husein@rumah.com.my')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### users" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['twitter', 'saya', '@husein123zolkepli']" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('twitter saya @husein123zolkepli')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['twitter', 'saya', '@', 'husein123zolkepli']" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('twitter saya @ husein123zolkepli')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### hashtags" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['panggil', 'saya', '#huseincomel']" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('panggil saya #huseincomel')" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['panggil', 'saya', '#', 'huseincomel']" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('panggil saya # huseincomel')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### phones" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['call', 'sye', 'di', '013-1234567']" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('call sye di 013-1234567')" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['call', 'sye', 'di', '013', '-', '1234567']" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('call sye di 013- 1234567')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### percents" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['saya', 'sokong', '100%']" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('saya sokong 100%')" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['saya', 'sokong', '100', '%']" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('saya sokong 100 %')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### money" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['saya', 'tinggal', 'rm100']" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('saya tinggal rm100')" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['saya', 'tinggal', 'rm100k']" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('saya tinggal rm100k')" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['saya', 'tinggal', 'rm100M']" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('saya tinggal rm100M')" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['saya', 'tinggal', 'rm100.123M']" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('saya tinggal rm100.123M')" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['saya', 'tinggal', '40 sen']" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('saya tinggal 40 sen')" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['saya', 'tinggal', '21 ringgit', '50 sen']" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('saya tinggal 21 ringgit 50 sen')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### date" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['tarikh', 'perjumpaan', '10/11/2011']" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('tarikh perjumpaan 10/11/2011')" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['tarikh', 'perjumpaan', '10-11-2011']" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('tarikh perjumpaan 10-11-2011')" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['tarikh', 'perjumpaan', '12 mei 2011']" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('tarikh perjumpaan 12 mei 2011')" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['tarikh', 'perjumpaan', 'mei 12 2011']" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('tarikh perjumpaan mei 12 2011')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### time" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['jumpa', '3 am']" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('jumpa 3 am')" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['jumpa', '22:00']" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('jumpa 22:00')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### censored" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['f**k', 'lah']" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('f**k lah')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### emphasis" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['*damn*', 'good', 'weih']" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('*damn* good weih')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### numbers" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['no', 'saya', '123']" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('no saya 123')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### temperature" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['sejuk', 'harini', ',', '31.1c']" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('sejuk harini, 31.1c')" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['sejuk', 'harini', ',', '31.1C']" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('sejuk harini, 31.1C')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### distance" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['nak', 'sampai', 'lagi', '31km']" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('nak sampai lagi 31km')" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['nak', 'sampai', 'lagi', '31 km']" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('nak sampai lagi 31 km')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### volume" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['botol', 'ni', '400ml']" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('botol ni 400ml')" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['botol', 'ni', '400 l']" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('botol ni 400 l')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### duration" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['aku', 'dah', 'tunggu', 'kau', '2jam', 'kut']" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('aku dah tunggu kau 2jam kut')" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['aku', 'dah', 'tunggu', 'kau', '2 jam', 'kut']" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('aku dah tunggu kau 2 jam kut')" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['lagi', '10 minit', '3 jam']" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('lagi 10 minit 3 jam')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### weight" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['berat', 'kau', '60 kg']" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('berat kau 60 kg')" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['berat', 'kau', '60kg']" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('berat kau 60kg')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### hypen" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['sememang-memangnya', 'kau', 'sakai']" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('sememang-memangnya kau sakai')" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['sememang', '-', 'memangnya', 'kau', 'sakai']" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.tokenize('sememang- memangnya kau sakai')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sentence tokenizer\n", - "\n", - "We considered prefixes, suffixes, starters, acronyms, websites, emails, digits, before digits, time and month to split a sentence into multiple sentences.\n", - "\n", - "```python\n", - "class SentenceTokenizer:\n", - " def __init__(self):\n", - " pass\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "s = \"\"\"\n", - "no.1 polis bertemu dengan suspek di ladang getah. polis tembak pui pui pui bertubi tubi\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ - "s_tokenizer = malaya.tokenizer.SentenceTokenizer()" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['no.1 polis bertemu dengan suspek di ladang getah.',\n", - " 'polis tembak pui pui pui bertubi tubi.']" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s_tokenizer.tokenize(s)" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "s = \"\"\"\n", - "email saya di husein.zol01@gmail.com, nanti jom berkopi\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['email saya di husein.zol01@gmail.com, nanti jom berkopi.']" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s_tokenizer.tokenize(s)" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "s = \"\"\"\n", - "ke. 2 cerita nya begini. saya berjalan jalan ditepi muara jumpa anak dara.\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['ke.2 cerita nya begini.',\n", - " 'saya berjalan jalan ditepi muara jumpa anak dara.']" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "s_tokenizer.tokenize(s)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/malaya/__init__.py b/malaya/__init__.py index d4a55b7c..dbd8ba42 100644 --- a/malaya/__init__.py +++ b/malaya/__init__.py @@ -9,7 +9,7 @@ from malaya_boilerplate.utils import get_home version = '4.7' -bump_version = '4.7.2' +bump_version = '4.7.3' __version__ = bump_version package = 'malaya' diff --git a/setup.py b/setup.py index ac8a86fc..7d32563d 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def readme(): setuptools.setup( name=__packagename__, packages=setuptools.find_packages(), - version='4.7.2', + version='4.7.3', python_requires='>=3.6.*', description='Natural-Language-Toolkit for bahasa Malaysia, powered by Deep Learning Tensorflow.', long_description=readme(),