Skip to content

Commit

Permalink
finalize 5.0
Browse files Browse the repository at this point in the history
  • Loading branch information
huseinzol05 committed Dec 19, 2022
1 parent 6d48535 commit 3c056cb
Show file tree
Hide file tree
Showing 26 changed files with 3,134 additions and 1,060 deletions.
216 changes: 160 additions & 56 deletions docs/load-augmentation-abstractive.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.02 s, sys: 3.63 s, total: 6.65 s\n",
"Wall time: 2.51 s\n"
"CPU times: user 3.23 s, sys: 3.54 s, total: 6.77 s\n",
"Wall time: 2.25 s\n"
]
}
],
Expand Down Expand Up @@ -106,45 +106,52 @@
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Size (MB)</th>\n",
" <th>ROUGE-1</th>\n",
" <th>ROUGE-2</th>\n",
" <th>ROUGE-L</th>\n",
" <th>BLEU</th>\n",
" <th>SacreBLEU Verbose</th>\n",
" <th>Suggested length</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>mesolitica/finetune-noisy-translation-t5-tiny-bahasa-cased-v2</th>\n",
" <td>139</td>\n",
" <td>60.000967</td>\n",
" <td>77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ...</td>\n",
" <td>256</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4</th>\n",
" <td>242.0</td>\n",
" <td>0.757218</td>\n",
" <td>0.496729</td>\n",
" <td>0.304022</td>\n",
" <td>256.0</td>\n",
" <td>242</td>\n",
" <td>64.062582</td>\n",
" <td>80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ...</td>\n",
" <td>256</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mesolitica/finetune-noisy-translation-t5-base-bahasa-cased-v2</th>\n",
" <td>892.0</td>\n",
" <td>0.713227</td>\n",
" <td>0.470135</td>\n",
" <td>0.366797</td>\n",
" <td>256.0</td>\n",
" <td>892</td>\n",
" <td>64.583819</td>\n",
" <td>80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ...</td>\n",
" <td>256</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Size (MB) ROUGE-1 \\\n",
"mesolitica/finetune-noisy-translation-t5-small-... 242.0 0.757218 \n",
"mesolitica/finetune-noisy-translation-t5-base-b... 892.0 0.713227 \n",
" Size (MB) BLEU \\\n",
"mesolitica/finetune-noisy-translation-t5-tiny-b... 139 60.000967 \n",
"mesolitica/finetune-noisy-translation-t5-small-... 242 64.062582 \n",
"mesolitica/finetune-noisy-translation-t5-base-b... 892 64.583819 \n",
"\n",
" ROUGE-2 ROUGE-L \\\n",
"mesolitica/finetune-noisy-translation-t5-small-... 0.496729 0.304022 \n",
"mesolitica/finetune-noisy-translation-t5-base-b... 0.470135 0.366797 \n",
" SacreBLEU Verbose \\\n",
"mesolitica/finetune-noisy-translation-t5-tiny-b... 77.9/63.9/54.6/47.7 (BP = 1.000 ratio = 1.036 ... \n",
"mesolitica/finetune-noisy-translation-t5-small-... 80.1/67.7/59.1/52.5 (BP = 1.000 ratio = 1.042 ... \n",
"mesolitica/finetune-noisy-translation-t5-base-b... 80.2/68.1/59.8/53.2 (BP = 1.000 ratio = 1.048 ... \n",
"\n",
" Suggested length \n",
"mesolitica/finetune-noisy-translation-t5-small-... 256.0 \n",
"mesolitica/finetune-noisy-translation-t5-base-b... 256.0 "
" Suggested length \n",
"mesolitica/finetune-noisy-translation-t5-tiny-b... 256 \n",
"mesolitica/finetune-noisy-translation-t5-small-... 256 \n",
"mesolitica/finetune-noisy-translation-t5-base-b... 256 "
]
},
"execution_count": 3,
Expand Down Expand Up @@ -193,9 +200,38 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b9b42d7176db4aa78f8145770bbf783f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/826 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3a251a3b5ae746acbbf0528216fc914d",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading: 0%| | 0.00/242M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model = malaya.augmentation.abstractive.huggingface(model = 'mesolitica/finetune-noisy-translation-t5-small-bahasa-cased-v4')"
]
Expand Down Expand Up @@ -226,7 +262,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -237,18 +273,25 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
]
},
{
"data": {
"text/plain": [
"['Suka makan ayam ikan',\n",
" 'Aku sebenarnya tak suka sangat dekat lelaki tu, ketiak masam sebab tak mandi',\n",
" 'Perdana Menteri berkata, beliau perlu mendapatkan maklumat terperinci mengenai isu tersebut sebelum kerajaan dapat mengambil sebarang tindakan lanjut. Bagaimanapun, beliau yakin masalah itu dapat diselesaikan dan pentadbiran kerajaan boleh berfungsi dengan baik.']"
" 'Perdana Menteri berkata, beliau perlu mendapatkan maklumat terperinci berhubung isu tersebut sebelum kerajaan dapat mengambil sebarang tindakan lanjut. Bagaimanapun, beliau yakin masalah tersebut dapat diselesaikan dan pentadbiran kerajaan dapat berfungsi dengan baik.']"
]
},
"execution_count": 10,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -268,26 +311,53 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Rakyat memang tak suka ko pun',\n",
" 'rakyat memang x suka u pun',\n",
" 'rakyat mmg xsuka ko pon',\n",
" 'rakyat mmg tak suka kau pun',\n",
" 'rakyat mmg x suka kau pun']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"outputs = model.generate(['rakyat memang tak suka awak pun'], \n",
" max_length = 100, do_sample=True, top_k=100, top_p=0.95, temperature=0.7,\n",
" num_return_sequences=5)\n",
"outputs"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"['sy suka makan ayam dan ikan',\n",
" 'aku suka makan ayam ikan',\n",
" 'Suka makan ayam dan ikan',\n",
" 'i actually tak berapa suka sangat dekat laki tu ketiak masam sebab tak mandi',\n",
" 'sebenarnya i tak suka sangat dkt lelaki tu tu ketiak masam sbb x mandi',\n",
" 'aku sebenarnya tak suka sangat dekat lelaki tu, ketiak masam sebab tak mandi',\n",
" 'Perdana Menteri berkata, beliau perlu mendapatkan maklumat terperinci berhubung isu tersebut sebelum kerajaan dapat mengambil sebarang tindakan lanjut. Apa-apa pun, beliau yakin masalah dapat diselesaikan dan pentadbiran kerajaan dapat berfungsi dengan baik.',\n",
" 'Perdana Menteri berkata, beliau perlu mendapatkan maklumat terperinci tentang isu berkenaan sebelum kerajaan dapat mengambil sebarang tindakan lanjut. Bagaimanapun beliau yakin masalah tersebut dapat diselesaikan dan pentadbiran kerajaan boleh berfungsi dengan baik.',\n",
" 'Perdana Menteri berkata, beliau perlu mendapatkan maklumat yang terperinci mengenai isu tersebut sebelum kerajaan dapat mengambil sebarang tindakan lanjut. Apapun beliau yakin masalah itu dapat diselesaikan dan pentadbiran kerajaan boleh berfungsi dengan baik.']"
"['suka nya makan ayam sama ikan',\n",
" 'Suka makan ayam ikan',\n",
" 'Suka dah mkn ayam ikan',\n",
" 'Aku sebenarnya tak berapa suka dekat lelaki tu, ketiak masam sbb aku tak mandi',\n",
" 'Aku sebenarnya tak suka sangat dekat laki tu, ketiak masam sbb tak mandi',\n",
" 'aku sebenarnya tak suka sangat dekat laki tu, ketiak masam sebab tak mandi',\n",
" 'Perdana Menteri berkata, beliau perlu memperoleh maklumat terperinci berkenaan isu tersebut sebelum kerajaan dapat mengambil sebarang tindakan lanjut. Bagaimanapun beliau yakin masalah itu dapat diselesaikan dan pentadbiran kerajaan dapat berfungsi dengan baik.',\n",
" 'Perdana Menteri berkata, beliau perlu mendapatkan maklumat terperinci tentang isu berkenaan sebelum kerajaan dapat mengambil sebarang tindakan lanjut. Walau apa pun beliau yakin masalah itu dapat diselesaikan dan pentadbiran kerajaan boleh berfungsi dengan baik.',\n",
" 'Perdana Menteri berkata, beliau perlu memperoleh maklumat yang terperinci mengenai isu tersebut sebelum kerajaan dapat mengambil sebarang tindakan lanjut. Walau bagaimanapun, beliau yakin masalah tersebut dapat diselesaikan dan pentadbiran kerajaan boleh berfungsi dengan baik.']"
]
},
"execution_count": 11,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -308,7 +378,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -322,27 +392,27 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bodoh betul kerajaan ni, mana kebebasan bersuara',\n",
" 'bodoh betul kerajaan ni mana free speech',\n",
" 'bodoh betul kerajaan ni, mana kebebasan bersuara',\n",
" 'Perbincangan khas juga bertujuan bagi Seri Paduka mendapat pandangan Raja2 Melayu untuk membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat',\n",
" 'Perbincangan khas juga bertujuan bagi Seri Paduka mendapat pandangan Raja2 Melayu bagi membolehkan baginda membuat keputusan yg terbaik demi kepentingan dan kesejahteraan negara serta rakyat',\n",
" 'Perbincangan khas juga bertujuan bagi Seri Paduka mendapat pandangan Raja-Raja Melayu agar beliau membuat keputusan yang terbaik demi kepentingan dan kesejahteraan Negara dan rakyat',\n",
" 'semalam buka tv tgk berita, dia kata ada tanah runtuh',\n",
" 'semalam buka tv tgk news katanya ada tanah runtuh',\n",
" 'semalam aku bukak tv tengok news, katanya ada landslide',\n",
" 'Aku benci dia sangat, perangai buruk!',\n",
" 'i hate him so much, teruknya perangai ye!',\n",
" 'u hate him so much, bad utk perangai!']"
"['bodoh betul kerajaan ni, kebebasan bersuara mana',\n",
" 'Bodoh betul kerajaan ni, kebebasan bersuara mana',\n",
" 'kerajaan ni memang bodoh lah, mana kebebasan bersuara',\n",
" 'Ceramah khas pun bertujuan Seri Paduka Baginda ambil pandangan Raja2 Melayu untuk membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat',\n",
" 'Perbincangan khas juga bertujuan untuk Seri Paduka Baginda mendapat pandangan Raja2 Melayu bagi membolehkan Baginda membuat keputusan terbaik demi dan kesejahteraan negara dan rakyat',\n",
" 'Perbincangan khas juga bertujuan Seri Paduka Baginda mendapat pandangan Raja2 Melayu supaya membolehkan baginda membuat keputusan yang terbaik demi kepentingan dan kesejahteraan negara serta rakyat',\n",
" 'semalam bukak tv tengok berita, katanya ada rebah',\n",
" 'Semalam aku bukak tv tgk berita katanya ada resah',\n",
" 'semalam buka tv tengok berita rupanya ada tanah runtuh',\n",
" 'Aku benci dia sangat2, perangai teruk!',\n",
" 'Aku benci dia sangat2 ni, perangai tu teruk!',\n",
" 'i hate him so much, bad traits!']"
]
},
"execution_count": 21,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -354,6 +424,40 @@
"outputs"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Bodoh betul kerajaan ni, mana ada kebebasan bersuara',\n",
" 'Bodoh betul kerajaan ni, mana kebebasan bersuara',\n",
" 'bodoh betul kerajaan ni, mana kebebasan bersuara',\n",
" 'Perbincangan khas tersebut juga bertujuan Seri Paduka Baginda mendapat pandangan Raja2 Melayu bagi membolehkan beliau membuat keputusan yang terbaik demi dan kemakmuran negara dan rakyat',\n",
" 'Perbincangan khas itu juga bertujuan Seri Paduka Baginda mendapat pandangan Raja2 Melayu bagi membolehkan baginda membuat keputusan terbaik demi kepentingan dan kesejahteraan negara dan rakyat',\n",
" 'Perbincangan khas itu juga bertujuan bagi Seri Paduka Baginda mendapat view dari Raja2 Melayu untuk membolehkan baginda membuat keputusan terbaik demi kepentingan dan kesejahteraan negara dan rakyat',\n",
" 'semalam bukak tv tgk kb katanya ada tumbang',\n",
" 'semalam buka tv tgk berita, katanya ade sangkut2',\n",
" 'semalam bukak tv tgk berita, katanya ada tanah runtuh',\n",
" 'i benci dia sgt ni, perangai x elok!',\n",
" 'aku benci sangat dengan dia ni, perangai dah teruk tau!',\n",
" 'i hate it so much, bad behavior!']"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"outputs = model.generate(strings, \n",
" max_length = 100, do_sample=True, penalty_alpha=0.9, top_k=4, temperature=0.9,\n",
" num_return_sequences=3)\n",
"outputs"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
46 changes: 23 additions & 23 deletions docs/load-dependency-huggingface.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.3 s, sys: 3.22 s, total: 6.51 s\n",
"Wall time: 2.36 s\n"
"CPU times: user 3.31 s, sys: 3.3 s, total: 6.6 s\n",
"Wall time: 2.42 s\n"
]
}
],
Expand Down Expand Up @@ -331,44 +331,44 @@
" <tbody>\n",
" <tr>\n",
" <th>mesolitica/finetune-dependency-t5-tiny-standard-bahasa-cased</th>\n",
" <td>61.2</td>\n",
" <td>0.84929</td>\n",
" <td>0.8281</td>\n",
" <td>0.92099</td>\n",
" <td>143.0</td>\n",
" <td>0.850607</td>\n",
" <td>0.783164</td>\n",
" <td>0.872302</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mesolitica/finetune-dependency-t5-small-standard-bahasa-cased</th>\n",
" <td>61.2</td>\n",
" <td>0.84929</td>\n",
" <td>0.8281</td>\n",
" <td>0.92099</td>\n",
" <td>247.0</td>\n",
" <td>0.849405</td>\n",
" <td>0.783103</td>\n",
" <td>0.866906</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mesolitica/finetune-dependency-t5-base-standard-bahasa-cased</th>\n",
" <td>61.2</td>\n",
" <td>0.84929</td>\n",
" <td>0.8281</td>\n",
" <td>0.92099</td>\n",
" <td>898.0</td>\n",
" <td>0.852892</td>\n",
" <td>0.784091</td>\n",
" <td>0.859712</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Size (MB) Arc Accuracy \\\n",
"mesolitica/finetune-dependency-t5-tiny-standard... 61.2 0.84929 \n",
"mesolitica/finetune-dependency-t5-small-standar... 61.2 0.84929 \n",
"mesolitica/finetune-dependency-t5-base-standard... 61.2 0.84929 \n",
"mesolitica/finetune-dependency-t5-tiny-standard... 143.0 0.850607 \n",
"mesolitica/finetune-dependency-t5-small-standar... 247.0 0.849405 \n",
"mesolitica/finetune-dependency-t5-base-standard... 898.0 0.852892 \n",
"\n",
" Types Accuracy \\\n",
"mesolitica/finetune-dependency-t5-tiny-standard... 0.8281 \n",
"mesolitica/finetune-dependency-t5-small-standar... 0.8281 \n",
"mesolitica/finetune-dependency-t5-base-standard... 0.8281 \n",
"mesolitica/finetune-dependency-t5-tiny-standard... 0.783164 \n",
"mesolitica/finetune-dependency-t5-small-standar... 0.783103 \n",
"mesolitica/finetune-dependency-t5-base-standard... 0.784091 \n",
"\n",
" Root Accuracy \n",
"mesolitica/finetune-dependency-t5-tiny-standard... 0.92099 \n",
"mesolitica/finetune-dependency-t5-small-standar... 0.92099 \n",
"mesolitica/finetune-dependency-t5-base-standard... 0.92099 "
"mesolitica/finetune-dependency-t5-tiny-standard... 0.872302 \n",
"mesolitica/finetune-dependency-t5-small-standar... 0.866906 \n",
"mesolitica/finetune-dependency-t5-base-standard... 0.859712 "
]
},
"execution_count": 5,
Expand Down
Loading

0 comments on commit 3c056cb

Please sign in to comment.