diff --git a/app/requirements-dev.txt b/app/requirements-dev.txt new file mode 100644 index 0000000..a2c36c9 --- /dev/null +++ b/app/requirements-dev.txt @@ -0,0 +1,3 @@ +ipykernel==6.28.0 +wandb==0.16.1 +evaluate==0.4.1 diff --git a/app/requirements.txt b/app/requirements.txt index 67d6f4c..3ad1e0c 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1,2 +1,3 @@ flask==3.0.0 gunicorn==21.2.0 +transformers==4.36.2 diff --git a/experiments/train.ipynb b/experiments/train.ipynb new file mode 100644 index 0000000..45cb365 --- /dev/null +++ b/experiments/train.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"code","execution_count":58,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:38:24.348418Z","iopub.status.busy":"2024-01-08T00:38:24.347727Z","iopub.status.idle":"2024-01-08T00:38:24.354720Z","shell.execute_reply":"2024-01-08T00:38:24.353625Z","shell.execute_reply.started":"2024-01-08T00:38:24.348385Z"},"trusted":true},"outputs":[],"source":["from datasets import load_dataset\n","from transformers import (\n"," BertForSequenceClassification,\n"," BertTokenizer,\n"," TrainingArguments,\n"," Trainer,\n"," pipeline\n",")\n","from functools import partial\n","import numpy as np\n","import evaluate\n","import wandb"]},{"cell_type":"markdown","metadata":{},"source":["## Load dataset"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-01-07T23:33:19.657359Z","iopub.status.busy":"2024-01-07T23:33:19.657062Z","iopub.status.idle":"2024-01-07T23:33:23.931368Z","shell.execute_reply":"2024-01-07T23:33:23.930520Z","shell.execute_reply.started":"2024-01-07T23:33:19.657337Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"335e4af20627429f823b7763046054fc","version_major":2,"version_minor":0},"text/plain":["Downloading: 0%| | 0.00/1.03k [00:00"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["Run data is saved locally in /kaggle/working/wandb/run-20240107_235614-88rzow23"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["Syncing run graceful-breeze-1 to Weights & Biases (docs)
"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":[" View project at https://wandb.ai/yurii-havrylko/huggingface"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":[" View run at https://wandb.ai/yurii-havrylko/huggingface/runs/88rzow23"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n"]},{"data":{"text/html":["\n","
\n"," \n"," \n"," [1500/1500 25:15, Epoch 3/3]\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
EpochTraining LossValidation LossAccuracy
10.0740000.0277870.986500
20.0326000.0109200.995000
30.0101000.0027390.999500

"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n"]},{"data":{"text/plain":["TrainOutput(global_step=1500, training_loss=0.038901503562927243, metrics={'train_runtime': 1572.7041, 'train_samples_per_second': 15.26, 'train_steps_per_second': 0.954, 'total_flos': 6314665328640000.0, 'train_loss': 0.038901503562927243, 'epoch': 3.0})"]},"execution_count":32,"metadata":{},"output_type":"execute_result"}],"source":["trainer.train()"]},{"cell_type":"code","execution_count":42,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:27:04.256277Z","iopub.status.busy":"2024-01-08T00:27:04.255502Z","iopub.status.idle":"2024-01-08T00:27:04.261328Z","shell.execute_reply":"2024-01-08T00:27:04.260224Z","shell.execute_reply.started":"2024-01-08T00:27:04.256246Z"},"trusted":true},"outputs":[],"source":["PATH = \"working/checkpoints\""]},{"cell_type":"code","execution_count":49,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:34:03.559698Z","iopub.status.busy":"2024-01-08T00:34:03.559317Z","iopub.status.idle":"2024-01-08T00:34:04.650001Z","shell.execute_reply":"2024-01-08T00:34:04.648660Z","shell.execute_reply.started":"2024-01-08T00:34:03.559669Z"},"trusted":true},"outputs":[{"data":{"text/plain":["('/kaggle/working/checkpoints/tokenizer_config.json',\n"," '/kaggle/working/checkpoints/special_tokens_map.json',\n"," '/kaggle/working/checkpoints/vocab.txt',\n"," '/kaggle/working/checkpoints/added_tokens.json')"]},"execution_count":49,"metadata":{},"output_type":"execute_result"}],"source":["trainer.save_model(PATH)\n","bert_tokenizer.save_pretrained(PATH)"]},{"cell_type":"markdown","metadata":{},"source":["## Usage"]},{"cell_type":"code","execution_count":52,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:34:22.825288Z","iopub.status.busy":"2024-01-08T00:34:22.824467Z","iopub.status.idle":"2024-01-08T00:34:23.163749Z","shell.execute_reply":"2024-01-08T00:34:23.162693Z","shell.execute_reply.started":"2024-01-08T00:34:22.825254Z"},"trusted":true},"outputs":[],"source":["tokenizer = BertTokenizer.from_pretrained(PATH, local_files_only=True)\n","model = BertForSequenceClassification.from_pretrained(PATH, local_files_only=True)\n"]},{"cell_type":"code","execution_count":53,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:36:16.296584Z","iopub.status.busy":"2024-01-08T00:36:16.295914Z","iopub.status.idle":"2024-01-08T00:36:16.303129Z","shell.execute_reply":"2024-01-08T00:36:16.301783Z","shell.execute_reply.started":"2024-01-08T00:36:16.296552Z"},"trusted":true},"outputs":[],"source":["text = \"\"\"\n","Liverpool struck twice late on to beat Arsenal at Emirates Stadium and reach the FA Cup fourth round.\n","\n","Arsenal paid the price for missing a host of opportunities and were punished as Liverpool grew increasingly dangerous, the deadlock broken when Trent Alexander-Arnold's free-kick glanced in off Jakub Kiwior's head with seven minutes left.\n","\n","Liverpool's triumph was completed in the closing seconds as a lethal break ended with Luis Diaz firing an emphatic finish high past Arsenal goalkeeper Aaron Ramsdale.\n","\n","Mikel Arteta's side dominated the first half, with Martin Odegaard hitting the bar while Reiss Nelson and Kai Havertz also had chances to give Arsenal reward for their pressure.\n","\n","Liverpool, despite missing captain Virgil van Dijk through illness and with Mohamed Salah at the Africa Cup of Nations, held firm and were always a threat. Alexander-Arnold hit the bar in the first half and as they grew into the game, Ramsdale saved well from Diaz and Diogo Jota headed against the woodwork.\n","\"\"\""]},{"cell_type":"code","execution_count":59,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:39:01.210310Z","iopub.status.busy":"2024-01-08T00:39:01.209908Z","iopub.status.idle":"2024-01-08T00:39:01.695811Z","shell.execute_reply":"2024-01-08T00:39:01.694120Z","shell.execute_reply.started":"2024-01-08T00:39:01.210282Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/transformers/pipelines/text_classification.py:105: UserWarning: `return_all_scores` is now deprecated, if want a similar functionality use `top_k=None` instead of `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.\n"," warnings.warn(\n"]}],"source":["text_classification_pipeline = pipeline(\n"," \"text-classification\",\n"," model=PATH,\n"," tokenizer=PATH,\n"," return_all_scores=True,\n"," device=0,\n",")"]},{"cell_type":"code","execution_count":60,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:39:26.256297Z","iopub.status.busy":"2024-01-08T00:39:26.255624Z","iopub.status.idle":"2024-01-08T00:39:26.314708Z","shell.execute_reply":"2024-01-08T00:39:26.313773Z","shell.execute_reply.started":"2024-01-08T00:39:26.256264Z"},"trusted":true},"outputs":[{"data":{"text/plain":["[[{'label': 'LABEL_0', 'score': 0.9968350529670715},\n"," {'label': 'LABEL_1', 'score': 0.0031650131568312645}]]"]},"execution_count":60,"metadata":{},"output_type":"execute_result"}],"source":[" text_classification_pipeline(text)"]}],"metadata":{"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30627,"isGpuEnabled":true,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.11"}},"nbformat":4,"nbformat_minor":4}