diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..dd9a124 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + remote = minio +['remote "minio"'] + url = s3://ml-data + endpointurl = http://10.0.0.6:9000 diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/Dockerfile b/Dockerfile index 52b02c0..18925d2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,5 +33,6 @@ RUN chmod 777 /.config CMD exec seldon-core-microservice $MODEL_NAME --service-type $SERVICE_TYPE + FROM builder AS app-kserve ENTRYPOINT ["python", "app/src/serving/kserve.py"] diff --git a/README.md b/README.md index 1d60926..7963f81 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ DH Images: Works on push to master/feature* ![Alt text](assets/actions.png) + ### Streamlit Run: @@ -37,6 +38,13 @@ kubectl create -f deployment/app-ui.yml kubectl port-forward --address 0.0.0.0 svc/app-ui.yml 8080:8080 ``` +Deploy k8s: +``` +kubectl create -f deployment/app-ui.yml +kubectl port-forward --address 0.0.0.0 svc/app-ui.yml 8080:8080 +``` + + ### Fast API Postman @@ -44,6 +52,7 @@ Postman ![Alt text](assets/fastapi.png) + Deploy k8s: ``` kubectl create -f deployment/app-fasttext.yml @@ -78,10 +87,89 @@ kubectl create -f deployment/kserve.yaml kubectl get inferenceservice custom-model ``` + ### Load testing ![Alt text](assets/locust.png) ``` locust -f benchmarks/load_test.py --host=http://localhost:9933 --users 50 --spawn-rate 10 --autostart --run-time 600s +``` +### DVC + +Install DVC + +``` +brew install dvc +``` + +Init in repo + +``` +dvc init --subdir +git status +git commit -m "init 
DVC" +``` + +Move file with data and add to DVC, commit DVC data config + +``` +dvc add ./data/data.csv +git add data/.gitignore data/data.csv.dvc +git commit -m "create data" +``` + +Add remote data storage and push DVC remote config +(ensure that the bucket is already created) + +``` +dvc remote add -d minio s3://ml-data +dvc remote modify minio endpointurl http://10.0.0.6:9000 + +git add .dvc/config +git commit -m "configure remote" +git push +``` + +Upload data +``` +export AWS_ACCESS_KEY_ID='...' +export AWS_SECRET_ACCESS_KEY='...' +dvc push +``` + +### Label studio + +``` +docker pull heartexlabs/label-studio:latest +docker run -it -p 8080:8080 -v `pwd`/mydata:/label-studio/data heartexlabs/label-studio:latest +``` + +![Alt text](assets/labeling.png) + + +### Minio setup +Mac/Local +``` +brew install minio/stable/minio + +minio server --console-address :9001 ~/minio # path to persistent local storage + run on custom port +``` + +Docker + +``` +docker run \ + -p 9002:9002 \ + --name minio \ + -v ~/minio:/data \ + -e "MINIO_ROOT_USER=ROOTNAME" \ + -e "MINIO_ROOT_PASSWORD=CHANGEME123" \ + quay.io/minio/minio server /data --console-address ":9002" +``` + +Kubernetes + +``` +kubectl create -f deployment/minio.yml ``` diff --git a/app/requirements-dev.txt b/app/requirements-dev.txt index 16cdb7a..2824f34 100644 --- a/app/requirements-dev.txt +++ b/app/requirements-dev.txt @@ -9,3 +9,4 @@ datasets==2.16.1 wandb==0.16.1 httpx==0.23.0 locust==2.20.1 +ipykernel==6.28.0 diff --git a/assets/labeling.png b/assets/labeling.png new file mode 100644 index 0000000..294c9a0 Binary files /dev/null and b/assets/labeling.png differ diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..3e63a77 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1 @@ +/data.csv diff --git a/data/data.csv.dvc b/data/data.csv.dvc new file mode 100644 index 0000000..41d07a2 --- /dev/null +++ b/data/data.csv.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 
7ec83b215d1790bedaf458a1690370e3 + size: 25144581 + hash: md5 + path: data.csv diff --git a/deployment/minio.yml b/deployment/minio.yml new file mode 100644 index 0000000..7de1ec6 --- /dev/null +++ b/deployment/minio.yml @@ -0,0 +1,38 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: minio-deployment +spec: + selector: + matchLabels: + app: minio + strategy: + type: Recreate + template: + metadata: + labels: + # Label is used as selector in the service. + app: minio + spec: + volumes: + - name: storage + persistentVolumeClaim: + claimName: minio-pv-claim + containers: + - name: minio + image: quay.io/minio/minio:latest + args: + - server + - /storage + env: + # Minio access key and secret key + - name: MINIO_ACCESS_KEY + value: "minio" + - name: MINIO_SECRET_KEY + value: "minio123" + ports: + - containerPort: 9003 + hostPort: 9003 + volumeMounts: + - name: storage + mountPath: "/storage" diff --git a/experiments/train.ipynb b/experiments/train.ipynb new file mode 100644 index 0000000..45cb365 --- /dev/null +++ b/experiments/train.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"code","execution_count":58,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:38:24.348418Z","iopub.status.busy":"2024-01-08T00:38:24.347727Z","iopub.status.idle":"2024-01-08T00:38:24.354720Z","shell.execute_reply":"2024-01-08T00:38:24.353625Z","shell.execute_reply.started":"2024-01-08T00:38:24.348385Z"},"trusted":true},"outputs":[],"source":["from datasets import load_dataset\n","from transformers import (\n"," BertForSequenceClassification,\n"," BertTokenizer,\n"," TrainingArguments,\n"," Trainer,\n"," pipeline\n",")\n","from functools import partial\n","import numpy as np\n","import evaluate\n","import wandb"]},{"cell_type":"markdown","metadata":{},"source":["## Load 
dataset"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2024-01-07T23:33:19.657359Z","iopub.status.busy":"2024-01-07T23:33:19.657062Z","iopub.status.idle":"2024-01-07T23:33:23.931368Z","shell.execute_reply":"2024-01-07T23:33:23.930520Z","shell.execute_reply.started":"2024-01-07T23:33:19.657337Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"335e4af20627429f823b7763046054fc","version_major":2,"version_minor":0},"text/plain":["Downloading: 0%| | 0.00/1.03k [00:00"]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["Run data is saved locally in /kaggle/working/wandb/run-20240107_235614-88rzow23"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":["Syncing run graceful-breeze-1 to Weights & Biases (docs)
"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":[" View project at https://wandb.ai/yurii-havrylko/huggingface"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"data":{"text/html":[" View run at https://wandb.ai/yurii-havrylko/huggingface/runs/88rzow23"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n"]},{"data":{"text/html":["\n","
\n"," \n"," \n"," [1500/1500 25:15, Epoch 3/3]\n","
\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
EpochTraining LossValidation LossAccuracy
10.0740000.0277870.986500
20.0326000.0109200.995000
30.0101000.0027390.999500

"],"text/plain":[""]},"metadata":{},"output_type":"display_data"},{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n","/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n"," warnings.warn('Was asked to gather along dimension 0, but all '\n"]},{"data":{"text/plain":["TrainOutput(global_step=1500, training_loss=0.038901503562927243, metrics={'train_runtime': 1572.7041, 'train_samples_per_second': 15.26, 'train_steps_per_second': 0.954, 'total_flos': 6314665328640000.0, 'train_loss': 0.038901503562927243, 'epoch': 
3.0})"]},"execution_count":32,"metadata":{},"output_type":"execute_result"}],"source":["trainer.train()"]},{"cell_type":"code","execution_count":42,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:27:04.256277Z","iopub.status.busy":"2024-01-08T00:27:04.255502Z","iopub.status.idle":"2024-01-08T00:27:04.261328Z","shell.execute_reply":"2024-01-08T00:27:04.260224Z","shell.execute_reply.started":"2024-01-08T00:27:04.256246Z"},"trusted":true},"outputs":[],"source":["PATH = \"working/checkpoints\""]},{"cell_type":"code","execution_count":49,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:34:03.559698Z","iopub.status.busy":"2024-01-08T00:34:03.559317Z","iopub.status.idle":"2024-01-08T00:34:04.650001Z","shell.execute_reply":"2024-01-08T00:34:04.648660Z","shell.execute_reply.started":"2024-01-08T00:34:03.559669Z"},"trusted":true},"outputs":[{"data":{"text/plain":["('/kaggle/working/checkpoints/tokenizer_config.json',\n"," '/kaggle/working/checkpoints/special_tokens_map.json',\n"," '/kaggle/working/checkpoints/vocab.txt',\n"," '/kaggle/working/checkpoints/added_tokens.json')"]},"execution_count":49,"metadata":{},"output_type":"execute_result"}],"source":["trainer.save_model(PATH)\n","bert_tokenizer.save_pretrained(PATH)"]},{"cell_type":"markdown","metadata":{},"source":["## Usage"]},{"cell_type":"code","execution_count":52,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:34:22.825288Z","iopub.status.busy":"2024-01-08T00:34:22.824467Z","iopub.status.idle":"2024-01-08T00:34:23.163749Z","shell.execute_reply":"2024-01-08T00:34:23.162693Z","shell.execute_reply.started":"2024-01-08T00:34:22.825254Z"},"trusted":true},"outputs":[],"source":["tokenizer = BertTokenizer.from_pretrained(PATH, local_files_only=True)\n","model = BertForSequenceClassification.from_pretrained(PATH, 
local_files_only=True)\n"]},{"cell_type":"code","execution_count":53,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:36:16.296584Z","iopub.status.busy":"2024-01-08T00:36:16.295914Z","iopub.status.idle":"2024-01-08T00:36:16.303129Z","shell.execute_reply":"2024-01-08T00:36:16.301783Z","shell.execute_reply.started":"2024-01-08T00:36:16.296552Z"},"trusted":true},"outputs":[],"source":["text = \"\"\"\n","Liverpool struck twice late on to beat Arsenal at Emirates Stadium and reach the FA Cup fourth round.\n","\n","Arsenal paid the price for missing a host of opportunities and were punished as Liverpool grew increasingly dangerous, the deadlock broken when Trent Alexander-Arnold's free-kick glanced in off Jakub Kiwior's head with seven minutes left.\n","\n","Liverpool's triumph was completed in the closing seconds as a lethal break ended with Luis Diaz firing an emphatic finish high past Arsenal goalkeeper Aaron Ramsdale.\n","\n","Mikel Arteta's side dominated the first half, with Martin Odegaard hitting the bar while Reiss Nelson and Kai Havertz also had chances to give Arsenal reward for their pressure.\n","\n","Liverpool, despite missing captain Virgil van Dijk through illness and with Mohamed Salah at the Africa Cup of Nations, held firm and were always a threat. 
Alexander-Arnold hit the bar in the first half and as they grew into the game, Ramsdale saved well from Diaz and Diogo Jota headed against the woodwork.\n","\"\"\""]},{"cell_type":"code","execution_count":59,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:39:01.210310Z","iopub.status.busy":"2024-01-08T00:39:01.209908Z","iopub.status.idle":"2024-01-08T00:39:01.695811Z","shell.execute_reply":"2024-01-08T00:39:01.694120Z","shell.execute_reply.started":"2024-01-08T00:39:01.210282Z"},"trusted":true},"outputs":[{"name":"stderr","output_type":"stream","text":["/opt/conda/lib/python3.10/site-packages/transformers/pipelines/text_classification.py:105: UserWarning: `return_all_scores` is now deprecated, if want a similar functionality use `top_k=None` instead of `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.\n"," warnings.warn(\n"]}],"source":["text_classification_pipeline = pipeline(\n"," \"text-classification\",\n"," model=PATH,\n"," tokenizer=PATH,\n"," return_all_scores=True,\n"," device=0,\n",")"]},{"cell_type":"code","execution_count":60,"metadata":{"execution":{"iopub.execute_input":"2024-01-08T00:39:26.256297Z","iopub.status.busy":"2024-01-08T00:39:26.255624Z","iopub.status.idle":"2024-01-08T00:39:26.314708Z","shell.execute_reply":"2024-01-08T00:39:26.313773Z","shell.execute_reply.started":"2024-01-08T00:39:26.256264Z"},"trusted":true},"outputs":[{"data":{"text/plain":["[[{'label': 'LABEL_0', 'score': 0.9968350529670715},\n"," {'label': 'LABEL_1', 'score': 0.0031650131568312645}]]"]},"execution_count":60,"metadata":{},"output_type":"execute_result"}],"source":[" text_classification_pipeline(text)"]}],"metadata":{"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"dockerImageVersionId":30627,"isGpuEnabled":true,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 
3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.11"}},"nbformat":4,"nbformat_minor":4} diff --git a/modelcard.md b/modelcard.md new file mode 100644 index 0000000..586a8d0 --- /dev/null +++ b/modelcard.md @@ -0,0 +1,63 @@ +--- +language: en +tags: +- bert +license: apache-2.0 +datasets: +- GonzaloA/fake_news +--- + +# BERT fake news classification model + +An English-language model based on the uncased version of BERT, fine-tuned for the task of binary classification. + + +### How to use + +You can load the model and tokenizer directly for sequence classification: + +```python + +tokenizer = BertTokenizer.from_pretrained(PATH, local_files_only=True) +bert_model = BertForSequenceClassification.from_pretrained(PATH, local_files_only=True) + +# run inference + +``` +With the transformers pipeline: + +```python + +text_classification_pipeline = pipeline( + "text-classification", + model=PATH, + tokenizer=PATH, + return_all_scores=True +) +``` + + +## Training data + +The model was fine-tuned from [bert-base-uncased](https://huggingface.co/bert-base-uncased) on the GonzaloA/fake_news dataset, which consists of ~25,000 news articles labeled as fake or real. +For training purposes, 10k samples were randomly selected and split in an 80:20 ratio. + +## Training procedure + +### Preprocessing + +The texts are tokenized using the BERT tokenizer. + +### Training + +The model was trained on 2 x T4 GPUs. + +## Evaluation results + + +| Epoch | Training Loss | Validation Loss | Accuracy | +|-------|---------------|-----------------|----------| +| 1 | 0.074000 | 0.027787 | 0.986500 | +| 2 | 0.032600 | 0.010920 | 0.995000 | +| 3 | 0.010100 | 0.002739 | 0.999500 | +