diff --git a/mist/data_modules/__init__.py b/mist/data_modules/__init__.py index e69de29..e8cb088 100644 --- a/mist/data_modules/__init__.py +++ b/mist/data_modules/__init__.py @@ -0,0 +1,2 @@ +from .property_prediction_dataset import PropertyPredictionDataModule +from .roberta_dataset import RobertaDataSet diff --git a/mist/models/__init__.py b/mist/models/__init__.py index e69de29..8a504b4 100644 --- a/mist/models/__init__.py +++ b/mist/models/__init__.py @@ -0,0 +1,2 @@ +from .lm_finetuning import LMFinetuning +from .roberta_base import RoBERTa diff --git a/notebooks/PretrainingMIST.ipynb b/notebooks/PretrainingMIST.ipynb index 80e910f..2ad8acd 100644 --- a/notebooks/PretrainingMIST.ipynb +++ b/notebooks/PretrainingMIST.ipynb @@ -2,19 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-07-16 16:22:17.710192: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-07-16 16:22:19.153266: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "\n", @@ -49,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -76,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -110,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -142,282 +132,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using 16bit Automatic Mixed Precision (AMP)\n", - "GPU available: True (cuda), used: True\n", - "TPU available: False, using: 0 TPU cores\n", - "HPU available: False, using: 0 HPUs\n", - "/soft/applications/conda/2024-04-29/mconda3/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n", - " self.pid = os.fork()\n", - "You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n", - "/soft/applications/conda/2024-04-29/mconda3/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n", - " self.pid = os.fork()\n", - "You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. 
For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n", - "Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4\n", - "You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n", - "You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n", - "Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4\n", - "Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4\n", - "Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4\n", - "----------------------------------------------------------------------------------------------------\n", - "distributed_backend=nccl\n", - "All distributed processes registered. Starting with 4 processes\n", - "----------------------------------------------------------------------------------------------------\n", - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bfd606f75f1a483f88a82e60711ce24b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Resolving data files: 0%| | 0/1024 [00:00C[13C-]S[Se@@]=O)(ONCC1=CN(CCOCOCCNC(=O)CN2N=C3OCN3(=OC2=O)[n-]=1\n", - "Labels ['C', '(', '=', ')', 'C', 'C', 'C', 'C', ')', 'N', 'N']\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'RoBERTa' object has no attribute 'model'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 7\u001b[0m\n\u001b[1;32m 5\u001b[0m labels \u001b[38;5;241m=\u001b[39m sample[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlabels\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mflatten()[mask]\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLabels\u001b[39m\u001b[38;5;124m\"\u001b[39m, datamodule\u001b[38;5;241m.\u001b[39mtokenizer\u001b[38;5;241m.\u001b[39mconvert_ids_to_tokens(labels))\n\u001b[0;32m----> 7\u001b[0m pred \u001b[38;5;241m=\u001b[39m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m(\n\u001b[1;32m 8\u001b[0m input_ids\u001b[38;5;241m=\u001b[39msample[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 9\u001b[0m attention_mask \u001b[38;5;241m=\u001b[39m sample[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 11\u001b[0m pred \u001b[38;5;241m=\u001b[39m pred\u001b[38;5;241m.\u001b[39mlogits[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39margmax(axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)[mask]\n\u001b[1;32m 12\u001b[0m pred \u001b[38;5;241m=\u001b[39m 
datamodule\u001b[38;5;241m.\u001b[39mtokenizer\u001b[38;5;241m.\u001b[39mconvert_ids_to_tokens(pred)\n", - "File \u001b[0;32m~/mist/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1709\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1707\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1708\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1709\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mAttributeError\u001b[0m: 'RoBERTa' object has no attribute 'model'" - ] - } - ], + "outputs": [], "source": [ "datamodule.setup(stage=\"test\")\n", "for step, sample in enumerate(datamodule.val_dataloader()):\n", diff --git a/notebooks/UnderstandingMIST.ipynb b/notebooks/UnderstandingMIST.ipynb new file mode 100644 index 0000000..4802cb3 --- /dev/null +++ b/notebooks/UnderstandingMIST.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/abhutani/electrolyte_fm/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "import torch\n", + "import pytorch_lightning as pl\n", + "\n", + "from electrolyte_fm.models import RoBERTa\n", + "from electrolyte_fm.data_modules import RobertaDataSet\n", + "from electrolyte_fm.utils.lr_schedule import RelativeCosineWarmup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"true\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attention Visualization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Embedding Visualization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mist_kernel", + "language": "python", + "name": "mist_kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test_ckpt.py b/test/test_ckpt.py new file mode 100644 index 0000000..84dbcb7 --- /dev/null +++ b/test/test_ckpt.py @@ -0,0 +1,98 @@ +import json +from unittest import mock + +import pytest +import torch +from pytorch_lightning import LightningDataModule, LightningModule +from pytorch_lightning.cli import LightningArgumentParser, LightningCLI +from torch.utils.data import DataLoader + +from electrolyte_fm.utils.ckpt import SaveConfigWithCkpts + + +class MockedModel(LightningModule): + def __init__(self, vocab_size: int, linked: int): + self.save_hyperparameters() + super().__init__() + + def forward(self, input): + return input + + def training_step(self, batch, batch_idx): + self.forward(batch) # Mock calling forward + return torch.zeros(1, requires_grad=True) + + def configure_optimizers(self): + pass + + +class MockedData(LightningDataModule): + def __init__(self, tokenizer: str, linked: int): + self.tokenizer = tokenizer + self.save_hyperparameters() + super().__init__() + + def train_dataloader(self): + return DataLoader(range(5), batch_size=1) + + +@pytest.fixture() +def cli(tmp_path): + + with mock.patch( + "sys.argv", + [ + "any.py", + "--data.tokenizer=smirk", + "--data.linked=10", + "--model.vocab_size=256", + ], + ): + parser = LightningArgumentParser() + parser.add_class_arguments(MockedModel, "model") + parser.add_class_arguments(MockedData, "data") + parser.link_arguments("data.linked", "model.linked", apply_on="parse") + parsed_args = dict(parser.parse_args()) + args_ = [ + "fit", + ] + args_.extend(["--" + k + "=" + str(v) for k, v in parsed_args.items()]) + + _cli = LightningCLI( + trainer_defaults={ + "max_steps": 2, + "default_root_dir": tmp_path, + }, + model_class=MockedModel, + datamodule_class=MockedData, + save_config_callback=SaveConfigWithCkpts, + args=args_, + ) + return _cli + + +def test_ckpt(cli): + # Locate callback + cb = list( + filter(lambda cb: isinstance(cb, SaveConfigWithCkpts), cli.trainer.callbacks) + ) + assert len(cb) == 1 + cb: SaveConfigWithCkpts = cb[0] + + assert cb.config_path is not None + assert cb.config_path.is_dir() + assert 
cb.config_path.joinpath("config.json").is_file() + assert cb.config_path.joinpath("model_hparams.json").is_file() + + # Check that the dataloader config is saved + data_config = {"linked": 10, "tokenizer": "smirk"} + assert dict(cb.config["data"]) == data_config + with open(cb.config_path.joinpath("config.json"), "r") as fid: + assert json.load(fid)["data"] == data_config + + # Check that the model config is saved + with open(cb.config_path.joinpath("model_hparams.json"), "r") as fid: + model_config = json.load(fid) + assert model_config["class_path"] == __name__ + ".MockedModel" + assert model_config["init_args"] == {"linked": 10, "vocab_size": 256} + assert "version" in model_config.keys() diff --git a/test/test_lr.py b/test/test_lr.py new file mode 100644 index 0000000..ffdf247 --- /dev/null +++ b/test/test_lr.py @@ -0,0 +1,28 @@ +from mist.utils.lr_schedule import _get_cosine_relative_decay_with_warmup + + +def test_cosine_rel(): + assert 0 == _get_cosine_relative_decay_with_warmup( + 0, + num_training_steps=100, + num_warmup_steps=10, + rel_decay=0.25, + ) + assert 1.0 == _get_cosine_relative_decay_with_warmup( + 10, + num_training_steps=100, + num_warmup_steps=10, + rel_decay=0.25, + ) + assert 0.25 == _get_cosine_relative_decay_with_warmup( + 100, + num_training_steps=100, + num_warmup_steps=10, + rel_decay=0.25, + ) + assert (0.25 + 0.75 / 2) == _get_cosine_relative_decay_with_warmup( + 55, + num_training_steps=100, + num_warmup_steps=10, + rel_decay=0.25, + )
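
Note: for reference, below is a minimal sketch of a learning-rate multiplier consistent with the values asserted in test_lr.py above (linear warmup to 1.0, then cosine decay down to rel_decay). The actual implementation lives in mist/utils/lr_schedule.py and is not part of this diff, so the signature details and body here are assumptions for illustration only, not the repository's code.

import math

def _get_cosine_relative_decay_with_warmup(
    step: int,
    *,
    num_training_steps: int,
    num_warmup_steps: int,
    rel_decay: float,
) -> float:
    # Illustrative sketch only -- not the repository implementation.
    # Linear warmup: 0.0 at step 0, reaching 1.0 at num_warmup_steps.
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    # Cosine decay from 1.0 down to rel_decay over the remaining steps.
    progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return rel_decay + (1.0 - rel_decay) * cosine

Under these assumptions, step 55 sits halfway through the decay window (steps 10 to 100), so the multiplier is 0.25 + 0.75 / 2 = 0.625, matching the final assertion in test_cosine_rel.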