diff --git a/mist/data_modules/__init__.py b/mist/data_modules/__init__.py index e69de29..e8cb088 100644 --- a/mist/data_modules/__init__.py +++ b/mist/data_modules/__init__.py @@ -0,0 +1,2 @@ +from .property_prediction_dataset import PropertyPredictionDataModule +from .roberta_dataset import RobertaDataSet diff --git a/mist/models/__init__.py b/mist/models/__init__.py index e69de29..8a504b4 100644 --- a/mist/models/__init__.py +++ b/mist/models/__init__.py @@ -0,0 +1,2 @@ +from .lm_finetuning import LMFinetuning +from .roberta_base import RoBERTa diff --git a/notebooks/PretrainingMIST.ipynb b/notebooks/PretrainingMIST.ipynb index 80e910f..2ad8acd 100644 --- a/notebooks/PretrainingMIST.ipynb +++ b/notebooks/PretrainingMIST.ipynb @@ -2,19 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-07-16 16:22:17.710192: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2024-07-16 16:22:19.153266: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "\n", @@ -49,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -76,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -110,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -142,282 +132,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using 16bit Automatic Mixed Precision (AMP)\n", - "GPU available: True (cuda), used: True\n", - "TPU available: False, using: 0 TPU cores\n", - "HPU available: False, using: 0 HPUs\n", - "/soft/applications/conda/2024-04-29/mconda3/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n", - " self.pid = os.fork()\n", - "You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n", - "/soft/applications/conda/2024-04-29/mconda3/lib/python3.11/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n", - " self.pid = os.fork()\n", - "You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. 
For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n", - "Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4\n", - "You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n", - "You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision\n", - "Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4\n", - "Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4\n", - "Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4\n", - "----------------------------------------------------------------------------------------------------\n", - "distributed_backend=nccl\n", - "All distributed processes registered. Starting with 4 processes\n", - "----------------------------------------------------------------------------------------------------\n", - "\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bfd606f75f1a483f88a82e60711ce24b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Resolving data files: 0%| | 0/1024 [00:00C[13C-]S[Se@@]=O)(ONCC1=CN(CCOCOCCNC(=O)CN2N=C3OCN3(=OC2=O)[n-]=1\n", - "Labels ['C', '(', '=', ')', 'C', 'C', 'C', 'C', ')', 'N', 'N']\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'RoBERTa' object has no attribute 'model'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 7\u001b[0m\n\u001b[1;32m 5\u001b[0m labels \u001b[38;5;241m=\u001b[39m sample[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlabels\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mflatten()[mask]\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLabels\u001b[39m\u001b[38;5;124m\"\u001b[39m, datamodule\u001b[38;5;241m.\u001b[39mtokenizer\u001b[38;5;241m.\u001b[39mconvert_ids_to_tokens(labels))\n\u001b[0;32m----> 7\u001b[0m pred \u001b[38;5;241m=\u001b[39m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m(\n\u001b[1;32m 8\u001b[0m input_ids\u001b[38;5;241m=\u001b[39msample[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 9\u001b[0m attention_mask \u001b[38;5;241m=\u001b[39m sample[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 11\u001b[0m pred \u001b[38;5;241m=\u001b[39m pred\u001b[38;5;241m.\u001b[39mlogits[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39margmax(axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)[mask]\n\u001b[1;32m 12\u001b[0m pred \u001b[38;5;241m=\u001b[39m 
datamodule\u001b[38;5;241m.\u001b[39mtokenizer\u001b[38;5;241m.\u001b[39mconvert_ids_to_tokens(pred)\n", - "File \u001b[0;32m~/mist/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1709\u001b[0m, in \u001b[0;36mModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 1707\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m modules:\n\u001b[1;32m 1708\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m modules[name]\n\u001b[0;32m-> 1709\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mAttributeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m object has no attribute \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "\u001b[0;31mAttributeError\u001b[0m: 'RoBERTa' object has no attribute 'model'" - ] - } - ], + "outputs": [], "source": [ "datamodule.setup(stage=\"test\")\n", "for step, sample in enumerate(datamodule.val_dataloader()):\n", diff --git a/notebooks/UnderstandingMIST.ipynb b/notebooks/UnderstandingMIST.ipynb new file mode 100644 index 0000000..4802cb3 --- /dev/null +++ b/notebooks/UnderstandingMIST.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/abhutani/electrolyte_fm/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "import torch\n", + "import pytorch_lightning as pl\n", + "\n", + "from electrolyte_fm.models import RoBERTa\n", + "from electrolyte_fm.data_modules import RobertaDataSet\n", + "from electrolyte_fm.utils.lr_schedule import RelativeCosineWarmup" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"true\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attention Visualization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Embedding Visualization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mist_kernel", + "language": "python", + "name": "mist_kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test_ckpt.py b/test/test_ckpt.py new file mode 100644 index 0000000..84dbcb7 --- /dev/null +++ b/test/test_ckpt.py @@ -0,0 +1,98 @@ +import json +from unittest import mock + +import pytest +import torch +from pytorch_lightning import LightningDataModule, LightningModule +from pytorch_lightning.cli import LightningArgumentParser, LightningCLI +from torch.utils.data import DataLoader + +from electrolyte_fm.utils.ckpt import SaveConfigWithCkpts + + +class MockedModel(LightningModule): + def __init__(self, vocab_size: int, linked: int): + self.save_hyperparameters() + super().__init__() + + def forward(self, input): + return input + + def training_step(self, batch, batch_idx): + self.forward(batch) # Mock calling forward + return torch.zeros(1, requires_grad=True) + + def configure_optimizers(self): + pass + + +class MockedData(LightningDataModule): + def __init__(self, tokenizer: str, linked: int): + self.tokenizer = tokenizer + self.save_hyperparameters() + super().__init__() + + def train_dataloader(self): + return DataLoader(range(5), batch_size=1) + + +@pytest.fixture() +def cli(tmp_path): + + with mock.patch( + "sys.argv", + [ + "any.py", + "--data.tokenizer=smirk", + "--data.linked=10", + "--model.vocab_size=256", + ], + ): + parser = LightningArgumentParser() + parser.add_class_arguments(MockedModel, "model") + parser.add_class_arguments(MockedData, "data") + parser.link_arguments("data.linked", "model.linked", apply_on="parse") + parsed_args = dict(parser.parse_args()) + args_ = [ + "fit", + ] + args_.extend(["--" + k + "=" + str(v) for k, v in parsed_args.items()]) + + _cli = LightningCLI( + trainer_defaults={ + "max_steps": 2, + "default_root_dir": tmp_path, + }, + model_class=MockedModel, + datamodule_class=MockedData, + save_config_callback=SaveConfigWithCkpts, + args=args_, + ) + return _cli + + +def test_ckpt(cli): + # Locate callback + cb = list( + filter(lambda cb: isinstance(cb, SaveConfigWithCkpts), cli.trainer.callbacks) + ) + assert len(cb) == 1 + cb: SaveConfigWithCkpts = cb[0] + + assert cb.config_path is not None + assert cb.config_path.is_dir() + assert 
cb.config_path.joinpath("config.json").is_file() + assert cb.config_path.joinpath("model_hparams.json").is_file() + + # Check that the dataloader config is saved + data_config = {"linked": 10, "tokenizer": "smirk"} + assert dict(cb.config["data"]) == data_config + with open(cb.config_path.joinpath("config.json"), "r") as fid: + assert json.load(fid)["data"] == data_config + + # Check that the model config is saved + with open(cb.config_path.joinpath("model_hparams.json"), "r") as fid: + model_config = json.load(fid) + assert model_config["class_path"] == __name__ + ".MockedModel" + assert model_config["init_args"] == {"linked": 10, "vocab_size": 256} + assert "version" in model_config.keys() diff --git a/test/test_lr.py b/test/test_lr.py new file mode 100644 index 0000000..ffdf247 --- /dev/null +++ b/test/test_lr.py @@ -0,0 +1,28 @@ +from mist.utils.lr_schedule import _get_cosine_relative_decay_with_warmup + + +def test_cosine_rel(): + assert 0 == _get_cosine_relative_decay_with_warmup( + 0, + num_training_steps=100, + num_warmup_steps=10, + rel_decay=0.25, + ) + assert 1.0 == _get_cosine_relative_decay_with_warmup( + 10, + num_training_steps=100, + num_warmup_steps=10, + rel_decay=0.25, + ) + assert 0.25 == _get_cosine_relative_decay_with_warmup( + 100, + num_training_steps=100, + num_warmup_steps=10, + rel_decay=0.25, + ) + assert (0.25 + 0.75 / 2) == _get_cosine_relative_decay_with_warmup( + 55, + num_training_steps=100, + num_warmup_steps=10, + rel_decay=0.25, + )
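
Note: for reference, below is a minimal sketch of a learning-rate multiplier consistent with the values asserted in test_lr.py above (linear warmup to 1.0, then cosine decay down to rel_decay). The actual implementation lives in mist/utils/lr_schedule.py and is not part of this diff, so the signature details and body here are assumptions for illustration only, not the repository's code.

import math

def _get_cosine_relative_decay_with_warmup(
    step: int,
    *,
    num_training_steps: int,
    num_warmup_steps: int,
    rel_decay: float,
) -> float:
    # Illustrative sketch only -- not the repository implementation.
    # Linear warmup: 0.0 at step 0, reaching 1.0 at num_warmup_steps.
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    # Cosine decay from 1.0 down to rel_decay over the remaining steps.
    progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return rel_decay + (1.0 - rel_decay) * cosine

Under these assumptions, step 55 sits halfway through the decay window (steps 10 to 100), so the multiplier is 0.25 + 0.75 / 2 = 0.625, matching the final assertion in test_cosine_rel.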