diff --git a/detector/modeler.py b/detector/modeler.py
index 5a1d730..88a7ca1 100644
--- a/detector/modeler.py
+++ b/detector/modeler.py
@@ -1,5 +1,5 @@
 import os
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
+# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
 
 import shutil
 import pandas as pd
@@ -17,6 +17,8 @@ from transformers import AdamW, get_linear_schedule_with_warmup
 from torch.utils.data import DataLoader, TensorDataset#, SubsetRandomSampler
 
+import torch.nn.functional as F
+
 import wandb
 from mlflow.sklearn import save_model
 from scipy.sparse import hstack
 
@@ -105,7 +107,7 @@ def train(
         # Convert lists to tensors
         input_ids = torch.cat(input_ids, dim=0)
         attention_masks = torch.cat(attention_masks, dim=0)
-        label_ids = torch.stack(label_ids).squeeze()  # Create a 1D tensor for label_ids
+        label_ids = torch.stack(label_ids)
 
         # Split the data into train and validation sets
         dataset = TensorDataset(input_ids, attention_masks, label_ids)
@@ -152,8 +154,11 @@ def train(
            outputs = self.model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            logits = outputs.logits  # Use logits attribute to get the predicted logits
 
+           # Convert labels to one-hot encoding
+           b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()
+
            # Calculate the loss using the weighted loss function
-           loss = loss_function(logits.squeeze(), b_labels)
+           loss = loss_function(logits, b_labels_one_hot)
 
            total_train_loss += loss.item()
            loss.backward()
@@ -185,7 +190,11 @@ def train(
                # loss = outputs[0]
                logits = outputs.logits
-               loss = loss_function(logits, b_labels)
+               # Convert labels to one-hot encoding
+               b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()
+
+               # Calculate the loss using the weighted loss function
+               loss = loss_function(logits, b_labels_one_hot)
 
                total_eval_loss += loss.item()
                logits = logits.detach().to(self.device).numpy()
                label_ids = b_labels.to(self.device).numpy()
@@ -393,7 +402,7 @@ def train(
         # Convert lists to tensors
         input_ids = torch.cat(input_ids, dim=0)
         attention_masks = torch.cat(attention_masks, dim=0)
-        label_ids = torch.stack(label_ids).squeeze()  # Create a 1D tensor for label_ids
+        label_ids = torch.stack(label_ids)
 
         # Split the data into train and validation sets
         dataset = TensorDataset(input_ids, attention_masks, label_ids)
@@ -440,8 +449,11 @@ def train(
            outputs = self.model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            logits = outputs.logits
 
+           # Convert labels to one-hot encoding
+           b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()
+
            # Calculate the loss using the weighted loss function
-           loss = loss_function(logits.squeeze(), b_labels)
+           loss = loss_function(logits, b_labels_one_hot)
 
            total_train_loss += loss.item()
            loss.backward()
@@ -472,7 +484,11 @@ def train(
                outputs = self.model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                logits = outputs.logits
-               loss = loss_function(logits, b_labels)
+               # Convert labels to one-hot encoding
+               b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()
+
+               # Calculate the loss using the weighted loss function
+               loss = loss_function(logits, b_labels_one_hot)
 
                total_eval_loss += loss.item()
                logits = logits.detach().to(self.device).numpy()
                label_ids = b_labels.to(self.device).numpy()
@@ -552,7 +568,7 @@ def predict(
 
        with torch.no_grad():
            outputs = self.model(b_input_ids, attention_mask=b_input_mask)
 
-       logits = outputs[0]
+       logits = outputs.logits
 
       logits = logits.detach().cpu().numpy()
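For context on why the labels are one-hot encoded before the loss call: criteria such as `BCEWithLogitsLoss` expect float targets with the same shape as the logits, whereas the previous integer `b_labels` only fit index-based losses like the default `CrossEntropyLoss`. Below is a minimal sketch of the new call pattern; `loss_function` here is a hypothetical stand-in, since its actual definition sits outside these hunks.

```python
import torch
import torch.nn.functional as F

# Hypothetical stand-in for the weighted loss_function defined elsewhere in
# modeler.py: BCEWithLogitsLoss with a per-class pos_weight. It expects float
# targets shaped like the logits, hence the one-hot conversion in the diff.
loss_function = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.0, 3.0]))

logits = torch.randn(4, 2)             # batch of 4 examples, 2 classes
b_labels = torch.tensor([0, 1, 1, 0])  # integer class ids, shape (4,)

# The pattern introduced by this diff: one-hot encode, then compute the loss
b_labels_one_hot = F.one_hot(b_labels, num_classes=2).float()  # shape (4, 2)
loss = loss_function(logits, b_labels_one_hot)
print(loss.item())
```

If `loss_function` is actually `nn.CrossEntropyLoss`, note that PyTorch >= 1.10 also accepts class-probability targets of this shape directly, so the one-hot conversion should be compatible either way.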