Created Base.py for base distilbert model and modified dp.save
advaithsrao committed Dec 5, 2023
1 parent ae39567 commit 18fe3f3
Showing 2 changed files with 68 additions and 30 deletions.
52 changes: 52 additions & 0 deletions ethics/base.py
@@ -0,0 +1,52 @@
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

import shutil
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline
import torch
from torch import nn

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

import wandb
from mlflow.sklearn import save_model
from scipy.sparse import hstack


class BaseDistilbertModel(nn.Module):
    def __init__(self, num_labels, model_name='distilbert-base-uncased', device='cuda'):
        super(BaseDistilbertModel, self).__init__()

        # Load the pre-trained DistilBERT encoder
        self.model = DistilBertModel.from_pretrained(model_name).to(device)

        # Freeze the encoder so only the classification head is trained
        for param in self.model.parameters():
            param.requires_grad = False

        # Define the classification head
        self.classification_head = nn.Sequential(
            nn.Linear(self.model.config.hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, labels=None):
        # labels is accepted for interface compatibility but unused here
        # Get encoder outputs
        outputs = self.model(input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state

        # Classify from the [CLS] token representation
        logits = self.classification_head(last_hidden_states[:, 0, :])

        return logits
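
For context, a minimal usage sketch of this head-only classifier (the tokenizer choice, example texts, and num_labels=2 are illustrative assumptions, not part of this commit):

import torch
from transformers import DistilBertTokenizer

from ethics.base import BaseDistilbertModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = BaseDistilbertModel(num_labels=2, device=device).to(device)

batch = tokenizer(
    ['This email looks routine.', 'This email looks suspicious.'],
    padding=True, truncation=True, return_tensors='pt'
).to(device)

# Only the classification head carries gradients; the encoder stays frozen.
logits = model(batch['input_ids'], attention_mask=batch['attention_mask'])
print(logits.argmax(dim=-1))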
46 changes: 16 additions & 30 deletions ethics/differential_privacy.py
@@ -1,6 +1,9 @@
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

import sys
sys.path.append('..')

import shutil
import pandas as pd
import numpy as np
@@ -23,39 +26,13 @@
from mlflow.sklearn import save_model
from scipy.sparse import hstack

from base import BaseDistilbertModel
from utils.util_modeler import Word2VecEmbedder, TPSampler

from opacus import PrivacyEngine
from opacus.utils.batch_memory_manager import BatchMemoryManager


class BaseModel(nn.Module):
def __init__(self, num_labels, model_name='distilbert-base-uncased', device = 'cuda'):
super(BaseModel, self).__init__()

# Load pre-trained DistilBertModel
self.model = DistilBertModel.from_pretrained(model_name).to(device)

for param in self.model.parameters():
param.requires_grad = False

# Define classification head
self.classification_head = nn.Sequential(
nn.Linear(self.model.config.hidden_size, 128),
nn.ReLU(),
nn.Linear(128, num_labels)
)

def forward(self, input_ids, attention_mask, labels=None):
# Get model outputs
outputs = self.model(input_ids, attention_mask=attention_mask)
last_hidden_states = outputs.last_hidden_state

# Apply classification head
logits = self.classification_head(last_hidden_states[:, 0, :])

return logits

class DistilbertPrivacyModel:
def __init__(
self,
@@ -88,7 +65,7 @@ def __init__(
if self.path != '':
raise NotImplementedError('Loading model from path is not implemented yet.')
else:
self.model = BaseModel(num_labels=self.num_labels, model_name=self.model_name)
self.model = BaseDistilbertModel(num_labels=self.num_labels, model_name=self.model_name)
self.model.to(self.device)

self.privacy_engine = PrivacyEngine()
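
Instantiating PrivacyEngine only creates the engine; during training the model, optimizer, and data loader still need to be wrapped before gradients are noised and clipped. A minimal sketch of that wiring, assuming optimizer/loader variable names and placeholder DP hyperparameters that are not taken from this commit:

# Sketch only: the optimizer/train_dataloader names and DP values are assumptions.
self.model, optimizer, train_dataloader = self.privacy_engine.make_private(
    module=self.model,
    optimizer=optimizer,
    data_loader=train_dataloader,
    noise_multiplier=1.0,  # placeholder noise scale
    max_grad_norm=1.0,     # placeholder per-sample gradient clipping bound
)
# make_private wraps the model in a GradSampleModule, which is why
# save_model below reads the raw weights through self.model._module.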
@@ -355,8 +332,17 @@ def save_model(
os.makedirs(path, exist_ok=True)

# Save the transformer model and the classification head
self.model.save_pretrained(path)
torch.save(self.classification_head.state_dict(), os.path.join(path, 'classification_head.pth'))
# self.model.save_pretrained(path)
# torch.save(self.classification_head.state_dict(), os.path.join(path, 'classification_head.pth'))
        try:
            # Persist the privacy accountant so the spent privacy budget can be restored
            # ('accountant.pth' is an illustrative file name)
            torch.save(self.privacy_engine.accountant, os.path.join(path, 'accountant.pth'))
        except Exception as e:
            print(f'Accountant not saved: {e}')

        try:
            # Opacus wraps the trained model in a GradSampleModule; _module is the
            # underlying BaseDistilbertModel whose weights we persist
            # ('model.pth' is an illustrative file name)
            torch.save(self.model._module.state_dict(), os.path.join(path, 'model.pth'))
        except Exception as e:
            print(f'Model not saved: {e}')

def accuracy(
self,
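Loading from a path still raises NotImplementedError in __init__, so the matching load step is only sketched below, mirroring the illustrative file names used in save_model ('model.pth' and 'accountant.pth' are placeholders, not names fixed by this commit):

import os
import torch

from ethics.base import BaseDistilbertModel

path = 'checkpoints/dp_model'  # hypothetical directory written by save_model
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Rebuild the architecture (num_labels must match training; 2 is assumed here),
# then restore the privately trained weights.
model = BaseDistilbertModel(num_labels=2, device=device)
model.load_state_dict(torch.load(os.path.join(path, 'model.pth'), map_location=device))

# Restoring the pickled accountant recovers the spent privacy budget.
accountant = torch.load(os.path.join(path, 'accountant.pth'))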
