Skip to content

Commit

Permalink
Fix for svm - model pipeline and roberta - tpsampler
Browse files (browse the repository at this point in the history)
  • Loading branch information
advaithsrao committed Nov 16, 2023
1 parent b2a9a0e commit b7708a7
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 16 deletions.
18 changes: 7 additions & 11 deletions detector/modeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
Expand Down Expand Up @@ -316,9 +317,11 @@ def __init__(
self.num_labels = num_labels
self.kernel = kernel
self.C = C
self.model = SVC(kernel=self.kernel, C=self.C, probability=True, random_state=42, verbose=True, class_weight='balanced')
# self.vectorizer = TfidfVectorizer(max_features=512)
self.vectorizer = Word2VecEmbedder()
self.model = Pipeline([
('vectorizer', self.vectorizer),
('classifier', SVC(kernel=self.kernel, C=self.C, probability=True, random_state=42, verbose=True, class_weight='balanced'))
])

def train(
self,
Expand All @@ -339,12 +342,8 @@ def train(
if isinstance(label, pd.Series):
label = label.tolist()

# Vectorize the input texts
X = np.array([self.vectorizer.fit_transform(text) for text in body])
y = np.array(label)

# Train the SVM model
self.model.fit(X, y)
self.model.fit(body, label)

def predict(
self,
Expand All @@ -361,11 +360,8 @@ def predict(
if isinstance(body, pd.Series):
body = body.tolist()

# Vectorize the input texts
X = np.array([self.vectorizer.fit_transform(text) for text in body])

# Make predictions using the trained SVM model
predictions = self.model.predict(X)
predictions = self.model.predict(body)

if isinstance(predictions, np.ndarray):
predictions = predictions.tolist()
Expand Down
8 changes: 3 additions & 5 deletions utils/util_modeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,7 @@ def __init__(
self.tp_ratio = tp_ratio
self.batch_size = batch_size

def __iter__(
self
):
def __iter__(self):
"""Iterate through the sampled indices.
Returns:
Expand All @@ -160,8 +158,8 @@ def __iter__(
sampled_indices = []

while len(sampled_indices) < num_samples:
tp_indices = np.random.choice(self.tp_indices, tp_batch_size, replace=True)
non_tp_indices = np.random.choice(self.non_tp_indices, non_tp_batch_size, replace=True)
tp_indices = np.random.choice(self.tp_indices, tp_batch_size, replace=False)
non_tp_indices = np.random.choice(self.non_tp_indices, non_tp_batch_size, replace=False)
batch_indices = np.concatenate((tp_indices, non_tp_indices))
np.random.shuffle(batch_indices)
sampled_indices.extend(batch_indices)
Expand Down

0 comments on commit b7708a7

Please sign in to comment.