Fix for SVM model pipeline and RoBERTa tpsampler

advaithsrao · Nov 16, 2023 · b7708a7
1 parent b2a9a0e
commit b7708a7
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 16 deletions.
diff --git a/detector/modeler.py b/detector/modeler.py
@@ -9,6 +9,7 @@
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import accuracy_score
 from sklearn.utils.class_weight import compute_class_weight
+from sklearn.pipeline import Pipeline
 import torch
 from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
 from torch.utils.data import DataLoader, TensorDataset
@@ -316,9 +317,11 @@ def __init__(
         self.num_labels = num_labels
         self.kernel = kernel
         self.C = C
-        self.model = SVC(kernel=self.kernel, C=self.C, probability=True, random_state=42, verbose=True, class_weight='balanced')
-        # self.vectorizer = TfidfVectorizer(max_features=512)
         self.vectorizer = Word2VecEmbedder()
+        self.model = Pipeline([
+            ('vectorizer', self.vectorizer),
+            ('classifier', SVC(kernel=self.kernel, C=self.C, probability=True, random_state=42, verbose=True, class_weight='balanced'))
+        ])
 
     def train(
         self,
@@ -339,12 +342,8 @@ def train(
         if isinstance(label, pd.Series):
             label = label.tolist()
 
-        # Vectorize the input texts
-        X = np.array([self.vectorizer.fit_transform(text) for text in body])
-        y = np.array(label)
-
         # Train the SVM model
-        self.model.fit(X, y)
+        self.model.fit(body, label)
 
     def predict(
         self,
@@ -361,11 +360,8 @@ def predict(
         if isinstance(body, pd.Series):
             body = body.tolist()
 
-        # Vectorize the input texts
-        X = np.array([self.vectorizer.fit_transform(text) for text in body])
-
         # Make predictions using the trained SVM model
-        predictions = self.model.predict(X)
+        predictions = self.model.predict(body)
 
         if isinstance(predictions, np.ndarray):
             predictions = predictions.tolist()

diff --git a/utils/util_modeler.py b/utils/util_modeler.py
@@ -145,9 +145,7 @@ def __init__(
         self.tp_ratio = tp_ratio
         self.batch_size = batch_size
 
-    def __iter__(
-        self
-    ):
+    def __iter__(self):
         """Iterate through the sampled indices.
 
         Returns:
@@ -160,8 +158,8 @@ def __iter__(
         sampled_indices = []
 
         while len(sampled_indices) < num_samples:
-            tp_indices = np.random.choice(self.tp_indices, tp_batch_size, replace=True)
-            non_tp_indices = np.random.choice(self.non_tp_indices, non_tp_batch_size, replace=True)
+            tp_indices = np.random.choice(self.tp_indices, tp_batch_size, replace=False)
+            non_tp_indices = np.random.choice(self.non_tp_indices, non_tp_batch_size, replace=False)
             batch_indices = np.concatenate((tp_indices, non_tp_indices))
             np.random.shuffle(batch_indices)
             sampled_indices.extend(batch_indices)