diff --git a/detector/modeler.py b/detector/modeler.py index d4d703d..0be9bb1 100644 --- a/detector/modeler.py +++ b/detector/modeler.py @@ -9,6 +9,7 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from sklearn.utils.class_weight import compute_class_weight +from sklearn.pipeline import Pipeline import torch from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup from torch.utils.data import DataLoader, TensorDataset @@ -316,9 +317,11 @@ def __init__( self.num_labels = num_labels self.kernel = kernel self.C = C - self.model = SVC(kernel=self.kernel, C=self.C, probability=True, random_state=42, verbose=True, class_weight='balanced') - # self.vectorizer = TfidfVectorizer(max_features=512) self.vectorizer = Word2VecEmbedder() + self.model = Pipeline([ + ('vectorizer', self.vectorizer), + ('classifier', SVC(kernel=self.kernel, C=self.C, probability=True, random_state=42, verbose=True, class_weight='balanced')) + ]) def train( self, @@ -339,12 +342,8 @@ def train( if isinstance(label, pd.Series): label = label.tolist() - # Vectorize the input texts - X = np.array([self.vectorizer.fit_transform(text) for text in body]) - y = np.array(label) - # Train the SVM model - self.model.fit(X, y) + self.model.fit(body, label) def predict( self, @@ -361,11 +360,8 @@ def predict( if isinstance(body, pd.Series): body = body.tolist() - # Vectorize the input texts - X = np.array([self.vectorizer.fit_transform(text) for text in body]) - # Make predictions using the trained SVM model - predictions = self.model.predict(X) + predictions = self.model.predict(body) if isinstance(predictions, np.ndarray): predictions = predictions.tolist() diff --git a/utils/util_modeler.py b/utils/util_modeler.py index d23da3e..78f7126 100644 --- a/utils/util_modeler.py +++ b/utils/util_modeler.py @@ -145,9 +145,7 @@ def __init__( self.tp_ratio = tp_ratio self.batch_size = batch_size - def __iter__( - self - ): + def __iter__(self): """Iterate through the sampled indices. Returns: @@ -160,8 +158,8 @@ def __iter__( sampled_indices = [] while len(sampled_indices) < num_samples: - tp_indices = np.random.choice(self.tp_indices, tp_batch_size, replace=True) - non_tp_indices = np.random.choice(self.non_tp_indices, non_tp_batch_size, replace=True) + tp_indices = np.random.choice(self.tp_indices, tp_batch_size, replace=False) + non_tp_indices = np.random.choice(self.non_tp_indices, non_tp_batch_size, replace=False) batch_indices = np.concatenate((tp_indices, non_tp_indices)) np.random.shuffle(batch_indices) sampled_indices.extend(batch_indices)