From 1a14a9ba9f59753a16877c1b8136663e3f1a3724 Mon Sep 17 00:00:00 2001 From: Advaith Rao Date: Thu, 16 Nov 2023 01:26:46 -0500 Subject: [PATCH] Word2vec embedder transformer for sklearn pipeline - svm --- detector/modeler.py | 2 +- tests/test_modeler.py | 2 +- utils/util_modeler.py | 35 ++++++++++++++++++++++++----------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/detector/modeler.py b/detector/modeler.py index 0be9bb1..0c91599 100644 --- a/detector/modeler.py +++ b/detector/modeler.py @@ -381,4 +381,4 @@ def save_model( if not os.path.exists(path): os.makedirs(path, exist_ok=True) - save_model(self.model, path) \ No newline at end of file + save_model(self.model, path) diff --git a/tests/test_modeler.py b/tests/test_modeler.py index cee21ec..dabe15f 100644 --- a/tests/test_modeler.py +++ b/tests/test_modeler.py @@ -80,7 +80,7 @@ def test_evaluate_and_log(x, y_true, y_pred): def test_word2vec_embedding(mail): embedder = Word2VecEmbedder() - embedding = embedder.fit_transform(mail) + embedding = embedder.transform(mail)[0] assert len(embedding) == 300 def test_tp_sampler(): diff --git a/utils/util_modeler.py b/utils/util_modeler.py index 78f7126..6b648af 100644 --- a/utils/util_modeler.py +++ b/utils/util_modeler.py @@ -6,6 +6,7 @@ import wandb # from torch.utils.data import Sampler from sklearn.utils.class_weight import compute_sample_weight +from sklearn.base import BaseEstimator, TransformerMixin def get_f1_score( y_true: list[int], @@ -91,35 +92,47 @@ def evaluate_and_log( log_file.write(log_content) -class Word2VecEmbedder: +class Word2VecEmbedder(BaseEstimator, TransformerMixin): def __init__( self, model_name: str = 'word2vec-google-news-300', - tokenizer: RegexpTokenizer(r'\w+') = RegexpTokenizer(r'\w+') + tokenizer=RegexpTokenizer(r'\w+') ): self.model = gensim.downloader.load(model_name) self.tokenizer = tokenizer - def fit_transform( + def fit( self, - text: str, - + X, + y=None + ): + return self + + def transform( + self, + X ): """Calculate Word2Vec embeddings for the given text. Args: - text (str): text document. + X (list): List of text documents. Returns: np.ndarray: Word2Vec embeddings for the input text. """ - # Initialize an array to store Word2Vec embeddings for the input text - words = self.tokenizer.tokenize(text) # Tokenize the document - word_vectors = [self.model[word] if word in self.model else np.zeros(self.model.vector_size) for word in words] - document_embedding = np.mean(word_vectors, axis=0) # Calculate the mean of word embeddings for the document + if isinstance(X, str): + X = [X] + + embeddings = [] + + for text in X: + words = self.tokenizer.tokenize(text) # Tokenize the document + word_vectors = [self.model[word] if word in self.model else np.zeros(self.model.vector_size) for word in words] + document_embedding = np.mean(word_vectors, axis=0) # Calculate the mean of word embeddings for the document + embeddings.append(document_embedding) - return document_embedding.tolist() + return np.array(embeddings) class TPSampler: