From 86c7aa23a8aee977349f7f3a9ab6f4222a2d1bd5 Mon Sep 17 00:00:00 2001
From: Boris
Date: Sat, 20 Jul 2024 17:08:03 +0200
Subject: [PATCH] fix: remove keybert import (#120)

---
 cognee/modules/data/extraction/__init__.py    |  1 -
 .../modules/data/extraction/extract_topics.py | 85 -------------------
 2 files changed, 86 deletions(-)
 delete mode 100644 cognee/modules/data/extraction/extract_topics.py

diff --git a/cognee/modules/data/extraction/__init__.py b/cognee/modules/data/extraction/__init__.py
index 25f734b7..e69de29b 100644
--- a/cognee/modules/data/extraction/__init__.py
+++ b/cognee/modules/data/extraction/__init__.py
@@ -1 +0,0 @@
-from .extract_topics import extract_topics_keybert
diff --git a/cognee/modules/data/extraction/extract_topics.py b/cognee/modules/data/extraction/extract_topics.py
deleted file mode 100644
index 331f71c8..00000000
--- a/cognee/modules/data/extraction/extract_topics.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import re
-import nltk
-from nltk.tag import pos_tag
-from nltk.corpus import stopwords, wordnet
-from nltk.tokenize import word_tokenize
-from nltk.stem import WordNetLemmatizer
-
-def extract_topics_keybert(texts: list[str]):
-    from keybert import KeyBERT
-
-    kw_model = KeyBERT()
-
-    for text in texts:
-        topics = kw_model.extract_keywords(
-            preprocess_text(text),
-            keyphrase_ngram_range = (1, 2),
-            top_n = 3,
-            # use_mmr = True,
-            # diversity = 0.9,
-        )
-        yield [topic[0] for topic in topics]
-
-def preprocess_text(text: str):
-    try:
-        # Used for stopwords removal.
-        stopwords.ensure_loaded()
-    except LookupError:
-        nltk.download("stopwords", quiet = True)
-        stopwords.ensure_loaded()
-
-    try:
-        # Used in WordNetLemmatizer.
-        wordnet.ensure_loaded()
-    except LookupError:
-        nltk.download("wordnet", quiet = True)
-        wordnet.ensure_loaded()
-
-    try:
-        # Used in word_tokenize.
-        nltk.data.find("tokenizers/punkt")
-    except LookupError:
-        nltk.download("punkt", quiet = True)
-
-    text = text.lower()
-
-    # Remove punctuation
-    text = re.sub(r"[^\w\s-]", "", text)
-
-    # Tokenize the text
-    tokens = word_tokenize(text)
-
-    tagged_tokens = pos_tag(tokens)
-    tokens = [word for word, tag in tagged_tokens if tag in ["NNP", "NN", "JJ"]]
-
-    # Remove stop words
-    stop_words = set(stopwords.words("english"))
-    tokens = [word for word in tokens if word not in stop_words]
-
-    # Lemmatize the text
-    lemmatizer = WordNetLemmatizer()
-    tokens = [lemmatizer.lemmatize(word) for word in tokens]
-
-    # Join tokens back to a single string
-    processed_text = " ".join(tokens)
-
-    return processed_text
-
-
-# def clean_text(text: str):
-#     text = re.sub(r"[ \t]{2,}|[\n\r]", " ", text.lower())
-#     # text = re.sub(r"[`\"'.,;!?…]", "", text).strip()
-#     return text
-
-# def remove_stop_words(text: str):
-#     try:
-#         stopwords.ensure_loaded()
-#     except LookupError:
-#         download("stopwords")
-#         stopwords.ensure_loaded()
-
-#     stop_words = set(stopwords.words("english"))
-#     text = text.split()
-#     text = [word for word in text if not word in stop_words]
-#     return " ".join(text)
-