From 779bd97f415daa6d24f66071362159fdd54103bf Mon Sep 17 00:00:00 2001 From: ZenithClown Date: Thu, 22 Aug 2024 01:18:41 +0530 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=83=20ad=20document=20for=20text=20fea?= =?UTF-8?q?ture=20submodule=20and=20host=20in=20rtd=20-=20this=20fixes=20#?= =?UTF-8?q?12=20-=20this=20also=20fixes=20#9=20as=20all=20tasks=20are=20co?= =?UTF-8?q?mpleted?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/feature.md | 15 +++++++++++ docs/feature_selection.md | 9 ------- docs/index.md | 2 +- nlpurify/feature/selection/nltk.py | 43 +++++++++++++++++++++++++----- 4 files changed, 53 insertions(+), 16 deletions(-) create mode 100644 docs/feature.md delete mode 100644 docs/feature_selection.md diff --git a/docs/feature.md b/docs/feature.md new file mode 100644 index 0000000..92a0413 --- /dev/null +++ b/docs/feature.md @@ -0,0 +1,15 @@ +# Text Featurization + +
+ +```{eval-rst} +.. automodule:: nlpurify.feature.selection +``` + +## Feature Selection with NLTK + +```{eval-rst} +.. automodule:: nlpurify.feature.selection.nltk +``` + +
diff --git a/docs/feature_selection.md b/docs/feature_selection.md deleted file mode 100644 index fcf5cbd..0000000 --- a/docs/feature_selection.md +++ /dev/null @@ -1,9 +0,0 @@ -# Feature Selection - -
- -```{eval-rst} -.. automodule:: nlpurify.feature.selection -``` - -
diff --git a/docs/index.md b/docs/index.md index 5905854..99ef53e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,8 +15,8 @@ ```{toctree} :hidden: normalize.md +feature.md legacy.md -feature_selection.md ```
diff --git a/nlpurify/feature/selection/nltk.py b/nlpurify/feature/selection/nltk.py index cb81f6d..2c100a5 100644 --- a/nlpurify/feature/selection/nltk.py +++ b/nlpurify/feature/selection/nltk.py @@ -17,6 +17,8 @@ import re +from typing import Union + from nltk.corpus import stopwords from nltk.tokenize import word_tokenize @@ -75,7 +77,7 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k The default keyword arguments are defined for the :func:`nltk.tokenize.word_tokenize` function. - + * **preserve_line** (*bool*): A flag to decide whether to sentence tokenize the text or not, as accepted by the function. Defaults to False. @@ -83,7 +85,7 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k * **tokenize_language** (*str*): The language model name as accepted by the Punkt corpus by NLTK. Defaults to the "english" language, as in function. - + The paramter value associated with regular expression data control is as below: @@ -137,6 +139,17 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k s = "this is an example string, with p()nct & n0s. 987" print(nlpurify.feature_selection.tokenize_text(s, vanilla = True, retalpha = False)) >> ['this', 'is', 'an', 'example', 'with', '987'] + + **Error Guidelines** + + :raises ValueError: The error is raised when both the attribute + ``vanilla`` and ``regexp`` is set to True. + + **Return Type** + + :rtype: list[str] + :return: Returns a tokenized list of strings. To represent and + save the same in a tabular format use ``"".join()`` method. 
""" preserve_line = kwargs.get("preserve_line", False) @@ -185,7 +198,7 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k return tokens[tokenize_method] -def remove_stopwords(text : str, language : str = "english", **kwargs) -> str: +def remove_stopwords(text : str, language : str = "english", rtype : object = str, **kwargs) -> Union[str, list]: """ Function to Remove Stopwods from a Raw Text using NLTK @@ -246,7 +259,7 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str: the case of the words is not lower. **Function Example** - + For more control over the tokenization, all the parameters of :func:`tokenize_text()` is accepted. @@ -261,6 +274,18 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str: # this we can further simplify by using other features print(nlpurify.feature_selection.remove_stopwords(s, regexp = True)) >> example string p nct n0s + + **Error Guidelines** + + :raises ValueError: The error is raised when the return type is + not in {str, list} values. Make sure the data type is an type + instance and is not passed as a string value. + + **Return Type** + + :rtype: str | list + :return: A cleaned string or a vector (*iterable*) of selected + features from a given text message. """ tokenize = kwargs.get("tokenize", True) @@ -269,7 +294,7 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str: stopwords_ = stopwords.words(language) # defaults to english # ? 
normalize the text using nlpurify.normalizeText()
-    # else, left at user's discreations or additional functionalities 
+    # else, left at user's discretion or additional functionalities
     text = normalizeText(
         text,
         uniform_text_case = "lower",
@@ -277,4 +302,10 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
     ) if normalize else text
 
     tokens = tokenize_text(text, **kwargs) if tokenize else text
-    return " ".join([word for word in tokens if word not in stopwords_])
+    tokens = [word for word in tokens if word not in stopwords_]
+
+    # ensure return type of the data, else raise error
+    if rtype not in [str, list]:
+        raise ValueError(f"Accepted arguments ``list`` or ``str`` received {rtype}.")
+
+    return " ".join(tokens) if rtype == str else tokens