📃 add document for text feature submodule and host in rtd
- this fixes #12
- this also fixes #9 as all tasks are completed
ZenithClown committed Aug 21, 2024
1 parent f9c903d commit 779bd97
Showing 4 changed files with 53 additions and 16 deletions.
15 changes: 15 additions & 0 deletions docs/feature.md
@@ -0,0 +1,15 @@
# Text Featurization

<div align = "justify">

```{eval-rst}
.. automodule:: nlpurify.feature.selection
```

## Feature Selection with NLTK

```{eval-rst}
.. automodule:: nlpurify.feature.selection.nltk
```

</div>
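The ``{eval-rst}`` fences above lean on MyST's reStructuredText bridge to run Sphinx's ``automodule`` directive from Markdown. As a rough sketch, the Read the Docs build would need something like the following in ``docs/conf.py`` — an assumption here, since the configuration file is not part of this diff:

```python
# docs/conf.py -- hypothetical excerpt, not shown in this commit;
# the project's actual RTD configuration may differ
extensions = [
    "myst_parser",         # parses Markdown and the {eval-rst} fences
    "sphinx.ext.autodoc",  # resolves .. automodule:: nlpurify.feature.selection
]
```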
9 changes: 0 additions & 9 deletions docs/feature_selection.md

This file was deleted.

2 changes: 1 addition & 1 deletion docs/index.md
@@ -15,8 +15,8 @@
```{toctree}
:hidden:
normalize.md
feature.md
legacy.md
feature_selection.md
```

<div align = "justify">
43 changes: 37 additions & 6 deletions nlpurify/feature/selection/nltk.py
@@ -17,6 +17,8 @@

import re

from typing import Union

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

@@ -75,15 +77,15 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k
The default keyword arguments are defined for the
:func:`nltk.tokenize.word_tokenize` function.
* **preserve_line** (*bool*): A flag to decide whether to
sentence tokenize the text or not, as accepted by
the function. Defaults to False.
* **tokenize_language** (*str*): The language model name as
accepted by the Punkt corpus in NLTK. Defaults to the
"english" language, as in the underlying function.
The parameter value associated with regular expression data
control is as below:
@@ -137,6 +139,17 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k
s = "this is an example string, with p()nct & n0s. 987"
print(nlpurify.feature_selection.tokenize_text(s, vanilla = True, retalpha = False))
>> ['this', 'is', 'an', 'example', 'with', '987']
**Error Guidelines**
:raises ValueError: The error is raised when both the attributes
``vanilla`` and ``regexp`` are set to True.
**Return Type**
:rtype: list[str]
:return: Returns a tokenized list of strings. To represent and
save the same in a tabular format, use the ``" ".join()`` method.
"""

preserve_line = kwargs.get("preserve_line", False)
@@ -185,7 +198,7 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k
return tokens[tokenize_method]
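Pulling the docstring together, the mutually exclusive ``vanilla`` and ``regexp`` flags give two call patterns. A minimal usage sketch based only on the documented behaviour — the ``retalpha`` keyword and the expected output are taken from the example above:

```python
import nlpurify

s = "this is an example string, with p()nct & n0s. 987"

# plain NLTK word tokenization, keeping non-alphabetic tokens
tokens = nlpurify.feature_selection.tokenize_text(
    s, vanilla = True, retalpha = False
)
print(tokens)  # ['this', 'is', 'an', 'example', 'with', '987'] per the docstring

# setting both flags together is ambiguous, so a ValueError is raised
try:
    nlpurify.feature_selection.tokenize_text(s, vanilla = True, regexp = True)
except ValueError as err:
    print(err)
```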


def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
def remove_stopwords(text : str, language : str = "english", rtype : object = str, **kwargs) -> Union[str, list]:
"""
Function to Remove Stopwords from a Raw Text using NLTK
@@ -246,7 +259,7 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
the case of the words is not lower.
**Function Example**
For more control over the tokenization, all the parameters
of :func:`tokenize_text()` are accepted.
@@ -261,6 +274,18 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
# this we can further simplify by using other features
print(nlpurify.feature_selection.remove_stopwords(s, regexp = True))
>> example string p nct n0s
**Error Guidelines**
:raises ValueError: The error is raised when the return type is
not one of ``str`` or ``list``. Make sure the argument is a
type instance and is not passed as a string value.
**Return Type**
:rtype: str | list
:return: A cleaned string or a vector (*iterable*) of selected
features from a given text message.
"""

tokenize = kwargs.get("tokenize", True)
@@ -269,12 +294,18 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
stopwords_ = stopwords.words(language) # defaults to english

# ? normalize the text using nlpurify.normalizeText()
# else, left at user's discretion or additional functionalities
text = normalizeText(
text,
uniform_text_case = "lower",
strip_line_breaks = True
) if normalize else text

tokens = tokenize_text(text, **kwargs) if tokenize else text
return " ".join([word for word in tokens if word not in stopwords_])
tokens = [word for word in tokens if word not in stopwords_]

# ensure return type of the data, else raise error
if rtype not in [str, list]:
raise ValueError(f"Accepted arguments are ``list`` or ``str``, received {rtype}.")

return " ".join(tokens) if rtype == str else tokens
