diff --git a/docs/feature.md b/docs/feature.md
new file mode 100644
index 0000000..92a0413
--- /dev/null
+++ b/docs/feature.md
@@ -0,0 +1,15 @@
+# Text Featurization
+
+
+
+```{eval-rst}
+.. automodule:: nlpurify.feature.selection
+```
+
+## Feature Selection with NLTK
+
+```{eval-rst}
+.. automodule:: nlpurify.feature.selection.nltk
+```
+
+
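For quick orientation, here is a minimal usage sketch of the module documented above. It assumes the `nlpurify.feature_selection` alias used in the docstring examples later in this patch, and reuses the sample string from those docstrings:

```python
# minimal sketch of the documented API; assumes the
# ``nlpurify.feature_selection`` alias from the docstring examples
import nlpurify

s = "this is an example string, with p()nct & n0s. 987"

# vanilla tokenization backed by nltk.tokenize.word_tokenize
print(nlpurify.feature_selection.tokenize_text(s, vanilla = True))
```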
diff --git a/docs/feature_selection.md b/docs/feature_selection.md
deleted file mode 100644
index fcf5cbd..0000000
--- a/docs/feature_selection.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# Feature Selection
-
-
-
-```{eval-rst}
-.. automodule:: nlpurify.feature.selection
-```
-
-
diff --git a/docs/index.md b/docs/index.md
index 5905854..99ef53e 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -15,8 +15,8 @@
 
 ```{toctree}
 :hidden:
 
 normalize.md
+feature.md
 legacy.md
-feature_selection.md
 ```
diff --git a/nlpurify/feature/selection/nltk.py b/nlpurify/feature/selection/nltk.py
index cb81f6d..2c100a5 100644
--- a/nlpurify/feature/selection/nltk.py
+++ b/nlpurify/feature/selection/nltk.py
@@ -17,6 +17,8 @@
 
 import re
 
+from typing import Union
+
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 
@@ -75,7 +77,7 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k
 
     The default keyword arguments are defined for the
     :func:`nltk.tokenize.word_tokenize` function.
-    
+
     * **preserve_line** (*bool*): A flag to decide whether to
       sentence tokenize the text or not, as accepted by the
       function. Defaults to False.
@@ -83,7 +85,7 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k
 
     * **tokenize_language** (*str*): The language model name as
      accepted by the Punkt corpus by NLTK. Defaults to the
      "english" language, as in function.
-    
+
     The parameter value associated with regular expression data
     control is as below:
@@ -137,6 +139,17 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k
         s = "this is an example string, with p()nct & n0s. 987"
         print(nlpurify.feature_selection.tokenize_text(s, vanilla = True, retalpha = False))
         >> ['this', 'is', 'an', 'example', 'with', '987']
+
+    **Error Guidelines**
+
+    :raises ValueError: The error is raised when both the attributes
+        ``vanilla`` and ``regexp`` are set to True.
+
+    **Return Type**
+
+    :rtype: list[str]
+    :return: Returns a tokenized list of strings. To represent and
+        save the same in a tabular format, use the ``" ".join()`` method.
     """
 
     preserve_line = kwargs.get("preserve_line", False)
@@ -185,7 +198,7 @@
 
     return tokens[tokenize_method]
 
-def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
+def remove_stopwords(text : str, language : str = "english", rtype : type = str, **kwargs) -> Union[str, list]:
     """
     Function to Remove Stopwords from a Raw Text using NLTK
 
@@ -246,7 +259,7 @@
 
     the case of the words is not lower.
 
     **Function Example**
-    
+
     For more control over the tokenization, all the parameters of
     :func:`tokenize_text()` are accepted.
@@ -261,6 +274,18 @@
         # this we can further simplify by using other features
         print(nlpurify.feature_selection.remove_stopwords(s, regexp = True))
         >> example string p nct n0s
+
+    **Error Guidelines**
+
+    :raises ValueError: The error is raised when the return type is
+        not one of the ``str`` or ``list`` types. Make sure the value
+        is a type instance and is not passed as a string value.
+
+    **Return Type**
+
+    :rtype: str | list
+    :return: A cleaned string or a vector (*iterable*) of selected
+        features from a given text message.
     """
 
     tokenize = kwargs.get("tokenize", True)
@@ -269,7 +294,7 @@
     stopwords_ = stopwords.words(language) # defaults to english
 
     # ? normalize the text using nlpurify.normalizeText()
-    # else, left at user's discreations or additional functionalities
+    # else, left at user's discretion or additional functionalities
     text = normalizeText(
         text,
         uniform_text_case = "lower",
@@ -277,4 +302,10 @@
     ) if normalize else text
 
     tokens = tokenize_text(text, **kwargs) if tokenize else text
-    return " ".join([word for word in tokens if word not in stopwords_])
+    tokens = [word for word in tokens if word not in stopwords_]
+
+    # ensure the requested return type is valid, else raise an error
+    if rtype not in [str, list]:
+        raise ValueError(f"Accepted arguments are ``str`` or ``list``, received {rtype}.")
+
+    return " ".join(tokens) if rtype == str else tokens
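To illustrate the new `rtype` switch, here is a short usage sketch mirroring the docstring examples in this patch (the sample string is the one from those docstrings; behaviour as per the updated function body):

```python
# sketch of the new ``rtype`` argument introduced in this patch
import nlpurify

s = "this is an example string, with p()nct & n0s. 987"

# default behaviour is unchanged: a single whitespace-joined string
cleaned = nlpurify.feature_selection.remove_stopwords(s, regexp = True)

# request the selected features as a list instead of a joined string
tokens = nlpurify.feature_selection.remove_stopwords(s, regexp = True, rtype = list)

# passing anything other than the str/list type objects raises ValueError,
# e.g. the string literal "list" instead of the type object list
try:
    nlpurify.feature_selection.remove_stopwords(s, rtype = "list")
except ValueError as err:
    print(err)
```

Keeping `str` as the default value of `rtype` preserves backwards compatibility with the previous return contract of `remove_stopwords`.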