From 779bd97f415daa6d24f66071362159fdd54103bf Mon Sep 17 00:00:00 2001 From: ZenithClown Date: Thu, 22 Aug 2024 01:18:41 +0530 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=83=20ad=20document=20for=20text=20fea?= =?UTF-8?q?ture=20submodule=20and=20host=20in=20rtd=20-=20this=20fixes=20#?= =?UTF-8?q?12=20-=20this=20also=20fixes=20#9=20as=20all=20tasks=20are=20co?= =?UTF-8?q?mpleted?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/feature.md | 15 +++++++++++ docs/feature_selection.md | 9 ------- docs/index.md | 2 +- nlpurify/feature/selection/nltk.py | 43 +++++++++++++++++++++++++----- 4 files changed, 53 insertions(+), 16 deletions(-) create mode 100644 docs/feature.md delete mode 100644 docs/feature_selection.md diff --git a/docs/feature.md b/docs/feature.md new file mode 100644 index 0000000..92a0413 --- /dev/null +++ b/docs/feature.md @@ -0,0 +1,15 @@ +# Text Featurization + +
+ +```{eval-rst} +.. automodule:: nlpurify.feature.selection +``` + +## Feature Selection with NLTK + +```{eval-rst} +.. automodule:: nlpurify.feature.selection.nltk +``` + +
diff --git a/docs/feature_selection.md b/docs/feature_selection.md deleted file mode 100644 index fcf5cbd..0000000 --- a/docs/feature_selection.md +++ /dev/null @@ -1,9 +0,0 @@ -# Feature Selection - -
- -```{eval-rst} -.. automodule:: nlpurify.feature.selection -``` - -
diff --git a/docs/index.md b/docs/index.md index 5905854..99ef53e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -15,8 +15,8 @@ ```{toctree} :hidden: normalize.md +feature.md legacy.md -feature_selection.md ```
diff --git a/nlpurify/feature/selection/nltk.py b/nlpurify/feature/selection/nltk.py index cb81f6d..2c100a5 100644 --- a/nlpurify/feature/selection/nltk.py +++ b/nlpurify/feature/selection/nltk.py @@ -17,6 +17,8 @@ import re +from typing import Union + from nltk.corpus import stopwords from nltk.tokenize import word_tokenize @@ -75,7 +77,7 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k The default keyword arguments are defined for the :func:`nltk.tokenize.word_tokenize` function. - + * **preserve_line** (*bool*): A flag to decide whether to sentence tokenize the text or not, as accepted by the function. Defaults to False. @@ -83,7 +85,7 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k * **tokenize_language** (*str*): The language model name as accepted by the Punkt corpus by NLTK. Defaults to the "english" language, as in function. - + The paramter value associated with regular expression data control is as below: @@ -137,6 +139,17 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k s = "this is an example string, with p()nct & n0s. 987" print(nlpurify.feature_selection.tokenize_text(s, vanilla = True, retalpha = False)) >> ['this', 'is', 'an', 'example', 'with', '987'] + + **Error Guidelines** + + :raises ValueError: The error is raised when both the attribute + ``vanilla`` and ``regexp`` is set to True. + + **Return Type** + + :rtype: list[str] + :return: Returns a tokenized list of strings. To represent and + save the same in a tabular format use ``"".join()`` method. 
""" preserve_line = kwargs.get("preserve_line", False) @@ -185,7 +198,7 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k return tokens[tokenize_method] -def remove_stopwords(text : str, language : str = "english", **kwargs) -> str: +def remove_stopwords(text : str, language : str = "english", rtype : object = str, **kwargs) -> Union[str, list]: """ Function to Remove Stopwods from a Raw Text using NLTK @@ -246,7 +259,7 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str: the case of the words is not lower. **Function Example** - + For more control over the tokenization, all the parameters of :func:`tokenize_text()` is accepted. @@ -261,6 +274,18 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str: # this we can further simplify by using other features print(nlpurify.feature_selection.remove_stopwords(s, regexp = True)) >> example string p nct n0s + + **Error Guidelines** + + :raises ValueError: The error is raised when the return type is + not in {str, list} values. Make sure the data type is an type + instance and is not passed as a string value. + + **Return Type** + + :rtype: str | list + :return: A cleaned string or a vector (*iterable*) of selected + features from a given text message. """ tokenize = kwargs.get("tokenize", True) @@ -269,7 +294,7 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str: stopwords_ = stopwords.words(language) # defaults to english # ? 
normalize the text using nlpurify.normalizeText()
-    # else, left at user's discreations or additional functionalities 
+    # else, left at user's discretion or additional functionalities
     text = normalizeText(
         text,
         uniform_text_case = "lower",
@@ -277,4 +302,10 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
     ) if normalize else text
 
     tokens = tokenize_text(text, **kwargs) if tokenize else text
-    return " ".join([word for word in tokens if word not in stopwords_])
+    tokens = [word for word in tokens if word not in stopwords_]
+
+    # ensure return type of the data, else raise error
+    if rtype not in [str, list]:
+        raise ValueError(f"Accepted arguments ``list`` or ``str`` received {rtype}.")
+
+    return " ".join(tokens) if rtype == str else tokens