📃 add document for text feature submodule and host in rtd
- this fixes #12
- this also fixes #9 as all tasks are completed
ZenithClown committed Aug 21, 2024
1 parent f9c903d commit 779bd97
Showing 4 changed files with 53 additions and 16 deletions.
15 changes: 15 additions & 0 deletions docs/feature.md
@@ -0,0 +1,15 @@
# Text Featurization

<div align = "justify">

```{eval-rst}
.. automodule:: nlpurify.feature.selection
```

## Feature Selection with NLTK

```{eval-rst}
.. automodule:: nlpurify.feature.selection.nltk
```

</div>
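The ``{eval-rst}`` fences above lean on MyST's reStructuredText bridge to run Sphinx's ``automodule`` directive from Markdown. As a rough sketch, the Read the Docs build would need something like the following in ``docs/conf.py`` — an assumption here, since the configuration file is not part of this diff:

```python
# docs/conf.py -- hypothetical excerpt, not shown in this commit;
# the project's actual RTD configuration may differ
extensions = [
    "myst_parser",         # parses Markdown and the {eval-rst} fences
    "sphinx.ext.autodoc",  # resolves .. automodule:: nlpurify.feature.selection
]
```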
9 changes: 0 additions & 9 deletions docs/feature_selection.md

This file was deleted.

2 changes: 1 addition & 1 deletion docs/index.md
@@ -15,8 +15,8 @@
```{toctree}
:hidden:
normalize.md
feature.md
legacy.md
feature_selection.md
```

<div align = "justify">
43 changes: 37 additions & 6 deletions nlpurify/feature/selection/nltk.py
@@ -17,6 +17,8 @@

import re

from typing import Union

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

@@ -75,15 +77,15 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k
The default keyword arguments are defined for the
:func:`nltk.tokenize.word_tokenize` function.
* **preserve_line** (*bool*): A flag to decide whether to
sentence tokenize the text or not, as accepted by
the function. Defaults to False.
* **tokenize_language** (*str*): The language model name as
accepted by the Punkt corpus in NLTK. Defaults to the
"english" language, as in the underlying function.
The parameter value associated with regular expression data
control is as below:
@@ -137,6 +139,17 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k
s = "this is an example string, with p()nct & n0s. 987"
print(nlpurify.feature_selection.tokenize_text(s, vanilla = True, retalpha = False))
>> ['this', 'is', 'an', 'example', 'with', '987']
**Error Guidelines**
:raises ValueError: The error is raised when both the attributes
``vanilla`` and ``regexp`` are set to True.
**Return Type**
:rtype: list[str]
:return: Returns a tokenized list of strings. To represent and
save the same in a tabular format, use the ``" ".join()`` method.
"""

preserve_line = kwargs.get("preserve_line", False)
@@ -185,7 +198,7 @@ def tokenize_text(text : str, regexp : bool = False, vanilla : bool = False, **k
return tokens[tokenize_method]
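Pulling the docstring together, the mutually exclusive ``vanilla`` and ``regexp`` flags give two call patterns. A minimal usage sketch based only on the documented behaviour — the ``retalpha`` keyword and the expected output are taken from the example above:

```python
import nlpurify

s = "this is an example string, with p()nct & n0s. 987"

# plain NLTK word tokenization, keeping non-alphabetic tokens
tokens = nlpurify.feature_selection.tokenize_text(
    s, vanilla = True, retalpha = False
)
print(tokens)  # ['this', 'is', 'an', 'example', 'with', '987'] per the docstring

# setting both flags together is ambiguous, so a ValueError is raised
try:
    nlpurify.feature_selection.tokenize_text(s, vanilla = True, regexp = True)
except ValueError as err:
    print(err)
```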


def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
def remove_stopwords(text : str, language : str = "english", rtype : object = str, **kwargs) -> Union[str, list]:
"""
Function to Remove Stopwords from a Raw Text using NLTK
@@ -246,7 +259,7 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
the case of the words is not lower.
**Function Example**
For more control over the tokenization, all the parameters
of :func:`tokenize_text()` are accepted.
@@ -261,6 +274,18 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
# this we can further simplify by using other features
print(nlpurify.feature_selection.remove_stopwords(s, regexp = True))
>> example string p nct n0s
**Error Guidelines**
:raises ValueError: The error is raised when the return type is
not one of ``str`` or ``list``. Make sure the argument is a
type instance and is not passed as a string value.
**Return Type**
:rtype: str | list
:return: A cleaned string or a vector (*iterable*) of selected
features from a given text message.
"""

tokenize = kwargs.get("tokenize", True)
@@ -269,12 +294,18 @@ def remove_stopwords(text : str, language : str = "english", **kwargs) -> str:
stopwords_ = stopwords.words(language) # defaults to english

# ? normalize the text using nlpurify.normalizeText()
# else, left at user's discretion or additional functionalities
text = normalizeText(
text,
uniform_text_case = "lower",
strip_line_breaks = True
) if normalize else text

tokens = tokenize_text(text, **kwargs) if tokenize else text
return " ".join([word for word in tokens if word not in stopwords_])
tokens = [word for word in tokens if word not in stopwords_]

# ensure return type of the data, else raise error
if rtype not in [str, list]:
raise ValueError(f"Accepted arguments are ``list`` or ``str``, received {rtype}.")

return " ".join(tokens) if rtype == str else tokens
