Skip to content

Commit

Permalink
💣 refactor normalize text for #15, fixes #16
Browse files Browse the repository at this point in the history
  • Loading branch information
ZenithClown committed Nov 5, 2024
1 parent 02100fc commit 34ff306
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 106 deletions.
2 changes: 1 addition & 1 deletion nlpurify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@
)

from nlpurify import fuzzy
from nlpurify.normalize import normalizeText
from nlpurify.normalize import normalize
140 changes: 35 additions & 105 deletions nlpurify/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,9 @@ def strip_whitespace(text : str, **kwargs) -> str:
return text


def normalizeText(
text : str,
uniform_text_case : str = None,
replace_double_space : bool = True,
strip_line_breaks : bool = False,
replace_double_line_breaks : bool = True,
**kwargs
) -> str:
def normalize(text : str, strip : bool = True, **kwargs) -> str:
"""
Normalize a Given String with User-Defined Configurations
Normalize a Text for AI/ML Operations to Reduce Randomness
The normalization function uses the in-built string function like
:attr:`.strip()`, :attr:`.replace()` etc. to return a cleaner
Expand All @@ -75,7 +68,7 @@ def normalizeText(
* It may not start or end with a white space character,
* It may not have double space instead of single space, and
* It may not be spread across multiple lines.
* It may not be spread across multiple lines (i.e., paragraphs).
All the above properties are desired, and can improve performance
when used to train a large language model. Normalization of texts
Expand All @@ -88,110 +81,47 @@ def normalizeText(
be single line, multi-line (example from "text area") and can
have any type of escape characters.
:type uniform_text_case: str
:param uniform_text_case: Create a uniform text case, which can
be either {``lower``, ``upper``, or ``casefold``}. Defaults to
None, i.e., no change in case. NOTE: other text cases such as
"capital" or "title" case are not included and are left to the
user's discretion, as they are not used frequently.
:type replace_double_space: bool
:param replace_double_space: A common type of uncleaned text
format includes double space (white characters), which can be
directly cleaned without compromising information. Defaults
to True.
:type strip_line_breaks: bool
:param strip_line_breaks: Strip line breaks and returns a single
line statement. This uses the os default which is either
"CR LF" for windows or "LF" for *nix based systems. However,
the default value can be overridden using the keyword argument
:attr:`line_break_seperator` if not using the default value.
Defaults to False. The parameter has an overriding effect on
the :attr:`replace_double_line_breaks` and the keyword
argument :attr:`strip_whitespace_inline*` if True the text
is unaltered.
:type replace_double_line_breaks: bool
:param replace_double_line_breaks: Double line breaks are common
in texts containing paragraphs. This can be easily replaced
with a single line break character set. Defaults to True.
NOTE: The line break is dependent on the operating system:
in windows it is "\\r\\n" or "CR LF" while in *nix system it
is always "\\n" or "LF". To answer this, the program considers
the default line break based on the operating system the code
is running. To override this - use the keyword argument.
:type strip: bool
:param strip: The global attribute to clean and normalize text
of white spaces and multiple line breaks.
**Keyword Arguments**
* **strip_whitespace** (*bool*): Strip white space from the
beginning or end of the text. Defaults to True. Alternate
keyword terms are :attr:`strip_whitespace_start` and
:attr:`strip_whitespace_final` which cleans white space from
the beginning or end of string only respectively. The
attribute :attr:`strip_whitespace` has priority over its
alternates and ignores alternates if set to True.
* **strip_whitespace_inline** (*bool*): This is an extension
of the :attr:`strip_whitespace` that iterates for each line
and strips the white spaces at the beginning and end of each
line. This is useful when the text spans multiple lines.
Defaults to True. Similar to :attr:`strip_whitespace` the
alternate arguments are :attr:`strip_whitespace_inline_start`
and :attr:`strip_whitespace_inline_final` which, if True,
strip only the beginning or the ending white space from
each line.
All the arguments of :func:`nlpurify.normalize.strip_whitespace()`
are accepted. In addition, the following are specific to this
function:
* **strip_line_breaks** (*bool*): Strip line breaks and
returns a single line statement. This uses the os default
which is either "CR LF" for windows or "LF" for *nix
based systems. However, the default value can be overridden
using the keyword argument :attr:`line_break_seperator`.
Defaults to False. The parameter has an overriding effect on
the :attr:`replace_double_line_breaks`, and if True the text
is unaltered.
* **replace_double_line_breaks** (*bool*): Double line breaks
are common in texts containing paragraphs. This can be
easily replaced with a single line break character set.
Defaults to True.
* **line_break_seperator** (*str*): The end line character
which is either "\\r\\n" for windows or "\\n" for *nix
based systems. Defaults to the line separator of the running
operating system.
"""

strip_whitespace = kwargs.get("strip_whitespace", True)
strip_whitespace_inline = kwargs.get("strip_whitespace_inline", True)

# ? related alternate terms to `strip_whitespace`
strip_whitespace_start = kwargs.get("strip_whitespace_start", False)
strip_whitespace_final = kwargs.get("strip_whitespace_final", False)

# ? related alternate terms to `strip_whitespace_inline`
strip_whitespace_inline_start = kwargs.get("strip_whitespace_inline_start", False)
strip_whitespace_inline_final = kwargs.get("strip_whitespace_inline_final", False)

# ? define line break seperator, else use default os value
line_break_seperator = kwargs.get("line_break_seperator", os.linesep)

if replace_double_space:
text = _replace_double_space(text)

if strip_line_breaks:
text = text.replace(line_break_seperator, " ")

if replace_double_line_breaks:
# get the keyword argument for line break seperator,
# or else get the os default, value is doubled internally
text = text.replace(line_break_seperator * 2, line_break_seperator)

if uniform_text_case:
text = _uniform_text_case(text, case = uniform_text_case)

if any([strip_whitespace, strip_whitespace_start, strip_whitespace_final]):
# white space character from the string is to be removed
text = _strip_whitespace(
text,
strip_whitespace = strip_whitespace,
strip_whitespace_start = strip_whitespace_start,
strip_whitespace_final = strip_whitespace_final
)

if any([strip_whitespace_inline, strip_whitespace_inline_start, strip_whitespace_inline_final]):
# white space character from the string is to be removed
text = "\n".join([
_strip_whitespace(
line,
strip_whitespace = strip_whitespace_inline,
strip_whitespace_start = strip_whitespace_inline_start,
strip_whitespace_final = strip_whitespace_inline_final
)
for line in text.splitlines()
])
# normalize text of line breaks based on os/user defined
text = text.replace(line_break_seperator, " ") \
if kwargs.get("strip_line_breaks", False) else text
text = text.replace(line_break_seperator * 2, line_break_seperator) \
if kwargs.get("replace_double_line_breaks", True) else text

# ! 💣 always return the text in lowercase instead of user choice
# in addition, run the white space removal logic to normalize the text
text = strip_whitespace(text, **kwargs).lower() \
if strip else text.lower()

return text

0 comments on commit 34ff306

Please sign in to comment.