💣 refactor normalize text for #15, fixes #16

sharkutilities · Nov 5, 2024 · 34ff306 · 34ff306
1 parent 02100fc
commit 34ff306
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 106 deletions.
diff --git a/nlpurify/__init__.py b/nlpurify/__init__.py
@@ -18,4 +18,4 @@
 )
 
 from nlpurify import fuzzy
-from nlpurify.normalize import normalizeText
+from nlpurify.normalize import normalize
diff --git a/nlpurify/normalize.py b/nlpurify/normalize.py
@@ -57,16 +57,9 @@ def strip_whitespace(text : str, **kwargs) -> str:
     return text
 
 
-def normalizeText(
-        text : str,
-        uniform_text_case : str = None,
-        replace_double_space : bool = True,
-        strip_line_breaks : bool = False,
-        replace_double_line_breaks : bool = True,
-        **kwargs
-    ) -> str:
+def normalize(text : str, strip : bool = True, **kwargs) -> str:
     """
-    Normalize a Given String with User-Defined Configurations
+    Normalize a Text for AI/ML Operations to Reduce Randomness
 
     The normalization function uses the in-built string function like
     :attr:`.strip()`, :attr:`.replace()` etc. to return a cleaner
@@ -75,7 +68,7 @@ def normalizeText(
 
         * It may not start or end with a white space character,
         * It may not have double space instead of single space, and
-        * It may not be spread across multiple lines.
+        * It may not be spread across multiple lines (i.e., paragraphs).
 
     All the above properties are desired, and can improve performance
     when used to train a large language model. Normalizaton of texts
@@ -88,110 +81,47 @@ def normalizeText(
         be single line, multi-line (example from "text area") and can
         have any type of escape characters.
 
-    :type  uniform_text_case: str
-    :param uniform_text_case: Create an uniform text case, which can
-        be either {``lower``, ``upper``, or ``casefold``}. Defaults to
-        None, i.e., no change in case. NOTE: other text case includes
-        "capital", or "title" case but that is not included and is
-        left for user's discretion as not used frequently.
-
-    :type  replace_double_space: bool
-    :param replace_double_space: A common type of uncleaned text
-        format includes double space (white characters), which can be
-        directly cleaned without compromising informations. Defaults
-        to True.
-
-    :type  strip_line_breaks: bool
-    :param strip_line_breaks: Strip line breaks and returns a single
-        line statement. This uses the os default which is either
-        "CR LF" for windows or "LF" for *nix based systems. However,
-        the default value can be override using keyword argument
-        :attr:`line_break_seperator` if not using default value.
-        Defaults to False. The parameter has an overriding effect on
-        the :attr:`replace_double_line_breaks` and the keyword
-        argument :attr:`strip_whitespace_inline*` if True the text
-        is unaltered.
-
-    :type  replace_double_line_breaks: bool
-    :param replace_double_line_breaks: Double line breaks are common
-        in texts containing paragraphs. This can be easily replaced
-        with a single line break character set. Defaults to True.
-        NOTE: The line break is dependent on the operating system:
-        in windows it is "\\r\\n" or "CR LF" while in *nix system it
-        is always "\\n" or "LF". To answer this, the program considers
-        the default line break based on the operating system the code
-        is running. To override this - use the keyword argument.
+    :type  strip: bool
+    :param strip: Apply :func:`strip_whitespace` to the text (default
+        True). Line breaks are handled by the keyword arguments only.
 
     **Keyword Arguments**
-        * **strip_whitespace** (*bool*): Strip white space from the
-          beginning or end of the text. Defaults to True. Alternate
-          keyword terms are :attr:`strip_whitespace_start` and
-          :attr:`strip_whitespace_final` which cleans white space from
-          the beginning or end of string only respectively. The
-          attribute :attr:`strip_whitespace` has priority over its
-          alternates and ignores alternates if set to True.
-        * **strip_whitespace_inline** (*bool*): This is an extension
-          of the :attr:`strip_whitespace` that iterates for each line
-          and strips the white spaces at the beginning and end of each
-          line. This is useful when the text spans multiple lines.
-          Defaults to True. Similar to :attr:`strip_whitespace` the
-          alternate arguments are :attr:`strip_whitespace_inline_start`
-          and :attr:`strip_whitespace_inline_start` which if True
-          strips only the beginning or the ending white space from
-          each line.
+
+    All the arguments of :func:`nlpurify.normalize.strip_whitespace()`
+    are accepted. In addition, the following are specific to this
+    function:
+
+        * **strip_line_breaks** (*bool*): Strip line breaks and
+            return a single line statement. This uses the os default
+            which is either "CR LF" for windows or "LF" for *nix
+            based systems. However, the default value can be overridden
+            using keyword argument :attr:`line_break_seperator`.
+            Defaults to False. The parameter has an overriding effect on
+            the :attr:`replace_double_line_breaks`: if True, every line
+            break is already replaced, so that option has no effect.
+
+        * **replace_double_line_breaks** (*bool*): Double line breaks
+            are common in texts containing paragraphs. This can be
+            easily replaced with a single line break character set.
+            Defaults to True.
+
         * **line_break_seperator** (*str*): The end line character
           which is either "\\r\\n" for windows or "\\n" for *nix
           based systems. By default defaults to running operating
           systems default.
     """
 
-    strip_whitespace = kwargs.get("strip_whitespace", True)
-    strip_whitespace_inline = kwargs.get("strip_whitespace_inline", True)
-
-    # ? related alternate terms to `strip_whitespace`
-    strip_whitespace_start = kwargs.get("strip_whitespace_start", False)
-    strip_whitespace_final = kwargs.get("strip_whitespace_final", False)
-
-    # ? related alternate terms to `strip_whitespace_inline`
-    strip_whitespace_inline_start = kwargs.get("strip_whitespace_inline_start", False)
-    strip_whitespace_inline_final = kwargs.get("strip_whitespace_inline_final", False)
-
-    # ? define line break seperator, else use default os value
     line_break_seperator = kwargs.get("line_break_seperator", os.linesep)
 
-    if replace_double_space:
-        text = _replace_double_space(text)
-
-    if strip_line_breaks:
-        text = text.replace(line_break_seperator, " ")
-
-    if replace_double_line_breaks:
-        # get the keyword argument for line break seperator,
-        # or else get the os default, value is doubled internally
-        text = text.replace(line_break_seperator * 2, line_break_seperator)
-
-    if uniform_text_case:
-        text = _uniform_text_case(text, case = uniform_text_case)
-
-    if any([strip_whitespace, strip_whitespace_start, strip_whitespace_final]):
-        # white space character from the string is to be removed
-        text = _strip_whitespace(
-            text,
-            strip_whitespace = strip_whitespace,
-            strip_whitespace_start = strip_whitespace_start,
-            strip_whitespace_final = strip_whitespace_final
-        )
-
-    if any([strip_whitespace_inline, strip_whitespace_inline_start, strip_whitespace_inline_final]):
-        # white space character from the string is to be removed
-        text = "\n".join([
-            _strip_whitespace(
-                line,
-                strip_whitespace = strip_whitespace_inline,
-                strip_whitespace_start = strip_whitespace_inline_start,
-                strip_whitespace_final = strip_whitespace_inline_final
-            )
-            for line in text.splitlines()
-        ])
+    # normalize text of line breaks based on os/user defined
+    text = text.replace(line_break_seperator, " ") \
+        if kwargs.get("strip_line_breaks", False) else text
+    text = text.replace(line_break_seperator * 2, line_break_seperator) \
+        if kwargs.get("replace_double_line_breaks", True) else text
+
+    # ! 💣 always return the text in lowercase instead of user choice
+    # in addition, run the white space removal logic to normalize the text
+    text = strip_whitespace(text, **kwargs).lower() \
+        if strip else text.lower()
 
     return text