-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from sharkutilities/merge/existing-gist
Merging Existing GitHub Gist into NLPurify
- Loading branch information
Showing
5 changed files
with
198 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# -*- encoding: utf-8 -*-

"""
Text Cleaning and Feature Extraction Engine

The module provides text cleaning and feature extractions (like
mobile number, url, etc.) from a Python string and also provides an
one-stop solution library oriented towards text cleaning.
"""

# ? package follows https://peps.python.org/pep-0440/
# ? https://python-semver.readthedocs.io/en/latest/advanced/convert-pypi-to-semver.html
# NOTE: read by setup.py at build time (``nlpu.__version__``)
__version__ = "0.0.1.dev0"

# init-time options registrations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# -*- encoding: utf-8 -*-

"""
Legacy Version of NLP Utility Module

The NLP utility module is refactored and moved to a new version with
advanced collection of features. However the existing codes from the
`gist <https://gist.github.com/ZenithClown>`_ is maintained under the
legacy submodule unless dependent codes are gradually migrated.

More Information: `Issue #5 <https://github.com/sharkutilities/NLPurify/issues/5>`_
"""

# re-export everything from the legacy gist module at package level
from nlpurify.legacy.nlp_utils import *  # noqa: F401, F403 # pyright: ignore[reportMissingImports]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
# -*- encoding: utf-8 -*- | ||
|
||
""" | ||
A set of utility function related to natural language | ||
processing. In addition to the basic libraries, the module | ||
requires the following corpus from `nltk` library: | ||
* `stopwords` : used to remove stop words from a given | ||
strings. Currently using the function for | ||
pre-processing. | ||
In addition, need some additional libraries like `fuzzywuzzy` | ||
and `python-Levenshtein` using the following: | ||
```python | ||
pip install fuzzywuzzy | ||
pip install python-Levenshtein | ||
``` | ||
""" | ||
|
||
import re | ||
|
||
from fuzzywuzzy import fuzz | ||
from nltk.corpus import stopwords | ||
from nltk.tokenize import word_tokenize | ||
from nltk.stem.wordnet import WordNetLemmatizer | ||
|
||
|
||
def processor(string : str, text_process : bool = False, **kwargs) -> str:
    """
    A Simple Utility Function to Pre-Process a String

    The function inputs a string and returns a clean formatted string
    which is free of stop words (english) and whose words are
    lemmatized, i.e. transformed to their base (verb) form.

    :type  string: str
    :param string: Base string on which various `nltk` functions are
        applied to clean unwanted informations.

    :type  text_process: bool
    :param text_process: Should the base string be further formatted
        using `text_processor()`. Defaults to False. Any additional
        keyword arguments are forwarded to `text_processor()`.
    """

    # build the stop-word set once: ``stopwords.words()`` returns a
    # list, so re-reading and scanning it per token would be O(n * m)
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()  # one instance, reused for every token

    tokens = word_tokenize(string.lower())
    filtered = (word for word in tokens if word not in stop_words)
    lemmatized = [lemmatizer.lemmatize(word, "v") for word in filtered]

    cleaned = " ".join(lemmatized)  # join once, reuse for both branches
    return text_processor(cleaned, **kwargs) if text_process else cleaned
|
||
|
||
def fuzzyMatch(string : str, reference : str, method : str = "partial_ratio") -> int:
    """
    Calculate a Percentage Similarity between `string` and `reference` Text

    Using the `fuzzywuzzy.fuzz()` method, the function calculates the percentage of
    similarity between two text data. There are various methods available which can
    be declared via `method` parameter. However, `partial_ratio` is great when
    we want to match a text with partial data. For example, we want to find all the
    strings which have the word 'anonymous' but the spelling, position may be
    different in each case.

    :raises ValueError: if `method` is not one of "ratio",
        "partial_ratio" or "token_sort_ratio".
    """

    scorers = {
        "ratio" : fuzz.ratio,
        "partial_ratio" : fuzz.partial_ratio,
        "token_sort_ratio" : fuzz.token_sort_ratio
    }

    # fail loudly on an unknown method name instead of letting
    # ``dict.get()`` return None and raise an opaque TypeError on call
    try:
        scorer = scorers[method]
    except KeyError:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {sorted(scorers)}."
        ) from None

    return scorer(reference, string)
|
||
|
||
def text_processor(string : str, **kwargs) -> str:
    """
    Uses String Methods to Clean a String

    An extension of the `processor` function, which uses the in-built
    python string methods to clear string contents. The function can
    be called separately, or pass `text_process = True` in `processor`.
    More information on in-built string methods is available here:
    https://www.programiz.com/python-programming/methods/string.

    # ! Function is not yet optimized when used in conjunction.

    :type  string: str
    :param string: Base string which needs formatting. The string
        is converted into lower case. If passed from
        ! `processor` this step is repeated.
        TODO fix when passed through parent function.

    Keyword Arguments
    -----------------
        * *isalnum* (bool): Only keep `alpha-numeric` characters in the
          string. Defaults to False. Takes precedence over `isalpha`.
        * *isalpha* (bool): Only keep `alphabet` characters in the
          string. Defaults to False.
    """

    isalnum = kwargs.get("isalnum", False)
    isalpha = kwargs.get("isalpha", False)

    # keep only alphanumerics, spaces, newlines and dots; a raw string
    # avoids the invalid escape sequence "\." of the non-raw literal
    cleaned = re.sub(r"[^a-zA-Z0-9 \n.]", "", string)

    # str.split() with no argument also collapses runs of whitespace,
    # so the re-joined result never contains consecutive spaces
    words = cleaned.lower().split()

    if isalnum:
        words = [word for word in words if word.isalnum()]
    elif isalpha:
        words = [word for word in words if word.isalpha()]

    # NOTE: the original trailing ``.replace(" ", " ").strip()`` was a
    # no-op (single space for single space; join adds no edge spaces)
    return " ".join(words)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
fuzzywuzzy | ||
python-Levenshtein |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/usr/bin/env python
#
# Copyright (C) 2024 Debmalya Pramanik <[email protected]>
# LICENSE: MIT License

from setuptools import setup
from setuptools import find_packages

# imported only to read the single-sourced version string below
import nlpurify as nlpu

setup(
        name = "NLPurify",
        version = nlpu.__version__,  # single-sourced from nlpurify/__init__.py
        author = "shark-utilities developers",
        author_email = "[email protected]",
        description = "Text cleaning and feature extractions using NLP, Traditional approach.",
        long_description = open("README.md", "r").read(),
        long_description_content_type = "text/markdown",
        url = "https://github.com/sharkutilities/NLPurify",
        packages = find_packages(),
        classifiers = [
            "Development Status :: 1 - Planning",
            "Intended Audience :: Developers",
            "Intended Audience :: Education",
            "Intended Audience :: End Users/Desktop",
            "Intended Audience :: Information Technology",
            "Intended Audience :: Science/Research",
            "Operating System :: Unix",
            "Operating System :: POSIX",
            "Operating System :: Microsoft :: Windows",
            "Programming Language :: Python",
            "Programming Language :: Python :: 3",
            "Programming Language :: Python :: 3 :: Only",
            "Programming Language :: Python :: 3.8",
            "Programming Language :: Python :: 3.9",
            "Programming Language :: Python :: 3.10",
            "Programming Language :: Python :: 3.11",
            "Programming Language :: Python :: 3.12",
            "License :: OSI Approved :: MIT License"
        ],
        project_urls = {
            "Issue Tracker" : "https://github.com/sharkutilities/NLPurify/issues",
            # "Code Documentations" : "https://.readthedocs.io/en/latest/index.html",
            "Org. Homepage" : "https://github.com/sharkutilities"
        },
        keywords = [
            # keywords for finding the package::
            "nlp", "text-cleaning", "nlp-cleaning", "llm",
            "utility", "utilities", "util", "utils", "functions",
            # keywords for finding the package relevant to usecases::
            "wrappers", "data science", "data analysis", "data scientist", "data analyst"
        ],
        python_requires = ">=3.8"  # matches the oldest classifier above
    )