diff --git a/nlpurify/__init__.py b/nlpurify/__init__.py new file mode 100644 index 0000000..350214e --- /dev/null +++ b/nlpurify/__init__.py @@ -0,0 +1,15 @@ +# -*- encoding: utf-8 -*- + +""" +Text Cleaning and Feature Extraction Engine + +The module provides text cleaning and feature extractions (like +mobile number, url, etc.) from a Python string and also provides an +one-stop solution library oriented towards text cleaning. +""" + +# ? package follows https://peps.python.org/pep-0440/ +# ? https://python-semver.readthedocs.io/en/latest/advanced/convert-pypi-to-semver.html +__version__ = "0.0.1.dev0" + +# init-time options registrations diff --git a/nlpurify/legacy/__init__.py b/nlpurify/legacy/__init__.py new file mode 100644 index 0000000..36334b7 --- /dev/null +++ b/nlpurify/legacy/__init__.py @@ -0,0 +1,14 @@ +# -*- encoding: utf-8 -*- + +""" +Legacy Version of NLP Utility Module + +The NLP utility module is refactored and moved to a new version with +advanced collection of features. However the existing codes from the +`gist `_ is maintained under the +legacy submodule unless dependent codes are gradually migrated. + +More Information: `Issue #5 `_ +""" + +from nlpurify.legacy.nlp_utils import * # noqa: F401, F403 # pyright: ignore[reportMissingImports] diff --git a/nlpurify/legacy/nlp_utils.py b/nlpurify/legacy/nlp_utils.py new file mode 100644 index 0000000..c4d357e --- /dev/null +++ b/nlpurify/legacy/nlp_utils.py @@ -0,0 +1,113 @@ +# -*- encoding: utf-8 -*- + +""" +A set of utility function related to natural language +processing. In addition to the basic libraries, the module +requires the following corpus from `nltk` library: + * `stopwords` : used to remove stop words from a given + strings. Currently using the function for + pre-processing. 
import re

from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


def processor(string : str, text_process : bool = False, **kwargs) -> str:
    """
    A Simple Utility Function to Pre-Process a String

    The function inputs a string, and exports clean formatted string
    which is free of stop words (english) and the words are
    lemmatized, i.e. transformed to their base form.

    :type string: str
    :param string: Base string on which various `nltk` functions are
        applied to clean unwanted informations.

    :type text_process: bool
    :param text_process: Should the base string be formatted using
        `text_processor()`. Defaults to False. Any keyword arguments
        are forwarded to `text_processor()`.
    """

    # hoist invariants out of the per-token loops: `stopwords.words()`
    # re-reads the corpus on every call and list membership is O(m),
    # so build the set once; likewise one lemmatizer serves all tokens
    stopword_set = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(string.lower())
    filtered = [word for word in tokens if word not in stopword_set]

    # "v" lemmatizes treating each token as a verb (base form)
    lemmatized = [lemmatizer.lemmatize(word, "v") for word in filtered]

    cleaned = " ".join(lemmatized)
    return text_processor(cleaned, **kwargs) if text_process else cleaned
def fuzzyMatch(string : str, reference : str, method : str = "partial_ratio") -> int:
    """
    Calculate a Percentage Similarity between `string` and `reference` Text

    Using the `fuzzywuzzy.fuzz` module, the function calculates the
    percentage of similarity between two text data. There are various
    methods available which can be declared via `method` parameter.
    However, `partial_ratio` is great when we want to match a text
    with partial data.

    :type string: str
    :param string: The string to score against the reference text.

    :type reference: str
    :param reference: The reference text to compare with.

    :type method: str
    :param method: Scoring method name; one of `ratio`,
        `partial_ratio` or `token_sort_ratio`. Defaults to
        `partial_ratio`.

    :raises ValueError: If `method` is not a recognized method name.
        (Previously an unknown name silently yielded `None` and
        crashed with an opaque `TypeError` on the call below.)
    """

    methods = {
        "ratio" : fuzz.ratio,
        "partial_ratio" : fuzz.partial_ratio,
        "token_sort_ratio" : fuzz.token_sort_ratio
    }

    try:
        scorer = methods[method]
    except KeyError:
        raise ValueError(
            f"Invalid method `{method}`; choose from {sorted(methods)}."
        ) from None

    return scorer(reference, string)
def text_processor(string : str, **kwargs) -> str:
    """
    Uses String Methods to Clean a String

    An extension of the `processor` function, which uses the in-built
    python string methods to clear string contents. The function can
    be called separately, or pass `text_process = True` in `processor`.

    :type string: str
    :param string: Base string which needs formatting. The string
        is converted into lower case. If passed from `processor`
        this step is repeated.
        TODO fix when passed through parent function.

    Keyword Arguments
    -----------------
    * *isalnum* (bool): Only keep `alpha-numeric` characters in the
        string. Defaults to False.
    * *isalpha* (bool): Only keep `alphabet` characters in the
        string. Defaults to False. Ignored when *isalnum* is set.
    """

    isalnum = kwargs.get("isalnum", False)
    isalpha = kwargs.get("isalpha", False)

    # strip every character except letters, digits, spaces,
    # newlines and the literal dot
    string = re.sub("[^a-zA-Z0-9 \n\.]", "", string)
    words = string.lower().split()

    if isalnum:
        words = [w for w in words if w.isalnum()]
    elif isalpha:
        words = [w for w in words if w.isalpha()]

    # NOTE: the original trailing `.replace(" ", " ").strip()` was a
    # no-op — `" ".join` over whitespace-split tokens can never yield
    # leading, trailing or doubled spaces — so it is removed
    return " ".join(words)
:: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: MIT License" + ], + project_urls = { + "Issue Tracker" : "https://github.com/sharkutilities/NLPurify/issues", + # "Code Documentations" : "https://.readthedocs.io/en/latest/index.html", + "Org. Homepage" : "https://github.com/sharkutilities" + }, + keywords = [ + # keywords for finding the package:: + "nlp", "text-cleaning", "nlp-cleaning", "llm", + "utility", "utilities", "util", "utils", "functions", + # keywords for finding the package relevant to usecases:: + "wrappers", "data science", "data analysis", "data scientist", "data analyst" + ], + python_requires = ">=3.8" +)