Skip to content

Commit

Permalink
Merge pull request #3 from sharkutilities/merge/existing-gist
Browse files Browse the repository at this point in the history
Merging Existing GitHub Gist into NLPurify
  • Loading branch information
ZenithClown authored Aug 17, 2024
2 parents a737bf4 + a6d8029 commit 29b636d
Show file tree
Hide file tree
Showing 5 changed files with 198 additions and 0 deletions.
15 changes: 15 additions & 0 deletions nlpurify/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# -*- encoding: utf-8 -*-

"""
Text Cleaning and Feature Extraction Engine
The module provides text cleaning and feature extractions (like
mobile number, url, etc.) from a Python string and also provides a
one-stop solution library oriented towards text cleaning.
"""

# ? package follows https://peps.python.org/pep-0440/
# ? https://python-semver.readthedocs.io/en/latest/advanced/convert-pypi-to-semver.html
# single source of truth for the package version; read by setup.py
__version__ = "0.0.1.dev0"

# init-time options registrations
14 changes: 14 additions & 0 deletions nlpurify/legacy/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- encoding: utf-8 -*-

"""
Legacy Version of the NLP Utility Module
The NLP utility module is refactored and moved to a new version with
an advanced collection of features. However, the existing code from the
`gist <https://gist.github.com/ZenithClown>`_ is maintained under the
legacy submodule until dependent code is gradually migrated.
More Information: `Issue #5 <https://github.com/sharkutilities/NLPurify/issues/5>`_
"""

# re-export the legacy helpers at package level for backward compatibility
from nlpurify.legacy.nlp_utils import * # noqa: F401, F403 # pyright: ignore[reportMissingImports]
113 changes: 113 additions & 0 deletions nlpurify/legacy/nlp_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# -*- encoding: utf-8 -*-

"""
A set of utility functions related to natural language
processing. In addition to the basic libraries, the module
requires the following corpus from the `nltk` library:
* `stopwords` : used to remove stop words from a given
                string. Currently used by the
                pre-processing function.
In addition, install the extra third-party libraries `fuzzywuzzy`
and `python-Levenshtein` using the following:
```python
pip install fuzzywuzzy
pip install python-Levenshtein
```
"""

import re

from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


def processor(string : str, text_process : bool = False, **kwargs) -> str:
    """
    A Simple Utility Function to Pre-Process a String

    The function inputs a string and returns a clean, formatted string
    which is free of (english) stop words and whose words are
    lemmatized, i.e. transformed to their base (verb) form.

    :type string: str
    :param string: Base string on which various `nltk` functions are
        applied to clean unwanted information.

    :type text_process: bool
    :param text_process: Should the cleaned string additionally be
        formatted using :func:`text_processor`? Any extra keyword
        arguments are forwarded to it. Defaults to False.

    :rtype: str
    :return: The cleaned, lemmatized string.
    """

    # hoist invariants out of the comprehensions: the stop-word corpus
    # is fetched once (as a set, for O(1) membership) and a single
    # lemmatizer instance is reused for every token
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(string.lower())
    filtered = [word for word in tokens if word not in stop_words]
    lemmatized = [lemmatizer.lemmatize(word, "v") for word in filtered]

    cleaned = " ".join(lemmatized)
    return text_processor(cleaned, **kwargs) if text_process else cleaned


def fuzzyMatch(string : str, reference : str, method : str = "partial_ratio") -> int:
    """
    Calculate a Percentage Similarity between `string` and `reference` Text

    Using the `fuzzywuzzy.fuzz` module, the function calculates the percentage
    of similarity between two text values. The scoring method can be selected
    via the `method` parameter. `partial_ratio` is great when we want to match
    a text with partial data: for example, finding all the strings which
    contain the word 'anonymous' even when the spelling or position differs.

    :type string: str
    :param string: The candidate string to score.

    :type reference: str
    :param reference: The reference string to score against.

    :type method: str
    :param method: Scoring method; one of "ratio", "partial_ratio" or
        "token_sort_ratio". Defaults to "partial_ratio".

    :rtype: int
    :return: Similarity score in the range [0, 100].

    :raises ValueError: If `method` is not one of the supported names.
    """

    methods = {
        "ratio" : fuzz.ratio,
        "partial_ratio" : fuzz.partial_ratio,
        "token_sort_ratio" : fuzz.token_sort_ratio
    }

    # fail loudly on an unknown method name instead of the opaque
    # "'NoneType' object is not callable" that dict.get(None) would cause
    try:
        scorer = methods[method]
    except KeyError:
        raise ValueError(
            f"Invalid method {method!r}; choose one of {sorted(methods)}."
        ) from None

    return scorer(reference, string)


def text_processor(string : str, **kwargs) -> str:
    """
    Uses String Methods to Clean a String

    An extension of the `processor` function, which uses the built-in
    Python string methods to clean string contents. The function can
    be called separately, or via `text_process = True` in `processor`.
    More information on built-in string methods is available here:
    https://www.programiz.com/python-programming/methods/string.

    # ! Function is not yet optimized when used in conjunction.

    :type string: str
    :param string: Base string which needs formatting. The string
        is converted into lower case. If passed from
        `processor` this step is repeated.
        TODO fix when passed through parent function.

    :rtype: str
    :return: The cleaned, lower-cased, single-spaced string.

    Keyword Arguments
    -----------------
        * *isalnum* (bool): Only keep `alpha-numeric` characters in the
            string. Defaults to False.
        * *isalpha* (bool): Only keep `alphabetic` characters in the
            string. Defaults to False. Ignored when `isalnum` is True.
    """

    isalnum = kwargs.get("isalnum", False)
    isalpha = kwargs.get("isalpha", False)

    # keep only letters, digits, spaces, newlines and periods
    cleaned = re.sub(r"[^a-zA-Z0-9 \n.]", "", string)
    words = cleaned.lower().split()

    if isalnum:
        words = [word for word in words if word.isalnum()]
    elif isalpha:
        words = [word for word in words if word.isalpha()]

    # split() already collapsed runs of whitespace, so a single-space
    # join needs no further de-duplication (the original's
    # `.replace(" ", " ")` replaced a space with a space — a no-op)
    return " ".join(words)
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
fuzzywuzzy
python-Levenshtein
54 changes: 54 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env python
#
# Copyright (C) 2024 Debmalya Pramanik <[email protected]>
# LICENSE: MIT License

from setuptools import setup
from setuptools import find_packages

import nlpurify as nlpu

# read the long description up-front with an explicit encoding and a
# context manager, so the file handle is always closed (the previous
# inline `open(...).read()` leaked the handle)
with open("README.md", "r", encoding = "utf-8") as readme:
    long_description = readme.read()

setup(
        name = "NLPurify",
        version = nlpu.__version__, # single-sourced from nlpurify/__init__.py
        author = "shark-utilities developers",
        author_email = "[email protected]",
        description = "Text cleaning and feature extractions using NLP, Traditional approach.",
        long_description = long_description,
        long_description_content_type = "text/markdown",
        url = "https://github.com/sharkutilities/NLPurify",
        packages = find_packages(),
        classifiers = [
            "Development Status :: 1 - Planning",
            "Intended Audience :: Developers",
            "Intended Audience :: Education",
            "Intended Audience :: End Users/Desktop",
            "Intended Audience :: Information Technology",
            "Intended Audience :: Science/Research",
            "Operating System :: Unix",
            "Operating System :: POSIX",
            "Operating System :: Microsoft :: Windows",
            "Programming Language :: Python",
            "Programming Language :: Python :: 3",
            "Programming Language :: Python :: 3 :: Only",
            "Programming Language :: Python :: 3.8",
            "Programming Language :: Python :: 3.9",
            "Programming Language :: Python :: 3.10",
            "Programming Language :: Python :: 3.11",
            "Programming Language :: Python :: 3.12",
            "License :: OSI Approved :: MIT License"
        ],
        project_urls = {
            "Issue Tracker" : "https://github.com/sharkutilities/NLPurify/issues",
            # "Code Documentations" : "https://.readthedocs.io/en/latest/index.html",
            "Org. Homepage" : "https://github.com/sharkutilities"
        },
        keywords = [
            # keywords for finding the package::
            "nlp", "text-cleaning", "nlp-cleaning", "llm",
            "utility", "utilities", "util", "utils", "functions",
            # keywords for finding the package relevant to usecases::
            "wrappers", "data science", "data analysis", "data scientist", "data analyst"
        ],
        python_requires = ">=3.8"
    )

0 comments on commit 29b636d

Please sign in to comment.