-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from sharkutilities/merge/existing-gist
Merging Existing GitHub Gist into NLPurify
- Loading branch information
Showing
5 changed files
with
198 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# -*- encoding: utf-8 -*-

"""
Text Cleaning and Feature Extraction Engine

The module provides text cleaning and feature extractions (like
mobile number, url, etc.) from a Python string and also provides an
one-stop solution library oriented towards text cleaning.
"""

# ? package follows https://peps.python.org/pep-0440/
# ? https://python-semver.readthedocs.io/en/latest/advanced/convert-pypi-to-semver.html
# NOTE: read by setup.py at build time (``nlpu.__version__``)
__version__ = "0.0.1.dev0"

# init-time options registrations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# -*- encoding: utf-8 -*-

"""
Legacy Version of NLP Utility Module

The NLP utility module is refactored and moved to a new version with
advanced collection of features. However the existing codes from the
`gist <https://gist.github.com/ZenithClown>`_ is maintained under the
legacy submodule unless dependent codes are gradually migrated.

More Information: `Issue #5 <https://github.com/sharkutilities/NLPurify/issues/5>`_
"""

# re-export everything from the legacy gist module at package level
from nlpurify.legacy.nlp_utils import *  # noqa: F401, F403 # pyright: ignore[reportMissingImports]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
# -*- encoding: utf-8 -*- | ||
|
||
""" | ||
A set of utility function related to natural language | ||
processing. In addition to the basic libraries, the module | ||
requires the following corpus from `nltk` library: | ||
* `stopwords` : used to remove stop words from a given | ||
strings. Currently using the function for | ||
pre-processing. | ||
In addition, need some additional libraries like `fuzzywuzzy` | ||
and `python-Levenshtein` using the following: | ||
```python | ||
pip install fuzzywuzzy | ||
pip install python-Levenshtein | ||
``` | ||
""" | ||
|
||
import re | ||
|
||
from fuzzywuzzy import fuzz | ||
from nltk.corpus import stopwords | ||
from nltk.tokenize import word_tokenize | ||
from nltk.stem.wordnet import WordNetLemmatizer | ||
|
||
|
||
def processor(string : str, text_process : bool = False, **kwargs) -> str:
    """
    A Simple Utility Function to Pre-Process a String

    The function inputs a string and returns a clean formatted string
    which is free of stop words (english) and whose words are
    lemmatized, i.e. transformed to their base (verb) form.

    :type  string: str
    :param string: Base string on which various `nltk` functions are
        applied to clean unwanted informations.

    :type  text_process: bool
    :param text_process: Should the base string be further formatted
        using `text_processor()`. Defaults to False. Any additional
        keyword arguments are forwarded to `text_processor()`.
    """

    # build the stop-word set once: ``stopwords.words()`` returns a
    # list, so re-reading and scanning it per token would be O(n * m)
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()  # one instance, reused for every token

    tokens = word_tokenize(string.lower())
    filtered = (word for word in tokens if word not in stop_words)
    lemmatized = [lemmatizer.lemmatize(word, "v") for word in filtered]

    cleaned = " ".join(lemmatized)  # join once, reuse for both branches
    return text_processor(cleaned, **kwargs) if text_process else cleaned
|
||
|
||
def fuzzyMatch(string : str, reference : str, method : str = "partial_ratio") -> int:
    """
    Calculate a Percentage Similarity between `string` and `reference` Text

    Using the `fuzzywuzzy.fuzz()` method, the function calculates the percentage of
    similarity between two text data. There are various methods available which can
    be declared via `method` parameter. However, `partial_ratio` is great when
    we want to match a text with partial data. For example, we want to find all the
    strings which have the word 'anonymous' but the spelling, position may be
    different in each case.

    :raises ValueError: if `method` is not one of "ratio",
        "partial_ratio" or "token_sort_ratio".
    """

    scorers = {
        "ratio" : fuzz.ratio,
        "partial_ratio" : fuzz.partial_ratio,
        "token_sort_ratio" : fuzz.token_sort_ratio
    }

    # fail loudly on an unknown method name instead of letting
    # ``dict.get()`` return None and raise an opaque TypeError on call
    try:
        scorer = scorers[method]
    except KeyError:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {sorted(scorers)}."
        ) from None

    return scorer(reference, string)
|
||
|
||
def text_processor(string : str, **kwargs) -> str:
    """
    Uses String Methods to Clean a String

    An extension of the `processor` function, which uses the in-built
    python string methods to clear string contents. The function can
    be called separately, or pass `text_process = True` in `processor`.
    More information on in-built string methods is available here:
    https://www.programiz.com/python-programming/methods/string.

    # ! Function is not yet optimized when used in conjunction.

    :type  string: str
    :param string: Base string which needs formatting. The string
        is converted into lower case. If passed from
        ! `processor` this step is repeated.
        TODO fix when passed through parent function.

    Keyword Arguments
    -----------------
        * *isalnum* (bool): Only keep `alpha-numeric` characters in the
          string. Defaults to False. Takes precedence over `isalpha`.
        * *isalpha* (bool): Only keep `alphabet` characters in the
          string. Defaults to False.
    """

    isalnum = kwargs.get("isalnum", False)
    isalpha = kwargs.get("isalpha", False)

    # keep only alphanumerics, spaces, newlines and dots; a raw string
    # avoids the invalid escape sequence "\." of the non-raw literal
    cleaned = re.sub(r"[^a-zA-Z0-9 \n.]", "", string)

    # str.split() with no argument also collapses runs of whitespace,
    # so the re-joined result never contains consecutive spaces
    words = cleaned.lower().split()

    if isalnum:
        words = [word for word in words if word.isalnum()]
    elif isalpha:
        words = [word for word in words if word.isalpha()]

    # NOTE: the original trailing ``.replace(" ", " ").strip()`` was a
    # no-op (single space for single space; join adds no edge spaces)
    return " ".join(words)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
fuzzywuzzy | ||
python-Levenshtein |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/usr/bin/env python
#
# Copyright (C) 2024 Debmalya Pramanik <[email protected]>
# LICENSE: MIT License

from setuptools import setup
from setuptools import find_packages

# imported only to read the single-sourced version string below
import nlpurify as nlpu

setup(
        name = "NLPurify",
        version = nlpu.__version__,  # single-sourced from nlpurify/__init__.py
        author = "shark-utilities developers",
        author_email = "[email protected]",
        description = "Text cleaning and feature extractions using NLP, Traditional approach.",
        long_description = open("README.md", "r").read(),
        long_description_content_type = "text/markdown",
        url = "https://github.com/sharkutilities/NLPurify",
        packages = find_packages(),
        classifiers = [
            "Development Status :: 1 - Planning",
            "Intended Audience :: Developers",
            "Intended Audience :: Education",
            "Intended Audience :: End Users/Desktop",
            "Intended Audience :: Information Technology",
            "Intended Audience :: Science/Research",
            "Operating System :: Unix",
            "Operating System :: POSIX",
            "Operating System :: Microsoft :: Windows",
            "Programming Language :: Python",
            "Programming Language :: Python :: 3",
            "Programming Language :: Python :: 3 :: Only",
            "Programming Language :: Python :: 3.8",
            "Programming Language :: Python :: 3.9",
            "Programming Language :: Python :: 3.10",
            "Programming Language :: Python :: 3.11",
            "Programming Language :: Python :: 3.12",
            "License :: OSI Approved :: MIT License"
        ],
        project_urls = {
            "Issue Tracker" : "https://github.com/sharkutilities/NLPurify/issues",
            # "Code Documentations" : "https://.readthedocs.io/en/latest/index.html",
            "Org. Homepage" : "https://github.com/sharkutilities"
        },
        keywords = [
            # keywords for finding the package::
            "nlp", "text-cleaning", "nlp-cleaning", "llm",
            "utility", "utilities", "util", "utils", "functions",
            # keywords for finding the package relevant to usecases::
            "wrappers", "data science", "data analysis", "data scientist", "data analyst"
        ],
        python_requires = ">=3.8"  # matches the oldest classifier above
    )