# NLP basics: reading and cleaning data
import pandas as pd
import string
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
# Alternative: reading the raw file with open()
# rawData = open('SMSSpamCollection').read()
# # print(rawData[0:500])
# parseData = rawData.replace('\t', '\n').split('\n')
# labels = parseData[0::2]  # even indices hold the labels
# texts = parseData[1::2]   # odd indices hold the messages
# # convert these data into a DataFrame; labels[:-1] drops the empty
# # entry produced by the file's trailing newline
# data_df = pd.DataFrame({
#     'labels': labels[:-1],
#     'texts': texts,
# })
# Regular expressions
# [0-9]  => matches a single digit: 0, 1, 2, ...
# [0-9]+ => matches one or more consecutive digits: 0, 12, 123, ...
# \s     => matches a single whitespace character
# \s+    => matches one or more whitespace characters
# \w+    => matches one or more word characters; \W+ matches one or more non-word characters
# can use re.split('\W+', text) to split on non-word runs, or re.findall('\w+', text) to extract the words
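# Quick illustrative check of the patterns above (sample string is made up):
# re.split('\W+', 'Hello, world! 123')   # -> ['Hello', 'world', '123']
# re.findall('\w+', 'Hello, world! 123') # -> ['Hello', 'world', '123']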
#===========================================================================================
data = pd.read_csv('SMSSpamCollection', sep='\t', header=None)
data.columns = ['labels', 'texts']
# Explore the dataset
print('Out of {} rows, {} are spam, {} are ham'.format(len(data), len(data[data['labels']=='spam']), len(data[data['labels']=='ham'])))
# Check the Number of missing data
print('Number of null in labels: {} and number of null in texts: {}'.format(data['labels'].isnull().sum(), data['texts'].isnull().sum()))
# stopwords removal (requires a one-time nltk.download('stopwords'))
stopwords = nltk.corpus.stopwords.words('english')
# WordNet lemmatizer (requires a one-time nltk.download('wordnet'))
wm = nltk.WordNetLemmatizer()
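# Illustrative behaviour: wm.lemmatize('geese') -> 'goose'; without an explicit
# POS tag it assumes nouns, so verbs like 'running' pass through unchanged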
# pre-processing: strip punctuation, tokenize on non-word runs, drop stopwords, lemmatize
def data_clean(texts):
    text = "".join([char for char in texts if char not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wm.lemmatize(word) for word in tokens if word and word not in stopwords]  # 'if word' drops empty tokens
    return text
data['cleaned_text'] = data['texts'].apply(lambda x: data_clean(x.lower()))
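# Illustrative result on a made-up message:
# data_clean('go until jurong point, crazy..') -> ['go', 'jurong', 'point', 'crazy']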
# Vectorizing: with analyzer=data_clean the vectorizer runs the cleaning itself,
# so fit it on the raw (lowercased) texts, not the already-tokenized column
tfidf_vect = TfidfVectorizer(analyzer=data_clean)
X_tfidf = tfidf_vect.fit_transform(data['texts'].str.lower())
print(X_tfidf.shape, tfidf_vect.get_feature_names_out())  # get_feature_names() was removed in scikit-learn >= 1.2
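# A minimal sketch of inspecting the result (assumes the dense matrix fits in memory):
# rows are messages, columns are vocabulary terms, values are TF-IDF weights
X_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names_out())
print(X_df.head())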