data_cleaning.py
import spacy
import json
import re
from nltk.corpus import stopwords

# Assumes the dataset file contains JSON rows of the form [text, label, label].
with open('abusedetection_dataset') as fichero:
    datos = json.load(fichero)

# Requires the small Spanish spaCy model (python -m spacy download es_core_news_sm)
# and the NLTK Spanish stopword list (nltk.download('stopwords')).
nlp = spacy.load('es_core_news_sm')
spanish_stopwords = set(stopwords.words('spanish'))


def datacleaner(text):
    """Lowercase, lemmatize and strip noise from a Spanish text string."""
    text = text.lower()
    parsed = nlp(text)
    final_tokens = []
    for t in parsed:
        # Skip punctuation, whitespace, numbers, URLs and @mentions.
        if t.is_punct or t.is_space or t.like_num or t.like_url or t.text.startswith('@'):
            continue
        # Keep only lowercase Spanish letters in the lemma.
        sc_removed = re.sub('[^a-zñáéíóúäëïöü]', '', t.lemma_)
        # Drop single-character leftovers and stopwords.
        if len(sc_removed) > 1 and sc_removed not in spanish_stopwords:
            final_tokens.append(sc_removed)
    joined = ' '.join(final_tokens)
    # Collapse runs of a repeated character to at most two ('holaaaa' -> 'holaa').
    spell_corrected = re.sub(r'(.)\1+', r'\1\1', joined)
    return spell_corrected


# Clean the text field of every row; labels pass through unchanged.
datos_clean = [[datacleaner(line[0]), line[1], line[2]] for line in datos]

with open('clean_data', 'w') as ficherosalida:
    json.dump(datos_clean, ficherosalida)
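# Usage sketch (illustrative; the sample string below is hypothetical, not
# taken from the dataset):
#
#   datacleaner('Holaaaa @usuario mira https://example.com 1234!!!')
#
# Mentions, URLs, numbers and punctuation are dropped, Spanish stopwords are
# removed, and any character repeated three or more times is collapsed to
# exactly two ('holaaaa' -> 'holaa').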