-
Notifications
You must be signed in to change notification settings - Fork 8
/
text_processing.py
91 lines (76 loc) · 2.72 KB
/
text_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import gzip
import os
import re
import nltk
def vocab_map(vocab):
    """Build a lookup table from each vocabulary entry to its position.

    Args:
        vocab: Iterable of hashable entries (typically word strings).

    Returns:
        A dict mapping every entry to its 0-based index in ``vocab``.
        If an entry occurs more than once, the index of its last
        occurrence is the one that is kept.
    """
    return {entry: position for position, entry in enumerate(vocab)}
def tokenize(sequence):
    """Split ``sequence`` into word tokens with quotation marks removed.

    Tokenization is delegated to ``nltk.word_tokenize``. Tokens equal to a
    single space are dropped; from every remaining token the substrings
    `` and '' and the double-quote character are deleted.

    Note that the filtering happens before the quote removal, so a token
    that consists only of those quote marks ends up as an empty string and
    is still part of the result.

    Args:
        sequence: The text to tokenize.

    Returns:
        A list of token strings.
    """
    cleaned = []
    for raw in nltk.word_tokenize(sequence):
        if raw == " ":
            continue
        without_quotes = raw.replace("``", '').replace("''", '').replace('"', '')
        cleaned.append(without_quotes)
    return cleaned
def clean_text(text):
    """Strip URLs, markup remnants and selected punctuation from ``text``.

    The substitutions run in a fixed order, each on the previous result:

    1. Everything from ``http://`` / ``https://`` up to the end of that
       line, together with the line break(s) that follow, is deleted.
    2. ``<br />`` and ``<a href`` are each replaced by a single space.
    3. The characters ``_ " | + & = * # $ @ /`` are deleted.
    4. ``(`` and ``)`` are padded with a space on both sides, and the
       substrings ``LRB`` / ``RRB`` are replaced by ``" ( "`` / ``" ) "``
       wherever they occur (including inside longer words).
    5. ``-`` and ``...`` are each replaced by a single space.

    Runs of spaces produced by these steps are not collapsed.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    # re.MULTILINE is kept from the original call; the pattern has no
    # ^ or $ anchors, so the URL match simply runs to the end of the line.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

    # HTML remnants must be handled before '/' is stripped below: once the
    # slash is gone '<br />' has become '<br >' and would never match.
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\<a href', ' ', text)

    # '&' is part of this class, so no separate '&' removal is needed.
    text = re.sub(r'[_"|+&=*#$@/]', '', text)

    # Pad literal brackets, then expand the LRB / RRB bracket placeholders.
    text = re.sub(r'\(', ' ( ', text)
    text = re.sub(r'\)', ' ) ', text)
    text = re.sub(r'LRB', ' ( ', text)
    text = re.sub(r'RRB', ' ) ', text)

    text = re.sub(r'\-', ' ', text)
    text = text.replace("...", " ")
    return text
def load_whole_glove(glove_file):
    """Load a space-separated GloVe text file into parallel lists.

    The file is read as UTF-8 text; a path whose extension is ``.gz`` is
    transparently decompressed. Each accepted line contributes its first
    field as the word and the remaining fields, converted with ``float``,
    as the vector.

    Two special rows are placed in front of the file contents:

    * index 0: ``'[PAD]'`` with an all-zero vector (zero padding),
    * index 1: ``'UNK'`` with an all-one vector.

    Their length is the number of fields on the first line of the file
    minus one.

    Lines with fewer than 10 space-separated fields are reported on stdout
    and skipped. The width of accepted lines is not checked against the
    first line.

    Args:
        glove_file: Path to the embedding file (plain text or ``.gz``).

    Returns:
        A ``(vocab, embed)`` tuple: ``vocab`` is a list of words and
        ``embed`` a list of float lists, aligned by index.

    Raises:
        ValueError: If a field of an accepted line cannot be parsed as a
            float.
        OSError: If the file cannot be opened or read.
    """
    is_gz = os.path.splitext(glove_file)[1] == '.gz'
    # Use one opener for both formats and always decode as UTF-8. Without an
    # explicit encoding gzip.open() in text mode falls back to the locale
    # default, which made .gz files decode differently from plain ones.
    opener = gzip.open if is_gz else open

    # Getting embedding dimension from the first line only.
    with opener(glove_file, 'rt', encoding='utf-8') as handle:
        first_line = handle.readline()
    emb_dim = len(first_line.strip().split(' ')) - 1

    # First row of embedding matrix is 0 for zero padding, second is UNK.
    vocab = ['[PAD]', 'UNK']
    embed = [[0.0] * emb_dim, [1.0] * emb_dim]

    # Reading embedding matrix
    with opener(glove_file, 'rt', encoding='utf-8') as handle:
        for line in handle:
            items = line.replace('\r', '').replace('\n', '').split(' ')
            if len(items) < 10:
                # Too short to be a word plus vector; report it and move on.
                print("exceptional line: {}".format(line))
                continue
            vocab.append(items[0])
            embed.append([float(value) for value in items[1:]])

    print('Loaded GloVe!')
    return vocab, embed
# if __name__=="__main__":
#
# text ="I don\'t think this is right..."
# text =clean_text(text)
# print(text)