forked from mhezarei/ai-bot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
argument_corpse.py
97 lines (94 loc) · 2.54 KB
/
argument_corpse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# import pandas as pd
#
#
# with open("stop_words_short.txt", encoding="utf-8") as f:
# sw = f.read().split('\n')
#
#
# corpse = []
# unused_chars = ['-', '،', '_', '\n', '//', '/', 'ـ', '?', '؟', '.', '؛']
#
# with open("used_combs.txt") as f:
# temp = f.read().split('=')
# ret = []
# for t in temp:
# ret += [w for w in t.split('\n') if w != '' and w not in sw]
# corpse += ret
#
# with open("used_words.txt") as f:
# temp = f.read().split('=')
# ret = []
# for t in temp:
# ret += [w for w in t.split('\n') if w != '' and w not in sw]
# corpse += ret
#
# # events
# df = pd.read_csv("shamsi_events.csv", encoding="utf-8")
# events = list(set(df["event"].tolist()))
# # whole events
# corpse += list(set(events))
# # splitting words
# # clean = []
# # for e in events:
# # s = e
# # for c in unused_chars:
# # s = s.replace(c, ' ')
# # clean.append(s)
# # clean2 = []
# # for e in clean:
# # clean2 = clean2 + [w for w in e.split(' ') if w != '']
# # clean2 = list(set(clean2))
# # corpse += clean2
#
# # countries
# df = pd.read_csv("IP2LOCATION-COUNTRY-MULTILINGUAL.CSV",
# encoding="utf-8",
# header=None, skiprows=4981, nrows=249)
# countries = df[5].tolist()
# new = []
# for c in countries:
# new = new + [w for w in c.split(' ') if w != '' and w not in sw]
# new = list(set(new))
# corpse += new
#
# # cities
# # df = pd.read_csv("cities15000.txt", sep='\t',
# # encoding="utf-8", header=None)
# # cities = df[3].tolist()
# # new2 = []
# # for c in cities:
# # new2 = new2 + [w for w in c.split(' ') if w != '' and w not in sw]
# # new2 = list(set(new2))
# # print(new2)
#
# # questions
# df = pd.read_csv("Intents_questions.csv", encoding="utf-8")
# questions = df["questions"].tolist()
# new = []
# for q in questions:
# temp = q
# for c in unused_chars:
# temp = temp.replace(c, ' ')
# new += [w for w in temp.split(' ') if w != '' and w not in sw]
# new = list(set(new))
# corpse += new
#
#
# # WARNING: do NOT re-run this section. The file is opened in append mode ("a+"), so every run appends the whole word list to argument_corpse.txt again.
# corpse = list(set(corpse))
# with open("argument_corpse.txt", "a+") as f:
# for w in corpse:
# f.write(w + '\n')
# Rebuild the vocabulary from the saved corpus file: every line is one
# entry, and entries that contain spaces are broken into their individual
# whitespace-separated words. The result, `new`, is a de-duplicated list
# (its order is arbitrary because it passes through a set).
new = []
# NOTE(review): no explicit encoding is given, so the platform default is
# used -- confirm it matches the encoding the file was written with.
with open("argument_corpse.txt") as f:
    words = f.read().split('\n')
# Drop every blank line. The previous `words.remove('')` raised ValueError
# when the file had no empty line (e.g. no trailing newline) and removed
# only the first one when there were several, leaking '' into the result.
words = [entry for entry in words if entry != '']
for w in words:
    if ' ' in w:
        # Multi-word entry: keep each word on its own.
        new += w.split()
    else:
        new.append(w)
new = list(set(new))
# with open("new_argument_corpse.txt", "a+") as f:
# for w in new:
# f.write(w + '\n')