-
Notifications
You must be signed in to change notification settings - Fork 0
/
title_handler.py
102 lines (84 loc) · 2.13 KB
/
title_handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# coding=utf-8
import re
import json
import jieba
# Paths of the data files read by the load_*_name_database() functions.
city_file = "city.txt"
country_file = "country.txt"
province_file = "province.txt"
# Titles waiting to be processed; filled by load_pending_titles().
pending_titles = set()
# Keywords removed from every title in main(); filled by the
# load_country/province/city_name_database() functions.
key_words = set()
def read_data(filename):
    """Return the whole content of ``filename`` as one string.

    The file is opened in text mode ("r"). A ``with`` block is used so the
    handle is closed even when ``read()`` raises, which the previous manual
    open/read/close sequence did not guarantee.

    Args:
        filename: Path of the file to read.

    Returns:
        Everything ``read()`` yields for the file.

    Raises:
        IOError / OSError: If the file cannot be opened or read.
    """
    with open(filename, "r") as fd:
        return fd.read()
def load_pending_titles(filename):
    """Load one title per line from ``filename`` into ``pending_titles``.

    Args:
        filename: Path of the title file, passed to ``read_data``.

    Each non-blank line is added to the module-level ``pending_titles`` set.
    Blank lines (for example the one produced by a trailing newline) are
    skipped so that no empty title is queued.
    """
    data = read_data(filename)
    for line in data.split("\n"):
        # Only byte strings (what Python 2 file reads return) have to be
        # decoded; a Python 3 text-mode read already yields str, which has
        # no decode() method.
        if isinstance(line, bytes):
            line = line.decode("utf-8")
        if not line.strip():
            continue
        pending_titles.add(line)
def load_brand_name_database():
    """Placeholder for loading brand names; currently does nothing."""
def load_sensitive_word_database():
    """Placeholder for loading sensitive words; currently does nothing."""
def load_violation_word_database():
    """Placeholder for loading violation words; currently does nothing."""
def load_country_name_database():
    """Add the entries stored in ``country_file`` to ``key_words``.

    The file content is parsed as JSON, and every item obtained by iterating
    the parsed value is merged into the module-level ``key_words`` set.
    """
    country_names = json.loads(read_data(country_file))
    key_words.update(country_names)
def load_province_name_database():
    """Add the entries stored in ``province_file`` to ``key_words``.

    The file content is parsed as JSON, and every item obtained by iterating
    the parsed value is merged into the module-level ``key_words`` set.
    """
    province_names = json.loads(read_data(province_file))
    key_words.update(province_names)
def _strip_admin_suffix(word):
    """Return ``word`` without one trailing "市" or "自治州" suffix."""
    for suffix in (u"市", u"自治州"):
        if word.endswith(suffix):
            # Slice the suffix off. str.rstrip() treats its argument as a
            # set of characters and would also eat characters that belong
            # to the name itself (e.g. "沙市市" -> "沙").
            return word[:-len(suffix)]
    return word


def load_city_name_database():
    """Load city names from ``city_file`` into ``key_words``.

    Each line is split on ":". A line with a single field contributes that
    field; a line with two fields contributes the second one. Lines with any
    other number of fields, and lines whose name is blank, are skipped
    instead of crashing or registering an empty keyword.

    Every name is added to the module-level ``key_words`` set twice: once as
    written and once with a trailing "市" / "自治州" removed (when that leaves
    a non-empty string). Both forms are printed as they are added.
    """
    data = read_data(city_file)
    for line in data.split("\n"):
        fields = line.split(":")
        if len(fields) == 1:
            word = fields[0]
        elif len(fields) == 2:
            word = fields[1]
        else:
            # Unknown layout: there is no way to tell which field is the name.
            continue

        # Only byte strings (Python 2 file reads) need decoding; Python 3
        # text-mode reads already yield str, which has no decode().
        if isinstance(word, bytes):
            word = word.decode("utf-8")

        word = word.strip()
        if not word:
            # An empty keyword would "match" every title in main().
            continue

        print(word)
        key_words.add(word)

        short_name = _strip_admin_suffix(word)
        if short_name:
            print(short_name)
            key_words.add(short_name)
def remove_brand_name():
    """Placeholder for brand-name removal; currently does nothing."""
# def
def remove_punctuation(title):
    """Return ``title`` with every run of listed punctuation characters removed.

    The character class below enumerates the ASCII and CJK punctuation marks
    that are deleted; characters outside it are left untouched.
    """
    punctuation_run = re.compile(
        r'[<>,\.\?;:\'\"\\\|\[\]\{\}_\+=\(\)\*&\^%\$#@!~`,《。》?;:‘“’”、\|【】\{\})(……¥!~·]+'
    )
    return punctuation_run.sub("", title)
def main(title_file):
    """Strip punctuation and known location keywords from every pending title.

    Args:
        title_file: Path of the file holding one title per line.

    Loads the titles and the country / province / city keyword sets, then for
    each title prints it, removes punctuation, removes every keyword it
    contains, and prints the result. Finally prints how many keyword removals
    took place in total (one per keyword per title).
    """
    total = 0
    load_pending_titles(title_file)
    load_country_name_database()
    load_province_name_database()
    load_city_name_database()

    # Try longer keywords first so that e.g. "北京市" is removed as a whole
    # before "北京" can leave a dangling "市" behind. Iterating the raw set
    # made the outcome depend on arbitrary set ordering. Empty / None entries
    # are dropped because they would either match everything or raise.
    ordered_words = sorted(
        (word for word in key_words if word),
        key=lambda word: (-len(word), word),
    )

    for title in pending_titles:
        print(title)
        title = remove_punctuation(title)
        for word in ordered_words:
            if word in title:
                title = title.replace(word, "")
                total += 1
        print(title)
    print("removed %d" % (total))
if __name__ == "__main__":
    # When run as a script, process the titles listed in ./test.txt.
    title_file = "./test.txt"
    main(title_file)