-
Notifications
You must be signed in to change notification settings - Fork 0
/
sensitive_detect.py
61 lines (46 loc) · 1.82 KB
/
sensitive_detect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-
import string, urllib2
import urllib
from urllib import FancyURLopener
import re
from bs4 import BeautifulSoup
from companysite.models import Sensitive
class MyOpener(FancyURLopener, object):
    """URL opener that announces itself with a desktop Firefox user agent.

    Only the ``version`` class attribute is overridden; everything else is
    inherited from ``FancyURLopener``.
    """

    # Replaces the value inherited from FancyURLopener.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
def get_opener():
    """Create and return a new ``MyOpener`` instance."""
    return MyOpener()
def get_sensitive_commentlists(begin_page, end_page, baseUrl):
    """Download a range of pages and collect the ones containing a sensitive word.

    For every ``i`` from ``begin_page`` to ``end_page`` (inclusive) the page at
    ``baseUrl + str(i)`` is fetched. Pages whose raw body contains the
    substring ``'zmid'`` are parsed with ``find_content`` and the extracted
    text is printed. When ``check_sensitive_words`` reports a match, every
    occurrence of that word in the text is wrapped in ``'M'`` markers.

    Args:
        begin_page: first page id to fetch.
        end_page: last page id to fetch (inclusive).
        baseUrl: URL prefix to which the page id is appended.

    Returns:
        A dict mapping page id to the marked-up text of each flagged page.
    """
    content_dict = {}
    myopener = get_opener()
    for i in range(begin_page, end_page + 1):
        response = myopener.open(baseUrl + str(i))
        try:
            wholePage = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()

        if 'zmid' not in wholePage:
            continue

        content = find_content(wholePage)
        print(content)

        # Look the word up once: every call to check_sensitive_words
        # re-reads the whole sensitive-word list.
        word = check_sensitive_words(content)
        if word:
            # Use 'M' on both sides to mark the sensitive word.
            content = content.replace(word, 'M' + word + 'M')
            # NOTE(review): the original indentation was lost; this assumes
            # only pages with a sensitive word are recorded -- confirm.
            content_dict[i] = content
    return content_dict
# Adapt this function to the structure of the HTML you want to parse.
def find_content(wholePage):
    """Return the text of the first element carrying the CSS class ``zmid``.

    Args:
        wholePage: HTML markup of the page.

    Returns:
        The text content of the first matching element.

    Raises:
        IndexError: if no element with that class exists in the markup.
    """
    soup = BeautifulSoup(wholePage)
    # Change the class name here to match your own pages.
    matches = soup.find_all(class_='zmid')
    return matches[0].get_text()
def check_sensitive_words(to_check):
    """Find the first stored sensitive word that occurs in ``to_check``.

    Every ``Sensitive`` record (a Django model) is converted with ``str()``,
    decoded from UTF-8 and tested as a substring of ``to_check``.

    Args:
        to_check: text to scan.

    Returns:
        The first matching word (decoded from UTF-8), or ``False`` when no
        stored word occurs in the text.
    """
    for entry in Sensitive.objects.all():
        candidate = str(entry).decode('utf-8')
        if candidate in to_check:
            return candidate
    return False
if __name__ == "__main__":
    # Set this to your URL without the trailing id,
    # e.g. http://www.xxxx.com/comments/
    baseUrl = ''
    # Inclusive range of page ids to scan.
    begin_page, end_page = 4287, 4288
    content_dict = get_sensitive_commentlists(begin_page, end_page, baseUrl)
    print(content_dict)