-
Notifications
You must be signed in to change notification settings - Fork 0
/
readingwithtext.py
133 lines (95 loc) · 4.61 KB
/
readingwithtext.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import tarfile
import simplejson as json
import sys
import os
import bz2
import re
import tweet_utils as Util
import csv
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
# Placeholder archive path; point this at the real directory of .bz2 tweet dumps.
bz2FilePath = 'dummy'

# Column order of the output CSV (must match the keys built in checkTermForTweet).
CSVfields = ['created_at', 'text', 'hashtags', 'username', 'screenname', 'followers_count', 'following_count',
             'user_location', 'user_desc']

# Tokenizer used to collapse the extra whitespace created during cleaning.
tok = WordPunctTokenizer()

pat1 = r'@[A-Za-z0-9_]+'                 # @mentions
pat2 = r'https?://[^ ]+'                 # http(s) URLs
combined_pat = r'|'.join((pat1, pat2))   # either pattern, removed in one pass
# BUG FIX: the dot was unescaped (r'www.[^ ]+'), so any token starting with
# "www" followed by ANY character matched; escape it to require a literal dot.
www_pat = r'www\.[^ ]+'

# Common English negative contractions expanded before tokenization so the
# negation word survives the letters-only filter.
negations_dic = {"isn't": "is not", "aren't": "are not", "wasn't": "was not", "weren't": "were not",
                 "haven't": "have not", "hasn't": "has not", "hadn't": "had not", "won't": "will not",
                 "wouldn't": "would not", "don't": "do not", "doesn't": "does not", "didn't": "did not",
                 "can't": "can not", "couldn't": "could not", "shouldn't": "should not", "mightn't": "might not",
                 "mustn't": "must not"}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
def tweet_cleaner(text):
    """Normalize raw tweet text for analysis.

    Steps: strip HTML markup, drop @mentions and URLs, remove BOM /
    replacement characters, lowercase, expand negative contractions,
    keep letters only, and collapse whitespace by re-tokenizing.
    Returns the cleaned string.
    """
    # Explicit parser avoids bs4's "no parser specified" warning and makes
    # the result reproducible across environments.
    soup = BeautifulSoup(text, 'html.parser')
    souped = soup.get_text()
    # BUG FIX: Python 3 str has no .decode(), so the old
    # souped.decode("utf-8-sig") always raised and the bare `except`
    # silently skipped BOM handling. Strip the BOM and map replacement
    # characters directly on the str instead.
    bom_removed = souped.replace(u"\ufeff", "").replace(u"\ufffd", "?")
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    # "don't" -> "do not", etc., so the negation survives the letter filter.
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    # Keep letters only; this introduces runs of spaces that are collapsed
    # below by tokenizing and re-joining.
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    words = [w for w in tok.tokenize(letters_only) if len(w) > 1]
    return (" ".join(words)).strip()
def checkDesc(tweet):
    """Return the tweet author's profile description, or the placeholder
    'empty' when the profile has no description (JSON null)."""
    # `is None` instead of `== None` (PEP 8); single lookup, single return.
    desc = tweet['user']['description']
    return 'empty' if desc is None else desc
def contains_word(s, w):
    """Return True when *w* occurs in *s* as a whitespace-delimited word
    (padding both strings with spaces prevents substring matches)."""
    padded_sentence = ' ' + s + ' '
    padded_word = ' ' + w + ' '
    return padded_word in padded_sentence
def checkTermForTweet(tweet):
    """Build the CSV row dict for a tweet whose text contains a tracked
    Coachella term; return None when no term matches.

    NOTE(review): matching is case-sensitive and split() keeps the '#'
    prefix on hashtag tokens, so '#Coachella' does NOT match 'Coachella'
    — confirm this exact-token filter is intended.
    """
    TermList = ['coachella2017', 'Coachella', 'coachellalive', 'Beychella', 'coachellaoutfit',
                'coachellavalley', 'coachellavibes', 'coachellaready']
    tweet_words = set(tweet['text'].split())  # set: O(1) membership per term
    if not any(term in tweet_words for term in TermList):
        return None  # make the previously implicit "no match" return explicit

    tweettext = tweet_cleaner(tweet['text'])
    hashtags = []
    if 'hashtags' in tweet['entities']:
        for tag in tweet['entities']['hashtags']:
            hashtags.append(tag['text'])
    csvObj = {'created_at': tweet['created_at'],
              'text': tweettext,
              'hashtags': hashtags,
              'username': tweet['user']['name'],
              'screenname': tweet['user']['screen_name'],
              'followers_count': tweet['user']['followers_count'],
              'following_count': tweet['user']['friends_count'],
              'user_location': tweet['user']['location'],
              'user_desc': checkDesc(tweet)
              }
    # BUG FIX (noise): removed the debug print() that dumped every matched
    # tweet to stdout while processing an entire archive.
    return csvObj
def writeCSV(tbody, writer):
    """Emit one row dict *tbody* through the supplied csv.DictWriter."""
    writer.writerow(tbody)
def reading(filepath):
    """Walk *filepath* for bz2-compressed files of line-delimited tweet
    JSON and append matching English tweets to tweetsfromtext.csv."""
    with open('tweetsfromtext.csv', 'w', newline='', encoding='utf-8') as csvfile:  # open the new CSV file
        tweetwriter = csv.DictWriter(csvfile, delimiter=',', fieldnames=CSVfields)  # initialize the CSV
        tweetwriter.writeheader()  # write the header of CSV
        for root, dirs, files in os.walk(filepath):  # read the folder
            for name in files:  # read the file
                filename = os.path.join(root, name)  # save the path of the file
                print("PATHHH---------", filename.capitalize())
                try:
                    with bz2.BZ2File(filename, 'r') as f:  # read the BZ file in the path
                        for line in f:  # read each line
                            try:
                                tweet = json.loads(line)  # parse the json
                                # Only keep English tweets that carry a timestamp.
                                if tweet['created_at'] and tweet['lang'] == 'en':
                                    tweetbody = checkTermForTweet(tweet)
                                    if tweetbody:
                                        writeCSV(tweetbody, tweetwriter)
                            except Exception as e:
                                # BUG FIX: one malformed line (e.g. a delete
                                # notice without 'lang') used to abort the
                                # rest of the file; report and keep going.
                                print(e)
                except Exception as e:
                    # BUG FIX: the original did `continue` BEFORE print(e),
                    # so file-level errors were never reported. Report first,
                    # then skip to the next file.
                    print(e)
                    continue
reading(bz2FilePath)