# 2_5_find_tweets_with_keywords.py
"""
get the tweets of the specified keywords
and make csv files
"""
#%%
import os

import pandas as pd
import spacy  # GiNZA registers the 'ja_ginza' pipeline for Japanese
from tqdm import tqdm

from utils import remove_string_special_characters, remove_keywords

nlp = spacy.load('ja_ginza')
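# Note: remove_string_special_characters and remove_keywords live in this
# repo's utils module. Their bodies are not shown here; a plausible sketch
# of the first (an assumption, not the actual implementation):
#
#   def remove_string_special_characters(s):
#       s = re.sub(r'https?://\S+', ' ', s)   # drop URLs
#       s = re.sub(r'\s+', ' ', s).strip()    # collapse whitespace
#       return s if s else None               # None signals an empty tweet
#
# remove_keywords presumably strips the query terms used to collect the
# corpus (e.g. "ワクチン"), so they do not match every tweet trivially.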
title = "vaccine"
root_dir = os.getcwd()
input_path = os.path.join(root_dir, 'results', title)
# uncomment keywords below to get results for a single unigram or a combination of unigrams
keywords = [
    # "感染",        # infection
    "予約",          # reservation
    "会場",          # venue
    # "情報",        # information
    # "日本",        # Japan
    # "副反応",      # side effect
    # "ファイザー",  # Pfizer
    # "可能",        # possible
    # "効果",        # effect
    # "デルタ",      # Delta (variant)
    # "以上",        # more than
    # "感染者",      # infected person
    # "モデルナ",    # Moderna
    # "変異",        # mutation/variant
]
keywords_title = "+".join(keywords)
print(keywords_title)
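# with the defaults above, keywords_title is "予約+会場"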
result_path = os.path.join(root_dir, 'results', 'keywords', keywords_title)
os.makedirs(result_path, exist_ok=True)
start, end = '2021-02-01', '2021-09-30'
print(f"{start}~{end}")
dates = pd.date_range(start, end, freq='D')
date_list = dates.strftime('%Y-%m-%d').to_list()
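# date_list is a plain list of day strings, e.g.
# ['2021-02-01', '2021-02-02', ..., '2021-09-30'] (242 days)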
#%%
text_list = []
for date in tqdm(date_list):
    date_path = os.path.join(input_path, date)
    file_list = os.listdir(date_path)
    for file_name in file_list:
        with open(os.path.join(date_path, file_name), 'r') as f:
            text = f.readline()
        pre_text = remove_string_special_characters(text)
        if pre_text is None:
            continue
        # tokenize with GiNZA and drop stopwords
        doc = nlp(pre_text)
        pre_text = ' '.join([x.text for x in doc if not x.is_stop])
        pre_text = remove_keywords(pre_text)
        # keep the tweet only if it contains every selected keyword
        if all(keyword in pre_text for keyword in keywords):
            text_list.append([date, file_name])
df_keywords = pd.DataFrame(text_list, columns=["Date", "ID"])
df_keywords.to_csv(os.path.join(result_path, f"{keywords_title}.csv"))
del text_list, df_keywords
print("Done!")
# %%
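# Optional sanity check (a minimal sketch, not part of the original script):
# reload the CSV just written and count matching tweets per day.
df_check = pd.read_csv(os.path.join(result_path, f"{keywords_title}.csv"), index_col=0)
print(df_check.groupby("Date").size().head())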