-
Notifications
You must be signed in to change notification settings - Fork 2
/
twitter_wordclould.py
122 lines (92 loc) · 14.5 KB
/
twitter_wordclould.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from typing import Optional, Callable, Any
import os
import re
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import arabic_reshaper
from pytweets import TwitterApp
from bidi.algorithm import get_display
from wordcloud import WordCloud, STOPWORDS
from farsi_tools import stop_words, standardize_persian_text, NON_WORD_REGEX
def persianize(x):
    """Prepare Persian/Arabic text for rendering by a left-to-right drawing engine.

    The text is first passed through ``arabic_reshaper.reshape`` and the result
    is then handed to ``bidi.algorithm.get_display``; the value returned by
    ``get_display`` is returned unchanged.
    """
    reshaped = arabic_reshaper.reshape(x)
    return get_display(reshaped)
# Matches @mentions and http(s) URLs up to the next whitespace.
_URL_OR_MENTION_REGEX = re.compile(r"(?:@|https?://)\S+")

# Emoji pattern (single code points, skin-tone variants, ZWJ sequences, flags).
# The pattern text is unchanged; it is only split into adjacent raw-string
# literals so the source stays readable, and compiled once instead of per tweet.
_EMOJI_REGEX = re.compile(
    r"([#*0-9]\uFE0F\u20E3|[\u00A9\u00AE\u203C\u2049\u2122\u2139\u2194-\u2199\u21A9\u21AA\u231A\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA\u25AB\u25B6\u25C0\u25FB-\u25FE\u2600-\u2604\u260E\u2611\u2614\u2615\u2618]|\u261D[\U0001F3FB-\U0001F3FF]?|"
    r"[\u2620\u2622\u2623\u2626\u262A\u262E\u262F\u2638-\u263A\u2640\u2642\u2648-\u2653\u265F\u2660\u2663\u2665\u2666\u2668\u267B\u267E\u267F\u2692-\u2697\u2699\u269B\u269C\u26A0\u26A1\u26AA\u26AB\u26B0\u26B1\u26BD\u26BE\u26C4\u26C5\u26C8\u26CE\u26CF\u26D1\u26D3\u26D4\u26E9\u26EA\u26F0-\u26F5\u26F7\u26F8]|"
    r"\u26F9(?:\uFE0F\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\u26FA\u26FD\u2702\u2705\u2708\u2709]|[\u270A-\u270D][\U0001F3FB-\U0001F3FF]?|"
    r"[\u270F\u2712\u2714\u2716\u271D\u2721\u2728\u2733\u2734\u2744\u2747\u274C\u274E\u2753-\u2755\u2757\u2763\u2764\u2795-\u2797\u27A1\u27B0\u27BF\u2934\u2935\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299\U0001F004\U0001F0CF\U0001F170\U0001F171\U0001F17E\U0001F17F\U0001F18E\U0001F191-\U0001F19A]|"
    r"\U0001F1E6[\U0001F1E8-\U0001F1EC\U0001F1EE\U0001F1F1\U0001F1F2\U0001F1F4\U0001F1F6-\U0001F1FA\U0001F1FC\U0001F1FD\U0001F1FF]|"
    r"\U0001F1E7[\U0001F1E6\U0001F1E7\U0001F1E9-\U0001F1EF\U0001F1F1-\U0001F1F4\U0001F1F6-\U0001F1F9\U0001F1FB\U0001F1FC\U0001F1FE\U0001F1FF]|"
    r"\U0001F1E8[\U0001F1E6\U0001F1E8\U0001F1E9\U0001F1EB-\U0001F1EE\U0001F1F0-\U0001F1F5\U0001F1F7\U0001F1FA-\U0001F1FF]|"
    r"\U0001F1E9[\U0001F1EA\U0001F1EC\U0001F1EF\U0001F1F0\U0001F1F2\U0001F1F4\U0001F1FF]|"
    r"\U0001F1EA[\U0001F1E6\U0001F1E8\U0001F1EA\U0001F1EC\U0001F1ED\U0001F1F7-\U0001F1FA]|"
    r"\U0001F1EB[\U0001F1EE-\U0001F1F0\U0001F1F2\U0001F1F4\U0001F1F7]|"
    r"\U0001F1EC[\U0001F1E6\U0001F1E7\U0001F1E9-\U0001F1EE\U0001F1F1-\U0001F1F3\U0001F1F5-\U0001F1FA\U0001F1FC\U0001F1FE]|"
    r"\U0001F1ED[\U0001F1F0\U0001F1F2\U0001F1F3\U0001F1F7\U0001F1F9\U0001F1FA]|"
    r"\U0001F1EE[\U0001F1E8-\U0001F1EA\U0001F1F1-\U0001F1F4\U0001F1F6-\U0001F1F9]|"
    r"\U0001F1EF[\U0001F1EA\U0001F1F2\U0001F1F4\U0001F1F5]|"
    r"\U0001F1F0[\U0001F1EA\U0001F1EC-\U0001F1EE\U0001F1F2\U0001F1F3\U0001F1F5\U0001F1F7\U0001F1FC\U0001F1FE\U0001F1FF]|"
    r"\U0001F1F1[\U0001F1E6-\U0001F1E8\U0001F1EE\U0001F1F0\U0001F1F7-\U0001F1FB\U0001F1FE]|"
    r"\U0001F1F2[\U0001F1E6\U0001F1E8-\U0001F1ED\U0001F1F0-\U0001F1FF]|"
    r"\U0001F1F3[\U0001F1E6\U0001F1E8\U0001F1EA-\U0001F1EC\U0001F1EE\U0001F1F1\U0001F1F4\U0001F1F5\U0001F1F7\U0001F1FA\U0001F1FF]|"
    r"\U0001F1F4\U0001F1F2|"
    r"\U0001F1F5[\U0001F1E6\U0001F1EA-\U0001F1ED\U0001F1F0-\U0001F1F3\U0001F1F7-\U0001F1F9\U0001F1FC\U0001F1FE]|"
    r"\U0001F1F6\U0001F1E6|"
    r"\U0001F1F7[\U0001F1EA\U0001F1F4\U0001F1F8\U0001F1FA\U0001F1FC]|"
    r"\U0001F1F8[\U0001F1E6-\U0001F1EA\U0001F1EC-\U0001F1F4\U0001F1F7-\U0001F1F9\U0001F1FB\U0001F1FD-\U0001F1FF]|"
    r"\U0001F1F9[\U0001F1E6\U0001F1E8\U0001F1E9\U0001F1EB-\U0001F1ED\U0001F1EF-\U0001F1F4\U0001F1F7\U0001F1F9\U0001F1FB\U0001F1FC\U0001F1FF]|"
    r"\U0001F1FA[\U0001F1E6\U0001F1EC\U0001F1F2\U0001F1F3\U0001F1F8\U0001F1FE\U0001F1FF]|"
    r"\U0001F1FB[\U0001F1E6\U0001F1E8\U0001F1EA\U0001F1EC\U0001F1EE\U0001F1F3\U0001F1FA]|"
    r"\U0001F1FC[\U0001F1EB\U0001F1F8]|"
    r"\U0001F1FD\U0001F1F0|"
    r"\U0001F1FE[\U0001F1EA\U0001F1F9]|"
    r"\U0001F1FF[\U0001F1E6\U0001F1F2\U0001F1FC]|"
    r"[\U0001F201\U0001F202\U0001F21A\U0001F22F\U0001F232-\U0001F23A\U0001F250\U0001F251\U0001F300-\U0001F321\U0001F324-\U0001F384]|"
    r"\U0001F385[\U0001F3FB-\U0001F3FF]?|"
    r"[\U0001F386-\U0001F393\U0001F396\U0001F397\U0001F399-\U0001F39B\U0001F39E-\U0001F3C1]|"
    r"\U0001F3C2[\U0001F3FB-\U0001F3FF]?|"
    r"[\U0001F3C3\U0001F3C4](?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|"
    r"[\U0001F3C5\U0001F3C6]|\U0001F3C7[\U0001F3FB-\U0001F3FF]?|[\U0001F3C8\U0001F3C9]|"
    r"\U0001F3CA(?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|"
    r"[\U0001F3CB\U0001F3CC](?:\uFE0F\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|"
    r"[\U0001F3CD-\U0001F3F0]|\U0001F3F3(?:\uFE0F\u200D\U0001F308)?|"
    r"\U0001F3F4(?:\u200D\u2620\uFE0F|\U000E0067\U000E0062(?:\U000E0065\U000E006E\U000E0067|\U000E0073\U000E0063\U000E0074|\U000E0077\U000E006C\U000E0073)\U000E007F)?|"
    r"[\U0001F3F5\U0001F3F7-\U0001F440]|\U0001F441(?:\uFE0F\u200D\U0001F5E8\uFE0F)?|[\U0001F442\U0001F443][\U0001F3FB-\U0001F3FF]?|[\U0001F444\U0001F445]|[\U0001F446-\U0001F450][\U0001F3FB-\U0001F3FF]?|[\U0001F451-\U0001F465]|[\U0001F466\U0001F467][\U0001F3FB-\U0001F3FF]?|"
    r"\U0001F468(?:\u200D(?:[\u2695\u2696\u2708]\uFE0F|\u2764\uFE0F\u200D(?:\U0001F48B\u200D)?\U0001F468|[\U0001F33E\U0001F373\U0001F393\U0001F3A4\U0001F3A8\U0001F3EB\U0001F3ED]|\U0001F466(?:\u200D\U0001F466)?|\U0001F467(?:\u200D[\U0001F466\U0001F467])?|[\U0001F468\U0001F469]\u200D(?:\U0001F466(?:\u200D\U0001F466)?|\U0001F467(?:\u200D[\U0001F466\U0001F467])?)|[\U0001F4BB\U0001F4BC\U0001F527\U0001F52C\U0001F680\U0001F692\U0001F9B0-\U0001F9B3])|[\U0001F3FB-\U0001F3FF](?:\u200D(?:[\u2695\u2696\u2708]\uFE0F|[\U0001F33E\U0001F373\U0001F393\U0001F3A4\U0001F3A8\U0001F3EB\U0001F3ED\U0001F4BB\U0001F4BC\U0001F527\U0001F52C\U0001F680\U0001F692\U0001F9B0-\U0001F9B3]))?)?|"
    r"\U0001F469(?:\u200D(?:[\u2695\u2696\u2708]\uFE0F|\u2764\uFE0F\u200D(?:\U0001F48B\u200D)?[\U0001F468\U0001F469]|[\U0001F33E\U0001F373\U0001F393\U0001F3A4\U0001F3A8\U0001F3EB\U0001F3ED]|\U0001F466(?:\u200D\U0001F466)?|\U0001F467(?:\u200D[\U0001F466\U0001F467])?|\U0001F469\u200D(?:\U0001F466(?:\u200D\U0001F466)?|\U0001F467(?:\u200D[\U0001F466\U0001F467])?)|[\U0001F4BB\U0001F4BC\U0001F527\U0001F52C\U0001F680\U0001F692\U0001F9B0-\U0001F9B3])|[\U0001F3FB-\U0001F3FF](?:\u200D(?:[\u2695\u2696\u2708]\uFE0F|[\U0001F33E\U0001F373\U0001F393\U0001F3A4\U0001F3A8\U0001F3EB\U0001F3ED\U0001F4BB\U0001F4BC\U0001F527\U0001F52C\U0001F680\U0001F692\U0001F9B0-\U0001F9B3]))?)?|"
    r"[\U0001F46A-\U0001F46D]|\U0001F46E(?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|\U0001F46F(?:\u200D[\u2640\u2642]\uFE0F)?|\U0001F470[\U0001F3FB-\U0001F3FF]?|\U0001F471(?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|\U0001F472[\U0001F3FB-\U0001F3FF]?|\U0001F473(?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|"
    r"[\U0001F474-\U0001F476][\U0001F3FB-\U0001F3FF]?|\U0001F477(?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|\U0001F478[\U0001F3FB-\U0001F3FF]?|[\U0001F479-\U0001F47B]|\U0001F47C[\U0001F3FB-\U0001F3FF]?|[\U0001F47D-\U0001F480]|[\U0001F481\U0001F482](?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|\U0001F483[\U0001F3FB-\U0001F3FF]?|\U0001F484|\U0001F485[\U0001F3FB-\U0001F3FF]?|[\U0001F486\U0001F487](?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\U0001F488-\U0001F4A9]|\U0001F4AA[\U0001F3FB-\U0001F3FF]?|"
    r"[\U0001F4AB-\U0001F4FD\U0001F4FF-\U0001F53D\U0001F549-\U0001F54E\U0001F550-\U0001F567\U0001F56F\U0001F570\U0001F573]|\U0001F574[\U0001F3FB-\U0001F3FF]?|\U0001F575(?:\uFE0F\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\U0001F576-\U0001F579]|\U0001F57A[\U0001F3FB-\U0001F3FF]?|[\U0001F587\U0001F58A-\U0001F58D]|[\U0001F590\U0001F595\U0001F596][\U0001F3FB-\U0001F3FF]?|"
    r"[\U0001F5A4\U0001F5A5\U0001F5A8\U0001F5B1\U0001F5B2\U0001F5BC\U0001F5C2-\U0001F5C4\U0001F5D1-\U0001F5D3\U0001F5DC-\U0001F5DE\U0001F5E1\U0001F5E3\U0001F5E8\U0001F5EF\U0001F5F3\U0001F5FA-\U0001F644]|[\U0001F645-\U0001F647](?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\U0001F648-\U0001F64A]|\U0001F64B(?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|\U0001F64C[\U0001F3FB-\U0001F3FF]?|[\U0001F64D\U0001F64E](?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|\U0001F64F[\U0001F3FB-\U0001F3FF]?|"
    r"[\U0001F680-\U0001F6A2]|\U0001F6A3(?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\U0001F6A4-\U0001F6B3]|[\U0001F6B4-\U0001F6B6](?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\U0001F6B7-\U0001F6BF]|\U0001F6C0[\U0001F3FB-\U0001F3FF]?|"
    r"[\U0001F6C1-\U0001F6C5\U0001F6CB]|\U0001F6CC[\U0001F3FB-\U0001F3FF]?|[\U0001F6CD-\U0001F6D2\U0001F6E0-\U0001F6E5\U0001F6E9\U0001F6EB\U0001F6EC\U0001F6F0\U0001F6F3-\U0001F6F9\U0001F910-\U0001F917]|[\U0001F918-\U0001F91C][\U0001F3FB-\U0001F3FF]?|\U0001F91D|[\U0001F91E\U0001F91F][\U0001F3FB-\U0001F3FF]?|[\U0001F920-\U0001F925]|\U0001F926(?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\U0001F927-\U0001F92F]|[\U0001F930-\U0001F936][\U0001F3FB-\U0001F3FF]?|\U0001F937(?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\U0001F938\U0001F939](?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|\U0001F93A|\U0001F93C(?:\u200D[\u2640\u2642]\uFE0F)?|[\U0001F93D\U0001F93E](?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|"
    r"[\U0001F940-\U0001F945\U0001F947-\U0001F970\U0001F973-\U0001F976\U0001F97A\U0001F97C-\U0001F9A2\U0001F9B0-\U0001F9B4]|[\U0001F9B5\U0001F9B6][\U0001F3FB-\U0001F3FF]?|\U0001F9B7|[\U0001F9B8\U0001F9B9](?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\U0001F9C0-\U0001F9C2\U0001F9D0]|[\U0001F9D1-\U0001F9D5][\U0001F3FB-\U0001F3FF]?|\U0001F9D6(?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\U0001F9D7-\U0001F9DD](?:\u200D[\u2640\u2642]\uFE0F|[\U0001F3FB-\U0001F3FF](?:\u200D[\u2640\u2642]\uFE0F)?)?|[\U0001F9DE\U0001F9DF](?:\u200D[\u2640\u2642]\uFE0F)?|[\U0001F9E0-\U0001F9FF])"
)

# Substrings that are replaced with a single space before rendering.
# NOTE(review): two entries of the original list were invisible in the source;
# they are spelled here as explicit escapes (zero-width joiner and variation
# selector-16) on the assumption that this is what they were -- confirm against
# the original file. The blank-looking ' ' entries are copied as plain spaces.
_REPLACED_WITH_SPACE = (
    '\u200d', '♀', '🕸', '🕷', '🦇', '\ufe0f', '\U0001f9df', '\U0001f9db', '\U0001f92c', '\U0001f9f9', '@', '🏻', '😍', ' ', '_',
    '\U0001f9d0', '\U0001f97a', '\U0001f928',
    '\u2066',
    '\u2069', '\U0001f970', ' ', '\U0001f92e', '\U0001f96e', '\U0001f9d9', '\U0001f92d', '\U0001f929', '\U0001f92f', '\U0001f92a',
    '\U0001f932', '\U0001f92b',
    '\U0001f973', '\U0001f974', '\u2067', '\U0001f9a0', '\U0001f976', '\U0001f931', '\U0001f96a', '\U0001f967', '\U0001f9af',
    '\U0001f6f7', '\U0001f9e1',
    '\U0001f9d4', '\U0001f9b1', '\U0001f9e0', '\U0001f9c2', '\U0001f9d8', '\U0001f9a5', '\U0001f91f', '\U0001f9a6', '\U0001f9b6',
    '\U0001f975', '\U0001f9fb',
    '\U0001f9ce', '\U0001f9d1', '\U0001f9bd', '\U0001f9ff', '\U0001f90d', '\U0001f9b8', '\U0001f9da',
    '\U0001f9b3', '\U0001f9fc', '\U0001f96c', '\U0001f995', '\U0001f9f7', '\U0001f9d0', '\U0001f97a', '\U0001f928', '\u2066',
    '\u2069', '\U0001f970', ' ',
    '\U0001f92e', '\U0001f96e', '\U0001f9d9', '>', '<', '=', '&', '\u003E', '\u003C', 'gt gt', 'lt lt', '\U0001f9d5', '\U0001f998',
    '\U0001f966', '\U0001f962',
    '\U0001fa78', '\U0001f9dc', '\U0001f90e', '\U0001f6f8', '\U0001f992', '\U0001f964', '\U0001f9c1', '\U0001fa90', '\U0001f90f',
    '\U0001f9cd', '\U0001f9cd',
)

# Arabic short-vowel / diacritic marks that are deleted outright:
# sukun, dammatan, kasratan, fathatan, damma, kasra, fatha, shadda.
_DIACRITICS_TABLE = {ord(mark): None for mark in '\u0652\u064c\u064d\u064b\u064f\u0650\u064e\u0651'}

# Leftovers of the HTML entities "&gt;" / "&lt;" once '&' has been blanked.
# Only standalone tokens are removed so words such as "results" stay intact.
_ENTITY_RESIDUE_REGEX = re.compile(r'\b(?:gt|lt)\b;?')


def _clean_tweet_text(tweet_text):
    """Strip URLs, @mentions and emoji from one tweet, then standardize it."""
    tweet_text = _URL_OR_MENTION_REGEX.sub("", tweet_text)
    tweet_text = _EMOJI_REGEX.sub("", tweet_text)
    return standardize_persian_text(tweet_text, remove_spaces=False)


def _strip_unsupported_characters(text):
    """Blank out the listed unsupported substrings and delete Arabic diacritics."""
    for chunk in _REPLACED_WITH_SPACE:
        # str.replace('', ' ') would insert a space between every character.
        if chunk:
            text = text.replace(chunk, ' ')
    return text.translate(_DIACRITICS_TABLE)


def _remove_stop_words(text):
    """Replace every stand-alone stop word in *text* with a space.

    Stop words come from ``stop_words()`` merged with wordcloud's ``STOPWORDS``.
    A word counts as stand-alone when it is preceded by the start of the text
    or a ``NON_WORD_REGEX`` match and followed by a ``NON_WORD_REGEX`` match or
    the end of the text.
    """
    stopwords = set(stop_words())
    stopwords.update(STOPWORDS)
    # Group the delimiter so an alternation inside NON_WORD_REGEX cannot leak
    # into the surrounding pattern.
    delimiter = '(?:' + NON_WORD_REGEX + ')'
    for stop_word in stopwords:
        if not stop_word or stop_word not in text:
            continue
        # No leading '.*' (it used to swallow everything before the word), the
        # word is escaped, and the trailing delimiter is only looked at so that
        # two adjacent stop words are both removed.
        pattern = '(?:^|' + delimiter + ')' + re.escape(stop_word) + '(?=' + delimiter + '|$)'
        text = re.sub(pattern, ' ', text)
    return text


def create_word_cloud(target_username: str, target_tweet_count: int, filename: str, drop_stopwords: bool = False, colormap: Optional[str] = None, color_func: Optional[Callable[..., str]] = None):
    """Build a word cloud from a Twitter user's recent tweets.

    The tweets are fetched through ``TwitterApp``, cleaned (URLs, mentions,
    emoji, unsupported characters and Arabic diacritics removed), optionally
    stripped of stop words, reshaped for right-to-left display and rendered
    with ``WordCloud`` using the ``twitter-large.png`` mask and the
    ``fonts/Samim.TTF`` font located next to this script. The image is written
    to ``filename`` in the same directory and then shown with matplotlib.

    Args:
        target_username: Twitter user name whose timeline is read.
        target_tweet_count: Maximum number of tweets to use.
        filename: Output image name, relative to the script directory.
        drop_stopwords: When true, remove Persian and English stop words.
        colormap: Matplotlib colormap name; ``'winter'`` when ``None``.
            Ignored when ``color_func`` is given.
        color_func: Optional callable passed to ``WordCloud`` as ``color_func``.
    """
    # "__file__" is a module global, never a function local, so it has to be
    # looked up in globals(); abspath guards against a relative/empty dirname.
    base_dir = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()

    # Placeholder credentials -- replace with real values before running.
    app = TwitterApp(api_key='YOUR_API_KEY',
                     api_secret_key='YOUR_API_SECRET_KEY',
                     app_name='YOUR_APP_NAME',
                     bearer_token='YOUR_BEARER_TOKEN')
    user_info = app.get_user_by_username(username=target_username)
    user_id = user_info.data[0].user_id
    tweets = app.get_user_statuses_timeline(user_id=user_id, count=target_tweet_count)
    if len(tweets) > target_tweet_count:
        tweets = tweets[:target_tweet_count]

    # Join with a space so the last word of one tweet does not fuse with the
    # first word of the next.
    text = ' '.join(_clean_tweet_text(tweet.text) for tweet in tweets)

    # replace unsupported arabic characters with space
    text = _strip_unsupported_characters(text)
    if drop_stopwords:
        text = _remove_stop_words(text)

    # convert to rtl persian/arabic style text
    text = persianize(text)
    # drop what is left of escaped '<' / '>' characters
    text = _ENTITY_RESIDUE_REGEX.sub(' ', text)

    font_path = os.path.join(base_dir, 'fonts', 'Samim.TTF')
    # Close the mask file as soon as its pixels have been copied.
    with Image.open(os.path.join(base_dir, "twitter-large.png")) as mask_image:
        mask = np.array(mask_image)

    cloud_options = {
        'background_color': "white",
        'max_words': 2000,
        'mask': mask,
        'font_path': font_path,
    }
    if color_func is not None:
        cloud_options['color_func'] = color_func
    else:
        cloud_options['colormap'] = 'winter' if colormap is None else colormap
    wc = WordCloud(**cloud_options)

    # generate word cloud
    wc.generate(text)
    # store to file
    wc.to_file(os.path.join(base_dir, filename))
    # show
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()
if __name__ == '__main__':
    # grayish colors used by the custom colour function below
    _grayish_palette = ['#999999', '#888888', '#777777', '#666666', '#555555', '#aaaaaa', '#444444', '#333333', '#222222', ]

    # noinspection PyUnusedLocal
    def my_color_func(word, font_size, position, orientation, random_state, font_path):
        """Pick a random gray shade; every argument is ignored."""
        return random.choice(_grayish_palette)

    # Demo runs, executed in order:
    #   1. english word-cloud from tweets of jack
    #   2. the same tweets rendered with the custom gray colours
    #   3. a farsi word-cloud
    _demo_runs = (
        ('jack', {'filename': 'jack.png'}),
        ('jack', {'filename': 'jack_grayish.png', 'color_func': my_color_func}),
        ('SinaRz91', {'filename': 'SinaRz91.png', 'colormap': 'autumn'}),
    )
    for _username, _options in _demo_runs:
        create_word_cloud(_username, 500, **_options)