-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessor.py
executable file
·67 lines (49 loc) · 1.99 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import datetime

import numpy as np
import pandas as pd

from .utils import *
class Preprocessor:
    """Build a per-user feature frame from records that embed a ``user`` object.

    Each input record is expected to carry a ``user`` mapping with the raw
    counters and profile fields referenced below (``id``, ``created_at``,
    ``statuses_count``, ``followers_count``, ``friends_count``,
    ``favourites_count``, ``listed_count``, ``screen_name``, ``description``,
    plus any other column named in ``feature_list``).
    NOTE(review): the field names look like Twitter API user objects --
    confirm against the caller.
    """

    # Default output columns, in output order, used when the caller does not
    # pass an explicit feature_list.
    DEFAULT_FEATURES = (
        'statuses_count',
        'followers_count',
        'friends_count',
        'favourites_count',
        'listed_count',
        'default_profile',
        'profile_use_background_image',
        'verified',
        'tweet_freq',
        'followers_growth_rate',
        'friends_growth_rate',
        'favourites_growth_rate',
        'listed_growth_rate',
        'followers_friends_ratio',
        'screen_name_length',
        'description_length',
    )

    # Derived "per unit of account age" column -> raw counter it is built from.
    _RATE_SOURCES = {
        'tweet_freq': 'statuses_count',
        'followers_growth_rate': 'followers_count',
        'friends_growth_rate': 'friends_count',
        'favourites_growth_rate': 'favourites_count',
        'listed_growth_rate': 'listed_count',
    }

    def __init__(self, feature_list = None):
        """Store the columns that ``process`` should return.

        Args:
            feature_list: Column names to keep, in order. When ``None`` a
                fresh list built from ``DEFAULT_FEATURES`` is used.
        """
        self.feature_list = (
            list(self.DEFAULT_FEATURES) if feature_list is None else feature_list
        )

    def process(self, json_list) -> pd.DataFrame:
        """Convert raw records into the selected feature columns.

        Steps: flatten each record's ``user`` object into columns, keep only
        the last row per user ``id``, derive age-normalised rates, the
        followers/friends ratio and text lengths, then select
        ``self.feature_list`` and convert every non-numeric column to a
        pandas categorical.

        Args:
            json_list: Records accepted by ``pd.DataFrame`` that each contain
                a ``user`` mapping.

        Returns:
            A new DataFrame with exactly the columns in ``self.feature_list``
            and one row per distinct user ``id``.
        """
        frame = pd.DataFrame(json_list)
        frame = pd.json_normalize(frame.user)

        # Remove duplicate users, keeping the most recent record. The result
        # must be assigned back: drop_duplicates is not in-place by default.
        frame = frame.drop_duplicates(subset='id', keep='last').reset_index(drop=True)

        # Timezone-aware "now" in UTC, shared by every row so all ages are
        # measured against the same instant.
        current_time = datetime.datetime.now(datetime.timezone.utc)
        # calculate_age comes from .utils; its unit is not visible here
        # (presumably days -- TODO confirm).
        frame['user_age'] = frame['created_at'].map(
            lambda created_at: calculate_age(current_time, created_at)
        )

        # Floor denominators at 1 so brand-new accounts (age 0) and accounts
        # following nobody do not cause a division by zero.
        age_floor = frame['user_age'].clip(lower=1)
        for rate_column, count_column in self._RATE_SOURCES.items():
            frame[rate_column] = frame[count_column] / age_floor
        frame['followers_friends_ratio'] = (
            frame['followers_count'] / frame['friends_count'].clip(lower=1)
        )

        # Missing text (None/NaN) counts as length 0 instead of raising.
        for length_column, text_column in (
            ('screen_name_length', 'screen_name'),
            ('description_length', 'description'),
        ):
            frame[length_column] = frame[text_column].fillna('').str.len()

        # Work on an explicit copy so the dtype conversion below never writes
        # through a view of the wider frame.
        features = frame[self.feature_list].copy()
        non_numeric = features.select_dtypes(exclude=np.number).columns.to_list()
        for col in non_numeric:
            features[col] = pd.Categorical(features[col])
        return features