Sorting Hat pipeline.R
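# Sorting Hat pipeline: pull a user's tweets, build n-gram, sentiment, and emotion
# features, and predict a Hogwarts house with a stack of pre-trained caret models.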
library(tidyverse)
library(rtweet)
library(tidytext)
library(sentimentr)
library(doParallel)
library(caret)
library(ggthemes)
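# Register a parallel backend so caret can use multiple cores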
cl <- makePSOCKcluster(7)
registerDoParallel(cl)
StartTime <- Sys.time()
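# Inputs: target Twitter handle, pre-built n-gram vocabularies, and the API token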
userName <- "_andrewcouch"
bow <- read.csv("bowlist.csv", header = TRUE,stringsAsFactors = FALSE)
bigram <- read.csv("bigramlist.csv", header = TRUE,stringsAsFactors = FALSE)
trigram <- read.csv("trigramlist.csv", header = TRUE,stringsAsFactors = FALSE)
token <- readRDS("twitter_token.rds")
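# Pull up to 3,200 of the user's most recent tweets and keep original (non-retweet) text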
myTweets <- get_timeline(userName, n = 3200, token = token)
tweetData <- myTweets %>%
  filter(is_retweet == FALSE) %>%
  select(text) %>%
  mutate(userName = userName)
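# Strip URLs and @mentions, then drop tweets left empty by the cleaning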
tweetData$text <- str_trim(gsub('http\\S+\\s*',"", tweetData$text))
tweetData$text <- gsub("(^|[^@\\w])@(\\w{1,15})\\b", "", tweetData$text)
tweetData <- tweetData %>%
  filter(!text %in% c(" ", "", " "))
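# n-gram features: count occurrences of each term in the pre-built bag-of-words,
# bigram, and trigram lists; the right joins keep every vocabulary term so the
# spread yields one column per term (n - 1 zeroes out terms present only via the join)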
bowFeatures <- tweetData %>%
  unnest_tokens(word, "text") %>%
  right_join(bow, by = c("word" = "bow")) %>%
  count(word) %>%
  mutate(n = n - 1) %>%
  spread(word, n)
bigramFeatures <- tweetData %>%
  unnest_tokens(bigram, "text", token = "ngrams", n = 2) %>%
  right_join(bigram, by = c("bigram" = "bigram")) %>%
  count(bigram) %>%
  mutate(n = n - 1) %>%
  spread(bigram, n)
trigramFeatures <- tweetData %>%
  unnest_tokens(trigram, "text", token = "ngrams", n = 3) %>%
  right_join(trigram, by = c("trigram" = "trigram")) %>%
  count(trigram) %>%
  mutate(n = n - 1) %>%
  spread(trigram, n)
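# Sentiment features: mean sentence-level sentiment under seven sentimentr lexicons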
sentences <- tweetData %>% select(text) %>% get_sentences()
sentiments <- cbind(
  sentences %>%
    sentiment(lexicon::hash_sentiment_huliu) %>%
    select(sentiment) %>%
    rename("huliu" = sentiment),
  sentences %>%
    sentiment(lexicon::hash_sentiment_jockers_rinker) %>%
    select(sentiment) %>%
    rename("jockers_rinker" = sentiment),
  sentences %>%
    sentiment(lexicon::hash_sentiment_nrc) %>%
    select(sentiment) %>%
    rename("nrc" = sentiment),
  sentences %>%
    sentiment(lexicon::hash_sentiment_senticnet) %>%
    select(sentiment) %>%
    rename("senticnet" = sentiment),
  sentences %>%
    sentiment(lexicon::hash_sentiment_sentiword) %>%
    select(sentiment) %>%
    rename("sentiword" = sentiment),
  sentences %>%
    sentiment(lexicon::hash_sentiment_slangsd) %>%
    select(sentiment) %>%
    rename("slangsd" = sentiment),
  sentences %>%
    sentiment(lexicon::hash_sentiment_socal_google) %>%
    select(sentiment) %>%
    rename("socal_google" = sentiment))
sentiments <- sentiments %>%
  gather(key = "sentiment", value = "score") %>%
  group_by(sentiment) %>%
  summarise(score = mean(score)) %>%
  spread(key = sentiment, value = score)
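# Emotion features: NRC emotion word counts normalised by the total token count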
emotions <- lexicon::nrc_emotions
emotionFeatures <- sentences %>%
  unnest_tokens(word, "text") %>%
  filter(word %in% emotions$term) %>%
  left_join(emotions, by = c("word" = "term")) %>%
  select(-word, -element_id, -sentence_id) %>%
  summarise(across(everything(), sum)) %>%
  gather(key = "sentiment", value = "score") %>%
  mutate(score = score / (sentences %>% unnest_tokens(word, "text") %>% nrow())) %>%
  spread(sentiment, score) %>%
  rename("anger.emotion" = anger,
         "anticipation.emotion" = anticipation,
         "digust.emotion" = disgust, # misspelling kept so feature names stay consistent with the trained models
         "fear.emotion" = fear,
         "joy.emotion" = joy,
         "sadness.emotion" = sadness,
         "surprise.emotion" = surprise,
         "trust.emotion" = trust)
df <- cbind(bowFeatures, bigramFeatures, trigramFeatures, sentiments, emotionFeatures)
LogisticRegressionModel <- readRDS("LogisticRegressionModel.rds")
NaiveBayesModel <- readRDS("NaiveBayesModel.rds")
L1Model <- readRDS("L1Model.rds")
L2Model <- readRDS("L2Model.rds")
ElasticNetModel <- readRDS("ElasticNetModel.rds")
MARSModel <- readRDS("MARSModel.rds")
KnnModel <- readRDS("KnnModel.rds")
RandomForestModel <- readRDS("RandomForestModel.rds")
SVMModel <- readRDS("SupportVectorMachineModel.rds")
EnsembleModel <- readRDS("EnsembleModel.rds")
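# Base-model predictions become the inputs to the stacked ensemble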
ensembleData <- cbind(predict(LogisticRegressionModel, df),
                      predict(NaiveBayesModel, df),
                      predict(L1Model, df),
                      predict(L2Model, df),
                      predict(ElasticNetModel, df),
                      predict(MARSModel, df),
                      predict(KnnModel, df),
                      predict(RandomForestModel, df),
                      predict(SVMModel, df)) %>%
  as.data.frame()
colnames(ensembleData) <- c("Logistic","NaiveBayes","L1","L2","ElasticNet","MARS","Knn","RandomForest","SVM")
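# Predicted probability of each Hogwarts house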
HousePrediction <- predict(EnsembleModel, ensembleData, type = "prob")
colnames(HousePrediction) <- c("Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin")
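# Shut down the cluster and report elapsed run time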
stopCluster(cl)
EndTime <- Sys.time()
EndTime - StartTime
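# Plot the house probabilities for the user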
HousePrediction %>%
  gather(key = "House", value = "Percentage") %>%
  ggplot(aes(x = House, y = Percentage, color = House, fill = House)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  ggtitle(paste(userName, "'s House Assignment", sep = "")) +
  theme_economist() +
  theme(plot.title = element_text(hjust = .5))
HousePrediction