---
title: "SEN Analysis"
output: html_notebook
---
# STEP 6 - Subset based on the quarantined dataset and re-run the DTM and LDAs
This notebook:
1. Reads in the cleaned, filtered corpus ("06_pq_metafilter.csv"), derived from the output of 01_pq_clean.Rmd with 'quarantined' data already removed
2. Removes Topic 1 articles (IDs listed in "07_pq_topic1.csv")
3. Creates a Document Term Matrix
4. Trains an LDA model
5. Uses the LDA model to predict topic categories for each article in our corpus
6. Writes the Document Term Matrix and LDA models to R data files ("*.rda") and the per-article topic labels to "08_pq_labels.csv"
## Notes to coders
* If adding a new plot, please create a new chunk
* 90% & "quarantine"
References:

* https://github.com/tqx94/Text-Analytics_LDA
* https://towardsdatascience.com/beginners-guide-to-lda-topic-modelling-with-r-e57a5a8e7a25
* https://towardsdatascience.com/lda-topic-modeling-an-explanation-e184c90aadcd
```{r, echo=FALSE, results="hide", message=FALSE}
# Load and Install Libraries
source("SEN_functions.R")
## Check libraries & install
LibraryList<-c("dplyr","stringr","magrittr","data.table","tidytext","topicmodels","colorspace","purrr","ldatuning",
"gmp","wordcloud","RColorBrewer","ggplot2","lubridate","reshape2","textmineR","scales")
install_or_load_pack(LibraryList)
outputFolder = "Data/02_Working/"
outputImgFolder = "Images/"
rFileNum = "08"
outputPngFolder<-file.path(outputImgFolder, paste0(rFileNum,"_pq_model"))
if (!dir.exists(outputPngFolder)) dir.create(outputPngFolder)
overwrite = FALSE
```
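`install_or_load_pack()` is defined in SEN_functions.R and is not shown here. A minimal sketch of the usual install-or-load pattern such a helper implements, purely as an assumption about its internals:

```{r, eval=FALSE}
# Illustrative sketch only -- the real definition lives in SEN_functions.R.
install_or_load_pack_sketch <- function(pkgs) {
  # install any packages not already present
  new_pkgs <- pkgs[!(pkgs %in% installed.packages()[, "Package"])]
  if (length(new_pkgs) > 0) install.packages(new_pkgs)
  # then attach everything
  invisible(lapply(pkgs, library, character.only = TRUE))
}
```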
# Load the cleaned corpus (06_pq_metafilter.csv, quarantined articles removed) and drop Topic 1 articles
```{r}
#load data
pq_metaclean <- data.table::fread('Data/02_Working/06_pq_metafilter.csv', colClasses = 'character')
pq_topic1 <- data.table::fread('Data/02_Working/07_pq_topic1.csv', colClasses = 'character')
#names(pq_topic1)
pq_topic1 <- pq_topic1 %>% select(`ProQuest document ID`)
pq_metafilter <- pq_metaclean %>%
anti_join(pq_topic1, by="ProQuest document ID")
numFiltered <- nrow(pq_metaclean)-nrow(pq_metafilter)
# observed on a previous run: 4067 articles pre-drop
print(paste("Number of articles pre-drop", nrow(pq_metaclean)))
# observed on a previous run: 982 removed; 3085 articles remained
print(paste("Total number of articles post-drop:", nrow(pq_metafilter), "| Number of articles removed:",numFiltered))
rm(pq_metaclean);rm(pq_topic1)
```
# Subset to document ID and full text of each article
```{r}
# subset corpus to unique identifier & full text of article
pq_metasub <- pq_metafilter %>%
select(`ProQuest document ID`,`Full text`)
nrow(pq_metasub)
rm(pq_metafilter)
```
# Generate tokens based on custom_stop_words
* Each article is treated as a separate "document"
* We append our own "custom_stop_words" to tidytext::stop_words
* tidytext::stop_words bundles the "SMART", snowball, and onix lexicons
```{r}
# add words to stop_words to create custom_stop_words
# stopwords to add pending feedback:
# main, url, becaus, day, st, caus, mani, dont, time, town
custom_stop_words <- bind_rows(tibble(word = c("maine","mainea","pm","am","ita", "pos","rt",
"main", "url", "becaus", "day", "st", "caus",
"mani", "dont", "time", "town"),
lexicon = c("custom")),
stop_words)
outputTokenFile = paste0(rFileNum,"_tokens")
tokens<-create_ifnot_tokens(outputFolder, outputTokenFile, pq_metasub, custom_stop_words, overwrite)
```
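`create_ifnot_tokens()` is defined in SEN_functions.R and caches its output to disk; its internals are not shown here. A minimal sketch of the standard tidytext pattern it presumably wraps (the function name and caching behaviour come from the call above; everything else is an assumption):

```{r, eval=FALSE}
# Illustrative sketch only -- the real implementation is in SEN_functions.R.
# Assumes a corpus data frame with `ProQuest document ID` and `Full text`.
tokenize_corpus_sketch <- function(corpus, stop_words_df) {
  corpus %>%
    tidytext::unnest_tokens(word, `Full text`) %>%   # one row per token
    dplyr::anti_join(stop_words_df, by = "word")     # drop stop words
}
# e.g. tokens_sketch <- tokenize_corpus_sketch(pq_metasub, custom_stop_words)
```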
# In united tokens we trust: generate a DocumentTermMatrix (textmineR) with 1:2 n-grams
```{r}
outputDTMFileName <- paste0(rFileNum,"_dtm_tmr_NGram-2")
outputUTokensFileName <-paste0(rFileNum,"_united_tokens")
ngrams_range=c(1,2)
tokens<-create_ifnot_united_tokens(outputFolder, outputUTokensFileName, tokens, overwrite)
dtm<-create_ifnot_CreateDtm(outputFolder,
outputDTMFileName,
ngrams_range,
tokens$`Full text`,
tokens$`ProQuest document ID`,
overwrite)
rm(pq_metasub)
# explore the basic term frequencies
tf <- TermDocFreq(dtm = dtm)
original_tf <- tf %>% select(term, term_freq,doc_freq)
rownames(original_tf) <- 1:nrow(original_tf)
# Candidate vocabulary: terms appearing more than once and in fewer than half
# of the documents
vocabulary <- tf$term[ tf$term_freq > 1 & tf$doc_freq < nrow(dtm) / 2 ]
# NOTE: the vocabulary filter is computed but deliberately not applied, so the
# DTM keeps its full vocabulary (see the sketch after this chunk)
dtm = dtm
```
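As noted in the chunk, `vocabulary` is computed but never applied (the trailing `dtm = dtm` is a no-op). If you did want to prune the DTM, a hypothetical sketch follows; doing this would change every downstream count and model:

```{r, eval=FALSE}
# Hypothetical pruning step -- not run in this notebook.
# Keeps only terms appearing >1 time and in fewer than half the documents.
dtm_pruned <- dtm[, colnames(dtm) %in% vocabulary]
dim(dtm)        # rows = documents, cols = full vocabulary
dim(dtm_pruned) # cols = filtered vocabulary
```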
# Running LDA
We fit candidate models with k = 1 to 20 topics and pick the k with the highest average coherence score (numTopicList below stages the fits in increments of 5, caching each fitted model along the way).
```{r}
# candidate numbers of topics to iterate through to find the k with the highest coherence score
numTopicList<-c(5,10,15,20)
ldaFileNum<-rFileNum
# model_list<-create_ifnot_plda(outputFolder, ldaFileNum, dtm, numTopicList, overwrite)
# create outputFolderLDA
outputFolderLDA <- paste0(outputFolder,ldaFileNum,"_lda_models/")
# if the dir doesn't exist, create it
if (!dir.exists(outputFolderLDA)) dir.create(outputFolderLDA)
fileNameLDAList <- paste0(outputFolderLDA,ldaFileNum,"_lda_model_list.rda")
if (!file.exists(fileNameLDAList) | overwrite) {
for (numTopics in numTopicList) {
# each pass fits k = 1..numTopics; per-k models are cached as .rda files, so
# later passes only fit values of k not seen before. The final pass (k = 1:20)
# produces the model_list that is saved below.
k_list <- seq(1, numTopics, by = 1)
# save all model outputs to file & run all lda in parallel (i.e., at same time; runs faster)
model_list <- textmineR::TmParallelApply(X = k_list, FUN = function(k){
fileNameLDA = file.path(outputFolderLDA, paste0(k, "_lda_topic.rda"))
if (!file.exists(fileNameLDA)) {
warning(paste("Creating new LDA:",fileNameLDA))
# find best model fit for each potential # of topics (1:NumTopics)
lda_model <- textmineR::FitLdaModel(dtm = dtm, k = k, iterations = 500)
# k == number of topics
lda_model$k <- k
lda_model$coherence <- textmineR::CalcProbCoherence(phi = lda_model$phi, dtm = dtm, M = 5)
save(lda_model, file = fileNameLDA)
} else {
warning(paste("Already exists, loading:",fileNameLDA))
# if it already exists, load it
load(fileNameLDA)
}
lda_model
}, export=c("dtm", "outputFolderLDA")) # export only needed for Windows machines
}
save(model_list, file = fileNameLDAList)
} else {
warning(paste("Already exists, loading:",fileNameLDAList))
# RData already exists & overwrite == FALSE; load saved RData object
load(file=fileNameLDAList)
}
# model tuning: choose the best model by mean coherence across its topics
coherence_mat <- data.frame(k = sapply(model_list, function(x) nrow(x$phi)),
coherence = sapply(model_list, function(x) mean(x$coherence)),
stringsAsFactors = FALSE)
```
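Roughly, textmineR's probabilistic coherence works as follows (paraphrasing the package documentation; treat the exact form as an assumption): for each pair of a topic's top $M$ words, it compares the probability of seeing the rarer word given the more probable one against the rarer word's marginal probability, averaged over all pairs:

$$\text{coherence} = \frac{1}{\binom{M}{2}} \sum_{i<j} \Big( P(w_j \mid w_i) - P(w_j) \Big)$$

where $w_1, \dots, w_M$ are the topic's top terms in decreasing $\phi$ order and probabilities are estimated from document co-occurrence in the DTM. Values near 0 mean the top words co-occur no more than chance (an incoherent topic); larger positive values mean they genuinely cluster.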
# Best topic by Coherence Score
```{r}
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("coherence_score_topic.png"))
png(outputPNGFileName,height=6,width=12, units='in', res=300)
ggplot(coherence_mat, aes(x = k, y = coherence)) +
geom_point() +
geom_line(group = 1)+
ggtitle("Best Topic by Coherence Score") + theme_minimal() +
scale_x_continuous(breaks = seq(1,max(numTopicList),1)) + ylab("Coherence")
print(paste("Plot saved as:",outputPNGFileName))
dev.off()
```
# 1. Top 20 terms based on phi
```{r}
# select the model with the highest average coherence
max_coherence<-which.max(coherence_mat$coherence)
#manual_coherence<-7
model <- model_list[[max_coherence]]
#1. Top 20 terms based on phi
model$top_terms <- GetTopTerms(phi = model$phi, M = 20)
top20_wide <- as.data.frame(model$top_terms)
top20_wide
```
# 2. Word-topic relationship
```{r, message=FALSE}
# look at the terms allocated to each topic and their Pr(word|topic)
allterms <-data.frame(t(model$phi))
allterms$word <- rownames(allterms)
rownames(allterms) <- 1:nrow(allterms)
allterms <- melt(allterms, id.vars = "word")
allterms <- allterms %>% rename(topic = variable)
FINAL_allterms <- allterms %>% group_by(topic) %>% arrange(desc(value))
FINAL_allterms
```
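A note on $\phi$: in textmineR, `model$phi` is the $K \times V$ topic-by-word matrix with $\phi_{k,v} = P(\text{word}_v \mid \text{topic}_k)$; each row sums to 1, so `t(model$phi)` above gives one column per topic containing a probability for every vocabulary term.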
# 3. Topic, word, frequency
Use this section to inspect the top-frequency words per topic and identify candidates to add to the stop-word list.
```{r, message=FALSE}
final_summary_words <- data.frame(top_terms = t(model$top_terms))
final_summary_words$topic <- rownames(final_summary_words)
rownames(final_summary_words) <- 1:nrow(final_summary_words)
final_summary_words <- final_summary_words %>% melt(id.vars = c("topic"))
final_summary_words <- final_summary_words %>% rename(word = value) %>% select(-variable)
final_summary_words <- left_join(final_summary_words,allterms)
final_summary_words <- final_summary_words %>% group_by(topic,word) %>%
arrange(desc(value))
final_summary_words <- final_summary_words %>% group_by(topic, word) %>% filter(row_number() == 1) %>%
ungroup() %>% tidyr::separate(topic, into =c("t","topic")) %>% select(-t)
word_topic_freq <- left_join(final_summary_words, original_tf, by = c("word" = "term"))
word_topic_freq
```
# 4. Per-document-per-topic probabilities
The probability that an article belongs to a particular topic.
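In textmineR, `model$theta` is the $D \times K$ document-by-topic matrix with $\theta_{d,k} = P(\text{topic}_k \mid \text{document}_d)$; each row sums to 1, and the chunk below keeps only the single highest-probability topic per document.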
```{r, message=FALSE}
#4. per-document-per-topic probabilities
#trying to see the topic in each document
theta_df <- data.frame(model$theta)
theta_df$document <-rownames(theta_df)
rownames(theta_df) <- 1:nrow(theta_df)
theta_df$document <- as.numeric(theta_df$document)
theta_df <- melt(theta_df,id.vars = "document")
theta_df <- theta_df %>% rename(topic = variable)
theta_df <- theta_df %>% tidyr::separate(topic, into =c("t","topic")) %>% select(-t)
FINAL_document_topic <- theta_df %>% group_by(document) %>%
arrange(desc(value)) %>% filter(row_number() ==1)
# value here corresponds to probability of document being a particular topic
# with 2 topics, there is fairly strong
FINAL_document_topic;nrow(FINAL_document_topic)
```
# Count of Articles per Topic
As a bar chart
```{r, message=FALSE}
plotTitle = "Count of Articles per Topic"
# count by topic
count_topic <-FINAL_document_topic %>%
group_by(topic) %>%
summarise(n= n(), perc = percent(n()/nrow(FINAL_document_topic))) %>%
arrange(as.numeric(topic),n)
count_topic
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("topic_count_bar.png"))
png(outputPNGFileName,height=6,width=12, units='in', res=300)
ggplot(count_topic, aes(x = as.factor(as.numeric(topic)), y = n, label = n, fill=as.factor(as.numeric(topic)))) +
geom_bar(width = 1,stat = "identity") +
ggtitle(plotTitle) +
geom_text(size = 3, position = position_stack(vjust = 0.5)) +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("Topic") +
ylab("Article Count") +
labs(fill = "Topic")
print(paste("Plot saved as:",outputPNGFileName))
dev.off()
```
As a pie chart
```{r, message=FALSE}
plotTitle = "Articles per Topic"
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("topic_count_pie.png"))
png(outputPNGFileName,height=6,width=6, units='in', res=300)
ggplot(count_topic, aes(x = "", y=n, label = perc, fill =as.factor(as.numeric(topic)))) +
geom_bar(width = 1,stat = "identity") +
coord_polar("y") +
ggtitle(plotTitle) +
geom_text(size = 3, position = position_stack(vjust = 0.5)) +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x=element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.ticks = element_blank()) +
xlab("Topic") +
ylab("Article Count") +
labs(fill = "Topic")
print(paste("Plot saved as:",outputPNGFileName))
dev.off()
```
# Histogram of topic probabilities
Distribution of article probabilities by topic.
```{r, message=FALSE}
plotTitle = "Topic Probabilities"
numRows<-nrow(FINAL_document_topic)
# bin count scales with corpus size (roughly 1% of the number of articles)
numBins <- round(numRows*.01)
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("prob_topic.png"))
png(outputPNGFileName,height=6,width=12, units='in', res=300)
ggplot(FINAL_document_topic, aes(value, fill=as.factor(as.numeric(topic)))) +
geom_histogram(bins=numBins) +
ggtitle(plotTitle) +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("Probability") +
ylab("Article Count") +
labs(fill = "Topic")
print(paste("Plot saved as:",outputPNGFileName))
dev.off()
```
# Count of Articles by Topic and Probability Range
Count of article probabilities by probability range and topic.
```{r, message=FALSE}
plotTitle = "Count of Articles by Topic and Probability Range"
# count by 10% breaks each topic
prob_count_topic <-FINAL_document_topic %>%
group_by(range=cut(value, breaks= seq(0, 1, by = 0.10)), topic) %>%
summarise(n= n()) %>%
arrange(topic,as.numeric(range))
prob_count_topic
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("prob_topic_count.png"))
png(outputPNGFileName,height=6, width=12, units='in', res=300)
ggplot(prob_count_topic, aes(x = range, y = n, fill = as.factor(as.numeric(topic)), label = n)) +
geom_bar(stat = "identity") +
geom_text(size = 3, position = position_stack(vjust = 0.5)) +
ggtitle(plotTitle) +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("Probability Range") +
ylab("Article Count") +
labs(fill = "Topic")
print(paste("Plot saved as:",outputPNGFileName))
dev.off()
```
# Count of Articles by Probability Range
```{r, message=FALSE}
plotTitle = "Count of Articles by Probability Range"
# count by 10% breaks
prob_count<-FINAL_document_topic %>%
group_by(range=cut(value, breaks= seq(0, 1, by = 0.10))) %>%
summarise(n= n()) %>%
arrange(as.numeric(range))
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("prob_count.png"))
png(outputPNGFileName,height=6,width=12, units='in', res=300)
ggplot(prob_count, aes(x = range, y = n, label = n, fill=range)) +
geom_bar(stat = "identity") +
geom_text(size = 3, position = position_stack(vjust = 0.5)) +
ggtitle(plotTitle) +
theme(plot.title = element_text(hjust = 0.5)) +
xlab("Probability Range") +
ylab("Article Count") +
labs(fill = "Probability Range")
print(paste("Plot saved as:",outputPNGFileName))
dev.off()
```
# Save labeled corpus to 08_pq_labels.csv
```{r}
# Save cleaned corpus to new file
outputFileNameLabels <- paste0(rFileNum,"_pq_labels")
outputFileLabels<-paste(outputFolder,outputFileNameLabels,".csv",sep="")
# don't want to lose any part of the document ID, so saving as character
FINAL_document_topic$document <- as.character(FINAL_document_topic$document)
# reformat value to 5 sig figs
FINAL_document_topic$val<-formatC(signif(FINAL_document_topic$value,digits=5), digits=5,format="fg", flag="#")
# drop old value column
FINAL_document_topic$value<-NULL
head(FINAL_document_topic);nrow(FINAL_document_topic)
if (!file.exists(outputFileLabels) | overwrite) {
write.csv(FINAL_document_topic, outputFileLabels, row.names=FALSE)
}
```
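A sketch of how a downstream notebook might re-attach these labels to the article metadata (the read-back and join below are illustrative assumptions, not part of this pipeline):

```{r, eval=FALSE}
# Illustrative only: join topic labels back onto the filtered metadata.
labels <- data.table::fread(outputFileLabels, colClasses = "character")
pq_meta <- data.table::fread("Data/02_Working/06_pq_metafilter.csv",
                             colClasses = "character")
pq_labeled <- dplyr::left_join(pq_meta, labels,
                               by = c("ProQuest document ID" = "document"))
```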
# 5. Visualising topics in a dendrogram
Shows the 'similarity' between topics.
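For reference, the Hellinger distance between two discrete probability distributions $P$ and $Q$ (here, two rows of $\phi$) is

$$H(P, Q) = \frac{1}{\sqrt{2}} \sqrt{\sum_{v} \left( \sqrt{p_v} - \sqrt{q_v} \right)^2}$$

which is 0 for identical topics and 1 for topics whose word distributions share no support.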
```{r}
# Hellinger distance: a distance between two probability distributions, here
# applied to each topic's word distribution (the rows of phi)
if (max_coherence>2) {
model$topic_linguistic_dist <- textmineR::CalcHellingerDist(model$phi)
model$hclust <- hclust(as.dist(model$topic_linguistic_dist), "ward.D")
# FitLdaModel does not set model$labels; generate one-word topic labels first
# (assumed to mirror the LabelTopics step in the tutorials referenced above)
model$labels <- textmineR::LabelTopics(assignments = model$theta > 0.05, dtm = dtm, M = 1)
model$hclust$labels <- paste(model$hclust$labels, model$labels[ , 1])
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("hclust_dendrogram.png"))
png(outputPNGFileName,height=6,width=12, units='in', res=300)
plot(model$hclust)
print(paste("Plot saved as:",outputPNGFileName))
dev.off()
}
```
# Optimized topic wordclouds
Wordclouds of the top words in each topic.
```{r}
# visualise each topic's words, weighted by their phi values (Pr(word|topic))
set.seed(1234)
#pdf("cluster.pdf")
pal2 <- brewer.pal(8,"Dark2")
for(i in 1:length(unique(final_summary_words$topic))){
outputPNGFileName <- file.path(outputPngFolder,paste0(i,"_lda_topic_wc.png"))
#print(outputPNGFileName)
png(outputPNGFileName,height=6,width=6, units='in', res=300)
wordcloud(words = subset(final_summary_words ,topic == i)$word,
freq = subset(final_summary_words ,topic == i)$value, scale=c(8,.2), min.freq = 1,
max.words=Inf, random.order=FALSE, rot.per=.15,
colors=pal2)
dev.off()
print(paste("Plot saved as:",outputPNGFileName))
}
```