07_pq_label_review.Rmd

---
title: "pq_label_review"
output:
  pdf_document: default
  html_notebook: default
---

# STEP 3 - Review results from DTM_inferLDATopics_LabelCorpus

This notebook:
1. Reads in labeled outputs from 06_pq_model.Rmd ("06_pq_labels.csv")
2. Joins "06_pq_labels.csv" with "01_pq_metaclean.csv"
3. Subsets dataset based on labels and percent probabilities
4. Writes subsets to CSVs

# Notes
1. "SUBSETTING" keyword for searching 06_pq_label_review.Rmd

```{r, echo=FALSE, results="hide", messages=FALSE}
# Load and Install Libraries
source("SEN_functions.R")

## Check libraries & install
LibraryList<-c("stringr","data.table","dplyr","tidyr","magrittr","NLP","tidytext","tm","ggplot2",
               "scales", "ggwordcloud","textmineR","digest", "broom", "stringi", "xlsx")
install_or_load_pack(LibraryList)


outputFolder = "Data/02_Working/"
outputImgFolder = "Images/"
rFileNum = "07"
rFileModelNum = "06"

outputPngFolder<-file.path(outputImgFolder, paste0(rFileNum,"_pq_review"))
if (!dir.exists(outputPngFolder)) dir.create(outputPngFolder)

overwrite = FALSE
```


```{r}
#load data
# 01_pq_metaclean.csv
pq_metaclean <- data.table::fread('Data/02_Working/01_pq_metaclean.csv')
# read in columns as characters so that doc id does not read in as numeric
# 04_pq_labels.csv
pq_labels <- data.table::fread(paste0('Data/02_Working/',rFileModelNum,'_pq_labels.csv'), colClasses = 'character')

# Head displays the first 6 rows of the data.table
#head(pq_metaclean)
#head(pq_labels)
```


```{r}
# displays column names
print("pq_metaclean columns:")
names(pq_metaclean)
print("")
print("pq_labels columns:")
names(pq_labels)
nrow(pq_metaclean);nrow(pq_labels)
```

# join cleaned dataset with labels
```{r}
# join tables
# inner_join because pq_metaclean was subset based on topic 2 when the model was re-ran in "04_pq_model.Rmd"
# so we only want where the Proquest ID exists in both the original dataset and the labels.
pq_metajoin <- pq_labels %>% 
  inner_join(pq_metaclean, by = c("document" = "ProQuest document ID"))

pq_metajoin <- pq_metajoin %>%
  rename(`ProQuest document ID` = document)

nrow(pq_metajoin);head(pq_metajoin)

pq_empty<-pq_metajoin[pq_metajoin$topic == "" | is.na(pq_metajoin$topic),]
nrow(pq_empty)

# write labeled corpus to CSV
outputFileName = "pq_topics"
outputFile = paste(outputFolder,rFileNum,"_",outputFileName,".xlsx",sep="")
if (!file.exists(outputFile) | overwrite) {
  write.xlsx(pq_metajoin, outputFile, row.names=FALSE)
  #write.htmltable(pq_metajoin,title=outputFile, outputFile, sortby="topic","val")
}
```
# Code now ready for SUBSETTING if you want to skip the following sections --


# exploratory data analysis of topics generated
Do we want to keep both topics, or drop one of the topics and dig deeper into the other topic?

# number of topics by publication title
```{r, messages=FALSE}
plotTitle = "Count of Articles by Topic and Publication Title"

# count by pub title and topic
count_topic_pubtitle <-pq_metajoin %>%
  group_by(`Publication title`,topic) %>% 
  summarise(n= n()) %>%
  arrange(as.numeric(topic),as.numeric(n))

count_topic_pubtitle

# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("count_topic_pubtitle.png"))
png(outputPNGFileName,height=6,width=12, units='in', res=300)
ggplot(count_topic_pubtitle, aes(x = as.factor(as.numeric(topic)), y = n, fill = `Publication title`, label = n )) +
  geom_bar(stat = "identity") +
  geom_text(size = 3, position = position_stack(vjust = 0.5)) +
  ggtitle(plotTitle) +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Topic") +
  ylab("Article Count") +
  labs(fill = "Topic")
print(paste("Plot saved as:",outputPNGFileName))
dev.off()

knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/count_topic_pubtitle.png"))
```

# number of topics by location

# Count articles by topic and year
Important to consider the total articles/year in addition to the raw count of articles per topic.
Peaks in articles may simply be due to an overall increase of articles for a particular year.
```{r, messages=FALSE}
# article count per year and topic
count_topic_year <-pq_metajoin %>%
  group_by(`Publication year`,topic) %>% 
  summarise(n= n()) %>%
  arrange(as.numeric(topic),as.numeric(`Publication year`))

count_topic_year

# article count per year
count_year <-pq_metajoin %>%
  group_by(`Publication year`) %>% 
  summarise(perYear= n()) %>%
  arrange(as.numeric(`Publication year`))

count_topic_year<-count_topic_year %>%
  left_join(count_year, by = "Publication year")

count_topic_year<-count_topic_year %>%
  mutate(ratio = round(n/perYear,2))

plotTitle = "Count of Articles by Topic and Year"
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("count_topic_year.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)
ggplot(count_topic_year, aes(x = `Publication year`, colour = as.factor(as.numeric(topic)))) +
  geom_line(aes(y = n)) +
  scale_x_continuous(breaks=seq(min(na.omit(count_topic_year$`Publication year`)), max(na.omit(count_topic_year$`Publication year`)),1)) +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(as.numeric(topic))) +
  ggtitle(plotTitle) +
  xlab("Publication Year") +
  ylab("Article Count") +
  guides(colour=FALSE) 
print(paste("Plot saved as:",outputPNGFileName))
dev.off()

plotTitle = "Count of Articles by Topic and Year with Year Total"
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("count_topic_year_tot.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)
ggplot(count_topic_year, aes(x = `Publication year`, colour = as.factor(as.numeric(topic)))) +
  geom_line(aes(y = n)) +
  geom_line(linetype = "dashed", color="black", aes(y = perYear)) +
  scale_x_continuous(breaks=seq(min(na.omit(count_topic_year$`Publication year`)), max(na.omit(count_topic_year$`Publication year`)),1)) +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(as.numeric(topic))) +
  ggtitle(plotTitle) +
  xlab("Publication Year") +
  ylab("Article Count") +
  guides(colour=FALSE) 
print(paste("Plot saved as:",outputPNGFileName))
dev.off()

plotTitle = "Ratio of Articles by Topic and Year"
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("ratio_topic_year.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)
ggplot(count_topic_year, aes(x = `Publication year`, colour = as.factor(as.numeric(topic)))) +
  geom_line(aes(y = ratio)) +
  scale_x_continuous(breaks=seq(min(na.omit(count_topic_year$`Publication year`)), max(na.omit(count_topic_year$`Publication year`)),1)) +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(as.numeric(topic))) +
  ggtitle(plotTitle) +
  xlab("Publication Year") +
  ylab("(Article Count)/(Article Total)") +
  guides(colour=FALSE) 
print(paste("Plot saved as:",outputPNGFileName))
dev.off()

knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/count_topic_year.png"))
knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/count_topic_year_tot.png"))
knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/ratio_topic_year.png"))
```

# Coherence score by topic
```{r, out.width="50%", fig.pos="h"}
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/coherence_score_topic.png"))
```

# Wordclouds
```{r, out.width="50%", fig.pos="h"}
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/1_lda_topic_wc.png"))
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/2_lda_topic_wc.png"))
```
# Count of Article by Topic
```{r, out.width="50%", fig.pos="h"}
# bar chart
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/topic_count_bar.png"))

# pie chart
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/topic_count_pie.png"))
```

# Count of Probabilities by Percentage
```{r, out.width="50%", fig.pos="h"}
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/prob_count.png"))
```
# Count of Topic Probabilities by Percentage
```{r, out.width="50%", fig.pos="h"}
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/prob_topic_count.png"))
```
#5. Visualising of topics in a dendrogram - not enough topics (<2)
```{r, out.width="50%", fig.pos="h"}
# knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/hclust_dendrogram.png"))

```

# START SUBSETTING - Subset Function
#### topic_subset_csv <- function(outputFolderName, rFileNum, outputFileName, inputCorpus, minPerc, maxPerc, theTopic, overwrite)

* outputFolderName = folder to save to -- i.e., "Data/02_Working/" (this is set at the top of the code where the libraries are loaded/installed)
* rFileNum = The number in the r file -- i.e., "05" (this is set at the top of the code where the libraries are loaded/installed)
* outputFileName = the file name to save as -- i.e., "pq_topic5_90perc" (this needs to be set at least 1 line above where the function is ran)
* inputCorpus = the labeled corpus -- i.e., pq_metajoin (this is created shortly after where libraries are loaded/installed in review.Rmd by joining pq_metaclean with pq_labels)
* minPerc = minimum percentage -- i.e., 0.9 (this needs to be set at least 1 line above where the function is ran)
* maxPerc = maximum percentage -- i.e., 1 (this needs to be set at least 1 line above where the function is ran)
* theTopic = the topic category -- i.e., "1" (this needs to be set at least 1 line above where the function is ran)
* overwrite = whether to overwrite the file if it already exists -- i.e., FALSE (this needs to be set at least 1 line above where the function is ran)

# Subset topic 1
```{r}
# subset corpus to unique identifier & full text of article
# investigate topic 1

outputFileName_1t <- "pq_topic1"
minPerc <- 0
maxPerc <- 1
theTopic <- "1"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic1_subset<-topic_subset_csv(outputFolder, rFileNum, outputFileName_1t, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_topic1_subset);nrow(pq_topic1_subset)
```
# Subset topic 2
```{r}
# subset corpus to unique identifier & full text of article
# investigate topic 2

outputFileName_1t <- "pq_topic2"
minPerc <- 0
maxPerc <- 1
theTopic <- "2"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic1_subset<-topic_subset_csv(outputFolder, rFileNum, outputFileName_1t, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_topic1_subset);nrow(pq_topic1_subset)
```

# Subset topic 1, 90%
```{r}
# subset corpus to unique identifier & full text of article
# investigate topic 1, 90%
outputFile_1t90perc <- "pq_topic1_90perc"
minPerc <- 0.9
maxPerc <- 1
theTopic <- "1"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic1_90perc<-topic_subset_csv(outputFolder, rFileNum, outputFile_1t90perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_topic1_90perc);nrow(pq_topic1_90perc)
```


# Subset topic 2, 90%
```{r}
# subset corpus to unique identifier & full text of article
# investigate topic 2, 90%
outputFile_5t90perc <- "pq_topic2_90perc"
minPerc <- 0.9
maxPerc <- 1
theTopic <- "2"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic5_90perc<-topic_subset_csv(outputFolder, rFileNum, outputFile_5t90perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_topic5_90perc);nrow(pq_topic5_90perc)
```


# Subset 50-60%-ers
```{r}
# subset corpus to unique identifier & full text of article
# investigate all topics, 50-60%
outputFile_56perc <- "pq_56perc"
minPerc <- 0.5
maxPerc <- 0.6
theTopic <- "all"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_56perc<-topic_subset_csv(outputFolder, rFileNum, outputFile_56perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_56perc);nrow(pq_56perc)
```

# Subset 0-60%-ers
```{r}
# subset corpus to unique identifier & full text of article
# investigate all topics, 0-60%
# if within the function you write overwrite=TRUE, then the output CSV file will be re-written. 
# overwrite, outputFolder, rFileName are set in the first chunk of code


outputFileName_06perc <- "pq_06perc"
minPerc <- 0
maxPerc <- 0.6
theTopic <- "all"

## e.g, minPerc = 0 & maxPerc = 0.6 is zero to 0.59999999 ...
pq_perc06_subset<-topic_subset_csv(outputFolder, rFileNum, outputFileName_06perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_perc06_subset);nrow(pq_topic1_subset)
unique(pq_perc06_subset$topic)
min(pq_perc06_subset$val)
max(pq_perc06_subset$val)
```

# END SUBSETTING

# explore term frequencies by year - how do words in our corpus change over time?
Referenced walk-through from: https://cran.r-project.org/web/packages/tidytext/vignettes/tidying_casting.html
Are articles normalized based on the number of articles per year
```{r}
# subset corpus to unique identifier & year
pq_time <- pq_metaclean %>% 
  select(`ProQuest document ID`, `Publication year`)

# tokens generated from 06_pq_model.Rmd
outputTokenFile = paste0(rFileModelNum,"_tokens.RData")
tokensFileName = file.path(outputFolder,outputTokenFile)

load(file=tokensFileName)

tokens <- tokens %>% 
  full_join(pq_time, by = c("ProQuest document ID" = "ProQuest document ID")) %>%
  rename(Year = `Publication year`) %>%
  rename(pq_id = `ProQuest document ID`) %>%
  mutate_at(vars(Year), funs(as.integer))
rm(pq_time)

outputFreqsFile = paste0(rFileNum,"_tokens_freq")
tokens_freq<-create_ifnot_tokens_freq(outputFolder, outputFreqsFile, tokens, overwrite)

outputFreqModelFile = paste0(rFileNum,"_freq_models")
freq_models <- create_ifnot_freqmodels(outputFolder, outputFreqModelFile, tokens_freq, overwrite)
```

# model results
```{r}
freq_models %>%
  filter(term == "Year") %>%
  arrange(desc(abs(estimate)))
```

# Models displayed as a volcano plot, which compares the effect size with the significance
```{r}
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("word_change_over_time.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)

freq_models %>%
  mutate(adjusted.p.value = p.adjust(p.value)) %>%
  ggplot(aes(estimate, adjusted.p.value)) +
  geom_point() +
  scale_y_log10() +
  geom_text(aes(label = word), vjust = 1, hjust = 1,
            check_overlap = TRUE) +
  xlab("Estimated change over time") +
  ylab("Adjusted p-value")


print(paste("Plot saved as:",outputPNGFileName))
dev.off()

knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/word_change_over_time.png"))
```

# Top 6 terms that have changed in frequency over time
```{r}
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("top_word_change_over_time.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)

freq_models %>%
  top_n(6, abs(estimate)) %>%
  inner_join(tokens_freq) %>%
  ggplot(aes(Year, percent)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~ word) +
  scale_y_continuous(labels = percent_format()) +
  ylab("Frequency of word in speech")

print(paste("Plot saved as:",outputPNGFileName))
dev.off()

knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/top_word_change_over_time.png"))
```