From d4292f8d7a4858af2634db12b5efe9c67500550a Mon Sep 17 00:00:00 2001
From: embruna
Date: Wed, 24 Jul 2024 10:59:34 -0400
Subject: [PATCH 1/7] added WOS tag AR

---
 R/references_read.R | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/R/references_read.R b/R/references_read.R
index 45702ea..1ea35fb 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -96,6 +96,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "VL" = character(0),
     "WC" = character(0),
     "Z9" = character(0),
+    "AR" = character(0),
     stringsAsFactors = FALSE
   )

@@ -280,11 +281,11 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {

   if (include_all != TRUE){

     dropnames <- c("CC", "CH", "CL", "CT", "CY",
-                   "DT", "FX", "GA", "GE", "ID",
-                   "IS", "J9", "JI", "LA", "LT",
-                   "MC", "MI", "NR", "PA", "PI",
-                   "PN", "PS", "RID", "SI", "SU",
-                   "TA", "VR")
+                  "DT", "FX", "GA", "GE", "ID",
+                  "IS", "J9", "JI", "LA", "LT",
+                  "MC", "MI", "NR", "PA", "PI",
+                  "PN", "PS", "RID", "SI", "SU",
+                  "TA", "VR")

     rdo <- dupe_output[, !(names(dupe_output) %in% dropnames)]

From c4294d5fdf6bbd039eeefb01b57a0e143d4b316e Mon Sep 17 00:00:00 2001
From: embruna
Date: Fri, 9 Aug 2024 13:03:07 -0400
Subject: [PATCH 2/7] updating references_read with new field codes, deleting
 old ones; added notes to reflect these changes

---
 NEWS.md             | 15 +++++++++++++++
 R/references_read.R | 35 ++++++++++++++++++-----------------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index cfa2411..6ca0185 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,20 @@
 # refsplitr News

+
+refsplitr 1.X.X (2024-XX-XX)
+=========================
+
+### NEW FEATURES
+
+ * `references_read` now extracts additional fields from Web of Science records: WE (Source Database), C3 (affiliations*), EI (eISSN) and RID (the original version of the Thomson-Reuters ResearcherID (RI); authors of some older publications might have an RID but not an RI). To include these in the output of `references_read`, use the setting `include_all=TRUE`.
+
+ *a single cell with a list of all affiliations, not broken down by author, to match the Scopus format
+ * `references_read` no longer extracts some rarely used field codes: GE, LT, MC, MI, and TA
+
+ * The Document Type (DT), Keywords Plus (ID), Issue (IS), ISO abbreviated source code (JI), and number of references cited in an article (NR) are now returned by default (`include_all=FALSE`).
+
+
 refsplitr 1.0.1 (2024-07-23)
 =========================

diff --git a/R/references_read.R b/R/references_read.R
index 1ea35fb..c0551fa 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -12,8 +12,8 @@
 #' all files in the folder will be imported). Defaults to FALSE.
 #' @param include_all if FALSE only a subset of commonly used fields from references records are imported.
 #' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
-#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
-#' LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.
+#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, FX, GA, J9,
+#' LA, PA, PI, PN, PS, RID, SU, VR.
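+#' As a brief illustration (the folder path here is hypothetical), a call that
+#' imports a directory of Web of Science export files and keeps every field
+#' might look like: `refs <- references_read(data = "./data/wos_exports", dir = TRUE, include_all = TRUE)`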
 #' @export references_read
 #'
 #' @examples
@@ -41,6 +41,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "CA" = character(0),
     "BP" = character(0),
     "C1" = character(0),
+    "C3" = character(0),
     "CC" = character(0),
     "CH" = character(0),
     "CL" = character(0),
@@ -58,15 +59,15 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "FU" = character(0),
     "FX" = character(0),
     "GA" = character(0),
-    "GE" = character(0),
+    # "GE" = character(0), (removed by EB Sept 2024)
     "ID" = character(0),
     "IS" = character(0),
     "J9" = character(0),
     "JI" = character(0),
     "LA" = character(0),
-    "LT" = character(0),
-    "MC" = character(0),
-    "MI" = character(0),
+    # "LT" = character(0),
+    # "MC" = character(0),
+    # "MI" = character(0),
     "NR" = character(0),
     "PA" = character(0),
     "PD" = character(0),
@@ -77,18 +78,19 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "PT" = character(0),
     "PU" = character(0),
     "PY" = character(0),
-    "RI" = character(0), # NEW field code for Thomson-Reuters ResearcherID
-    "RID" = character(0), # OLD field code for Thomson-Reuters ResearcherID
-    # Older searchers will have RID, not RI ACTUALLY LOOK SL IKE NOT
-    "OI" = character(0), # New field code for ORCID ID (added EB Jan 2017)
-    "PM" = character(0), # Pubmed ID Number (added by EB 3 dec 2017)
+    "RI" = character(0), # New field code for Thomson-Reuters ResearcherID
+    "RID" = character(0), # Original field code for Thomson-Reuters ResearcherID
+    # Records from older searches will have RID (added by EB Sept 2024)
+    "OI" = character(0), # Field code for ORCID ID (added by EB Jan 2017)
+    "PM" = character(0), # Pubmed ID Number (added by EB Dec 2017)
     "RP" = character(0),
     "SC" = character(0),
     "SI" = character(0),
     "SN" = character(0),
+    "EI" = character(0),
     "SO" = character(0),
     "SU" = character(0),
-    "TA" = character(0),
+    # "TA" = character(0), (removed by EB Sept 2024)
     "TC" = character(0),
     "TI" = character(0),
     "UT" = character(0),
@@ -97,6 +99,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "WC" = character(0),
     "Z9" = character(0),
     "AR" = character(0),
+    "WE" = character(0),
     stringsAsFactors = FALSE
   )

@@ -281,11 +284,11 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {

   if (include_all != TRUE){

     dropnames <- c("CC", "CH", "CL", "CT", "CY",
-                  "DT", "FX", "GA", "GE", "ID",
-                  "IS", "J9", "JI", "LA", "LT",
-                  "MC", "MI", "NR", "PA", "PI",
-                  "PN", "PS", "RID", "SI", "SU",
-                  "TA", "VR")
+                  "FX", "GA", "GE", "J9", "LA",
+                  "PA", "PI", "PN", "PS", "RID",
+                  "SI", "SU", "VR")

     rdo <- dupe_output[, !(names(dupe_output) %in% dropnames)]

From 006b2ec64ef81972980e61d9ccd8e73f9f1678f8 Mon Sep 17 00:00:00 2001
From: embruna
Date: Fri, 9 Aug 2024 13:03:36 -0400
Subject: [PATCH 3/7] adding draft function for references_read of Scopus files
 downloaded as csv

---
 R/references_read_scopus_download.R | 570 ++++++++++++++++++++++++++++
 1 file changed, 570 insertions(+)
 create mode 100644 R/references_read_scopus_download.R

diff --git a/R/references_read_scopus_download.R b/R/references_read_scopus_download.R
new file mode 100644
index 0000000..91d2827
--- /dev/null
+++ b/R/references_read_scopus_download.R
@@ -0,0 +1,570 @@
+#' Reads SCOPUS download Output
+##' Scopus reference records downloaded directly from Scopus in .csv format
+#'
+#' \code{references_read_scopus_download} This function reads Scopus
+#' reference data files downloaded directly as csv into an R-friendly data format. The resulting dataframe
+#' is the argument for the refsplitr function `authors_clean()`.
+#'
+#' @param data the location of the file or files to be imported. This can be either the absolute or
+#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
+#' used in conjunction with `dir = TRUE`). If left blank it is assumed the location is the working directory.
+#' @param dir if FALSE it is assumed a single file is to be imported.
+#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=`;
+#' all files in the folder will be imported). Defaults to FALSE.
+#' @param include_all if FALSE only a subset of commonly used fields from reference records are imported.
+#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
+#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
+#' LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.
+#' @export references_read_scopus_download
+#'
+#' @examples
+#' ## If a single file is being imported from a folder called "data" located in an RStudio Project:
+#' ## imported_refs<-references_read_scopus_download(data = './data/refs.csv', dir = FALSE, include_all=FALSE)
+#'
+#' ## If multiple files are being imported from a folder named "heliconia" nested within a folder
+#' ## called "data" located in an RStudio Project:
+#' ## heliconia_refs<-references_read_scopus_download(data = './data/heliconia', dir = TRUE, include_all=FALSE)
+#'
+#' ## To load the Scopus download records used in the examples in the documentation
+#' scopus_api_data <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
+#' scopus_api_example <- references_read_scopus_download(scopus_api_data)
+#'
+#'
+
+# SEE IMPORTANT NOTES ON ORIGINAL 265 and 295 IN TROPICAL SCIENTOMETRIX
+
+
+library(tidyverse)
+# read & standardize: SCOPUS papers ---------------------------------------
+
+file_list <- list.files(path='./data/for_testing_updates/scopus_csv',
+                        full.names = TRUE)
+
+references_read_scopus_download <- function(data = ".", dir = FALSE, include_all=FALSE) {
+  ## NOTE: The fields stored in our output table are a combination of the
+  ## "Thomson Reuters Web of Knowledge" FN format and the "ISI Export
+  ## Format" both of which are version 1.0:
+  output <- data.frame(
+    "filename" = character(0),
+    "AB" = character(0),
+    "AF" = character(0),
+    "AU" = character(0),
+    "CA" = character(0),
+    "BP" = character(0),
+    "C1" = character(0),
+    "C3" = character(0),
+    "CC" = character(0),
+    "CH" = character(0),
+    "CL" = character(0),
+    "CR" = character(0),
+    "CT" = character(0),
+    "CY" = character(0),
+    "DE" = character(0),
+    "DI" = character(0),
+    "DT" = character(0),
+    # "EF" = character(0), ## End file
+    "EM" = character(0),
+    "EP" = character(0),
+    # "ER" = character(0), ## End record
+    "FN" = character(0),
+    "FU" = character(0),
+    "FX" = character(0),
+    "GA" = character(0),
+    # "GE" = character(0), (removed by EB Sept 2024)
+    "ID" = character(0),
+    "IS" = character(0),
+    "J9" = character(0),
+    "JI" = character(0),
+    "LA" = character(0),
+    # "LT" = character(0),
+    # "MC" = character(0),
+    # "MI" = character(0),
+    "NR" = character(0),
+    "PA" = character(0),
+    "PD" = character(0),
+    "PG" = character(0),
+    "PI" = character(0),
+    "PN" = character(0),
+    "PS" = character(0),
+    "PT" = character(0),
+    "PU" = character(0),
+    "PY" = character(0),
+    "RI" = character(0), # New field code for Thomson-Reuters ResearcherID
+    "RID" = character(0), # Original field code for Thomson-Reuters ResearcherID
+    # Records from older searches will have RID (added by EB Sept 2024)
+    "OI" = character(0), # Field code for ORCID ID (added by EB Jan 2017)
+    "PM" = character(0), # Pubmed ID Number (added by EB Dec 2017)
+    "RP" = character(0),
+    "SC" = character(0),
+    "SI" = character(0),
+    "SN" = character(0),
+    "EI" = character(0),
+    "SO" = character(0),
+    "SU" = character(0),
+    # "TA" = character(0), (removed by EB Sept 2024)
+    "TC" = character(0),
+    "TI" = character(0),
+    "UT" = character(0),
+    "VR" = character(0),
+    "VL" = character(0),
+    "WC" = character(0),
+    "Z9" = character(0),
+    "AR" = character(0),
+    "WE" = character(0),
+    stringsAsFactors = FALSE
+  )
+
+
+  ## This is an index for the current record, it gets iterated for each
+  # record we advance through:
+  i <- 1
+  if (dir) {
+    file_list <- dir(path = data)
+  } else {
+    file_list <- data
+  }
+
+
+
+  ## Strip out any files in the directory that aren't Web of Knowledge files:
+  file_list <- file_list[ grep(".ciw|.csv", file_list) ]
+
+  if (length(file_list) == 0) {
+    stop("ERROR: The specified file or directory does not contain any
+         Scopus download records in .csv format!")
+  }
+  message("Now processing all references files")
+
+
+
+  # for (filename in file_list) {
+  #   if (dir) {
+  #     in_file <- file(paste0(data, "/", filename), "r")
+  #   }
+  #   if (!dir) {
+  #     in_file <- file(filename, "r")
+  #   }
+  #
+  #   field <- ""
+  #
+  #   ## Process the first line to determine what file type it is:
+  #   ## NOTE: We could add the encoding="UTF-8" flag to the readLines in
+  #   ##   order to remove the byte-order mark (BOM) from some exported
+  #   ##   files coming out of ISI, but there seems to be a bug in the
+  #   ##   readLines() function after bringing a UTF-8 file in, in that
+  #   ##   it doesn't respect the BOM characters.
So we'll just read + # ## the files in with no encoding specified and strip the BOM if + # + # read_line <- readLines(in_file, n = 1, warn = FALSE) + # + # if (length(read_line) > 0) { + # + # read_line <- gsub("^[^A-Z]*([A-Z]+)(.*)$", "\\1\\2", read_line) + # + # ## Strip the first two characters from the text line, + # # skip the third (should be a space) and store the rest: + # pre_text <- substr(read_line, 1, 2) + # line_text <- substr(read_line, 4, nchar(read_line)) + # + # if (pre_text != "FN") { + # close(in_file) + # error <- paste0("ERROR: The file ", + # filename, + # " doesn't appear to be a valid ISI or + # Thomson Reuters reference library file!") + # stop(error) + # } + # + # ## Check to see if this is a "ISI Export Format" file, in which + # ## case we need to parse out the first line into three fields: + # if (substr(line_text, 1, 3) == "ISI") { + # field <- pre_text + # + # ## Pull apart the FN, VR and PT fields all contained on the first + # ## line of the ISI file format: + # matches <- regexec( + # "^(.*) VR (.*) PT (.*)", + # line_text + # ) + # + # match_strings <- regmatches( + # line_text, + # matches + # ) + # + # ## Store those fields: + # output[i, "FN"] <- paste(match_strings[[1]][2], "\n", sep = "") + # + # output[i, "VR"] <- paste(match_strings[[1]][3], "\n", sep = "") + # + # output[i, "PT"] <- paste(match_strings[[1]][4], "\n", sep = "") + # } else { + # ## If this is not an ISI export format then just parse the first + # ## line normally into the FN field: + # field <- pre_text + # if (field %in% names(output)) { + # output[i, field] <- "" + # output[i, field] <- trimws( + # ifelse(length(line_text) == 1, + # paste(output[i, field], line_text, + # sep = "\n")), + # "both") + # } + # } + # } else { + # utils::flush.console() + # stop("WARNING: Nothing contained in the specified file!") + # } + # + # ## Process the remaining lines in the file (see the note above about + # ## the encoding= flag and necessity for it, but why we didn't use it): + # while (length(read_line <- readLines(in_file, n = 1, warn = FALSE)) > 0) { + # ## Strip the first three characters from the text line: + # pre_text <- substr(read_line, 1, 2) + # + # line_text <- substr(read_line, 4, nchar(read_line)) + # + # ## Check to see if this is a new field: + # if (pre_text != " ") { + # field <- pre_text + # ## If the field is in our file and in our data structure then + # ## initialize it to an empty string: + # if (field %in% names(output)) { + # output[i, field] <- "" + # } + # } + # + # ## Check to see if the current field is one we are saving to output: + # if (field %in% names(output)) { + # ##... 
if it is then append this line's data to the field in our output:
+  #
+  #       output[i, field] <- trimws(
+  #         ifelse(length(line_text) == 1,
+  #                paste(output[i, field], line_text, sep = "\n")),
+  #         "both")
+  #     }
+  #
+  #     # If this is the end of a record then add any per-record items and
+  #     # advance our row:
+  #     if (field == "ER") {
+  #       output[i, "filename"] <- filename
+  #
+  #       ## These fields are not repeated for every record, so we set them
+  #       ## from the first record where they were recorded:
+  #
+  #       output[i, "FN"] <- output[1, "FN"]
+  #       output[i, "VR"] <- output[1, "VR"]
+  #
+  #       i <- i + 1
+  #     }
+  #   }
+  #
+  #   close(in_file)
+  #   ############################### Clock######################################
+  #   total <- length(file_list)
+  #   pb <- utils::txtProgressBar(min = 0, max = total, style = 3)
+  #   utils::setTxtProgressBar(pb, counter)
+  #   counter <- counter + 1
+  #   utils::flush.console()
+  #   ###########################################################################
+  # }
+  #
+  #
+output <- file_list %>%
+  lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once
+  bind_rows(.id = "csv_id") %>% # bind all tables into one object, and give id for each
+  rename_all(~str_replace_all(.,"prism","")) %>%
+  rename_all(~str_replace_all(.,"dc","")) %>%
+  rename_all(~str_replace_all(.,"\\:","")) %>%
+  # mutate(PY=str_sub(coverDate, 1, 4)) %>%
+  rename("SO"="Source title",
+         "AB"="Abstract",
+         "DI"="DOI",
+         "DT"="Document Type",
+         "VL"="Volume",
+         "filename"="csv_id",
+         "TI"="Title",
+         "PY"="Year",
+         # "SO"="Source title", (duplicate entry removed)
+         # "VL"="Volume", (duplicate entry removed)
+         "IS"="Issue",
+         "AR"="Art. No.",
+         "BP"="Page start",
+         "EP"="Page end",
+         "PG"="Page count",
+         # "DI"="DOI", (duplicate entry removed)
+         "DE"="Author Keywords",
+         "RP"="Correspondence Address",
+         "BE"="Editors",
+         "PU"="Publisher",
+         "SN"="ISSN",
+         "BN"="ISBN",
+         "PM"="PubMed ID",
+         "LA"="Language of Original Document",
+         "JI"="Abbreviated Source Title",
+         # "DT"="Document Type", (duplicate entry removed)
+         "OA"="Open Access",
+         "UT"="EID",
+         "TC"="Cited by",
+         "ID"="Index Keywords",
+         "FU"="Funding Details",
+         "FX"="Funding Texts",
+         "PubStage"="Publication Stage",
+         "CODEN"="CODEN",
+         "WE"="Source",
+         "URL"="Link",
+         "C3"="Affiliations", # in WOS csv download it is also "affiliations"
+         "C1"="Authors with affiliations", #"Addresses" in WOS csv downloads
+         "AU"="Authors",
+         "AF"="Author full names",
+         "SID"="Author(s) ID" #scopus ID number
+  ) %>%
+  mutate(SO=tolower(SO)) %>%
+  distinct() %>%
+  filter(if_any(everything(), is.na)) %>%
+  filter(!is.na(URL)) %>%
+  mutate(PG=str_replace_all(PG,"�","")) %>%
+  mutate(PG=str_replace_all(PG,"E-","E")) %>%
+  # separate(PG,c("BP","EP"),remove=FALSE,sep="-",extra="merge") %>%
+  unite(FU,FX,sep="-",na.rm=TRUE,remove = TRUE) %>%
+  mutate(refID = row_number(),.before=1) %>%
+  unite("refID",refID:filename,sep="-",na.rm=TRUE,remove=FALSE) %>%
+  mutate(DE=str_replace_all(DE,"\\|",";")) %>%
+  # select(-"@_fa",
+  #        -"coverDisplayDate",
+  #        -"aggregationType",
+  #        -"author-count.@limit",
+  #        -"openaccess",
+  #        -"freetoread.value.$",
+  #        -"freetoreadLabel.value.$",
+  #        # -"pii",
+  #        -'author-count.$',
+  #        -"coverDate",
+  #        # -"error",
+  #        -"eid",
+  #        -"url",
+  #        -"pageRange",
+  #        -article_type_long,
+  # ) %>%
+  # mutate_all(tolower) %>%
+  mutate_all(trimws)
+
+
+  output<-output %>% distinct(DI,TI,.keep_all = TRUE)
+  output
+
+write_csv(output,file="./data/for_testing_updates/scopus_api/scopus_papers.csv")
+
+# names(scopus_papers)
+# unique(scopus_papers$SO)
+# names(scopus_papers)
+# head(scopus_papers)
+# unique(scopus_papers$DE)[999]
+
+#
unique(scopus_papers$journal) + + +scopus_refs<-scopus_papers %>% + group_by(SO,PY) %>% + tally() + + +scopus_papers %>% + group_by(SO) %>% + tally() + + + + +# read & standardize - SCOPUS affiliations -------------------------------- + + +scopus_affils <- list.files(path='./data/for_testing_updates/scopus_api/affils', + full.names = TRUE) %>% + lapply(read_csv,col_types = cols(.default = "c")) %>% + bind_rows %>% + select(-"@_fa") %>% + mutate(affilname =str_replace_all(affilname, "\\,", "")) %>% + select(-'affiliation-url', + # -"entry_number" + ) %>% + distinct() %>% + mutate(C1=paste(affilname,`affiliation-city`,`affiliation-country`,sep=", ")) %>% + # mutate(C1=paste("[", C1,"]",sep="")) %>% + relocate(C1, .after = afid) %>% + rename("university" = "affilname", + "city" = "affiliation-city", + "country" = "affiliation-country") %>% + # mutate_all(tolower) %>% + mutate_all(trimws) + + + +# read & standardize: SCOPUS authors -------------------------------------- + +scopus_authors <- list.files(path='./data/for_testing_updates/scopus_api/authors', + full.names = TRUE) + +scopus_authors <- scopus_authors %>% + lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once + bind_rows(.id = "csv_id") %>% # bind all tables into one object, and give id for each + mutate(author_key = dplyr::row_number()) %>% + # left_join(scopus_authors2) %>% # join month column created earlier + select(-"@_fa", + -"afid.@_fa") %>% + rename("author_order"="@seq", + "afid"="afid.$", + "first_name"="given-name", + "author_url"="author-url") %>% + # relocate(author_key,SO,PY,entry_number, .before="author_order") %>% + relocate(author_key, .before="author_order") %>% + mutate(first_name = str_replace_all(first_name, "\\. " ,"")) %>% + mutate(first_name = str_replace_all(first_name, "\\." ,"")) %>% + mutate(initials = str_replace_all(initials, "\\. " ,"")) %>% + mutate(initials = str_replace_all(initials, "\\." ,"")) %>% + mutate(last_init = paste(surname, initials, sep=" ")) %>% + mutate(authname = paste(surname, first_name,sep= ", ")) %>% + relocate(authid,authname,last_init, .after="author_key") %>% + rename("author_count"="author_order") %>% + # mutate_all(tolower) %>% + mutate_all(trimws) %>% + unite(refID,csv_id,sep="-",na.rm=TRUE,remove=FALSE) + +scopus_authors +write_csv(scopus_authors,'./data/for_testing_updates/scopus_api/scopus_authors.csv') +rm(scopus_authors2) + + +names(scopus_affils) +# add the affiliations to authors ----------------------------------------- + +# scopus_authors_affils<-scopus_authors %>% left_join(scopus_affils) +names(scopus_authors) +names(scopus_affils) +# Add the name to C1 to make it consistent with WOS +# excludes secondary/current +scopus_authors_affils<-scopus_authors %>% + left_join(scopus_affils) %>% + select(-author_url) %>% + # relocate(SO,.before="PY") %>% + mutate(first_name = str_replace_all(first_name, "\\. " ,"\\.")) %>% + mutate(last_init = paste(surname, initials, sep=" ")) %>% + mutate(authname = paste(surname, first_name,sep= ", ")) %>% + mutate(C1 = paste("[",authname,"] ", C1 ,sep= "")) %>% + # distinct(SO,PY,entry_number,authid,afid,.keep_all = TRUE)%>% + distinct(authid,afid,.keep_all = TRUE)%>% + # mutate_all(tolower) %>% + mutate_all(trimws) %>% + mutate(C1=gsub("Second Author, ","No Inst Given, ",C1)) + +# unique(scopus_authors_affils$C1) + +# are there any remaining? 
+scopus_authors_affils %>% filter(is.na(author_key)) + +write_csv(scopus_authors_affils,'./data/for_testing_updates/scopus_api/scopus_authors_affils.csv') + +# add the authors and affiliations to papers ------------------------------- + +names(scopus_authors_affils) +# author_affils in WIDE FORMAT (authors for each paper) +scopus_authors_affils_wide<-scopus_authors_affils %>% + select( + -"university", + -"city", + -"country", + -"last_init", + -"authid", + # -"C1", + - "afid", + -"surname" , + -"first_name" , + # -"entry_number", + -author_key, + -authname, + # # -SO, + # # -PY, + -"initials") %>% + pivot_wider(names_from = author_count, + id_cols=refID, + values_from = C1, + names_prefix = "C") %>% + # BE SURE TO CHECK HOW MANY C COLUMNS AND EDIT BELOW + unite("C1", C1:last(matches("C[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="./") %>% + mutate(C1 = str_replace_all(C1, " \\[na, na, na]" ," \\[missing]")) +# head(scopus_authors_affils_wide$C1,40) +names(scopus_authors_affils_wide) + + + +# select(scopus_authors_affils, matches("C[[:digit:]]")) +# select(scopus_authors_affils, last(matches("C[[:digit:]]"))) +# scopus_authors_affils$C2<-NULL +# scopus_authors_affils$C3<-NULL +scopus_article_authors_wide<-scopus_authors_affils %>% + distinct() %>% + ungroup() %>% + select(-"C1", + -"university", + -"city", + -"country", + -"last_init", + -"authid", + - "afid", + -"surname" , + -"first_name" , + # -"entry_number", + # -authname, + -author_key, + # -SO, + # -PY, + -"initials") %>% + mutate(orcid=paste(authname,"/",orcid,";",sep="")) %>% + pivot_wider(names_from = author_count, + values_from = c("authname","orcid"), + id_cols=refID, + # names_prefix = c("AF","OI") + ) %>% + rename_with(~ gsub("authname_", "AF", .x, fixed = TRUE)) %>% + rename_with(~ gsub("orcid_", "OI", .x, fixed = TRUE)) %>% + # CHECK HOW MANY AUTHOR COLUMNS + unite("AF", AF1:last(matches("AF[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="; ") %>% + unite("OI", OI1:last(matches("OI[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="") +# mutate(C1 = str_replace_all(C1, " \\[NA, NA, NA]" ," \\[MISSING]")) +# head(scopus_allauthors_wide$AF,40) +names(scopus_article_authors_wide) + + + + +scopus_papers_complete <- scopus_papers %>% + left_join(scopus_authors_affils_wide) %>% + left_join(scopus_article_authors_wide) %>% + rename("filename"="csv_id") + + +# final tweaks to make it possible to use refsplitr on scopus ------------- +names(scopus_papers_complete) +scopus_papers_complete<-scopus_papers_complete %>% + # mutate(AU=if_else(is.na(AU), AF, AU)) %>% + mutate(AU=AF) %>% + # mutate(refID=gsub("scopus_", "", refID)) %>% + mutate(AF=gsub("\\.", "", AF)) %>% + mutate(AU=gsub("\\.", "", AU)) %>% + mutate(AF=gsub("\\;", "\n", AF)) %>% + mutate(AU=gsub("\\;", "\n", AU)) %>% + mutate(AF=gsub("\n ", "\n", AF)) %>% + mutate(AU=gsub("\n ;", "\n", AU)) %>% + # mutate(orcid=paste0(creator, " /", orcid)) %>% # authors clean requires orcid be in wos format "[name]/orcid" + relocate(SO,PY,AF,C1,DI,TI,VL,BP,EP,.after="filename") + +# remove the last semicolon from every orcid id (last character) +scopus_papers_complete<-scopus_papers_complete %>% + mutate(OI=str_sub(OI,end=-2)) +# scopus_papers_complete$EM<-NA + # mutate(refID=as.numeric(refID)*1000) + + + +write_csv(scopus_papers_complete,"./data/for_testing_updates/scopus_api/scopus_refs.csv") +# save csv ---------------------------------------------------------------- From 3c85734403ceb0a07acb2e745ede3b4611305ad9 Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 9 Aug 2024 13:04:05 -0400 
Subject: [PATCH 4/7] adding draft code for references_read from scopus api

---
 R/references_read_scopus_api.R | 318 +++++++++++++++++++++++++++
 1 file changed, 318 insertions(+)
 create mode 100644 R/references_read_scopus_api.R

diff --git a/R/references_read_scopus_api.R b/R/references_read_scopus_api.R
new file mode 100644
index 0000000..f9e75c7
--- /dev/null
+++ b/R/references_read_scopus_api.R
@@ -0,0 +1,318 @@
+#' Reads SCOPUS API Output
+##' Scopus reference records retrieved via the Scopus API
+#'
+#' \code{references_read_scopus_api} This function reads Scopus
+#' reference data files downloaded via API into an R-friendly data format. The resulting dataframe
+#' is the argument for the refsplitr function `authors_clean()`.
+#'
+#' @param data the location of the file or files to be imported. This can be either the absolute or
+#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
+#' used in conjunction with `dir = TRUE`). If left blank it is assumed the location is the working directory.
+#' @param dir if FALSE it is assumed a single file is to be imported.
+#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=`;
+#' all files in the folder will be imported). Defaults to FALSE.
+#' @param include_all if FALSE only a subset of commonly used fields from reference records are imported.
+#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
+#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
+#' LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.
+#' @export references_read_scopus_api
+#'
+#' @examples
+#' ## If a single file is being imported from a folder called "data" located in an RStudio Project:
+#' ## imported_refs<-references_read_scopus_api(data = './data/refs.txt', dir = FALSE, include_all=FALSE)
+#'
+#' ## If multiple files are being imported from a folder named "heliconia" nested within a folder
+#' ## called "data" located in an RStudio Project:
+#' ## heliconia_refs<-references_read_scopus_api(data = './data/heliconia', dir = TRUE, include_all=FALSE)
+#'
+#' ## To load the Scopus API records used in the examples in the documentation
+#' scopus_api_data <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
+#' scopus_api_example <- references_read_scopus_api(scopus_api_data)
+#'
+#'
+
+# SEE IMPORTANT NOTES ON ORIGINAL 265 and 295 IN TROPICAL SCIENTOMETRIX
+
+
+library(tidyverse)
+# read & standardize: SCOPUS papers ---------------------------------------
+
+scopus_papers <- list.files(path='./data/for_testing_updates/scopus_api/papers',
+                            full.names = TRUE)
+
+scopus_papers <- scopus_papers %>%
+  lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once
+  bind_rows(.id = "filename") %>% # bind all tables into one object, and give id for each
+  rename_all(~str_replace_all(.,"prism","")) %>%
+  rename_all(~str_replace_all(.,"dc","")) %>%
+  rename_all(~str_replace_all(.,"\\:","")) %>%
+  mutate(PY=str_sub(coverDate, 1, 4)) %>%
+  rename("SO"="publicationName",
+         "AB"="description",
+         "DI"="doi",
+         "DT"="subtype",
+         "VL"="volume",
+         "article_type_long"="subtypeDescription",
+         "author_count"='author-count.@total',
+         "IS"='issueIdentifier',
+         "SN"="issn",
+         "EI"="eIssn",
+         "DE"="authkeywords",
+         "PM"="pubmed-id",
+         "TC"="citedby-count",
+         "TI"="title",
+         # "UT"="url",
+
"fund_acr"="fund-acr", + "fund_no"="fund-no", + "fund_sp"="fund-sponsor", + "UT"="identifier", + "OA"="openaccessFlag" + ) %>% + mutate(SO=tolower(SO)) %>% + distinct() %>% + filter(if_any(everything(), is.na)) %>% + filter(!is.na(url)) %>% + # mutate(pub_number = row_number()) %>% + mutate(pageRange=str_replace_all(pageRange,"�","")) %>% + mutate(pageRange=str_replace_all(pageRange,"E-","E")) %>% + separate(pageRange,c("BP","EP"),remove=FALSE,sep="-",extra="merge") %>% + unite(FU,fund_acr,fund_sp,fund_no, sep="-",na.rm=TRUE,remove = TRUE) %>% + unite(refID,filename,entry_number,sep="-",na.rm=TRUE,remove=FALSE) %>% + mutate(DE=str_replace_all(DE,"\\|",";")) %>% + select(-"@_fa", + -"coverDisplayDate", + -"aggregationType", + -"author-count.@limit", + -"openaccess", + -"freetoread.value.$", + -"freetoreadLabel.value.$", + # -"pii", + -'author-count.$', + -"coverDate", + # -"error", + -"eid", + -"url", + -"pageRange", + -article_type_long, + ) %>% + # mutate_all(tolower) %>% + mutate_all(trimws) + + +scopus_papers<-scopus_papers %>% distinct(DI,TI,.keep_all = TRUE) +rm(scopus_papers2) + +write_csv(scopus_papers,file="./data/for_testing_updates/scopus_api/scopus_papers.csv") + +# names(scopus_papers) +# unique(scopus_papers$SO) +# names(scopus_papers) +# head(scopus_papers) +# unique(scopus_papers$DE)[999] + +# unique(scopus_papers$journal) + + +scopus_refs<-scopus_papers %>% + group_by(SO,PY) %>% + tally() + + +scopus_papers %>% + group_by(SO) %>% + tally() + + + + +# read & standardize - SCOPUS affiliations -------------------------------- + + +scopus_affils <- list.files(path='./data/for_testing_updates/scopus_api/affils', + full.names = TRUE) %>% + lapply(read_csv,col_types = cols(.default = "c")) %>% + bind_rows %>% + select(-"@_fa") %>% + mutate(affilname =str_replace_all(affilname, "\\,", "")) %>% + select(-'affiliation-url', + # -"entry_number" + ) %>% + distinct() %>% + mutate(C1=paste(affilname,`affiliation-city`,`affiliation-country`,sep=", ")) %>% + # mutate(C1=paste("[", C1,"]",sep="")) %>% + relocate(C1, .after = afid) %>% + rename("university" = "affilname", + "city" = "affiliation-city", + "country" = "affiliation-country") %>% + # mutate_all(tolower) %>% + mutate_all(trimws) + + + +# read & standardize: SCOPUS authors -------------------------------------- + +scopus_authors <- list.files(path='./data/for_testing_updates/scopus_api/authors', + full.names = TRUE) + +scopus_authors <- scopus_authors %>% + lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once + bind_rows(.id = "filename") %>% # bind all tables into one object, and give id for each + mutate(author_key = dplyr::row_number()) %>% + # left_join(scopus_authors2) %>% # join month column created earlier + select(-"@_fa", + -"afid.@_fa") %>% + rename("author_order"="@seq", + "afid"="afid.$", + "first_name"="given-name", + "author_url"="author-url") %>% + # relocate(author_key,SO,PY,entry_number, .before="author_order") %>% + relocate(author_key,entry_number, .before="author_order") %>% + mutate(first_name = str_replace_all(first_name, "\\. " ,"")) %>% + mutate(first_name = str_replace_all(first_name, "\\." ,"")) %>% + mutate(initials = str_replace_all(initials, "\\. " ,"")) %>% + mutate(initials = str_replace_all(initials, "\\." 
,"")) %>% + mutate(last_init = paste(surname, initials, sep=" ")) %>% + mutate(authname = paste(surname, first_name,sep= ", ")) %>% + relocate(authid,authname,last_init, .after="author_key") %>% + rename("author_count"="author_order") %>% + # mutate_all(tolower) %>% + mutate_all(trimws) %>% + unite(refID,filename,entry_number,sep="-",na.rm=TRUE,remove=FALSE) + +scopus_authors +write_csv(scopus_authors,'./data/for_testing_updates/scopus_api/scopus_authors.csv') +rm(scopus_authors2) + + +names(scopus_affils) +# add the affiliations to authors ----------------------------------------- + +# scopus_authors_affils<-scopus_authors %>% left_join(scopus_affils) +names(scopus_authors) +names(scopus_affils) +# Add the name to C1 to make it consistent with WOS +# excludes secondary/current +scopus_authors_affils<-scopus_authors %>% + left_join(scopus_affils) %>% + select(-author_url) %>% + # relocate(SO,.before="PY") %>% + mutate(first_name = str_replace_all(first_name, "\\. " ,"\\.")) %>% + mutate(last_init = paste(surname, initials, sep=" ")) %>% + mutate(authname = paste(surname, first_name,sep= ", ")) %>% + mutate(C1 = paste("[",authname,"] ", C1 ,sep= "")) %>% + # distinct(SO,PY,entry_number,authid,afid,.keep_all = TRUE)%>% + distinct(entry_number,authid,afid,.keep_all = TRUE)%>% + # mutate_all(tolower) %>% + mutate_all(trimws) %>% + mutate(C1=gsub("Second Author, ","No Inst Given, ",C1)) + +# unique(scopus_authors_affils$C1) + +# are there any remaining? +scopus_authors_affils %>% filter(is.na(author_key)) + +write_csv(scopus_authors_affils,'./data/for_testing_updates/scopus_api/scopus_authors_affils.csv') + +# add the authors and affiliations to papers ------------------------------- + +names(scopus_authors_affils) +# author_affils in WIDE FORMAT (authors for each paper) +scopus_authors_affils_wide<-scopus_authors_affils %>% + select( + -"university", + -"city", + -"country", + -"last_init", + -"authid", + # -"C1", + - "afid", + -"surname" , + -"first_name" , + # -"entry_number", + -author_key, + -authname, + # # -SO, + # # -PY, + -"initials") %>% + pivot_wider(names_from = author_count, + id_cols=refID, + values_from = C1, + names_prefix = "C") %>% + # BE SURE TO CHECK HOW MANY C COLUMNS AND EDIT BELOW + unite("C1", C1:last(matches("C[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="./") %>% + mutate(C1 = str_replace_all(C1, " \\[na, na, na]" ," \\[missing]")) +# head(scopus_authors_affils_wide$C1,40) +names(scopus_authors_affils_wide) + + + +# select(scopus_authors_affils, matches("C[[:digit:]]")) +# select(scopus_authors_affils, last(matches("C[[:digit:]]"))) +# scopus_authors_affils$C2<-NULL +# scopus_authors_affils$C3<-NULL +scopus_article_authors_wide<-scopus_authors_affils %>% + distinct() %>% + ungroup() %>% + select(-"C1", + -"university", + -"city", + -"country", + -"last_init", + -"authid", + - "afid", + -"surname" , + -"first_name" , + # -"entry_number", + # -authname, + -author_key, + # -SO, + # -PY, + -"initials") %>% + mutate(orcid=paste(authname,"/",orcid,";",sep="")) %>% + pivot_wider(names_from = author_count, + values_from = c("authname","orcid"), + id_cols=refID, + # names_prefix = c("AF","OI") + ) %>% + rename_with(~ gsub("authname_", "AF", .x, fixed = TRUE)) %>% + rename_with(~ gsub("orcid_", "OI", .x, fixed = TRUE)) %>% + # CHECK HOW MANY AUTHOR COLUMNS + unite("AF", AF1:last(matches("AF[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="; ") %>% + unite("OI", OI1:last(matches("OI[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="") +# mutate(C1 = str_replace_all(C1, " \\[NA, NA, 
NA]" ," \\[MISSING]")) +# head(scopus_allauthors_wide$AF,40) +names(scopus_article_authors_wide) + + + + +scopus_papers_complete <- scopus_papers %>% + left_join(scopus_authors_affils_wide) %>% + left_join(scopus_article_authors_wide) + + +# final tweaks to make it possible to use refsplitr on scopus ------------- +names(scopus_papers_complete) +scopus_papers_complete<-scopus_papers_complete %>% + # mutate(AU=if_else(is.na(AU), AF, AU)) %>% + mutate(AU=AF) %>% + # mutate(refID=gsub("scopus_", "", refID)) %>% + mutate(AF=gsub("\\.", "", AF)) %>% + mutate(AU=gsub("\\.", "", AU)) %>% + mutate(AF=gsub("\\;", "\n", AF)) %>% + mutate(AU=gsub("\\;", "\n", AU)) %>% + mutate(AF=gsub("\n ", "\n", AF)) %>% + mutate(AU=gsub("\n ;", "\n", AU)) %>% + # mutate(orcid=paste0(creator, " /", orcid)) %>% # authors clean requires orcid be in wos format "[name]/orcid" + relocate(SO,PY,AF,C1,DI,TI,VL,BP,EP,.after="filename") + +# remove the last semicolon from every orcid id (last character) +scopus_papers_complete<-scopus_papers_complete %>% + mutate(OI=str_sub(OI,end=-2)) +# scopus_papers_complete$EM<-NA + # mutate(refID=as.numeric(refID)*1000) + + + +write_csv(scopus_papers_complete,"./data/for_testing_updates/scopus_api/scopus_refs.csv") +# save csv ---------------------------------------------------------------- From 626b64ad679daec27f380574eff02404a2acf252 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 12 Aug 2024 10:54:20 -0400 Subject: [PATCH 5/7] cleanup to merge --- NEWS.md | 9 +- R/plot_net_country.R | 5 - R/references_read.R | 3 +- R/references_read_scopus_api.R | 318 ---------------- R/references_read_scopus_download.R | 570 ---------------------------- 5 files changed, 6 insertions(+), 899 deletions(-) delete mode 100644 R/references_read_scopus_api.R delete mode 100644 R/references_read_scopus_download.R diff --git a/NEWS.md b/NEWS.md index 6ca0185..ea7a4d7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,18 +1,17 @@ # refsplitr News -refsplitr 1.X.X (2024-XX-XX) +refsplitr 1.0.2 (2024-08-12) ========================= ### NEW FEATURES - * `references_read` now extracts additional fields from Web of Science records: WE (Source Database), C3 (affiliations*), EI (eISSN) and RID (the original version of the Thomson-Reuters ResearcherID (RI); authors of some older publications might have an RID but not an RI). To include these in the output of`references_read` use the setting `include_all=TRUE`. - *a single cell with list of all affiliations, not brtoken down by author. to match scopus + * `references_read` now extracts additional fields from Web of Science records: WE (Source Database), C3 (all author affiliations, equivalent to the Scopus `affiliations` field code), EI (eISSN), OA (Open Access), and RID (the original version of the Thomson-Reuters ResearcherID (RI); authors of some older publications might have an RID but not an RI). These are not included in the default output of `references_read`, to include them use `include_all = TRUE`. + * `references_read` no longer extracts some rarely used field codes: GE, LT, MC, MI, and TA - * the Document Type (DT), Keywords Plus (ID), Issue (IS), ISO abbreviated source code (JI), and number of references cited in an article (NR) are now returned by default (`include_all=FALSE`). - + * The following field codes are now returned by default when using `references_read`: DT (Document Type), ID (Keywords Plus), IS (Issue), JI (ISO abbreviated source code), and NR (number of references cited by the article). 
 refsplitr 1.0.1 (2024-07-23)
 =========================

diff --git a/R/plot_net_country.R b/R/plot_net_country.R
index a1d8f71..1a16d47 100644
--- a/R/plot_net_country.R
+++ b/R/plot_net_country.R
@@ -57,11 +57,6 @@ plot_net_country <- function(data,

   data <- data[!is.na(data$country), ]

-
-
-
-
-
   # names in WOS often don't match those in rworldmap'
   data<-data %>%
     dplyr::mutate(country=dplyr::case_when(
diff --git a/R/references_read.R b/R/references_read.R
index c0551fa..524d2f6 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -100,6 +100,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "Z9" = character(0),
     "AR" = character(0),
     "WE" = character(0),
+    "OA" = character(0), # Field code for Open Access (added by EB Sept 2024)
     stringsAsFactors = FALSE
   )

@@ -286,7 +287,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     dropnames <- c("CC", "CH", "CL", "CT", "CY",
                   "FX", "GA", "GE", "J9", "LA",
                   "PA", "PI", "PN", "PS", "RID",
-                  "SI", "SU", "VR")
+                  "SI", "SU", "VR", "OA")

     rdo <- dupe_output[, !(names(dupe_output) %in% dropnames)]

diff --git a/R/references_read_scopus_api.R b/R/references_read_scopus_api.R
deleted file mode 100644
index f9e75c7..0000000
--- a/R/references_read_scopus_api.R
+++ /dev/null
@@ -1,318 +0,0 @@
-#' Reads SCOPUS API Output
-##' Scopus reference records retrieved via the Scopus API
-#'
-#' \code{references_read_scopus_api} This function reads Scopus
-#' reference data files downloaded via API into an R-friendly data format. The resulting dataframe
-#' is the argument for the refsplitr function `authors_clean()`.
-#'
-#' @param data the location of the file or files to be imported. This can be either the absolute or
-#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
-#' used in conjunction with `dir = TRUE`). If left blank it is assumed the location is the working directory.
-#' @param dir if FALSE it is assumed a single file is to be imported.
-#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=`;
-#' all files in the folder will be imported). Defaults to FALSE.
-#' @param include_all if FALSE only a subset of commonly used fields from reference records are imported.
-#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
-#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
-#' LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.
-#' @export references_read_scopus_api
-#'
-#' @examples
-#' ## If a single file is being imported from a folder called "data" located in an RStudio Project:
-#' ## imported_refs<-references_read_scopus_api(data = './data/refs.txt', dir = FALSE, include_all=FALSE)
-#'
-#' ## If multiple files are being imported from a folder named "heliconia" nested within a folder
-#' ## called "data" located in an RStudio Project:
-#' ## heliconia_refs<-references_read_scopus_api(data = './data/heliconia', dir = TRUE, include_all=FALSE)
-#'
-#' ## To load the Scopus API records used in the examples in the documentation
-#' scopus_api_data <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
-#' scopus_api_example <- references_read_scopus_api(scopus_api_data)
-#'
-#'
-
-# SEE IMPORTANT NOTES ON ORIGINAL 265 and 295 IN TROPICAL SCIENTOMETRIX
-
-
-library(tidyverse)
-# read & standardize: SCOPUS papers ---------------------------------------
-
-scopus_papers <- list.files(path='./data/for_testing_updates/scopus_api/papers',
-                            full.names = TRUE)
-
-scopus_papers <- scopus_papers %>%
-  lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once
-  bind_rows(.id = "filename") %>% # bind all tables into one object, and give id for each
-  rename_all(~str_replace_all(.,"prism","")) %>%
-  rename_all(~str_replace_all(.,"dc","")) %>%
-  rename_all(~str_replace_all(.,"\\:","")) %>%
-  mutate(PY=str_sub(coverDate, 1, 4)) %>%
-  rename("SO"="publicationName",
-         "AB"="description",
-         "DI"="doi",
-         "DT"="subtype",
-         "VL"="volume",
-         "article_type_long"="subtypeDescription",
-         "author_count"='author-count.@total',
-         "IS"='issueIdentifier',
-         "SN"="issn",
-         "EI"="eIssn",
-         "DE"="authkeywords",
-         "PM"="pubmed-id",
-         "TC"="citedby-count",
-         "TI"="title",
-         # "UT"="url",
-         "fund_acr"="fund-acr",
-         "fund_no"="fund-no",
-         "fund_sp"="fund-sponsor",
-         "UT"="identifier",
-         "OA"="openaccessFlag"
-  ) %>%
-  mutate(SO=tolower(SO)) %>%
-  distinct() %>%
-  filter(if_any(everything(), is.na)) %>%
-  filter(!is.na(url)) %>%
-  # mutate(pub_number = row_number()) %>%
-  mutate(pageRange=str_replace_all(pageRange,"�","")) %>%
-  mutate(pageRange=str_replace_all(pageRange,"E-","E")) %>%
-  separate(pageRange,c("BP","EP"),remove=FALSE,sep="-",extra="merge") %>%
-  unite(FU,fund_acr,fund_sp,fund_no, sep="-",na.rm=TRUE,remove = TRUE) %>%
-  unite(refID,filename,entry_number,sep="-",na.rm=TRUE,remove=FALSE) %>%
-  mutate(DE=str_replace_all(DE,"\\|",";")) %>%
-  select(-"@_fa",
-         -"coverDisplayDate",
-         -"aggregationType",
-         -"author-count.@limit",
-         -"openaccess",
-         -"freetoread.value.$",
-         -"freetoreadLabel.value.$",
-         # -"pii",
-         -'author-count.$',
-         -"coverDate",
-         # -"error",
-         -"eid",
-         -"url",
-         -"pageRange",
-         -article_type_long,
-  ) %>%
-  # mutate_all(tolower) %>%
-  mutate_all(trimws)
-
-
-scopus_papers<-scopus_papers %>% distinct(DI,TI,.keep_all = TRUE)
-rm(scopus_papers2)
-
-write_csv(scopus_papers,file="./data/for_testing_updates/scopus_api/scopus_papers.csv")
-
-# names(scopus_papers)
-# unique(scopus_papers$SO)
-# names(scopus_papers)
-# head(scopus_papers)
-# unique(scopus_papers$DE)[999]
-
-# unique(scopus_papers$journal)
-
-
-scopus_refs<-scopus_papers %>%
-  group_by(SO,PY) %>%
-  tally()
-
-
-scopus_papers %>%
-  group_by(SO) %>%
-  tally()
-
-
-
-
-# read & standardize - SCOPUS affiliations --------------------------------
-
-
-scopus_affils <- list.files(path='./data/for_testing_updates/scopus_api/affils',
-                            full.names = TRUE) %>%
-  lapply(read_csv,col_types = cols(.default = "c")) %>%
-  bind_rows %>%
-
select(-"@_fa") %>% - mutate(affilname =str_replace_all(affilname, "\\,", "")) %>% - select(-'affiliation-url', - # -"entry_number" - ) %>% - distinct() %>% - mutate(C1=paste(affilname,`affiliation-city`,`affiliation-country`,sep=", ")) %>% - # mutate(C1=paste("[", C1,"]",sep="")) %>% - relocate(C1, .after = afid) %>% - rename("university" = "affilname", - "city" = "affiliation-city", - "country" = "affiliation-country") %>% - # mutate_all(tolower) %>% - mutate_all(trimws) - - - -# read & standardize: SCOPUS authors -------------------------------------- - -scopus_authors <- list.files(path='./data/for_testing_updates/scopus_api/authors', - full.names = TRUE) - -scopus_authors <- scopus_authors %>% - lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once - bind_rows(.id = "filename") %>% # bind all tables into one object, and give id for each - mutate(author_key = dplyr::row_number()) %>% - # left_join(scopus_authors2) %>% # join month column created earlier - select(-"@_fa", - -"afid.@_fa") %>% - rename("author_order"="@seq", - "afid"="afid.$", - "first_name"="given-name", - "author_url"="author-url") %>% - # relocate(author_key,SO,PY,entry_number, .before="author_order") %>% - relocate(author_key,entry_number, .before="author_order") %>% - mutate(first_name = str_replace_all(first_name, "\\. " ,"")) %>% - mutate(first_name = str_replace_all(first_name, "\\." ,"")) %>% - mutate(initials = str_replace_all(initials, "\\. " ,"")) %>% - mutate(initials = str_replace_all(initials, "\\." ,"")) %>% - mutate(last_init = paste(surname, initials, sep=" ")) %>% - mutate(authname = paste(surname, first_name,sep= ", ")) %>% - relocate(authid,authname,last_init, .after="author_key") %>% - rename("author_count"="author_order") %>% - # mutate_all(tolower) %>% - mutate_all(trimws) %>% - unite(refID,filename,entry_number,sep="-",na.rm=TRUE,remove=FALSE) - -scopus_authors -write_csv(scopus_authors,'./data/for_testing_updates/scopus_api/scopus_authors.csv') -rm(scopus_authors2) - - -names(scopus_affils) -# add the affiliations to authors ----------------------------------------- - -# scopus_authors_affils<-scopus_authors %>% left_join(scopus_affils) -names(scopus_authors) -names(scopus_affils) -# Add the name to C1 to make it consistent with WOS -# excludes secondary/current -scopus_authors_affils<-scopus_authors %>% - left_join(scopus_affils) %>% - select(-author_url) %>% - # relocate(SO,.before="PY") %>% - mutate(first_name = str_replace_all(first_name, "\\. " ,"\\.")) %>% - mutate(last_init = paste(surname, initials, sep=" ")) %>% - mutate(authname = paste(surname, first_name,sep= ", ")) %>% - mutate(C1 = paste("[",authname,"] ", C1 ,sep= "")) %>% - # distinct(SO,PY,entry_number,authid,afid,.keep_all = TRUE)%>% - distinct(entry_number,authid,afid,.keep_all = TRUE)%>% - # mutate_all(tolower) %>% - mutate_all(trimws) %>% - mutate(C1=gsub("Second Author, ","No Inst Given, ",C1)) - -# unique(scopus_authors_affils$C1) - -# are there any remaining? 
-scopus_authors_affils %>% filter(is.na(author_key)) - -write_csv(scopus_authors_affils,'./data/for_testing_updates/scopus_api/scopus_authors_affils.csv') - -# add the authors and affiliations to papers ------------------------------- - -names(scopus_authors_affils) -# author_affils in WIDE FORMAT (authors for each paper) -scopus_authors_affils_wide<-scopus_authors_affils %>% - select( - -"university", - -"city", - -"country", - -"last_init", - -"authid", - # -"C1", - - "afid", - -"surname" , - -"first_name" , - # -"entry_number", - -author_key, - -authname, - # # -SO, - # # -PY, - -"initials") %>% - pivot_wider(names_from = author_count, - id_cols=refID, - values_from = C1, - names_prefix = "C") %>% - # BE SURE TO CHECK HOW MANY C COLUMNS AND EDIT BELOW - unite("C1", C1:last(matches("C[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="./") %>% - mutate(C1 = str_replace_all(C1, " \\[na, na, na]" ," \\[missing]")) -# head(scopus_authors_affils_wide$C1,40) -names(scopus_authors_affils_wide) - - - -# select(scopus_authors_affils, matches("C[[:digit:]]")) -# select(scopus_authors_affils, last(matches("C[[:digit:]]"))) -# scopus_authors_affils$C2<-NULL -# scopus_authors_affils$C3<-NULL -scopus_article_authors_wide<-scopus_authors_affils %>% - distinct() %>% - ungroup() %>% - select(-"C1", - -"university", - -"city", - -"country", - -"last_init", - -"authid", - - "afid", - -"surname" , - -"first_name" , - # -"entry_number", - # -authname, - -author_key, - # -SO, - # -PY, - -"initials") %>% - mutate(orcid=paste(authname,"/",orcid,";",sep="")) %>% - pivot_wider(names_from = author_count, - values_from = c("authname","orcid"), - id_cols=refID, - # names_prefix = c("AF","OI") - ) %>% - rename_with(~ gsub("authname_", "AF", .x, fixed = TRUE)) %>% - rename_with(~ gsub("orcid_", "OI", .x, fixed = TRUE)) %>% - # CHECK HOW MANY AUTHOR COLUMNS - unite("AF", AF1:last(matches("AF[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="; ") %>% - unite("OI", OI1:last(matches("OI[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="") -# mutate(C1 = str_replace_all(C1, " \\[NA, NA, NA]" ," \\[MISSING]")) -# head(scopus_allauthors_wide$AF,40) -names(scopus_article_authors_wide) - - - - -scopus_papers_complete <- scopus_papers %>% - left_join(scopus_authors_affils_wide) %>% - left_join(scopus_article_authors_wide) - - -# final tweaks to make it possible to use refsplitr on scopus ------------- -names(scopus_papers_complete) -scopus_papers_complete<-scopus_papers_complete %>% - # mutate(AU=if_else(is.na(AU), AF, AU)) %>% - mutate(AU=AF) %>% - # mutate(refID=gsub("scopus_", "", refID)) %>% - mutate(AF=gsub("\\.", "", AF)) %>% - mutate(AU=gsub("\\.", "", AU)) %>% - mutate(AF=gsub("\\;", "\n", AF)) %>% - mutate(AU=gsub("\\;", "\n", AU)) %>% - mutate(AF=gsub("\n ", "\n", AF)) %>% - mutate(AU=gsub("\n ;", "\n", AU)) %>% - # mutate(orcid=paste0(creator, " /", orcid)) %>% # authors clean requires orcid be in wos format "[name]/orcid" - relocate(SO,PY,AF,C1,DI,TI,VL,BP,EP,.after="filename") - -# remove the last semicolon from every orcid id (last character) -scopus_papers_complete<-scopus_papers_complete %>% - mutate(OI=str_sub(OI,end=-2)) -# scopus_papers_complete$EM<-NA - # mutate(refID=as.numeric(refID)*1000) - - - -write_csv(scopus_papers_complete,"./data/for_testing_updates/scopus_api/scopus_refs.csv") -# save csv ---------------------------------------------------------------- diff --git a/R/references_read_scopus_download.R b/R/references_read_scopus_download.R deleted file mode 100644 index 91d2827..0000000 --- 
a/R/references_read_scopus_download.R
+++ /dev/null
@@ -1,570 +0,0 @@
-#' Reads SCOPUS download Output
-##' Scopus reference records downloaded directly from Scopus in .csv format
-#'
-#' \code{references_read_scopus_download} This function reads Scopus
-#' reference data files downloaded directly as csv into an R-friendly data format. The resulting dataframe
-#' is the argument for the refsplitr function `authors_clean()`.
-#'
-#' @param data the location of the file or files to be imported. This can be either the absolute or
-#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
-#' used in conjunction with `dir = TRUE`). If left blank it is assumed the location is the working directory.
-#' @param dir if FALSE it is assumed a single file is to be imported.
-#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=`;
-#' all files in the folder will be imported). Defaults to FALSE.
-#' @param include_all if FALSE only a subset of commonly used fields from reference records are imported.
-#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
-#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
-#' LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.
-#' @export references_read_scopus_download
-#'
-#' @examples
-#' ## If a single file is being imported from a folder called "data" located in an RStudio Project:
-#' ## imported_refs<-references_read_scopus_download(data = './data/refs.csv', dir = FALSE, include_all=FALSE)
-#'
-#' ## If multiple files are being imported from a folder named "heliconia" nested within a folder
-#' ## called "data" located in an RStudio Project:
-#' ## heliconia_refs<-references_read_scopus_download(data = './data/heliconia', dir = TRUE, include_all=FALSE)
-#'
-#' ## To load the Scopus download records used in the examples in the documentation
-#' scopus_api_data <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
-#' scopus_api_example <- references_read_scopus_download(scopus_api_data)
-#'
-#'
-
-# SEE IMPORTANT NOTES ON ORIGINAL 265 and 295 IN TROPICAL SCIENTOMETRIX
-
-
-library(tidyverse)
-# read & standardize: SCOPUS papers ---------------------------------------
-
-file_list <- list.files(path='./data/for_testing_updates/scopus_csv',
-                        full.names = TRUE)
-
-references_read_scopus_download <- function(data = ".", dir = FALSE, include_all=FALSE) {
-  ## NOTE: The fields stored in our output table are a combination of the
-  ## "Thomson Reuters Web of Knowledge" FN format and the "ISI Export
-  ## Format" both of which are version 1.0:
-  output <- data.frame(
-    "filename" = character(0),
-    "AB" = character(0),
-    "AF" = character(0),
-    "AU" = character(0),
-    "CA" = character(0),
-    "BP" = character(0),
-    "C1" = character(0),
-    "C3" = character(0),
-    "CC" = character(0),
-    "CH" = character(0),
-    "CL" = character(0),
-    "CR" = character(0),
-    "CT" = character(0),
-    "CY" = character(0),
-    "DE" = character(0),
-    "DI" = character(0),
-    "DT" = character(0),
-    # "EF" = character(0), ## End file
-    "EM" = character(0),
-    "EP" = character(0),
-    # "ER" = character(0), ## End record
-    "FN" = character(0),
-    "FU" = character(0),
-    "FX" = character(0),
-    "GA" = character(0),
-    # "GE" = character(0), (removed by EB Sept 2024)
-    "ID" = character(0),
-    "IS" = character(0),
-    "J9" = character(0),
-    "JI" = character(0),
-    "LA" = character(0),
-    # "LT" = character(0),
"MC" = character(0), - # "MI" = character(0), - "NR" = character(0), - "PA" = character(0), - "PD" = character(0), - "PG" = character(0), - "PI" = character(0), - "PN" = character(0), - "PS" = character(0), - "PT" = character(0), - "PU" = character(0), - "PY" = character(0), - "RI" = character(0), # New field code for Thomson-Reuters ResearcherID - "RID" = character(0), # Original field code for Thomson-Reuters ResearcherID - # Older searchers will have RID (added by EB Sept 2024) - "OI" = character(0), # Field code for ORCID ID (added by EB Jan 2017) - "PM" = character(0), # Pubmed ID Number (added by EB Dec 2017) - "RP" = character(0), - "SC" = character(0), - "SI" = character(0), - "SN" = character(0), - "EI" = character(0), - "SO" = character(0), - "SU" = character(0), - # "TA" = character(0), (removed by EB Sept 2024) - "TC" = character(0), - "TI" = character(0), - "UT" = character(0), - "VR" = character(0), - "VL" = character(0), - "WC" = character(0), - "Z9" = character(0), - "AR" = character(0), - "WE" = character(0), - stringsAsFactors = FALSE - ) - - - ## This is an index for the current record, it gets iterated for each - # record we advance through: - i <- 1 - if (dir) { - file_list <- dir(path = data) - } else { - file_list <- data - } - - - - ## Strip out any files in the directory that aren't Web of Knowledge files: - file_list <- file_list[ grep(".ciw|.csv", file_list) ] - - if (length(file_list) == 0) { - stop("ERROR: The specified file or directory does not contain any - Scopus download records in .csv format!") - } - message("Now processing all references files") - - - - # for (filename in file_list) { - # if (dir) { - # in_file <- file(paste0(data, "/", filename), "r") - # } - # if (!dir) { - # in_file <- file(filename, "r") - # } - # - # field <- "" - # - # ## Process the first line to determine what file type it is: - # ## NOTE: We could add the encoding="UTF-8" flag to the readLines in - # ## order to remove the byte-order mark (BOM) from some exported - # ## files coming out of ISI, but there seems to be a bug in the - # ## readLines() function after bringing a UTF-8 file in, in that - # ## it doesn't respsect the BOM characters. 
-  # for (filename in file_list) {
-  #   if (dir) {
-  #     in_file <- file(paste0(data, "/", filename), "r")
-  #   }
-  #   if (!dir) {
-  #     in_file <- file(filename, "r")
-  #   }
-  #
-  #   field <- ""
-  #
-  #   ## Process the first line to determine what file type it is:
-  #   ## NOTE: We could add the encoding="UTF-8" flag to the readLines in
-  #   ## order to remove the byte-order mark (BOM) from some exported
-  #   ## files coming out of ISI, but there seems to be a bug in the
-  #   ## readLines() function after bringing a UTF-8 file in, in that
-  #   ## it doesn't respect the BOM characters. So we'll just read
-  #   ## the files in with no encoding specified and strip the BOM if needed.
-  #
-  #   read_line <- readLines(in_file, n = 1, warn = FALSE)
-  #
-  #   if (length(read_line) > 0) {
-  #
-  #     read_line <- gsub("^[^A-Z]*([A-Z]+)(.*)$", "\\1\\2", read_line)
-  #
-  #     ## Strip the first two characters from the text line,
-  #     # skip the third (should be a space) and store the rest:
-  #     pre_text <- substr(read_line, 1, 2)
-  #     line_text <- substr(read_line, 4, nchar(read_line))
-  #
-  #     if (pre_text != "FN") {
-  #       close(in_file)
-  #       error <- paste0("ERROR: The file ",
-  #                       filename,
-  #                       " doesn't appear to be a valid ISI or
-  #                       Thomson Reuters reference library file!")
-  #       stop(error)
-  #     }
-  #
-  #     ## Check to see if this is a "ISI Export Format" file, in which
-  #     ## case we need to parse out the first line into three fields:
-  #     if (substr(line_text, 1, 3) == "ISI") {
-  #       field <- pre_text
-  #
-  #       ## Pull apart the FN, VR and PT fields all contained on the first
-  #       ## line of the ISI file format:
-  #       matches <- regexec(
-  #         "^(.*) VR (.*) PT (.*)",
-  #         line_text
-  #       )
-  #
-  #       match_strings <- regmatches(
-  #         line_text,
-  #         matches
-  #       )
-  #
-  #       ## Store those fields:
-  #       output[i, "FN"] <- paste(match_strings[[1]][2], "\n", sep = "")
-  #       output[i, "VR"] <- paste(match_strings[[1]][3], "\n", sep = "")
-  #       output[i, "PT"] <- paste(match_strings[[1]][4], "\n", sep = "")
-  #     } else {
-  #       ## If this is not an ISI export format then just parse the first
-  #       ## line normally into the FN field:
-  #       field <- pre_text
-  #       if (field %in% names(output)) {
-  #         output[i, field] <- ""
-  #         output[i, field] <- trimws(
-  #           ifelse(length(line_text) == 1,
-  #                  paste(output[i, field], line_text,
-  #                        sep = "\n")),
-  #           "both")
-  #       }
-  #     }
-  #   } else {
-  #     utils::flush.console()
-  #     stop("WARNING: Nothing contained in the specified file!")
-  #   }
-  #
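
# A toy illustration of the regexec() idiom used above to split the packed ISI
# header line into its FN, VR, and PT parts (the input string is made up):
line_text <- "ISI Export Format VR 1.0 PT Journal"
m <- regmatches(line_text, regexec("^(.*) VR (.*) PT (.*)", line_text))[[1]]
m[2] # "ISI Export Format"
m[3] # "1.0"
m[4] # "Journal"
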
-  #   ## Process the remaining lines in the file (see the note above about
-  #   ## the encoding= flag and necessity for it, but why we didn't use it):
-  #   while (length(read_line <- readLines(in_file, n = 1, warn = FALSE)) > 0) {
-  #     ## Strip the first three characters from the text line:
-  #     pre_text <- substr(read_line, 1, 2)
-  #
-  #     line_text <- substr(read_line, 4, nchar(read_line))
-  #
-  #     ## Check to see if this is a new field:
-  #     if (pre_text != "  ") {
-  #       field <- pre_text
-  #       ## If the field is in our file and in our data structure then
-  #       ## initialize it to an empty string:
-  #       if (field %in% names(output)) {
-  #         output[i, field] <- ""
-  #       }
-  #     }
-  #
-  #     ## Check to see if the current field is one we are saving to output:
-  #     if (field %in% names(output)) {
-  #       ## ... if it is then append this line's data to the field in our output:
-  #       output[i, field] <- trimws(
-  #         ifelse(length(line_text) == 1,
-  #                paste(output[i, field], line_text, sep = "\n")),
-  #         "both")
-  #     }
-  #
-  #     # If this is the end of a record then add any per-record items and
-  #     # advance our row:
-  #     if (field == "ER") {
-  #       output[i, "filename"] <- filename
-  #
-  #       ## These fields are not repeated for every record, so we set them
-  #       ## from the first record where they were recorded:
-  #       output[i, "FN"] <- output[1, "FN"]
-  #       output[i, "VR"] <- output[1, "VR"]
-  #
-  #       i <- i + 1
-  #     }
-  #   }
-  #
-  #   close(in_file)
-  #   ############################### Clock ####################################
-  #   total <- length(file_list)
-  #   pb <- utils::txtProgressBar(min = 0, max = total, style = 3)
-  #   utils::setTxtProgressBar(pb, counter)
-  #   counter <- counter + 1
-  #   utils::flush.console()
-  #   ###########################################################################
-  # }
-
-output <- file_list %>%
-  lapply(read_csv, col_types = cols(.default = "c")) %>% # read all the files at once
-  bind_rows(.id = "csv_id") %>% # bind all tables into one object, and give id for each
-  rename_all(~ str_replace_all(., "prism", "")) %>%
-  rename_all(~ str_replace_all(., "dc", "")) %>%
-  rename_all(~ str_replace_all(., "\\:", "")) %>%
-  # mutate(PY = str_sub(coverDate, 1, 4)) %>%
-  rename("SO" = "Source title",
-         "AB" = "Abstract",
-         "DI" = "DOI",
-         "DT" = "Document Type",
-         "VL" = "Volume",
-         "filename" = "csv_id",
-         "TI" = "Title",
-         "PY" = "Year",
-         "IS" = "Issue",
-         "AR" = "Art. No.",
-         "BP" = "Page start",
-         "EP" = "Page end",
-         "PG" = "Page count",
-         "DE" = "Author Keywords",
-         "RP" = "Correspondence Address",
-         "BE" = "Editors",
-         "PU" = "Publisher",
-         "SN" = "ISSN",
-         "BN" = "ISBN",
-         "PM" = "PubMed ID",
-         "LA" = "Language of Original Document",
-         "JI" = "Abbreviated Source Title",
-         "OA" = "Open Access",
-         "UT" = "EID",
-         "TC" = "Cited by",
-         "ID" = "Index Keywords",
-         "FU" = "Funding Details",
-         "FX" = "Funding Texts",
-         "PubStage" = "Publication Stage",
-         "CODEN" = "CODEN",
-         "WE" = "Source",
-         "URL" = "Link",
-         "C3" = "Affiliations", # in WOS csv download it is also "Affiliations"
-         "C1" = "Authors with affiliations", # "Addresses" in WOS csv downloads
-         "AU" = "Authors",
-         "AF" = "Author full names",
-         "SID" = "Author(s) ID" # Scopus ID number
-  ) %>% # duplicated SO, VL, DI, and DT entries removed; rename() requires unique names
-  mutate(SO = tolower(SO)) %>%
-  distinct() %>%
-  filter(!if_all(everything(), is.na)) %>% # drop records that are entirely empty
-  filter(!is.na(URL)) %>%
-  mutate(PG = str_replace_all(PG, "�", "")) %>%
-  mutate(PG = str_replace_all(PG, "E-", "E")) %>%
-  # separate(PG, c("BP", "EP"), remove = FALSE, sep = "-", extra = "merge") %>%
-  unite(FU, FX, sep = "-", na.rm = TRUE, remove = TRUE) %>%
-  mutate(refID = row_number(), .before = 1) %>%
-  unite("refID", refID:filename, sep = "-", na.rm = TRUE, remove = FALSE) %>%
-  mutate(DE = str_replace_all(DE, "\\|", ";")) %>%
-  # select(-"@_fa",
-  #        -"coverDisplayDate",
-  #        -"aggregationType",
-  #        -"author-count.@limit",
-  #        -"openaccess",
-  #        -"freetoread.value.$",
-  #        -"freetoreadLabel.value.$",
-  #        # -"pii",
-  #        -'author-count.$',
-  #        -"coverDate",
-  #        # -"error",
-  #        -"eid",
-  #        -"url",
-  #        -"pageRange",
-  #        -article_type_long,
-  # ) %>%
-  # mutate_all(tolower) %>%
-  mutate_all(trimws)
-
-output <- output %>% distinct(DI, TI, .keep_all = TRUE)
-output
-
-write_csv(output, file = "./data/for_testing_updates/scopus_api/scopus_papers.csv")
-scopus_papers <- output # the steps below refer to the papers table by this name
-
-# names(scopus_papers)
-# unique(scopus_papers$SO)
-# head(scopus_papers)
-# unique(scopus_papers$DE)[999]
-# unique(scopus_papers$journal)
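
# How bind_rows(.id = "csv_id") tags every record with the file it came from,
# sketched with two in-memory tables standing in for the csv files:
library(dplyr)
d1 <- data.frame(Title = "Paper A")
d2 <- data.frame(Title = "Paper B")
bind_rows(list(d1, d2), .id = "csv_id")
#   csv_id   Title
# 1      1 Paper A
# 2      2 Paper B
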
-scopus_refs <- scopus_papers %>%
-  group_by(SO, PY) %>%
-  tally()
-
-scopus_papers %>%
-  group_by(SO) %>%
-  tally()
-
-# read & standardize - SCOPUS affiliations --------------------------------
-
-scopus_affils <- list.files(path = './data/for_testing_updates/scopus_api/affils',
-                            full.names = TRUE) %>%
-  lapply(read_csv, col_types = cols(.default = "c")) %>%
-  bind_rows() %>%
-  select(-"@_fa") %>%
-  mutate(affilname = str_replace_all(affilname, "\\,", "")) %>%
-  select(-'affiliation-url',
-         # -"entry_number"
-  ) %>%
-  distinct() %>%
-  mutate(C1 = paste(affilname, `affiliation-city`, `affiliation-country`, sep = ", ")) %>%
-  # mutate(C1 = paste("[", C1, "]", sep = "")) %>%
-  relocate(C1, .after = afid) %>%
-  rename("university" = "affilname",
-         "city" = "affiliation-city",
-         "country" = "affiliation-country") %>%
-  # mutate_all(tolower) %>%
-  mutate_all(trimws)
-
-# read & standardize: SCOPUS authors --------------------------------------
-
-scopus_authors <- list.files(path = './data/for_testing_updates/scopus_api/authors',
-                             full.names = TRUE)
-
-scopus_authors <- scopus_authors %>%
-  lapply(read_csv, col_types = cols(.default = "c")) %>% # read all the files at once
-  bind_rows(.id = "csv_id") %>% # bind all tables into one object, and give id for each
-  mutate(author_key = dplyr::row_number()) %>%
-  # left_join(scopus_authors2) %>% # join month column created earlier
-  select(-"@_fa",
-         -"afid.@_fa") %>%
-  rename("author_order" = "@seq",
-         "afid" = "afid.$",
-         "first_name" = "given-name",
-         "author_url" = "author-url") %>%
-  # relocate(author_key, SO, PY, entry_number, .before = "author_order") %>%
-  relocate(author_key, .before = "author_order") %>%
-  mutate(first_name = str_replace_all(first_name, "\\. ", "")) %>%
-  mutate(first_name = str_replace_all(first_name, "\\.", "")) %>%
-  mutate(initials = str_replace_all(initials, "\\. ", "")) %>%
-  mutate(initials = str_replace_all(initials, "\\.", "")) %>%
-  mutate(last_init = paste(surname, initials, sep = " ")) %>%
-  mutate(authname = paste(surname, first_name, sep = ", ")) %>%
-  relocate(authid, authname, last_init, .after = "author_key") %>%
-  rename("author_count" = "author_order") %>%
-  # mutate_all(tolower) %>%
-  mutate_all(trimws) %>%
-  unite(refID, csv_id, sep = "-", na.rm = TRUE, remove = FALSE)
-
-scopus_authors
-write_csv(scopus_authors, './data/for_testing_updates/scopus_api/scopus_authors.csv')
-# rm(scopus_authors2) # scopus_authors2 is never created (the left_join above is commented out)
-
-names(scopus_affils)
-
-# add the affiliations to authors -----------------------------------------
-
-# scopus_authors_affils <- scopus_authors %>% left_join(scopus_affils)
-names(scopus_authors)
-names(scopus_affils)
-
-# Add the name to C1 to make it consistent with WOS
-# excludes secondary/current
-scopus_authors_affils <- scopus_authors %>%
-  left_join(scopus_affils) %>%
-  select(-author_url) %>%
-  # relocate(SO, .before = "PY") %>%
-  mutate(first_name = str_replace_all(first_name, "\\. ", "\\.")) %>%
-  mutate(last_init = paste(surname, initials, sep = " ")) %>%
-  mutate(authname = paste(surname, first_name, sep = ", ")) %>%
-  mutate(C1 = paste("[", authname, "] ", C1, sep = "")) %>%
-  # distinct(SO, PY, entry_number, authid, afid, .keep_all = TRUE) %>%
-  distinct(authid, afid, .keep_all = TRUE) %>%
-  # mutate_all(tolower) %>%
-  mutate_all(trimws) %>%
-  mutate(C1 = gsub("Second Author, ", "No Inst Given, ", C1))
-
-# unique(scopus_authors_affils$C1)
-
-# are there any remaining?
-scopus_authors_affils %>% filter(is.na(author_key))
-
-write_csv(scopus_authors_affils, './data/for_testing_updates/scopus_api/scopus_authors_affils.csv')
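
# What the C1 step above builds: a WOS-style address string. For example,
# with a made-up author and affiliation:
authname <- "Bruna, Emilio"
affil <- "University of Florida, Gainesville, United States"
paste("[", authname, "] ", affil, sep = "")
# [1] "[Bruna, Emilio] University of Florida, Gainesville, United States"
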
-# add the authors and affiliations to papers -------------------------------
-
-names(scopus_authors_affils)
-
-# author_affils in WIDE FORMAT (authors for each paper)
-scopus_authors_affils_wide <- scopus_authors_affils %>%
-  select(-"university",
-         -"city",
-         -"country",
-         -"last_init",
-         -"authid",
-         # -"C1",
-         -"afid",
-         -"surname",
-         -"first_name",
-         # -"entry_number",
-         -author_key,
-         -authname,
-         # -SO,
-         # -PY,
-         -"initials") %>%
-  pivot_wider(names_from = author_count,
-              id_cols = refID,
-              values_from = C1,
-              names_prefix = "C") %>%
-  # BE SURE TO CHECK HOW MANY C COLUMNS AND EDIT BELOW
-  unite("C1", C1:last(matches("C[[:digit:]]")), remove = TRUE, na.rm = TRUE, sep = "./") %>%
-  mutate(C1 = str_replace_all(C1, " \\[na, na, na]", " \\[missing]"))
-
-# head(scopus_authors_affils_wide$C1, 40)
-names(scopus_authors_affils_wide)
-
-# select(scopus_authors_affils, matches("C[[:digit:]]"))
-# select(scopus_authors_affils, last(matches("C[[:digit:]]")))
-# scopus_authors_affils$C2 <- NULL
-# scopus_authors_affils$C3 <- NULL
-scopus_article_authors_wide <- scopus_authors_affils %>%
-  distinct() %>%
-  ungroup() %>%
-  select(-"C1",
-         -"university",
-         -"city",
-         -"country",
-         -"last_init",
-         -"authid",
-         -"afid",
-         -"surname",
-         -"first_name",
-         # -"entry_number",
-         # -authname,
-         -author_key,
-         # -SO,
-         # -PY,
-         -"initials") %>%
-  mutate(orcid = paste(authname, "/", orcid, ";", sep = "")) %>%
-  pivot_wider(names_from = author_count,
-              values_from = c("authname", "orcid"),
-              id_cols = refID,
-              # names_prefix = c("AF", "OI")
-  ) %>%
-  rename_with(~ gsub("authname_", "AF", .x, fixed = TRUE)) %>%
-  rename_with(~ gsub("orcid_", "OI", .x, fixed = TRUE)) %>%
-  # CHECK HOW MANY AUTHOR COLUMNS
-  unite("AF", AF1:last(matches("AF[[:digit:]]")), remove = TRUE, na.rm = TRUE, sep = "; ") %>%
-  unite("OI", OI1:last(matches("OI[[:digit:]]")), remove = TRUE, na.rm = TRUE, sep = "")
-# mutate(C1 = str_replace_all(C1, " \\[NA, NA, NA]", " \\[MISSING]"))
-
-# head(scopus_article_authors_wide$AF, 40)
-names(scopus_article_authors_wide)
-
-scopus_papers_complete <- scopus_papers %>%
-  left_join(scopus_authors_affils_wide) %>%
-  left_join(scopus_article_authors_wide)
-# rename("filename" = "csv_id") # not needed: csv_id was already renamed to filename above
-
-# final tweaks to make it possible to use refsplitr on scopus -------------
-names(scopus_papers_complete)
-scopus_papers_complete <- scopus_papers_complete %>%
-  # mutate(AU = if_else(is.na(AU), AF, AU)) %>%
-  mutate(AU = AF) %>%
-  # mutate(refID = gsub("scopus_", "", refID)) %>%
-  mutate(AF = gsub("\\.", "", AF)) %>%
-  mutate(AU = gsub("\\.", "", AU)) %>%
-  mutate(AF = gsub("\\;", "\n", AF)) %>%
-  mutate(AU = gsub("\\;", "\n", AU)) %>%
-  mutate(AF = gsub("\n ", "\n", AF)) %>%
-  mutate(AU = gsub("\n ;", "\n", AU)) %>%
-  # mutate(orcid = paste0(creator, " /", orcid)) %>% # authors_clean() requires orcid in wos format "[name]/orcid"
-  relocate(SO, PY, AF, C1, DI, TI, VL, BP, EP, .after = "filename")
-
-# remove the last semicolon from every orcid id (last character)
-scopus_papers_complete <- scopus_papers_complete %>%
-  mutate(OI = str_sub(OI, end = -2))
-# scopus_papers_complete$EM <- NA
-# mutate(refID = as.numeric(refID) * 1000)
-
-write_csv(scopus_papers_complete, "./data/for_testing_updates/scopus_api/scopus_refs.csv")
-
-# save csv ----------------------------------------------------------------
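The wide-format blocks in the draft above do the heavy lifting: they pivot the one-row-per-author table into one row per paper. A minimal sketch of the same pivot_wider() call, with made-up values rather than package data:

library(tidyr)
long <- data.frame(refID = c("1-a", "1-a"),
                   author_count = c(1, 2),
                   C1 = c("[A] Univ X", "[B] Univ Y"))
pivot_wider(long, id_cols = refID, names_from = author_count,
            values_from = C1, names_prefix = "C")
#   refID         C1         C2
# 1   1-a [A] Univ X [B] Univ Y

From 3e98fe0a1122cd506bc3253135a0543e12545ceb Mon Sep 17 00:00:00 2001
From: embruna
Date: Mon, 12 Aug 2024 11:00:20 -0400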
Subject: [PATCH 6/7] style/formatting

---
 R/references_read.R | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/R/references_read.R b/R/references_read.R
index 524d2f6..ed652d9 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -59,15 +59,11 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "FU" = character(0),
     "FX" = character(0),
     "GA" = character(0),
-    # "GE" = character(0), (removed by EB Sept 2024)
     "ID" = character(0),
     "IS" = character(0),
     "J9" = character(0),
     "JI" = character(0),
     "LA" = character(0),
-    # "LT" = character(0),
-    # "MC" = character(0),
-    # "MI" = character(0),
     "NR" = character(0),
     "PA" = character(0),
     "PD" = character(0),
@@ -78,11 +74,10 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "PT" = character(0),
     "PU" = character(0),
     "PY" = character(0),
-    "RI" = character(0), # New field code for Thomson-Reuters ResearcherID
-    "RID" = character(0), # Original field code for Thomson-Reuters ResearcherID
-    # Older searchers will have RID (added by EB Sept 2024)
-    "OI" = character(0), # Field code for ORCID ID (added by EB Jan 2017)
-    "PM" = character(0), # Pubmed ID Number (added by EB Dec 2017)
+    "RI" = character(0), # New Thomson-Reuters ResearcherID
+    "RID" = character(0), # Original Thomson-Reuters ResearcherID
+    "OI" = character(0), # ORCID
+    "PM" = character(0), # Pubmed ID Number
     "RP" = character(0),
     "SC" = character(0),
     "SI" = character(0),
@@ -90,7 +85,6 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "EI" = character(0),
     "SO" = character(0),
     "SU" = character(0),
-    # "TA" = character(0), (removed by EB Sept 2024)
     "TC" = character(0),
     "TI" = character(0),
     "UT" = character(0),
@@ -100,7 +94,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "Z9" = character(0),
     "AR" = character(0),
     "WE" = character(0),
-    "OA" = character(0), # Field code for Open Acceess (added by EB Sept 2024)
+    "OA" = character(0), # Open Access
     stringsAsFactors = FALSE
   )
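A quick way to see which fields the default call drops relative to include_all = TRUE (a sketch; it assumes refsplitr is installed and uses the bundled test file from the examples):

library(refsplitr)
BITR_data_example <- system.file("extdata", "BITR_test.txt", package = "refsplitr")
all_fields <- names(references_read(BITR_data_example, include_all = TRUE))
default_fields <- names(references_read(BITR_data_example, include_all = FALSE))
setdiff(all_fields, default_fields) # the dropped field codes
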
From 69f3e7190f0e029245a1c440f946175a0f907c0e Mon Sep 17 00:00:00 2001
From: embruna
Date: Mon, 12 Aug 2024 11:00:39 -0400
Subject: [PATCH 7/7] style/formatting

---
 R/references_read.R | 100 ++++++++++++++++++++++++--------------------
 1 file changed, 54 insertions(+), 46 deletions(-)

diff --git a/R/references_read.R b/R/references_read.R
index ed652d9..9154a8b 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -2,34 +2,33 @@
 #'
 #' \code{references_read} This function reads Thomson Reuters Web of Knowledge
 #' and ISI format reference data files into an R-friendly data format. The resulting dataframe
-#' is the argument for the refsplitr function `authors_clean()`. 
+#' is the argument for the refsplitr function `authors_clean()`.
 #'
-#' @param data the location of the file or files to be imported. This can be either the absolute or 
-#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder; 
+#' @param data the location of the file or files to be imported. This can be either the absolute or
+#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
 #' used in conjunction with `dir = TRUE``). If left blank it is assumed the location is the working directory.
-#' @param dir if FALSE it is assumed a single file is to be imported. 
-#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=``; 
-#' all files in the folder will be imported). Defaults to FALSE. 
-#' @param include_all if FALSE only a subset of commonly used fields from references records are imported. 
-#' If TRUE then all fields from the reference records are imported. Defaults to FALSE. 
-#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, FX, GA, J9, 
+#' @param dir if FALSE it is assumed a single file is to be imported.
+#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=``;
+#' all files in the folder will be imported). Defaults to FALSE.
+#' @param include_all if FALSE only a subset of commonly used fields from references records are imported.
+#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
+#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, FX, GA, J9,
 #' LA, PA, PI, PN, PS, RID, SU, VR.
 #' @export references_read
-#' 
-#' @examples 
-#' ## If a single files is being imported from a folder called "data" located in an RStudio Project: 
+#'
+#' @examples
+#' ## If a single files is being imported from a folder called "data" located in an RStudio Project:
 #' ## imported_refs<-references_read(data = './data/refs.txt', dir = FALSE, include_all=FALSE)
-#' 
+#'
 #' ## If multiple files are being imported from a folder named "heliconia" nested within a folder
-#' ## called "data" located in an RStudio Project: 
+#' ## called "data" located in an RStudio Project:
 #' ## heliconia_refs<-references_read(data = './data/heliconia', dir = TRUE, include_all=FALSE)
-#' 
-#' ## To load the Web of Science records used in the examples in the documentation 
-#' BITR_data_example <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
+#'
+#' ## To load the Web of Science records used in the examples in the documentation
+#' BITR_data_example <- system.file("extdata", "BITR_test.txt", package = "refsplitr")
 #' BITR <- references_read(BITR_data_example)
-#' 
-#' 
-references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
+#'
+references_read <- function(data = ".", dir = FALSE, include_all = FALSE) {
   ## NOTE: The fields stored in our output table are a combination of the
   ## "Thomson Reuters Web of Knowledge" FN format and the "ISI Export
   ## Format" both of which are version 1.0:
@@ -74,10 +73,10 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "PT" = character(0),
     "PU" = character(0),
     "PY" = character(0),
-    "RI" = character(0),  # New Thomson-Reuters ResearcherID
-    "RID" = character(0), # Original Thomson-Reuters ResearcherID
-    "OI" = character(0),  # ORCID
-    "PM" = character(0),  # Pubmed ID Number
+    "RI" = character(0), # New Thomson-Reuters ResearcherID
+    "RID" = character(0), # Original Thomson-Reuters ResearcherID
+    "OI" = character(0), # ORCID
+    "PM" = character(0), # Pubmed ID Number
     "RP" = character(0),
     "SC" = character(0),
     "SI" = character(0),
@@ -94,7 +93,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "Z9" = character(0),
     "AR" = character(0),
     "WE" = character(0),
-    "OA" = character(0),  # Open Access
+    "OA" = character(0), # Open Access
     stringsAsFactors = FALSE
   )
 
@@ -108,7 +107,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
   }
 
   ## Strip out any files in the directory that aren't Web of Knowledge files:
-  file_list <- file_list[ grep(".ciw|.txt", file_list) ]
+  file_list <- file_list[grep(".ciw|.txt", file_list)]
 
   if (length(file_list) == 0) {
     stop("ERROR: The specified file or directory does not contain any
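
# The hunk above tightens the file filter; the same idiom in isolation, with
# toy filenames:
files <- c("refs.txt", "notes.md", "old.ciw")
files[grep(".ciw|.txt", files)]
# [1] "refs.txt" "old.ciw"
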
@@ -138,7 +137,6 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
   read_line <- readLines(in_file, n = 1, warn = FALSE)
 
   if (length(read_line) > 0) {
-
     read_line <- gsub("^[^A-Z]*([A-Z]+)(.*)$", "\\1\\2", read_line)
 
     ## Strip the first two characters from the text line,
@@ -148,10 +146,12 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
 
     if (pre_text != "FN") {
       close(in_file)
-      error <- paste0("ERROR: The file ",
+      error <- paste0(
+        "ERROR: The file ",
         filename,
         " doesn't appear to be a valid ISI or
-        Thomson Reuters reference library file!")
+        Thomson Reuters reference library file!"
+      )
       stop(error)
     }
@@ -187,8 +187,11 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
         output[i, field] <- trimws(
           ifelse(length(line_text) == 1,
             paste(output[i, field], line_text,
-              sep = "\n")),
-          "both")
+              sep = "\n"
+            )
+          ),
+          "both"
+        )
       }
     }
   } else {
@@ -216,12 +219,14 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
 
     ## Check to see if the current field is one we are saving to output:
     if (field %in% names(output)) {
-      ##... if it is then append this line's data to the field in our output:
+      ## ... if it is then append this line's data to the field in our output:
 
       output[i, field] <- trimws(
         ifelse(length(line_text) == 1,
-          paste(output[i, field], line_text, sep = "\n")),
-        "both")
+          paste(output[i, field], line_text, sep = "\n")
+        ),
+        "both"
+      )
     }
 
     # If this is the end of a record then add any per-record items and
@@ -247,7 +252,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     counter <- counter + 1
     utils::flush.console()
     ###########################################################################
-  }
+  }
   ############################################## 3
   # Post Processing
   # We need to clean this file, page breaks are inserted in the raw file
@@ -256,7 +261,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
   output$RI <- gsub("\n", "", output$RI, fixed = TRUE)
   output$RI <- gsub("; ", ";", output$RI, fixed = TRUE)
 
-  #This fixes a problem where in earlier WOS pulls RI is stored
+  # This fixes a problem where in earlier WOS pulls RI is stored
   # as RID with no name associated
   output$RI[!grepl("/", output$RI)] <- NA
   output$OI <- gsub("\n", "", output$OI, fixed = TRUE)
@@ -267,24 +272,27 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
 
   output$refID <- seq_len(nrow(output))
 
   # now done in base R, runs slower
-  dupe_output <- do.call(rbind, lapply(unique(output$UT),
-    function(x) output[output$UT == x, ][1, ]))
+  dupe_output <- do.call(rbind, lapply(
+    unique(output$UT),
+    function(x) output[output$UT == x, ][1, ]
+  ))
 
   ############################################
   # Prepare for printing
 
-  if (include_all == TRUE){
+  if (include_all == TRUE) {
     return(dupe_output)
   }
 
-  if (include_all != TRUE){
-
-    dropnames <- c("CC", "CH", "CL", "CT", "CY",
-                   "FX", "GA", "GE", "J9", "LA",
-                   "PA", "PI", "PN", "PS", "RID",
-                   "SI", "SU", "VR", "OA")
+  if (include_all != TRUE) {
+    dropnames <- c(
+      "CC", "CH", "CL", "CT", "CY",
+      "FX", "GA", "GE", "J9", "LA",
+      "PA", "PI", "PN", "PS", "RID",
+      "SI", "SU", "VR", "OA"
+    )
 
     rdo <- dupe_output[, !(names(dupe_output) %in% dropnames)]
 
     return(rdo)
   }
-}
\ No newline at end of file
+}
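For reference, the RI cleanup in the post-processing block works like this sketch (the IDs are made up); any entry lacking the "name/ID" pattern is blanked:

RI <- c("Smith, J/A-1234-2010", "B-5678-2011")
RI[!grepl("/", RI)] <- NA
RI
# [1] "Smith, J/A-1234-2010" NA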