From d4292f8d7a4858af2634db12b5efe9c67500550a Mon Sep 17 00:00:00 2001
From: embruna
Date: Wed, 24 Jul 2024 10:59:34 -0400
Subject: [PATCH 1/7] added WOS tag AR

---
 R/references_read.R | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/R/references_read.R b/R/references_read.R
index 45702ea..1ea35fb 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -96,6 +96,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "VL" = character(0),
     "WC" = character(0),
     "Z9" = character(0),
+    "AR" = character(0),
     stringsAsFactors = FALSE
   )

@@ -280,11 +281,11 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {

   if (include_all != TRUE){

     dropnames <- c("CC", "CH", "CL", "CT", "CY",
-                   "DT", "FX", "GA", "GE", "ID",
-                   "IS", "J9", "JI", "LA", "LT",
-                   "MC", "MI", "NR", "PA", "PI",
-                   "PN", "PS", "RID", "SI", "SU",
-                   "TA", "VR")
+                  "DT", "FX", "GA", "GE", "ID",
+                  "IS", "J9", "JI", "LA", "LT",
+                  "MC", "MI", "NR", "PA", "PI",
+                  "PN", "PS", "RID", "SI", "SU",
+                  "TA", "VR")

     rdo <- dupe_output[, !(names(dupe_output) %in% dropnames)]

From c4294d5fdf6bbd039eeefb01b57a0e143d4b316e Mon Sep 17 00:00:00 2001
From: embruna
Date: Fri, 9 Aug 2024 13:03:07 -0400
Subject: [PATCH 2/7] updating references_read with new field codes, deleting
 old ones; added notes to reflect these changes

---
 NEWS.md             | 15 +++++++++++++++
 R/references_read.R | 35 ++++++++++++++++++-----------------
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index cfa2411..6ca0185 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,20 @@
 # refsplitr News

+
+refsplitr 1.X.X (2024-XX-XX)
+=========================
+
+### NEW FEATURES
+
+ * `references_read` now extracts additional fields from Web of Science records: WE (Source Database), C3 (affiliations*), EI (eISSN) and RID (the original version of the Thomson-Reuters ResearcherID (RI); authors of some older publications might have an RID but not an RI). To include these in the output of `references_read`, use the setting `include_all=TRUE`.
+
+ *a single cell with a list of all affiliations, not broken down by author, to match the Scopus format
+ * `references_read` no longer extracts some rarely used field codes: GE, LT, MC, MI, and TA
+
+ * The Document Type (DT), Keywords Plus (ID), Issue (IS), ISO abbreviated source code (JI), and number of references cited in an article (NR) are now returned by default (`include_all=FALSE`).
+
+
 refsplitr 1.0.1 (2024-07-23)
 =========================

diff --git a/R/references_read.R b/R/references_read.R
index 1ea35fb..c0551fa 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -12,8 +12,8 @@
 #' all files in the folder will be imported). Defaults to FALSE.
 #' @param include_all if FALSE only a subset of commonly used fields from references records are imported.
 #' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
-#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
-#' LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.
+#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, FX, GA, J9,
+#' LA, PA, PI, PN, PS, RID, SU, VR.
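+#' As a brief illustration (the folder path here is hypothetical), a call that
+#' imports a directory of Web of Science export files and keeps every field
+#' might look like: `refs <- references_read(data = "./data/wos_exports", dir = TRUE, include_all = TRUE)`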
 #' @export references_read
 #'
 #' @examples
@@ -41,6 +41,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "CA" = character(0),
     "BP" = character(0),
     "C1" = character(0),
+    "C3" = character(0),
     "CC" = character(0),
     "CH" = character(0),
     "CL" = character(0),
@@ -58,15 +59,15 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "FU" = character(0),
     "FX" = character(0),
     "GA" = character(0),
-    "GE" = character(0),
+    # "GE" = character(0), (removed by EB Sept 2024)
     "ID" = character(0),
     "IS" = character(0),
     "J9" = character(0),
     "JI" = character(0),
     "LA" = character(0),
-    "LT" = character(0),
-    "MC" = character(0),
-    "MI" = character(0),
+    # "LT" = character(0),
+    # "MC" = character(0),
+    # "MI" = character(0),
     "NR" = character(0),
     "PA" = character(0),
     "PD" = character(0),
@@ -77,18 +78,19 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "PT" = character(0),
     "PU" = character(0),
     "PY" = character(0),
-    "RI" = character(0), # NEW field code for Thomson-Reuters ResearcherID
-    "RID" = character(0), # OLD field code for Thomson-Reuters ResearcherID
-    # Older searchers will have RID, not RI ACTUALLY LOOK SL IKE NOT
-    "OI" = character(0), # New field code for ORCID ID (added EB Jan 2017)
-    "PM" = character(0), # Pubmed ID Number (added by EB 3 dec 2017)
+    "RI" = character(0), # New field code for Thomson-Reuters ResearcherID
+    "RID" = character(0), # Original field code for Thomson-Reuters ResearcherID
+    # Records from older searches will have RID (added by EB Sept 2024)
+    "OI" = character(0), # Field code for ORCID ID (added by EB Jan 2017)
+    "PM" = character(0), # Pubmed ID Number (added by EB Dec 2017)
     "RP" = character(0),
     "SC" = character(0),
     "SI" = character(0),
     "SN" = character(0),
+    "EI" = character(0),
     "SO" = character(0),
     "SU" = character(0),
-    "TA" = character(0),
+    # "TA" = character(0), (removed by EB Sept 2024)
     "TC" = character(0),
     "TI" = character(0),
     "UT" = character(0),
@@ -97,6 +99,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "WC" = character(0),
     "Z9" = character(0),
     "AR" = character(0),
+    "WE" = character(0),
     stringsAsFactors = FALSE
   )

@@ -281,11 +284,11 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {

   if (include_all != TRUE){

     dropnames <- c("CC", "CH", "CL", "CT", "CY",
-                  "DT", "FX", "GA", "GE", "ID",
-                  "IS", "J9", "JI", "LA", "LT",
-                  "MC", "MI", "NR", "PA", "PI",
-                  "PN", "PS", "RID", "SI", "SU",
-                  "TA", "VR")
+                  "FX", "GA", "GE", "J9", "LA",
+                  "PA", "PI", "PN", "PS", "RID",
+                  "SI", "SU", "VR")

     rdo <- dupe_output[, !(names(dupe_output) %in% dropnames)]

From 006b2ec64ef81972980e61d9ccd8e73f9f1678f8 Mon Sep 17 00:00:00 2001
From: embruna
Date: Fri, 9 Aug 2024 13:03:36 -0400
Subject: [PATCH 3/7] adding draft function for references_read of Scopus files
 downloaded as csv

---
 R/references_read_scopus_download.R | 570 ++++++++++++++++++++++++++++
 1 file changed, 570 insertions(+)
 create mode 100644 R/references_read_scopus_download.R

diff --git a/R/references_read_scopus_download.R b/R/references_read_scopus_download.R
new file mode 100644
index 0000000..91d2827
--- /dev/null
+++ b/R/references_read_scopus_download.R
@@ -0,0 +1,570 @@
+#' Reads SCOPUS download Output
+##' Scopus reference records downloaded directly from Scopus in .csv format
+#'
+#' \code{references_read_scopus_download} This function reads Scopus
+#' reference data files downloaded directly as csv into an R-friendly data format. The resulting dataframe
+#' is the argument for the refsplitr function `authors_clean()`.
+#'
+#' @param data the location of the file or files to be imported. This can be either the absolute or
+#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
+#' used in conjunction with `dir = TRUE`). If left blank it is assumed the location is the working directory.
+#' @param dir if FALSE it is assumed a single file is to be imported.
+#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=`;
+#' all files in the folder will be imported). Defaults to FALSE.
+#' @param include_all if FALSE only a subset of commonly used fields from reference records are imported.
+#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
+#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
+#' LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.
+#' @export references_read_scopus_download
+#'
+#' @examples
+#' ## If a single file is being imported from a folder called "data" located in an RStudio Project:
+#' ## imported_refs<-references_read_scopus_download(data = './data/refs.csv', dir = FALSE, include_all=FALSE)
+#'
+#' ## If multiple files are being imported from a folder named "heliconia" nested within a folder
+#' ## called "data" located in an RStudio Project:
+#' ## heliconia_refs<-references_read_scopus_download(data = './data/heliconia', dir = TRUE, include_all=FALSE)
+#'
+#' ## To load the Scopus download records used in the examples in the documentation
+#' scopus_api_data <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
+#' scopus_api_example <- references_read_scopus_download(scopus_api_data)
+#'
+#'
+
+# SEE IMPORTANT NOTES ON ORIGINAL 265 and 295 IN TROPICAL SCIENTOMETRIX
+
+
+library(tidyverse)
+# read & standardize: SCOPUS papers ---------------------------------------
+
+file_list <- list.files(path='./data/for_testing_updates/scopus_csv',
+                        full.names = TRUE)
+
+references_read_scopus_download <- function(data = ".", dir = FALSE, include_all=FALSE) {
+  ## NOTE: The fields stored in our output table are a combination of the
+  ## "Thomson Reuters Web of Knowledge" FN format and the "ISI Export
+  ## Format" both of which are version 1.0:
+  output <- data.frame(
+    "filename" = character(0),
+    "AB" = character(0),
+    "AF" = character(0),
+    "AU" = character(0),
+    "CA" = character(0),
+    "BP" = character(0),
+    "C1" = character(0),
+    "C3" = character(0),
+    "CC" = character(0),
+    "CH" = character(0),
+    "CL" = character(0),
+    "CR" = character(0),
+    "CT" = character(0),
+    "CY" = character(0),
+    "DE" = character(0),
+    "DI" = character(0),
+    "DT" = character(0),
+    # "EF" = character(0), ## End file
+    "EM" = character(0),
+    "EP" = character(0),
+    # "ER" = character(0), ## End record
+    "FN" = character(0),
+    "FU" = character(0),
+    "FX" = character(0),
+    "GA" = character(0),
+    # "GE" = character(0), (removed by EB Sept 2024)
+    "ID" = character(0),
+    "IS" = character(0),
+    "J9" = character(0),
+    "JI" = character(0),
+    "LA" = character(0),
+    # "LT" = character(0),
+    # "MC" = character(0),
+    # "MI" = character(0),
+    "NR" = character(0),
+    "PA" = character(0),
+    "PD" = character(0),
+    "PG" = character(0),
+    "PI" = character(0),
+    "PN" = character(0),
+    "PS" = character(0),
+    "PT" = character(0),
+    "PU" = character(0),
+    "PY" = character(0),
+    "RI" = character(0), # New field code for Thomson-Reuters ResearcherID
+    "RID" = character(0), # Original field code for Thomson-Reuters ResearcherID
+    # Records from older searches will have RID (added by EB Sept 2024)
+    "OI" = character(0), # Field code for ORCID ID (added by EB Jan 2017)
+    "PM" = character(0), # Pubmed ID Number (added by EB Dec 2017)
+    "RP" = character(0),
+    "SC" = character(0),
+    "SI" = character(0),
+    "SN" = character(0),
+    "EI" = character(0),
+    "SO" = character(0),
+    "SU" = character(0),
+    # "TA" = character(0), (removed by EB Sept 2024)
+    "TC" = character(0),
+    "TI" = character(0),
+    "UT" = character(0),
+    "VR" = character(0),
+    "VL" = character(0),
+    "WC" = character(0),
+    "Z9" = character(0),
+    "AR" = character(0),
+    "WE" = character(0),
+    stringsAsFactors = FALSE
+  )
+
+
+  ## This is an index for the current record, it gets iterated for each
+  # record we advance through:
+  i <- 1
+  if (dir) {
+    file_list <- dir(path = data)
+  } else {
+    file_list <- data
+  }
+
+
+
+  ## Strip out any files in the directory that aren't Web of Knowledge files:
+  file_list <- file_list[ grep(".ciw|.csv", file_list) ]
+
+  if (length(file_list) == 0) {
+    stop("ERROR: The specified file or directory does not contain any
+         Scopus download records in .csv format!")
+  }
+  message("Now processing all references files")
+
+
+
+  # for (filename in file_list) {
+  #   if (dir) {
+  #     in_file <- file(paste0(data, "/", filename), "r")
+  #   }
+  #   if (!dir) {
+  #     in_file <- file(filename, "r")
+  #   }
+  #
+  #   field <- ""
+  #
+  #   ## Process the first line to determine what file type it is:
+  #   ## NOTE: We could add the encoding="UTF-8" flag to the readLines in
+  #   ##   order to remove the byte-order mark (BOM) from some exported
+  #   ##   files coming out of ISI, but there seems to be a bug in the
+  #   ##   readLines() function after bringing a UTF-8 file in, in that
+  #   ##   it doesn't respect the BOM characters.
So we'll just read + # ## the files in with no encoding specified and strip the BOM if + # + # read_line <- readLines(in_file, n = 1, warn = FALSE) + # + # if (length(read_line) > 0) { + # + # read_line <- gsub("^[^A-Z]*([A-Z]+)(.*)$", "\\1\\2", read_line) + # + # ## Strip the first two characters from the text line, + # # skip the third (should be a space) and store the rest: + # pre_text <- substr(read_line, 1, 2) + # line_text <- substr(read_line, 4, nchar(read_line)) + # + # if (pre_text != "FN") { + # close(in_file) + # error <- paste0("ERROR: The file ", + # filename, + # " doesn't appear to be a valid ISI or + # Thomson Reuters reference library file!") + # stop(error) + # } + # + # ## Check to see if this is a "ISI Export Format" file, in which + # ## case we need to parse out the first line into three fields: + # if (substr(line_text, 1, 3) == "ISI") { + # field <- pre_text + # + # ## Pull apart the FN, VR and PT fields all contained on the first + # ## line of the ISI file format: + # matches <- regexec( + # "^(.*) VR (.*) PT (.*)", + # line_text + # ) + # + # match_strings <- regmatches( + # line_text, + # matches + # ) + # + # ## Store those fields: + # output[i, "FN"] <- paste(match_strings[[1]][2], "\n", sep = "") + # + # output[i, "VR"] <- paste(match_strings[[1]][3], "\n", sep = "") + # + # output[i, "PT"] <- paste(match_strings[[1]][4], "\n", sep = "") + # } else { + # ## If this is not an ISI export format then just parse the first + # ## line normally into the FN field: + # field <- pre_text + # if (field %in% names(output)) { + # output[i, field] <- "" + # output[i, field] <- trimws( + # ifelse(length(line_text) == 1, + # paste(output[i, field], line_text, + # sep = "\n")), + # "both") + # } + # } + # } else { + # utils::flush.console() + # stop("WARNING: Nothing contained in the specified file!") + # } + # + # ## Process the remaining lines in the file (see the note above about + # ## the encoding= flag and necessity for it, but why we didn't use it): + # while (length(read_line <- readLines(in_file, n = 1, warn = FALSE)) > 0) { + # ## Strip the first three characters from the text line: + # pre_text <- substr(read_line, 1, 2) + # + # line_text <- substr(read_line, 4, nchar(read_line)) + # + # ## Check to see if this is a new field: + # if (pre_text != " ") { + # field <- pre_text + # ## If the field is in our file and in our data structure then + # ## initialize it to an empty string: + # if (field %in% names(output)) { + # output[i, field] <- "" + # } + # } + # + # ## Check to see if the current field is one we are saving to output: + # if (field %in% names(output)) { + # ##... 
if it is then append this line's data to the field in our output:
+  #
+  #       output[i, field] <- trimws(
+  #         ifelse(length(line_text) == 1,
+  #                paste(output[i, field], line_text, sep = "\n")),
+  #         "both")
+  #     }
+  #
+  #     # If this is the end of a record then add any per-record items and
+  #     # advance our row:
+  #     if (field == "ER") {
+  #       output[i, "filename"] <- filename
+  #
+  #       ## These fields are not repeated for every record, so we set them
+  #       ## from the first record where they were recorded:
+  #
+  #       output[i, "FN"] <- output[1, "FN"]
+  #       output[i, "VR"] <- output[1, "VR"]
+  #
+  #       i <- i + 1
+  #     }
+  #   }
+  #
+  #   close(in_file)
+  #   ############################### Clock######################################
+  #   total <- length(file_list)
+  #   pb <- utils::txtProgressBar(min = 0, max = total, style = 3)
+  #   utils::setTxtProgressBar(pb, counter)
+  #   counter <- counter + 1
+  #   utils::flush.console()
+  #   ###########################################################################
+  # }
+  #
+  #
+output <- file_list %>%
+  lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once
+  bind_rows(.id = "csv_id") %>% # bind all tables into one object, and give id for each
+  rename_all(~str_replace_all(.,"prism","")) %>%
+  rename_all(~str_replace_all(.,"dc","")) %>%
+  rename_all(~str_replace_all(.,"\\:","")) %>%
+  # mutate(PY=str_sub(coverDate, 1, 4)) %>%
+  rename("SO"="Source title",
+         "AB"="Abstract",
+         "DI"="DOI",
+         "DT"="Document Type",
+         "VL"="Volume",
+         "filename"="csv_id",
+         "TI"="Title",
+         "PY"="Year",
+         # "SO"="Source title", (duplicate entry removed)
+         # "VL"="Volume", (duplicate entry removed)
+         "IS"="Issue",
+         "AR"="Art. No.",
+         "BP"="Page start",
+         "EP"="Page end",
+         "PG"="Page count",
+         # "DI"="DOI", (duplicate entry removed)
+         "DE"="Author Keywords",
+         "RP"="Correspondence Address",
+         "BE"="Editors",
+         "PU"="Publisher",
+         "SN"="ISSN",
+         "BN"="ISBN",
+         "PM"="PubMed ID",
+         "LA"="Language of Original Document",
+         "JI"="Abbreviated Source Title",
+         # "DT"="Document Type", (duplicate entry removed)
+         "OA"="Open Access",
+         "UT"="EID",
+         "TC"="Cited by",
+         "ID"="Index Keywords",
+         "FU"="Funding Details",
+         "FX"="Funding Texts",
+         "PubStage"="Publication Stage",
+         "CODEN"="CODEN",
+         "WE"="Source",
+         "URL"="Link",
+         "C3"="Affiliations", # in WOS csv download it is also "affiliations"
+         "C1"="Authors with affiliations", #"Addresses" in WOS csv downloads
+         "AU"="Authors",
+         "AF"="Author full names",
+         "SID"="Author(s) ID" #scopus ID number
+  ) %>%
+  mutate(SO=tolower(SO)) %>%
+  distinct() %>%
+  filter(if_any(everything(), is.na)) %>%
+  filter(!is.na(URL)) %>%
+  mutate(PG=str_replace_all(PG,"�","")) %>%
+  mutate(PG=str_replace_all(PG,"E-","E")) %>%
+  # separate(PG,c("BP","EP"),remove=FALSE,sep="-",extra="merge") %>%
+  unite(FU,FX,sep="-",na.rm=TRUE,remove = TRUE) %>%
+  mutate(refID = row_number(),.before=1) %>%
+  unite("refID",refID:filename,sep="-",na.rm=TRUE,remove=FALSE) %>%
+  mutate(DE=str_replace_all(DE,"\\|",";")) %>%
+  # select(-"@_fa",
+  #        -"coverDisplayDate",
+  #        -"aggregationType",
+  #        -"author-count.@limit",
+  #        -"openaccess",
+  #        -"freetoread.value.$",
+  #        -"freetoreadLabel.value.$",
+  #        # -"pii",
+  #        -'author-count.$',
+  #        -"coverDate",
+  #        # -"error",
+  #        -"eid",
+  #        -"url",
+  #        -"pageRange",
+  #        -article_type_long,
+  # ) %>%
+  # mutate_all(tolower) %>%
+  mutate_all(trimws)
+
+
+  output<-output %>% distinct(DI,TI,.keep_all = TRUE)
+  output
+
+write_csv(output,file="./data/for_testing_updates/scopus_api/scopus_papers.csv")
+
+# names(scopus_papers)
+# unique(scopus_papers$SO)
+# names(scopus_papers)
+# head(scopus_papers)
+# unique(scopus_papers$DE)[999]
+
+#
unique(scopus_papers$journal) + + +scopus_refs<-scopus_papers %>% + group_by(SO,PY) %>% + tally() + + +scopus_papers %>% + group_by(SO) %>% + tally() + + + + +# read & standardize - SCOPUS affiliations -------------------------------- + + +scopus_affils <- list.files(path='./data/for_testing_updates/scopus_api/affils', + full.names = TRUE) %>% + lapply(read_csv,col_types = cols(.default = "c")) %>% + bind_rows %>% + select(-"@_fa") %>% + mutate(affilname =str_replace_all(affilname, "\\,", "")) %>% + select(-'affiliation-url', + # -"entry_number" + ) %>% + distinct() %>% + mutate(C1=paste(affilname,`affiliation-city`,`affiliation-country`,sep=", ")) %>% + # mutate(C1=paste("[", C1,"]",sep="")) %>% + relocate(C1, .after = afid) %>% + rename("university" = "affilname", + "city" = "affiliation-city", + "country" = "affiliation-country") %>% + # mutate_all(tolower) %>% + mutate_all(trimws) + + + +# read & standardize: SCOPUS authors -------------------------------------- + +scopus_authors <- list.files(path='./data/for_testing_updates/scopus_api/authors', + full.names = TRUE) + +scopus_authors <- scopus_authors %>% + lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once + bind_rows(.id = "csv_id") %>% # bind all tables into one object, and give id for each + mutate(author_key = dplyr::row_number()) %>% + # left_join(scopus_authors2) %>% # join month column created earlier + select(-"@_fa", + -"afid.@_fa") %>% + rename("author_order"="@seq", + "afid"="afid.$", + "first_name"="given-name", + "author_url"="author-url") %>% + # relocate(author_key,SO,PY,entry_number, .before="author_order") %>% + relocate(author_key, .before="author_order") %>% + mutate(first_name = str_replace_all(first_name, "\\. " ,"")) %>% + mutate(first_name = str_replace_all(first_name, "\\." ,"")) %>% + mutate(initials = str_replace_all(initials, "\\. " ,"")) %>% + mutate(initials = str_replace_all(initials, "\\." ,"")) %>% + mutate(last_init = paste(surname, initials, sep=" ")) %>% + mutate(authname = paste(surname, first_name,sep= ", ")) %>% + relocate(authid,authname,last_init, .after="author_key") %>% + rename("author_count"="author_order") %>% + # mutate_all(tolower) %>% + mutate_all(trimws) %>% + unite(refID,csv_id,sep="-",na.rm=TRUE,remove=FALSE) + +scopus_authors +write_csv(scopus_authors,'./data/for_testing_updates/scopus_api/scopus_authors.csv') +rm(scopus_authors2) + + +names(scopus_affils) +# add the affiliations to authors ----------------------------------------- + +# scopus_authors_affils<-scopus_authors %>% left_join(scopus_affils) +names(scopus_authors) +names(scopus_affils) +# Add the name to C1 to make it consistent with WOS +# excludes secondary/current +scopus_authors_affils<-scopus_authors %>% + left_join(scopus_affils) %>% + select(-author_url) %>% + # relocate(SO,.before="PY") %>% + mutate(first_name = str_replace_all(first_name, "\\. " ,"\\.")) %>% + mutate(last_init = paste(surname, initials, sep=" ")) %>% + mutate(authname = paste(surname, first_name,sep= ", ")) %>% + mutate(C1 = paste("[",authname,"] ", C1 ,sep= "")) %>% + # distinct(SO,PY,entry_number,authid,afid,.keep_all = TRUE)%>% + distinct(authid,afid,.keep_all = TRUE)%>% + # mutate_all(tolower) %>% + mutate_all(trimws) %>% + mutate(C1=gsub("Second Author, ","No Inst Given, ",C1)) + +# unique(scopus_authors_affils$C1) + +# are there any remaining? 
+scopus_authors_affils %>% filter(is.na(author_key)) + +write_csv(scopus_authors_affils,'./data/for_testing_updates/scopus_api/scopus_authors_affils.csv') + +# add the authors and affiliations to papers ------------------------------- + +names(scopus_authors_affils) +# author_affils in WIDE FORMAT (authors for each paper) +scopus_authors_affils_wide<-scopus_authors_affils %>% + select( + -"university", + -"city", + -"country", + -"last_init", + -"authid", + # -"C1", + - "afid", + -"surname" , + -"first_name" , + # -"entry_number", + -author_key, + -authname, + # # -SO, + # # -PY, + -"initials") %>% + pivot_wider(names_from = author_count, + id_cols=refID, + values_from = C1, + names_prefix = "C") %>% + # BE SURE TO CHECK HOW MANY C COLUMNS AND EDIT BELOW + unite("C1", C1:last(matches("C[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="./") %>% + mutate(C1 = str_replace_all(C1, " \\[na, na, na]" ," \\[missing]")) +# head(scopus_authors_affils_wide$C1,40) +names(scopus_authors_affils_wide) + + + +# select(scopus_authors_affils, matches("C[[:digit:]]")) +# select(scopus_authors_affils, last(matches("C[[:digit:]]"))) +# scopus_authors_affils$C2<-NULL +# scopus_authors_affils$C3<-NULL +scopus_article_authors_wide<-scopus_authors_affils %>% + distinct() %>% + ungroup() %>% + select(-"C1", + -"university", + -"city", + -"country", + -"last_init", + -"authid", + - "afid", + -"surname" , + -"first_name" , + # -"entry_number", + # -authname, + -author_key, + # -SO, + # -PY, + -"initials") %>% + mutate(orcid=paste(authname,"/",orcid,";",sep="")) %>% + pivot_wider(names_from = author_count, + values_from = c("authname","orcid"), + id_cols=refID, + # names_prefix = c("AF","OI") + ) %>% + rename_with(~ gsub("authname_", "AF", .x, fixed = TRUE)) %>% + rename_with(~ gsub("orcid_", "OI", .x, fixed = TRUE)) %>% + # CHECK HOW MANY AUTHOR COLUMNS + unite("AF", AF1:last(matches("AF[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="; ") %>% + unite("OI", OI1:last(matches("OI[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="") +# mutate(C1 = str_replace_all(C1, " \\[NA, NA, NA]" ," \\[MISSING]")) +# head(scopus_allauthors_wide$AF,40) +names(scopus_article_authors_wide) + + + + +scopus_papers_complete <- scopus_papers %>% + left_join(scopus_authors_affils_wide) %>% + left_join(scopus_article_authors_wide) %>% + rename("filename"="csv_id") + + +# final tweaks to make it possible to use refsplitr on scopus ------------- +names(scopus_papers_complete) +scopus_papers_complete<-scopus_papers_complete %>% + # mutate(AU=if_else(is.na(AU), AF, AU)) %>% + mutate(AU=AF) %>% + # mutate(refID=gsub("scopus_", "", refID)) %>% + mutate(AF=gsub("\\.", "", AF)) %>% + mutate(AU=gsub("\\.", "", AU)) %>% + mutate(AF=gsub("\\;", "\n", AF)) %>% + mutate(AU=gsub("\\;", "\n", AU)) %>% + mutate(AF=gsub("\n ", "\n", AF)) %>% + mutate(AU=gsub("\n ;", "\n", AU)) %>% + # mutate(orcid=paste0(creator, " /", orcid)) %>% # authors clean requires orcid be in wos format "[name]/orcid" + relocate(SO,PY,AF,C1,DI,TI,VL,BP,EP,.after="filename") + +# remove the last semicolon from every orcid id (last character) +scopus_papers_complete<-scopus_papers_complete %>% + mutate(OI=str_sub(OI,end=-2)) +# scopus_papers_complete$EM<-NA + # mutate(refID=as.numeric(refID)*1000) + + + +write_csv(scopus_papers_complete,"./data/for_testing_updates/scopus_api/scopus_refs.csv") +# save csv ---------------------------------------------------------------- From 3c85734403ceb0a07acb2e745ede3b4611305ad9 Mon Sep 17 00:00:00 2001 From: embruna Date: Fri, 9 Aug 2024 13:04:05 -0400 
Subject: [PATCH 4/7] adding draft code for references_read from scopus api

---
 R/references_read_scopus_api.R | 318 +++++++++++++++++++++++++++
 1 file changed, 318 insertions(+)
 create mode 100644 R/references_read_scopus_api.R

diff --git a/R/references_read_scopus_api.R b/R/references_read_scopus_api.R
new file mode 100644
index 0000000..f9e75c7
--- /dev/null
+++ b/R/references_read_scopus_api.R
@@ -0,0 +1,318 @@
+#' Reads SCOPUS API Output
+##' Scopus reference records retrieved via the Scopus API
+#'
+#' \code{references_read_scopus_api} This function reads Scopus
+#' reference data files downloaded via API into an R-friendly data format. The resulting dataframe
+#' is the argument for the refsplitr function `authors_clean()`.
+#'
+#' @param data the location of the file or files to be imported. This can be either the absolute or
+#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
+#' used in conjunction with `dir = TRUE`). If left blank it is assumed the location is the working directory.
+#' @param dir if FALSE it is assumed a single file is to be imported.
+#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=`;
+#' all files in the folder will be imported). Defaults to FALSE.
+#' @param include_all if FALSE only a subset of commonly used fields from reference records are imported.
+#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
+#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
+#' LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.
+#' @export references_read_scopus_api
+#'
+#' @examples
+#' ## If a single file is being imported from a folder called "data" located in an RStudio Project:
+#' ## imported_refs<-references_read_scopus_api(data = './data/refs.txt', dir = FALSE, include_all=FALSE)
+#'
+#' ## If multiple files are being imported from a folder named "heliconia" nested within a folder
+#' ## called "data" located in an RStudio Project:
+#' ## heliconia_refs<-references_read_scopus_api(data = './data/heliconia', dir = TRUE, include_all=FALSE)
+#'
+#' ## To load the Scopus API records used in the examples in the documentation
+#' scopus_api_data <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
+#' scopus_api_example <- references_read_scopus_api(scopus_api_data)
+#'
+#'
+
+# SEE IMPORTANT NOTES ON ORIGINAL 265 and 295 IN TROPICAL SCIENTOMETRIX
+
+
+library(tidyverse)
+# read & standardize: SCOPUS papers ---------------------------------------
+
+scopus_papers <- list.files(path='./data/for_testing_updates/scopus_api/papers',
+                            full.names = TRUE)
+
+scopus_papers <- scopus_papers %>%
+  lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once
+  bind_rows(.id = "filename") %>% # bind all tables into one object, and give id for each
+  rename_all(~str_replace_all(.,"prism","")) %>%
+  rename_all(~str_replace_all(.,"dc","")) %>%
+  rename_all(~str_replace_all(.,"\\:","")) %>%
+  mutate(PY=str_sub(coverDate, 1, 4)) %>%
+  rename("SO"="publicationName",
+         "AB"="description",
+         "DI"="doi",
+         "DT"="subtype",
+         "VL"="volume",
+         "article_type_long"="subtypeDescription",
+         "author_count"='author-count.@total',
+         "IS"='issueIdentifier',
+         "SN"="issn",
+         "EI"="eIssn",
+         "DE"="authkeywords",
+         "PM"="pubmed-id",
+         "TC"="citedby-count",
+         "TI"="title",
+         # "UT"="url",
+
"fund_acr"="fund-acr", + "fund_no"="fund-no", + "fund_sp"="fund-sponsor", + "UT"="identifier", + "OA"="openaccessFlag" + ) %>% + mutate(SO=tolower(SO)) %>% + distinct() %>% + filter(if_any(everything(), is.na)) %>% + filter(!is.na(url)) %>% + # mutate(pub_number = row_number()) %>% + mutate(pageRange=str_replace_all(pageRange,"�","")) %>% + mutate(pageRange=str_replace_all(pageRange,"E-","E")) %>% + separate(pageRange,c("BP","EP"),remove=FALSE,sep="-",extra="merge") %>% + unite(FU,fund_acr,fund_sp,fund_no, sep="-",na.rm=TRUE,remove = TRUE) %>% + unite(refID,filename,entry_number,sep="-",na.rm=TRUE,remove=FALSE) %>% + mutate(DE=str_replace_all(DE,"\\|",";")) %>% + select(-"@_fa", + -"coverDisplayDate", + -"aggregationType", + -"author-count.@limit", + -"openaccess", + -"freetoread.value.$", + -"freetoreadLabel.value.$", + # -"pii", + -'author-count.$', + -"coverDate", + # -"error", + -"eid", + -"url", + -"pageRange", + -article_type_long, + ) %>% + # mutate_all(tolower) %>% + mutate_all(trimws) + + +scopus_papers<-scopus_papers %>% distinct(DI,TI,.keep_all = TRUE) +rm(scopus_papers2) + +write_csv(scopus_papers,file="./data/for_testing_updates/scopus_api/scopus_papers.csv") + +# names(scopus_papers) +# unique(scopus_papers$SO) +# names(scopus_papers) +# head(scopus_papers) +# unique(scopus_papers$DE)[999] + +# unique(scopus_papers$journal) + + +scopus_refs<-scopus_papers %>% + group_by(SO,PY) %>% + tally() + + +scopus_papers %>% + group_by(SO) %>% + tally() + + + + +# read & standardize - SCOPUS affiliations -------------------------------- + + +scopus_affils <- list.files(path='./data/for_testing_updates/scopus_api/affils', + full.names = TRUE) %>% + lapply(read_csv,col_types = cols(.default = "c")) %>% + bind_rows %>% + select(-"@_fa") %>% + mutate(affilname =str_replace_all(affilname, "\\,", "")) %>% + select(-'affiliation-url', + # -"entry_number" + ) %>% + distinct() %>% + mutate(C1=paste(affilname,`affiliation-city`,`affiliation-country`,sep=", ")) %>% + # mutate(C1=paste("[", C1,"]",sep="")) %>% + relocate(C1, .after = afid) %>% + rename("university" = "affilname", + "city" = "affiliation-city", + "country" = "affiliation-country") %>% + # mutate_all(tolower) %>% + mutate_all(trimws) + + + +# read & standardize: SCOPUS authors -------------------------------------- + +scopus_authors <- list.files(path='./data/for_testing_updates/scopus_api/authors', + full.names = TRUE) + +scopus_authors <- scopus_authors %>% + lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once + bind_rows(.id = "filename") %>% # bind all tables into one object, and give id for each + mutate(author_key = dplyr::row_number()) %>% + # left_join(scopus_authors2) %>% # join month column created earlier + select(-"@_fa", + -"afid.@_fa") %>% + rename("author_order"="@seq", + "afid"="afid.$", + "first_name"="given-name", + "author_url"="author-url") %>% + # relocate(author_key,SO,PY,entry_number, .before="author_order") %>% + relocate(author_key,entry_number, .before="author_order") %>% + mutate(first_name = str_replace_all(first_name, "\\. " ,"")) %>% + mutate(first_name = str_replace_all(first_name, "\\." ,"")) %>% + mutate(initials = str_replace_all(initials, "\\. " ,"")) %>% + mutate(initials = str_replace_all(initials, "\\." 
,"")) %>% + mutate(last_init = paste(surname, initials, sep=" ")) %>% + mutate(authname = paste(surname, first_name,sep= ", ")) %>% + relocate(authid,authname,last_init, .after="author_key") %>% + rename("author_count"="author_order") %>% + # mutate_all(tolower) %>% + mutate_all(trimws) %>% + unite(refID,filename,entry_number,sep="-",na.rm=TRUE,remove=FALSE) + +scopus_authors +write_csv(scopus_authors,'./data/for_testing_updates/scopus_api/scopus_authors.csv') +rm(scopus_authors2) + + +names(scopus_affils) +# add the affiliations to authors ----------------------------------------- + +# scopus_authors_affils<-scopus_authors %>% left_join(scopus_affils) +names(scopus_authors) +names(scopus_affils) +# Add the name to C1 to make it consistent with WOS +# excludes secondary/current +scopus_authors_affils<-scopus_authors %>% + left_join(scopus_affils) %>% + select(-author_url) %>% + # relocate(SO,.before="PY") %>% + mutate(first_name = str_replace_all(first_name, "\\. " ,"\\.")) %>% + mutate(last_init = paste(surname, initials, sep=" ")) %>% + mutate(authname = paste(surname, first_name,sep= ", ")) %>% + mutate(C1 = paste("[",authname,"] ", C1 ,sep= "")) %>% + # distinct(SO,PY,entry_number,authid,afid,.keep_all = TRUE)%>% + distinct(entry_number,authid,afid,.keep_all = TRUE)%>% + # mutate_all(tolower) %>% + mutate_all(trimws) %>% + mutate(C1=gsub("Second Author, ","No Inst Given, ",C1)) + +# unique(scopus_authors_affils$C1) + +# are there any remaining? +scopus_authors_affils %>% filter(is.na(author_key)) + +write_csv(scopus_authors_affils,'./data/for_testing_updates/scopus_api/scopus_authors_affils.csv') + +# add the authors and affiliations to papers ------------------------------- + +names(scopus_authors_affils) +# author_affils in WIDE FORMAT (authors for each paper) +scopus_authors_affils_wide<-scopus_authors_affils %>% + select( + -"university", + -"city", + -"country", + -"last_init", + -"authid", + # -"C1", + - "afid", + -"surname" , + -"first_name" , + # -"entry_number", + -author_key, + -authname, + # # -SO, + # # -PY, + -"initials") %>% + pivot_wider(names_from = author_count, + id_cols=refID, + values_from = C1, + names_prefix = "C") %>% + # BE SURE TO CHECK HOW MANY C COLUMNS AND EDIT BELOW + unite("C1", C1:last(matches("C[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="./") %>% + mutate(C1 = str_replace_all(C1, " \\[na, na, na]" ," \\[missing]")) +# head(scopus_authors_affils_wide$C1,40) +names(scopus_authors_affils_wide) + + + +# select(scopus_authors_affils, matches("C[[:digit:]]")) +# select(scopus_authors_affils, last(matches("C[[:digit:]]"))) +# scopus_authors_affils$C2<-NULL +# scopus_authors_affils$C3<-NULL +scopus_article_authors_wide<-scopus_authors_affils %>% + distinct() %>% + ungroup() %>% + select(-"C1", + -"university", + -"city", + -"country", + -"last_init", + -"authid", + - "afid", + -"surname" , + -"first_name" , + # -"entry_number", + # -authname, + -author_key, + # -SO, + # -PY, + -"initials") %>% + mutate(orcid=paste(authname,"/",orcid,";",sep="")) %>% + pivot_wider(names_from = author_count, + values_from = c("authname","orcid"), + id_cols=refID, + # names_prefix = c("AF","OI") + ) %>% + rename_with(~ gsub("authname_", "AF", .x, fixed = TRUE)) %>% + rename_with(~ gsub("orcid_", "OI", .x, fixed = TRUE)) %>% + # CHECK HOW MANY AUTHOR COLUMNS + unite("AF", AF1:last(matches("AF[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="; ") %>% + unite("OI", OI1:last(matches("OI[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="") +# mutate(C1 = str_replace_all(C1, " \\[NA, NA, 
NA]" ," \\[MISSING]")) +# head(scopus_allauthors_wide$AF,40) +names(scopus_article_authors_wide) + + + + +scopus_papers_complete <- scopus_papers %>% + left_join(scopus_authors_affils_wide) %>% + left_join(scopus_article_authors_wide) + + +# final tweaks to make it possible to use refsplitr on scopus ------------- +names(scopus_papers_complete) +scopus_papers_complete<-scopus_papers_complete %>% + # mutate(AU=if_else(is.na(AU), AF, AU)) %>% + mutate(AU=AF) %>% + # mutate(refID=gsub("scopus_", "", refID)) %>% + mutate(AF=gsub("\\.", "", AF)) %>% + mutate(AU=gsub("\\.", "", AU)) %>% + mutate(AF=gsub("\\;", "\n", AF)) %>% + mutate(AU=gsub("\\;", "\n", AU)) %>% + mutate(AF=gsub("\n ", "\n", AF)) %>% + mutate(AU=gsub("\n ;", "\n", AU)) %>% + # mutate(orcid=paste0(creator, " /", orcid)) %>% # authors clean requires orcid be in wos format "[name]/orcid" + relocate(SO,PY,AF,C1,DI,TI,VL,BP,EP,.after="filename") + +# remove the last semicolon from every orcid id (last character) +scopus_papers_complete<-scopus_papers_complete %>% + mutate(OI=str_sub(OI,end=-2)) +# scopus_papers_complete$EM<-NA + # mutate(refID=as.numeric(refID)*1000) + + + +write_csv(scopus_papers_complete,"./data/for_testing_updates/scopus_api/scopus_refs.csv") +# save csv ---------------------------------------------------------------- From 626b64ad679daec27f380574eff02404a2acf252 Mon Sep 17 00:00:00 2001 From: embruna Date: Mon, 12 Aug 2024 10:54:20 -0400 Subject: [PATCH 5/7] cleanup to merge --- NEWS.md | 9 +- R/plot_net_country.R | 5 - R/references_read.R | 3 +- R/references_read_scopus_api.R | 318 ---------------- R/references_read_scopus_download.R | 570 ---------------------------- 5 files changed, 6 insertions(+), 899 deletions(-) delete mode 100644 R/references_read_scopus_api.R delete mode 100644 R/references_read_scopus_download.R diff --git a/NEWS.md b/NEWS.md index 6ca0185..ea7a4d7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,18 +1,17 @@ # refsplitr News -refsplitr 1.X.X (2024-XX-XX) +refsplitr 1.0.2 (2024-08-12) ========================= ### NEW FEATURES - * `references_read` now extracts additional fields from Web of Science records: WE (Source Database), C3 (affiliations*), EI (eISSN) and RID (the original version of the Thomson-Reuters ResearcherID (RI); authors of some older publications might have an RID but not an RI). To include these in the output of`references_read` use the setting `include_all=TRUE`. - *a single cell with list of all affiliations, not brtoken down by author. to match scopus + * `references_read` now extracts additional fields from Web of Science records: WE (Source Database), C3 (all author affiliations, equivalent to the Scopus `affiliations` field code), EI (eISSN), OA (Open Access), and RID (the original version of the Thomson-Reuters ResearcherID (RI); authors of some older publications might have an RID but not an RI). These are not included in the default output of `references_read`, to include them use `include_all = TRUE`. + * `references_read` no longer extracts some rarely used field codes: GE, LT, MC, MI, and TA - * the Document Type (DT), Keywords Plus (ID), Issue (IS), ISO abbreviated source code (JI), and number of references cited in an article (NR) are now returned by default (`include_all=FALSE`). - + * The following field codes are now returned by default when using `references_read`: DT (Document Type), ID (Keywords Plus), IS (Issue), JI (ISO abbreviated source code), and NR (number of references cited by the article). 
 refsplitr 1.0.1 (2024-07-23)
 =========================

diff --git a/R/plot_net_country.R b/R/plot_net_country.R
index a1d8f71..1a16d47 100644
--- a/R/plot_net_country.R
+++ b/R/plot_net_country.R
@@ -57,11 +57,6 @@ plot_net_country <- function(data,

   data <- data[!is.na(data$country), ]

-
-
-
-
-
   # names in WOS often don't match those in rworldmap'
   data<-data %>%
     dplyr::mutate(country=dplyr::case_when(
diff --git a/R/references_read.R b/R/references_read.R
index c0551fa..524d2f6 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -100,6 +100,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "Z9" = character(0),
     "AR" = character(0),
     "WE" = character(0),
+    "OA" = character(0), # Field code for Open Access (added by EB Sept 2024)
     stringsAsFactors = FALSE
   )

@@ -286,7 +287,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     dropnames <- c("CC", "CH", "CL", "CT", "CY",
                   "FX", "GA", "GE", "J9", "LA",
                   "PA", "PI", "PN", "PS", "RID",
-                  "SI", "SU", "VR")
+                  "SI", "SU", "VR", "OA")

     rdo <- dupe_output[, !(names(dupe_output) %in% dropnames)]

diff --git a/R/references_read_scopus_api.R b/R/references_read_scopus_api.R
deleted file mode 100644
index f9e75c7..0000000
--- a/R/references_read_scopus_api.R
+++ /dev/null
@@ -1,318 +0,0 @@
-#' Reads SCOPUS API Output
-##' Scopus reference records retrieved via the Scopus API
-#'
-#' \code{references_read_scopus_api} This function reads Scopus
-#' reference data files downloaded via API into an R-friendly data format. The resulting dataframe
-#' is the argument for the refsplitr function `authors_clean()`.
-#'
-#' @param data the location of the file or files to be imported. This can be either the absolute or
-#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
-#' used in conjunction with `dir = TRUE`). If left blank it is assumed the location is the working directory.
-#' @param dir if FALSE it is assumed a single file is to be imported.
-#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=`;
-#' all files in the folder will be imported). Defaults to FALSE.
-#' @param include_all if FALSE only a subset of commonly used fields from reference records are imported.
-#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
-#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
-#' LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.
-#' @export references_read_scopus_api
-#'
-#' @examples
-#' ## If a single file is being imported from a folder called "data" located in an RStudio Project:
-#' ## imported_refs<-references_read_scopus_api(data = './data/refs.txt', dir = FALSE, include_all=FALSE)
-#'
-#' ## If multiple files are being imported from a folder named "heliconia" nested within a folder
-#' ## called "data" located in an RStudio Project:
-#' ## heliconia_refs<-references_read_scopus_api(data = './data/heliconia', dir = TRUE, include_all=FALSE)
-#'
-#' ## To load the Scopus API records used in the examples in the documentation
-#' scopus_api_data <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
-#' scopus_api_example <- references_read_scopus_api(scopus_api_data)
-#'
-#'
-
-# SEE IMPORTANT NOTES ON ORIGINAL 265 and 295 IN TROPICAL SCIENTOMETRIX
-
-
-library(tidyverse)
-# read & standardize: SCOPUS papers ---------------------------------------
-
-scopus_papers <- list.files(path='./data/for_testing_updates/scopus_api/papers',
-                            full.names = TRUE)
-
-scopus_papers <- scopus_papers %>%
-  lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once
-  bind_rows(.id = "filename") %>% # bind all tables into one object, and give id for each
-  rename_all(~str_replace_all(.,"prism","")) %>%
-  rename_all(~str_replace_all(.,"dc","")) %>%
-  rename_all(~str_replace_all(.,"\\:","")) %>%
-  mutate(PY=str_sub(coverDate, 1, 4)) %>%
-  rename("SO"="publicationName",
-         "AB"="description",
-         "DI"="doi",
-         "DT"="subtype",
-         "VL"="volume",
-         "article_type_long"="subtypeDescription",
-         "author_count"='author-count.@total',
-         "IS"='issueIdentifier',
-         "SN"="issn",
-         "EI"="eIssn",
-         "DE"="authkeywords",
-         "PM"="pubmed-id",
-         "TC"="citedby-count",
-         "TI"="title",
-         # "UT"="url",
-         "fund_acr"="fund-acr",
-         "fund_no"="fund-no",
-         "fund_sp"="fund-sponsor",
-         "UT"="identifier",
-         "OA"="openaccessFlag"
-  ) %>%
-  mutate(SO=tolower(SO)) %>%
-  distinct() %>%
-  filter(if_any(everything(), is.na)) %>%
-  filter(!is.na(url)) %>%
-  # mutate(pub_number = row_number()) %>%
-  mutate(pageRange=str_replace_all(pageRange,"�","")) %>%
-  mutate(pageRange=str_replace_all(pageRange,"E-","E")) %>%
-  separate(pageRange,c("BP","EP"),remove=FALSE,sep="-",extra="merge") %>%
-  unite(FU,fund_acr,fund_sp,fund_no, sep="-",na.rm=TRUE,remove = TRUE) %>%
-  unite(refID,filename,entry_number,sep="-",na.rm=TRUE,remove=FALSE) %>%
-  mutate(DE=str_replace_all(DE,"\\|",";")) %>%
-  select(-"@_fa",
-         -"coverDisplayDate",
-         -"aggregationType",
-         -"author-count.@limit",
-         -"openaccess",
-         -"freetoread.value.$",
-         -"freetoreadLabel.value.$",
-         # -"pii",
-         -'author-count.$',
-         -"coverDate",
-         # -"error",
-         -"eid",
-         -"url",
-         -"pageRange",
-         -article_type_long,
-  ) %>%
-  # mutate_all(tolower) %>%
-  mutate_all(trimws)
-
-
-scopus_papers<-scopus_papers %>% distinct(DI,TI,.keep_all = TRUE)
-rm(scopus_papers2)
-
-write_csv(scopus_papers,file="./data/for_testing_updates/scopus_api/scopus_papers.csv")
-
-# names(scopus_papers)
-# unique(scopus_papers$SO)
-# names(scopus_papers)
-# head(scopus_papers)
-# unique(scopus_papers$DE)[999]
-
-# unique(scopus_papers$journal)
-
-
-scopus_refs<-scopus_papers %>%
-  group_by(SO,PY) %>%
-  tally()
-
-
-scopus_papers %>%
-  group_by(SO) %>%
-  tally()
-
-
-
-
-# read & standardize - SCOPUS affiliations --------------------------------
-
-
-scopus_affils <- list.files(path='./data/for_testing_updates/scopus_api/affils',
-                            full.names = TRUE) %>%
-  lapply(read_csv,col_types = cols(.default = "c")) %>%
-  bind_rows %>%
-
select(-"@_fa") %>% - mutate(affilname =str_replace_all(affilname, "\\,", "")) %>% - select(-'affiliation-url', - # -"entry_number" - ) %>% - distinct() %>% - mutate(C1=paste(affilname,`affiliation-city`,`affiliation-country`,sep=", ")) %>% - # mutate(C1=paste("[", C1,"]",sep="")) %>% - relocate(C1, .after = afid) %>% - rename("university" = "affilname", - "city" = "affiliation-city", - "country" = "affiliation-country") %>% - # mutate_all(tolower) %>% - mutate_all(trimws) - - - -# read & standardize: SCOPUS authors -------------------------------------- - -scopus_authors <- list.files(path='./data/for_testing_updates/scopus_api/authors', - full.names = TRUE) - -scopus_authors <- scopus_authors %>% - lapply(read_csv,col_types = cols(.default = "c")) %>% # read all the files at once - bind_rows(.id = "filename") %>% # bind all tables into one object, and give id for each - mutate(author_key = dplyr::row_number()) %>% - # left_join(scopus_authors2) %>% # join month column created earlier - select(-"@_fa", - -"afid.@_fa") %>% - rename("author_order"="@seq", - "afid"="afid.$", - "first_name"="given-name", - "author_url"="author-url") %>% - # relocate(author_key,SO,PY,entry_number, .before="author_order") %>% - relocate(author_key,entry_number, .before="author_order") %>% - mutate(first_name = str_replace_all(first_name, "\\. " ,"")) %>% - mutate(first_name = str_replace_all(first_name, "\\." ,"")) %>% - mutate(initials = str_replace_all(initials, "\\. " ,"")) %>% - mutate(initials = str_replace_all(initials, "\\." ,"")) %>% - mutate(last_init = paste(surname, initials, sep=" ")) %>% - mutate(authname = paste(surname, first_name,sep= ", ")) %>% - relocate(authid,authname,last_init, .after="author_key") %>% - rename("author_count"="author_order") %>% - # mutate_all(tolower) %>% - mutate_all(trimws) %>% - unite(refID,filename,entry_number,sep="-",na.rm=TRUE,remove=FALSE) - -scopus_authors -write_csv(scopus_authors,'./data/for_testing_updates/scopus_api/scopus_authors.csv') -rm(scopus_authors2) - - -names(scopus_affils) -# add the affiliations to authors ----------------------------------------- - -# scopus_authors_affils<-scopus_authors %>% left_join(scopus_affils) -names(scopus_authors) -names(scopus_affils) -# Add the name to C1 to make it consistent with WOS -# excludes secondary/current -scopus_authors_affils<-scopus_authors %>% - left_join(scopus_affils) %>% - select(-author_url) %>% - # relocate(SO,.before="PY") %>% - mutate(first_name = str_replace_all(first_name, "\\. " ,"\\.")) %>% - mutate(last_init = paste(surname, initials, sep=" ")) %>% - mutate(authname = paste(surname, first_name,sep= ", ")) %>% - mutate(C1 = paste("[",authname,"] ", C1 ,sep= "")) %>% - # distinct(SO,PY,entry_number,authid,afid,.keep_all = TRUE)%>% - distinct(entry_number,authid,afid,.keep_all = TRUE)%>% - # mutate_all(tolower) %>% - mutate_all(trimws) %>% - mutate(C1=gsub("Second Author, ","No Inst Given, ",C1)) - -# unique(scopus_authors_affils$C1) - -# are there any remaining? 
-scopus_authors_affils %>% filter(is.na(author_key)) - -write_csv(scopus_authors_affils,'./data/for_testing_updates/scopus_api/scopus_authors_affils.csv') - -# add the authors and affiliations to papers ------------------------------- - -names(scopus_authors_affils) -# author_affils in WIDE FORMAT (authors for each paper) -scopus_authors_affils_wide<-scopus_authors_affils %>% - select( - -"university", - -"city", - -"country", - -"last_init", - -"authid", - # -"C1", - - "afid", - -"surname" , - -"first_name" , - # -"entry_number", - -author_key, - -authname, - # # -SO, - # # -PY, - -"initials") %>% - pivot_wider(names_from = author_count, - id_cols=refID, - values_from = C1, - names_prefix = "C") %>% - # BE SURE TO CHECK HOW MANY C COLUMNS AND EDIT BELOW - unite("C1", C1:last(matches("C[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="./") %>% - mutate(C1 = str_replace_all(C1, " \\[na, na, na]" ," \\[missing]")) -# head(scopus_authors_affils_wide$C1,40) -names(scopus_authors_affils_wide) - - - -# select(scopus_authors_affils, matches("C[[:digit:]]")) -# select(scopus_authors_affils, last(matches("C[[:digit:]]"))) -# scopus_authors_affils$C2<-NULL -# scopus_authors_affils$C3<-NULL -scopus_article_authors_wide<-scopus_authors_affils %>% - distinct() %>% - ungroup() %>% - select(-"C1", - -"university", - -"city", - -"country", - -"last_init", - -"authid", - - "afid", - -"surname" , - -"first_name" , - # -"entry_number", - # -authname, - -author_key, - # -SO, - # -PY, - -"initials") %>% - mutate(orcid=paste(authname,"/",orcid,";",sep="")) %>% - pivot_wider(names_from = author_count, - values_from = c("authname","orcid"), - id_cols=refID, - # names_prefix = c("AF","OI") - ) %>% - rename_with(~ gsub("authname_", "AF", .x, fixed = TRUE)) %>% - rename_with(~ gsub("orcid_", "OI", .x, fixed = TRUE)) %>% - # CHECK HOW MANY AUTHOR COLUMNS - unite("AF", AF1:last(matches("AF[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="; ") %>% - unite("OI", OI1:last(matches("OI[[:digit:]]")), remove = TRUE,na.rm=TRUE, sep="") -# mutate(C1 = str_replace_all(C1, " \\[NA, NA, NA]" ," \\[MISSING]")) -# head(scopus_allauthors_wide$AF,40) -names(scopus_article_authors_wide) - - - - -scopus_papers_complete <- scopus_papers %>% - left_join(scopus_authors_affils_wide) %>% - left_join(scopus_article_authors_wide) - - -# final tweaks to make it possible to use refsplitr on scopus ------------- -names(scopus_papers_complete) -scopus_papers_complete<-scopus_papers_complete %>% - # mutate(AU=if_else(is.na(AU), AF, AU)) %>% - mutate(AU=AF) %>% - # mutate(refID=gsub("scopus_", "", refID)) %>% - mutate(AF=gsub("\\.", "", AF)) %>% - mutate(AU=gsub("\\.", "", AU)) %>% - mutate(AF=gsub("\\;", "\n", AF)) %>% - mutate(AU=gsub("\\;", "\n", AU)) %>% - mutate(AF=gsub("\n ", "\n", AF)) %>% - mutate(AU=gsub("\n ;", "\n", AU)) %>% - # mutate(orcid=paste0(creator, " /", orcid)) %>% # authors clean requires orcid be in wos format "[name]/orcid" - relocate(SO,PY,AF,C1,DI,TI,VL,BP,EP,.after="filename") - -# remove the last semicolon from every orcid id (last character) -scopus_papers_complete<-scopus_papers_complete %>% - mutate(OI=str_sub(OI,end=-2)) -# scopus_papers_complete$EM<-NA - # mutate(refID=as.numeric(refID)*1000) - - - -write_csv(scopus_papers_complete,"./data/for_testing_updates/scopus_api/scopus_refs.csv") -# save csv ---------------------------------------------------------------- diff --git a/R/references_read_scopus_download.R b/R/references_read_scopus_download.R deleted file mode 100644 index 91d2827..0000000 --- 
a/R/references_read_scopus_download.R
+++ /dev/null
@@ -1,570 +0,0 @@
-#' Reads SCOPUS download Output
-##' Scopus reference records downloaded directly from Scopus in .csv format
-#'
-#' \code{references_read_scopus_download} This function reads Scopus
-#' reference data files downloaded directly as csv into an R-friendly data format. The resulting dataframe
-#' is the argument for the refsplitr function `authors_clean()`.
-#'
-#' @param data the location of the file or files to be imported. This can be either the absolute or
-#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
-#' used in conjunction with `dir = TRUE`). If left blank it is assumed the location is the working directory.
-#' @param dir if FALSE it is assumed a single file is to be imported.
-#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=`;
-#' all files in the folder will be imported). Defaults to FALSE.
-#' @param include_all if FALSE only a subset of commonly used fields from reference records are imported.
-#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
-#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, DT, FX, GA, GE, ID, IS, J9, JI,
-#' LA, LT, MC, MI, NR, PA, PI, PN, PS, RID, SU, TA, VR.
-#' @export references_read_scopus_download
-#'
-#' @examples
-#' ## If a single file is being imported from a folder called "data" located in an RStudio Project:
-#' ## imported_refs<-references_read_scopus_download(data = './data/refs.csv', dir = FALSE, include_all=FALSE)
-#'
-#' ## If multiple files are being imported from a folder named "heliconia" nested within a folder
-#' ## called "data" located in an RStudio Project:
-#' ## heliconia_refs<-references_read_scopus_download(data = './data/heliconia', dir = TRUE, include_all=FALSE)
-#'
-#' ## To load the Scopus download records used in the examples in the documentation
-#' scopus_api_data <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
-#' scopus_api_example <- references_read_scopus_download(scopus_api_data)
-#'
-#'
-
-# SEE IMPORTANT NOTES ON ORIGINAL 265 and 295 IN TROPICAL SCIENTOMETRIX
-
-
-library(tidyverse)
-# read & standardize: SCOPUS papers ---------------------------------------
-
-file_list <- list.files(path='./data/for_testing_updates/scopus_csv',
-                        full.names = TRUE)
-
-references_read_scopus_download <- function(data = ".", dir = FALSE, include_all=FALSE) {
-  ## NOTE: The fields stored in our output table are a combination of the
-  ## "Thomson Reuters Web of Knowledge" FN format and the "ISI Export
-  ## Format" both of which are version 1.0:
-  output <- data.frame(
-    "filename" = character(0),
-    "AB" = character(0),
-    "AF" = character(0),
-    "AU" = character(0),
-    "CA" = character(0),
-    "BP" = character(0),
-    "C1" = character(0),
-    "C3" = character(0),
-    "CC" = character(0),
-    "CH" = character(0),
-    "CL" = character(0),
-    "CR" = character(0),
-    "CT" = character(0),
-    "CY" = character(0),
-    "DE" = character(0),
-    "DI" = character(0),
-    "DT" = character(0),
-    # "EF" = character(0), ## End file
-    "EM" = character(0),
-    "EP" = character(0),
-    # "ER" = character(0), ## End record
-    "FN" = character(0),
-    "FU" = character(0),
-    "FX" = character(0),
-    "GA" = character(0),
-    # "GE" = character(0), (removed by EB Sept 2024)
-    "ID" = character(0),
-    "IS" = character(0),
-    "J9" = character(0),
-    "JI" = character(0),
-    "LA" = character(0),
-    # "LT" = character(0),
"MC" = character(0), - # "MI" = character(0), - "NR" = character(0), - "PA" = character(0), - "PD" = character(0), - "PG" = character(0), - "PI" = character(0), - "PN" = character(0), - "PS" = character(0), - "PT" = character(0), - "PU" = character(0), - "PY" = character(0), - "RI" = character(0), # New field code for Thomson-Reuters ResearcherID - "RID" = character(0), # Original field code for Thomson-Reuters ResearcherID - # Older searchers will have RID (added by EB Sept 2024) - "OI" = character(0), # Field code for ORCID ID (added by EB Jan 2017) - "PM" = character(0), # Pubmed ID Number (added by EB Dec 2017) - "RP" = character(0), - "SC" = character(0), - "SI" = character(0), - "SN" = character(0), - "EI" = character(0), - "SO" = character(0), - "SU" = character(0), - # "TA" = character(0), (removed by EB Sept 2024) - "TC" = character(0), - "TI" = character(0), - "UT" = character(0), - "VR" = character(0), - "VL" = character(0), - "WC" = character(0), - "Z9" = character(0), - "AR" = character(0), - "WE" = character(0), - stringsAsFactors = FALSE - ) - - - ## This is an index for the current record, it gets iterated for each - # record we advance through: - i <- 1 - if (dir) { - file_list <- dir(path = data) - } else { - file_list <- data - } - - - - ## Strip out any files in the directory that aren't Web of Knowledge files: - file_list <- file_list[ grep(".ciw|.csv", file_list) ] - - if (length(file_list) == 0) { - stop("ERROR: The specified file or directory does not contain any - Scopus download records in .csv format!") - } - message("Now processing all references files") - - - - # for (filename in file_list) { - # if (dir) { - # in_file <- file(paste0(data, "/", filename), "r") - # } - # if (!dir) { - # in_file <- file(filename, "r") - # } - # - # field <- "" - # - # ## Process the first line to determine what file type it is: - # ## NOTE: We could add the encoding="UTF-8" flag to the readLines in - # ## order to remove the byte-order mark (BOM) from some exported - # ## files coming out of ISI, but there seems to be a bug in the - # ## readLines() function after bringing a UTF-8 file in, in that - # ## it doesn't respsect the BOM characters. 
-  # for (filename in file_list) {
-  #   if (dir) {
-  #     in_file <- file(paste0(data, "/", filename), "r")
-  #   }
-  #   if (!dir) {
-  #     in_file <- file(filename, "r")
-  #   }
-  #
-  #   field <- ""
-  #
-  #   ## Process the first line to determine what file type it is:
-  #   ## NOTE: We could add the encoding="UTF-8" flag to the readLines in
-  #   ## order to remove the byte-order mark (BOM) from some exported
-  #   ## files coming out of ISI, but there seems to be a bug in the
-  #   ## readLines() function after bringing a UTF-8 file in, in that
-  #   ## it doesn't respect the BOM characters. So we'll just read
-  #   ## the files in with no encoding specified and strip the BOM if needed.
-  #
-  #   read_line <- readLines(in_file, n = 1, warn = FALSE)
-  #
-  #   if (length(read_line) > 0) {
-  #
-  #     read_line <- gsub("^[^A-Z]*([A-Z]+)(.*)$", "\\1\\2", read_line)
-  #
-  #     ## Strip the first two characters from the text line,
-  #     # skip the third (should be a space) and store the rest:
-  #     pre_text <- substr(read_line, 1, 2)
-  #     line_text <- substr(read_line, 4, nchar(read_line))
-  #
-  #     if (pre_text != "FN") {
-  #       close(in_file)
-  #       error <- paste0("ERROR: The file ",
-  #                       filename,
-  #                       " doesn't appear to be a valid ISI or
-  #                       Thomson Reuters reference library file!")
-  #       stop(error)
-  #     }
-  #
-  #     ## Check to see if this is a "ISI Export Format" file, in which
-  #     ## case we need to parse out the first line into three fields:
-  #     if (substr(line_text, 1, 3) == "ISI") {
-  #       field <- pre_text
-  #
-  #       ## Pull apart the FN, VR and PT fields all contained on the first
-  #       ## line of the ISI file format:
-  #       matches <- regexec(
-  #         "^(.*) VR (.*) PT (.*)",
-  #         line_text
-  #       )
-  #
-  #       match_strings <- regmatches(
-  #         line_text,
-  #         matches
-  #       )
-  #
-  #       ## Store those fields:
-  #       output[i, "FN"] <- paste(match_strings[[1]][2], "\n", sep = "")
-  #       output[i, "VR"] <- paste(match_strings[[1]][3], "\n", sep = "")
-  #       output[i, "PT"] <- paste(match_strings[[1]][4], "\n", sep = "")
-  #     } else {
-  #       ## If this is not an ISI export format then just parse the first
-  #       ## line normally into the FN field:
-  #       field <- pre_text
-  #       if (field %in% names(output)) {
-  #         output[i, field] <- ""
-  #         output[i, field] <- trimws(
-  #           ifelse(length(line_text) == 1,
-  #                  paste(output[i, field], line_text,
-  #                        sep = "\n")),
-  #           "both")
-  #       }
-  #     }
-  #   } else {
-  #     utils::flush.console()
-  #     stop("WARNING: Nothing contained in the specified file!")
-  #   }
-  #
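
# A toy illustration of the regexec() idiom used above to split the packed ISI
# header line into its FN, VR, and PT parts (the input string is made up):
line_text <- "ISI Export Format VR 1.0 PT Journal"
m <- regmatches(line_text, regexec("^(.*) VR (.*) PT (.*)", line_text))[[1]]
m[2] # "ISI Export Format"
m[3] # "1.0"
m[4] # "Journal"
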
-  #   ## Process the remaining lines in the file (see the note above about
-  #   ## the encoding= flag and necessity for it, but why we didn't use it):
-  #   while (length(read_line <- readLines(in_file, n = 1, warn = FALSE)) > 0) {
-  #     ## Strip the first three characters from the text line:
-  #     pre_text <- substr(read_line, 1, 2)
-  #
-  #     line_text <- substr(read_line, 4, nchar(read_line))
-  #
-  #     ## Check to see if this is a new field:
-  #     if (pre_text != "  ") {
-  #       field <- pre_text
-  #       ## If the field is in our file and in our data structure then
-  #       ## initialize it to an empty string:
-  #       if (field %in% names(output)) {
-  #         output[i, field] <- ""
-  #       }
-  #     }
-  #
-  #     ## Check to see if the current field is one we are saving to output:
-  #     if (field %in% names(output)) {
-  #       ## ... if it is then append this line's data to the field in our output:
-  #       output[i, field] <- trimws(
-  #         ifelse(length(line_text) == 1,
-  #                paste(output[i, field], line_text, sep = "\n")),
-  #         "both")
-  #     }
-  #
-  #     # If this is the end of a record then add any per-record items and
-  #     # advance our row:
-  #     if (field == "ER") {
-  #       output[i, "filename"] <- filename
-  #
-  #       ## These fields are not repeated for every record, so we set them
-  #       ## from the first record where they were recorded:
-  #       output[i, "FN"] <- output[1, "FN"]
-  #       output[i, "VR"] <- output[1, "VR"]
-  #
-  #       i <- i + 1
-  #     }
-  #   }
-  #
-  #   close(in_file)
-  #   ############################### Clock ####################################
-  #   total <- length(file_list)
-  #   pb <- utils::txtProgressBar(min = 0, max = total, style = 3)
-  #   utils::setTxtProgressBar(pb, counter)
-  #   counter <- counter + 1
-  #   utils::flush.console()
-  #   ###########################################################################
-  # }
-
-output <- file_list %>%
-  lapply(read_csv, col_types = cols(.default = "c")) %>% # read all the files at once
-  bind_rows(.id = "csv_id") %>% # bind all tables into one object, and give id for each
-  rename_all(~ str_replace_all(., "prism", "")) %>%
-  rename_all(~ str_replace_all(., "dc", "")) %>%
-  rename_all(~ str_replace_all(., "\\:", "")) %>%
-  # mutate(PY = str_sub(coverDate, 1, 4)) %>%
-  rename("SO" = "Source title",
-         "AB" = "Abstract",
-         "DI" = "DOI",
-         "DT" = "Document Type",
-         "VL" = "Volume",
-         "filename" = "csv_id",
-         "TI" = "Title",
-         "PY" = "Year",
-         "IS" = "Issue",
-         "AR" = "Art. No.",
-         "BP" = "Page start",
-         "EP" = "Page end",
-         "PG" = "Page count",
-         "DE" = "Author Keywords",
-         "RP" = "Correspondence Address",
-         "BE" = "Editors",
-         "PU" = "Publisher",
-         "SN" = "ISSN",
-         "BN" = "ISBN",
-         "PM" = "PubMed ID",
-         "LA" = "Language of Original Document",
-         "JI" = "Abbreviated Source Title",
-         "OA" = "Open Access",
-         "UT" = "EID",
-         "TC" = "Cited by",
-         "ID" = "Index Keywords",
-         "FU" = "Funding Details",
-         "FX" = "Funding Texts",
-         "PubStage" = "Publication Stage",
-         "CODEN" = "CODEN",
-         "WE" = "Source",
-         "URL" = "Link",
-         "C3" = "Affiliations", # in WOS csv download it is also "Affiliations"
-         "C1" = "Authors with affiliations", # "Addresses" in WOS csv downloads
-         "AU" = "Authors",
-         "AF" = "Author full names",
-         "SID" = "Author(s) ID" # Scopus ID number
-  ) %>% # duplicated SO, VL, DI, and DT entries removed; rename() requires unique names
-  mutate(SO = tolower(SO)) %>%
-  distinct() %>%
-  filter(!if_all(everything(), is.na)) %>% # drop records that are entirely empty
-  filter(!is.na(URL)) %>%
-  mutate(PG = str_replace_all(PG, "�", "")) %>%
-  mutate(PG = str_replace_all(PG, "E-", "E")) %>%
-  # separate(PG, c("BP", "EP"), remove = FALSE, sep = "-", extra = "merge") %>%
-  unite(FU, FX, sep = "-", na.rm = TRUE, remove = TRUE) %>%
-  mutate(refID = row_number(), .before = 1) %>%
-  unite("refID", refID:filename, sep = "-", na.rm = TRUE, remove = FALSE) %>%
-  mutate(DE = str_replace_all(DE, "\\|", ";")) %>%
-  # select(-"@_fa",
-  #        -"coverDisplayDate",
-  #        -"aggregationType",
-  #        -"author-count.@limit",
-  #        -"openaccess",
-  #        -"freetoread.value.$",
-  #        -"freetoreadLabel.value.$",
-  #        # -"pii",
-  #        -'author-count.$',
-  #        -"coverDate",
-  #        # -"error",
-  #        -"eid",
-  #        -"url",
-  #        -"pageRange",
-  #        -article_type_long,
-  # ) %>%
-  # mutate_all(tolower) %>%
-  mutate_all(trimws)
-
-output <- output %>% distinct(DI, TI, .keep_all = TRUE)
-output
-
-write_csv(output, file = "./data/for_testing_updates/scopus_api/scopus_papers.csv")
-scopus_papers <- output # the steps below refer to the papers table by this name
-
-# names(scopus_papers)
-# unique(scopus_papers$SO)
-# head(scopus_papers)
-# unique(scopus_papers$DE)[999]
-# unique(scopus_papers$journal)
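
# How bind_rows(.id = "csv_id") tags every record with the file it came from,
# sketched with two in-memory tables standing in for the csv files:
library(dplyr)
d1 <- data.frame(Title = "Paper A")
d2 <- data.frame(Title = "Paper B")
bind_rows(list(d1, d2), .id = "csv_id")
#   csv_id   Title
# 1      1 Paper A
# 2      2 Paper B
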
-scopus_refs <- scopus_papers %>%
-  group_by(SO, PY) %>%
-  tally()
-
-scopus_papers %>%
-  group_by(SO) %>%
-  tally()
-
-# read & standardize - SCOPUS affiliations --------------------------------
-
-scopus_affils <- list.files(path = './data/for_testing_updates/scopus_api/affils',
-                            full.names = TRUE) %>%
-  lapply(read_csv, col_types = cols(.default = "c")) %>%
-  bind_rows() %>%
-  select(-"@_fa") %>%
-  mutate(affilname = str_replace_all(affilname, "\\,", "")) %>%
-  select(-'affiliation-url',
-         # -"entry_number"
-  ) %>%
-  distinct() %>%
-  mutate(C1 = paste(affilname, `affiliation-city`, `affiliation-country`, sep = ", ")) %>%
-  # mutate(C1 = paste("[", C1, "]", sep = "")) %>%
-  relocate(C1, .after = afid) %>%
-  rename("university" = "affilname",
-         "city" = "affiliation-city",
-         "country" = "affiliation-country") %>%
-  # mutate_all(tolower) %>%
-  mutate_all(trimws)
-
-# read & standardize: SCOPUS authors --------------------------------------
-
-scopus_authors <- list.files(path = './data/for_testing_updates/scopus_api/authors',
-                             full.names = TRUE)
-
-scopus_authors <- scopus_authors %>%
-  lapply(read_csv, col_types = cols(.default = "c")) %>% # read all the files at once
-  bind_rows(.id = "csv_id") %>% # bind all tables into one object, and give id for each
-  mutate(author_key = dplyr::row_number()) %>%
-  # left_join(scopus_authors2) %>% # join month column created earlier
-  select(-"@_fa",
-         -"afid.@_fa") %>%
-  rename("author_order" = "@seq",
-         "afid" = "afid.$",
-         "first_name" = "given-name",
-         "author_url" = "author-url") %>%
-  # relocate(author_key, SO, PY, entry_number, .before = "author_order") %>%
-  relocate(author_key, .before = "author_order") %>%
-  mutate(first_name = str_replace_all(first_name, "\\. ", "")) %>%
-  mutate(first_name = str_replace_all(first_name, "\\.", "")) %>%
-  mutate(initials = str_replace_all(initials, "\\. ", "")) %>%
-  mutate(initials = str_replace_all(initials, "\\.", "")) %>%
-  mutate(last_init = paste(surname, initials, sep = " ")) %>%
-  mutate(authname = paste(surname, first_name, sep = ", ")) %>%
-  relocate(authid, authname, last_init, .after = "author_key") %>%
-  rename("author_count" = "author_order") %>%
-  # mutate_all(tolower) %>%
-  mutate_all(trimws) %>%
-  unite(refID, csv_id, sep = "-", na.rm = TRUE, remove = FALSE)
-
-scopus_authors
-write_csv(scopus_authors, './data/for_testing_updates/scopus_api/scopus_authors.csv')
-# rm(scopus_authors2) # scopus_authors2 is never created (the left_join above is commented out)
-
-names(scopus_affils)
-
-# add the affiliations to authors -----------------------------------------
-
-# scopus_authors_affils <- scopus_authors %>% left_join(scopus_affils)
-names(scopus_authors)
-names(scopus_affils)
-
-# Add the name to C1 to make it consistent with WOS
-# excludes secondary/current
-scopus_authors_affils <- scopus_authors %>%
-  left_join(scopus_affils) %>%
-  select(-author_url) %>%
-  # relocate(SO, .before = "PY") %>%
-  mutate(first_name = str_replace_all(first_name, "\\. ", "\\.")) %>%
-  mutate(last_init = paste(surname, initials, sep = " ")) %>%
-  mutate(authname = paste(surname, first_name, sep = ", ")) %>%
-  mutate(C1 = paste("[", authname, "] ", C1, sep = "")) %>%
-  # distinct(SO, PY, entry_number, authid, afid, .keep_all = TRUE) %>%
-  distinct(authid, afid, .keep_all = TRUE) %>%
-  # mutate_all(tolower) %>%
-  mutate_all(trimws) %>%
-  mutate(C1 = gsub("Second Author, ", "No Inst Given, ", C1))
-
-# unique(scopus_authors_affils$C1)
-
-# are there any remaining?
-scopus_authors_affils %>% filter(is.na(author_key))
-
-write_csv(scopus_authors_affils, './data/for_testing_updates/scopus_api/scopus_authors_affils.csv')
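
# What the C1 step above builds: a WOS-style address string. For example,
# with a made-up author and affiliation:
authname <- "Bruna, Emilio"
affil <- "University of Florida, Gainesville, United States"
paste("[", authname, "] ", affil, sep = "")
# [1] "[Bruna, Emilio] University of Florida, Gainesville, United States"
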
-# add the authors and affiliations to papers -------------------------------
-
-names(scopus_authors_affils)
-
-# author_affils in WIDE FORMAT (authors for each paper)
-scopus_authors_affils_wide <- scopus_authors_affils %>%
-  select(-"university",
-         -"city",
-         -"country",
-         -"last_init",
-         -"authid",
-         # -"C1",
-         -"afid",
-         -"surname",
-         -"first_name",
-         # -"entry_number",
-         -author_key,
-         -authname,
-         # -SO,
-         # -PY,
-         -"initials") %>%
-  pivot_wider(names_from = author_count,
-              id_cols = refID,
-              values_from = C1,
-              names_prefix = "C") %>%
-  # BE SURE TO CHECK HOW MANY C COLUMNS AND EDIT BELOW
-  unite("C1", C1:last(matches("C[[:digit:]]")), remove = TRUE, na.rm = TRUE, sep = "./") %>%
-  mutate(C1 = str_replace_all(C1, " \\[na, na, na]", " \\[missing]"))
-
-# head(scopus_authors_affils_wide$C1, 40)
-names(scopus_authors_affils_wide)
-
-# select(scopus_authors_affils, matches("C[[:digit:]]"))
-# select(scopus_authors_affils, last(matches("C[[:digit:]]")))
-# scopus_authors_affils$C2 <- NULL
-# scopus_authors_affils$C3 <- NULL
-scopus_article_authors_wide <- scopus_authors_affils %>%
-  distinct() %>%
-  ungroup() %>%
-  select(-"C1",
-         -"university",
-         -"city",
-         -"country",
-         -"last_init",
-         -"authid",
-         -"afid",
-         -"surname",
-         -"first_name",
-         # -"entry_number",
-         # -authname,
-         -author_key,
-         # -SO,
-         # -PY,
-         -"initials") %>%
-  mutate(orcid = paste(authname, "/", orcid, ";", sep = "")) %>%
-  pivot_wider(names_from = author_count,
-              values_from = c("authname", "orcid"),
-              id_cols = refID,
-              # names_prefix = c("AF", "OI")
-  ) %>%
-  rename_with(~ gsub("authname_", "AF", .x, fixed = TRUE)) %>%
-  rename_with(~ gsub("orcid_", "OI", .x, fixed = TRUE)) %>%
-  # CHECK HOW MANY AUTHOR COLUMNS
-  unite("AF", AF1:last(matches("AF[[:digit:]]")), remove = TRUE, na.rm = TRUE, sep = "; ") %>%
-  unite("OI", OI1:last(matches("OI[[:digit:]]")), remove = TRUE, na.rm = TRUE, sep = "")
-# mutate(C1 = str_replace_all(C1, " \\[NA, NA, NA]", " \\[MISSING]"))
-
-# head(scopus_article_authors_wide$AF, 40)
-names(scopus_article_authors_wide)
-
-scopus_papers_complete <- scopus_papers %>%
-  left_join(scopus_authors_affils_wide) %>%
-  left_join(scopus_article_authors_wide)
-# rename("filename" = "csv_id") # not needed: csv_id was already renamed to filename above
-
-# final tweaks to make it possible to use refsplitr on scopus -------------
-names(scopus_papers_complete)
-scopus_papers_complete <- scopus_papers_complete %>%
-  # mutate(AU = if_else(is.na(AU), AF, AU)) %>%
-  mutate(AU = AF) %>%
-  # mutate(refID = gsub("scopus_", "", refID)) %>%
-  mutate(AF = gsub("\\.", "", AF)) %>%
-  mutate(AU = gsub("\\.", "", AU)) %>%
-  mutate(AF = gsub("\\;", "\n", AF)) %>%
-  mutate(AU = gsub("\\;", "\n", AU)) %>%
-  mutate(AF = gsub("\n ", "\n", AF)) %>%
-  mutate(AU = gsub("\n ;", "\n", AU)) %>%
-  # mutate(orcid = paste0(creator, " /", orcid)) %>% # authors_clean() requires orcid in wos format "[name]/orcid"
-  relocate(SO, PY, AF, C1, DI, TI, VL, BP, EP, .after = "filename")
-
-# remove the last semicolon from every orcid id (last character)
-scopus_papers_complete <- scopus_papers_complete %>%
-  mutate(OI = str_sub(OI, end = -2))
-# scopus_papers_complete$EM <- NA
-# mutate(refID = as.numeric(refID) * 1000)
-
-write_csv(scopus_papers_complete, "./data/for_testing_updates/scopus_api/scopus_refs.csv")
-
-# save csv ----------------------------------------------------------------
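The wide-format blocks in the draft above do the heavy lifting: they pivot the one-row-per-author table into one row per paper. A minimal sketch of the same pivot_wider() call, with made-up values rather than package data:

library(tidyr)
long <- data.frame(refID = c("1-a", "1-a"),
                   author_count = c(1, 2),
                   C1 = c("[A] Univ X", "[B] Univ Y"))
pivot_wider(long, id_cols = refID, names_from = author_count,
            values_from = C1, names_prefix = "C")
#   refID         C1         C2
# 1   1-a [A] Univ X [B] Univ Y

From 3e98fe0a1122cd506bc3253135a0543e12545ceb Mon Sep 17 00:00:00 2001
From: embruna
Date: Mon, 12 Aug 2024 11:00:20 -0400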
Subject: [PATCH 6/7] style/formatting

---
 R/references_read.R | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/R/references_read.R b/R/references_read.R
index 524d2f6..ed652d9 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -59,15 +59,11 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "FU" = character(0),
     "FX" = character(0),
     "GA" = character(0),
-    # "GE" = character(0), (removed by EB Sept 2024)
     "ID" = character(0),
     "IS" = character(0),
     "J9" = character(0),
     "JI" = character(0),
     "LA" = character(0),
-    # "LT" = character(0),
-    # "MC" = character(0),
-    # "MI" = character(0),
     "NR" = character(0),
     "PA" = character(0),
     "PD" = character(0),
@@ -78,11 +74,10 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "PT" = character(0),
     "PU" = character(0),
     "PY" = character(0),
-    "RI" = character(0), # New field code for Thomson-Reuters ResearcherID
-    "RID" = character(0), # Original field code for Thomson-Reuters ResearcherID
-    # Older searchers will have RID (added by EB Sept 2024)
-    "OI" = character(0), # Field code for ORCID ID (added by EB Jan 2017)
-    "PM" = character(0), # Pubmed ID Number (added by EB Dec 2017)
+    "RI" = character(0), # New Thomson-Reuters ResearcherID
+    "RID" = character(0), # Original Thomson-Reuters ResearcherID
+    "OI" = character(0), # ORCID
+    "PM" = character(0), # Pubmed ID Number
     "RP" = character(0),
     "SC" = character(0),
     "SI" = character(0),
@@ -90,7 +85,6 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "EI" = character(0),
     "SO" = character(0),
     "SU" = character(0),
-    # "TA" = character(0), (removed by EB Sept 2024)
     "TC" = character(0),
     "TI" = character(0),
     "UT" = character(0),
@@ -100,7 +94,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "Z9" = character(0),
     "AR" = character(0),
     "WE" = character(0),
-    "OA" = character(0), # Field code for Open Acceess (added by EB Sept 2024)
+    "OA" = character(0), # Open Access
     stringsAsFactors = FALSE
   )
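A quick way to see which fields the default call drops relative to include_all = TRUE (a sketch; it assumes refsplitr is installed and uses the bundled test file from the examples):

library(refsplitr)
BITR_data_example <- system.file("extdata", "BITR_test.txt", package = "refsplitr")
all_fields <- names(references_read(BITR_data_example, include_all = TRUE))
default_fields <- names(references_read(BITR_data_example, include_all = FALSE))
setdiff(all_fields, default_fields) # the dropped field codes
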
From 69f3e7190f0e029245a1c440f946175a0f907c0e Mon Sep 17 00:00:00 2001
From: embruna
Date: Mon, 12 Aug 2024 11:00:39 -0400
Subject: [PATCH 7/7] style/formatting

---
 R/references_read.R | 100 ++++++++++++++++++++++++--------------------
 1 file changed, 54 insertions(+), 46 deletions(-)

diff --git a/R/references_read.R b/R/references_read.R
index ed652d9..9154a8b 100644
--- a/R/references_read.R
+++ b/R/references_read.R
@@ -2,34 +2,33 @@
 #'
 #' \code{references_read} This function reads Thomson Reuters Web of Knowledge
 #' and ISI format reference data files into an R-friendly data format. The resulting dataframe
-#' is the argument for the refsplitr function `authors_clean()`. 
+#' is the argument for the refsplitr function `authors_clean()`.
 #'
-#' @param data the location of the file or files to be imported. This can be either the absolute or 
-#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder; 
+#' @param data the location of the file or files to be imported. This can be either the absolute or
+#' relative name of the file (for a single file) or folder (for multiple files stored in the same folder;
 #' used in conjunction with `dir = TRUE``). If left blank it is assumed the location is the working directory.
-#' @param dir if FALSE it is assumed a single file is to be imported. 
-#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=``; 
-#' all files in the folder will be imported). Defaults to FALSE. 
-#' @param include_all if FALSE only a subset of commonly used fields from references records are imported. 
-#' If TRUE then all fields from the reference records are imported. Defaults to FALSE. 
-#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, FX, GA, J9, 
+#' @param dir if FALSE it is assumed a single file is to be imported.
+#' Set to TRUE if importing multiple files (the path to the folder in which files are stored is set with `data=``;
+#' all files in the folder will be imported). Defaults to FALSE.
+#' @param include_all if FALSE only a subset of commonly used fields from references records are imported.
+#' If TRUE then all fields from the reference records are imported. Defaults to FALSE.
+#' The additional data fields included if `include_all=TRUE`: CC, CH, CL, CT, CY, FX, GA, J9,
 #' LA, PA, PI, PN, PS, RID, SU, VR.
 #' @export references_read
-#' 
-#' @examples 
-#' ## If a single files is being imported from a folder called "data" located in an RStudio Project: 
+#'
+#' @examples
+#' ## If a single files is being imported from a folder called "data" located in an RStudio Project:
 #' ## imported_refs<-references_read(data = './data/refs.txt', dir = FALSE, include_all=FALSE)
-#' 
+#'
 #' ## If multiple files are being imported from a folder named "heliconia" nested within a folder
-#' ## called "data" located in an RStudio Project: 
+#' ## called "data" located in an RStudio Project:
 #' ## heliconia_refs<-references_read(data = './data/heliconia', dir = TRUE, include_all=FALSE)
-#' 
-#' ## To load the Web of Science records used in the examples in the documentation 
-#' BITR_data_example <- system.file('extdata', 'BITR_test.txt', package = 'refsplitr')
+#'
+#' ## To load the Web of Science records used in the examples in the documentation
+#' BITR_data_example <- system.file("extdata", "BITR_test.txt", package = "refsplitr")
 #' BITR <- references_read(BITR_data_example)
-#' 
-#' 
-references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
+#'
+references_read <- function(data = ".", dir = FALSE, include_all = FALSE) {
   ## NOTE: The fields stored in our output table are a combination of the
   ## "Thomson Reuters Web of Knowledge" FN format and the "ISI Export
   ## Format" both of which are version 1.0:
@@ -74,10 +73,10 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "PT" = character(0),
     "PU" = character(0),
     "PY" = character(0),
-    "RI" = character(0),  # New Thomson-Reuters ResearcherID
-    "RID" = character(0), # Original Thomson-Reuters ResearcherID
-    "OI" = character(0),  # ORCID
-    "PM" = character(0),  # Pubmed ID Number
+    "RI" = character(0), # New Thomson-Reuters ResearcherID
+    "RID" = character(0), # Original Thomson-Reuters ResearcherID
+    "OI" = character(0), # ORCID
+    "PM" = character(0), # Pubmed ID Number
     "RP" = character(0),
     "SC" = character(0),
     "SI" = character(0),
@@ -94,7 +93,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     "Z9" = character(0),
     "AR" = character(0),
     "WE" = character(0),
-    "OA" = character(0),  # Open Access
+    "OA" = character(0), # Open Access
     stringsAsFactors = FALSE
   )
 
@@ -108,7 +107,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
   }
 
   ## Strip out any files in the directory that aren't Web of Knowledge files:
-  file_list <- file_list[ grep(".ciw|.txt", file_list) ]
+  file_list <- file_list[grep(".ciw|.txt", file_list)]
 
   if (length(file_list) == 0) {
     stop("ERROR: The specified file or directory does not contain any
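
# The hunk above tightens the file filter; the same idiom in isolation, with
# toy filenames:
files <- c("refs.txt", "notes.md", "old.ciw")
files[grep(".ciw|.txt", files)]
# [1] "refs.txt" "old.ciw"
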
@@ -138,7 +137,6 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
   read_line <- readLines(in_file, n = 1, warn = FALSE)
 
   if (length(read_line) > 0) {
-
     read_line <- gsub("^[^A-Z]*([A-Z]+)(.*)$", "\\1\\2", read_line)
 
     ## Strip the first two characters from the text line,
@@ -148,10 +146,12 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
 
     if (pre_text != "FN") {
       close(in_file)
-      error <- paste0("ERROR: The file ",
+      error <- paste0(
+        "ERROR: The file ",
         filename,
         " doesn't appear to be a valid ISI or
-        Thomson Reuters reference library file!")
+        Thomson Reuters reference library file!"
+      )
       stop(error)
     }
@@ -187,8 +187,11 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
         output[i, field] <- trimws(
           ifelse(length(line_text) == 1,
             paste(output[i, field], line_text,
-              sep = "\n")),
-          "both")
+              sep = "\n"
+            )
+          ),
+          "both"
+        )
       }
     }
   } else {
@@ -216,12 +219,14 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
 
     ## Check to see if the current field is one we are saving to output:
     if (field %in% names(output)) {
-      ##... if it is then append this line's data to the field in our output:
+      ## ... if it is then append this line's data to the field in our output:
 
       output[i, field] <- trimws(
         ifelse(length(line_text) == 1,
-          paste(output[i, field], line_text, sep = "\n")),
-        "both")
+          paste(output[i, field], line_text, sep = "\n")
+        ),
+        "both"
+      )
     }
 
     # If this is the end of a record then add any per-record items and
@@ -247,7 +252,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
     counter <- counter + 1
     utils::flush.console()
     ###########################################################################
-  }
+  }
   ############################################## 3
   # Post Processing
   # We need to clean this file, page breaks are inserted in the raw file
@@ -256,7 +261,7 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
   output$RI <- gsub("\n", "", output$RI, fixed = TRUE)
   output$RI <- gsub("; ", ";", output$RI, fixed = TRUE)
 
-  #This fixes a problem where in earlier WOS pulls RI is stored
+  # This fixes a problem where in earlier WOS pulls RI is stored
   # as RID with no name associated
   output$RI[!grepl("/", output$RI)] <- NA
   output$OI <- gsub("\n", "", output$OI, fixed = TRUE)
@@ -267,24 +272,27 @@ references_read <- function(data = ".", dir = FALSE, include_all=FALSE) {
 
   output$refID <- seq_len(nrow(output))
 
   # now done in base R, runs slower
-  dupe_output <- do.call(rbind, lapply(unique(output$UT),
-    function(x) output[output$UT == x, ][1, ]))
+  dupe_output <- do.call(rbind, lapply(
+    unique(output$UT),
+    function(x) output[output$UT == x, ][1, ]
+  ))
 
   ############################################
   # Prepare for printing
 
-  if (include_all == TRUE){
+  if (include_all == TRUE) {
     return(dupe_output)
   }
 
-  if (include_all != TRUE){
-
-    dropnames <- c("CC", "CH", "CL", "CT", "CY",
-                   "FX", "GA", "GE", "J9", "LA",
-                   "PA", "PI", "PN", "PS", "RID",
-                   "SI", "SU", "VR", "OA")
+  if (include_all != TRUE) {
+    dropnames <- c(
+      "CC", "CH", "CL", "CT", "CY",
+      "FX", "GA", "GE", "J9", "LA",
+      "PA", "PI", "PN", "PS", "RID",
+      "SI", "SU", "VR", "OA"
+    )
 
     rdo <- dupe_output[, !(names(dupe_output) %in% dropnames)]
 
     return(rdo)
   }
-}
\ No newline at end of file
+}
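For reference, the RI cleanup in the post-processing block works like this sketch (the IDs are made up); any entry lacking the "name/ID" pattern is blanked:

RI <- c("Smith, J/A-1234-2010", "B-5678-2011")
RI[!grepl("/", RI)] <- NA
RI
# [1] "Smith, J/A-1234-2010" NA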