diff --git a/.gitignore b/.gitignore
index e75435c..69e2a14 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,49 +1,37 @@
# History files
.Rhistory
.Rapp.history
-
# Session Data files
.RData
.RDataTmp
-
# User-specific files
.Ruserdata
-
# Example code in package build process
*-Ex.R
-
# Output files from R CMD build
/*.tar.gz
-
# Output files from R CMD check
/*.Rcheck/
-
# RStudio files
.Rproj.user/
-
# produced vignettes
vignettes/*.html
vignettes/*.pdf
-
# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth
-
# knitr and R markdown default cache directories
*_cache/
/cache/
-
# Temporary files created by R markdown
*.utf8.md
*.knit.md
-
# R Environment Variables
.Renviron
-
# pkgdown site
docs/
-
# translation temp files
po/*~
-
# RStudio Connect folder
rsconnect/
+record_linkage.Rproj
+deduplicate.html
diff --git a/missing_gender_result.csv b/data-raw/missing_gender_result.csv
similarity index 100%
rename from missing_gender_result.csv
rename to data-raw/missing_gender_result.csv
diff --git a/deduplicate.R b/deduplicate.R
deleted file mode 100644
index fa3e6c0..0000000
--- a/deduplicate.R
+++ /dev/null
@@ -1,774 +0,0 @@
-##################################################
-### A script workflow for Record linkage ----------
-##################################################
-
-library(tidyverse)
-# install.packages("fastLink")
-library(fastLink)
-
-## Load the data -
-# which is here already the results of merging multiple list from different excel files
-data <- readxl::read_excel(here::here("data-raw", "Registros2.xlsx"),
- sheet = "Sheet1",
- col_types = c("numeric",
- "text", "text", "text", "text", "text",
- "text", "text", "date", "numeric",
- "numeric", "numeric", "text", "text",
- "text", "text", "text", "text", "text",
- "text", "text", "text", "text", "text",
- "text", "text", "text", "text", "text",
- "text", "text")) |>
- janitor::clean_names()
-
-
-#dput(names(data))
-## Cleaning functions ###############
-
-
-#' clean_age
-#'
-#' This functions cleans the age identifiers in the context of record linkage.
-#'
-#' If the date of birth is present, it will parse it, extract year and month,
-#' and recalculate both age and age range
-#'
-#' If the date of birth is not mentioned, then it will use age and date_record
-#' to reconstruct an estimated date of birth - and recalculate the rest
-#'
-#' @param frame frame with the data
-#' @param date_birth variable name for the date of birth in the frame
-#' @param date_record variable name for the date of birth in the frame
-#' @param age variable name for the date of birth in the frame
-#' @param age_range variable name for the date of birth in the frame
-#'
-#' @return the same column but cleaned...
-#'
-#' @export
-clean_age <- function(frame,
- date_birth,
- date_record,
- age,
- age_range){
-
- #frame$date_birth
- frame2 <- frame |>
- ## Rename variable
- dplyr::rename( date_birth = paste0(date_birth),
- date_record = paste0(date_record),
- age = paste0(age),
- age_range = paste0(age_range) ) |>
- dplyr::mutate(
- ## In case there is no date of birth - but we have an age, we recalculate it..
-
- date_birth = dplyr::case_when(
- # Case we age but no DOB and date registration
- is.na(date_birth) & !(is.na(age)) & !(is.na(date_record)) ~
- lubridate::as_date(date_record) - lubridate::dyears(age) ,
-
- # Case we age but no DOB and date registration
- is.na(date_birth) & !(is.na(age)) & is.na(date_record) ~
- today() - lubridate::dyears(age) ,
-
- # Case take what we have...
- TRUE ~ lubridate::as_date(date_birth) ) ,
- ## make sure it is in the correct format
- date_birth = lubridate::as_date(date_birth),
-
- day_birth = as.numeric(
- lubridate::day(date_birth)),
-
- month_birth = as.numeric(
- lubridate::month(date_birth)),
-
- year_birth= as.numeric(
- lubridate::year(date_birth)),
-
- age = round( as.numeric(today() - date_birth) / 365),
-
- age_range = dplyr::case_when(
- ## if age cohort was already present and we have no DOb, retain it
- # !(is.na(age_range )) & is.na( date_birth) &
- age <5 ~ "0-4",
-
- # !(is.na(age_range )) & is.na( date_birth) &
- age >=5 & age <= 11 ~ "4-11",
-
- #!(is.na(age_range )) & is.na( date_birth) &
- age >= 12 & age <= 17 ~ "12-17",
- # !(is.na(age_range )) & is.na( date_birth) &
- age >= 18 & age <= 59 ~ "18-59",
-
- # !(is.na(age_range )) & is.na( date_birth) &
- age >= 60 ~ "60+",
-
- TRUE ~ age_range ) )
-
- return(frame2)
-}
-
-
-## Testing...
-# frame <- data |>
-# dplyr::select(fecha_de_nacimiento,
-# date_record, edad, age_range) |>
-# clean_age( date_birth = "fecha_de_nacimiento",
-# date_record = "date_record",
-# age = "edad",
-# age_range = "age_range")
-
-
-
-#' remove_spaces_based_on_patterns
-#' Utility sub-function to remove spaces based on patterns
-#' This function helps cleaning name decomposition - in case it is not included in the original data
-#'
-#' In case, there's only 2 elements, it will fill only firstname and fathername
-#' The function also identify family prefix to be bind such as for spanish
-#' "DEL", 'DE", "DE LOS", "DE LAS"
-#'
-#' @param vector a list of string with names
-remove_spaces_based_on_patterns <- function(vector,
- nameprefix = data.frame(
- pat1 = c( "DE LA ", "DEL ", "DE LOS ", "DE LAS ","DE ", "SAN ", "LA ", "DA "),
- pat2 = c( "DE_LA_", "DEL_", "DE_LOS_", "DE_LAS_", "DE_", "SAN_", "LA_", "DA_") )
-) {
- for (i in 1:nrow(nameprefix) ) {
- # i <- 4
- #cat(paste0(nameprefix[i, c("pat1")],"\n", vector, "\n"))
- vector <- stringr::str_replace_all(string = vector,
- pattern = nameprefix[i, c("pat1")],
- replacement = nameprefix[i, c("pat2")])
- # cat(paste0(vector, "\n"))
- }
- return(vector)
-}
-## Test function
-remove_spaces_based_on_patterns(vector = "ADRIENNE DE LOS ANGELES MILANESE PARISIANNA")
-
-
-#' reset_spaces_based_on_patterns
-#' Utility sub-function to remove spaces based on patterns
-#' This function helps cleaning name decomposition - in case it is not included in the original data
-#'
-#' In case, there's only 2 elements, it will fill only firstname and fathername
-#' The function also identify family prefix to be bind such as for spanish
-#' "DEL", 'DE", "DE LOS", "DE LAS"
-#'
-#' @param vector a list of string with names
-reset_spaces_based_on_patterns <- function(vector,
- nameprefix = data.frame(
- pat1 = c( "DE LA ", "DEL ", "DE LOS ", "DE LAS ","DE ", "SAN ", "LA ", "DA "),
- pat2 = c( "DE_LA_", "DEL_", "DE_LOS_", "DE_LAS_", "DE_", "SAN_", "LA_", "DA_") )
- ) {
-
-
- for (i in 1:nrow(nameprefix) ) {
- # i <- 1
- # cat(paste0( nameprefix[i, c("pat1")],"\n",vector, "\n"))
- vector <- stringr::str_replace_all(string = vector,
- pattern = nameprefix[i, c("pat2")],
- replacement = nameprefix[i, c("pat1")])
- #cat(paste0(vector, "\n"))
- }
- return(vector)
-}
-
-
-
-
-#' separate_fullname
-#'
-#' This function clean name decomposition - in case it is not included in the original data
-#'
-#' Performing this name decomposition is important in order to enhance record linkage
-#' as the name pattern can be different (either "firstname_fathername_mother_name" or
-#' "fathername_mothername_firstname") which will minimise linkage probabibility
-#'
-#' In case, there's only 2 elements, it will fill only firstname and fathername
-#' The function also identify family prefix to be bind such as for spanish
-#' "DEL", 'DE", "DE LOS", "DE LAS"
-#'
-#' @param fullname full name including everything together
-#' @param firstname first name
-#' @param fathername father name
-#' @param mothername mother name
-#' @param namepattern either "firstname_fathername_mother_name" or
-#' "fathername_mothername_firstname"
-#'
-#' @return a clean list with c("firstname","fathername","mothername")
-#'
-#' @export
-separate_fullname <- function(frame,
- fullname,
- firstname,
- fathername,
- mothername,
- namepattern
- ){
- ## Let's go!
- framesp <- frame |>
- ## Rename variable
- dplyr::rename( fullname = paste0(fullname),
- firstname = paste0(firstname),
- fathername = paste0(fathername),
- mothername = paste0(mothername),
- namepattern = paste0(namepattern) ) |>
-
- ### Lets clean all spaces and get everything to upper
- dplyr::mutate( fullname_or = fullname,
- fullname = trimws(stringr::str_squish(fullname),
- which = "both",
- whitespace = "[ \t\r\n]"),
- fullname = toupper(fullname)) |>
- ### Lets apply the prefix space replacement...
- dplyr::mutate( fullname_pref = remove_spaces_based_on_patterns(vector = fullname),
- fullname = fullname_pref ) |>
- ## Counting th enumber of space to understand the structure of the full name
- dplyr::mutate( numspace = stringr::str_count(fullname, ' ')) |>
- tidyr::separate_wider_delim(fullname,
- delim = " ",
- names_sep = "",
- too_few = "align_start") |>
-
- ### Now reconstruct the first name, father and mother name based on cases
-
- ## Let summarize the logic...!
- ## Based on the number of componnent in the full name - ranging from 0 to 4
-
- ## Case A: "firstname_fathername_mother_name"
- # numspace == 0 ## only firstname = fullname1
- # numspace = 1 ## firstname = fullname1 & fathername = fullname2
- # numspace = 2 ## firstname = fullname1 & fathername = fullname2 & mothername = fullname3
- # numspace = 3 ## firstname = paste0(fullname1, " ", fullname2) & fathername = fullname3 & mothername = fullname4
- # numspace = 4 ## firstname = paste0(fullname1, " ", fullname2, " ", fullname3) & fathername = fullname4 & mothername = fullname5
-
- # ## Case B: "fathername_mothername_firstname"
- # numspace == 0 ## only fathername = fullname1
- # numspace = 1 ## fathername = fullname1 & firstname = fullname2
- # numspace = 2 ## fathername = fullname1 & mothername = fullname2 & firstname = fullname3
- # numspace = 3 ## fathername = fullname1 & mothername = fullname2 & firstname = paste0(fullname3, " ", fullname3)
- # numspace = 4 ## fathername = fullname1 & mothername = fullname2 & firstname = paste0(fullname3, " ", fullname4, " ", fullname5)
-
- ### Now reconstruct fathername
- dplyr::mutate( fathername = dplyr::case_when(
- is.na(fathername) & namepattern == "firstname_fathername_mother_name" &
- numspace == 0 ~ "",
-
- is.na(fathername) & namepattern == "firstname_fathername_mother_name" &
- numspace == 1 ~ fullname2,
-
- is.na(fathername) & namepattern == "firstname_fathername_mother_name" &
- numspace == 2 ~ fullname2,
-
- is.na(fathername) & namepattern == "firstname_fathername_mother_name" &
- numspace == 3 ~ fullname3,
-
- is.na(fathername) & namepattern == "firstname_fathername_mother_name" &
- numspace == 4 ~ fullname4,
-
-
- is.na(fathername) & namepattern == "fathername_mothername_firstname" &
- numspace == 0 ~ fullname1,
-
- is.na(fathername) & namepattern == "fathername_mothername_firstname" &
- numspace == 1 ~ fullname1,
-
- is.na(fathername) & namepattern == "fathername_mothername_firstname" &
- numspace == 2 ~ fullname1,
-
- is.na(fathername) & namepattern == "fathername_mothername_firstname" &
- numspace == 3 ~ fullname1,
-
- is.na(fathername) & namepattern == "fathername_mothername_firstname" &
- numspace == 4 ~ fullname1,
-
- TRUE ~ fathername )) |>
-
- ### Now reconstruct mothername
- dplyr::mutate( mothername = dplyr::case_when(
- is.na(mothername) & namepattern == "firstname_fathername_mother_name" &
- numspace == 0 ~ "",
-
- is.na(mothername) & namepattern == "firstname_fathername_mother_name" &
- numspace == 1 ~ "",
-
- is.na(mothername) & namepattern == "firstname_fathername_mother_name" &
- numspace == 2 ~ fullname3,
-
- is.na(mothername) & namepattern == "firstname_fathername_mother_name" &
- numspace == 3 ~ fullname4,
-
- is.na(mothername) & namepattern == "firstname_fathername_mother_name" &
- numspace == 4 ~ fullname5,
-
-
- is.na(mothername) & namepattern == "fathername_mothername_firstname" &
- numspace == 0 ~ "",
-
- is.na(mothername) & namepattern == "fathername_mothername_firstname" &
- numspace == 1 ~ "",
-
- is.na(mothername) & namepattern == "fathername_mothername_firstname" &
- numspace == 2 ~ fullname2,
-
- is.na(mothername) & namepattern == "fathername_mothername_firstname" &
- numspace == 3 ~ fullname2,
-
- is.na(mothername) & namepattern == "fathername_mothername_firstname" &
- numspace == 4 ~ fullname2,
-
- TRUE ~ mothername )) |>
-
-
- ### Now reconstruct firstname
- dplyr::mutate( firstname = dplyr::case_when(
- is.na(firstname) & namepattern == "firstname_fathername_mother_name" &
- numspace == 0 ~ fullname1,
-
- is.na(firstname) & namepattern == "firstname_fathername_mother_name" &
- numspace == 1 ~ fullname1,
-
- is.na(firstname) & namepattern == "firstname_fathername_mother_name" &
- numspace == 2 ~ fullname1,
-
- is.na(firstname) & namepattern == "firstname_fathername_mother_name" &
- numspace == 3 ~ paste0(fullname1, " ", fullname2 ),
-
- is.na(firstname) & namepattern == "firstname_fathername_mother_name" &
- numspace == 4 ~ paste0(fullname1, " ", fullname2, " ", fullname3),
-
- is.na(firstname) & namepattern == "fathername_mothername_firstname" &
- numspace == 0 ~ "",
-
- is.na(firstname) & namepattern == "fathername_mothername_firstname" &
- numspace == 1 ~ fullname2,
-
- is.na(firstname) & namepattern == "fathername_mothername_firstname" &
- numspace == 2 ~ fullname3,
-
- is.na(firstname) & namepattern == "fathername_mothername_firstname" &
- numspace == 3 ~ paste0(fullname3, " ", fullname4 ) ,
-
- is.na(firstname) & namepattern == "fathername_mothername_firstname" &
- numspace == 4 ~ paste0(fullname3, " ", fullname4, " ", fullname5) ,
-
- TRUE ~ firstname )) |>
-
-
- ### Lets reset the prefix space replacement...
- dplyr::mutate(
- firstname = reset_spaces_based_on_patterns(vector = firstname),
- fathername = reset_spaces_based_on_patterns(vector = fathername),
- mothername = reset_spaces_based_on_patterns(vector = mothername)) |>
-
- ## then clean intermediate variables
- dplyr::select( - fullname1, - fullname2, - fullname3, - fullname4,
- - fullname5, fullname_or, - fullname_pref, - numspace )
-
-
- return(framesp)
-}
-
-## Testing...
-# frame <- data |>
-# dplyr::filter( is.na(nombres) ) |>
-# # dplyr::select(nombre_completo, name_pattern,
-# # nombres, apellido_paterno, apellido_materno) |>
-# separate_fullname(fullname= "nombre_completo",
-# namepattern= "name_pattern",
-# firstname = "nombres",
-# fathername = "apellido_paterno",
-# mothername = "apellido_materno")
-
-
-
-
-
-#' separate_firstname
-#'
-#' This function decomposition - in case there is a space in
-#'
-#' Performing this name decomposition is important in order to enhance record linkage
-#'
-#' @param firstname first name
-#' @return a clean list with c("firstname1","firstname2","firstname3")
-#'
-#' @export
-separate_firstname <- function(frame,
- firstname ) {
- ## Let's go!
- framesp <- frame |>
- ## Rename variable
- dplyr::rename( firstname = paste0(firstname) ) |>
-
- ### Lets clean all spaces and get everything to upper
- dplyr::mutate( firstname_or = firstname,
- firstname = trimws(stringr::str_squish(firstname),
- which = "both",
- whitespace = "[ \t\r\n]"),
- firstname = toupper(firstname)) |>
- dplyr::mutate( firstname = remove_spaces_based_on_patterns(vector = firstname) ) |>
-
- ## Counting the number of space to understand the structure of the full name
- tidyr::separate_wider_delim(firstname,
- delim = " ",
- names_sep = "",
- too_few = "align_start") |>
- dplyr::mutate( firstname1 = reset_spaces_based_on_patterns(vector = firstname1),
- firstname2 = reset_spaces_based_on_patterns(vector = firstname2),
- firstname3 = reset_spaces_based_on_patterns(vector = firstname3))
-
- return(framesp)
-}
-
-framefirstname <- data |>
- separate_firstname(firstname = "nombres" )
-
-
-
-#' separate_familyname
-#'
-#' This function helps in decomposing names in case there is a space
-#' in the father name and the mother name is empty
-#'
-#' Performing this name decomposition is important in order to enhance record linkage
-#'
-#' @param fathernamevar father name
-#' @param mothernamevar mother name
-#' @return a clean list with c("firstname1","firstname2","firstname3")
-#'
-#' @export
-separate_familyname <- function(frame,
- fathernamevar,
- mothernamevar) {
- ## Let's go!
- framesp <- frame |>
- ## Rename variable
- dplyr::rename( fathername = paste0(fathernamevar),
- mothername = paste0(mothernamevar) ) |>
-
- ### Lets clean all spaces and get everything to upper
- dplyr::mutate( fathername_or = fathername,
- fathername= trimws(stringr::str_squish( fathername),
- which = "both",
- whitespace = "[ \t\r\n]"),
- fathername = toupper( fathername)) |>
- dplyr::mutate( fathername = remove_spaces_based_on_patterns(vector = fathername) ) |>
-
- ## Counting the number of space to understand the structure of the full name
- tidyr::separate_wider_delim( fathername,
- delim = " ",
- names_sep = "",
- too_few = "align_start") |>
- dplyr::mutate( fathername1 = reset_spaces_based_on_patterns(vector = fathername1),
- fathername2 = reset_spaces_based_on_patterns(vector = fathername2),
- fathername3 = reset_spaces_based_on_patterns(vector = fathername3)) |>
-
- dplyr::mutate( fathername = dplyr::if_else( is.na(mothername), fathername1, fathername_or ) ) |>
- dplyr::mutate( mothername = dplyr::if_else( is.na(mothername), fathername2, mothername)) |>
- dplyr::select( - fathername1, - fathername2, - fathername3, - fathername_or)
-
- return(framesp)
-}
-
-# frame <- data |>
-# dplyr::select(apellido_paterno, apellido_materno) |>
-# separate_familyname( fathernamevar = "apellido_paterno",
-# mothernamevar = "apellido_materno" )
-
-
-#' cleanvar
-#'
-#' function for data cleaning with additional name removal logic
-#'
-#' @param names_column name of the column to treat
-#' @param toRemove default vector with stuff to remove from name
-#' c(" JR", " SR", " IV", " III", " II")
-#' @return names_column_new name of the column treat
-#'
-#' @export
-cleanvar <- function(names_column,
- toRemove = c(" JR", " JUNIOR", " SR", " IV", " III", " II")) {
- # Convert to uppercase
- names_column_new <- toupper(names_column)
- # Remove specified name suffixes
- for (tR in toRemove) {
- names_column_new <- gsub(tR, "", names_column_new)
- }
- # Convert special characters to ASCII equivalents
- names_column_new <- iconv(names_column_new, "latin1", "ASCII//TRANSLIT", sub = "")
- # Remove punctuation, digits, and all spaces
- names_column_new <- gsub("[[:punct:][:digit:]][[:space:]]", "", names_column_new)
- # Create a new variable with only alphabetic characters
- names_column_new <- gsub("[^[:alpha:]]", "", names_column_new)
-
- return(names_column_new)
-}
-
-
-## Pipeline for data post processing #####################
-data.prep <- data |>
-
- ## Filter where the phone number is not available -- "NO REFIERE"
- dplyr::filter( telefono != "NO REFIERE") |>
-
- ## Clean age_range
- #data |> dplyr::select(age_range) |> dplyr::distinct() |> dplyr::pull()
- dplyr::mutate( age_range = dplyr::case_when(
- age_range == "18 A 59 AÑOS" ~ "0-4",
- TRUE ~ age_range )) |>
-
- ## Clean DOb & Age
- clean_age( date_birth = "fecha_de_nacimiento",
- date_record = "date_record",
- age = "edad",
- age_range = "age_range") |>
-
- ## Clean the names
- separate_fullname(fullname= "nombre_completo",
- namepattern= "name_pattern",
- firstname = "nombres",
- fathername = "apellido_paterno",
- mothername = "apellido_materno") |>
-
- separate_firstname(firstname = "firstname") |>
-
- separate_familyname( fathername = "fathername",
- mothername = "mothername") |>
-
-
- ## Clean the gender variable
- # data |> dplyr::select(genero) |> dplyr::distinct() |> dplyr::pull()
- dplyr::mutate(gender = dplyr::case_when(
- genero %in% c("F" ,"FEMENINO" ,"f", "Femenino") ~ "F",
- genero %in% c("M" , "MASCULINO" , "Masculino") ~ "M",
- genero %in% c("X", "Otro") ~ "Ot",
- TRUE ~ NA )) |>
-
- ## Only retain the nationality of interest
- # data |> dplyr::select(nacionalidad) |> dplyr::distinct() |> dplyr::pull()
- dplyr::mutate(nationality = dplyr::case_when(
- nacionalidad %in% c("Venzuela", "venezuela", "Venezolana",
- "VENEZUELA",
- "Venezuela", "VENEZOLANO", "VENEZOLANA") ~ "VEN",
- nacionalidad %in% c("COLOMBIANO", "COLOMBIANA", "COLOMBIA",
- "colombia", "Colombia", "Nac. Colombia",
- "Colombiana" ) ~ "COL",
- TRUE ~ "other" )) |>
- dplyr::filter( nationality %in% c("VEN", "COL" )) |>
- ## Apply cleanvar()
- # Perform data cleaning on dfA using the clean_names function
- dplyr::mutate_at( dplyr::vars(firstname1, firstname2, fathername, mothername,
- asistencia, departamento, #telefono,
- planilla, socio),
- list(new = cleanvar)) |>
-
-
- ### identify single data source
- dplyr::mutate(datasource = paste0(socio_new, "_", planilla_new)) #|>
-
- # # ## Retain only fields for record linkage
- # dplyr::select(datasource, nationality,
- # firstname_new, fathername_new, mothername_new,
- # asistencia_new, departamento_new,
- # telefono, gender)
-
-
-
-dput(names(data.prep))
-
-
-table(data.prep$gender, useNA = "ifany")
-table(data.prep$nationality, useNA = "ifany")
-
-## Check the datasource that we will compare
-table(data.prep$datasource, useNA = "ifany")
-
-## See if can use departamento for blocking
-table(data.prep$datasource, data.prep$departamento_new, useNA = "ifany")
-
-alldatasource <- data.prep |>
- dplyr::select(datasource) |>
- dplyr::distinct() |>
- dplyr::pull()
-
-alldatasource
-
-
-## Let's get 2 comparison dataset... ##########
-dfA <- data.prep |>
- dplyr::filter(datasource == alldatasource[9]) |>
- dplyr::select( - datasource)
-
-dfB <- data.prep |>
- dplyr::filter(datasource == alldatasource[6])|>
- dplyr::select( - datasource)
-
-matches.out <- fastLink::fastLink(
- dfA = dfA,
- dfB = dfB,
- # Specify the vector of variable names to be used for matching.
- # These variable names should exist in both dfA and dfB
- varnames = c("nationality" ,
- "firstname1_new" ,"firstname2_new" ,
- "fathername_new" ,"mothername_new",
- "day_birth", "month_birth", "year_birth",
- # "asistencia_new" , "departamento_new", "telefono_new" ,
- "gender" ),
-
- # Specify which variables among varnames should be compared using string distance
- stringdist.match = c( "firstname1_new" ,"firstname2_new" ,
- "fathername_new" ,"mothername_new"),
-
- # Specify which variables present in stringdist.match can be partially matched
- partial.match = c( "firstname1_new" ,"firstname2_new" ,
- "fathername_new" ,"mothername_new"),
-
- # Specify which variables should be matched numerically
- # Must be a subset of 'varnames' and must not be present in 'stringdist.match'.
- numeric.match = c( "day_birth", "month_birth", "year_birth"
- # "telefono_new"
- #'dob_day', 'dob_month', 'dob_year'
- ),
-
- # Specify the number of CPU cores to utilize (parallel processing).
- ## Get the number of detected cores minus 1, Reserve one core for
- #non-computational tasks to help prevent system slowdowns or unresponsiveness
- n.cores = parallel::detectCores() - 1,
- return.all = TRUE,
- return.df = TRUE)
-
-## Review the matching in each orginal frame
-matchedA <- dfA[matches.out$matches$inds.a, ]
-matchedB <- dfB[matches.out$matches$inds.b, ]
-
-# Confusion Matrice
-fastLink::confusion(matches.out, threshold = 0.98)
-
-# Examine the EM object:
-#matches.out$EM
-
-# Summarize the accuracy of the match:
-# each column gives the match count, match rate,
-# false discovery rate (FDR) and false negative rate (FNR)
-# under different cutoffs for matches based on the posterior
-# probability of a match.
-summary(matches.out)
-
-## Gt the output...
-matched_dfs <- fastLink::getMatches(
- dfA = dfA,
- dfB = dfB,
- fl.out = matches.out,
- threshold.match = 0.85
-)
-
-# Display the matches ###################
-# convert dfA rownames to a column
-dfA_clean <- dfA |> rownames_to_column()
-
-# convert dfB rownames to a column
-dfB_clean <- dfB |> rownames_to_column()
-
-# convert all columns in matches dataset to character,
-#so they can be joined to the rownames
-matches_clean <- matched_dfs |>
- dplyr::mutate(dplyr::across(dplyr::everything(), as.character))
-
-# Join matches to dfA, then add dfB
-# column "inds.b" is added to dfA
-complete <- dplyr::left_join(dfA_clean,
- matches_clean,
- by = c("rowname" = "inds.a"))
-
-# column(s) from dfB are added
-complete <- dplyr::left_join(complete,
- dfB_clean,
- by = c("inds.b" = "rowname"))
-
-
-
-# Preprocessing Matches via Blocking #################
-blockgender_out <- fastLink::blockData(dfA, dfB, varnames = "gender")
-
-## Subset dfA into blocks
-dfA_block1 <- dfA[blockgender_out$block.1$dfA.inds,]
-dfA_block2 <- dfA[blockgender_out$block.2$dfA.inds,]
-
-## Subset dfB into blocks
-dfB_block1 <- dfB[blockgender_out$block.1$dfB.inds,]
-dfB_block2 <- dfB[blockgender_out$block.2$dfB.inds,]
-
-## Run fastLink on each
-link.1 <- fastLink::fastLink(
- dfA = dfA_block1,
- dfB = dfB_block1,
- # Specify the vector of variable names to be used for matching.
- # These variable names should exist in both dfA and dfB
- varnames = c("nationality" ,
- "firstname1_new" ,"firstname2_new" ,
- "fathername_new" ,"mothername_new",
- "day_birth", "month_birth", "year_birth",
- # "asistencia_new" , "departamento_new", "telefono_new" ,
- "gender" ),
-
- # Specify which variables among varnames should be compared using string distance
- stringdist.match = c( "firstname1_new" ,"firstname2_new" ,
- "fathername_new" ,"mothername_new"),
-
- # Specify which variables present in stringdist.match can be partially matched
- partial.match = c( "firstname1_new" ,"firstname2_new" ,
- "fathername_new" ,"mothername_new"),
-
- # Specify which variables should be matched numerically
- # Must be a subset of 'varnames' and must not be present in 'stringdist.match'.
- numeric.match = c( "day_birth", "month_birth", "year_birth"
- # "telefono_new"
- #'dob_day', 'dob_month', 'dob_year'
- ),
-
- # Specify the number of CPU cores to utilize (parallel processing).
- ## Get the number of detected cores minus 1, Reserve one core for
- #non-computational tasks to help prevent system slowdowns or unresponsiveness
- n.cores = parallel::detectCores() - 1,
- return.df = TRUE)
-
-link.2 <- fastLink::fastLink(
- dfA = dfA_block2,
- dfB = dfB_block2,
- # Specify the vector of variable names to be used for matching.
- # These variable names should exist in both dfA and dfB
- varnames = c("nationality" ,
- "firstname1_new" ,"firstname2_new" ,
- "fathername_new" ,"mothername_new",
- "day_birth", "month_birth", "year_birth",
- # "asistencia_new" , "departamento_new", "telefono_new" ,
- "gender" ),
-
- # Specify which variables among varnames should be compared using string distance
- stringdist.match = c( "firstname1_new" ,"firstname2_new" ,
- "fathername_new" ,"mothername_new"),
-
- # Specify which variables present in stringdist.match can be partially matched
- partial.match = c( "firstname1_new" ,"firstname2_new" ,
- "fathername_new" ,"mothername_new"),
-
- # Specify which variables should be matched numerically
- # Must be a subset of 'varnames' and must not be present in 'stringdist.match'.
- numeric.match = c( "day_birth", "month_birth", "year_birth"
- # "telefono_new"
- #'dob_day', 'dob_month', 'dob_year'
- ),
-
- # Specify the number of CPU cores to utilize (parallel processing).
- ## Get the number of detected cores minus 1, Reserve one core for
- #non-computational tasks to help prevent system slowdowns or unresponsiveness
- n.cores = parallel::detectCores() - 1,
- return.df = TRUE)
-
-## aggregate multiple matches into a single summary with aggregateEM()
-agg.out <- fastLink::aggregateEM(em.list = list(link.1, link.2))
-
-
-
-# Preprocessing Matches via Blocking #################
diff --git a/deduplicate.Rmd b/deduplicate.Rmd
index 95c1ee0..9699b05 100644
--- a/deduplicate.Rmd
+++ b/deduplicate.Rmd
@@ -648,11 +648,11 @@ missing_gender <- data.prep |>
## let's us an API to identify gender based on firstname
# devtools::install_github("coccopuffs/GenderGuesser")
#missing_gender_result <- GenderGuesser::guessGender(missing_gender)
-# write.csv(missing_gender_result, "missing_gender_result.csv", row.names = FALSE)
+# write.csv(missing_gender_result, here::here("data-raw","missing_gender_result.csv"), row.names = FALSE)
# Results are saved locally to avoid calling too many times the API
-missing_gender_result <- readr::read_csv("missing_gender_result.csv")
+missing_gender_result <- readr::read_csv(here::here("data-raw","missing_gender_result.csv"))
missing_gender_results <- missing_gender_result |>
dplyr::filter(!(is.na(gender))) |>
@@ -829,8 +829,8 @@ dup <- dplyr::rows_append(dup, matchedA )
dup1 <- dup |>
dplyr::distinct()
-write.csv(dup1, "dup.csv", row.names = FALSE )
-write.csv(data.prep, "data.prep.csv", row.names = FALSE )
+write.csv(dup1, here::here("data-raw","dup.csv"), row.names = FALSE )
+write.csv(data.prep, here::here("data-raw","data.prep.csv"), row.names = FALSE )
# Examine the EM object:
#matches.out$EM
diff --git a/fastlink.R b/fastlink.R
deleted file mode 100644
index 75c5530..0000000
--- a/fastlink.R
+++ /dev/null
@@ -1,260 +0,0 @@
-# Clear the workspace by removing all objects
-rm(list = ls())
-
-# Load the pacman package if it is not already installed.
-# if (!requireNamespace("pacman", quietly = TRUE)) {
-# install.packages("pacman")
-# }
-#
-# pacman::p_load(
-# dplyr, # Data manipulation and transformation
-# data.table, # Fast data manipulation with data tables
-# stringi, # Character string processing
-# lubridate, # Date and time handling
-# janitor, # Data cleaning and tabulation functions
-# parallel, # Parallel computing
-# fastLink, # Record linkage and deduplication
-# stringdist # String distance computation
-# )
-
-# Get the number of detected cores minus 1
-# Reserve one core for non-computational tasks to help prevent system slowdowns or unresponsiveness
-numCores <- parallel::detectCores() - 1
-
-# Example data frames dfA and dfB
-# Replace these with your actual data frames
-
-
-#data(samplematch)
-
-dfA <- data.frame(
- FIRST_NAME = c("John", "Mary", "Robert", "Michael", "Jennifer", "David", "Karen", "Maria", "Carlos", "James"),
- LAST_NAME = c("Smith III", "Johnson!", "Williams123", "Brown", "Jones", "Davis", "Miller", "Garcia", "Martinez", "Andrson"),
- BIRTH_DATE = c("1981-05-20", "1990-05-15", "1978-12-10", "1985-08-02", "1993-11-25", "1977-03-30", "1988-06-18", "1991-02-05", "1980-09-12", "1982-07-09"),
- gender = c("M", "F", "M", "M", "F", "M", "F", "F", "M", "M")
-)
-
-dfB <- data.frame(
- RecipientNameFirst = c("John", "Mary", "Robert", "Michael", "Jennifer", "David", "Karenn", "Carloas", "Mariaa", "James"),
- RecipientNameLast = c("Smith iv", "Brown-", "Williams", "Jones", "John son", "No Name", "Miller", "Martinez", "Garcia", "Anderson"),
- RecipientDateOfBirth = c("1981-05-21", "1992-09-25", "1978-10-12", "1985-08-02", "1993-11-25", "1977-03-30", "1988-06-18", "1980-09-12", "1991-02-05", "1982-07-09"),
- gender = c("M", "F", "M", "M", "F", "M", "F", "M", "F", "M")
-)
-
-# # Example data frames dfA and dfB
-# dfA <- fread(file = 'dfA.csv',
-# sep = ",",
-# header = T,
-# nThread = numCores)
-#
-# dfB <- fread(file = 'dfB.csv',
-# sep = ",",
-# header = T,
-# nThread = numCores)
-
-# Assign a unique key ID to each row
-dfA <- dfA |>
- dplyr::mutate(row_idA = paste("dfA_", dplyr::row_number(), sep = ""))
-
-dfB <- dfB |>
- dplyr::mutate(row_idB = paste("dfB_", dplyr::row_number(), sep = ""))
-
-# Convert the date column to a valid date format.
-dfA$BIRTH_DATE <- as.Date(dfA$BIRTH_DATE, format = "%Y-%m-%d") # "%m/%d/%Y"
-dfB$RecipientDateOfBirth <- as.Date(dfB$RecipientDateOfBirth, format = "%Y-%m-%d") # "%m/%d/%Y"
-#
-# # Create new date variables by splitting date of birth into three different parts.
-dfA <- dfA |>
- dplyr::mutate(dob_day = as.numeric(lubridate::day(BIRTH_DATE)),
- dob_month = as.numeric(lubridate::month(BIRTH_DATE)),
- dob_year = as.numeric(lubridate::year(BIRTH_DATE)),
- DOB=BIRTH_DATE)
-
-dfB <- dfB |>
- dplyr::mutate(dob_day = as.numeric(lubridate::day(RecipientDateOfBirth)),
- dob_month = as.numeric(lubridate::month(RecipientDateOfBirth)),
- dob_year = as.numeric(lubridate::year(RecipientDateOfBirth)),
- DOB=RecipientDateOfBirth)
-
-
-# Define a function for data cleaning with additional name removal logic
-clean_names <- function(names_column) {
- # Step 0: Convert to uppercase
- names_column_new <- toupper(names_column)
- # Step 1: Remove specified name suffixes
- toRemove <- c(" JR", " SR", " IV", " III", " II")
- for (tR in toRemove) {
- names_column_new <- gsub(tR, "", names_column_new)
- }
- # Step 2: Convert special characters to ASCII equivalents
- names_column_new <- iconv(names_column_new, "latin1", "ASCII//TRANSLIT", sub = "")
- # Step 3: Remove punctuation, digits, and all sapces
- names_column_new <- gsub("[[:punct:][:digit:]][[:space:]]", "", names_column_new)
- # Step 4: Create a new variable with only alphabetic characters
- names_column_new <- gsub("[^[:alpha:]]", "", names_column_new)
- return(names_column_new)
-}
-
-# Perform data cleaning on dfA using the clean_names function
-dfA <- dfA |>
- dplyr::mutate_at(dplyr::vars(FIRST_NAME, LAST_NAME), list(new = clean_names)) |>
- dplyr::mutate(FN = FIRST_NAME_new, LN = LAST_NAME_new)
-
-# Perform data cleaning on dfB using the clean_names function
-dfB <- dfB |>
- dplyr::mutate_at(dplyr::vars(RecipientNameFirst, RecipientNameLast), list(new = clean_names)) |>
- dplyr::mutate(FN = RecipientNameFirst_new, LN = RecipientNameLast_new)
-
-
-rm(clean_names)
-
-# # Create no name list
-NoNameList <- c(
- "NICKNAME",
- "NOFAMILYNAME",
- "NOFIRSTNAME",
- "NOLASTNAME",
- "NOMIDDLENAME",
- "NONAME",
- "NO",
- "UNKNOWN",
- "UNK",
- "UN",
- "NA"
-)
-#
-# # Blank out the names in the data if they match any of the strings in the NoNameList
-dfA <- dfA |>
- dplyr::mutate(FN = dplyr::case_when(
- FN %in% NoNameList~ "",
- TRUE ~ FN),
- LN = dplyr::case_when(
- LN %in% NoNameList ~ "",
- TRUE ~ LN))
-
-dfB <- dfB |>
- dplyr::mutate(FN = dplyr::case_when(
- FN %in% NoNameList~ "",
- TRUE ~ FN),
- LN = dplyr::case_when(
- LN %in% NoNameList ~ "",
- TRUE ~ LN))
-rm(NoNameList)
-
-# Delete rows that have missing First Name (FN), Last Name (LN), or Date of Birth (DOB).
-dfA <- dfA |>
- dplyr::filter(!is.na(FN) & FN != "" & !is.na( LN) & LN != "" & !is.na(DOB) )
-
-dfB <- dfB |>
- dplyr::filter(!is.na(FN) & FN != "" & !is.na( LN) & LN != "" & !is.na(DOB) )
-
-
-# Exact Matching
-Exact <- merge(dfA, dfB, by=c("FN","LN","DOB", "gender"))
-
-
-# Using the fastLink R package for record linkage
-matches.out <- fastLink::fastLink(
- dfA = dfA, dfB = dfB,
- # Specify the vector of variable names to be used for matching.
- # These variable names should exist in both dfA and dfB
- varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year', 'gender'),
- # Specify which variables among varnames should be compared using string distance
- stringdist.match = c('FN', 'LN'),
- # Specify which variables present in stringdist.match can be partially matched
- partial.match = c('FN', 'LN'),
- # Specify which variables should be matched numerically
- # Must be a subset of 'varnames' and must not be present in 'stringdist.match'.
- numeric.match = c('dob_day', 'dob_month', 'dob_year'),
- # Specify the number of CPU cores to utilize (parallel processing). The default value is NULL.
- n.cores = numCores,
- return.all = TRUE,
- return.df = TRUE
-)
-
-# The output from fastLink() will be a list that includes a matrix where each row
-# is a match with the relevant indices of dfA (column 1) and dfB (column 2).
-
-# Confusion Matrice
-fastLink::confusion(matches.out, threshold = 0.98)
-
-# Examine the EM object:
-matches.out$EM
-
-# Summarize the accuracy of the match:
-# each column gives the match count, match rate,
-# false discovery rate (FDR) and false negative rate (FNR)
-# under different cutoffs for matches based on the posterior
-# probability of a match.
-summary(matches.out)
-
-
-# Get fuzzy matches using the results from fastLink
-# A threshold of 0.98 is set for match classification
-fuzzy_matches <- fastLink::getMatches(dfA, dfB,
- fl.out = matches.out,
- threshold.match = 0.98)
-fuzzy_matches #|> knitr::kable()
-
-blockdata_out <- fastLink::blockData(dfA, dfB,
- varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'),
- # window blocking for numeric variables, where a given observation in dataset A
- #will be compared to all observations in dataset B where the value of the blocking
- #variable is within ±K of the value of the same variable in dataset A.
- #The value of K is the size of the window
- # window.block = "birthyear",
- # window.size = 1,
- # using k-means clustering, so that similar values of string and numeric
- # variables are blocked together.
- kmeans.block = "FN",
- nclusters = 2)
-
-
-## Aggregating Multiple Matches Together ###
-
-#Often, we run several different matches for a single data set -
-#for instance, when blocking by gender or by some other criterion to reduce the
-#number of pairwise comparisons. Here, we walk through how to aggregate those
-#multiple matches into a single summary.
-blockgender_out <- fastLink::blockData(dfA, dfB, varnames = "gender")
-## Subset dfA into blocks
-dfA_block1 <- dfA[blockgender_out$block.1$dfA.inds,]
-dfA_block2 <- dfA[blockgender_out$block.2$dfA.inds,]
-
-## Subset dfB into blocks
-dfB_block1 <- dfB[blockgender_out$block.1$dfB.inds,]
-dfB_block2 <- dfB[blockgender_out$block.2$dfB.inds,]
-
-## Run fastLink on each
-fl_out_block1 <- fastLink::fastLink(
- dfA_block1, dfB_block1,
- varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'),
- n.cores = numCores
-)
-fl_out_block2 <- fastLink::fastLink(
- dfA_block2, dfB_block2,
- varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'),
- n.cores = numCores
-)
-
-#Here, we run fastLink() on the subsets of data defined by blocking on gender in the previous section:
-
-## Run fastLink on each
-link.1 <- fastLink::fastLink(
- dfA_block1, dfB_block1,
- varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'),
- n.cores = numCores
- )
-
-link.2 <- fastLink::fastLink(
- dfA_block2, dfB_block2,
- varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'),
- n.cores = numCores
-)
-
-#To aggregate the two matches into a single summary, we use the aggregateEM() function as follows:
-agg.out <- fastLink::aggregateEM(em.list = list(link.1, link.2))
-
-summary(agg.out )
-
-
diff --git a/img/Record-linkage-example.png b/img/Record-linkage-example.png
new file mode 100644
index 0000000..db03d3e
Binary files /dev/null and b/img/Record-linkage-example.png differ
diff --git a/concept.png b/img/concept.png
similarity index 100%
rename from concept.png
rename to img/concept.png
diff --git a/evaluation.png b/img/evaluation.png
similarity index 100%
rename from evaluation.png
rename to img/evaluation.png
diff --git a/pipe.png b/img/pipe.png
similarity index 100%
rename from pipe.png
rename to img/pipe.png
diff --git a/img/prob.png b/img/prob.png
new file mode 100644
index 0000000..69c3afe
Binary files /dev/null and b/img/prob.png differ
diff --git a/record-linkage.png b/img/record-linkage.png
similarity index 100%
rename from record-linkage.png
rename to img/record-linkage.png
diff --git a/img/record-linkage.webp b/img/record-linkage.webp
new file mode 100644
index 0000000..b120aa3
Binary files /dev/null and b/img/record-linkage.webp differ
diff --git a/index.Rmd b/index.Rmd
index ba2842d..dfbc542 100644
--- a/index.Rmd
+++ b/index.Rmd
@@ -71,7 +71,7 @@ Can be numerous ones: ...
]
.pull-right[
-![](record-linkage.png)
+![](img/record-linkage.png)
]
@@ -94,7 +94,7 @@ The recommended workflow includes to go through specific steps:
]
.pull-right[
-![](concept.png)
+![](img/concept.png)
]
???
@@ -362,7 +362,7 @@ Revise the[demo script](FastLink.html) and [applied real use case notebook](htt
]
.pull-right[
-![](pipe.png)
+![](img/pipe.png)
]
diff --git a/index.html b/index.html
index 767431e..7cde382 100644
--- a/index.html
+++ b/index.html
@@ -1339,6 +1339,10 @@
--unhcr-logoblue: url("");
--unhcr-logowhite: url("");
}
+:root {
+--unhcr-logoblue: url("");
+--unhcr-logowhite: url("");
+}