diff --git a/deduplicate.R b/deduplicate.R index 754bc69..c9eb507 100644 --- a/deduplicate.R +++ b/deduplicate.R @@ -15,32 +15,175 @@ data <- readxl::read_excel(here::here("data-raw", "Registros2.xlsx"), ## Cleaning functions ############### -#' separate_name + +#' clean_age +#' +#' This functions cleans the age identifiers in the context of record linkage. +#' +#' If the date of birth is present, it will parse it, extract year and month, +#' and recalculate both age and age range +#' +#' If the date of birth is not mentioned, then it will use age and date_record +#' to reconstruct an estimated date of birth - and recalculate the rest +#' +#' @param frame frame with the data +#' @param date_birth variable name for the date of birth in the frame +#' @param date_record variable name for the date of birth in the frame +#' @param year_birth variable name for the year of birth in the frame +#' @param month_birth variable name for the month of birth in the frame +#' @param age variable name for the date of birth in the frame +#' @param age_range variable name for the date of birth in the frame +#' +#' @return the same column but cleaned... +#' +#' @export +clean_age <- function(frame, + date_birth, + date_record, + year_birth, + month_birth, + age, + age_range){ + + frame2 <- frame |> + dplyr::mutate( + DateOfBirth = lubridate::as_date(DoB), + + dob_day = as.numeric( + lubridate::day(DateOfBirth)), + + dob_month = as.numeric( + lubridate::month(DateOfBirth)), + + dob_year = as.numeric( + lubridate::year(DateOfBirth)), + + age = today(), + + age_cohort = dplyr::case_when( + ~ "0-4", + ~ "4-11", + ~ "12-17", + ~ "18-59", + ~ "60+", + TRUE ~ NA ) ) + + return(frame2) +} + + + +#' separate_fullname +#' +#' This function clean name decomposition - in case it is not included in the original data +#' +#' Performing this name decomposition is important in order to enhance record linkage +#' as the name pattern can be different (either "firstname_fathername_mother_name" or +#' "fathername_mothername_firstname") which will minimise linkage probabibility +#' +#' In case, there's only 2 elements, it will fill only firstname and fathername +#' The function also identify family prefix to be bind such as for spanish +#' "DEL", 'DE", "DE LOS", "DE LAS" #' -#' use the name pattern to separate the name -#' in case family name is more than one word, -#' identify family prefix to bind with such as "DEL", 'DE", "DE LOS", "DE LAS" #' @param fullname full name including everything together -#' @param namepattern either "firstname_fathername_mother_name" or "fathername_mothername_firstname -#' @return a list with c("firstname","fathername","mothername") +#' @param firstname full name including everything together +#' @param fathername full name including everything together +#' @param mothername full name including everything together +#' @param namepattern either "firstname_fathername_mother_name" or +#' "fathername_mothername_firstname" +#' @param nameprefix default vector with stuff to remove from name +#' c("DE ", "DE LA ", "DEL ", "DE LOS ", "DE LAS") +#' @return a clean list with c("firstname","fathername","mothername") #' #' @export -separate_fullname <- function(fullname, namepattern){ +separate_fullname <- function(frame, + fullname, + firstname, + fathername, + mothername, + namepattern, + nameprefix = data.frame( + pat1 = c( "DE LA ", "DEL ", "DE LOS ", "DE LAS ","DE "), + pat2 = c( "DE_LA_", "DEL_", "DE_LOS_", "DE_LAS_", "DE_") ) + ){ - # ### - # sp <- tidyr::separate(fullname, " ") + + # Utility sub-function to remove spaces based on patterns + remove_spaces_based_on_patterns <- function(vector, nameprefix) { + for (i in 1:nrow(nameprefix) ) { + # i <- 4 + #cat(paste0(nameprefix[i, c("pat1")],"\n", vector, "\n")) + vector <- stringr::str_replace_all(string = vector, + pattern = nameprefix[i, c("pat1")], + replacement = nameprefix[i, c("pat2")]) + # cat(paste0(vector, "\n")) + } + return(vector) + } + ## Test function + #remove_spaces_based_on_patterns(vector = "ADRIENNE DE LOS ANGELES MILANESE PARISIANNA", nameprefix) + + # Utility sub-function to remove spaces based on patterns + reset_spaces_based_on_patterns <- function(vector, nameprefix) { + for (i in 1:nrow(nameprefix) ) { + # i <- 1 + # cat(paste0( nameprefix[i, c("pat1")],"\n",vector, "\n")) + vector <- stringr::str_replace_all(string = vector, + pattern = nameprefix[i, c("pat2")], + replacement = nameprefix[i, c("pat1")]) + #cat(paste0(vector, "\n")) + } + return(vector) + } + + ## Let's go! + framesp <- frame |> + ### Lets get everything to uper + dplyr::mutate( fullname_or = fullname, + fullname = trimws(stringr::str_squish(fullname), + which = "both", + whitespace = "[ \t\r\n]"), + fullname = toupper(fullname)) |> + ### Lets apply the prefix space replacement... + dplyr::mutate( fullname_pref = remove_spaces_based_on_patterns(vector = fullname, + nameprefix), + fullname = fullname_pref ) |> + + # "firstname_fathername_mother_name" or + #' "fathername_mothername_firstname" + tidyr::separate_wider_delim(fullname, + delim = " ", + names_sep = "", + too_few = "align_start") |> + + ### Now reconstruct the first name, father and mother name based on cases + + + + tidyr::separate( fullname, + #into=c("FirstName", "FatherName", "MotherName"), + sep=" ") |> + ### Lets reset the prefix space replacement... + dplyr::mutate( FatherName = reset_spaces_based_on_patterns(vector = FatherName, + nameprefix), + MotherName = reset_spaces_based_on_patterns(vector = MotherName, + nameprefix)) + # - # return(sp) + return(framesp) } - -test <- data |> +## Testing... +frame <- data |> dplyr::filter( is.na(nombres) ) |> - dplyr::select(nombre_completo, name_pattern) + dplyr::select(nombre_completo, name_pattern) |> + dplyr::rename( fullname= "nombre_completo", + namepattern = "name_pattern") -fullname <- test$nombre_completo -namepattern <- test$name_pattern -sep <- separate_fullname(fullname, namepattern) + +sep <- separate_fullname(frame = frame, + fullname = "fullname", + namepattern= "namepattern") #' cleanvar #' @@ -53,7 +196,7 @@ sep <- separate_fullname(fullname, namepattern) #' #' @export cleanvar <- function(names_column, - toRemove = c(" JR", " SR", " IV", " III", " II")) { + toRemove = c(" JR", " JUNIOR", " SR", " IV", " III", " II")) { # Convert to uppercase names_column_new <- toupper(names_column) # Remove specified name suffixes @@ -71,13 +214,14 @@ cleanvar <- function(names_column, } -## FGet the data post processing ##################### +## Pipeline for data post processing ##################### data.prep <- data |> ## Filter where the phone number is not available -- "NO REFIERE" dplyr::filter( telefono != "NO REFIERE") |> ## Clean age_range + data |> dplyr::select(age_range) |> dplyr::distinct() |> dplyr::pull() # dplyr::mutate( age_range = dplyr::case_when( # !(is.null(edad)) & (edad <5 ) ~ "0-4", # genero %in% c("M" , "MASCULINO" , "Masculino") ~ "M", @@ -133,6 +277,9 @@ alldatasource <- data.prep |> dplyr::distinct() |> dplyr::pull() + + + ## Let's get 2 comparison dataset... ########## dfA <- data.prep |> dplyr::filter(datasource == alldatasource[9]) |> @@ -169,8 +316,9 @@ matches.out <- fastLink::fastLink( n.cores = parallel::detectCores() - 1, return.df = TRUE) - - +## Review the matching in each orginal frame +matchedA <- dfA[matches.out$matches$inds.a, ] +matchedB <- dfB[matches.out$matches$inds.b, ] # Confusion Matrice fastLink::confusion(matches.out, threshold = 0.98) @@ -193,30 +341,28 @@ matched_dfs <- fastLink::getMatches( threshold.match = 0.85 ) +# Display the matches ################### +# convert dfA rownames to a column +dfA_clean <- dfA %>% rownames_to_column() -# convert cases rownames to a column -cases_clean <- cases %>% rownames_to_column() - -# convert test_results rownames to a column -results_clean <- results %>% rownames_to_column() +# convert dfB rownames to a column +dfB_clean <- dfB %>% rownames_to_column() # convert all columns in matches dataset to character, #so they can be joined to the rownames -matches_clean <- my_matches %>% +matches_clean <- matched_dfs %>% mutate(across(everything(), as.character)) - - # Join matches to dfA, then add dfB # column "inds.b" is added to dfA -complete <- left_join(cases_clean, matches_clean, by = c("rowname" = "inds.a")) +complete <- left_join(dfA_clean, matches_clean, by = c("rowname" = "inds.a")) # column(s) from dfB are added -complete <- left_join(complete, results_clean, by = c("inds.b" = "rowname")) +complete <- left_join(complete, dfB_clean, by = c("inds.b" = "rowname")) -#P reprocessing Matches via Blocking ################# +# Preprocessing Matches via Blocking ################# blockgender_out <- fastLink::blockData(dfA, dfB, varnames = "gender") @@ -287,3 +433,5 @@ link.2 <- fastLink::fastLink( agg.out <- fastLink::aggregateEM(em.list = list(link.1, link.2)) + +# Preprocessing Matches via Blocking ################# \ No newline at end of file diff --git a/fastlink.R b/fastlink.R new file mode 100644 index 0000000..75c5530 --- /dev/null +++ b/fastlink.R @@ -0,0 +1,260 @@ +# Clear the workspace by removing all objects +rm(list = ls()) + +# Load the pacman package if it is not already installed. +# if (!requireNamespace("pacman", quietly = TRUE)) { +# install.packages("pacman") +# } +# +# pacman::p_load( +# dplyr, # Data manipulation and transformation +# data.table, # Fast data manipulation with data tables +# stringi, # Character string processing +# lubridate, # Date and time handling +# janitor, # Data cleaning and tabulation functions +# parallel, # Parallel computing +# fastLink, # Record linkage and deduplication +# stringdist # String distance computation +# ) + +# Get the number of detected cores minus 1 +# Reserve one core for non-computational tasks to help prevent system slowdowns or unresponsiveness +numCores <- parallel::detectCores() - 1 + +# Example data frames dfA and dfB +# Replace these with your actual data frames + + +#data(samplematch) + +dfA <- data.frame( + FIRST_NAME = c("John", "Mary", "Robert", "Michael", "Jennifer", "David", "Karen", "Maria", "Carlos", "James"), + LAST_NAME = c("Smith III", "Johnson!", "Williams123", "Brown", "Jones", "Davis", "Miller", "Garcia", "Martinez", "Andrson"), + BIRTH_DATE = c("1981-05-20", "1990-05-15", "1978-12-10", "1985-08-02", "1993-11-25", "1977-03-30", "1988-06-18", "1991-02-05", "1980-09-12", "1982-07-09"), + gender = c("M", "F", "M", "M", "F", "M", "F", "F", "M", "M") +) + +dfB <- data.frame( + RecipientNameFirst = c("John", "Mary", "Robert", "Michael", "Jennifer", "David", "Karenn", "Carloas", "Mariaa", "James"), + RecipientNameLast = c("Smith iv", "Brown-", "Williams", "Jones", "John son", "No Name", "Miller", "Martinez", "Garcia", "Anderson"), + RecipientDateOfBirth = c("1981-05-21", "1992-09-25", "1978-10-12", "1985-08-02", "1993-11-25", "1977-03-30", "1988-06-18", "1980-09-12", "1991-02-05", "1982-07-09"), + gender = c("M", "F", "M", "M", "F", "M", "F", "M", "F", "M") +) + +# # Example data frames dfA and dfB +# dfA <- fread(file = 'dfA.csv', +# sep = ",", +# header = T, +# nThread = numCores) +# +# dfB <- fread(file = 'dfB.csv', +# sep = ",", +# header = T, +# nThread = numCores) + +# Assign a unique key ID to each row +dfA <- dfA |> + dplyr::mutate(row_idA = paste("dfA_", dplyr::row_number(), sep = "")) + +dfB <- dfB |> + dplyr::mutate(row_idB = paste("dfB_", dplyr::row_number(), sep = "")) + +# Convert the date column to a valid date format. +dfA$BIRTH_DATE <- as.Date(dfA$BIRTH_DATE, format = "%Y-%m-%d") # "%m/%d/%Y" +dfB$RecipientDateOfBirth <- as.Date(dfB$RecipientDateOfBirth, format = "%Y-%m-%d") # "%m/%d/%Y" +# +# # Create new date variables by splitting date of birth into three different parts. +dfA <- dfA |> + dplyr::mutate(dob_day = as.numeric(lubridate::day(BIRTH_DATE)), + dob_month = as.numeric(lubridate::month(BIRTH_DATE)), + dob_year = as.numeric(lubridate::year(BIRTH_DATE)), + DOB=BIRTH_DATE) + +dfB <- dfB |> + dplyr::mutate(dob_day = as.numeric(lubridate::day(RecipientDateOfBirth)), + dob_month = as.numeric(lubridate::month(RecipientDateOfBirth)), + dob_year = as.numeric(lubridate::year(RecipientDateOfBirth)), + DOB=RecipientDateOfBirth) + + +# Define a function for data cleaning with additional name removal logic +clean_names <- function(names_column) { + # Step 0: Convert to uppercase + names_column_new <- toupper(names_column) + # Step 1: Remove specified name suffixes + toRemove <- c(" JR", " SR", " IV", " III", " II") + for (tR in toRemove) { + names_column_new <- gsub(tR, "", names_column_new) + } + # Step 2: Convert special characters to ASCII equivalents + names_column_new <- iconv(names_column_new, "latin1", "ASCII//TRANSLIT", sub = "") + # Step 3: Remove punctuation, digits, and all sapces + names_column_new <- gsub("[[:punct:][:digit:]][[:space:]]", "", names_column_new) + # Step 4: Create a new variable with only alphabetic characters + names_column_new <- gsub("[^[:alpha:]]", "", names_column_new) + return(names_column_new) +} + +# Perform data cleaning on dfA using the clean_names function +dfA <- dfA |> + dplyr::mutate_at(dplyr::vars(FIRST_NAME, LAST_NAME), list(new = clean_names)) |> + dplyr::mutate(FN = FIRST_NAME_new, LN = LAST_NAME_new) + +# Perform data cleaning on dfB using the clean_names function +dfB <- dfB |> + dplyr::mutate_at(dplyr::vars(RecipientNameFirst, RecipientNameLast), list(new = clean_names)) |> + dplyr::mutate(FN = RecipientNameFirst_new, LN = RecipientNameLast_new) + + +rm(clean_names) + +# # Create no name list +NoNameList <- c( + "NICKNAME", + "NOFAMILYNAME", + "NOFIRSTNAME", + "NOLASTNAME", + "NOMIDDLENAME", + "NONAME", + "NO", + "UNKNOWN", + "UNK", + "UN", + "NA" +) +# +# # Blank out the names in the data if they match any of the strings in the NoNameList +dfA <- dfA |> + dplyr::mutate(FN = dplyr::case_when( + FN %in% NoNameList~ "", + TRUE ~ FN), + LN = dplyr::case_when( + LN %in% NoNameList ~ "", + TRUE ~ LN)) + +dfB <- dfB |> + dplyr::mutate(FN = dplyr::case_when( + FN %in% NoNameList~ "", + TRUE ~ FN), + LN = dplyr::case_when( + LN %in% NoNameList ~ "", + TRUE ~ LN)) +rm(NoNameList) + +# Delete rows that have missing First Name (FN), Last Name (LN), or Date of Birth (DOB). +dfA <- dfA |> + dplyr::filter(!is.na(FN) & FN != "" & !is.na( LN) & LN != "" & !is.na(DOB) ) + +dfB <- dfB |> + dplyr::filter(!is.na(FN) & FN != "" & !is.na( LN) & LN != "" & !is.na(DOB) ) + + +# Exact Matching +Exact <- merge(dfA, dfB, by=c("FN","LN","DOB", "gender")) + + +# Using the fastLink R package for record linkage +matches.out <- fastLink::fastLink( + dfA = dfA, dfB = dfB, + # Specify the vector of variable names to be used for matching. + # These variable names should exist in both dfA and dfB + varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year', 'gender'), + # Specify which variables among varnames should be compared using string distance + stringdist.match = c('FN', 'LN'), + # Specify which variables present in stringdist.match can be partially matched + partial.match = c('FN', 'LN'), + # Specify which variables should be matched numerically + # Must be a subset of 'varnames' and must not be present in 'stringdist.match'. + numeric.match = c('dob_day', 'dob_month', 'dob_year'), + # Specify the number of CPU cores to utilize (parallel processing). The default value is NULL. + n.cores = numCores, + return.all = TRUE, + return.df = TRUE +) + +# The output from fastLink() will be a list that includes a matrix where each row +# is a match with the relevant indices of dfA (column 1) and dfB (column 2). + +# Confusion Matrice +fastLink::confusion(matches.out, threshold = 0.98) + +# Examine the EM object: +matches.out$EM + +# Summarize the accuracy of the match: +# each column gives the match count, match rate, +# false discovery rate (FDR) and false negative rate (FNR) +# under different cutoffs for matches based on the posterior +# probability of a match. +summary(matches.out) + + +# Get fuzzy matches using the results from fastLink +# A threshold of 0.98 is set for match classification +fuzzy_matches <- fastLink::getMatches(dfA, dfB, + fl.out = matches.out, + threshold.match = 0.98) +fuzzy_matches #|> knitr::kable() + +blockdata_out <- fastLink::blockData(dfA, dfB, + varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'), + # window blocking for numeric variables, where a given observation in dataset A + #will be compared to all observations in dataset B where the value of the blocking + #variable is within ±K of the value of the same variable in dataset A. + #The value of K is the size of the window + # window.block = "birthyear", + # window.size = 1, + # using k-means clustering, so that similar values of string and numeric + # variables are blocked together. + kmeans.block = "FN", + nclusters = 2) + + +## Aggregating Multiple Matches Together ### + +#Often, we run several different matches for a single data set - +#for instance, when blocking by gender or by some other criterion to reduce the +#number of pairwise comparisons. Here, we walk through how to aggregate those +#multiple matches into a single summary. +blockgender_out <- fastLink::blockData(dfA, dfB, varnames = "gender") +## Subset dfA into blocks +dfA_block1 <- dfA[blockgender_out$block.1$dfA.inds,] +dfA_block2 <- dfA[blockgender_out$block.2$dfA.inds,] + +## Subset dfB into blocks +dfB_block1 <- dfB[blockgender_out$block.1$dfB.inds,] +dfB_block2 <- dfB[blockgender_out$block.2$dfB.inds,] + +## Run fastLink on each +fl_out_block1 <- fastLink::fastLink( + dfA_block1, dfB_block1, + varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'), + n.cores = numCores +) +fl_out_block2 <- fastLink::fastLink( + dfA_block2, dfB_block2, + varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'), + n.cores = numCores +) + +#Here, we run fastLink() on the subsets of data defined by blocking on gender in the previous section: + +## Run fastLink on each +link.1 <- fastLink::fastLink( + dfA_block1, dfB_block1, + varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'), + n.cores = numCores + ) + +link.2 <- fastLink::fastLink( + dfA_block2, dfB_block2, + varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'), + n.cores = numCores +) + +#To aggregate the two matches into a single summary, we use the aggregateEM() function as follows: +agg.out <- fastLink::aggregateEM(em.list = list(link.1, link.2)) + +summary(agg.out ) + + diff --git a/index.Rmd b/index.Rmd index 3a77867..2a5b20f 100644 --- a/index.Rmd +++ b/index.Rmd @@ -55,7 +55,7 @@ Performing exact matching is often not do-able when reconciling registration lis .pull-left[ -What needs to be fixed... +Can be numerous ones: ... * Spelling of names and first names can be different @@ -212,17 +212,50 @@ for (tR in toRemove) { ``` ] + --- -## Pre-processing: Date +## Pre-processing: name Decomposition + .pull-left[ +From Full name to First name, Father and Mother name + +This function clean name decomposition - in case it is not included in the original data + +Performing this name decomposition is important in order to enhance record linkage as the name pattern can be different (either "firstname_fathername_mother_name" or "fathername_mothername_firstname") which will minimise linkage probabibility + +In case, there's only 2 elements, it will fill only firstname and fathername + +The function will clean all unexpected spaces within the names, including double spaces + +The function also identify family prefix to be bind such as for Spanish "DEL", 'DE", "DE LOS", "DE LAS" + + +] +.pull-right[ + +``` -Turn Date of Birth to Year-Month and Age -Turn Age to Age Cohort +``` +] + + + +--- + +## Pre-processing: Age & Date of Birth + +.pull-left[ + +Creating a dedicated function.... + +If the date of birth is present, it will parse it, extract year and month, and recalculate both age and age range + +If the date of birth is not mentioned, then it will use age and date_record to reconstruct an estimated date of birth - and recalculate the rest ] .pull-right[ @@ -245,40 +278,19 @@ dplyr::mutate( age = today()- age_cohort = dplyr::case_when( - genero %in% c("F" ,"FEMENINO" , - "f", "Femenino") ~ "F", - - genero %in% c("M" , "MASCULINO" , - "Masculino") ~ "M", - - genero %in% c("X", "Otro") ~ "Ot", + ~ "0-4", + ~ "4-11", + ~ "12-17", + ~ "18-59", + ~ "60+", TRUE ~ NA ) ) ``` ] + ---- - -## Pre-processing: Separate name - - -.pull-left[ - -Use the name pattern to separate the name - - in case family name is more than one word, - -identify family prefix to bind with such as "DEL", 'DE", "DE LOS", "DE LAS" -] -.pull-right[ - -``` - - - -``` -] + @@ -296,7 +308,7 @@ Each steps described above to be: * piped using the tidyverse approach - [Revise this presentation notebook to use the functions]() +Revise the[ demo script](https://github.com/unhcr-americas/record_linkage/blob/main/fastlink.R) and [ applied real use case script](https://github.com/unhcr-americas/record_linkage/blob/main/deduplicate.R) to use the functions ] .pull-right[ @@ -305,137 +317,6 @@ Each steps described above to be: ] -```{r function-separate_fullname, include=FALSE} -#' separate_name -#' -#' use the name pattern to separate the name -#' in case family name is more than one word, -#' identify family prefix to bind with such as "DEL", 'DE", "DE LOS", "DE LAS" -#' @param fullname full name including everything together -#' @param namepattern either "firstname_fathername_mother_name" or "fathername_mothername_firstname -#' @return a list with c("firstname","fathername","mothername") -#' -#' @export -separate_fullname <- function(fullname, namepattern){ - - # ### - # sp <- tidyr::separate(fullname, " ") - # - # return(sp) -} - -test <- data |> - dplyr::filter( is.na(nombres) ) |> - dplyr::select(nombre_completo, name_pattern) - -fullname <- test$nombre_completo -namepattern <- test$name_pattern -sep <- separate_fullname(fullname, namepattern) - -# data |> dplyr::select( name_pattern) |> dplyr::distinct() |> dplyr::pull() - -``` - - - -```{r function-cleanvar, include=FALSE} -#' cleanvar -#' -#' function for data cleaning with additional name removal logic -#' -#' @param names_column name of the column to treat -#' @param toRemove default vector with stuff to remove from name -#' c(" JR", " SR", " IV", " III", " II") -#' @return names_column_new name of the column treat -#' -#' @export -cleanvar <- function(names_column, - toRemove = c(" JR", " SR", " IV", " III", " II")) { - # Convert to uppercase - names_column_new <- toupper(names_column) - # Remove specified name suffixes - for (tR in toRemove) { - names_column_new <- gsub(tR, "", names_column_new) - } - # Convert special characters to ASCII equivalents - names_column_new <- iconv(names_column_new, "latin1", "ASCII//TRANSLIT", sub = "") - # Remove punctuation, digits, and all spaces - names_column_new <- gsub("[[:punct:][:digit:]][[:space:]]", "", names_column_new) - # Create a new variable with only alphabetic characters - names_column_new <- gsub("[^[:alpha:]]", "", names_column_new) - - return(names_column_new) -} - -``` - - - - -```{r example-cleanvar, include=FALSE} -## Filter the right nationality -data.prep <- data |> - - ## Filter where the phone number is not available -- "NO REFIERE" - dplyr::filter( telefono != "NO REFIERE") |> - - ## Clean age_range - # dplyr::mutate( age_range = dplyr::case_when( - # !(is.null(edad)) & (edad <5 ) ~ "0-4", - # genero %in% c("M" , "MASCULINO" , "Masculino") ~ "M", - # genero %in% c("X", "Otro") ~ "Ot", - # TRUE ~ age_range )) |> - - ## Clean the gender variable - # data |> dplyr::select(genero) |> dplyr::distinct() |> dplyr::pull() - dplyr::mutate(gender = dplyr::case_when( - genero %in% c("F" ,"FEMENINO" ,"f", "Femenino") ~ "F", - genero %in% c("M" , "MASCULINO" , "Masculino") ~ "M", - genero %in% c("X", "Otro") ~ "Ot", - TRUE ~ NA )) |> - - ## Only retain the nationality of interest - # data |> dplyr::select(nacionalidad) |> dplyr::distinct() |> dplyr::pull() - dplyr::mutate(nationality = dplyr::case_when( - nacionalidad %in% c("Venzuela", "venezuela", "Venezolana", - "VENEZUELA", - "Venezuela", "VENEZOLANO", "VENEZOLANA") ~ "VEN", - nacionalidad %in% c("COLOMBIANO", "COLOMBIANA", "COLOMBIA", - "colombia", "Colombia", "Nac. Colombia", - "Colombiana" ) ~ "COL", - TRUE ~ "other" )) |> - dplyr::filter( nationality %in% c("VEN", "COL" )) -``` - - -```{r example-cleanvar2, include=FALSE} -data.prep <- data.prep |> - ## Apply cleanvar() - # Perform data cleaning on dfA using the clean_names function - dplyr::mutate_at( dplyr::vars(nombres, apellido_paterno, apellido_materno, - asistencia, departamento, telefono, - planilla, socio), - list(new = cleanvar)) |> - - - ### identify single data source - dplyr::mutate(datasource = paste0(socio_new, "_", planilla_new)) |> - - # ## Retain only fields for record linkage - dplyr::select(datasource, nationality, nombres_new, apellido_paterno_new, - apellido_materno_new, asistencia_new, departamento_new, - telefono_new, gender) - -# table(data.prep$gender, useNA = "ifany") -# table(data.prep$nationality, useNA = "ifany") -# table(data.prep$datasource, useNA = "ifany") -# ## See if can use departamento for blocking -# table(data.prep$datasource, data.prep$departamento_new, useNA = "ifany") -#dput(names(data.prep)) -``` - - - --- ## Reduction of search space: Blocking @@ -473,13 +354,13 @@ matches.out <- fastLink::fastLink( # Specify the vector of variable names to be used for matching. # These variable names should exist in both dfA and dfB - varnames = c('FN', 'LN', 'dob_day', 'dob_month', 'dob_year'), + varnames = c('FirstName', 'LastName', 'dob_day', 'dob_month', 'dob_year'), # Specify which variables among varnames should be compared using string distance - stringdist.match = c('FN', 'LN'), + stringdist.match = c('FirstName', 'LastName'), # Specify which variables present in stringdist.match can be partially matched - partial.match = c('FN', 'LN'), + partial.match = c('FirstName', 'LastName'), # Specify which variables should be matched numerically # Must be a subset of 'varnames' and must not be present in 'stringdist.match'. @@ -494,6 +375,17 @@ matches.out <- fastLink::fastLink( ??? The default threshold for matches is 0.94 (threshold.match =) but you can adjust it higher or lower. If you define the threshold, consider that higher thresholds could yield more false-negatives (rows that do not match which actually should match) and likewise a lower threshold could yield more false-positive matches. +--- + +## Results on dummy data + +```{r} +#source("fastlink.R") + +``` + + + --- ## Evaluate results diff --git a/index.html b/index.html index a71f95b..02e31d9 100644 --- a/index.html +++ b/index.html @@ -3,7 +3,7 @@ Record Linkage & Deduplication - +