Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix type coercion in bmerge #6603

Draft
wants to merge 24 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 65 additions & 30 deletions R/bmerge.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,86 +34,121 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
ans
}

# Convert `x` with the conversion function `as.f` (e.g. as.integer, as.double)
# and then put x's original attributes back on the converted vector, so that
# classes such as Date survive the type change.
#   x    : vector to convert
#   as.f : function taking x and returning the converted vector
# Returns the converted vector carrying x's attributes (if x had any).
cast_with_atts = function(x, as.f) {
  converted = as.f(x)
  atts = attributes(x)
  # nothing to restore when x carries no attributes at all
  if (is.null(atts)) return(converted)
  attributes(converted) = atts
  converted
}

# For every pair of join columns (icols[a], xcols[a]): check both column types are
# supported, then coerce one side with set() (on i, x and in some branches callersi)
# so that the two columns can be matched. The loop is skipped when i has no rows.
# supported, getClass, callersi, isReallyReal, allNA, vapply_1c, stopf and catf are
# defined outside this section.
if (nrow(i)) for (a in seq_along(icols)) {
# - check that join columns have compatible types
# - do type coercions if necessary on just the shallow local copies for the purpose of join
# - handle factor columns appropriately
# Note that if i is keyed, if this coerces i's key gets dropped by set()
ic = icols[a]
xc = xcols[a]
# NOTE(review): xclass/iclass and x_merge_type/i_merge_type are assigned from the same
# getClass() calls on the same columns; both spellings are used further down.
xclass = getClass(x[[xc]])
iclass = getClass(i[[ic]])
x_merge_type = getClass(x[[xc]])
i_merge_type = getClass(i[[ic]])
# names prefixed with "x." / "i." are only used in messages
xname = paste0("x.", names(x)[xc])
iname = paste0("i.", names(i)[ic])
if (!xclass %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, xclass)
if (!iclass %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, iclass)
# factor on either side: only factor<->factor and factor<->character are handled (each
# ends in next); any other pairing reaches the "Incompatible join types" error below
if (xclass=="factor" || iclass=="factor") {
if (!x_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, x_merge_type)
if (!i_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, i_merge_type)
if (x_merge_type=="factor" || i_merge_type=="factor") {
# a roll join is rejected when the factor column is the last join column
if (roll!=0.0 && a==length(icols))
stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname)
if (xclass=="factor" && iclass=="factor") {
if (x_merge_type=="factor" && i_merge_type=="factor") {
if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname)
set(i, j=ic, value=chmatch(levels(i[[ic]]), levels(x[[xc]]), nomatch=0L)[i[[ic]]]) # nomatch=0L otherwise a level that is missing would match to NA values
next
} else {
if (xclass=="character") {
if (x_merge_type=="character") {
if (verbose) catf("Coercing factor column %s to type character to match type of %s.\n", iname, xname)
set(i, j=ic, value=val<-as.character(i[[ic]]))
set(callersi, j=ic, value=val) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581
next
} else if (iclass=="character") {
} else if (i_merge_type=="character") {
if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname)
newvalue = chmatch(i[[ic]], levels(x[[xc]]), nomatch=0L)
if (anyNA(i[[ic]])) newvalue[is.na(i[[ic]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809
set(i, j=ic, value=newvalue)
next
}
}
stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, xclass, iname, iclass)
stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, iname, i_merge_type)
}
# identical types: nothing to do for this column pair
if (xclass == iclass) {
if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, xclass, xname)
if (x_merge_type == i_merge_type) {
if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname)
next
}
# differing types where one side is character, logical or factor: an error, unless one
# side is entirely NA, in which case that side is converted to the other side's type
if (xclass=="character" || iclass=="character" ||
xclass=="logical" || iclass=="logical" ||
xclass=="factor" || iclass=="factor") {
if (x_merge_type=="character" || i_merge_type=="character" ||
x_merge_type=="logical" || i_merge_type=="logical" ||
x_merge_type=="factor" || i_merge_type=="factor") {
if (anyNA(i[[ic]]) && allNA(i[[ic]])) {
if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", iname, iclass, xclass, xname)
set(i, j=ic, value=match.fun(paste0("as.", xclass))(i[[ic]]))
if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", iname, i_merge_type, x_merge_type, xname)
set(i, j=ic, value=match.fun(paste0("as.", x_merge_type))(i[[ic]]))
next
}
else if (anyNA(x[[xc]]) && allNA(x[[xc]])) {
if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", xname, xclass, iclass, iname)
set(x, j=xc, value=match.fun(paste0("as.", iclass))(x[[xc]]))
if (anyNA(x[[xc]]) && allNA(x[[xc]])) {
if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", xname, x_merge_type, i_merge_type, iname)
set(x, j=xc, value=match.fun(paste0("as.", i_merge_type))(x[[xc]]))
next
}
stopf("Incompatible join types: %s (%s) and %s (%s)", xname, xclass, iname, iclass)
stopf("Incompatible join types: %s (%s) and %s (%s)", xname, x_merge_type, iname, i_merge_type)
}
# integer64 on one side: the other side is converted with bit64::as.integer64 when it is
# integer, or double for which isReallyReal() is FALSE; otherwise the join is an error
if (xclass=="integer64" || iclass=="integer64") {
if (x_merge_type=="integer64" || i_merge_type=="integer64") {
nm = c(iname, xname)
if (xclass=="integer64") { w=i; wc=ic; wclass=iclass; } else { w=x; wc=xc; wclass=xclass; nm=rev(nm) } # w is which to coerce
if (x_merge_type=="integer64") { w=i; wc=ic; wclass=i_merge_type; } else { w=x; wc=xc; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce
if (wclass=="integer" || (wclass=="double" && !isReallyReal(w[[wc]]))) {
if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which contains no fractions)" else "", nm[2L])
set(w, j=wc, value=bit64::as.integer64(w[[wc]]))
} else stopf("Incompatible join types: %s is type integer64 but %s is type double and contains fractions", nm[2L], nm[1L])
} else {
# just integer and double left
if (iclass=="double") {
# positions in icols that refer to this same i column; more than one when the i column
# is joined to several x columns
ic_idx = which(ic == icols)
if (i_merge_type=="double") {
# NOTE(review): despite its name, coerce_x==TRUE leads to i (and callersi) being cast to
# integer below, while coerce_x==FALSE leads to x[[xc]] being cast to double
coerce_x = FALSE
if (!isReallyReal(i[[ic]])) {
coerce_x = TRUE
# common case of ad hoc user-typed integers missing L postfix joining to correct integer keys
# we've always coerced to int and returned int, for convenience.
if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s.\n", iname, xname)
val = as.integer(i[[ic]])
if (!is.null(attributes(i[[ic]]))) attributes(val) = attributes(i[[ic]]) # to retain Date for example; 3679
set(i, j=ic, value=val)
set(callersi, j=ic, value=val) # change the shallow copy of i up in [.data.table to reflect in the result, too.
} else {
if (length(ic_idx)>1L) {
xc_idx = xcols[ic_idx]
# NOTE(review): which() returns positions within xc_idx, but they are used to index
# xcols here (and in the two similar loops below) - confirm whether xc_idx[which(...)]
# was intended; the two coincide only when ic_idx is 1..length(xcols)
for (xb in xcols[which(vapply_1c(x[0L, xc_idx, with=FALSE], getClass) == "double")]) {
if (isReallyReal(x[[xb]])) {
coerce_x = FALSE
break
}
}
}
if (coerce_x) {
if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s.\n", iname, xname)
val = cast_with_atts(i[[ic]], as.integer) # to retain Date for example; 3679
set(i, j=ic, value=val)
set(callersi, j=ic, value=val) # change the shallow copy of i up in [.data.table to reflect in the result, too.
if (length(ic_idx)>1L) {
xc_idx = xcols[ic_idx]
for (xb in xcols[which(vapply_1c(x[0L, xc_idx, with=FALSE], getClass) == "double")]) {
if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s.\n", paste0("x.", names(x)[xb]), xname)
set(x, j=xb, value=cast_with_atts(x[[xb]], as.integer))
}
}
}
}
if (!coerce_x) {
if (verbose) catf("Coercing integer column %s to type double to match type of %s which contains fractions.\n", xname, iname)
set(x, j=xc, value=as.double(x[[xc]]))
}
} else {
# i is integer and x is double: i is cast to double for the join (callersi is not touched here)
if (verbose) catf("Coercing integer column %s to type double for join to match type of %s.\n", iname, xname)
set(i, j=ic, value=as.double(i[[ic]]))
val = cast_with_atts(i[[ic]], as.double)
set(i, j=ic, value=val)
if (length(ic_idx)>1L) {
xc_idx = xcols[ic_idx]
for (xb in xcols[which(vapply_1c(x[0L, xc_idx, with=FALSE], getClass) == "integer")]) {
if (verbose) catf("Coercing integer column %s to type double for join to match type of %s.\n", paste0("x.", names(x)[xb]), xname)
set(x, j=xb, value=cast_with_atts(x[[xb]], as.double))
}
}
}
}
}
Expand Down
34 changes: 31 additions & 3 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -12226,10 +12226,11 @@ DT1 = data.table(RANDOM_STRING = rand_strings(n),
DATE = sample(seq(as.Date('2016-01-01'), as.Date('2016-12-31'), by="day"), n, replace=TRUE))
DT2 = data.table(RANDOM_STRING = rand_strings(n),
START_DATE = sample(seq(as.Date('2015-01-01'), as.Date('2017-12-31'), by="day"), n, replace=TRUE))
as.intDate = function(x) .Date(as.integer(as.Date(x)))
DT2[, EXPIRY_DATE := START_DATE + floor(runif(1000, 200,300))]
DT1[, DT1_ID := .I][, DATE := as.Date(DATE)]
DT1[, DT1_ID := .I][, DATE := as.intDate(DATE)]
cols = c("START_DATE", "EXPIRY_DATE")
DT2[, DT2_ID := .I][, (cols) := lapply(.SD, as.Date), .SDcols=cols]
DT2[, DT2_ID := .I][, (cols) := lapply(.SD, as.intDate), .SDcols=cols]
ans1 = DT2[DT1, on=.(RANDOM_STRING, START_DATE <= DATE, EXPIRY_DATE >= DATE), .N, by=.EACHI ]$N > 0L
tmp = DT1[DT2, on=.(RANDOM_STRING, DATE >= START_DATE, DATE <= EXPIRY_DATE), which=TRUE, nomatch=0L]
ans2 = DT1[, DT1_ID %in% tmp]
Expand Down Expand Up @@ -15727,7 +15728,8 @@ DT = data.table(z = 1i)
test(2069.33, DT[DT, on = 'z'], error = "Type 'complex' is not supported for joining/merging")

# forder verbose message when !isReallyReal Date, #1738
DT = data.table(d=sample(seq(as.Date("2015-01-01"), as.Date("2015-01-05"), by="days"), 20, replace=TRUE))
date_dbl = as.Date(as.double(seq(as.Date("2015-01-01"), as.Date("2015-01-05"), by="days")), origin="1970-01-01")
DT = data.table(d=sample(date_dbl, 20, replace=TRUE))
test(2070.01, typeof(DT$d), "double")
test(2070.02, DT[, .N, keyby=d, verbose=TRUE], output="Column 1.*date.*8 byte double.*no fractions are present.*4 byte integer.*to save space and time")

Expand Down Expand Up @@ -20596,3 +20598,29 @@ test(2295.3, is.data.table(d2))

# #6588: .checkTypos used to give arbitrary strings to stopf as the first argument
test(2296, d2[x %no such operator% 1], error = '%no such operator%')

# coerce Dates to double if join on multiple columns, #6602
# Each pair of tests joins two y columns (one integer, one double) to the same
# single column a of x, once in each on= order.
# integer a; y has integer c and double d; verbose output must match "Coercing .*c to type double"
x = data.table(a=1L)
y = data.table(c=1L, d=1)
test(2297.01, options=c(datatable.verbose=TRUE), y[x, on=.(c == a, d == a)], data.table(c=1L, d=1L), output="Coercing .*c to type double")
test(2297.02, options=c(datatable.verbose=TRUE), y[x, on=.(d == a, c == a)], data.table(c=1L, d=1L), output="Coercing .*c to type double")
# double a (whole number); y has double c and integer d; verbose output must report x.c being coerced to integer
x = data.table(a=1)
y = data.table(c=1, d=1L)
test(2297.03, options=c(datatable.verbose=TRUE), y[x, on=.(c == a, d == a)], data.table(c=1L, d=1L), output="Coercing double column x.c (which contains no fractions) to type integer")
test(2297.04, options=c(datatable.verbose=TRUE), y[x, on=.(d == a, c == a)], data.table(c=1L, d=1L), output="Coercing double column x.c (which contains no fractions) to type integer")
# dates
# d_int is a Date built on an integer, d_dbl a Date built on a double; in all four
# combinations the expected result has both columns equal to d_int
d_int = .Date(1L)
d_dbl = .Date(1)
x = data.table(a=d_int)
y = data.table(c=d_int, d=d_dbl)
test(2297.11, y[x, on=.(c == a, d == a)], data.table(c=d_int, d=d_int))
test(2297.12, y[x, on=.(d == a, c == a)], data.table(c=d_int, d=d_int))
x = data.table(a=d_dbl)
y = data.table(c=d_dbl, d=d_int)
test(2297.13, y[x, on=.(c == a, d == a)], data.table(c=d_int, d=d_int))
test(2297.14, y[x, on=.(d == a, c == a)], data.table(c=d_int, d=d_int))
# real double
# y column c holds a fractional value (1.5); the expected result columns are double
x = data.table(a=1)
y = data.table(c=1.5, d=1L)
test(2297.21, y[x, on=.(c == a, d == a)], data.table(c=1, d=1))
test(2297.22, y[x, on=.(d == a, c == a)], data.table(c=1, d=1))
Loading