## functions.R
## Package imports used throughout (assumed to be installed):
library(stringr)     # str_match
library(data.table)  # setDT, dcast, melt
library(stringdist)  # stringdist, amatch

## Trim leading and trailing whitespace
trim = function(x) gsub("^\\s+|\\s+$", "", x)
## Collapse internal whitespace runs to a single space
trim_internal = function(x) gsub("\\s+", " ", x)
## Apply both trims
trim_all = function(x){
  trimmed = trim(x)
  trimmed_internal = trim_internal(trimmed)
  return(trimmed_internal)
}
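## Illustrative check (hypothetical input), commented out so sourcing this file stays side-effect free:
# trim_all("  too   many   spaces  ")   # -> "too many spaces"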
## Extract the first honorific title (e.g. "Mr", "Dr.") found in each name string.
## Word boundaries stop partial matches inside longer words, and the optional
## "\\.?" covers both the dotted and undotted forms in one alternation.
extract_titles = function(name_strings){
  patterns = "\\b(?:ms|miss|mrs|mr|master|ref|fr|dr|atty|prof|hon|pres|gov|coach|ofc|unknown)\\b\\.?"
  titles = str_match(name_strings, regex(patterns, ignore_case = TRUE))
  return(titles)
}
## Remove honorific titles from name strings and re-trim the result
strip_titles = function(name_strings){
  patterns = "\\b(?:ms|miss|mrs|mr|master|ref|fr|dr|atty|prof|hon|pres|gov|coach|ofc)\\b\\.?"
  scrubbed_names = trim_all(gsub(patterns, "", name_strings, ignore.case = TRUE, perl = TRUE))
  return(scrubbed_names)
}
## Split a full name into its first token, middle portion, and last token.
## The character class allows word characters, apostrophes, and hyphens.
top_tail_names = function(name_strings){
  matches = str_match(name_strings, "(^[\\w'-]+)(.+?)([\\w'-]+$)")
  matches[,3] = trim_all(matches[,3])
  return(matches[,2:4])
}
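## Illustrative usage of the name helpers (hypothetical name strings):
# extract_titles(c("Dr. Jane Doe", "Mr John Smith"))   # -> "Dr.", "Mr"
# strip_titles("Mr John Smith")                        # -> "John Smith"
# top_tail_names("Jane van der Doe")                   # -> "Jane", "van der", "Doe"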
## Round up to the next power of 10 (crude: exact powers return themselves
## and x <= 0 breaks - fix this)
roundUp = function(x) 10^ceiling(log10(x))
## Populate a new column with a standard score based on the frequency
## or sum of an attribute within each level of a category column.
category_to_value = function(df, cat_col, var_col=NULL, new_col_name, scale_data=TRUE, type="sum"){
  if(type == "sum") {
    cat_table = aggregate(df[,var_col], by=list(cat_col = df[,cat_col]), FUN=match.fun(type))
  } else if (type == "freq") {
    cat_table = data.frame(table(df[,cat_col]))
    colnames(cat_table) = c("cat_col", "x")
  }
  if(scale_data == TRUE) {
    cat_table$x = scale(cat_table$x)
  }
  df[,new_col_name] = cat_table$x[match(df[,cat_col], cat_table$cat_col)]
  return(df)
}
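## Illustrative usage (hypothetical data frame and columns):
# sales = data.frame(region = c("N", "N", "S"), revenue = c(10, 20, 5))
# sales = category_to_value(sales, cat_col = "region", var_col = "revenue",
#                           new_col_name = "region_revenue_z", type = "sum")
# sales = category_to_value(sales, cat_col = "region",
#                           new_col_name = "region_freq_z", type = "freq")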
## Build a per-category summary table: row counts plus the chosen columns
## aggregated with the supplied function (named as a string, e.g. "sum" or "mean")
create_aggregate_matrix = function(dataframe, comparison_col, aggregate_columns=c(), func="sum") {
  aggregate_matrix = data.frame(table(dataframe[,comparison_col]))
  for (i in aggregate_columns) {
    agg = aggregate(dataframe[,i], by=list(Var1 = dataframe[,comparison_col]), FUN=match.fun(func))
    aggregate_matrix = merge(aggregate_matrix, setNames(agg, c("Var1", i)), by="Var1")
  }
  ## Set column names
  colnames(aggregate_matrix) = c(comparison_col, "count", aggregate_columns)
  return(aggregate_matrix)
}
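## Illustrative usage (hypothetical columns): per-region row counts plus summed revenue and units
# create_aggregate_matrix(sales, comparison_col = "region",
#                         aggregate_columns = c("revenue", "units"), func = "sum")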
## Cross-tabulate two columns of a df as proportions of the whole
table_comparison = function(df, col1, col2) {
tab = data.frame(prop.table(table(df[,col1], df[, col2])))
colnames(tab) = c(col1, col2, "Freq")
return(tab)
}
## Generic function to match against an existing table of values and fill a data
## frame with its contents. NOTE: both dfs must share the same match column name.
match_and_populate = function(df, match_df, match_col, fill_col, new_col_name){
df[,new_col_name] = match_df[,fill_col][match(df[,match_col], match_df[,match_col])]
return(df)
}
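## Illustrative usage (hypothetical lookup table sharing an "id" column):
# df = match_and_populate(df, lookup_df, match_col = "id",
#                         fill_col = "label", new_col_name = "label")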
## Build a frequency table for a column and attach custom metadata.
## NOTE: new_col_dat must be ordered to match table()'s level order.
custom_table = function(df, table_col, new_col_name, new_col_dat, scale_dat=FALSE){
df_table = data.frame(table(df[,table_col]))
if(scale_dat==TRUE){
new_col_dat = scale(new_col_dat)
}
df_table[,new_col_name] = new_col_dat
colnames(df_table) = c(table_col, "count", new_col_name)
return(df_table)
}
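## Illustrative usage (hypothetical metadata vector, one value per table() level, in level order):
# custom_table(df, table_col = "region", new_col_name = "population",
#              new_col_dat = c(5e6, 2e6), scale_dat = TRUE)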
## Partition dataset into train and test
partition_df = function(df, set="train", seed = 123, train_size = 0.75){
## Set train size
smp_size = floor(train_size * nrow(df))
## Set seed to make it reproducible
set.seed(seed)
train_ind = sample(seq_len(nrow(df)), size = smp_size)
train=df[train_ind,]
test =df[-train_ind,]
if (set == "train"){
return(train)
} else if (set == "test") {
return(test)
}
}
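## Illustrative usage: the same seed yields reproducible, complementary splits
# train = partition_df(my_df, set = "train", seed = 123)
# test  = partition_df(my_df, set = "test",  seed = 123)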
### Data integrity check: non-NA coverage per column, optionally split by a category column
data_integrity_check = function(dataframe, by_col=FALSE){
  if(by_col == FALSE){
    di_check = data.frame(colname = colnames(dataframe), records = NA, percent = NA)
    for (col in colnames(dataframe)){
      records = sum(!is.na(dataframe[,col]))
      di_check[di_check$colname == col, ]$records = records
      di_check[di_check$colname == col, ]$percent = round(records/nrow(dataframe)*100, 2)
    }
    return(di_check[rev(order(di_check$percent)),])
  } else {
    ## One output column per level of by_col, plus one listing the original column names.
    ## as.character guards against factor levels collapsing to integer codes in c().
    di_cols = unique(as.character(dataframe[,by_col]))
    di_cols = c("column_name", di_cols)
    di_check = data.frame(matrix(ncol=length(di_cols), nrow=length(colnames(dataframe))))
    colnames(di_check) = di_cols
    df_cols = colnames(dataframe)
    di_check$column_name = df_cols
    ## Start this mad iteration: one slice of the data per by_col level
    for (di_col in colnames(di_check)[-1]) {
      slice = dataframe[dataframe[,by_col] == di_col,]
      ## Dream within a dream: fraction of non-NA values per original column within the slice
      for (df_col in df_cols) {
        has_data = length(na.omit(slice[,df_col]))
        if(has_data != 0){
          frac_present = has_data / nrow(slice)
        } else {
          frac_present = 0
        }
        di_check[di_check[,"column_name"] == df_col, di_col] = round(frac_present, 2)
      }
    }
    return(di_check)
  }
}
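## Illustrative usage (hypothetical df and category column):
# data_integrity_check(my_df)                     # non-NA count and percent per column
# data_integrity_check(my_df, by_col = "source")  # completeness split by source level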
### Store normalization parameters (center and scale) so they can be reapplied later
store_scale_values = function(df, cols = c()){
  df_attrs = data.frame(col_scaled = cols, scale=NA, center=NA)
  for(col in cols){
    scaled = scale(df[,col])
    df_attrs[df_attrs$col_scaled == col,]$scale = attr(scaled, "scaled:scale")
    df_attrs[df_attrs$col_scaled == col,]$center = attr(scaled, "scaled:center")
  }
  return(df_attrs)
}
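## Illustrative usage: reapply training-set scaling to new data (hypothetical columns):
# params = store_scale_values(train_df, cols = c("age"))
# z = (new_df$age - params[params$col_scaled == "age", "center"]) /
#       params[params$col_scaled == "age", "scale"]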
### Cast many rows into single row with group members
cast_long_df = function(df, uid, cast_col, cast_prefix){
## Trim to only include needed columns
df_trimmed = df[, c(uid, cast_col)]
## Remove empty rows
df_trimmed = na.omit(df_trimmed)
## Convert to data table
setDT(df_trimmed)
## Sequence number within each uid group
df_trimmed[, N := 1:.N, by = uid]
## Cast to wide: one new column per group member
df_trimmed = dcast(df_trimmed, get(uid)~paste(cast_prefix, N, sep=""), value.var=cast_col, sep="")
## Rename uid to original name
colnames(df_trimmed)[1] = uid
## Return these to original df
df = merge(df, df_trimmed, by=uid)
## Remove original single column
df[,c(cast_col)] = NULL
## Remove duplicate entries
df = df[!duplicated(df[,uid]),]
## Return df
return(df)
}
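## Illustrative usage (hypothetical long table of phone numbers): one row per person
## with columns phone1, phone2, ...
# people = cast_long_df(people_long, uid = "person_id",
#                       cast_col = "phone", cast_prefix = "phone")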
## Melt chosen wide columns back into long format
melt_wide_df = function(df, uid, melt_cols = c(), variable_name = "variable", value_name="values", remove_na = TRUE){
  df_melted = melt(df, id.vars = uid, measure.vars = melt_cols, variable.name = variable_name, value.name = value_name, na.rm = remove_na)
  return(df_melted)
}
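## Illustrative usage: the inverse reshape back to long format (hypothetical columns):
# people_long = melt_wide_df(people, uid = "person_id",
#                            melt_cols = c("phone1", "phone2"))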
### Matching functions
### Start simple: try the different methods and pick the row they converge on.
### NOTE: still a planning stub - it builds the match strings but does not yet match.
fuzzy_match = function(df1, df2, id1, id2, match_cols1, match_cols2) {
  ## Add match_strings to both dfs and store the column information
  df1 = add_match_string_to_df(df1, match_cols1)
  df1_col_names = store_match_col_names(match_cols1)
  df2 = add_match_string_to_df(df2, match_cols2)
  df2_col_names = store_match_col_names(match_cols2)
  print(colnames(df2))
  print(colnames(df1))
  ## TODO (rip this into a function):
  ## Try the different match methods and add the distance alongside:
  ##   for each match variant column, for each match method,
  ##   add two columns - one containing the foreign row id,
  ##   the other containing the lowest distance score.
  ## For each method, converge on the common match row and add the foreign id,
  ##   the sum of the distances, and the foreign match_string for that row.
  ## Store in a new variant df of matches, and a variant df of non-matches.
  ## For each variant df, sum the method distances and pick the lowest sum.
  ## Save the lowest-sum df (with correct meta) and its non-matches to disk,
  ##   with an additional file recording match success.
}
### Minimum string distance from one local string to a foreign column,
### plus how many foreign rows tie at that minimum
fuzzy_get_match_info = function(local_match_string, foreign_df, foreign_col, fuzzy_method="lv"){
  row_dists = stringdist(local_match_string, foreign_df[,foreign_col], method=fuzzy_method)
  min_dist = min(row_dists)
  num_matches = length(which(row_dists == min_dist))
  ## Returns the minimum distance and the number of matches
  return(matrix(c(min_dist, num_matches), nrow=1, ncol=2))
}
## Return all foreign strings tied at the minimum distance
## (uses stringdist's default method, unlike the other helpers)
fuzzy_get_multiple = function(local_match_string, foreign_df, foreign_col) {
  distances = stringdist(local_match_string, foreign_df[, foreign_col])
  min_rows = which(distances == min(distances))
  match_strings = foreign_df[min_rows, foreign_col]
  return(match_strings)
}
### Add fuzzy match columns (best master match, distance, tie count) to local_df
fuzzy_add_cols = function(local_df, master_df, match_col = "match_string", master_id_col, max_dist=60, fuzzy_method="lv"){
### Get all match info between the dfs
info_cols = sapply(local_df[,match_col], function(x) {
best_match = amatch(x, master_df[,match_col], maxDist=max_dist, method=fuzzy_method)
match_info = fuzzy_get_match_info(x, master_df, match_col, fuzzy_method)
min_dist = match_info[1]
num_matches = match_info[2]
c(best_match,min_dist,num_matches)
})
## sapply returns one column per local row, so transpose to one row per local row
info_cols = t(info_cols)
### Place match cols in local_df
local_df[,"master_match_string"] = master_df[info_cols[,1], match_col]
local_df[,"master_match_id"] = master_df[info_cols[,1], master_id_col]
local_df[,"master_min_dist"] = info_cols[,2]
local_df[,"master_num_matches"] = info_cols[,3]
return(local_df)
}
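## Illustrative usage (hypothetical dfs and id column): build match strings,
## then attach the best fuzzy matches from the master df:
# local  = add_match_string_to_df(local_raw,  c("first_name", "last_name"))
# master = add_match_string_to_df(master_raw, c("first_name", "last_name"))
# local  = fuzzy_add_cols(local, master, master_id_col = "master_id",
#                         max_dist = 10, fuzzy_method = "lv")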
### Add match strings to df based on match_cols
### EDIT - changed to reflect new approach with no variants
add_match_string_to_df = function(df, match_cols){
## Kept in case of a return to variant approach
# match_col_variants = get_match_col_variants(match_cols)
# for(match_col_variant in match_col_variants){
# match_col_title = paste("match_string", paste(match_col_variant, collapse="-"), sep="-")
# df[, match_col_title] = generate_match_string(df, match_col_variant)
# }
## Simple case: build the match string from the chosen cols
df$match_string = generate_match_string(df, match_cols)
return(df)
}
### Store match_col names for accessing the new columns
store_match_col_names = function(match_cols){
  match_col_names = c()  # accumulator for the generated names
  match_col_variants = get_match_col_variants(match_cols)
  for(match_col_variant in match_col_variants){
    match_col_name = paste("match_string", paste(match_col_variant, collapse="-"), sep="-")
    match_col_names = c(match_col_names, match_col_name)
  }
  return(match_col_names)
}
### Build a lowercase, whitespace-normalised match string from the chosen columns
generate_match_string = function(df, match_cols){
  if(length(match_cols) == 1){
    match_string = tolower(df[,match_cols])
  } else {
    match_string = tolower(do.call(paste, c(df[,match_cols], sep=" ")))
  }
match_string = trim_all(match_string)
return(match_string)
}
### Get possible variants for matching columns and return list of variants
get_match_col_variants = function(match_cols) {
variants = list()
### For each combination size, from one column up to all of them
for(i in seq_along(match_cols)){
## Set the combinations to the size we are currently on
combination = combn(match_cols, i)
## Loop through each of the combinations
for(comb_col in 1:ncol(combination)){
## Store them in this list
variants = c(variants, list(combination[,comb_col]))
}
}
return(variants)
}
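## Illustrative usage: all non-empty column combinations, smallest first:
# get_match_col_variants(c("first", "last"))
# -> list("first", "last", c("first", "last"))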