-
Notifications
You must be signed in to change notification settings - Fork 1
/
useful_functions.R
executable file
·485 lines (415 loc) · 20 KB
/
useful_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
#Useful libraries
library(ape)
library(phytools)
library(seqinr)
library(RColorBrewer)
library(gplots)
library(phangorn)
library(stringr)
library(ggtree)
library(BiocInstaller)
library(ggbio)
library(Biostrings)
plotphylo = function(tree, clustervar, type="fan", cex=0.75, pch=16, col = c(colors3, colors200), main="", legend = F, columnlength = 10, sub="", legend_title="",
edge.width = 1){
#clustervar is a named integer vector where the names are the tip labels and the ints are the cluster number
if(legend == FALSE){
plot(tree, type=type, show.tip.label=F, main=main, sub = sub, edge.width = edge.width)
tips = which(tree$tip.label %in% names(clustervar))
clustervar = clustervar[names(clustervar) %in% tree$tip.label]
tiplabels(tip = tips, cex = cex, pch=pch,
col = col[clustervar[order(match(names(clustervar), tree$tip.label))]])
}
if (legend== TRUE){
par(oma = c(1,1,1,10)) #makes plot with space on the right (large margin)
#If you want space on the bottom make it oma = c(10,1,1,1) - goes (bottom, left, top, right)
#You can also play around with the 10 and make it bigger or smaller
plotphylo(tree, clustervar, type, cex, pch, col, main, legend = FALSE, columnlength, sub, legend_title, edge.width = edge.width) #plot main figure
par(fig = c(0, 1, 0, 1), oma = c(1, 1, 1, 1), mar = c(0, 0, 0, 0), new = TRUE) #make new margins the whole space
plot(0, 0, type = "n", bty = "n", xaxt = "n", yaxt = "n") #plot blank figure on whole space
#add legend or whatever you want to the right.
legend(x="right", horiz = F, legend=(sort(unique(clustervar))),
bty="n", col=col, pch=16, cex = 1, x.intersp = 1, y.intersp = 0.75,
ncol = ceiling(length(unique(clustervar))/columnlength), title=legend_title)
}
}
plot_clusters = function(cl_list, tree, titles=NULL, cex = 0.75, type = "phylogram"){
#n = length(tree$tip.label)
sapply(1:length(cl_list), function(i){
x = cl_list[[i]]
#clustervar = c(rep(1,length(x)), rep(2, n-length(x)))
#not_in_cluster = tree$tip.label[which(!(tree$tip.label %in% x))]
#names(clustervar) = c(x,not_in_cluster)
#plotphylo(tree, clustervar, col = c("red", "grey"), type = type, main = titles[i], cex = cex)
tips = match(x, tree$tip.label)
plot(tree, show.tip.label = F, type = type, main = titles[i], edge.width = 0.5)
tiplabels(tip = tips, cex = cex, pch = 16, col = "red")
})
}
####NJ CLUSTERS#####
all_NJclusters <- function(vcf, min = 3){
# calculates how many unique variants are in a cluster
# inputs:
# vcf: matrix with columns of genomes and rows of variant positions
# min: minimum acceptable number of shared variants for a cluster to be returned
#clean vcf if not already
if("ALT" %in% colnames(vcf)){
#get rid of variants whose reference or alternative allele is N
vcf = vcf[!(grepl("N", vcf$ALT)),]
vcf = vcf[!(grepl("N", vcf$REF)),]
reps = sapply(vcf$POS, function(x){
return(length(which(vcf$POS == x)))
})
vcf = vcf[which(reps==1),]
rownames(vcf) = vcf$POS #make variant positions row names
vcf = vcf[,10:ncol(vcf)] #get rid of explanatory columns, just keep genome names
}
basemat = base_matrix(vcf)
isolates = colnames(vcf)
clusters = vector(mode="list", length = length(isolates))
for(i in 1:length(isolates)){
write(i, file="/nfs/esnitkin/Project_Cdiff/Analysis/targeted_sequencing_paper/2018-05-16_NJ_cluster_picker/2018-07-18_RM_variants/isolates", append = T)
cls = NJcluster(vcf, min, basemat, start = isolates[i], clusters)
if(length(cls)>0){
n = first_null(clusters)
clusters[n:(n+length(cls)-1)] <- cls
}
}
#find and return unique clusters
l = clusters[-which(sapply(clusters, is.null))]
l = sapply(l, function(x){
x = sort(x)
})
l = unique(l)
return(l)
}
first_null = function(list){
for(i in 1:length(list)){
if(is.null(list[[i]])){
return(i)
}
}
return(length(list)+1)
}
NJcluster <- function(vcf, min = 3, basemat, start = NULL, set_clusters){
# finds clusters that have unique defining variants that all include a specific genome
# inputs:
# vcf: matrix with columns of genomes and rows of variant positions
# min: minimum acceptable number of shared variants for a cluster to be returned
# basemat: matrix with the same number of rows as vcf and 4 columns which are the number of A's, T's, G's & C's in that position
# start: genome to start clustering at- if NULL, clustering starts with 2 most related genomes in vcf
# set_clusters: clusters already defined
clusters = vector(mode="list", length = (ncol(vcf) - 1)) #initialize list of possible clusters
num_unique_vars = numeric(ncol(vcf) - 1) #initialize vector of number of unique variants for each cluster
set_clusters = sapply(set_clusters, function(x){
x = sort(x)
})
if(is.null(start)){ #if start is not specified
cluster = two_closest(vcf) #pick 2 isolates that are closest
}
else{ #if start is specified
add = find_closest(vcf, start) #find closest other genome to specified starting genome
cluster = c(start,add) #make those 2 genomes the working cluster
}
if(in_list(sort(cluster),set_clusters)){ #if cluster is already picked, break out of loop
return(list())
}
clusters[[1]] <- cluster #store cluster
num_unique_vars[1] = num_unique_variants(vcf, cluster, basemat) #calculate number of unique variants, store
#print(num_unique_vars)
for(i in 2:(length(num_unique_vars))){ #keep adding one genome at a time until all are in the cluster
reduced_vcf = reduce_vcf(vcf, cluster) #reduce vcf to just variants that are the same in cluster
add = find_closest(reduced_vcf, cluster[1]) #find closest isolate
cluster = c(cluster, add) #add next closest isolate to cluster
if(in_list(sort(cluster),set_clusters)){ #if cluster is already picked, break out of loop
break
}
clusters[[i]] <- cluster #save cluster
num_unique_vars[i] = num_unique_variants(vcf, cluster, basemat) #calculate number of unique variants, store, etc.
#print(num_unique_vars)
}
#pick clusters that are local maximia for number of unique variants and is above the min num of unique vars
above_min = which(num_unique_vars >= min) #clusters with the minimum number of unique snps or more
#add 2nd value to beginning and 2nd to last value to end of num_uniuqe_vars so first and last clusters can be local maxima
num_unique_vars=c(num_unique_vars[2],num_unique_vars,num_unique_vars[length(num_unique_vars)-1])
#find local maxima
difs = diff(sign(diff(num_unique_vars)))
difs = which(difs == -2 | difs == -1)
r = intersect(difs, above_min) #find clusters which are local maxima and fall above the min unique snps cutoff
return(clusters[r]) #return clusters that fit criteria
}
in_list = function(vector, list){
if(length(list) == 0){return(F)}
for(i in 1:length(list)){
x = list[[i]]
if(is.null(x)){
return(F)
}
else if(length(x) != length(vector)){
next
}
else if(sum(x==vector) == length(vector)){
return(T)
}
}
}
num_unique_variants = function(vcf, cluster, basemat){
# calculates how many unique variants are in a cluster
# inputs:
# vcf: matrix with columns of genomes and rows of variant positions
# cluster: character vector with genome names or numerical vector with index of genome in vcf
return(length(unique_variants(vcf, cluster, basemat)))
}
unique_variants = function(vcf, cluster, basemat){
# identifies unique variants in a cluster
# inputs:
# vcf: matrix with columns of genomes and rows of variant positions
# cluster: character vector with genome names or numerical vector with index of genome in vcf
#clean vcf
if("ALT" %in% colnames(vcf)){
vcf = vcf[!(grepl("N", vcf$ALT)),]
vcf = vcf[!(grepl("N", vcf$REF)),]
rownames(vcf) = vcf$POS
vcf = vcf[,10:ncol(vcf)]
}
right_n = apply(basemat, 1, function(x){
length(cluster) %in% x
})
vcf = vcf[which(right_n == T),]
if(is.vector(vcf)){
vcf = t(as.matrix(vcf))
}
rownames(vcf) = which(right_n == T)
# if cluster is a character vector, convert to numerical
if(is.character(cluster)){
cluster = which(colnames(vcf) %in% cluster)
}
#calculate number of unique variants
unique_vars = apply(vcf,1, function(x){ #for each variant in vcf
clusterx = x[cluster] #bases in cluster
non_cluster = x[setdiff(1:(length(x)),cluster)] #bases not in cluster
if(length(unique(clusterx)) > 1){ #if more than one base in cluster
return(FALSE)
}
if(length(non_cluster)>0){ #if cluster is not the whole set in vcf
return(length(intersect(clusterx,non_cluster)) == 0) #TRUE if bases in cluster don't intersect with bases in non-cluster
}
else{ #if all genomes are in cluster
return(TRUE)
}
})
return(names(unique_vars)[which(unique_vars==TRUE)])
}
base_matrix = function(vcf){
#calculate number of A's T's G's and C's for each variant
bases = c("A", "T", "G", "C")
basemat = matrix(nrow = nrow(vcf), ncol = 4)
colnames(basemat) = bases
basemat = sapply(bases, function(b){
temp = ifelse(vcf == b, 1, 0)
return(rowSums(temp))
})
return(basemat)
}
find_closest = function(vcf, genome){
# finds closest genome to a given genome in vcf file
# inputs:
# vcf: matrix with columns of genomes and rows of variant positions
# cluster: character genome name or numerical with index of genome in vcf
# if genome is a character vector, convert to numerical
if(is.character(genome)){
genome = which(colnames(vcf)==genome)
}
if(ncol(vcf) == 2){ #if only one other genome in vcf, return other genome name
return(colnames(vcf)[-genome])
}
distance = apply(vcf[,-genome], 2, function(x){ #get distance of each genome to given genome
return(sum(x != vcf[,genome])) #return number of bases that are different from the given genome
})
return(names(which(distance == min(distance)))[1]) #return one genome that has the minimum distance to the given genome
}
two_closest = function(vcf){
# finds the two most related genomes in a vcf file
# inputs:
# vcf: matrix with columns of genomes and rows of variant positions
out_mat = matrix(ncol = ncol(vcf), nrow = ncol(vcf)) #initialize distance matrix- square with nrow, ncol = number of genomes
c_list = 2:(nrow(out_mat)) #initialize columns to go through for each row
for(r in 1:nrow(out_mat)){ #for each row in the distance matrix
for(c in c_list){ #go through non-redundant columns
out_mat[r,c] = sum(vcf[,r]==vcf[,c]) #set cell in distance matrix to be equal to the number of shared variants between the row and column genome
}
c_list = c_list[-1] #reduce which columns you go through for each row to avoid redundancy
}
max = which(out_mat == max(out_mat, na.rm = TRUE), arr.ind = TRUE)[1,] #find index of one cell with most shared variants
return(colnames(vcf)[max]) #return genome names
}
reduce_vcf = function(vcf, cluster){
# produce vcf matrix with only the variants that are shared by all of the genomes in a cluster
# and with only the first genome in the cluster and all genomes not in cluster
# inputs:
# vcf: matrix with columns of genomes and rows of variant positions
# cluster: character genome name or numerical with index of genome in vcf
cluster_vcf = vcf[,cluster] #subset of vcf with just genomes in cluster
cluster_vcf_rows = apply(cluster_vcf, 1, function(x){ #for each variant
return(length(unique(x)) == 1) #return if the same for all genomes in cluster
})
cluster_vcf_rows = which(cluster_vcf_rows == TRUE)
return_vcf = vcf[cluster_vcf_rows,] #vcf with all genomes and only variants that are all equal in cluster
col_remove = cluster[-1]
col_remove = which(colnames(vcf) %in% col_remove)
return_vcf = return_vcf[,-col_remove] #keep only one genome from cluster in the vcf
return(return_vcf)
}
#####RECOMBINATION PERMUTATION TEST#######
recombination_events_per_gene = function(recombinations, genes, fullmsapos=NULL){
#function to determine how many recombination events overlap with each gene
#recombinations is data frame with 2 columns, one for start position and one for end position
#genes is data frame with 2 columns, one for start position and one for end position
#fullmsapos if recombination events in terms of core genome
#convert recombination start and end positions to be in concordance with reference genome
if(!is.null(fullmsapos)){
recombinations[,1] = fullmsapos[recombinations[,1],]
recombinations[,2] = fullmsapos[recombinations[,2],]
}
#define column names
colnames(recombinations) = c("start", "end")
colnames(genes) = c("start", "end")
#calculate number of recombination events whose start and end positions are not both less than
#the gene start position or greater than the gene end position
num = sapply(1:nrow(genes), function(c){
a = which(!(recombinations$start < genes$start[c] & recombinations$end < genes$start[c]))
b = which(!(recombinations$start > genes$end[c] & recombinations$end > genes$end[c]))
return(length(intersect(a,b)))
})
#return numerical vector of recombination events corresponding to each gene
return(num)
}
place_rec_events = function(recombinations, core_genome_length=1754413){
#function to randomly place recombination events throughout the genome based on length and number of inputs
#recombinations is data frame with 2 columns, one for start position and one for end position
#genome length is the length of the reference genome
#fullmsapos if recombination events in terms of core genome
colnames(recombinations) = c("start", "end")
#determine length of each recombination event
recombinations$length = recombinations$end-recombinations$start
#choose random start positions for each recombination event
recombinations$new_start = sample(1:core_genome_length, nrow(recombinations), replace = T)
#make new random recombination event same length as original
recombinations$new_end = recombinations$new_start + recombinations$length
#re-calculate length
recombinations$new_length = recombinations$new_end - recombinations$new_start
#re-do new recombination events whose end is beyond the end of the reference genome
no_end = which(recombinations$new_end > core_genome_length)
for(i in no_end){
rec_length = recombinations$length[i]
start = sample(1:(core_genome_length-rec_length), 1)
recombinations$new_start[i] = start
recombinations$new_end[i] = start+rec_length
recombinations$new_length = recombinations$new_end - recombinations$new_start
}
#return new recombination start and end positions
return(recombinations[c("new_start", "new_end")])
}
rec_events_per_gene_permute = function(n, recombinations, genes, core_genome_length=1754413, fullmsapos=NULL){
#function randomly places recombination events and calculates how many recombination events
#overlap with each gene n number of times
#output is a matrix where each column corresponds to the inputted genes, and the rows are
#the number of recombination events that overlap with that gene in each permutation
#n = number of permutations
#recombinations is data frame with 2 columns, one for start position and one for end position
#genes is data frame with 2 columns, one for start position and one for end position
#genome length is the length of the reference genome
#fullmsapos if recombination events in terms of core genome
#initialize output matrix
output = matrix(nrow = n, ncol = nrow(genes))
#permute n times
for(i in 1:n){
#print(i) #counter to keep track of which permutation is running
write(i, file = "/nfs/esnitkin/Project_Cdiff/Analysis/targeted_sequencing_paper/2018-06-15_RM_variants_with_Ns_and_dashes/clades_1_and_2/counter", append = F)
#randomly place recombination events
random_rec_events = place_rec_events(recombinations, core_genome_length)
#calculate number of overlapping recombination events for each gene
random_rec_events_per_gene = recombination_events_per_gene(random_rec_events, genes, fullmsapos)
#write to output matrix
output[i,] = random_rec_events_per_gene
}
return(output)
}
rec_events_per_gene_pval = function(n, recombinations, genes, core_genome_length=1754413, fullmsapos=NULL){
#function calculates empirical p-values for permutation test described in rec_events_per_gene_permute
#n = number of permutations
#recombinations is data frame with 2 columns, one for start position and one for end position
#genes is data frame with 2 columns, one for start position and one for end position
#genome length is the length of the reference genome
#fullmsapos if recombination events in terms of core genome
#calculate observed number of recombination events per gene
observed_rec_events = recombination_events_per_gene(recombinations, genes, fullmsapos)
#permute recombination events in random places
permuted_rec_events = rec_events_per_gene_permute(n, recombinations, genes, core_genome_length, fullmsapos)
#calculate p-value for each gene that gives probability of observing the number of observed recombination
#events if recombination events occured randomly throughout the genome
pval = sapply(1:length(observed_rec_events), function(i){
return((1+sum(permuted_rec_events[,i]>=observed_rec_events[i]))/(n+1))
})
out = rbind(permuted_rec_events, pval)
return(out)
}
cl_list_to_clustervar = function(cl_list){
clustervar = sapply(1:length(cl_list), function(x){
return(rep(x,length(cl_list[[x]])))
})
clustervar = unlist(clustervar)
names(clustervar) = unlist(cl_list)
return(clustervar)
}
#######RECOMBINATION EVENTS PER BASE#################
gffRead <- function(gffFile, nrows = -1) {
cat("Reading ", gffFile, ": ", sep="")
gff = read.table(gffFile, sep="\t", as.is=TRUE, quote="",
header=FALSE, comment.char="#", nrows = nrows,
colClasses=c("character", "character", "character", "integer",
"integer",
"character", "character", "character", "character"))
colnames(gff) = c("seqname", "source", "feature", "start", "end",
"score", "strand", "frame", "attributes")
cat("found", nrow(gff), "rows with classes:",
paste(sapply(gff, class), collapse=", "), "\n")
stopifnot(!any(is.na(gff$start)), !any(is.na(gff$end)))
return(gff)
}
read.gff <- function(gubbins_gff_file){
gff = gffRead(gubbins_gff_file)
gff$node <- getAttributeField(gff$attributes, "node")
gff$neg_log_likelihood <- getAttributeField(gff$attributes, "neg_log_likelihood")
gff$taxa <- getAttributeField(gff$attributes, "taxa")
gff$snp_count <- getAttributeField(gff$attributes, "snp_count")
gff$snp_count <- gsub('"', "", gff$snp_count )
gff$snp_count <- as.numeric(gff$snp_count)
gff$length <- gff$end - gff$start
return(gff)
}
rec_events_per_base <- function(gff_file, fullmsapos_file, genome_length=4290252){
#Calculates how many recombination events overlap with each base in the reference genome
#gff_file is the *.recombination_predictions.gff file from gubbins output
#Read in gff and fullmsapos files
gff = read.gff(gff_file)
fullmsapos = read.table(fullmsapos_file)
gff$ref_start = fullmsapos[gff$start,]
gff$ref_end = fullmsapos[gff$end,]
#Variable with all of the positions in each recombination event
pos_rec_events = apply(gff, 1, function(x){
return(x["ref_start"]: x["ref_end"])
})
#Combines positions of rec events into one long vector with each position in each event included
#If a position is in 2 rec events, it will be in the list twice
pos_rec_events = as.numeric(unlist(pos_rec_events))
#Calculates how many times each position is in the vector
pos_rec_events_table = table(pos_rec_events)
#Add postitions without any recombination events to table
w = which(!(0:genome_length %in% names(pos_rec_events_table)))
w_table = table(w)
w_table[0:length(w)] = 0
pos_rec_events_table_with_zeros = c(w_table, pos_rec_events_table)
pos_rec_events_table_with_zeros = pos_rec_events_table_with_zeros[order(as.numeric(names(pos_rec_events_table_with_zeros)))]
return(pos_rec_events_table_with_zeros)
}