-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter.R
59 lines (51 loc) · 1.59 KB
/
filter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# df <- read_csv2("data/mags.csv")
# Collect every column of `df` whose name ends in "RPKM" into one matrix so
# that the row-wise summaries below can operate on it.
# NOTE(review): `df` is not created in the visible part of this script (the
# read_csv2() call above is commented out); presumably it already exists in
# the session when this runs -- confirm.
mat <- as.matrix(select(df, ends_with("RPKM")))

# Row-wise summary statistics of `mat`, stored as new columns of `df`
# (one value per row).
# NOTE(review): rowMedians/rowMins/rowMaxs/rowSds are not base R; presumably
# they come from matrixStats -- confirm the package is attached upstream.
df[["AbundMean"]] <- rowMeans(mat)
df[["AbundMedian"]] <- rowMedians(mat)
df[["AbundMin"]] <- rowMins(mat)
df[["AbundMax"]] <- rowMaxs(mat)
# Range is max minus min; the two columns just stored hold exactly the
# values rowMaxs(mat) and rowMins(mat) return.
df[["AbundRange"]] <- df[["AbundMax"]] - df[["AbundMin"]]
df[["AbundSd"]] <- rowSds(mat)
# For every row of `rpkm_mat`, record the column position of its largest
# value. which.max() skips NA/NaN and returns the first position when the
# maximum occurs more than once, so each row is scanned exactly once and no
# floating-point `==` comparison against a separately computed maximum is
# needed.
# A row with no usable values (all NA) yields NA instead of raising a
# subscript error, and seq_len() makes a zero-row matrix produce an empty
# result rather than iterating over c(1, 0).
# NOTE(review): `rpkm_mat` is not defined in the visible part of this script;
# presumably it is the same RPKM matrix as `mat` built above -- confirm.
abund_max_idx <- vapply(
  seq_len(nrow(rpkm_mat)),
  function(row_i) {
    max_pos <- which.max(rpkm_mat[row_i, ])
    if (length(max_pos) == 0L) {
      NA_integer_
    } else {
      # `[[` drops the column name which.max() attaches to its result.
      max_pos[[1L]]
    }
  },
  integer(1)
)
df$AbundMaxIdx <- abund_max_idx
# Row-wise summary statistics of `rpkm_shift_mat`, stored as new columns of
# `df` (one value per row).
# NOTE(review): `rpkm_shift_mat` is not defined in the visible part of this
# script; presumably an earlier step creates it with one row per row of
# `df` -- confirm.
df[["AbundRatioMean"]] <- rowMeans(rpkm_shift_mat)
df[["AbundRatioMedian"]] <- rowMedians(rpkm_shift_mat)
df[["AbundRatioMin"]] <- rowMins(rpkm_shift_mat)
df[["AbundRatioMax"]] <- rowMaxs(rpkm_shift_mat)
# Range is max minus min; the two columns just stored hold exactly the
# values rowMaxs() and rowMins() return for this matrix.
df[["AbundRatioRange"]] <- df[["AbundRatioMax"]] - df[["AbundRatioMin"]]
df[["AbundRatioSd"]] <- rowSds(rpkm_shift_mat)
#p1 <- ggplot(df, aes(x = MaxCov, y = Scaffolds)) +
# geom_point() +
# geom_smooth() +
# scale_x_log10() +
# scale_y_log10() +
# labs(x = "Maximum Coverage",
# y = "Number of Contigs") +
# theme_bw() +
# theme(panel.grid = element_blank())
#
#p2 <- ggplot(df, aes(x = MaxCov, y = N50)) +
# geom_point() +
# geom_smooth() +
# scale_x_log10() +
# scale_y_log10() +
# labs(x = "Maximum Coverage") +
# theme_bw() +
# theme(panel.grid = element_blank())
#
#p3 <- p1 + p2
#
#ggsave("quality_check.pdf",
# plot = p3,
# device = "pdf",
# width = 14,
# height = 7,
# path = "results")
# Rows are kept only when their MaxCov is strictly greater than this value.
max_coverage_threshold <- 10

# Drop the rows at or below the coverage threshold, then sort what remains
# by AbundMax, largest first.
filtered_df <- arrange(
  filter(df, MaxCov > max_coverage_threshold),
  desc(AbundMax)
)
# Label each row "MAG <k>", where k is the row's position when the genome
# names are sorted by a rearranged key. The key rewrites every match of
# "UU<word char><digits>_<digits>" as "<digits>_<word char>_<digits>"; text
# that does not match the pattern is left unchanged by gsub().
# NOTE(review): keys are compared as text, so e.g. "10_..." sorts before
# "2_..." -- confirm that ordering is intended.
genome_sort_key <- gsub(
  "UU(\\w)(\\d*)_(\\d*)",
  "\\2_\\1_\\3",
  filtered_df$genome
)
# order() alone gives the permutation that sorts the keys (element i is the
# index of the row holding the i-th smallest key), not the rank of row i.
# Applying order() twice inverts that permutation, so every row receives its
# own sorted position; tied keys keep their current row order.
# sprintf() is used instead of paste() so that an empty `filtered_df`
# produces character(0) rather than the single string "MAG ".
filtered_df$ID <- sprintf("MAG %d", order(order(genome_sort_key)))