-
Notifications
You must be signed in to change notification settings - Fork 0
Use case 1: One biological sample
Prepare the environment...
library("QCumber")
library("tidyverse")
# To install QCumber:
# devtools::install_github("gimelbrantlab/QCumber")
...including data files with allelic counts
for FILE in UNTREATED_processed_gene.v3.1.txt TREATED_processed_gene.v3.1.txt
do
wget -nc https://github.com/gimelbrantlab/QCumber/wiki/data/$FILE
done
For the walk-through, we are using a small example dataset, comparing AI in mouse lymphocytes before and after treatment with DNMT inhibitor [from Gupta et al]. Lymphocytes are from a female mouse with high SNP density, an F1 cross between B6 and Cast strains. We analyze just a few chromosomes. Note that full-scale data would take longer to process, depending on the size.
We have 2 samples, control cells and cells treated with 10µM 5-aza-dC.
# | experiment | number of repicates |
---|---|---|
1 | Control | 2 |
2 | Treated | 2 |
Design matrix describes the experiment, enabling convenient access to input parameters for QCumber functions.
experimentNames <- c("Control", "Treated")
techReps <- c(2, 2)
designMatrix <- BuildDesign(experimentNames = experimentNames, techReps = techReps)
Load tables of allelic counts into a dataframe.
# (!) If you've used the cell above to download the example data files, they are in the working directory. Change as needed for your data.
D <- getwd()
CTRL <- "UNTREATED_processed_gene.v3.1.txt"
TRTM <- "TREATED_processed_gene.v3.1.txt"
inTables <- file.path(D, c(CTRL,TRTM))
geneCountTab <- GetGatkPipelineTabs(inFiles = inTables, nReps = designMatrix$techReps, contigs = 1:19)
# (!) Note that we have filtered for mouse autosomes (contigs = 1:19)
Calculate mean alellic counts and allelic imbalances for all genes, for all technical replicates in Control data. To exclude genes with very low coverage, we use threshold of 10 counts.
condition <- which(designMatrix$experimentNames == "Control")
aiTable <- Reduce(function(x,y){merge(x, y, by="ID")},
lapply(unlist(designMatrix$replicateNums[condition]), function(i){
CountsToAI(df = geneCountTab, reps = i, thr=10)
})
)
colnames(aiTable) <- c("ID", paste("Gene AI - replicate", unlist(designMatrix$replicateNums[condition])))
covTable <- MeanCoverage(df = geneCountTab, reps = unlist(designMatrix$replicateNums[condition]), thr=10)
colnames(covTable) <- c("ID", "Mean Allelic Coverage")
aicovTable <- merge(aiTable, covTable, by="ID")
And visualize AI correlations:
gg_AI <- ggplot(na.omit(aicovTable),
aes_string(x = as.name(colnames(aicovTable)[2]),
y = as.name(colnames(aicovTable)[3]),
col = as.name(colnames(aicovTable)[ncol(aicovTable)]))) +
geom_point(size=0.5) +
scale_color_gradient(low="lightgray", high="black", trans = "log2",
breaks = c(10, 100, 1000),
labels = c(10, 100, 1000)) +
theme_bw() +
coord_fixed()
gg_AI
We then calculate AI confidence intervals for each gene in our samples. We construct 95%-CIs around AIs and get the resulting classification finding the genes demonstrating statistical difference with 50:50.
Compare replicates to calculate QCC.
Two important parameters:
EPS
(default=1.05) defines logarithmic bin size for gene allelic coverage.
binNObs
(default=40) - minimum number of genes per coverage bin (bins with fewer genes are ignored).
Changing these parameters should only be done with clear understanding of potential risks.
If you want to skip this step, you can use
QCCs = list(1.4, 1.2)
for the example dataset. Note that these values may vary slightly between runs, especially in our toy example with its very low counts.
QCCs <- lapply(1:length(designMatrix$techReps), function(i){
resCC <- ComputeCorrConstantsForAllPairsReps(inDF = geneCountTab,
vectReps = unlist(designMatrix$replicateNums[i]),
EPS = 1.05, thr = 10)
sapply(resCC, function(x){x$fittedCC})
})
Now we complete filling the design matrix with adding column of QCCs (mean of QCCs for each experiment, in case of >2 replicates):
designMatrix <- BuildDesign(experimentNames = experimentNames, techReps = techReps,
corrConst = sapply(QCCs, function(cc){mean(cc)}))
We test null hypothesis of AI=0.5 for all genes. If it is rejected by QCC-corrected test, we classify the gene as showing imbalance.
AI confidence intervals are calculated for p=0.05, with Bonferroni correction for multiple hypothesis testing
AICI_tables <- lapply(1:length(designMatrix$techReps), function(i){
PerformBinTestAIAnalysisForConditionNPoint_knownCC(inDF = geneCountTab, vectReps = designMatrix$replicateNums[i],
vectRepsCombsCC = designMatrix$corrConst[i], thr = 10)
})
head(AICI_tables[[1]], n = 5) # first rows
Columns:
sumCOV, matCOV : total allelic counts and "maternal" counts (for first allele)
AI : point AI estimate (AI=matCOV/sumCOV)
BT_CIleft, BT_CIright : 95% CI boundaries for binomial test (Bonferroni-corrected)
BT_CIleft_CC, BT_CIright_CC : 95% CI boundaries for QCC-corrected proportional test
BT_pval and BT : test decision for binomial test
BT_pval_CC and BT_CC : test decision for QCC-corrected test
NA mask is applied if the gene didn't pass the threshold and thus measurements are not trusted.
Compare the numbers of genes with significant AI (i.e., rejected null hypothesis of AI=0.5) according to two tests.
before <- nrow(AICI_tables[[1]][!is.na(AICI_tables[[1]]$BT) & AICI_tables[[1]]$BT == T, ])
after <- nrow(AICI_tables[[1]][!is.na(AICI_tables[[1]]$BT_CC) & AICI_tables[[1]]$BT_CC == T, ])
gg_test <- lapply(1:length(designMatrix$experimentNames), function(i){
cowplot::plot_grid(
ggplot(na.omit(AICI_tables[[i]]), aes(sumCOV, AI, col=BT, alpha=BT)) +
geom_point(size=1) +
scale_color_manual(values=c("black","red")) +
scale_alpha_manual(values=c(0.1, 1)) +
theme_bw() +
scale_x_log10() +
xlab("Total allelic count") +
ggtitle("Binomial test (assumes QCC=1)",
subtitle = paste(before, "biased genes")) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5, colour = 'red')),
ggplot(na.omit(AICI_tables[[i]]), aes(sumCOV, AI, col=BT_CC, alpha=BT)) +
geom_point(size=1) +
scale_color_manual(values=c("black","red")) +
scale_alpha_manual(values=c(0.1, 1)) +
theme_bw() +
scale_x_log10() +
xlab("Total allelic count") +
ggtitle(paste0("Corrected test (QCC=",round(as.numeric(QCCs[1]),2),")"),
subtitle = paste(after, "biased genes")) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5, colour = 'red'))
)
})
gg_test[[1]]
You can experiment here to see how QCC value affects the results. QCC-corrected CI is equivalent to CI estimated from (Bonferroni-corrected) beta-binomial test the number of sequencing reads divided by QCC squared. QCC=1 means no overdispersion. We have observed QCC=~3 in actual RNA-seq experiments.
QCCs_sim <- list(2.5, 2.2) # set your tested QCCs values here
designMatrix_sim <- BuildDesign(experimentNames = experimentNames, techReps = techReps,
corrConst = sapply(QCCs_sim, function(cc){mean(cc)}))
AICI_tables_sim <- lapply(1:length(designMatrix_sim$techReps), function(i){
PerformBinTestAIAnalysisForConditionNPoint_knownCC(inDF = geneCountTab, vectReps = designMatrix_sim$replicateNums[i],
vectRepsCombsCC = designMatrix_sim$corrConst[i], thr = 10)
})
before_sim <- nrow(AICI_tables_sim[[1]][!is.na(AICI_tables_sim[[1]]$BT) & AICI_tables_sim[[1]]$BT == T, ])
after_sim <- nrow(AICI_tables_sim[[1]][!is.na(AICI_tables_sim[[1]]$BT_CC) & AICI_tables_sim[[1]]$BT_CC == T, ])
gg_test_sim <- lapply(1:length(designMatrix_sim$experimentNames), function(i){
cowplot::plot_grid(
ggplot(na.omit(AICI_tables_sim[[i]]), aes(sumCOV, AI, col=BT, alpha=BT)) +
geom_point(size=1) +
scale_color_manual(values=c("black","red")) +
scale_alpha_manual(values=c(0.1, 1)) +
theme_bw() +
scale_x_log10() +
xlab("Total allelic count") +
ggtitle("Binomial test (assumes QCC=1)",
subtitle = paste(before_sim, "biased genes")) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5, colour = 'red')),
ggplot(na.omit(AICI_tables_sim[[i]]), aes(sumCOV, AI, col=BT_CC, alpha=BT)) +
geom_point(size=1) +
scale_color_manual(values=c("black","red")) +
scale_alpha_manual(values=c(0.1, 1)) +
theme_bw() +
scale_x_log10() +
xlab("Total allelic count") +
ggtitle(paste0("Corrected test (QCC=", round(as.numeric(QCCs_sim[1]),2),")"),
subtitle = paste(after_sim, "biased genes")) +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5, colour = 'red'))
)
})
gg_test_sim[[1]]