Use case 1: One biological sample

The markdown for this use case is available via this link.

Prepare the environment...

library("QCumber")
library("tidyverse")

# To install QCumber:
# devtools::install_github("gimelbrantlab/QCumber")

...including data files with allelic counts

wget -nc https://github.com/gimelbrantlab/QCumber/wiki/data/UNTREATED_processed_gene.v3.1.txt
# in a case of any error, please download directly, following the link

Data

For the walk-through, we are using a small example dataset, comparing AI in mouse lymphocytes before and after treatment with DNMT inhibitor [from Gupta et al]. Lymphocytes are from a female mouse with high SNP density, an F1 cross between B6 and Cast strains.

Note that full-scale data may take substantial time to process, growing with gene coverage depth. To reduce processing time for the walk-through, we created a toy dataset, so that the whole analysis takes a few minutes: we randomly sampled 5M reads from each technical replicate, and then selected a subset of chromosomes (1-5,10,15,X). Note that this decreases the number of genes remaining in the analysis (after chromosome filtering and coverage thresholds) and sensitivity of the results (due to low gene coverage). For the full analysis, see [Gupta et al].

Experiment design

We analyse here control cells sample.

#	experiment	number of repicates
1	Control	2

Design matrix describes the experiment, enabling convenient access to input parameters for QCumber functions.

experimentNames <- c("Control (sampled subset)")
techReps <- c(2)
designMatrix <- BuildDesign(experimentNames = experimentNames, techReps = techReps)

Load the data

Load table of allelic counts into a dataframe.

# (!) If you've used the cell above to download the example data files, they are in the working directory. Change as needed for your data.  
D <- getwd()
CTRL <- "UNTREATED_processed_gene.v3.1.txt"

inTables <- file.path(D, CTRL)
geneCountTab <- GetGatkPipelineTabs(inFiles = inTables, nReps = designMatrix$techReps, contigs = 1:19)
# (!) Note that we have filtered for mouse autosomes (contigs = 1:19)

One sample analysis

Calculate mean alellic counts and allelic imbalances for all genes, for all technical replicates in Control data. To exclude genes with very low coverage, we use threshold of 10 counts.

condition <- which(designMatrix$experimentNames == "Control (sampled subset)")

aiTable <- Reduce(function(x,y){merge(x, y, by="ID")},
  lapply(unlist(designMatrix$replicateNums[condition]), function(i){
    CountsToAI(df = geneCountTab, reps = i, thr=10)
  })
)
colnames(aiTable) <- c("ID", paste("Gene AI - replicate", unlist(designMatrix$replicateNums[condition])))

covTable <- MeanCoverage(df = geneCountTab, reps = unlist(designMatrix$replicateNums[condition]), thr=10)
colnames(covTable) <- c("ID", "Mean Allelic Coverage")

aicovTable <- merge(aiTable, covTable, by="ID")

And visualize AI correlations:

gg_AI <- ggplot(na.omit(aicovTable), 
                aes_string(x = as.name(colnames(aicovTable)[2]), 
                           y = as.name(colnames(aicovTable)[3]),
                           col = as.name(colnames(aicovTable)[ncol(aicovTable)]))) +
           geom_point(size=0.5) +
           scale_color_gradient(low="lightgray", high="black", trans = "log2",
                                breaks = c(10, 100, 1000),
                                labels = c(10, 100, 1000)) +
           theme_bw() +
           coord_fixed() 
gg_AI

We then calculate AI confidence intervals for each gene in our samples. We construct 95%-CIs around AIs and get the resulting classification finding the genes demonstrating statistical difference with 50:50.

QCC calculation

Compare replicates to calculate QCC.

Two important parameters: EPS (default=1.05) defines logarithmic bin size for gene allelic coverage. binNObs (default=40) - minimum number of genes per coverage bin (bins with fewer genes are ignored). Changing these parameters should only be done with clear understanding of potential risks.

If you want to skip this step, you can use QCCs = list(1.4, 1.2) for the example dataset. Note that these values may vary slightly between runs, especially in our toy example with its very low counts.

QCCs <- lapply(1:length(designMatrix$techReps), function(i){
   resCC <- ComputeCorrConstantsForAllPairsReps(inDF = geneCountTab,
                                                vectReps = unlist(designMatrix$replicateNums[i]),
                                                EPS = 1.05, thr = 10)
   sapply(resCC, function(x){x$fittedCC})
 })

Now we complete filling the design matrix with adding column of QCCs (mean of QCCs for each experiment, in case of >2 replicates):

designMatrix <- BuildDesign(experimentNames = experimentNames, techReps = techReps, 
                            corrConst = sapply(QCCs, function(cc){mean(cc)}))

CI(AI) calculation

We test null hypothesis of AI=0.5 for all genes. If it is rejected by QCC-corrected test, we classify the gene as showing imbalance.

AI confidence intervals are calculated for p=0.05, with Bonferroni correction for multiple hypothesis testing

AICI_tables <- lapply(1:length(designMatrix$techReps), function(i){
  PerformBinTestAIAnalysisForConditionNPoint_knownCC(inDF = geneCountTab, vectReps = designMatrix$replicateNums[i], 
                                                     vectRepsCombsCC = designMatrix$corrConst[i], thr = 10)
})

head(AICI_tables[[1]], n = 5) # first rows

Columns:

sumCOV, matCOV : total allelic counts and "maternal" counts (for first allele)

AI : point AI estimate (AI=matCOV/sumCOV)

BT_CIleft, BT_CIright : 95% CI boundaries for binomial test (Bonferroni-corrected)

BT_CIleft_CC, BT_CIright_CC : 95% CI boundaries for QCC-corrected proportional test

BT_pval and BT : test decision for binomial test

BT_pval_CC and BT_CC : test decision for QCC-corrected test

NA mask is applied if the gene didn't pass the threshold and thus measurements are not trusted.

Compare the numbers of genes with significant AI (i.e., rejected null hypothesis of AI=0.5) according to two tests.

before <- nrow(AICI_tables[[1]][!is.na(AICI_tables[[1]]$BT) & AICI_tables[[1]]$BT == T, ])
after  <- nrow(AICI_tables[[1]][!is.na(AICI_tables[[1]]$BT_CC) & AICI_tables[[1]]$BT_CC == T, ])

gg_test <- lapply(1:length(designMatrix$experimentNames), function(i){
  cowplot::plot_grid(
    ggplot(na.omit(AICI_tables[[i]]), aes(sumCOV, AI, col=BT, alpha=BT)) +
      geom_point(size=1) +
      scale_color_manual(values=c("black","red")) +
      scale_alpha_manual(values=c(0.1, 1)) +
      theme_bw() +
      scale_x_log10() +
      xlab("Total allelic count") + 
      ggtitle("Binomial test (assumes QCC=1)", 
              subtitle = paste(before, "biased genes")) +
      theme(legend.position = "none", 
            plot.title = element_text(hjust = 0.5), 
            plot.subtitle = element_text(hjust = 0.5, colour = 'red')),
    ggplot(na.omit(AICI_tables[[i]]), aes(sumCOV, AI, col=BT_CC, alpha=BT)) +
      geom_point(size=1) +
      scale_color_manual(values=c("black","red")) +
      scale_alpha_manual(values=c(0.1, 1)) +
      theme_bw() +
      scale_x_log10() +
      xlab("Total allelic count") + 
      ggtitle(paste0("Corrected test (QCC=",round(as.numeric(QCCs[1]),2),")"), 
              subtitle = paste(after, "biased genes")) +
      theme(legend.position = "none", 
            plot.title = element_text(hjust = 0.5), 
            plot.subtitle = element_text(hjust = 0.5, colour = 'red'))
  )
})

gg_test[[1]]

Experimental sandbox:

You can experiment here to see how QCC value affects the results. QCC-corrected CI is equivalent to CI estimated from (Bonferroni-corrected) beta-binomial test the number of sequencing reads divided by QCC squared. QCC=1 means no overdispersion. We have observed QCC=~3 in actual RNA-seq experiments.

QCCs_sim <- list(2.5, 2.2)  # set your tested QCCs values here

designMatrix_sim <- BuildDesign(experimentNames = experimentNames, techReps = techReps, 
                                corrConst = sapply(QCCs_sim, function(cc){mean(cc)}))

AICI_tables_sim <- lapply(1:length(designMatrix_sim$techReps), function(i){
  PerformBinTestAIAnalysisForConditionNPoint_knownCC(inDF = geneCountTab, vectReps = designMatrix_sim$replicateNums[i], 
                                                     vectRepsCombsCC = designMatrix_sim$corrConst[i], thr = 10)
})

before_sim <- nrow(AICI_tables_sim[[1]][!is.na(AICI_tables_sim[[1]]$BT) & AICI_tables_sim[[1]]$BT == T, ])
after_sim  <- nrow(AICI_tables_sim[[1]][!is.na(AICI_tables_sim[[1]]$BT_CC) & AICI_tables_sim[[1]]$BT_CC == T, ])

gg_test_sim <- lapply(1:length(designMatrix_sim$experimentNames), function(i){
  cowplot::plot_grid(
    ggplot(na.omit(AICI_tables_sim[[i]]), aes(sumCOV, AI, col=BT, alpha=BT)) +
      geom_point(size=1) +
      scale_color_manual(values=c("black","red")) +
      scale_alpha_manual(values=c(0.1, 1)) +
      theme_bw() +
      scale_x_log10() +
      xlab("Total allelic count") + 
      ggtitle("Binomial test (assumes QCC=1)", 
              subtitle = paste(before_sim, "biased genes")) +
      theme(legend.position = "none", 
            plot.title = element_text(hjust = 0.5), 
            plot.subtitle = element_text(hjust = 0.5, colour = 'red')),
    ggplot(na.omit(AICI_tables_sim[[i]]), aes(sumCOV, AI, col=BT_CC, alpha=BT)) +
      geom_point(size=1) +
      scale_color_manual(values=c("black","red")) +
      scale_alpha_manual(values=c(0.1, 1)) +
      theme_bw() +
      scale_x_log10() +
      xlab("Total allelic count") + 
      ggtitle(paste0("Corrected test (QCC=", round(as.numeric(QCCs_sim[1]),2),")"), 
              subtitle = paste(after_sim, "biased genes")) +
      theme(legend.position = "none", 
            plot.title = element_text(hjust = 0.5), 
            plot.subtitle = element_text(hjust = 0.5, colour = 'red'))
  )
})

gg_test_sim[[1]]

Provide feedback

Saved searches

Use saved searches to filter your results more quickly