From 2bdcdf17f2217f19d7f76236f762758e3bbde2aa Mon Sep 17 00:00:00 2001 From: florence mehl Date: Wed, 13 Mar 2024 10:06:54 +0100 Subject: [PATCH 1/2] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index f9c5636..033ebfe 100644 --- a/README.md +++ b/README.md @@ -51,8 +51,7 @@ Thursday 14 March 15h - 17h room: A04.3018 #### Day 1 - 9h-10h general introduction -- 10h-12h30 PCA/PLS theory and exercise -- 13h30-17h PCA/PLS theory and exercise +- 10h-17h PCA/PLS theory and exercise #### Day 2 - 9h-17h multiblock analyses theory and exercise From 3545aca93a708b81aecf9bcb86aefb22319f208b Mon Sep 17 00:00:00 2001 From: Thuong Tran Date: Wed, 13 Mar 2024 12:02:32 +0100 Subject: [PATCH 2/2] add Rmd dim red --- Day1/practicals_dimensionality_reduction.Rmd | 373 ++++++++++++ Day1/practicals_dimensionality_reduction.html | 549 +++++++++++++++++- 2 files changed, 920 insertions(+), 2 deletions(-) create mode 100644 Day1/practicals_dimensionality_reduction.Rmd diff --git a/Day1/practicals_dimensionality_reduction.Rmd b/Day1/practicals_dimensionality_reduction.Rmd new file mode 100644 index 0000000..7ff50f6 --- /dev/null +++ b/Day1/practicals_dimensionality_reduction.Rmd @@ -0,0 +1,373 @@ +--- +title: "Dimensionality reduction" +author: "Van Du Tran" +date: "`r Sys.Date()`" +output: html_document +--- + +```{r setup, include=TRUE} +knitr::opts_knit$set( + root.dir = '.', + keep.tex = TRUE +); +knitr::opts_chunk$set( + fig.width = 9, + fig.height = 9, + fig.align = 'center', + fig.pos = 'h', + fig.show = 'hold', + echo = TRUE, + warning = FALSE, + message = FALSE, + cache = FALSE +); +``` + +# Exercises + +## PCA + +#### 1. Load the `nutrimouse` data from the `mixOmics` R package and investigate its structure. + +```{r libraries, include=TRUE, eval=TRUE} +library(mixOmics) +``` + +A data object provided by an R package can be loaded with `data`. Its structure can be obainted with `str`, `length`, `dim`, etc. + +```{r, include=TRUE, eval=TRUE} +data("nutrimouse") +## display the structure of the nutrimouse object +str(nutrimouse) +## check dimensions +lapply(nutrimouse, dim) # apply function dim to each element in list nutrimouse +lapply(nutrimouse, length) # apply function length to each element in list nutrimouse +``` + +#### 2. Take the gene expression dataset in `samples` x `variables` matrix format. Investigate variables' distribution. + +```{r, include=TRUE, eval=TRUE} +## get gene expression data structure +class(nutrimouse$gene) +dim(nutrimouse$gene) +rownames(nutrimouse$gene)[1:10] +colnames(nutrimouse$gene)[1:10] + +## check if there are missing values +any(is.na(nutrimouse$gene)) + +## investigate each variable +summary(nutrimouse$gene[, 1:4]) +colors <- rainbow(20, alpha=1) +plot(density(scale(nutrimouse$gene[, 1], center=T, scale=F)), + col=colors[1], xlim=c(-0.5,0.5), ylim=c(0,8), + main='Density plot of the first 20 genes') +invisible(sapply(2:20, function(i) { + lines(density(scale(nutrimouse$gene[, i], center=T, scale=F)), col=colors[i]) +})) +``` + +#### 3. Perform PCA and investigate variances, sample distribution and variable relationship with plots. + +A number of methods in different R packages can perform PCA, e.g. `stats::prcomp`, `stats::princomp`, `mixOmics::pca`, `multiblock::pca`, `psych::principal`, `FactoMineR::PCA`, etc. + +```{r pca, include=TRUE, eval=TRUE} +pca.res <- prcomp(nutrimouse$gene, center=TRUE, scale.=F) +names(pca.res) +summary(pca.res) +``` + +Variances = eigenvalues of the covariance matrix = (standard deviation)\^2. + +```{r pca_var, include=TRUE, eval=TRUE} +variances <- pca.res$sdev^2 +variances +``` + +Scree plot: plot of variances. + +```{r pca_screeplot, include=TRUE, eval=TRUE} +screeplot(pca.res, npcs=length(variances), type='lines') +screeplot(pca.res, npcs=length(variances), type='barplot') +barplot(variances, xlab='PC', ylab='Variance', names.arg=1:length(variances)) +``` + +Scree plot on variance percentage. + +```{r pca_var_per, include=TRUE, eval=TRUE} +varPercent <- variances/sum(variances) * 100 +barplot(varPercent, xlab='PC', ylab='Percent Variance', names.arg=1:length(varPercent)) +``` + +Scores: sample coordinates in the new reference (rotated axes or principal components). + +```{r pca_scores, include=TRUE, eval=TRUE} +scores <- pca.res$x +str(scores) +``` + +Score plot: plot of sample distribution. + +```{r pca_score_plot, include=TRUE, eval=TRUE} +PCx <- "PC1" +PCy <- "PC2" +plot(scores[, PCx], scores[, PCy], xlab=PCx, ylab=PCy, pch=16) +text(scores[, PCx], scores[, PCy]-0.05, rownames(scores), col='blue', cex=0.7) +``` + +Loadings: contributions of variables to principal components (eigenvectors of covariance matrix). + +```{r pca_loadings, include=TRUE, eval=TRUE} +loadings <- pca.res$rotation +str(loadings) +``` + +Loading plot: plot of variables' contribution, revealing their relationship. + +```{r pca_loading_plot, include=TRUE, eval=TRUE} +plot(loadings[, PCx], loadings[, PCy], type='n', main="Loadings") +arrows(0, 0, loadings[, PCx], loadings[, PCy], xlab=PCx, ylab=PCy, + length=0.1, angle=20, col=rgb(0,0,1,alpha=apply(loadings[, c(PCx, PCy)], 1, norm, "2"))) +text(loadings[, PCx], loadings[, PCy], rownames(loadings), col='grey', cex=0.7) +``` + +Both score and loading plots can be plotted altogether with the `biplot` function. + +```{r pca_biplot, include=TRUE, eval=TRUE} +## biplot +biplot(pca.res, expand=1, cex=c(0.5, 0.7), col=c("gray50", "red")) + +library(factoextra) +fviz_pca_biplot(pca.res, repel = TRUE, + col.var = "blue", # Variables color + habillage = nutrimouse$genotype, + addEllipses = T, + legend="none" + ) +``` + +#### 4. Visually investigate the sample distribution with coloring by metadata or expression of certain genes. + +The samples can be colored with some metadata, e.g *genotype* or *diet*, + +```{r pca_score_plot_anno1, include=TRUE, eval=TRUE} +plot(scores[, PCx], scores[, PCy], main="Scores", + col=c(1:nlevels(nutrimouse$diet))[nutrimouse$diet], + pch=c(17,19)[nutrimouse$genotype], + xlab=paste0(PCx, + " (", + round((summary(pca.res)$importance)[2, PCx], 2), ")"), + ylab=paste0(PCy, + " (", + round((summary(pca.res)$importance)[2, PCy], 2), ")") +) +legend("topright", title="genotype", + legend=levels(nutrimouse$genotype), + pch=c(17,19), cex=0.7) +legend("bottomright", title="diet", + legend=levels(nutrimouse$diet), + col=c(1:5), cex=0.7, pch=16) +``` + +or by some gene expression. + +```{r pca_score_plot_anno2, include=TRUE, eval=TRUE} +nbreaks <- 5 +plot(scores[, PCx], scores[, PCy], xlab=PCx, ylab=PCy, + pch=c(17,19)[nutrimouse$genotype], + col=colorRampPalette(c('red','blue'))(nbreaks)[as.numeric(cut(nutrimouse$gene$ALDH3,breaks = nbreaks))]) +``` + +## PLS + +#### 1. Perform PLS (`mixOmics::pls`) between gene and lipid. Investigate its output, sample distribution and variable relationship with plots. + +```{r pls, include=TRUE, eval=TRUE} +pls.res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="canonical") +max(abs(scale(nutrimouse$gene, center=T, scale=T) - pls.res$X)) +max(abs(scale(nutrimouse$lipid, center=T, scale=T) - pls.res$Y)) +``` + +The sample distribution plot can be performed with **variates**, sample coordinates in the new reference (rotated axes) for each of the two blocks, + +```{r pls_score_plot1, include=TRUE, eval=TRUE, fig.show="hold"} +str(pls.res$variates) +PCx <- "comp1" +PCy <- "comp2" +par(mfrow=c(1,2)) +plot(pls.res$variates$X[, PCx], pls.res$variates$X[, PCy], xlab=PCx, ylab=PCy, main="X", type='n') +text(pls.res$variates$X[, PCx], pls.res$variates$X[, PCy], rownames(pls.res$variates$X), col='blue', cex=0.6) +plot(pls.res$variates$Y[, PCx], pls.res$variates$Y[, PCy], xlab=PCx, ylab=PCy, main="Y", type='n') +text(pls.res$variates$Y[, PCx], pls.res$variates$Y[, PCy], rownames(pls.res$variates$Y), col='blue', cex=0.6) +``` + +which is also produced with `plotIndiv`. + +```{r pls_score_plot2, include=TRUE, eval=TRUE} +plotIndiv(pls.res) +``` + +Loading plot: plot of variables' contribution in each data block to each variate, after deflating more *important* variates, + +```{r pls_loading_plot1, include=TRUE, eval=TRUE, fig.show="hold"} +par(mfrow=c(1,2), las=2, mar=c(4,8,1,1)) +loadings.ind.X <- order(abs(pls.res$loadings$X[, "comp1"]), decreasing = T) +barplot(head(pls.res$loadings$X[loadings.ind.X, "comp1"], 10), main="X", horiz = T, cex.names=0.8) +loadings.ind.Y <- order(abs(pls.res$loadings$Y[, "comp1"]), decreasing = T) +barplot(head(pls.res$loadings$Y[loadings.ind.Y, "comp1"], 10), main="Y", horiz = T, cex.names=0.8) +``` + +which is the same as with `plotLoadings`. + +```{r pls_loading_plot2, include=TRUE, eval=TRUE} +plotLoadings(pls.res, ndisplay = 10) +``` + +The plot of variable relationship could be obtained from **loadings.star**. + +```{r pls_var_plot, include=TRUE, eval=TRUE} +names(pls.res$loadings.star) <- c("X", "Y") +colnames(pls.res$loadings.star$X) <- colnames(pls.res$loadings.star$Y) <- c(PCx, PCy) +plot(1,1,type='n', + xlim=range(c(pls.res$loadings.star$X[, PCx],pls.res$loadings.star$Y[, PCx])), + ylim=range(c(pls.res$loadings.star$X[, PCy],pls.res$loadings.star$Y[, PCy]))) +arrows(0, 0, pls.res$loadings.star$X[, PCx], pls.res$loadings.star$X[, PCy], + length=0.1, angle=20, col=rgb(0,0,1,alpha=apply(pls.res$loadings.star$X[, c(PCx, PCy)], 1, norm, "2"))) +text(pls.res$loadings.star$X[, PCx], + pls.res$loadings.star$X[, PCy], + rownames(pls.res$loadings.star$X), col='grey', cex=0.7) +arrows(0, 0, pls.res$loadings.star$Y[, PCx], pls.res$loadings.star$Y[, PCy], + length=0.1, angle=20, col=rgb(1,0,0,alpha=apply(pls.res$loadings.star$Y[, c(PCx, PCy)], 1, norm, "2"))) +text(pls.res$loadings.star$Y[, PCx], + pls.res$loadings.star$Y[, PCy], + rownames(pls.res$loadings.star$Y), col='grey', cex=0.7) +plotVar(pls.res) +``` + +Both sample distribution and variable relationship plot could be done with `biplot` function. + +```{r pls_biplot, include=TRUE, eval=TRUE, fig.show="hold"} +biplot(pls.res, block="X", ind.names.size=3, var.names.size=2) +biplot(pls.res, block="Y", ind.names.size=3, var.names.size=2) +``` + +#### 2. Observe the difference between the two modes *regression* and *canonical* of PLS. + +```{r plsregression, include=TRUE, eval=TRUE, fig.show="hold"} +pls.reg.res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="regression") +pls.can.res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="canonical") +par(mfrow=c(2,2)) +biplot(pls.reg.res, block="X", ind.names.size=3, var.names.size=2) +biplot(pls.can.res, block="X", ind.names.size=3, var.names.size=2) +biplot(pls.reg.res, block="Y", ind.names.size=3, var.names.size=2) +biplot(pls.can.res, block="Y", ind.names.size=3, var.names.size=2) +``` + + +#### 3. Perform PLS-DA (`mixOmics::plsda`) between gene and genotype. Redo PLS-DA using `mixOmics::pls` and compare the results. + +```{r plsda, include=TRUE, eval=TRUE, fig.show="hold"} +pls.da.res <- plsda(X=nutrimouse$gene, Y=nutrimouse$genotype, ncomp=2, scale=TRUE) +pls.regda.res <- pls(X=nutrimouse$gene, Y=c(0,1)[nutrimouse$genotype], ncomp=2, scale=TRUE, mode="regression") +par(mfrow=c(1,2)) +biplot(pls.da.res, block="X", ind.names.size=3, var.names.size=2, + group=nutrimouse$genotype, col.per.group = c("red", "blue"), legend.title = 'genotype') +biplot(pls.regda.res, block="X", ind.names.size=3, var.names.size=2, + group=nutrimouse$genotype, col.per.group = c("red", "blue"), legend.title = 'genotype') +``` + +#### 4. Perform OPLS-DA (`ropls::opls`) between gene and genotype. Investigate its output, sample distribution, variable relationship and predictive performance. + +(https://bioconductor.org/packages/release/bioc/vignettes/ropls/inst/doc/ropls-vignette.html) + +```{r opls, include=TRUE, eval=TRUE, fig.show="hold"} +library(ropls) +opls.res <- opls(x=nutrimouse$gene, y=nutrimouse$genotype, predI=1, orthoI=NA, fig.pdfC='none') +par(mfrow=c(2,2)) +plot(opls.res, typeVc='x-score') +plot(opls.res, typeVc='x-loading') +plot(opls.res, typeVc='overview') +plot(opls.res, typeVc='correlation') + +opls.res <- opls(x=nutrimouse$gene, y=nutrimouse$genotype, predI=1, orthoI=NA, permI=100, fig.pdfC='none') +plot(opls.res, typeVc='permutation') + +opls.res <- opls(x=nutrimouse$gene, y=nutrimouse$genotype, predI=1, orthoI=NA, subset=c(1:13, 21:33), fig.pdfC='none') +par(mfrow=c(1,2)) +plot(opls.res, typeVc='predict-train') +plot(opls.res, typeVc='predict-test') + +# confusion matrix on training set +train_id <- getSubsetVi(opls.res) +table(nutrimouse$genotype[train_id], fitted(opls.res)) +# confusion matrix on test set +table(nutrimouse$genotype[-train_id], predict(opls.res, nutrimouse$gene[-train_id,])) +``` + +## More on CCA + +#### 1. Perform CCA (`mixOmics::rcc`) between 20 genes and all lipids. Investigate correlations, sample distribution and variable relationship with plots. + +The gene expression data is reduced to 20 genes so that the number of variables is less than the number of samples, to perform an unregularized CCA. + +```{r selection20, include=TRUE, eval=TRUE} +nutrimouse$gene_selected <- nutrimouse$gene[, 1:20] +str(nutrimouse$gene_selected) +``` + + +```{r cca, include=TRUE, eval=TRUE} +cca.res <- rcc(X=nutrimouse$gene_selected, Y=nutrimouse$lipid, ncomp=2) +max(abs(nutrimouse$gene_selected - cca.res$X)) +max(abs(nutrimouse$lipid - cca.res$Y)) +str(cca.res) +cca.res$cor +``` + +The sample distribution plot can be performed with **variates**, sample coordinates in the new reference (rotated axes) for each of the two blocks. + +```{r cca_score_plot, include=TRUE, eval=TRUE} +str(cca.res$variates) +PCx <- 1 +PCy <- 2 +par(mfrow=c(1,2), las=1, mar=c(4,3,1,1)) +plot(cca.res$variates$X[, PCx], cca.res$variates$X[, PCy], xlab=PCx, ylab=PCy, main="X", type='n') +text(cca.res$variates$X[, PCx], cca.res$variates$X[, PCy], rownames(cca.res$variates$X), col='blue', cex=0.6) +plot(cca.res$variates$Y[, PCx], cca.res$variates$Y[, PCy], xlab=PCx, ylab=PCy, main="Y", type='n') +text(cca.res$variates$Y[, PCx], cca.res$variates$Y[, PCy], rownames(cca.res$variates$Y), col='blue', cex=0.6) +cor(cca.res$variates$X[,1], cca.res$variates$Y[,1]) +cor(cca.res$variates$X[,2], cca.res$variates$Y[,2]) +plotIndiv(cca.res) +plotArrow(cca.res) +``` + +Variable relationship is obtained from **loadings** or with `plotVar`. + +```{r cca_loading_plot, include=TRUE, eval=FALSE} +par(mfrow=c(1,2), las=2, mar=c(4,8,1,1)) +loadings.ind.X <- order(abs(cca.res$loadings$X[, 1]), decreasing = T) +barplot(head(cca.res$loadings$X[loadings.ind.X, 1], 10), main="X", horiz = T, cex.names=0.8) +loadings.ind.Y <- order(abs(cca.res$loadings$Y[, 1]), decreasing = T) +barplot(head(cca.res$loadings$Y[loadings.ind.Y, 1], 10), main="Y", horiz = T, cex.names=0.8) +max(abs(cca.res$variates$X - scale(cca.res$X, center=T, scale=F) %*% cca.res$loadings$X)) +max(abs(cca.res$variates$Y - scale(cca.res$Y, center=T, scale=F) %*% cca.res$loadings$Y)) +plotVar(cca.res) +``` + +#### 2. Perform CCA with scaled datasets and observe the difference + +```{r cca_scaled, include=TRUE, eval=TRUE} +cca.res.scale <- rcc(X=scale(nutrimouse$gene_selected, center=T, scale=T), + Y=scale(nutrimouse$lipid, center=T, scale=T), ncomp=2) +max(abs(cca.res.scale$cor - cca.res$cor)) +max(abs(cca.res.scale$variates$X - cca.res$variates$X)) +max(abs(cca.res.scale$variates$Y - cca.res$variates$Y)) +max(abs(cca.res.scale$loadings$X - cca.res$loadings$X)) +max(abs(cca.res.scale$loadings$Y - cca.res$loadings$Y)) +``` + +#### 3. Perform regularized CCA with all genes and lipids. +```{r rcca, include=TRUE, eval=TRUE} +rcca.res <- rcc(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, method="shrinkage") +plotVar(rcca.res) +``` diff --git a/Day1/practicals_dimensionality_reduction.html b/Day1/practicals_dimensionality_reduction.html index 4bf1f52..a124130 100644 --- a/Day1/practicals_dimensionality_reduction.html +++ b/Day1/practicals_dimensionality_reduction.html @@ -11,7 +11,7 @@ - + Dimensionality reduction @@ -352,11 +352,26 @@

Dimensionality reduction

Van Du Tran

-

2024-03-12

+

2024-03-13

+
knitr::opts_knit$set(
+    root.dir = '.',
+    keep.tex = TRUE
+);
+knitr::opts_chunk$set(
+    fig.width  = 9,
+    fig.height = 9,
+    fig.align  = 'center',
+    fig.pos    = 'h',
+    fig.show   = 'hold',
+    echo       = TRUE,
+    warning    = FALSE,
+    message    = FALSE,
+    cache      = FALSE
+);

Exercises

@@ -364,14 +379,199 @@

PCA

1. Load the nutrimouse data from the mixOmics R package and investigate its structure.

+
library(mixOmics)

A data object provided by an R package can be loaded with data. Its structure can be obainted with str, length, dim, etc.

+
data("nutrimouse")
+## display the structure of the nutrimouse object
+str(nutrimouse)
+
## List of 4
+##  $ gene    :'data.frame':    40 obs. of  120 variables:
+##   ..$ X36b4    : num [1:40] -0.42 -0.44 -0.48 -0.45 -0.42 -0.43 -0.53 -0.49 -0.36 -0.5 ...
+##   ..$ ACAT1    : num [1:40] -0.65 -0.68 -0.74 -0.69 -0.71 -0.69 -0.62 -0.69 -0.66 -0.62 ...
+##   ..$ ACAT2    : num [1:40] -0.84 -0.91 -1.1 -0.65 -0.54 -0.8 -1 -0.91 -0.74 -0.79 ...
+##   ..$ ACBP     : num [1:40] -0.34 -0.32 -0.46 -0.41 -0.38 -0.32 -0.44 -0.37 -0.39 -0.36 ...
+##   ..$ ACC1     : num [1:40] -1.29 -1.23 -1.3 -1.26 -1.21 -1.13 -1.22 -1.29 -1.15 -1.21 ...
+##   ..$ ACC2     : num [1:40] -1.13 -1.06 -1.09 -1.09 -0.89 -0.79 -1 -1.06 -1.08 -0.82 ...
+##   ..$ ACOTH    : num [1:40] -0.93 -0.99 -1.06 -0.93 -1 -0.93 -0.94 -1.05 -0.88 -0.92 ...
+##   ..$ ADISP    : num [1:40] -0.98 -0.97 -1.08 -1.02 -0.95 -0.97 -0.94 -1.02 -0.98 -0.99 ...
+##   ..$ ADSS1    : num [1:40] -1.19 -1 -1.18 -1.07 -1.08 -1.07 -1.05 -1.16 -1.05 -1 ...
+##   ..$ ALDH3    : num [1:40] -0.68 -0.62 -0.75 -0.71 -0.76 -0.75 -0.67 -0.75 -0.66 -0.69 ...
+##   ..$ AM2R     : num [1:40] -0.59 -0.58 -0.66 -0.65 -0.59 -0.55 -0.66 -0.66 -0.53 -0.62 ...
+##   ..$ AOX      : num [1:40] -0.16 -0.12 -0.16 -0.17 -0.31 -0.23 -0.09 -0.22 -0.06 -0.23 ...
+##   ..$ BACT     : num [1:40] -0.22 -0.32 -0.32 -0.32 -0.31 -0.29 -0.25 -0.21 -0.15 -0.2 ...
+##   ..$ BIEN     : num [1:40] -0.89 -0.88 -0.89 -0.77 -0.97 -0.84 -0.86 -0.9 -0.74 -0.76 ...
+##   ..$ BSEP     : num [1:40] -0.69 -0.6 -0.7 -0.67 -0.68 -0.55 -0.67 -0.66 -0.6 -0.58 ...
+##   ..$ Bcl.3    : num [1:40] -1.18 -1.07 -1.17 -1.12 -0.93 -1.08 -1.03 -1.01 -1.01 -1.1 ...
+##   ..$ C16SR    : num [1:40] 1.66 1.65 1.57 1.61 1.66 1.7 1.58 1.62 1.72 1.55 ...
+##   ..$ CACP     : num [1:40] -0.92 -0.87 -1.02 -0.89 -0.93 -0.97 -0.97 -0.96 -0.85 -0.95 ...
+##   ..$ CAR1     : num [1:40] -0.97 -0.92 -0.98 -0.97 -1.06 -1.03 -0.91 -1.11 -0.85 -0.99 ...
+##   ..$ CBS      : num [1:40] -0.26 -0.36 -0.4 -0.39 -0.35 -0.31 -0.32 -0.4 -0.26 -0.39 ...
+##   ..$ CIDEA    : num [1:40] -1.21 -1.17 -1.29 -1.18 -1.15 -1.14 -1.16 -1.26 -1.12 -1.08 ...
+##   ..$ COX1     : num [1:40] -1.11 -1.06 -1.17 -1.03 -0.99 -1.03 -1.15 -1.18 -0.94 -1.07 ...
+##   ..$ COX2     : num [1:40] -1.18 -1.06 -1.14 -1.13 -1.1 -1.16 -1.06 -1.24 -1.23 -1.09 ...
+##   ..$ CPT2     : num [1:40] -0.87 -0.87 -0.95 -0.88 -0.91 -0.92 -0.86 -0.93 -0.82 -0.88 ...
+##   ..$ CYP24    : num [1:40] -1.37 -1.14 -1.3 -1.27 -1.2 -1.11 -1.12 -1.3 -1.14 -1.08 ...
+##   ..$ CYP26    : num [1:40] -1.21 -1.12 -1.22 -1.18 -1.16 -1.1 -1.07 -1.23 -1.1 -1.1 ...
+##   ..$ CYP27a1  : num [1:40] -0.71 -0.62 -0.78 -0.71 -0.69 -0.6 -0.69 -0.81 -0.62 -0.62 ...
+##   ..$ CYP27b1  : num [1:40] -1.31 -1.14 -1.29 -1.27 -1.2 -1.15 -1.17 -1.28 -1.13 -1.15 ...
+##   ..$ CYP2b10  : num [1:40] -1.23 -1.2 -1.32 -1.23 -1.22 -1.1 -1.07 -1.26 -1.19 -1.1 ...
+##   ..$ CYP2b13  : num [1:40] -1.19 -1.06 -1.25 -1.13 -1.1 -1.07 -1.2 -1.37 -1.15 -1.11 ...
+##   ..$ CYP2c29  : num [1:40] -0.06 -0.2 -0.3 -0.07 -0.29 -0.28 -0.1 -0.1 0.18 -0.33 ...
+##   ..$ CYP3A11  : num [1:40] -0.09 -0.34 -0.45 -0.11 -0.51 -0.55 -0.18 -0.25 0.06 -0.4 ...
+##   ..$ CYP4A10  : num [1:40] -0.81 -0.88 -0.71 -0.65 -1.16 -0.99 -0.62 -0.82 -0.48 -0.79 ...
+##   ..$ CYP4A14  : num [1:40] -0.81 -0.84 -0.98 -0.41 -1.16 -1.09 -0.76 -0.87 -0.37 -0.95 ...
+##   ..$ CYP7a    : num [1:40] -0.77 -0.71 -0.93 -0.8 -0.71 -0.74 -0.76 -0.88 -0.77 -0.77 ...
+##   ..$ CYP8b1   : num [1:40] -0.77 -0.63 -0.53 -0.73 -0.51 -0.55 -0.57 -0.63 -0.6 -0.66 ...
+##   ..$ FAS      : num [1:40] -0.41 -0.37 -0.3 -0.59 -0.06 0.18 -0.16 0.04 -0.53 0.08 ...
+##   ..$ FAT      : num [1:40] -1.03 -0.98 -1.03 -1.06 -0.99 -0.99 -0.89 -1.08 -1.04 -0.91 ...
+##   ..$ FDFT     : num [1:40] -0.98 -0.92 -1.04 -1 -0.99 -1 -1.02 -0.97 -1.03 -0.95 ...
+##   ..$ FXR      : num [1:40] -0.93 -0.87 -1 -0.9 -0.89 -0.89 -0.86 -1.01 -0.81 -0.91 ...
+##   ..$ G6PDH    : num [1:40] -1.22 -1.09 -1.28 -1.19 -1.16 -0.96 -1.15 -1.26 -1.13 -1.03 ...
+##   ..$ G6Pase   : num [1:40] -0.46 -0.63 -1.06 -0.71 -0.58 -0.49 -0.51 -0.61 -0.38 -0.6 ...
+##   ..$ GK       : num [1:40] -0.71 -0.67 -0.68 -0.75 -0.62 -0.59 -0.59 -0.66 -0.68 -0.47 ...
+##   ..$ GS       : num [1:40] -1.24 -1.22 -1.36 -1.21 -1.22 -1.16 -1.15 -1.31 -1.16 -1.19 ...
+##   ..$ GSTa     : num [1:40] 0 -0.05 -0.13 -0.09 -0.02 -0.11 -0.06 -0.04 0.03 -0.02 ...
+##   ..$ GSTmu    : num [1:40] 0.02 -0.05 -0.19 0.03 -0.23 -0.05 -0.22 -0.07 0.23 -0.14 ...
+##   ..$ GSTpi2   : num [1:40] 0.45 0.3 0.18 0.36 0.3 0.17 0.12 0.48 0.53 0.01 ...
+##   ..$ HMGCoAred: num [1:40] -0.95 -0.86 -0.96 -1.02 -0.7 -0.76 -1 -0.88 -0.96 -0.7 ...
+##   ..$ HPNCL    : num [1:40] -0.65 -0.69 -0.75 -0.61 -0.66 -0.56 -0.61 -0.71 -0.53 -0.6 ...
+##   ..$ IL.2     : num [1:40] -0.94 -0.94 -1.16 -0.97 -0.93 -0.96 -0.96 -0.85 -0.84 -0.95 ...
+##   ..$ L.FABP   : num [1:40] 0.24 0.27 0.17 0.16 0 0.23 0.18 0.18 0.2 0.2 ...
+##   ..$ LCE      : num [1:40] 0.09 0.06 -0.05 0.01 -0.07 -0.1 -0.03 -0.08 0.12 -0.1 ...
+##   ..$ LDLr     : num [1:40] -0.82 -0.68 -0.82 -0.94 -0.73 -0.74 -0.8 -0.83 -0.81 -0.72 ...
+##   ..$ LPK      : num [1:40] -0.32 -0.39 -0.38 -0.38 -0.17 -0.14 -0.35 -0.13 -0.32 -0.24 ...
+##   ..$ LPL      : num [1:40] -1.01 -0.97 -1.11 -0.99 -1.05 -0.99 -0.93 -1.07 -0.94 -0.95 ...
+##   ..$ LXRa     : num [1:40] -0.82 -0.82 -0.91 -0.85 -0.83 -0.79 -0.77 -0.84 -0.75 -0.78 ...
+##   ..$ LXRb     : num [1:40] -1 -0.95 -1.16 -1.01 -1.01 -0.99 -0.98 -1.04 -0.98 -0.99 ...
+##   ..$ Lpin     : num [1:40] -0.87 -0.97 -0.95 -1 -0.57 -0.51 -0.81 -0.83 -0.83 -0.48 ...
+##   ..$ Lpin1    : num [1:40] -0.85 -0.99 -0.94 -1.02 -0.53 -0.51 -0.81 -0.87 -0.82 -0.49 ...
+##   ..$ Lpin2    : num [1:40] -0.85 -0.87 -0.9 -0.88 -0.72 -0.68 -0.8 -0.9 -0.68 -0.67 ...
+##   ..$ Lpin3    : num [1:40] -1.23 -1.12 -1.25 -1.18 -1.12 -1.09 -1.04 -1.23 -1.13 -1.11 ...
+##   ..$ M.CPT1   : num [1:40] -1.15 -1.06 -1.26 -1.1 -1.11 -1.14 -1.08 -1.19 -1.06 -1.09 ...
+##   ..$ MCAD     : num [1:40] -0.6 -0.62 -0.7 -0.59 -0.69 -0.66 -0.53 -0.66 -0.45 -0.62 ...
+##   ..$ MDR1     : num [1:40] -1.15 -1.1 -1.26 -1.13 -1.11 -1.09 -1.09 -1.19 -1.06 -1.1 ...
+##   ..$ MDR2     : num [1:40] -0.77 -0.65 -0.86 -0.77 -0.7 -0.69 -0.81 -0.81 -0.69 -0.75 ...
+##   ..$ MRP6     : num [1:40] -0.99 -0.85 -0.9 -0.95 -0.91 -0.84 -0.88 -1.02 -0.83 -0.86 ...
+##   ..$ MS       : num [1:40] -1.11 -1.06 -1.2 -1.09 -1.09 -1.09 -0.99 -1.16 -1.06 -0.98 ...
+##   ..$ MTHFR    : num [1:40] -0.96 -0.99 -1.1 -0.95 -0.93 -0.96 -0.88 -1.03 -1.01 -0.95 ...
+##   ..$ NGFiB    : num [1:40] -1.21 -1.08 -1.24 -1.12 -1.11 -1.04 -1.02 -1.21 -1.11 -1.04 ...
+##   ..$ NURR1    : num [1:40] -1.21 -1.1 -1.32 -1.11 -1.14 -1.18 -1.1 -1.26 -1.14 -1.09 ...
+##   ..$ Ntcp     : num [1:40] -0.49 -0.45 -0.44 -0.54 -0.47 -0.46 -0.55 -0.5 -0.44 -0.43 ...
+##   ..$ OCTN2    : num [1:40] -1.15 -1.15 -1.2 -1.17 -1.19 -1.11 -1.08 -1.21 -1.05 -1.08 ...
+##   ..$ PAL      : num [1:40] -1.32 -1.25 -1.16 -1.25 -1.24 -1.02 -1.04 -1.27 -0.93 -0.92 ...
+##   ..$ PDK4     : num [1:40] -1.16 -1.16 -1.27 -1.16 -1.13 -1.08 -1.14 -1.24 -1.19 -1.04 ...
+##   ..$ PECI     : num [1:40] -0.68 -0.69 -0.92 -0.71 -0.83 -0.81 -0.79 -0.85 -0.58 -0.82 ...
+##   ..$ PLTP     : num [1:40] -1.1 -0.99 -1.03 -1.08 -0.98 -0.89 -1.05 -1.07 -1.02 -0.85 ...
+##   ..$ PMDCI    : num [1:40] -0.52 -0.52 -0.6 -0.52 -0.71 -0.69 -0.55 -0.57 -0.46 -0.69 ...
+##   ..$ PON      : num [1:40] -0.52 -0.55 -0.65 -0.64 -0.57 -0.63 -0.56 -0.65 -0.6 -0.64 ...
+##   ..$ PPARa    : num [1:40] -0.93 -0.86 -0.95 -0.97 -0.94 -0.95 -0.9 -1.12 -0.88 -0.95 ...
+##   ..$ PPARd    : num [1:40] -1.51 -1.59 -1.71 -1.57 -1.53 -1.56 -1.49 -1.57 -1.58 -1.54 ...
+##   ..$ PPARg    : num [1:40] -1.06 -1.02 -1.14 -1.05 -1.09 -1.01 -1 -1.13 -0.97 -1.07 ...
+##   ..$ PXR      : num [1:40] -0.99 -0.96 -1.1 -0.99 -1 -1.03 -0.93 -1.07 -0.98 -0.96 ...
+##   ..$ Pex11a   : num [1:40] -1 -1.02 -1.2 -1 -0.95 -1.07 -1.05 -1.02 -1 -1.01 ...
+##   ..$ RARa     : num [1:40] -1.2 -1.06 -1.16 -1.17 -1.15 -1.13 -1.09 -1.24 -1.03 -1.09 ...
+##   ..$ RARb2    : num [1:40] -1.19 -1.11 -1.23 -1.16 -1.14 -1.07 -1.09 -1.18 -1.12 -1.1 ...
+##   ..$ RXRa     : num [1:40] -0.67 -0.59 -0.68 -0.72 -0.78 -0.62 -0.65 -0.76 -0.55 -0.67 ...
+##   ..$ RXRb2    : num [1:40] -0.95 -0.95 -1.07 -0.95 -0.98 -0.94 -0.92 -1.03 -0.94 -0.95 ...
+##   ..$ RXRg1    : num [1:40] -1.16 -1.1 -1.21 -1.1 -1.11 -1.03 -1.07 -1.19 -1.05 -1.04 ...
+##   ..$ S14      : num [1:40] -0.93 -0.86 -0.84 -1.05 -0.65 -0.4 -0.73 -0.62 -0.99 -0.25 ...
+##   ..$ SHP1     : num [1:40] -1.1 -0.97 -1.09 -1.03 -1.13 -0.98 -0.95 -1.21 -0.93 -0.97 ...
+##   ..$ SIAT4c   : num [1:40] -1.07 -0.97 -1.04 -0.99 -0.94 -0.93 -0.89 -1.04 -0.93 -0.95 ...
+##   ..$ SPI1.1   : num [1:40] 1.19 1.15 1.09 1.07 1.22 1.05 1.15 1.18 1.21 1.04 ...
+##   ..$ SR.BI    : num [1:40] -0.84 -0.86 -0.95 -0.95 -1.06 -0.8 -0.83 -1 -0.83 -0.77 ...
+##   ..$ THB      : num [1:40] -0.79 -0.85 -0.92 -0.79 -0.84 -0.86 -0.8 -0.86 -0.83 -0.85 ...
+##   ..$ THIOL    : num [1:40] -0.18 -0.15 -0.24 -0.15 -0.35 -0.29 -0.22 -0.23 -0.17 -0.18 ...
+##   ..$ TRa      : num [1:40] -1.48 -1.46 -1.58 -1.54 -1.46 -1.44 -1.32 -1.56 -1.46 -1.35 ...
+##   ..$ TRb      : num [1:40] -1.07 -1 -1.16 -1.11 -1.01 -1 -0.97 -1.08 -1.02 -0.98 ...
+##   ..$ Tpalpha  : num [1:40] -0.69 -0.74 -0.81 -0.74 -0.82 -0.76 -0.72 -0.76 -0.65 -0.83 ...
+##   ..$ Tpbeta   : num [1:40] -1.11 -1.09 -1.14 -1.04 -1.2 -1.05 -1 -1.16 -0.91 -1.07 ...
+##   .. [list output truncated]
+##  $ lipid   :'data.frame':    40 obs. of  21 variables:
+##   ..$ C14.0   : num [1:40] 0.34 0.38 0.36 0.22 0.37 1.7 0.35 0.34 0.22 1.38 ...
+##   ..$ C16.0   : num [1:40] 26.4 24 23.7 25.5 24.8 ...
+##   ..$ C18.0   : num [1:40] 10.22 9.93 8.96 8.14 9.63 ...
+##   ..$ C16.1n.9: num [1:40] 0.35 0.55 0.55 0.49 0.46 0.66 0.36 0.29 0.44 0.9 ...
+##   ..$ C16.1n.7: num [1:40] 3.1 2.54 2.65 2.82 2.85 7.26 3.6 3.27 2.36 7.01 ...
+##   ..$ C18.1n.9: num [1:40] 17 20.1 22.9 21.9 21.4 ...
+##   ..$ C18.1n.7: num [1:40] 2.41 3.92 3.96 2.52 2.96 8.99 2.15 1.99 1.81 8.85 ...
+##   ..$ C20.1n.9: num [1:40] 0.26 0.23 0.26 0 0.3 0.36 0.25 0.31 0 0.21 ...
+##   ..$ C20.3n.9: num [1:40] 0 0 0.19 0 0.27 2.89 0 0 0 2.03 ...
+##   ..$ C18.2n.6: num [1:40] 8.93 14.98 16.06 13.89 14.55 ...
+##   ..$ C18.3n.6: num [1:40] 0 0.3 0.27 0 0.27 2.66 0 0 0 0 ...
+##   ..$ C20.2n.6: num [1:40] 0 0.3 0.33 0 0.23 0 0 0 0 0 ...
+##   ..$ C20.3n.6: num [1:40] 0.78 1.64 1.51 1.1 1.58 0.81 0.68 0.72 1.07 0.59 ...
+##   ..$ C20.4n.6: num [1:40] 3.07 15.34 13.27 3.92 11.85 ...
+##   ..$ C22.4n.6: num [1:40] 0 0.58 0.54 0 0.32 0 0 0 0 0 ...
+##   ..$ C22.5n.6: num [1:40] 0 2.1 1.77 0 0.44 0.56 0 0 0 0.39 ...
+##   ..$ C18.3n.3: num [1:40] 5.97 0 0 0.49 0.42 0 8.4 6.01 0.55 0 ...
+##   ..$ C20.3n.3: num [1:40] 0.37 0 0 0 0 0 0.42 0.39 0 0 ...
+##   ..$ C20.5n.3: num [1:40] 8.62 0 0 2.99 0.3 0 7.37 7.96 3.13 0 ...
+##   ..$ C22.5n.3: num [1:40] 1.75 0.48 0.22 1.04 0.35 2.13 2.05 2.33 1.65 0 ...
+##   ..$ C22.6n.3: num [1:40] 10.39 2.61 2.51 14.99 6.69 ...
+##  $ diet    : Factor w/ 5 levels "coc","fish","lin",..: 3 5 5 2 4 1 3 3 2 1 ...
+##  $ genotype: Factor w/ 2 levels "wt","ppar": 1 1 1 1 1 1 1 1 1 1 ...
+
## check dimensions
+lapply(nutrimouse, dim) # apply function dim to each element in list nutrimouse
+
## $gene
+## [1]  40 120
+## 
+## $lipid
+## [1] 40 21
+## 
+## $diet
+## NULL
+## 
+## $genotype
+## NULL
+
lapply(nutrimouse, length) # apply function length to each element in list nutrimouse
+
## $gene
+## [1] 120
+## 
+## $lipid
+## [1] 21
+## 
+## $diet
+## [1] 40
+## 
+## $genotype
+## [1] 40

2. Take the gene expression dataset in samples x variables matrix format. Investigate variables’ distribution.

+
## get gene expression data structure
+class(nutrimouse$gene)
+
## [1] "data.frame"
+
dim(nutrimouse$gene)
+
## [1]  40 120
+
rownames(nutrimouse$gene)[1:10]
+
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"
+
colnames(nutrimouse$gene)[1:10]
+
##  [1] "X36b4" "ACAT1" "ACAT2" "ACBP"  "ACC1"  "ACC2"  "ACOTH" "ADISP" "ADSS1"
+## [10] "ALDH3"
+
## check if there are missing values
+any(is.na(nutrimouse$gene))
+
## [1] FALSE
+
## investigate each variable
+summary(nutrimouse$gene[, 1:4])
+
##      X36b4             ACAT1             ACAT2              ACBP        
+##  Min.   :-0.5800   Min.   :-0.7500   Min.   :-1.1000   Min.   :-0.6600  
+##  1st Qu.:-0.5025   1st Qu.:-0.6900   1st Qu.:-0.8800   1st Qu.:-0.5025  
+##  Median :-0.4600   Median :-0.6600   Median :-0.7950   Median :-0.4250  
+##  Mean   :-0.4552   Mean   :-0.6552   Mean   :-0.7668   Mean   :-0.4338  
+##  3rd Qu.:-0.4200   3rd Qu.:-0.6200   3rd Qu.:-0.6450   3rd Qu.:-0.3550  
+##  Max.   :-0.3000   Max.   :-0.5200   Max.   :-0.3900   Max.   :-0.2400
+
colors <- rainbow(20, alpha=1)
+plot(density(scale(nutrimouse$gene[, 1], center=T, scale=F)), 
+     col=colors[1], xlim=c(-0.5,0.5), ylim=c(0,8),
+     main='Density plot of the first 20 genes')
+invisible(sapply(2:20, function(i) {
+    lines(density(scale(nutrimouse$gene[, i], center=T, scale=F)), col=colors[i])
+}))
+

3. Perform PCA and investigate variances, sample distribution and @@ -380,26 +580,127 @@

3. Perform PCA and investigate variances, sample distribution and e.g. stats::prcomp, stats::princomp, mixOmics::pca, multiblock::pca, psych::principal, FactoMineR::PCA, etc.

+
pca.res <- prcomp(nutrimouse$gene, center=TRUE, scale.=F)
+names(pca.res)
+
## [1] "sdev"     "rotation" "center"   "scale"    "x"
+
summary(pca.res)
+
## Importance of components:
+##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
+## Standard deviation     0.6763 0.5064 0.4033 0.28206 0.24164 0.19445 0.17513
+## Proportion of Variance 0.3497 0.1961 0.1244 0.06084 0.04465 0.02891 0.02345
+## Cumulative Proportion  0.3497 0.5458 0.6702 0.73107 0.77572 0.80463 0.82808
+##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
+## Standard deviation     0.16498 0.14796 0.13623 0.13425 0.11505 0.11208 0.11052
+## Proportion of Variance 0.02081 0.01674 0.01419 0.01378 0.01012 0.00961 0.00934
+## Cumulative Proportion  0.84889 0.86563 0.87983 0.89361 0.90373 0.91333 0.92267
+##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
+## Standard deviation     0.10450 0.09952 0.09052 0.08962 0.07914 0.07511 0.07313
+## Proportion of Variance 0.00835 0.00757 0.00627 0.00614 0.00479 0.00431 0.00409
+## Cumulative Proportion  0.93102 0.93860 0.94486 0.95101 0.95579 0.96011 0.96420
+##                           PC22    PC23    PC24    PC25    PC26    PC27    PC28
+## Standard deviation     0.06913 0.06708 0.06308 0.06186 0.06029 0.05810 0.05639
+## Proportion of Variance 0.00365 0.00344 0.00304 0.00293 0.00278 0.00258 0.00243
+## Cumulative Proportion  0.96785 0.97129 0.97434 0.97726 0.98004 0.98262 0.98505
+##                           PC29    PC30    PC31    PC32    PC33    PC34    PC35
+## Standard deviation     0.05151 0.04984 0.04840 0.04724 0.04602 0.04083 0.03979
+## Proportion of Variance 0.00203 0.00190 0.00179 0.00171 0.00162 0.00127 0.00121
+## Cumulative Proportion  0.98708 0.98898 0.99077 0.99248 0.99410 0.99538 0.99659
+##                           PC36    PC37    PC38    PC39      PC40
+## Standard deviation     0.03680 0.03468 0.03282 0.02883 1.865e-15
+## Proportion of Variance 0.00104 0.00092 0.00082 0.00064 0.000e+00
+## Cumulative Proportion  0.99762 0.99854 0.99936 1.00000 1.000e+00

Variances = eigenvalues of the covariance matrix = (standard deviation)^2.

+
variances <- pca.res$sdev^2
+variances
+
##  [1] 4.573742e-01 2.564410e-01 1.626804e-01 7.955690e-02 5.838751e-02
+##  [6] 3.780991e-02 3.066913e-02 2.721979e-02 2.189256e-02 1.855921e-02
+## [11] 1.802243e-02 1.323667e-02 1.256153e-02 1.221509e-02 1.091924e-02
+## [16] 9.905054e-03 8.193579e-03 8.032182e-03 6.262595e-03 5.641794e-03
+## [21] 5.348430e-03 4.779376e-03 4.500169e-03 3.978795e-03 3.826089e-03
+## [26] 3.634453e-03 3.376009e-03 3.179730e-03 2.653212e-03 2.484088e-03
+## [31] 2.342963e-03 2.231208e-03 2.117845e-03 1.667351e-03 1.583188e-03
+## [36] 1.353925e-03 1.202714e-03 1.076873e-03 8.311648e-04 3.480070e-30

Scree plot: plot of variances.

+
screeplot(pca.res, npcs=length(variances), type='lines')
+screeplot(pca.res, npcs=length(variances), type='barplot')
+barplot(variances, xlab='PC', ylab='Variance', names.arg=1:length(variances))
+

Scree plot on variance percentage.

+
varPercent <- variances/sum(variances) * 100
+barplot(varPercent, xlab='PC', ylab='Percent Variance', names.arg=1:length(varPercent))
+

Scores: sample coordinates in the new reference (rotated axes or principal components).

+
scores <- pca.res$x
+str(scores)
+
##  num [1:40, 1:40] -0.5036 -0.6119 0.0596 -0.4686 -0.2457 ...
+##  - attr(*, "dimnames")=List of 2
+##   ..$ : chr [1:40] "1" "2" "3" "4" ...
+##   ..$ : chr [1:40] "PC1" "PC2" "PC3" "PC4" ...

Score plot: plot of sample distribution.

+
PCx <- "PC1"
+PCy <- "PC2"
+plot(scores[, PCx], scores[, PCy], xlab=PCx, ylab=PCy, pch=16)
+text(scores[, PCx], scores[, PCy]-0.05, rownames(scores), col='blue', cex=0.7)
+

Loadings: contributions of variables to principal components (eigenvectors of covariance matrix).

+
loadings <- pca.res$rotation
+str(loadings)
+
##  num [1:120, 1:40] -0.0425 -0.023 -0.0131 -0.107 -0.0618 ...
+##  - attr(*, "dimnames")=List of 2
+##   ..$ : chr [1:120] "X36b4" "ACAT1" "ACAT2" "ACBP" ...
+##   ..$ : chr [1:40] "PC1" "PC2" "PC3" "PC4" ...

Loading plot: plot of variables’ contribution, revealing their relationship.

+
plot(loadings[, PCx], loadings[, PCy], type='n', main="Loadings")
+arrows(0, 0, loadings[, PCx], loadings[, PCy], xlab=PCx, ylab=PCy,
+       length=0.1, angle=20, col=rgb(0,0,1,alpha=apply(loadings[, c(PCx, PCy)], 1, norm, "2")))
+text(loadings[, PCx], loadings[, PCy], rownames(loadings), col='grey', cex=0.7)
+

Both score and loading plots can be plotted altogether with the biplot function.

+
## biplot
+biplot(pca.res, expand=1, cex=c(0.5, 0.7), col=c("gray50", "red"))
+
+library(factoextra)
+fviz_pca_biplot(pca.res, repel = TRUE,
+                col.var = "blue", # Variables color
+                habillage = nutrimouse$genotype,
+                addEllipses = T,
+                legend="none"
+                )
+

4. Visually investigate the sample distribution with coloring by metadata or expression of certain genes.

The samples can be colored with some metadata, e.g genotype or diet,

+
plot(scores[, PCx], scores[, PCy], main="Scores",
+     col=c(1:nlevels(nutrimouse$diet))[nutrimouse$diet],
+     pch=c(17,19)[nutrimouse$genotype],
+     xlab=paste0(PCx,
+                 " (",
+                 round((summary(pca.res)$importance)[2, PCx], 2), ")"),
+     ylab=paste0(PCy,
+                 " (",
+                 round((summary(pca.res)$importance)[2, PCy], 2), ")")
+)
+legend("topright", title="genotype",
+       legend=levels(nutrimouse$genotype),
+       pch=c(17,19), cex=0.7)
+legend("bottomright", title="diet",
+       legend=levels(nutrimouse$diet),
+       col=c(1:5), cex=0.7, pch=16)
+

or by some gene expression.

+
nbreaks <- 5
+plot(scores[, PCx], scores[, PCy], xlab=PCx, ylab=PCy, 
+     pch=c(17,19)[nutrimouse$genotype], 
+     col=colorRampPalette(c('red','blue'))(nbreaks)[as.numeric(cut(nutrimouse$gene$ALDH3,breaks = nbreaks))])
+

@@ -408,32 +709,146 @@

PLS

1. Perform PLS (mixOmics::pls) between gene and lipid. Investigate its output, sample distribution and variable relationship with plots.

+
pls.res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="canonical")
+max(abs(scale(nutrimouse$gene, center=T, scale=T) - pls.res$X))
+
## [1] 1.332268e-15
+
max(abs(scale(nutrimouse$lipid, center=T, scale=T) - pls.res$Y))
+
## [1] 8.881784e-16

The sample distribution plot can be performed with variates, sample coordinates in the new reference (rotated axes) for each of the two blocks,

+
str(pls.res$variates)
+
## List of 2
+##  $ X: num [1:40, 1:2] 6.659 0.614 9.876 4.864 0.934 ...
+##   ..- attr(*, "dimnames")=List of 2
+##   .. ..$ : chr [1:40] "1" "2" "3" "4" ...
+##   .. ..$ : chr [1:2] "comp1" "comp2"
+##  $ Y: num [1:40, 1:2] 2.33 2.6 1.98 1.94 2.29 ...
+##   ..- attr(*, "dimnames")=List of 2
+##   .. ..$ : chr [1:40] "1" "2" "3" "4" ...
+##   .. ..$ : chr [1:2] "comp1" "comp2"
+
PCx <- "comp1"
+PCy <- "comp2"
+par(mfrow=c(1,2))
+plot(pls.res$variates$X[, PCx], pls.res$variates$X[, PCy], xlab=PCx, ylab=PCy, main="X", type='n')
+text(pls.res$variates$X[, PCx], pls.res$variates$X[, PCy], rownames(pls.res$variates$X), col='blue', cex=0.6)
+plot(pls.res$variates$Y[, PCx], pls.res$variates$Y[, PCy], xlab=PCx, ylab=PCy, main="Y", type='n')
+text(pls.res$variates$Y[, PCx], pls.res$variates$Y[, PCy], rownames(pls.res$variates$Y), col='blue', cex=0.6)
+

which is also produced with plotIndiv.

+
plotIndiv(pls.res)
+

Loading plot: plot of variables’ contribution in each data block to each variate, after deflating more important variates,

+
par(mfrow=c(1,2), las=2, mar=c(4,8,1,1))
+loadings.ind.X <- order(abs(pls.res$loadings$X[, "comp1"]), decreasing = T)
+barplot(head(pls.res$loadings$X[loadings.ind.X, "comp1"], 10), main="X", horiz = T, cex.names=0.8)
+loadings.ind.Y <- order(abs(pls.res$loadings$Y[, "comp1"]), decreasing = T)
+barplot(head(pls.res$loadings$Y[loadings.ind.Y, "comp1"], 10), main="Y", horiz = T, cex.names=0.8)
+

which is the same as with plotLoadings.

+
plotLoadings(pls.res, ndisplay = 10)
+

The plot of variable relationship could be obtained from loadings.star.

+
names(pls.res$loadings.star) <- c("X", "Y")
+colnames(pls.res$loadings.star$X) <- colnames(pls.res$loadings.star$Y) <- c(PCx, PCy)
+plot(1,1,type='n',
+     xlim=range(c(pls.res$loadings.star$X[, PCx],pls.res$loadings.star$Y[, PCx])), 
+     ylim=range(c(pls.res$loadings.star$X[, PCy],pls.res$loadings.star$Y[, PCy])))
+arrows(0, 0, pls.res$loadings.star$X[, PCx], pls.res$loadings.star$X[, PCy],
+       length=0.1, angle=20, col=rgb(0,0,1,alpha=apply(pls.res$loadings.star$X[, c(PCx, PCy)], 1, norm, "2")))
+text(pls.res$loadings.star$X[, PCx], 
+     pls.res$loadings.star$X[, PCy], 
+     rownames(pls.res$loadings.star$X), col='grey', cex=0.7)
+arrows(0, 0, pls.res$loadings.star$Y[, PCx], pls.res$loadings.star$Y[, PCy],
+       length=0.1, angle=20, col=rgb(1,0,0,alpha=apply(pls.res$loadings.star$Y[, c(PCx, PCy)], 1, norm, "2")))
+text(pls.res$loadings.star$Y[, PCx], 
+     pls.res$loadings.star$Y[, PCy], 
+     rownames(pls.res$loadings.star$Y), col='grey', cex=0.7)
+plotVar(pls.res)
+

Both sample distribution and variable relationship plot could be done with biplot function.

+
biplot(pls.res, block="X", ind.names.size=3, var.names.size=2)
+biplot(pls.res, block="Y", ind.names.size=3, var.names.size=2)
+

2. Observe the difference between the two modes regression and canonical of PLS.

+
pls.reg.res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="regression")
+pls.can.res <- pls(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, scale=TRUE, mode="canonical")
+par(mfrow=c(2,2))
+biplot(pls.reg.res, block="X", ind.names.size=3, var.names.size=2)
+biplot(pls.can.res, block="X", ind.names.size=3, var.names.size=2)
+biplot(pls.reg.res, block="Y", ind.names.size=3, var.names.size=2)
+biplot(pls.can.res, block="Y", ind.names.size=3, var.names.size=2)
+

3. Perform PLS-DA (mixOmics::plsda) between gene and genotype. Redo PLS-DA using mixOmics::pls and compare the results.

+
pls.da.res <- plsda(X=nutrimouse$gene, Y=nutrimouse$genotype, ncomp=2, scale=TRUE)
+pls.regda.res <- pls(X=nutrimouse$gene, Y=c(0,1)[nutrimouse$genotype], ncomp=2, scale=TRUE, mode="regression")
+par(mfrow=c(1,2))
+biplot(pls.da.res, block="X", ind.names.size=3, var.names.size=2,
+       group=nutrimouse$genotype, col.per.group = c("red", "blue"), legend.title = 'genotype')
+biplot(pls.regda.res, block="X", ind.names.size=3, var.names.size=2,
+       group=nutrimouse$genotype, col.per.group = c("red", "blue"), legend.title = 'genotype')
+

4. Perform OPLS-DA (ropls::opls) between gene and genotype. Investigate its output, sample distribution, variable relationship and predictive performance.

(https://bioconductor.org/packages/release/bioc/vignettes/ropls/inst/doc/ropls-vignette.html)

+
library(ropls)
+opls.res <- opls(x=nutrimouse$gene, y=nutrimouse$genotype, predI=1, orthoI=NA, fig.pdfC='none')
+
## OPLS-DA
+## 40 samples x 120 variables and 1 response
+## standard scaling of predictors and response(s)
+##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
+## Total    0.615    0.975   0.935 0.083   1   2 0.05 0.05
+
par(mfrow=c(2,2))
+plot(opls.res, typeVc='x-score')
+plot(opls.res, typeVc='x-loading')
+plot(opls.res, typeVc='overview')
+plot(opls.res, typeVc='correlation')
+
+opls.res <- opls(x=nutrimouse$gene, y=nutrimouse$genotype, predI=1, orthoI=NA, permI=100, fig.pdfC='none')
+
## OPLS-DA
+## 40 samples x 120 variables and 1 response
+## standard scaling of predictors and response(s)
+##       R2X(cum) R2Y(cum) Q2(cum) RMSEE pre ort pR2Y  pQ2
+## Total    0.615    0.975   0.935 0.083   1   2 0.01 0.01
+
plot(opls.res, typeVc='permutation')
+
+opls.res <- opls(x=nutrimouse$gene, y=nutrimouse$genotype, predI=1, orthoI=NA, subset=c(1:13, 21:33), fig.pdfC='none')
+
## OPLS-DA
+## 26 samples x 120 variables and 1 response
+## standard scaling of predictors and response(s)
+##       R2X(cum) R2Y(cum) Q2(cum)  RMSEE RMSEP pre ort
+## Total    0.646    0.983   0.915 0.0704 0.134   1   2
+
par(mfrow=c(1,2))
+plot(opls.res, typeVc='predict-train')
+plot(opls.res, typeVc='predict-test')
+
+# confusion matrix on training set
+train_id <- getSubsetVi(opls.res)
+table(nutrimouse$genotype[train_id], fitted(opls.res))
+
##       
+##        wt ppar
+##   wt   13    0
+##   ppar  0   13
+
# confusion matrix on test set
+table(nutrimouse$genotype[-train_id], predict(opls.res, nutrimouse$gene[-train_id,]))
+
##       
+##        wt ppar
+##   wt    7    0
+##   ppar  0    7
+

@@ -445,17 +860,147 @@

1. Perform CCA (mixOmics::rcc) between 20 genes and all

The gene expression data is reduced to 20 genes so that the number of variables is less than the number of samples, to perform an unregularized CCA.

+
nutrimouse$gene_selected <- nutrimouse$gene[, 1:20]
+str(nutrimouse$gene_selected)
+
## 'data.frame':    40 obs. of  20 variables:
+##  $ X36b4: num  -0.42 -0.44 -0.48 -0.45 -0.42 -0.43 -0.53 -0.49 -0.36 -0.5 ...
+##  $ ACAT1: num  -0.65 -0.68 -0.74 -0.69 -0.71 -0.69 -0.62 -0.69 -0.66 -0.62 ...
+##  $ ACAT2: num  -0.84 -0.91 -1.1 -0.65 -0.54 -0.8 -1 -0.91 -0.74 -0.79 ...
+##  $ ACBP : num  -0.34 -0.32 -0.46 -0.41 -0.38 -0.32 -0.44 -0.37 -0.39 -0.36 ...
+##  $ ACC1 : num  -1.29 -1.23 -1.3 -1.26 -1.21 -1.13 -1.22 -1.29 -1.15 -1.21 ...
+##  $ ACC2 : num  -1.13 -1.06 -1.09 -1.09 -0.89 -0.79 -1 -1.06 -1.08 -0.82 ...
+##  $ ACOTH: num  -0.93 -0.99 -1.06 -0.93 -1 -0.93 -0.94 -1.05 -0.88 -0.92 ...
+##  $ ADISP: num  -0.98 -0.97 -1.08 -1.02 -0.95 -0.97 -0.94 -1.02 -0.98 -0.99 ...
+##  $ ADSS1: num  -1.19 -1 -1.18 -1.07 -1.08 -1.07 -1.05 -1.16 -1.05 -1 ...
+##  $ ALDH3: num  -0.68 -0.62 -0.75 -0.71 -0.76 -0.75 -0.67 -0.75 -0.66 -0.69 ...
+##  $ AM2R : num  -0.59 -0.58 -0.66 -0.65 -0.59 -0.55 -0.66 -0.66 -0.53 -0.62 ...
+##  $ AOX  : num  -0.16 -0.12 -0.16 -0.17 -0.31 -0.23 -0.09 -0.22 -0.06 -0.23 ...
+##  $ BACT : num  -0.22 -0.32 -0.32 -0.32 -0.31 -0.29 -0.25 -0.21 -0.15 -0.2 ...
+##  $ BIEN : num  -0.89 -0.88 -0.89 -0.77 -0.97 -0.84 -0.86 -0.9 -0.74 -0.76 ...
+##  $ BSEP : num  -0.69 -0.6 -0.7 -0.67 -0.68 -0.55 -0.67 -0.66 -0.6 -0.58 ...
+##  $ Bcl.3: num  -1.18 -1.07 -1.17 -1.12 -0.93 -1.08 -1.03 -1.01 -1.01 -1.1 ...
+##  $ C16SR: num  1.66 1.65 1.57 1.61 1.66 1.7 1.58 1.62 1.72 1.55 ...
+##  $ CACP : num  -0.92 -0.87 -1.02 -0.89 -0.93 -0.97 -0.97 -0.96 -0.85 -0.95 ...
+##  $ CAR1 : num  -0.97 -0.92 -0.98 -0.97 -1.06 -1.03 -0.91 -1.11 -0.85 -0.99 ...
+##  $ CBS  : num  -0.26 -0.36 -0.4 -0.39 -0.35 -0.31 -0.32 -0.4 -0.26 -0.39 ...
+
cca.res <- rcc(X=nutrimouse$gene_selected, Y=nutrimouse$lipid, ncomp=2)
+max(abs(nutrimouse$gene_selected - cca.res$X))
+
## [1] 0
+
max(abs(nutrimouse$lipid - cca.res$Y))
+
## [1] 0
+
str(cca.res)
+
## List of 11
+##  $ call         : language rcc(X = nutrimouse$gene_selected, Y = nutrimouse$lipid, ncomp = 2)
+##  $ X            : num [1:40, 1:20] -0.42 -0.44 -0.48 -0.45 -0.42 -0.43 -0.53 -0.49 -0.36 -0.5 ...
+##   ..- attr(*, "dimnames")=List of 2
+##   .. ..$ : chr [1:40] "1" "2" "3" "4" ...
+##   .. ..$ : chr [1:20] "X36b4" "ACAT1" "ACAT2" "ACBP" ...
+##  $ Y            : num [1:40, 1:21] 0.34 0.38 0.36 0.22 0.37 1.7 0.35 0.34 0.22 1.38 ...
+##   ..- attr(*, "dimnames")=List of 2
+##   .. ..$ : chr [1:40] "1" "2" "3" "4" ...
+##   .. ..$ : chr [1:21] "C14.0" "C16.0" "C18.0" "C16.1n.9" ...
+##  $ ncomp        : num 2
+##  $ method       : chr "ridge"
+##  $ cor          : Named num [1:20] 1 1 0.999 0.996 0.981 ...
+##   ..- attr(*, "names")= chr [1:20] "1" "2" "3" "4" ...
+##  $ loadings     :List of 2
+##   ..$ X: num [1:20, 1:2] 1.408 4.802 3.234 -7.373 -0.724 ...
+##   .. ..- attr(*, "dimnames")=List of 2
+##   .. .. ..$ : chr [1:20] "X36b4" "ACAT1" "ACAT2" "ACBP" ...
+##   .. .. ..$ : NULL
+##   ..$ Y: num [1:21, 1:2] 1.1112 -0.1436 -0.4625 -1.0203 -0.0901 ...
+##   .. ..- attr(*, "dimnames")=List of 2
+##   .. .. ..$ : chr [1:21] "C14.0" "C16.0" "C18.0" "C16.1n.9" ...
+##   .. .. ..$ : NULL
+##  $ variates     :List of 2
+##   ..$ X: num [1:40, 1:2] -1.203 -1.25 -0.831 0.338 -0.119 ...
+##   .. ..- attr(*, "dimnames")=List of 2
+##   .. .. ..$ : chr [1:40] "1" "2" "3" "4" ...
+##   .. .. ..$ : NULL
+##   ..$ Y: num [1:40, 1:2] -1.203 -1.25 -0.831 0.338 -0.119 ...
+##   .. ..- attr(*, "dimnames")=List of 2
+##   .. .. ..$ : chr [1:40] "1" "2" "3" "4" ...
+##   .. .. ..$ : NULL
+##  $ names        :List of 4
+##   ..$ sample  : chr [1:40] "1" "2" "3" "4" ...
+##   ..$ colnames:List of 2
+##   .. ..$ X: chr [1:20] "X36b4" "ACAT1" "ACAT2" "ACBP" ...
+##   .. ..$ Y: chr [1:21] "C14.0" "C16.0" "C18.0" "C16.1n.9" ...
+##   ..$ blocks  : chr [1:2] "X" "Y"
+##   ..$ data    : chr [1:2] "nutrimouse$gene_selected" "nutrimouse$lipid"
+##  $ lambda       : Named num [1:2] 0 0
+##   ..- attr(*, "names")= chr [1:2] "lambda1" "lambda2"
+##  $ prop_expl_var:List of 2
+##   ..$ X: Named num [1:2] 0.00132 0.0024
+##   .. ..- attr(*, "names")= chr [1:2] "comp1" "comp2"
+##   ..$ Y: Named num [1:2] 0.0184 0.0299
+##   .. ..- attr(*, "names")= chr [1:2] "comp1" "comp2"
+##  - attr(*, "class")= chr "rcc"
+
cca.res$cor
+
##          1          2          3          4          5          6          7 
+## 1.00000000 1.00000000 0.99922446 0.99607902 0.98142435 0.95641141 0.89083472 
+##          8          9         10         11         12         13         14 
+## 0.88959894 0.78648273 0.76470925 0.75189350 0.66984945 0.63240310 0.53662009 
+##         15         16         17         18         19         20 
+## 0.49948385 0.34852831 0.33274136 0.27818295 0.22569639 0.03783839

The sample distribution plot can be performed with variates, sample coordinates in the new reference (rotated axes) for each of the two blocks.

+
str(cca.res$variates)
+
## List of 2
+##  $ X: num [1:40, 1:2] -1.203 -1.25 -0.831 0.338 -0.119 ...
+##   ..- attr(*, "dimnames")=List of 2
+##   .. ..$ : chr [1:40] "1" "2" "3" "4" ...
+##   .. ..$ : NULL
+##  $ Y: num [1:40, 1:2] -1.203 -1.25 -0.831 0.338 -0.119 ...
+##   ..- attr(*, "dimnames")=List of 2
+##   .. ..$ : chr [1:40] "1" "2" "3" "4" ...
+##   .. ..$ : NULL
+
PCx <- 1
+PCy <- 2
+par(mfrow=c(1,2), las=1, mar=c(4,3,1,1))
+plot(cca.res$variates$X[, PCx], cca.res$variates$X[, PCy], xlab=PCx, ylab=PCy, main="X", type='n')
+text(cca.res$variates$X[, PCx], cca.res$variates$X[, PCy], rownames(cca.res$variates$X), col='blue', cex=0.6)
+plot(cca.res$variates$Y[, PCx], cca.res$variates$Y[, PCy], xlab=PCx, ylab=PCy, main="Y", type='n')
+text(cca.res$variates$Y[, PCx], cca.res$variates$Y[, PCy], rownames(cca.res$variates$Y), col='blue', cex=0.6)
+cor(cca.res$variates$X[,1], cca.res$variates$Y[,1])
+
## [1] 1
+
cor(cca.res$variates$X[,2], cca.res$variates$Y[,2])
+
## [1] 1
+
plotIndiv(cca.res)
+plotArrow(cca.res)
+

Variable relationship is obtained from loadings or with plotVar.

+
par(mfrow=c(1,2), las=2, mar=c(4,8,1,1))
+loadings.ind.X <- order(abs(cca.res$loadings$X[, 1]), decreasing = T)
+barplot(head(cca.res$loadings$X[loadings.ind.X, 1], 10), main="X", horiz = T, cex.names=0.8)
+loadings.ind.Y <- order(abs(cca.res$loadings$Y[, 1]), decreasing = T)
+barplot(head(cca.res$loadings$Y[loadings.ind.Y, 1], 10), main="Y", horiz = T, cex.names=0.8)
+max(abs(cca.res$variates$X - scale(cca.res$X, center=T, scale=F) %*% cca.res$loadings$X))
+max(abs(cca.res$variates$Y - scale(cca.res$Y, center=T, scale=F) %*% cca.res$loadings$Y))
+plotVar(cca.res)

2. Perform CCA with scaled datasets and observe the difference

+
cca.res.scale <- rcc(X=scale(nutrimouse$gene_selected, center=T, scale=T), 
+                     Y=scale(nutrimouse$lipid, center=T, scale=T), ncomp=2)
+max(abs(cca.res.scale$cor - cca.res$cor))
+
## [1] 1.149648e-10
+
max(abs(cca.res.scale$variates$X - cca.res$variates$X))
+
## [1] 2.64486
+
max(abs(cca.res.scale$variates$Y - cca.res$variates$Y))
+
## [1] 2.64486
+
max(abs(cca.res.scale$loadings$X - cca.res$loadings$X))
+
## [1] 7.845927
+
max(abs(cca.res.scale$loadings$Y - cca.res$loadings$Y))
+
## [1] 123.5061

3. Perform regularized CCA with all genes and lipids.

+
rcca.res <- rcc(X=nutrimouse$gene, Y=nutrimouse$lipid, ncomp=2, method="shrinkage")
+plotVar(rcca.res)
+