diff --git a/Servier_2024_cours_FM.pdf b/Servier_2024_cours_FM.pdf new file mode 100644 index 0000000..27e7ccd Binary files /dev/null and b/Servier_2024_cours_FM.pdf differ diff --git a/Supervised_multiblock_analyses.Rmd b/Supervised_multiblock_analyses.Rmd new file mode 100644 index 0000000..48ec4f9 --- /dev/null +++ b/Supervised_multiblock_analyses.Rmd @@ -0,0 +1,240 @@ +--- +title: "Supervised Multiblock analyses" +author: "Florence Mehl" +date: "`r format(Sys.time(), '%B %d, %Y')`" +output: + html_document: + toc: TRUE + theme: paper + higlight: tango + code_folding: "hide" + +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) + +library(CCA) +library(ggplot2) +library(ggrepel) +library(reshape2) +library(ConsensusOPLS) + +``` + +# Nutrimouse dataset + +The data sets come from a nutrigenomic study in the mouse (Martin et al., 2007, https://aasldpubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/hep.21510) in which the effects of five regimens with contrasted fatty acid compositions on liver lipids and hepatic gene expression in mice were considered. + +Two sets of variables were acquired on forty mice: +- genes: expressions of 120 genes measured in liver cells, selected (among about 30,000) as potentially relevant in the context of the nutrition study. These expressions come from a nylon macroarray with radioactive labelling +- lipids: concentrations (in percentages) of 21 hepatic fatty acids measured by gas chromatography + +Biological units (mice) were cross-classified according to two factors experimental design (4 replicates): +- genotype: 2-levels factor, wild-type (WT) and PPARalpha -/- (PPAR) +- diet: 5-levels factor. Oils used for experimental diets preparation were corn and colza oils (50/50) for a reference diet (REF), hydrogenated coconut oil for a saturated fatty acid diet (COC), sunflower oil for an Omega6 fatty acid-rich diet (SUN), linseed oil for an Omega3-rich diet (LIN) and corn/colza/enriched fish oils for the FISH diet (43/43/14) + +```{r import and format dataset, message=FALSE, warning=FALSE} + +data("nutrimouse") +genes <- nutrimouse$gene +lipids <- nutrimouse$lipid +metadata <- data.frame(genotype = nutrimouse$genotype, diet = nutrimouse$diet) +metadata$sample_name <- paste0(rownames(metadata), "_", metadata$genotype, "_", metadata$diet) +rownames(genes) <- metadata$sample_name +rownames(lipids) <- metadata$sample_name + +``` + +# Consensus OPLS Discriminant analysis of genotypes + +## Question 1: based on lipids and genes data, can we discriminate wt vs ppar samples ? + +Run ConsensusOPLS-DA analysis with ConsensusOPLS() from ConsensusOPLS package + +```{r consensusOPLS-DA, message=FALSE, warning=FALSE, fig.align='center'} + +COPLS_data <- list(genes=as.matrix(genes), lipids=as.matrix(lipids)) +COPLS_data <- lapply(COPLS_data, scale) +genotype <- metadata$genotype +dummy_genotype <- as.matrix(data.frame(wt = ifelse(genotype == "wt", 1, 0),ppar = ifelse(genotype == "ppar", 1, 0))) + +COPLS_res <- ConsensusOPLS( + data = COPLS_data, + Y = dummy_genotype, + maxPcomp = 1, + maxOcomp = 1, + modelType = "da", + cvType = "nfold", + nfold = 40, + nperm = 100, + verbose = T) + +COPLS_res +``` + +## Question 2: Is the model statistically significant? + +The results of permutations can be found in `COPLS_res$permStats`. +The results for the optimal model can be found in `COPLS_res$optimal$modelCV` and `COPLS_res$optimal$modelCV$cv` + +- plot Q2 permutations +- plot DQ2 permutations +- plot R2Y permutations + +```{r permutations, message=FALSE, warning=FALSE, fig.align='center'} + +# package's functions +####################### + +# plot Q2 values from permutation tests histogram +plotQ2(COPLS_res) +# plot Q2 values from permutation tests histogram +plotDQ2(COPLS_res) +# plot R2Y values from permutation tests histogram +plotR2(COPLS_res) + + +# or draw your own plots +####################### + +Q2perm <- data.frame(Q2perm = COPLS_res@permStats$Q2Y) + +ggplot(data = Q2perm, aes(x = Q2perm)) + + geom_histogram(color="grey", fill="grey") + + geom_vline(aes(xintercept=COPLS_res@Q2["po1"]),color="blue", linetype="dashed", size=1) + + theme_classic() + + ggtitle("Q2 Permutation test") + +DQ2perm <- data.frame(DQ2perm = COPLS_res@permStats$DQ2Y) + +ggplot(data = DQ2perm, aes(x = DQ2perm)) + + geom_histogram(color="grey", fill="grey") + + geom_vline(aes(xintercept=COPLS_res@DQ2["po1"]),color="blue", linetype="dashed", size=1) + + theme_classic() + + ggtitle("DQ2 Permutation test") + +R2Yperm <- data.frame(R2Yperm = COPLS_res@permStats$R2Y) + +ggplot(data = R2Yperm, aes(x = R2Yperm)) + + geom_histogram(color="grey", fill="grey") + + geom_vline(aes(xintercept=COPLS_res@R2Y["po1"]),color="blue", linetype="dashed", size=1) + + theme_classic() + + ggtitle("R2Y Permutation test") + +``` + +## Question 3: What is the contribution of each data block? + +- plot blockContribution of the optimal model + +```{r consensus OPLS-DA contributions, message=FALSE, warning=FALSE, fig.align='center'} + +# package's function +####################### + +# plot Q2 values from permutation tests histogram +plotContribution(COPLS_res) + +# or draw your own plot +####################### + +contributions <- COPLS_res@blockContribution +contributions <- melt(contributions) +colnames(contributions) <- c("dataset", "Dim", "value") + +ggplot(contributions, aes(x=Dim, y=value, fill=dataset)) + + geom_bar(stat = "identity", position=position_dodge()) + + theme_light() + + labs(x = "global components", y = "specific weights of block on global components", fill = "omic", + title = "contributions") + +``` + +## Question 4: Show the distribution of samples in the space of the predictive and orthogonal latent variables? + +- plot scores of the optimal model + +```{r consensus OPLS-DA scores, message=FALSE, warning=FALSE, fig.align='center'} + +# package's functions +####################### + +# plot Q2 values from permutation tests histogram +plotScores(COPLS_res) + +# or draw your own plot +####################### + +scores <- data.frame(metadata, COPLS_res@scores) + +ggplot(scores, aes(x=p_1, y=o_1, col=diet, shape = genotype)) + + geom_point(size=4) + + labs(x="Predictive", + y="Orthogonal", + title = "scores plots on predictive vs orthogonal latent variables") + + theme_light() + +``` + +## Question 5: Show the loadings of variables in the space of the predictive and orthogonal latent variables? + +- plot loadings of the optimal model + +```{r consensus OPLS-DA loadings, message=FALSE, warning=FALSE, fig.align='center'} + +# package's functions +####################### + +# plot Q2 values from permutation tests histogram +plotLoadings(COPLS_res) + +# or draw your own plot +####################### + +loadings <- rbind.data.frame(COPLS_res@loadings$genes, COPLS_res@loadings$lipids) +loadings$dataset <- c(rep("genes", nrow(COPLS_res@loadings$genes)), rep("lipids", nrow(COPLS_res@loadings$lipids))) +loadings$variable <- rownames(loadings) + +ggplot(loadings, aes(x=p_1, y=o_1, col=dataset, label = variable)) + + geom_point(size=2) + + labs(x="Predictive", + y="Orthogonal", + title = "loadings plots on predictive vs orthogonal latent variables") + + geom_text_repel() + + theme_light() + +``` + +## Question 6: Show the importance of variables in the model? + +- plot loadings and VIP of the optimal model + +```{r consensus OPLS-DA VIP loadings, message=FALSE, warning=FALSE, fig.align='center'} + +# package's functions +####################### + +# plot Q2 values from permutation tests histogram +plotVIP(COPLS_res) + +# or draw your own plot +####################### + +VIP <- data.frame(VIP = c(COPLS_res@VIP$genes[,"p"], COPLS_res@VIP$lipids[,"p"]), variable = c(rownames(COPLS_res@VIP$genes), rownames(COPLS_res@VIP$lipids))) + +loadings_VIP <- merge(loadings, VIP, by="variable") +loadings_VIP$label <- ifelse(loadings_VIP$VIP > 1, loadings_VIP$variable, NA) + +ggplot(loadings_VIP, aes(x=p_1, y=VIP, col=dataset, label = label)) + + geom_point(size=2) + + labs(x="Predictive", + y="VIP", + title = "VIP vs loadings on predictive latent variable") + + geom_text_repel(size=3, max.overlaps = 50, segment.size=.1) + + theme_light() + +``` + + + diff --git a/Supervised_multiblock_analyses.html b/Supervised_multiblock_analyses.html new file mode 100644 index 0000000..b07938e --- /dev/null +++ b/Supervised_multiblock_analyses.html @@ -0,0 +1,777 @@ + + + + + + + + + + + + + + + +Supervised Multiblock analyses + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+ +
+ +
+

Nutrimouse dataset

+

The data sets come from a nutrigenomic study in the mouse (Martin et +al., 2007, https://aasldpubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/hep.21510) +in which the effects of five regimens with contrasted fatty acid +compositions on liver lipids and hepatic gene expression in mice were +considered.

+

Two sets of variables were acquired on forty mice: - genes: +expressions of 120 genes measured in liver cells, selected (among about +30,000) as potentially relevant in the context of the nutrition study. +These expressions come from a nylon macroarray with radioactive +labelling - lipids: concentrations (in percentages) of 21 hepatic fatty +acids measured by gas chromatography

+

Biological units (mice) were cross-classified according to two +factors experimental design (4 replicates): - genotype: 2-levels factor, +wild-type (WT) and PPARalpha -/- (PPAR) - diet: 5-levels factor. Oils +used for experimental diets preparation were corn and colza oils (50/50) +for a reference diet (REF), hydrogenated coconut oil for a saturated +fatty acid diet (COC), sunflower oil for an Omega6 fatty acid-rich diet +(SUN), linseed oil for an Omega3-rich diet (LIN) and corn/colza/enriched +fish oils for the FISH diet (43/43/14)

+
data("nutrimouse")
+genes <- nutrimouse$gene
+lipids <- nutrimouse$lipid
+metadata <- data.frame(genotype = nutrimouse$genotype, diet = nutrimouse$diet)
+metadata$sample_name <- paste0(rownames(metadata), "_", metadata$genotype, "_", metadata$diet)
+rownames(genes) <- metadata$sample_name
+rownames(lipids) <- metadata$sample_name
+
+
+

Consensus OPLS Discriminant analysis of genotypes

+
+

Question 1: based on lipids and genes data, can we discriminate wt +vs ppar samples ?

+

Run ConsensusOPLS-DA analysis with ConsensusOPLS() from ConsensusOPLS +package

+
COPLS_data <- list(genes=as.matrix(genes), lipids=as.matrix(lipids))
+COPLS_data <- lapply(COPLS_data, scale)
+genotype <- metadata$genotype
+dummy_genotype <- as.matrix(data.frame(wt = ifelse(genotype == "wt", 1, 0),ppar = ifelse(genotype == "ppar", 1, 0)))
+
+COPLS_res <- ConsensusOPLS(
+  data = COPLS_data,
+  Y = dummy_genotype,
+  maxPcomp = 1,
+  maxOcomp = 1,
+  modelType = "da",
+  cvType = "nfold",
+  nfold = 40,
+  nperm = 100,
+  verbose = T)
+
+COPLS_res
+
## ***Optimal Consensus OPLS model***
+## 
+## Mode:  da 
+## 
+## Number of predictive components:  1 
+## 
+## Number of orthogonal components:  1 
+## 
+## Block contribution:
+##              p_1       o_1
+## genes  0.4393358 0.4482196
+## lipids 0.5606642 0.5517804
+## 
+## Explained variance R2 in response: 0.9506812 
+## 
+## Predictive ability (cross validation Q2):  0.9256023
+
+
+

Question 2: Is the model statistically significant?

+

The results of permutations can be found in +COPLS_res$permStats. The results for the optimal model can +be found in COPLS_res$optimal$modelCV and +COPLS_res$optimal$modelCV$cv

+
    +
  • plot Q2 permutations
  • +
  • plot DQ2 permutations
  • +
  • plot R2Y permutations
  • +
+
# package's functions
+#######################
+
+# plot Q2 values from permutation tests histogram
+plotQ2(COPLS_res)
+

+
# plot Q2 values from permutation tests histogram
+plotDQ2(COPLS_res)
+

+
# plot R2Y values from permutation tests histogram
+plotR2(COPLS_res)
+

+
# or draw your own plots
+#######################
+
+Q2perm <- data.frame(Q2perm = COPLS_res@permStats$Q2Y)
+
+ggplot(data = Q2perm, aes(x = Q2perm)) +
+  geom_histogram(color="grey", fill="grey") +
+  geom_vline(aes(xintercept=COPLS_res@Q2["po1"]),color="blue", linetype="dashed", size=1) +
+  theme_classic() +
+  ggtitle("Q2 Permutation test")
+

+
DQ2perm <- data.frame(DQ2perm = COPLS_res@permStats$DQ2Y)
+
+ggplot(data = DQ2perm, aes(x = DQ2perm)) +
+  geom_histogram(color="grey", fill="grey") +
+  geom_vline(aes(xintercept=COPLS_res@DQ2["po1"]),color="blue", linetype="dashed", size=1) +
+  theme_classic() +
+  ggtitle("DQ2 Permutation test")
+

+
R2Yperm <- data.frame(R2Yperm = COPLS_res@permStats$R2Y)
+
+ggplot(data = R2Yperm, aes(x = R2Yperm)) +
+  geom_histogram(color="grey", fill="grey") +
+  geom_vline(aes(xintercept=COPLS_res@R2Y["po1"]),color="blue", linetype="dashed", size=1) +
+  theme_classic() +
+  ggtitle("R2Y Permutation test")
+

+
+
+

Question 3: What is the contribution of each data block?

+
    +
  • plot blockContribution of the optimal model
  • +
+
# package's function
+#######################
+
+# plot Q2 values from permutation tests histogram
+plotContribution(COPLS_res)
+

+
# or draw your own plot
+#######################
+
+contributions <- COPLS_res@blockContribution
+contributions <- melt(contributions)
+colnames(contributions) <- c("dataset", "Dim", "value")
+
+ggplot(contributions, aes(x=Dim, y=value, fill=dataset)) +
+  geom_bar(stat = "identity", position=position_dodge()) +
+  theme_light() +
+  labs(x = "global components", y = "specific weights of block on global components", fill = "omic",
+       title = "contributions")
+

+
+
+

Question 4: Show the distribution of samples in the space of the +predictive and orthogonal latent variables?

+
    +
  • plot scores of the optimal model
  • +
+
# package's functions
+#######################
+
+# plot Q2 values from permutation tests histogram
+plotScores(COPLS_res)
+

+
# or draw your own plot
+#######################
+
+scores <- data.frame(metadata, COPLS_res@scores)
+
+ggplot(scores, aes(x=p_1, y=o_1, col=diet, shape = genotype)) +
+  geom_point(size=4) +
+  labs(x="Predictive",
+       y="Orthogonal",
+       title = "scores plots on predictive vs orthogonal latent variables") +
+  theme_light()
+

+
+
+

Question 5: Show the loadings of variables in the space of the +predictive and orthogonal latent variables?

+
    +
  • plot loadings of the optimal model
  • +
+
# package's functions
+#######################
+
+# plot Q2 values from permutation tests histogram
+plotLoadings(COPLS_res)
+

+
# or draw your own plot
+#######################
+
+loadings <- rbind.data.frame(COPLS_res@loadings$genes, COPLS_res@loadings$lipids)
+loadings$dataset <- c(rep("genes", nrow(COPLS_res@loadings$genes)), rep("lipids", nrow(COPLS_res@loadings$lipids)))
+loadings$variable <- rownames(loadings)
+
+ggplot(loadings, aes(x=p_1, y=o_1, col=dataset, label = variable)) +
+  geom_point(size=2) +
+  labs(x="Predictive",
+       y="Orthogonal",
+       title = "loadings plots on predictive vs orthogonal latent variables") +
+  geom_text_repel() +
+  theme_light()
+

+
+
+

Question 6: Show the importance of variables in the model?

+
    +
  • plot loadings and VIP of the optimal model
  • +
+
# package's functions
+#######################
+
+# plot Q2 values from permutation tests histogram
+plotVIP(COPLS_res)
+

+
# or draw your own plot
+#######################
+
+VIP <- data.frame(VIP = c(COPLS_res@VIP$genes[,"p"], COPLS_res@VIP$lipids[,"p"]), variable = c(rownames(COPLS_res@VIP$genes), rownames(COPLS_res@VIP$lipids)))
+
+loadings_VIP <- merge(loadings, VIP, by="variable")
+loadings_VIP$label <- ifelse(loadings_VIP$VIP > 1, loadings_VIP$variable, NA)
+
+ggplot(loadings_VIP, aes(x=p_1, y=VIP, col=dataset, label = label)) +
+  geom_point(size=2) +
+  labs(x="Predictive",
+       y="VIP",
+       title = "VIP vs loadings on predictive latent variable") +
+  geom_text_repel(size=3, max.overlaps = 50, segment.size=.1) +
+  theme_light()
+

+
+
+ + + + +
+ + + + + + + + + + + + + + + + diff --git a/Unsupervised_multiblock_analyses.Rmd b/Unsupervised_multiblock_analyses.Rmd new file mode 100644 index 0000000..8e45a18 --- /dev/null +++ b/Unsupervised_multiblock_analyses.Rmd @@ -0,0 +1,289 @@ +--- +title: "Unsupervised Multiblock analyses" +author: "Florence Mehl" +date: "`r format(Sys.time(), '%B %d, %Y')`" +output: + html_document: + toc: yes + theme: paper + higlight: tango + code_folding: hide + pdf_document: + toc: yes +--- + +```{r setup, include=FALSE} + +knitr::opts_chunk$set(echo = TRUE) + +library(MBAnalysis) +library(CCA) +library(ggplot2) +library(ggrepel) +library(reshape2) + +``` + +# Nutrimouse dataset + +The data sets come from a nutrigenomic study in the mouse (Martin et al., 2007, https://aasldpubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/hep.21510) in which the effects of five regimens with contrasted fatty acid compositions on liver lipids and hepatic gene expression in mice were considered. + +Two sets of variables were acquired on forty mice: +- genes: expressions of 120 genes measured in liver cells, selected (among about 30,000) as potentially relevant in the context of the nutrition study. These expressions come from a nylon macroarray with radioactive labelling +- lipids: concentrations (in percentages) of 21 hepatic fatty acids measured by gas chromatography + +Biological units (mice) were cross-classified according to two factors experimental design (4 replicates): +- genotype: 2-levels factor, wild-type (WT) and PPARalpha -/- (PPAR) +- diet: 5-levels factor. Oils used for experimental diets preparation were corn and colza oils (50/50) for a reference diet (REF), hydrogenated coconut oil for a saturated fatty acid diet (COC), sunflower oil for an Omega6 fatty acid-rich diet (SUN), linseed oil for an Omega3-rich diet (LIN) and corn/colza/enriched fish oils for the FISH diet (43/43/14) + +```{r import and format dataset, message=FALSE, warning=FALSE} + +data("nutrimouse") +genes <- nutrimouse$gene +lipids <- nutrimouse$lipid +metadata <- data.frame(genotype = nutrimouse$genotype, diet = nutrimouse$diet) +metadata$sample_name <- paste0(rownames(metadata), "_", metadata$genotype, "_", metadata$diet) +rownames(genes) <- metadata$sample_name +rownames(lipids) <- metadata$sample_name + +``` + +# Analysis of complete dataset + +## Question 1: based on lipids and genes data, do we observe clusters of samples ? + +Prepare dataset +- concatenate genes and lipids dataframes +- define the number of variables of both block + +Run ComDim analysis +- use ComDim() from MBAnalysis package + +```{r comdim all samples, message=FALSE, warning=FALSE} + +# prepare dataset +ComDim_data <- cbind.data.frame(genes, lipids) +n_group <- c(dim(genes)[[2]], dim(lipids)[[2]]) + +# run analysis +ComDim_res <- ComDim(X = ComDim_data, + block = n_group, + name.block = c("genes", "lipids"), + scale = T, + scale.block = T) +``` + +## Question 2: how do both blocks contribute to each dimension? + +- plot saliences + +```{r comdim all samples saliences, fig.align='center', message=FALSE, warning=FALSE} + +# saliences + +saliences <- ComDim_res$saliences +rownames(saliences) <- c("genes", "lipids") +saliences <- as.data.frame(t(saliences[,1:4])) +saliences$Dim <- rownames(saliences) +saliences <- melt(saliences) + +ggplot(saliences, aes(x=Dim, y=value, fill=variable)) + + geom_bar(stat = "identity", position=position_dodge()) + + theme_light() + + labs(x = "global components", + y = "specific weights of block on global components", + fill = "omic", + title = "saliences") + +``` + +## Question 3: observe the samples distributions in the space of the common dimensions, what are the main sources of variation? + +- plot scores (Scor.g) on Dim.1 vs Dim.2 with percentages of explained variance on axes +- plot scores (Scor.g) on Dim.3 vs Dim.4 with percentages of explained variance on axes + +```{r comdim all samples scores, message=FALSE, warning=FALSE, fig.align='center'} + +scores <- data.frame(metadata, ComDim_res$Scor.g) + +ggplot(scores, aes(x=Dim.1, y=Dim.2, col=diet, shape = genotype)) + + geom_point(size=4) + + labs(x=paste0("Dim.1 - ", ComDim_res$cumexplained[1,"%explX"], "%"), + y=paste0("Dim.2 - ", ComDim_res$cumexplained[2,"%explX"], "%"), + title = "scores plots on Dim.1 Dim.2") + + theme_light() + +ggplot(scores, aes(x=Dim.3, y=Dim.4, col=diet, shape = genotype)) + + geom_point(size=4) + + labs(x=paste0("Dim.3 - ", ComDim_res$cumexplained[3,"%explX"], "%"), + y=paste0("Dim.4 - ", ComDim_res$cumexplained[4,"%explX"], "%"), + title = "scores plots on Dim.3 Dim.4") + +theme_light() + +``` + +## Question 4: which genes and lipids are responsible of the samples differences? + +- plot loadings (Load.g) on Dim.1 vs Dim.2 with percentages of explained variance on axes +- plot loadings (Load.g) on Dim.3 vs Dim.4 with percentages of explained variance on axes + +```{r comdim all samples loadings, message=FALSE, warning=FALSE, fig.align='center'} + +loadings <- data.frame(ComDim_res$Load.g) +loadings$omic <- c(rep("genes", dim(genes)[[2]]), rep("lipids", dim(lipids)[[2]])) +loadings$variable <- rownames(loadings) + +ggplot(loadings, aes(x=Dim.1, y=Dim.2, col=omic, label=variable)) + + geom_point() + + geom_text_repel() + + labs(x=paste0("Dim.1 - ", ComDim_res$cumexplained[1,"%explX"], "%"), + y=paste0("Dim.2 - ", ComDim_res$cumexplained[2,"%explX"], "%"), + title = "loadings plots on Dim.1 Dim.2") + + theme_light() + +ggplot(loadings, aes(x=Dim.3, y=Dim.4, col=omic, label=variable)) + + geom_point() + + geom_text_repel() + + labs(x=paste0("Dim.3 - ", ComDim_res$cumexplained[3,"%explX"], "%"), + y=paste0("Dim.4 - ", ComDim_res$cumexplained[4,"%explX"], "%"), + title = "loadings plots on Dim.3 Dim.4") + + theme_light() + +``` + +# Repeat analysis for wt samples only + +```{r comdim wt only, message=FALSE, warning=FALSE, fig.align='center'} + +# prepare dataset +wt_samples <- metadata$sample_name[metadata$genotype == "wt"] +wt_ComDim_data <- cbind.data.frame(genes[wt_samples,], lipids[wt_samples,]) +n_group <- c(dim(genes)[[2]], dim(lipids)[[2]]) + +# run analysis +wt_ComDim_res <- ComDim(X = wt_ComDim_data, + block = n_group, + name.block = c("genes", "lipids"), + scale = T, + scale.block = T) + +# saliences +wt_saliences <- wt_ComDim_res$saliences +rownames(wt_saliences) <- c("genes", "lipids") +wt_saliences <- as.data.frame(t(wt_saliences[,1:4])) +wt_saliences$Dim <- rownames(wt_saliences) +wt_saliences <- melt(wt_saliences) +ggplot(wt_saliences, aes(x=Dim, y=value, fill=variable)) + + geom_bar(stat = "identity", position=position_dodge()) + + theme_light() + + labs(x = "global components", + y = "specific weights of block on global components", + fill = "omic", + title = "saliences") + +# scores plots +wt_scores <- data.frame(metadata[metadata$genotype == "wt",], wt_ComDim_res$Scor.g) +ggplot(wt_scores, aes(x=Dim.1, y=Dim.2, col=diet, shape = genotype)) + + geom_point(size=4) + + labs(x=paste0("Dim.1 - ", wt_ComDim_res$cumexplained[1,"%explX"], "%"), + y=paste0("Dim.2 - ", wt_ComDim_res$cumexplained[2,"%explX"], "%"), + title = "scores plots on Dim.1 Dim.2") + +theme_light() +ggplot(wt_scores, aes(x=Dim.3, y=Dim.4, col=diet, shape = genotype)) + + geom_point(size=4) + + labs(x=paste0("Dim.3 - ", wt_ComDim_res$cumexplained[3,"%explX"], "%"), + y=paste0("Dim.4 - ", wt_ComDim_res$cumexplained[4,"%explX"], "%"), + title = "scores plots on Dim.3 Dim.4") + +theme_light() + +# loadings plots +wt_loadings <- data.frame(wt_ComDim_res$Load.g) +wt_loadings$omic <- c(rep("genes", dim(genes)[[2]]), rep("lipids", dim(lipids)[[2]])) +wt_loadings$variable <- rownames(wt_loadings) +ggplot(wt_loadings, aes(x=Dim.1, y=Dim.2, col=omic, label=variable)) + + geom_point() + + geom_text_repel() + + labs(x=paste0("Dim.1 - ", wt_ComDim_res$cumexplained[1,"%explX"], "%"), + y=paste0("Dim.2 - ", wt_ComDim_res$cumexplained[2,"%explX"], "%"), + title = "loadings plots on Dim.1 Dim.2") + + theme_light() + +ggplot(wt_loadings, aes(x=Dim.3, y=Dim.4, col=omic, label=variable)) + + geom_point() + + geom_text_repel() + + labs(x=paste0("Dim.3 - ", wt_ComDim_res$cumexplained[3,"%explX"], "%"), + y=paste0("Dim.4 - ", wt_ComDim_res$cumexplained[4,"%explX"], "%"), + title = "loadings plots on Dim.3 Dim.4") + + theme_light() + +``` + +# Repeat analysis for ppar samples only + +```{r comdim ppar only, message=FALSE, warning=FALSE, fig.align='center'} + +# prepare dataset +ppar_samples <- metadata$sample_name[metadata$genotype == "ppar"] +ppar_ComDim_data <- cbind.data.frame(genes[ppar_samples,], lipids[ppar_samples,]) +n_group <- c(dim(genes)[[2]], dim(lipids)[[2]]) + +# run analysis +ppar_ComDim_res <- ComDim(X = ppar_ComDim_data, + block = n_group, + name.block = c("genes", "lipids"), + scale = T, + scale.block = T) + +# saliences +ppar_saliences <- ppar_ComDim_res$saliences +rownames(ppar_saliences) <- c("genes", "lipids") +ppar_saliences <- as.data.frame(t(ppar_saliences[,1:4])) +ppar_saliences$Dim <- rownames(ppar_saliences) +ppar_saliences <- melt(ppar_saliences) +ggplot(ppar_saliences, aes(x=Dim, y=value, fill=variable)) + + geom_bar(stat = "identity", position=position_dodge()) + + theme_light() + + labs(x = "global components", + y = "specific weights of block on global components", + fill = "omic", + title = "saliences") + + +# scores plots +ppar_scores <- data.frame(metadata[metadata$genotype == "ppar",], ppar_ComDim_res$Scor.g) +ggplot(ppar_scores, aes(x=Dim.1, y=Dim.2, col=diet, shape = genotype)) + + geom_point(size=4) + + labs(x=paste0("Dim.1 - ", ppar_ComDim_res$cumexplained[1,"%explX"], "%"), + y=paste0("Dim.2 - ", ppar_ComDim_res$cumexplained[2,"%explX"], "%"), + title = "scores plots on Dim.1 Dim.2") + + theme_light() +ggplot(ppar_scores, aes(x=Dim.3, y=Dim.4, col=diet, shape = genotype)) + + geom_point(size=4) + + labs(x=paste0("Dim.3 - ", ppar_ComDim_res$cumexplained[3,"%explX"], "%"), + y=paste0("Dim.4 - ", ppar_ComDim_res$cumexplained[4,"%explX"], "%"), + title = "scores plots on Dim.3 Dim.4") + + theme_light() + +# loadings plots +ppar_loadings <- data.frame(ppar_ComDim_res$Load.g) +ppar_loadings$omic <- c(rep("genes", dim(genes)[[2]]), rep("lipids", dim(lipids)[[2]])) +ppar_loadings$variable <- rownames(ppar_loadings) +ggplot(ppar_loadings, aes(x=Dim.1, y=Dim.2, col=omic, label=variable)) + + geom_point() + + geom_text_repel() + + labs(x=paste0("Dim.1 - ", ppar_ComDim_res$cumexplained[1,"%explX"], "%"), + y=paste0("Dim.2 - ", ppar_ComDim_res$cumexplained[2,"%explX"], "%"), + title = "loadings plots on Dim.1 Dim.2") + + theme_light() + +ggplot(ppar_loadings, aes(x=Dim.3, y=Dim.4, col=omic, label=variable)) + + geom_point() + + geom_text_repel() + + labs(x=paste0("Dim.3 - ", ppar_ComDim_res$cumexplained[3,"%explX"], "%"), + y=paste0("Dim.4 - ", ppar_ComDim_res$cumexplained[4,"%explX"], "%"), + title = "loadings plots on Dim.3 Dim.4") + + theme_light() + +``` + + diff --git a/Unsupervised_multiblock_analyses.html b/Unsupervised_multiblock_analyses.html new file mode 100644 index 0000000..f8e69d1 --- /dev/null +++ b/Unsupervised_multiblock_analyses.html @@ -0,0 +1,805 @@ + + + + + + + + + + + + + + + +Unsupervised Multiblock analyses + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+ +
+ +
+

Nutrimouse dataset

+

The data sets come from a nutrigenomic study in the mouse (Martin et +al., 2007, https://aasldpubs.onlinelibrary.wiley.com/doi/pdfdirect/10.1002/hep.21510) +in which the effects of five regimens with contrasted fatty acid +compositions on liver lipids and hepatic gene expression in mice were +considered.

+

Two sets of variables were acquired on forty mice: - genes: +expressions of 120 genes measured in liver cells, selected (among about +30,000) as potentially relevant in the context of the nutrition study. +These expressions come from a nylon macroarray with radioactive +labelling - lipids: concentrations (in percentages) of 21 hepatic fatty +acids measured by gas chromatography

+

Biological units (mice) were cross-classified according to two +factors experimental design (4 replicates): - genotype: 2-levels factor, +wild-type (WT) and PPARalpha -/- (PPAR) - diet: 5-levels factor. Oils +used for experimental diets preparation were corn and colza oils (50/50) +for a reference diet (REF), hydrogenated coconut oil for a saturated +fatty acid diet (COC), sunflower oil for an Omega6 fatty acid-rich diet +(SUN), linseed oil for an Omega3-rich diet (LIN) and corn/colza/enriched +fish oils for the FISH diet (43/43/14)

+
data("nutrimouse")
+genes <- nutrimouse$gene
+lipids <- nutrimouse$lipid
+metadata <- data.frame(genotype = nutrimouse$genotype, diet = nutrimouse$diet)
+metadata$sample_name <- paste0(rownames(metadata), "_", metadata$genotype, "_", metadata$diet)
+rownames(genes) <- metadata$sample_name
+rownames(lipids) <- metadata$sample_name
+
+
+

Analysis of complete dataset

+
+

Question 1: based on lipids and genes data, do we observe clusters +of samples ?

+

Prepare dataset - concatenate genes and lipids dataframes - define +the number of variables of both block

+

Run ComDim analysis - use ComDim() from MBAnalysis package

+
# prepare dataset
+ComDim_data <- cbind.data.frame(genes, lipids)
+n_group <- c(dim(genes)[[2]], dim(lipids)[[2]])
+
+# run analysis
+ComDim_res <- ComDim(X = ComDim_data,
+                     block = n_group,
+                     name.block = c("genes", "lipids"),
+                     scale = T,
+                     scale.block = T)
+
+
+

Question 2: how do both blocks contribute to each dimension?

+
    +
  • plot saliences
  • +
+
# saliences
+
+saliences <- ComDim_res$saliences
+rownames(saliences) <- c("genes", "lipids")
+saliences <- as.data.frame(t(saliences[,1:4]))
+saliences$Dim <- rownames(saliences)
+saliences <- melt(saliences)
+
+ggplot(saliences, aes(x=Dim, y=value, fill=variable)) +
+  geom_bar(stat = "identity", position=position_dodge()) +
+  theme_light() +
+  labs(x = "global components",
+       y = "specific weights of block on global components", 
+       fill = "omic",
+       title = "saliences")
+

+
+
+

Question 3: observe the samples distributions in the space of the +common dimensions, what are the main sources of variation?

+
    +
  • plot scores (Scor.g) on Dim.1 vs Dim.2 with percentages of explained +variance on axes
  • +
  • plot scores (Scor.g) on Dim.3 vs Dim.4 with percentages of explained +variance on axes
  • +
+
scores <- data.frame(metadata, ComDim_res$Scor.g)
+
+ggplot(scores, aes(x=Dim.1, y=Dim.2, col=diet, shape = genotype)) +
+  geom_point(size=4) +
+  labs(x=paste0("Dim.1 - ", ComDim_res$cumexplained[1,"%explX"], "%"),
+       y=paste0("Dim.2 - ", ComDim_res$cumexplained[2,"%explX"], "%"),
+       title = "scores plots on Dim.1 Dim.2") +
+  theme_light()
+

+
ggplot(scores, aes(x=Dim.3, y=Dim.4, col=diet, shape = genotype)) +
+  geom_point(size=4) +
+  labs(x=paste0("Dim.3 - ", ComDim_res$cumexplained[3,"%explX"], "%"),
+       y=paste0("Dim.4 - ", ComDim_res$cumexplained[4,"%explX"], "%"),
+       title = "scores plots on Dim.3 Dim.4") +
+theme_light()
+

+
+
+

Question 4: which genes and lipids are responsible of the samples +differences?

+
    +
  • plot loadings (Load.g) on Dim.1 vs Dim.2 with percentages of +explained variance on axes
  • +
  • plot loadings (Load.g) on Dim.3 vs Dim.4 with percentages of +explained variance on axes
  • +
+
loadings <- data.frame(ComDim_res$Load.g)
+loadings$omic <- c(rep("genes", dim(genes)[[2]]), rep("lipids", dim(lipids)[[2]]))
+loadings$variable <- rownames(loadings)
+
+ggplot(loadings, aes(x=Dim.1, y=Dim.2, col=omic, label=variable)) +
+  geom_point() +
+  geom_text_repel() +
+  labs(x=paste0("Dim.1 - ", ComDim_res$cumexplained[1,"%explX"], "%"),
+       y=paste0("Dim.2 - ", ComDim_res$cumexplained[2,"%explX"], "%"),
+       title = "loadings plots on Dim.1 Dim.2") +
+  theme_light()
+

+
ggplot(loadings, aes(x=Dim.3, y=Dim.4, col=omic, label=variable)) +
+  geom_point() +
+  geom_text_repel() +
+  labs(x=paste0("Dim.3 - ", ComDim_res$cumexplained[3,"%explX"], "%"),
+       y=paste0("Dim.4 - ", ComDim_res$cumexplained[4,"%explX"], "%"),
+       title = "loadings plots on Dim.3 Dim.4") +
+  theme_light()
+

+
+
+
+

Repeat analysis for wt samples only

+
# prepare dataset
+wt_samples <- metadata$sample_name[metadata$genotype == "wt"]
+wt_ComDim_data <- cbind.data.frame(genes[wt_samples,], lipids[wt_samples,])
+n_group <- c(dim(genes)[[2]], dim(lipids)[[2]])
+
+# run analysis
+wt_ComDim_res <- ComDim(X = wt_ComDim_data,
+                     block = n_group,
+                     name.block = c("genes", "lipids"),
+                     scale = T,
+                     scale.block = T)
+
+# saliences
+wt_saliences <- wt_ComDim_res$saliences
+rownames(wt_saliences) <- c("genes", "lipids")
+wt_saliences <- as.data.frame(t(wt_saliences[,1:4]))
+wt_saliences$Dim <- rownames(wt_saliences)
+wt_saliences <- melt(wt_saliences)
+ggplot(wt_saliences, aes(x=Dim, y=value, fill=variable)) +
+  geom_bar(stat = "identity", position=position_dodge()) +
+  theme_light() +
+  labs(x = "global components",
+       y = "specific weights of block on global components",
+       fill = "omic", 
+       title = "saliences")
+

+
# scores plots
+wt_scores <- data.frame(metadata[metadata$genotype == "wt",], wt_ComDim_res$Scor.g)
+ggplot(wt_scores, aes(x=Dim.1, y=Dim.2, col=diet, shape = genotype)) +
+  geom_point(size=4) +
+  labs(x=paste0("Dim.1 - ", wt_ComDim_res$cumexplained[1,"%explX"], "%"),
+       y=paste0("Dim.2 - ", wt_ComDim_res$cumexplained[2,"%explX"], "%"),
+       title = "scores plots on Dim.1 Dim.2") +
+theme_light()
+

+
ggplot(wt_scores, aes(x=Dim.3, y=Dim.4, col=diet, shape = genotype)) +
+  geom_point(size=4) +
+  labs(x=paste0("Dim.3 - ", wt_ComDim_res$cumexplained[3,"%explX"], "%"),
+       y=paste0("Dim.4 - ", wt_ComDim_res$cumexplained[4,"%explX"], "%"),
+       title = "scores plots on Dim.3 Dim.4") +
+theme_light()
+

+
# loadings plots
+wt_loadings <- data.frame(wt_ComDim_res$Load.g)
+wt_loadings$omic <- c(rep("genes", dim(genes)[[2]]), rep("lipids", dim(lipids)[[2]]))
+wt_loadings$variable <- rownames(wt_loadings)
+ggplot(wt_loadings, aes(x=Dim.1, y=Dim.2, col=omic, label=variable)) +
+  geom_point() +
+  geom_text_repel() +
+  labs(x=paste0("Dim.1 - ", wt_ComDim_res$cumexplained[1,"%explX"], "%"),
+       y=paste0("Dim.2 - ", wt_ComDim_res$cumexplained[2,"%explX"], "%"),
+       title = "loadings plots on Dim.1 Dim.2") +
+  theme_light()
+

+
ggplot(wt_loadings, aes(x=Dim.3, y=Dim.4, col=omic, label=variable)) +
+  geom_point() +
+  geom_text_repel() +
+  labs(x=paste0("Dim.3 - ", wt_ComDim_res$cumexplained[3,"%explX"], "%"),
+       y=paste0("Dim.4 - ", wt_ComDim_res$cumexplained[4,"%explX"], "%"),
+       title = "loadings plots on Dim.3 Dim.4") +
+  theme_light()
+

+
+
+

Repeat analysis for ppar samples only

+
# prepare dataset
+ppar_samples <- metadata$sample_name[metadata$genotype == "ppar"]
+ppar_ComDim_data <- cbind.data.frame(genes[ppar_samples,], lipids[ppar_samples,])
+n_group <- c(dim(genes)[[2]], dim(lipids)[[2]])
+
+# run analysis
+ppar_ComDim_res <- ComDim(X = ppar_ComDim_data,
+                     block = n_group,
+                     name.block = c("genes", "lipids"),
+                     scale = T,
+                     scale.block = T)
+
+# saliences
+ppar_saliences <- ppar_ComDim_res$saliences
+rownames(ppar_saliences) <- c("genes", "lipids")
+ppar_saliences <- as.data.frame(t(ppar_saliences[,1:4]))
+ppar_saliences$Dim <- rownames(ppar_saliences)
+ppar_saliences <- melt(ppar_saliences)
+ggplot(ppar_saliences, aes(x=Dim, y=value, fill=variable)) +
+  geom_bar(stat = "identity", position=position_dodge()) +
+  theme_light() +
+  labs(x = "global components",
+       y = "specific weights of block on global components",
+       fill = "omic",
+       title = "saliences")
+

+
# scores plots
+ppar_scores <- data.frame(metadata[metadata$genotype == "ppar",], ppar_ComDim_res$Scor.g)
+ggplot(ppar_scores, aes(x=Dim.1, y=Dim.2, col=diet, shape = genotype)) +
+  geom_point(size=4) +
+  labs(x=paste0("Dim.1 - ", ppar_ComDim_res$cumexplained[1,"%explX"], "%"),
+       y=paste0("Dim.2 - ", ppar_ComDim_res$cumexplained[2,"%explX"], "%"),
+       title = "scores plots on Dim.1 Dim.2") +
+  theme_light()
+

+
ggplot(ppar_scores, aes(x=Dim.3, y=Dim.4, col=diet, shape = genotype)) +
+  geom_point(size=4) +
+  labs(x=paste0("Dim.3 - ", ppar_ComDim_res$cumexplained[3,"%explX"], "%"),
+       y=paste0("Dim.4 - ", ppar_ComDim_res$cumexplained[4,"%explX"], "%"),
+       title = "scores plots on Dim.3 Dim.4") +
+  theme_light()
+

+
# loadings plots
+ppar_loadings <- data.frame(ppar_ComDim_res$Load.g)
+ppar_loadings$omic <- c(rep("genes", dim(genes)[[2]]), rep("lipids", dim(lipids)[[2]]))
+ppar_loadings$variable <- rownames(ppar_loadings)
+ggplot(ppar_loadings, aes(x=Dim.1, y=Dim.2, col=omic, label=variable)) +
+  geom_point() +
+  geom_text_repel() +
+  labs(x=paste0("Dim.1 - ", ppar_ComDim_res$cumexplained[1,"%explX"], "%"),
+       y=paste0("Dim.2 - ", ppar_ComDim_res$cumexplained[2,"%explX"], "%"),
+       title = "loadings plots on Dim.1 Dim.2") +
+  theme_light()
+

+
ggplot(ppar_loadings, aes(x=Dim.3, y=Dim.4, col=omic, label=variable)) +
+  geom_point() +
+  geom_text_repel() +
+  labs(x=paste0("Dim.3 - ", ppar_ComDim_res$cumexplained[3,"%explX"], "%"),
+       y=paste0("Dim.4 - ", ppar_ComDim_res$cumexplained[4,"%explX"], "%"),
+       title = "loadings plots on Dim.3 Dim.4") +
+  theme_light()
+

+
+ + + + +
+ + + + + + + + + + + + + + + +