From c6832d30c94a187b3c12ea116af149d1c990362a Mon Sep 17 00:00:00 2001 From: ljacquin Date: Sat, 13 Apr 2024 12:38:31 +0200 Subject: [PATCH] feat: add manhattan plot to result outputs --- get_results_scans.sh | 54 ++- khamix.sh | 4 - ...ot_results_scans.R => get_results_scans.R} | 380 +++++++++++------- programs/get_significant_haplotypes.R | 112 ------ programs/plot_manhattan_scan.R | 220 ++++++++++ 5 files changed, 488 insertions(+), 282 deletions(-) rename programs/{plot_results_scans.R => get_results_scans.R} (66%) delete mode 100644 programs/get_significant_haplotypes.R create mode 100644 programs/plot_manhattan_scan.R diff --git a/get_results_scans.sh b/get_results_scans.sh index 1bd7e95..269a753 100755 --- a/get_results_scans.sh +++ b/get_results_scans.sh @@ -21,7 +21,7 @@ new_signif_level=0.001 if [ "$modify_signif_level"=true ] ; then echo "$new_signif_level" > signif_level.txt if [ -d results_genome_scan ]; then - echo rm -rf results_genome_scan/* + rm -rf results_genome_scan/* fi fi @@ -35,9 +35,9 @@ cp kernel_index.txt ../results_genome_scan cp signif_level.txt ../results_genome_scan cd ../programs/ -cp plot_results_scans.R ../results_genome_scan +cp get_results_scans.R ../results_genome_scan cp plot_nb_hap_scans.R ../results_genome_scan -cp get_significant_haplotypes.R ../results_genome_scan +cp plot_manhattan_scan.R ../results_genome_scan cd ../ #--------------------------------# @@ -53,10 +53,9 @@ do done cd results_genome_scan -R -q --vanilla < plot_results_scans.R +R -q --vanilla < get_results_scans.R R -q --vanilla < plot_nb_hap_scans.R -R -q --vanilla < get_significant_haplotypes.R - +R -q --vanilla < plot_manhattan_scan.R if [ "$kernel_index" -gt 1 ] ; then @@ -65,13 +64,15 @@ if [ "$kernel_index" -gt 1 ] ; then for chromo_num_k in $(seq $nb_chromosomes -1 1) do if [ -d results_chromo_num_$chromo_num_k ]; then - echo rm -rf results_chromo_num_$chromo_num_k/* + rm -rf results_chromo_num_$chromo_num_k/* else mkdir results_chromo_num_$chromo_num_k fi mv flanking_markers_of_tested_positions_in_kb_with_rlrt_value_on_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k mv flanking_markers_of_tested_positions_in_kb_with_significant_rlrt_value_on_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k - mv significant_haplotypes_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k + if ls significant_haplotypes_chromo_num_$chromo_num_k* 1> /dev/null 2>&1; then + mv significant_haplotypes_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k + fi mv vect_rlrt_value_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k mv vect_nb_hap_window_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k mv number_of_haplotypes_per_window_for_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k @@ -79,35 +80,39 @@ if [ "$kernel_index" -gt 1 ] ; then done if [ -d results_all_chromosomes ]; then - echo rm -rf results_all_chromosomes/* + rm -rf results_all_chromosomes/* else mkdir results_all_chromosomes fi mv number_of_haplotypes_per_window_for_complete_genome_scan* results_all_chromosomes mv kernelized_haplotype_based_genome_scan_for_* results_all_chromosomes - + mv flanking_markers_of_tested_positions_with_statistics* results_all_chromosomes + else for chromo_num_k in $(seq $nb_chromosomes -1 1) do if [ -d results_chromo_num_$chromo_num_k ]; then - echo rm -rf results_chromo_num_$chromo_num_k/* + rm -rf results_chromo_num_$chromo_num_k/* else mkdir results_chromo_num_$chromo_num_k fi mv markers_in_kb_with_rlrt_value_on_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k mv markers_in_kb_with_significant_rlrt_value_on_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k - mv significant_snps_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k + if ls significant_snps_chromo_num_$chromo_num_k* 1> /dev/null 2>&1; then + mv significant_snps_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k + fi mv vect_rlrt_value_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k mv kernelized_gwas_of_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k done if [ -d results_all_chromosomes ]; then - echo rm -rf results_all_chromosomes/* + rm -rf results_all_chromosomes/* else mkdir results_all_chromosomes fi mv kernelized_gwas_for_* results_all_chromosomes + mv markers_of_tested_positions_with_statistics.txt* results_all_chromosomes rm vect_nb_hap_window_chromo_num_* fi @@ -119,13 +124,15 @@ if [ "$kernel_index" -gt 1 ] ; then for chromo_num_k in $(seq $nb_chromosomes -1 1) do if [ -d results_chromo_num_$chromo_num_k ]; then - echo rm -rf results_chromo_num_$chromo_num_k/* + rm -rf results_chromo_num_$chromo_num_k/* else mkdir results_chromo_num_$chromo_num_k fi mv flanking_markers_of_tested_positions_in_kb_with_rlrt_value_on_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k - mv flanking_markers_of_tested_positions_in_kb_with_significant_rlrt_value_on_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k - mv significant_haplotypes_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k + mv flanking_markers_of_tested_positions_in_kb_with_significant_rlrt_value_on_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k + if ls significant_haplotypes_chromo_num_$chromo_num_k* 1> /dev/null 2>&1; then + mv significant_haplotypes_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k + fi mv vect_rlrt_value_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k mv vect_nb_hap_window_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k mv number_of_haplotypes_per_window_for_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k @@ -133,35 +140,39 @@ if [ "$kernel_index" -gt 1 ] ; then done if [ -d results_all_chromosomes ]; then - echo rm -rf results_all_chromosomes/* + rm -rf results_all_chromosomes/* else mkdir results_all_chromosomes fi mv number_of_haplotypes_per_window_for_complete_genome_scan* results_all_chromosomes mv haplotype_based_genome_scan_for_* results_all_chromosomes + mv flanking_markers_of_tested_positions_with_statistics* results_all_chromosomes else for chromo_num_k in $(seq $nb_chromosomes -1 1) do if [ -d results_chromo_num_$chromo_num_k ]; then - echo rm -rf results_chromo_num_$chromo_num_k/* + rm -rf results_chromo_num_$chromo_num_k/* else mkdir results_chromo_num_$chromo_num_k fi mv markers_in_kb_with_rlrt_value_on_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k mv markers_in_kb_with_significant_rlrt_value_on_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k - mv significant_snps_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k + if ls significant_snps_chromo_num_$chromo_num_k* 1> /dev/null 2>&1; then + mv significant_snps_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k + fi mv vect_rlrt_value_chromo_num_$chromo_num_k* results_chromo_num_$chromo_num_k mv gwas_of_chromosome_$chromo_num_k* results_chromo_num_$chromo_num_k done if [ -d results_all_chromosomes ]; then - echo rm -rf results_all_chromosomes/* + rm -rf results_all_chromosomes/* else mkdir results_all_chromosomes fi mv gwas_for_* results_all_chromosomes + mv markers_of_tested_positions_with_statistics.txt* results_all_chromosomes rm vect_nb_hap_window_chromo_num_* fi @@ -169,5 +180,8 @@ if [ "$kernel_index" -gt 1 ] ; then fi clear +rm *.R +rm *.txt + # End of program for reformatting results diff --git a/khamix.sh b/khamix.sh index 2f4938b..450748e 100755 --- a/khamix.sh +++ b/khamix.sh @@ -34,11 +34,9 @@ mkdir estimates_h0/ cp data_parameters/trait_name.txt estimates_h0/ cp data_parameters/phenotypes.txt estimates_h0/ if [ -f data_parameters/incidence_fixed_effects.txt ] ; then - echo cp data_parameters/incidence_fixed_effects.txt estimates_h0/ fi if [ -f data_parameters/incidence_polygenic_effects.txt ] ; then - echo cp data_parameters/incidence_polygenic_effects.txt estimates_h0/ fi cp data_parameters/kernel_index.txt estimates_h0/ @@ -94,12 +92,10 @@ do # perform genome scan by sliding window for chromosome $chromo_num_k if [ "$local_or_cluster" -gt 1 ] ; then - echo # qsub -q normal.q -v scan_chromo_num_k.sh # a cc2 queue # sbatch scan_chromo_num_k.sh qsub -q workq scan_chromo_num_k.sh else - echo ./scan_chromo_num_k.sh fi diff --git a/programs/plot_results_scans.R b/programs/get_results_scans.R similarity index 66% rename from programs/plot_results_scans.R rename to programs/get_results_scans.R index 44892c5..4f08ac8 100644 --- a/programs/plot_results_scans.R +++ b/programs/get_results_scans.R @@ -1,3 +1,5 @@ +library(data.table) + #---------------------------# # read files and parameters # #---------------------------# @@ -7,9 +9,11 @@ nb_snp_hap <- scan("nb_snp_hap.txt") kernel_index <- scan("kernel_index.txt") alpha <- as.numeric(scan("signif_level.txt")) +# define shift quantity shift_quantity <- (floor(nb_snp_hap / 2) - 1) shift_quantity +# get physical map data physical_map_matrix <- read.table("physical_map.txt", header = TRUE) marker_id <- physical_map_matrix[, match( "MkID", @@ -24,15 +28,29 @@ position_kb <- physical_map_matrix[, match( colnames(physical_map_matrix) )] +# get phased genotype data +phased_genotype_matrix <- as.data.frame( + fread("phased_genotypes.txt", + header = TRUE + ) +) +phased_genotype_matrix <- phased_genotype_matrix[, -match( + c("I", "ID"), + colnames(phased_genotype_matrix) +)] +phased_genotype_matrix <- t(phased_genotype_matrix) + + # set rlrt threshold set.seed(123) -simulated_rlrt_distribution <- 0.5 * (sort(rchisq(1e3, df = 1, ncp = 0)) -+ sort(rchisq(1e3, df = 2, ncp = 0))) +simulated_rlrt_distribution <- 0.5 * (sort(rchisq(1e6, df = 1, ncp = 0)) ++ sort(rchisq(1e6, df = 2, ncp = 0))) rlrt_threshold <- quantile(simulated_rlrt_distribution, 1 - alpha) # function to get p-values get_p_value <- function(distrib_, test_statistic_value_) { p_val_ <- sum(distrib_ > test_statistic_value_) / length(distrib_) + return(p_val_) } #-------------------------------------------------------------# @@ -44,9 +62,7 @@ if (kernel_index == 1) { # --------------------------------------# if (nb_snp_hap == 1) { pdf(paste0("gwas_for_", trait_name, ".pdf")) - par(mfrow = c(4, 3)) - for (chromo_num_k in 1:nb_chromosomes) { index_chrom_num <- which(repeated_chrom_num == chromo_num_k) @@ -78,6 +94,25 @@ if (kernel_index == 1) { ) markers_in_Kb_rlrt_value_chromo_num_k$Chr <- chromo_num_k + # set zero machine to avoid numerical instability + idx_zero_p_values <- which( + markers_in_Kb_rlrt_value_chromo_num_k$p_value == 0 + ) + if (length(idx_zero_p_values) > 0) { + markers_in_Kb_rlrt_value_chromo_num_k$p_value[ + idx_zero_p_values + ] <- 1e-16 + } + idx_zero_rlrt_values <- which( + markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value == 0 + ) + if (length(idx_zero_rlrt_values) > 0) { + markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value[ + idx_zero_rlrt_values + ] <- 1e-16 + } + + write.table(markers_in_Kb_rlrt_value_chromo_num_k, file = paste0( "markers_in_kb_with_rlrt_value_on_chromosome_", @@ -87,7 +122,6 @@ if (kernel_index == 1) { row.names = FALSE, quote = FALSE, sep = " " ) - write.table( markers_in_Kb_rlrt_value_chromo_num_k[ markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value >= @@ -115,11 +149,7 @@ if (kernel_index == 1) { cex.main = 0.8, cex.lab = 0.8 ) abline(h = rlrt_threshold, col = "red", lwd = 1) - } - dev.off() - for (chromo_num_k in 1:nb_chromosomes) - { pdf(paste0( "gwas_of_chromosome_", chromo_num_k, @@ -127,41 +157,49 @@ if (kernel_index == 1) { trait_name, ".pdf" )) - - index_chrom_num <- which(repeated_chrom_num == chromo_num_k) - position_kb_chrom_num <- as.numeric(as.character(position_kb[index_chrom_num])) - - rlrt_value <- scan(paste0( - "vect_rlrt_value_chromo_num_", - chromo_num_k, - ".txt" - )) - plot(position_kb_chrom_num, rlrt_value, type = "p", pch = 16, cex = 0.5, col = "black", - xlab = "Tested marker snp in Kb", ylab = "Restricted LRT", + xlab = "Tested marker snp in Kb", + ylab = "Restricted LRT", main = paste0( "GWAS of chromosome ", chromo_num_k, - " for ", + " \n for ", trait_name ), - cex.main = 1.0, - cex.lab = 1.0 + cex.main = 0.8, cex.lab = 0.8 ) abline(h = rlrt_threshold, col = "red", lwd = 1) - dev.off() + + # get significant snp + phased_genotypes_chromo_num_k <- phased_genotype_matrix[, index_chrom_num] + index_signif_rlrt_value <- which(rlrt_value >= rlrt_threshold) + + if (length(index_signif_rlrt_value) >= 1) { + for (m in index_signif_rlrt_value) + { + write.table(phased_genotypes_chromo_num_k[, m], + file = paste0( + "significant_snps_chromo_num_", + chromo_num_k, + "_snp_num_", + m, + ".txt" + ), + row.names = FALSE, col.names = FALSE, quote = FALSE, sep = " " + ) + } + } } + dev.off() } else { pdf(paste0( "haplotype_based_genome_scan_for_", trait_name, ".pdf" )) - par(mfrow = c(4, 3)) - for (chromo_num_k in 1:nb_chromosomes) { index_chrom_num <- which(repeated_chrom_num == chromo_num_k) @@ -178,11 +216,12 @@ if (kernel_index == 1) { scanned_position_kb <- rep(0, length(rlrt_value)) flank_markers_in_Kb_rlrt_value_chromo_num_k <- data.frame(matrix( - 0, length(rlrt_value), 8 + 0, length(rlrt_value), 9 )) colnames(flank_markers_in_Kb_rlrt_value_chromo_num_k) <- c( "left_flank_MkID_to_center", "Pos_in_Kb_left_flank_MkID", "right_flank_MkID_to_center", "Pos_in_Kb_right_flank_MkID", + "average_position_in_Kb_at_window_center", "Restricted_LRT_value", "p_value", "Window_starting_index", "Chr" ) @@ -202,6 +241,8 @@ if (kernel_index == 1) { flank_markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value[p] <- rlrt_value[p] + flank_markers_in_Kb_rlrt_value_chromo_num_k$average_position_in_Kb_at_window_center[p] <- scanned_position_kb[p] + flank_markers_in_Kb_rlrt_value_chromo_num_k$p_value[p] <- get_p_value( distrib_ = simulated_rlrt_distribution, test_statistic_value_ = rlrt_value[p] @@ -210,7 +251,25 @@ if (kernel_index == 1) { p <- p + 1 } flank_markers_in_Kb_rlrt_value_chromo_num_k$Chr <- chromo_num_k - + + # set zero machine to avoid numerical instability + idx_zero_p_values <- which( + flank_markers_in_Kb_rlrt_value_chromo_num_k$p_value == 0 + ) + if (length(idx_zero_p_values) > 0) { + flank_markers_in_Kb_rlrt_value_chromo_num_k$p_value[ + idx_zero_p_values + ] <- 1e-16 + } + idx_zero_rlrt_values <- which( + flank_markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value == 0 + ) + if (length(idx_zero_rlrt_values) > 0) { + flank_markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value[ + idx_zero_rlrt_values + ] <- 1e-16 + } + write.table( flank_markers_in_Kb_rlrt_value_chromo_num_k, file = paste0( @@ -249,11 +308,7 @@ if (kernel_index == 1) { cex.lab = 0.8 ) abline(h = rlrt_threshold, col = "red", lwd = 1) - } - dev.off() - for (chromo_num_k in 1:nb_chromosomes) - { pdf(paste0( "haplotype_based_genome_scan_of_chromosome_", chromo_num_k, @@ -261,56 +316,53 @@ if (kernel_index == 1) { trait_name, ".pdf" )) - - index_chrom_num <- which(repeated_chrom_num == chromo_num_k) - position_kb_chrom_num <- as.numeric(as.character(position_kb[index_chrom_num])) - - index_last_window <- (length(index_chrom_num) - (nb_snp_hap - 1)) - - rlrt_value <- scan(paste0( - "vect_rlrt_value_chromo_num_", - chromo_num_k, ".txt" - )) - scanned_position_kb <- rep(0, length(rlrt_value)) - - p <- 1 - for (k in 1:index_last_window) - { - scanned_position_kb[p] <- (abs(position_kb_chrom_num[shift_quantity + - (k + 1)] - position_kb_chrom_num[shift_quantity + k]) / 2) + - position_kb_chrom_num[shift_quantity + k] - p <- p + 1 - } - plot(scanned_position_kb, rlrt_value, type = "p", pch = 16, cex = 0.5, col = "black", xlab = "Tested position in Kb at the \n center of the sliding window", ylab = "Restricted LRT", main = paste0( - "Haplotype-based scan of chromosome ", - chromo_num_k, + "Haplotype-based scan of \n chromosome ", chromo_num_k, " for ", trait_name ), - cex.main = 1.0, - cex.lab = 1.0 - ) - abline( - h = rlrt_threshold, - col = "red", lwd = 1 + cex.main = 0.8, + cex.lab = 0.8 ) + abline(h = rlrt_threshold, col = "red", lwd = 1) dev.off() + + # get significant snp + phased_genotypes_chromo_num_k <- phased_genotype_matrix[, index_chrom_num] + index_signif_rlrt_value <- which(rlrt_value >= rlrt_threshold) + + if (length(index_signif_rlrt_value) >= 1) { + for (m in index_signif_rlrt_value) + { + write.table(phased_genotypes_chromo_num_k[, m:(m + (nb_snp_hap - 1))], + file = paste0( + "significant_haplotypes_chromo_num_", + chromo_num_k, + "_window_", + m, + ".txt" + ), + row.names = FALSE, + col.names = FALSE, + quote = FALSE, + sep = " " + ) + } + } } + dev.off() } } else { # ----------------# # Gaussian kernel # - #-----------------# + # ----------------# if (nb_snp_hap == 1) { pdf(paste0("kernelized_gwas_for_", trait_name, ".pdf")) - par(mfrow = c(4, 3)) - for (chromo_num_k in 1:nb_chromosomes) { index_chrom_num <- which(repeated_chrom_num == chromo_num_k) @@ -341,7 +393,25 @@ if (kernel_index == 1) { function(x) get_p_value(distrib_ = simulated_rlrt_distribution, x) ) markers_in_Kb_rlrt_value_chromo_num_k$Chr <- chromo_num_k - + + # set zero machine to avoid numerical instability + idx_zero_p_values <- which( + markers_in_Kb_rlrt_value_chromo_num_k$p_value == 0 + ) + if (length(idx_zero_p_values) > 0) { + markers_in_Kb_rlrt_value_chromo_num_k$p_value[ + idx_zero_p_values + ] <- 1e-16 + } + idx_zero_rlrt_values <- which( + markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value == 0 + ) + if (length(idx_zero_rlrt_values) > 0) { + markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value[ + idx_zero_rlrt_values + ] <- 1e-16 + } + write.table(markers_in_Kb_rlrt_value_chromo_num_k, file = paste0( "markers_in_kb_with_rlrt_value_on_chromosome_", @@ -351,7 +421,6 @@ if (kernel_index == 1) { row.names = FALSE, quote = FALSE, sep = " " ) - write.table( markers_in_Kb_rlrt_value_chromo_num_k[ markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value >= @@ -373,60 +442,63 @@ if (kernel_index == 1) { main = paste0( "kernelized GWAS of \n chromosome ", chromo_num_k, - " for ", + " \n for ", trait_name ), cex.main = 0.8, cex.lab = 0.8 ) abline(h = rlrt_threshold, col = "red", lwd = 1) - } - dev.off() - for (chromo_num_k in 1:nb_chromosomes) - { pdf(paste0( - "kernelized_gwas_of_chromosome_", chromo_num_k, - "_for_", trait_name, - ".pdf" - )) - - index_chrom_num <- which(repeated_chrom_num == chromo_num_k) - position_kb_chrom_num <- as.numeric(as.character(position_kb[index_chrom_num])) - - rlrt_value <- scan(paste0( - "vect_rlrt_value_chromo_num_", + "kernelized_gwas_of_chromosome_", chromo_num_k, - ".txt" + "_for_", + trait_name, + ".pdf" )) - plot(position_kb_chrom_num, rlrt_value, type = "p", pch = 16, cex = 0.5, col = "black", - xlab = "Tested marker snp in Kb", ylab = "Restricted LRT", + xlab = "Tested marker snp in Kb", + ylab = "Restricted LRT", main = paste0( "kernelized GWAS of chromosome ", chromo_num_k, - " for ", + " \n for ", trait_name ), - cex.main = 1.0, cex.lab = 1.0 - ) - abline( - h = rlrt_threshold, - col = "red", lwd = 1 + cex.main = 0.8, cex.lab = 0.8 ) + abline(h = rlrt_threshold, col = "red", lwd = 1) dev.off() + + # get significant snp + phased_genotypes_chromo_num_k <- phased_genotype_matrix[, index_chrom_num] + index_signif_rlrt_value <- which(rlrt_value >= rlrt_threshold) + + if (length(index_signif_rlrt_value) >= 1) { + for (m in index_signif_rlrt_value) + { + write.table(phased_genotypes_chromo_num_k[, m], + file = paste0( + "significant_snps_chromo_num_", + chromo_num_k, + "_snp_num_", + m, + ".txt" + ), + row.names = FALSE, col.names = FALSE, quote = FALSE, sep = " " + ) + } + } } + dev.off() } else { - pdf( - paste0( - "kernelized_haplotype_based_genome_scan_for_", - trait_name, - ".pdf" - ) - ) - + pdf(paste0( + "kernelized_haplotype_based_genome_scan_for_", + trait_name, + ".pdf" + )) par(mfrow = c(4, 3)) - for (chromo_num_k in 1:nb_chromosomes) { index_chrom_num <- which(repeated_chrom_num == chromo_num_k) @@ -443,11 +515,12 @@ if (kernel_index == 1) { scanned_position_kb <- rep(0, length(rlrt_value)) flank_markers_in_Kb_rlrt_value_chromo_num_k <- data.frame(matrix( - 0, length(rlrt_value), 8 + 0, length(rlrt_value), 9 )) colnames(flank_markers_in_Kb_rlrt_value_chromo_num_k) <- c( "left_flank_MkID_to_center", "Pos_in_Kb_left_flank_MkID", "right_flank_MkID_to_center", "Pos_in_Kb_right_flank_MkID", + "average_position_in_Kb_at_window_center", "Restricted_LRT_value", "p_value", "Window_starting_index", "Chr" ) @@ -467,6 +540,8 @@ if (kernel_index == 1) { flank_markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value[p] <- rlrt_value[p] + flank_markers_in_Kb_rlrt_value_chromo_num_k$average_position_in_Kb_at_window_center[p] <- scanned_position_kb[p] + flank_markers_in_Kb_rlrt_value_chromo_num_k$p_value[p] <- get_p_value( distrib_ = simulated_rlrt_distribution, test_statistic_value_ = rlrt_value[p] @@ -475,7 +550,26 @@ if (kernel_index == 1) { p <- p + 1 } flank_markers_in_Kb_rlrt_value_chromo_num_k$Chr <- chromo_num_k - + + # set zero machine to avoid numerical instability + idx_zero_p_values <- which( + flank_markers_in_Kb_rlrt_value_chromo_num_k$p_value == 0 + ) + if (length(idx_zero_p_values) > 0) { + flank_markers_in_Kb_rlrt_value_chromo_num_k$p_value[ + idx_zero_p_values + ] <- 1e-16 + } + idx_zero_rlrt_values <- which( + flank_markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value == 0 + ) + if (length(idx_zero_rlrt_values) > 0) { + flank_markers_in_Kb_rlrt_value_chromo_num_k$Restricted_LRT_value[ + idx_zero_rlrt_values + ] <- 1e-16 + } + + write.table( flank_markers_in_Kb_rlrt_value_chromo_num_k, file = paste0( @@ -504,68 +598,62 @@ if (kernel_index == 1) { plot(scanned_position_kb, rlrt_value, type = "p", pch = 16, cex = 0.5, col = "black", xlab = "Tested position in Kb at the \n center of the sliding window", - ylab = "Restricted LRT", main = paste0( - "kernelized haplotype-based scan of \n chromosome ", - chromo_num_k, + ylab = "Restricted LRT", + main = paste0( + "kernelized haplotype-based \n scan of chromosome ", chromo_num_k, " for ", trait_name ), - cex.main = 0.8, cex.lab = 0.8 - ) - abline( - h = rlrt_threshold, - col = "red", lwd = 1 - ) - } - dev.off() - - for (chromo_num_k in 1:nb_chromosomes) - { - pdf( - paste0( - "kernelized_haplotype_based_genome_scan_of_chromosome_", - chromo_num_k, - "_for_", - trait_name, - ".pdf" - ) + cex.main = 0.8, + cex.lab = 0.8 ) + abline(h = rlrt_threshold, col = "red", lwd = 1) - index_chrom_num <- which(repeated_chrom_num == chromo_num_k) - position_kb_chrom_num <- as.numeric(as.character(position_kb[index_chrom_num])) - - index_last_window <- (length(index_chrom_num) - (nb_snp_hap - 1)) - rlrt_value <- scan(paste0( - "vect_rlrt_value_chromo_num_", + pdf(paste0( + "kernelized_haplotype_based_genome_scan_of_chromosome_", chromo_num_k, - ".txt" + "_for_", + trait_name, + ".pdf" )) - scanned_position_kb <- rep(0, length(rlrt_value)) - - p <- 1 - for (k in 1:index_last_window) - { - scanned_position_kb[p] <- (abs(position_kb_chrom_num[shift_quantity + (k + 1)] - - position_kb_chrom_num[shift_quantity + k]) / 2) + - position_kb_chrom_num[shift_quantity + k] - p <- p + 1 - } - - plot( - scanned_position_kb, rlrt_value, + plot(scanned_position_kb, rlrt_value, type = "p", pch = 16, cex = 0.5, col = "black", xlab = "Tested position in Kb at the \n center of the sliding window", ylab = "Restricted LRT", main = paste0( - "kernelized haplotype-based scan of chromosome ", - chromo_num_k, + "kernelized haplotype-based \n scan of chromosome ", chromo_num_k, " for ", trait_name ), - cex.main = 1.0, cex.lab = 1.0 + cex.main = 0.8, + cex.lab = 0.8 ) abline(h = rlrt_threshold, col = "red", lwd = 1) dev.off() + + # get significant snp + phased_genotypes_chromo_num_k <- phased_genotype_matrix[, index_chrom_num] + index_signif_rlrt_value <- which(rlrt_value >= rlrt_threshold) + + if (length(index_signif_rlrt_value) >= 1) { + for (m in index_signif_rlrt_value) + { + write.table(phased_genotypes_chromo_num_k[, m:(m + (nb_snp_hap - 1))], + file = paste0( + "significant_haplotypes_chromo_num_", + chromo_num_k, + "_window_", + m, + ".txt" + ), + row.names = FALSE, + col.names = FALSE, + quote = FALSE, + sep = " " + ) + } + } } + dev.off() } } diff --git a/programs/get_significant_haplotypes.R b/programs/get_significant_haplotypes.R deleted file mode 100644 index db12e9b..0000000 --- a/programs/get_significant_haplotypes.R +++ /dev/null @@ -1,112 +0,0 @@ -library(data.table) - -#--------------------------# -# read parameters and data # -#--------------------------# -nb_chromosomes <- scan("nb_chromosomes.txt") -nb_snp_hap <- scan("nb_snp_hap.txt") -alpha <- as.numeric(scan("signif_level.txt")) - -physical_map_matrix <- read.table("physical_map.txt", header = TRUE) -repeated_chrom_num <- physical_map_matrix[, match( - "chr", - colnames(physical_map_matrix) -)] -position_kb <- physical_map_matrix[, match( - "Pos", - colnames(physical_map_matrix) -)] - -phased_genotype_matrix <- as.data.frame( - fread("phased_genotypes.txt", - header = TRUE - ) -) -phased_genotype_matrix <- phased_genotype_matrix[, -match( - c("I", "ID"), - colnames(phased_genotype_matrix) -)] -phased_genotype_matrix <- t(phased_genotype_matrix) - -# set rlrt threshold -set.seed(123) -simulated_rlrt_distribution <- 0.5 * (sort(rchisq(1e3, df = 1, ncp = 0)) -+ sort(rchisq(1e3, df = 2, ncp = 0))) -rlrt_threshold <- quantile(simulated_rlrt_distribution, 1 - alpha) - -#-------------------------------------------------------------# -# reformat results for each chromosome for the analyzed trait # -#-------------------------------------------------------------# -if (nb_snp_hap == 1) { - for (chromo_num_k in 1:nb_chromosomes) - { - index_chrom_num <- which(repeated_chrom_num == chromo_num_k) - - position_kb_chrom_num <- as.numeric(as.character(position_kb[index_chrom_num])) - - phased_genotypes_chromo_num_k <- phased_genotype_matrix[, index_chrom_num] - - rlrt_value <- scan(paste0( - "vect_rlrt_value_chromo_num_", - chromo_num_k, - ".txt" - )) - - index_signif_rlrt_value <- which(rlrt_value >= rlrt_threshold) - print(index_signif_rlrt_value) - - if (length(index_signif_rlrt_value) >= 1) { - for (m in index_signif_rlrt_value) - { - write.table(phased_genotypes_chromo_num_k[, m], - file = paste0( - "significant_snps_chromo_num_", - chromo_num_k, - "_snp_num_", - m, - ".txt" - ), - row.names = FALSE, col.names = FALSE, quote = FALSE, sep = " " - ) - } - } - } -} else { - for (chromo_num_k in 1:nb_chromosomes) - { - index_chrom_num <- which(repeated_chrom_num == chromo_num_k) - - position_kb_chrom_num <- as.numeric(as.character(position_kb[ - index_chrom_num - ])) - - phased_genotypes_chromo_num_k <- phased_genotype_matrix[, index_chrom_num] - - rlrt_value <- scan(paste0( - "vect_rlrt_value_chromo_num_", - chromo_num_k, - ".txt" - )) - index_signif_rlrt_value <- which(rlrt_value >= rlrt_threshold) - print(index_signif_rlrt_value) - - if (length(index_signif_rlrt_value) >= 1) { - for (m in index_signif_rlrt_value) - { - write.table(phased_genotypes_chromo_num_k[, m:(m + (nb_snp_hap - 1))], - file = paste0( - "significant_haplotypes_chromo_num_", - chromo_num_k, - "_window_", - m, - ".txt" - ), - row.names = FALSE, - col.names = FALSE, - quote = FALSE, - sep = " " - ) - } - } - } -} diff --git a/programs/plot_manhattan_scan.R b/programs/plot_manhattan_scan.R new file mode 100644 index 0000000..8c952b4 --- /dev/null +++ b/programs/plot_manhattan_scan.R @@ -0,0 +1,220 @@ +library(ggplot2) +library(data.table) + +#---------------------------# +# read files and parameters # +#---------------------------# +trait_name <- as.character(readLines("trait_name.txt")) +nb_chromosomes <- scan("nb_chromosomes.txt") +nb_snp_hap <- scan("nb_snp_hap.txt") +kernel_index <- scan("kernel_index.txt") +alpha <- as.numeric(scan("signif_level.txt")) + +# set rlrt threshold +set.seed(123) +simulated_rlrt_distribution <- 0.5 * (sort(rchisq(1e6, df = 1, ncp = 0)) ++ sort(rchisq(1e6, df = 2, ncp = 0))) +rlrt_threshold <- quantile(simulated_rlrt_distribution, 1 - alpha) + +# get data according to the defined analyzed case +if (nb_snp_hap == 1) { + df_ <- as.data.frame(fread(paste0( + "markers_in_kb_with_rlrt_value_on_chromosome_", + 1, ".txt" + ))) + for (k in 2:nb_chromosomes) { + df_ <- rbind( + df_, + as.data.frame(fread(paste0( + "markers_in_kb_with_rlrt_value_on_chromosome_", + k, ".txt" + ))) + ) + } +} else { + df_ <- as.data.frame(fread(paste0( + "flanking_markers_of_tested_positions_in_kb_with_rlrt_value_on_chromosome_", + 1, ".txt" + ))) + for (k in 2:nb_chromosomes) { + df_ <- rbind( + df_, + as.data.frame(fread(paste0( + "flanking_markers_of_tested_positions_in_kb_with_rlrt_value_on_chromosome_", + k, ".txt" + ))) + ) + } +} +# add a vector of position index for manhattan plot +df_$Position_index_manhattan <- 1:nrow(df_) + +# to avoid extreme values, perform a smoothing +p_value_threshold <- -log10(alpha) +try( + { + nonlinear_model <- nls(p_value ~ exp(-alpha * Restricted_LRT_value), + start = list(alpha = 1), + data = df_ + ) + df_$p_value <- predict(nonlinear_model, newdata = df_) + p_value_threshold <- predict(nonlinear_model, + newdata = data.frame(Restricted_LRT_value = rlrt_threshold) + ) + }, + silent = TRUE +) +max_y <- 1.5 * max(-log10(df_$p_value)) + +if (kernel_index == 1) { + # Van Raden matrix (i.e. linear kernel) + + if (nb_snp_hap == 1) { + # get manhattan plot for genome scan + p <- ggplot(df_, aes( + x = Position_index_manhattan, + y = -log10(p_value), color = factor(Chr) + )) + + geom_point(size = 3) + + geom_hline( + yintercept = -log10(p_value_threshold), + linetype = "dashed", color = "red" + ) + + labs( + x = "Position index", y = "-log10(p-value)", + title = paste0("GWAS for ", trait_name) + ) + + scale_color_discrete(name = "Chromosome") + + coord_cartesian(ylim = c(0, max_y)) + + theme(plot.title = element_text( + hjust = 0.5, + size = 20 + )) + ggsave( + paste0( + "gwas_for_", trait_name, + "_manhattan_plot.pdf" + ), + plot = p, width = 10, height = 8, dpi = 300 + ) + + # write results for all chromosomes + write.table( + df_, + file = "markers_of_tested_positions_with_statistics.txt", + row.names = FALSE, + quote = FALSE, sep = " " + ) + } else { + # get manhattan plot for genome scan + p <- ggplot(df_, aes( + x = Position_index_manhattan, + y = -log10(p_value), color = factor(Chr) + )) + + geom_point(size = 3) + + geom_hline( + yintercept = -log10(p_value_threshold), + linetype = "dashed", color = "red" + ) + + labs( + x = "Position index", y = "-log10(p-value)", + title = paste0("Haplotype-based scan for ", trait_name) + ) + + scale_color_discrete(name = "Chromosome") + + coord_cartesian(ylim = c(0, max_y)) + + theme(plot.title = element_text( + hjust = 0.5, + size = 20 + )) + ggsave( + paste0( + "haplotype_based_genome_scan_for_", + trait_name, "_manhattan_plot.pdf" + ), + plot = p, width = 10, height = 8, dpi = 300 + ) + # write results for all chromosomes + write.table( + df_, + file = "flanking_markers_of_tested_positions_with_statistics.txt", + row.names = FALSE, + quote = FALSE, sep = " " + ) + } +} else { + # Gaussian kernel + if (nb_snp_hap == 1) { + # get manhattan plot for genome scan + p <- ggplot(df_, aes( + x = Position_index_manhattan, + y = -log10(p_value), color = factor(Chr) + )) + + geom_point(size = 3) + + geom_hline( + yintercept = -log10(p_value_threshold), + linetype = "dashed", color = "red" + ) + + labs( + x = "Position index", y = "-log10(p-value)", + title = paste0("Kernelized GWAS for ", trait_name) + ) + + scale_color_discrete(name = "Chromosome") + + coord_cartesian(ylim = c(0, max_y)) + + theme(plot.title = element_text( + hjust = 0.5, + size = 20 + )) + ggsave( + paste0( + "kernelized_gwas_for_", trait_name, + "_manhattan_plot.pdf" + ), + plot = p, width = 10, height = 8, dpi = 300 + ) + # write results for all chromosomes + write.table( + df_, + file = "markers_of_tested_positions_with_statistics.txt", + row.names = FALSE, + quote = FALSE, sep = " " + ) + } else { + # get manhattan plot for genome scan + p <- ggplot(df_, aes( + x = Position_index_manhattan, + y = -log10(p_value), color = factor(Chr) + )) + + geom_point(size = 3) + + geom_hline( + yintercept = -log10(p_value_threshold), + linetype = "dashed", color = "red" + ) + + labs( + x = "Position index", y = "-log10(p-value)", + title = paste0( + "Kernelized haplotype-based scan for ", + trait_name + ) + ) + + scale_color_discrete(name = "Chromosome") + + coord_cartesian(ylim = c(0, max_y)) + + theme(plot.title = element_text( + hjust = 0.5, + size = 20 + )) + ggsave( + paste0( + "kernelized_haplotype_based_genome_scan_for_", + trait_name, "_manhattan_plot.pdf" + ), + plot = p, width = 10, height = 8, dpi = 300 + ) + # write results for all chromosomes + write.table( + df_, + file = "flanking_markers_of_tested_positions_with_statistics.txt", + row.names = FALSE, + quote = FALSE, sep = " " + ) + } +}