forked from Christensen-Lab-Dartmouth/brca_lowstage_DMGRs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ANALYSIS.sh
134 lines (102 loc) · 6.13 KB
/
ANALYSIS.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!bin/bash
#####################################################################
# ~~~~~~~~~~~~~~~~~~
# Identification and validation of tumor subtype and cell type independent DNA
# methylation alterations in breast carcinoma
# ~~~~~~~~~~~~~~~~~~
#
# Way, G., Johnson, K., Christensen, B. 2015
#
# The following bash script will perform a cell proportion analysis of
# solid tumor TCGA breast cancer samples. We will use this correction
# to analyze DNA methylation changes of low stage TCGA breast tumors
# stratefied by PAM50 Subtype.
#####################################################################
#Install R Packages
R --no-save < INSTALL.R
####################
# Step I: Data Processing
####################
# Data can be downloaded from TCGA
# https://tcga-data.nci.nih.gov/tcga/dataAccessMatrix.htm
#~~~~GET DATA
# Data Type: DNA Methylation; Batch Number: All; Data Level: Level 1
# Availibility: Available; Preservation: All; Center/Platform: JHU_USC (HumanMethylation450)
# Access date: March 2015
# Download Clinical Data from UCSC Cancer Genome Browser:
# https://genome-cancer.ucsc.edu/proj/site/hgHeatmap/
# NOTE: It is important to place the IDAT files and the Clinical data matrix in the same folder
IDAT_loc="../../Documents/mdata/TCGAbreast_idat/" # Place download folder location here
#~~~~~PREPROCESS DATA
# Run preprocessing script (if third argument is "summary", then the script will output a processing summary)
R --no-save --args "I.Data_Processing/Data/TCGA_BRCA" $IDAT_loc "nosummary" < I.Data_Processing/Scripts/A.preprocess_minfi.R
# Subset data to spectific subtypes; tumor adjacent samples are included in all subsets
R --no-save --args PAM50.RNAseq Basal Her2 LumA LumB Normal < I.Data_Processing/Scripts/B.subsetBetas.R
#~~~~~EXPAND ANNOTATION FILE
# This file will expand the annotation file to include multiple cgs that are associated with multiple gene:regions
R --no-save < I.Data_Processing/Scripts/C.editAnnotationFile.R # We will use this expanded annotation file in downstream analyses
# Describe the data you are using (Generate Table 1)
# To get information about CpG values, you need to have run a RefFreeEWAS already to understand which CpGs were actually used
R --no-save < I.Data_Processing/Scripts/D.DescribeTableI.R
# Describe TCGA tumor purity estimates
R --no-save < I.Data_Processing/Scripts/E.TumorPurity.R
####################
# Step II: Reference-Free cell type adjustment modeling
####################
# NOTE- This was performed on the Discovery cluster at Dartmouth College
# Apply a model correcting for age, and cell proportion heterogeneity estimates with 1000 bootstraps
# The python script runs the R script "II.RefFreeEWAS/Scripts/A.doRefFree.R"
python II.RefFreeEWAS/Scripts/Functions/RefFree_Discovery.py
# Output Volcano Plots
R --no-save < II.RefFreeEWAS/Scripts/B.Visualize_Volcano.R
# Summarize dimensionality
# First get the dimensionality of the full dataset as well as all of the tumor adjacent samples
R --no-save < II.RefFreeEWAS/Scripts/C.FullDataDimensionality.R
# Next, get all dimensionality estimates into a table
R --no-save < II.RefFreeEWAS/Scripts/D.Summarize_Dimensions.R
# Following the results of the Reference Free modeling approach, investigate and summarize q value cutoffs
R --no-save < II.RefFreeEWAS/Scripts/E.Summarize_q.R # The script will output the 400 genes used in the DAVID Analysis
####################
# Step III: Identifying Differentially Methylated Gene Regions (DMGRs)
####################
# NOTE- This step was performed in the Discovery cluster at Dartmouth College
python III.DMGR_analysis/Scripts/Functions/runDM.py # Script will run A.DMcgs.R
# Compare DMGRs within PAM50 across stage and across PAM50 within Stage (Output venn diagrams comparing overlaps) and low/high stage overlap tables
R --no-save < III.DMGR_analysis/Scripts/B.DMGRcomparison.R
# Perform Gene:Region Enrichment Test
R --no-save < III.DMGR_analysis/Scripts/C.FisherExact_RegionEnrichment.R
# Generate HeatMaps for all Gene:Regions and a heatmap in common to all low stage overlapping gene region associated CpGs
R --no-save < III.DMR_analysis/Scripts/D.DMGRheatmap.R
####################
# Step IV: DMGR Gene Expression and Genomic Locations
####################
# Input the 400 genes as given by II.RefFreeEWAS/Scripts/D.Summarize_q.R [http://david.abcc.ncifcrf.gov/tools.jsp] to run a pathways analysis
# Get RNAseq data for TCGA BRCA data from Synapse
# To install synapse follow instructions listed here: https://www.synapse.org/#!Wiki:syn1768504/ENTITY
synapse get syn1446185
# Move synapse data to appropriate location
mv unc.edu* IV.Genomic_analysis/Data
# For the 12 common low stage DMGRs, explore CpG association with gene expression
R --no-save < IV.Genomic_analysis/Scripts/A.MethRNAseq.R
# This script will plot the genomic locations of all genes and the associated significant CpG methylation differences
R --no-save < IV.Genomic_analysis/Scripts/B.GenomicLocations.R
####################
# Step V: Validation
####################
# NOTE: The argument supplied will hold the IDAT files for download
IDAT_validation="../../Documents/IDAT/"
# Get the Fliescher et al 2015 Data from GEO GSE60185
R --no-save --args $IDAT_validation < V.Validation/Scripts/A.getValidationData.R
# Remove the other .csv file that is automatically downloaded with the GEO profile (Not removing this file will cause errors in preprocessing)
rm $IDAT_validation/*.csv
# Copy the manifest file (Located in "V.Validation/Data/GSE60185_manifest.csv") to the location of the IDAT files
cp V.Validation/Data/GSE60185_manifest.csv $IDAT_validation
# Use minfi to preprocess the Fliescher Data
R --no-save --args $IDAT_validation < V.Validation/Scripts/B.preprocess_minfi_Validation.R
# Run RefFreeEWAS on the Validation Set using 1000 bootstraps
# NOTE- This was performed on the Discovery cluster at Dartmouth College
R --no-save --args 1000 < V.Validation/Scripts/C.doRefFree_Validation.R
# Output a table of dimensionality estimation for the validation set
R --no-save < V.Validation/Scripts/D.Dimensionality_Validation.R
# Compare Validation DMGRs to Test Set DMGRs (output Scatter plot and Heatmap)
R --no-save < V.Validation/Scripts/E.Validation_DMGRComparison.R