-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_CrossPeak_example.sh
executable file
·260 lines (220 loc) · 11.2 KB
/
run_CrossPeak_example.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
#!/bin/bash
# Setup species, folders, and files *May need to modify for your analysis*
base_folder=$(pwd) # Base folder where the R scripts are located
current_folder="./Example_dataset" # Folder to save all intermediate results and logs
output_folder="./Example_dataset/Output" # Folder to save results
HAL_FILE="${current_folder}/primates.hal" # HAL file with species' genomes and ancestral genome
species=("human" "chimp" "rhesus") # Lowercase versions of names of species in HAL file
ANCESTRAL_GENOME="primate" # Name of ancestral genome in HAL file
docker_image="quay.io/comparative-genomics-toolkit/cactus:v2.9.3" # Docker image for HAL tools
use_docker=true # Set to true to use Docker, false to use local HAL installation
cleanup=true #Set to true to clean up intermediate files after the pipeline finishes. Set to false to assist in debugging
# Check for Docker if using Docker
if [[ "$use_docker" == true ]]; then
if ! command -v docker &> /dev/null; then
echo "Error: Docker is not installed or not in the PATH. Please install Docker or set use_docker=false."
exit 1
fi
fi
# Check for HAL tools if not using Docker
if [[ "$use_docker" == false ]]; then
if ! command -v halLiftover &> /dev/null; then
echo "Error: halLiftover is not installed or not in the PATH. Please install HAL tools or set use_docker=true."
exit 1
fi
fi
# Create the output folder if it doesn't exist
if [[ ! -d "$current_folder" ]]; then
mkdir -p "$current_folder"
echo "Created output folder: $current_folder"
fi
# Record the files in the current folder at the beginning
initial_files=$(mktemp) # Temporary file to save the list of initial files
ls "$current_folder" > "$initial_files"
# Step 1: Run R script to generate initial data
echo "Running Step 1 in R..."
Rscript "${base_folder}/CrossPeak_Step1.R" > "${current_folder}/step1.log" 2>&1
if [[ $? -ne 0 ]]; then
echo "Error: Step 1 failed. Check ${current_folder}/step1.log for details."
exit 1
fi
# Step A: Run halLiftover for each species
echo "Running Step A: halLiftover..."
hal_liftover_log="${current_folder}/stepA_halLiftover.log"
for sp in "${species[@]}"; do
input_file="${current_folder}/${sp}_summits.bed"
output_file="${current_folder}/${sp}_summits_lo.bed"
# Check if input file exists before running halLiftover
if [[ ! -f "$input_file" ]]; then
echo "Error: Input file $input_file not found for species $sp."
exit 1
fi
# Run halLiftover, either with Docker or directly
echo "Lifting over $sp..." | tee -a "$hal_liftover_log"
if [[ "$use_docker" == true ]]; then
docker run --rm -v "$base_folder:$base_folder" -w "$base_folder" "$docker_image" halLiftover "$HAL_FILE" "${sp^}" "$input_file" "$ANCESTRAL_GENOME" "$output_file" >> "$hal_liftover_log" 2>&1
else
halLiftover "$HAL_FILE" "${sp^}" "$input_file" "$ANCESTRAL_GENOME" "$output_file" >> "$hal_liftover_log" 2>&1
fi
# Check if halLiftover was successful or the output BED file is empty
if [[ $? -ne 0 || ! -s "$output_file" ]]; then
echo "Error: halLiftover failed for $sp, or the output file $output_file is empty. Check inputs and HAL file."
exit 1
fi
done
# Steps 2 and 3: Run the next R script
echo "Running Steps 2 and 3 in R..."
Rscript "${base_folder}/CrossPeak_Step2and3.R" > "${current_folder}/step2and3.log" 2>&1
if [[ $? -ne 0 ]]; then
echo "Error: Steps 2 and 3 failed. Check ${current_folder}/step2and3.log for details."
exit 1
fi
# Step B: Run halLiftover for consensus summits
echo "Running Step B: halLiftover for consensus summits..."
hal_liftover_log="${current_folder}/stepB_halLiftover.log"
consensus_input_file="${current_folder}/consensus_summits.bed"
if [[ ! -f "$consensus_input_file" ]]; then
echo "Error: Input file $consensus_input_file not found. Ensure Steps 2 and 3 completed successfully."
exit 1
fi
for sp in "${species[@]}"; do
consensus_output_file="${current_folder}/${sp}_consensus_summits_lo.bed"
# Run halLiftover, either with Docker or directly
echo "Lifting over consensus summits for $sp..." | tee -a "$hal_liftover_log"
if [[ "$use_docker" == true ]]; then
docker run --rm -v "$base_folder:$base_folder" -w "$base_folder" "$docker_image" halLiftover "$HAL_FILE" "$ANCESTRAL_GENOME" "$consensus_input_file" "${sp^}" "$consensus_output_file" >> "$hal_liftover_log" 2>&1
else
halLiftover "$HAL_FILE" "$ANCESTRAL_GENOME" "$consensus_input_file" "${sp^}" "$consensus_output_file" >> "$hal_liftover_log" 2>&1
fi
# Check if halLiftover was successful or the output BED file is empty
if [[ $? -ne 0 || ! -s "$consensus_output_file" ]]; then
echo "Error: halLiftover failed for consensus summits for $sp, or the output file $consensus_output_file is empty. Check inputs and HAL file."
exit 1
fi
done
# Steps 4, 5 and 6: Run the next R script
echo "Running Steps 4, 5, and 6 in R..."
Rscript "${base_folder}/CrossPeak_Step4and5and6.R" > "${current_folder}/step4and5and6.log" 2>&1
if [[ $? -ne 0 ]]; then
echo "Error: Steps 4, 5, or 6 failed. Check ${current_folder}/step4and5and6.log for details."
exit 1
fi
# Step C: Run halLiftover for species-specific peaks and summits
echo "Running Step C: halLiftover for species-specific peaks and summits..."
hal_liftover_log="${current_folder}/stepC_halLiftover.log"
for source_species in "${species[@]}"; do
for target_species in "${species[@]}"; do
# Skip if source and target species are the same
if [[ "$source_species" == "$target_species" ]]; then
continue
fi
# Generate file names for summits
input_summits_file="${current_folder}/${source_species}_spec_summits.bed"
output_summits_file="${current_folder}/${source_species}_on_${target_species}_spec_summits_lo.bed"
# Check if input file exists for summits
if [[ ! -f "$input_summits_file" ]]; then
echo "Error: Input file $input_summits_file not found for $source_species." | tee -a "$hal_liftover_log"
exit 1
fi
# Run halLiftover for summits
echo "Lifting over summits from $source_species to $target_species..." | tee -a "$hal_liftover_log"
if [[ "$use_docker" == true ]]; then
docker run --rm -v "$base_folder:$base_folder" -w "$base_folder" "$docker_image" halLiftover "$HAL_FILE" "${source_species^}" "$input_summits_file" "${target_species^}" "$output_summits_file" >> "$hal_liftover_log" 2>&1
else
halLiftover "$HAL_FILE" "${source_species^}" "$input_summits_file" "${target_species^}" "$output_summits_file" >> "$hal_liftover_log" 2>&1
fi
# Check if halLiftover was successful or the output BED file is empty
if [[ $? -ne 0 || ! -s "$output_summits_file" ]]; then
echo "Error: halLiftover failed or output file $output_summits_file is empty. Check inputs and HAL file." | tee -a "$hal_liftover_log"
exit 1
fi
# Generate file names for peaks
input_peaks_file="${current_folder}/${source_species}_spec_peaks.bed"
output_peaks_file="${current_folder}/${source_species}_on_${target_species}_spec_peaks_lo.bed"
# Check if input file exists for peaks
if [[ ! -f "$input_peaks_file" ]]; then
echo "Error: Input file $input_peaks_file not found for $source_species." | tee -a "$hal_liftover_log"
exit 1
fi
# Run halLiftover for peaks
echo "Lifting over peaks from $source_species to $target_species..." | tee -a "$hal_liftover_log"
if [[ "$use_docker" == true ]]; then
docker run --rm -v "$base_folder:$base_folder" -w "$base_folder" "$docker_image" halLiftover "$HAL_FILE" "${source_species^}" "$input_peaks_file" "${target_species^}" "$output_peaks_file" >> "$hal_liftover_log" 2>&1
else
halLiftover "$HAL_FILE" "${source_species^}" "$input_peaks_file" "${target_species^}" "$output_peaks_file" >> "$hal_liftover_log" 2>&1
fi
# Check if halLiftover was successful or the output BED file is empty
if [[ $? -ne 0 || ! -s "$output_peaks_file" ]]; then
echo "Error: halLiftover failed or output file $output_peaks_file is empty. Check inputs and HAL file." | tee -a "$hal_liftover_log"
exit 1
fi
done
done
# Step 7: Run the next R script
echo "Running Step 7 in R..."
Rscript "${base_folder}/CrossPeak_Step7.R" > "${current_folder}/step7.log" 2>&1
if [[ $? -ne 0 ]]; then
echo "Error: Step 7 failed. Check ${current_folder}/step7.log for details."
exit 1
fi
# Step D: Reciprocal liftover for round 2 peaks
echo "Running Step D: Reciprocal halLiftover for round 2 peaks..."
hal_liftover_log="${current_folder}/stepD_halLiftover.log"
for source_species in "${species[@]}"; do
for target_species in "${species[@]}"; do
# Skip if source and target species are the same
if [[ "$source_species" == "$target_species" ]]; then
continue
fi
# Generate file names
input_file="${current_folder}/${target_species}_on_${source_species}_rd2_con_peaks.bed"
output_file="${current_folder}/${target_species}_from_${source_species}_rd2_con_peaks_lo.bed"
# Check if input file exists
if [[ ! -f "$input_file" ]]; then
echo "Error: Input file $input_file not found for $source_species to $target_species." | tee -a "$hal_liftover_log"
exit 1
fi
# Run halLiftover, either with Docker or directly
echo "Lifting over round 2 consensus peaks from $target_species to $source_species..." | tee -a "$hal_liftover_log"
if [[ "$use_docker" == true ]]; then
docker run --rm -v "$base_folder:$base_folder" -w "$base_folder" "$docker_image" halLiftover "$HAL_FILE" "${source_species^}" "$input_file" "${target_species^}" "$output_file" >> "$hal_liftover_log" 2>&1
else
halLiftover "$HAL_FILE" "${source_species^}" "$input_file" "${target_species^}" "$output_file" >> "$hal_liftover_log" 2>&1
fi
# Check if halLiftover was successful or the output BED file is empty
if [[ $? -ne 0 || ! -s "$output_file" ]]; then
echo "Error: halLiftover failed for $source_species to $target_species, or the output file $output_file is empty. Check inputs and HAL file." | tee -a "$hal_liftover_log"
exit 1
fi
done
done
# Steps 8 and 9: Run the next R script
echo "Running Steps 8 and 9 in R..."
Rscript "${base_folder}/CrossPeak_Step8and9.R" > "${current_folder}/step8and9.log" 2>&1
if [[ $? -ne 0 ]]; then
echo "Error: Step 8 or 9 failed. Check ${current_folder}/step8and9.log for details."
exit 1
fi
# Cleanup: Delete intermediate files if the cleanup flag is set
if [[ "$cleanup" == true ]]; then
echo "Performing cleanup of intermediate files..."
# Get the list of .log files to keep
log_files=$(find "$current_folder" -maxdepth 1 -type f -name "*.log")
# Get the list of files to keep (initial files + .log files)
files_to_keep=$(mktemp)
cat "$initial_files" > "$files_to_keep"
echo "$log_files" >> "$files_to_keep"
# Find and remove all files in the current folder but not output folder except those in the "files_to_keep"
for file in $(find "$current_folder" -maxdepth 1 -type f); do
if ! grep -qF "$(basename "$file")" "$files_to_keep"; then
rm -f "$file"
fi
done
# Clean up temporary files
rm -f "$initial_files" "$files_to_keep"
echo "Cleanup completed. All intermediate files removed."
else
echo "Cleanup flag is set to false. Intermediate files are retained in ${current_folder}."
fi
# Final message
echo "CrossPeak completed successfully. Logs are in ${current_folder} and output files are in ${output_folder}."