#!/usr/bin/env bash
NCORES="$1"
export PYSPARK_SUBMIT_ARGS="$2"
#export PYSPARK_SUBMIT_ARGS="--driver-memory 100g pyspark-shell"
echo "Running on $NCORES cores"
echo "PYSPARK_SUBMIT_ARGS: $PYSPARK_SUBMIT_ARGS"
#python partition_top_loci_by_chrom.py # Script from fine-mapping pipeline
echo -e "\n1_find_overlaps.sh"
# This step requires a lot of memory
time /bin/bash 1_find_overlaps.sh # 10 min last run
# Generate the manifest from the overlap table
echo -e "\n2_generate_manifest.py"
time python 2_generate_manifest.py # ~24 min last run
#cp /configs/manifest_unfiltered.json.gz /configs/manifest_unfiltered.all.json.gz
# Subset to chr21 for testing
#zcat /configs/manifest_unfiltered.all.json.gz | grep 'ukb_v3_chr21.downsampled10k' | head -n 10000 | gzip > /configs/manifest_unfiltered.json.gz
# Remove lines that are already in the output from previous coloc runs
# (if there were no previous coloc runs, this just renames the file)
echo -e "\n2b_filter_manifest.py"
time python 2b_filter_manifest.py
# The script below generates commands to compute conditionally independent
# sumstats with GCTA. It creates two files, `commands_todo.cond.txt.gz` and
# `commands_done.cond.txt.gz`, recording which analyses remain to be done and
# which are already complete. This step can be stopped at any time and
# restarted without repeating any completed analyses. You can safely
# regenerate the `commands_*.txt.gz` files while the pipeline is running
# using `python 3a_make_conditioning_commands.py --quiet`.
# Note that generating the commands alone takes time (>30 min last run), and
# this grows as more output files accumulate. Because `shuf` must read its
# whole input before emitting anything, no jobs start until the full command
# list has been generated and read.
echo -e "\n3a_make_conditioning_commands.py"
time python 3a_make_conditioning_commands.py --quiet
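# Quick progress check (illustrative; file names per the comment above), run
# from another shell while the pipeline is going:
#   zcat /configs/commands_todo.cond.txt.gz | wc -l   # analyses remaining
#   zcat /configs/commands_done.cond.txt.gz | wc -l   # analyses completed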
# Creates `commands_todo_coloc_opt.txt`. Each command operates on a chunk of
# `coloc_manifest_opt.todo.txt.gz` written in the configs/commands_split directory.
echo -e "\n3b_make_coloc_commands_opt.sh"
time bash 3b_make_coloc_commands_opt.sh
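# Sanity check (illustrative; chunk directory path assumed from the comment above):
#   wc -l /configs/commands_todo_coloc_opt.txt   # number of coloc commands
#   ls /configs/commands_split | head            # manifest chunks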
############### Main conditioning step
# Took 26 hrs last run (222 cores, 400 GB)
echo -e "\nRunning conditioning commands in parallel"
time zcat /configs/commands_todo.cond.txt.gz | shuf | parallel -j "$NCORES" --joblog /output/parallel.jobs.cond.log | tee /output/run_gcta_cond.out.txt
# Note: "--bar" is omitted here because it can make things slower if there are millions of commands
############### Main coloc step
# Took ~2 hrs last run (222 cores, 6.8M commands)
echo -e "\nRunning coloc commands in parallel"
time parallel -j "$NCORES" --joblog /output/parallel.jobs.coloc.log --bar < /configs/commands_todo_coloc_opt.txt | tee /output/run_coloc_opt.out.txt
# Note: "--bar" can make things slower if there are millions of commands
# Combine the results of all the individual analyses
echo -e "\n5_combine_results.py"
time python 5_combine_results.py # Takes a few minutes
# Process the results for exporting: renames or computes a few columns
# (e.g. the coloc_h4_h3 ratio), filters based on the number of overlapping
# variants, and makes the coloc result matrix symmetric.
echo -e "\n6_process_results.py"
time python 6_process_results.py # Takes just a minute or two
# Run this step if merging with previous coloc results
#export PYSPARK_SUBMIT_ARGS="--driver-memory 25g pyspark-shell"
#time python 7_merge_previous_results.py | tee /output/merge_previous_results.log
echo -e "\nDONE"
date