test dataset improved
maxozo committed Oct 24, 2023
1 parent caa236f commit 840a4e8
Showing 9 changed files with 17 additions and 186 deletions.
2 changes: 1 addition & 1 deletion assets/deploy_scripts/nohup_start_nextflow_lsf.sh
@@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")"
export RUN_ID="${PWD##*/}"
mkdir $PWD/work || echo 'exists'
mkdir $PWD/work/tmp || echo 'exists'
echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 &
echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger -c /software/hgi/pipelines/yascp/conf/extra_confs/sanger/base.conf -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 &

# get process PID
sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
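# pgrep -f matches the full command line of the backgrounded Nextflow run;
# the dashes are escaped so the pattern itself is not parsed as a pgrep option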
@@ -21,7 +21,7 @@ export RUN_ID="${PWD##*/}"
# export TEMP=$PWD/tmp
# export TMP_DIR=$PWD/tmp

echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 &
echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 &

# get process PID
sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
2 changes: 1 addition & 1 deletion assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh
@@ -16,7 +16,7 @@ parentdir="$(dirname "$CWD1")"
export RUN_ID="${PWD##*/}"
mkdir $PWD/work || echo 'exists'
mkdir $PWD/work/tmp || echo 'exists'
echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 &
echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger,test -c /software/hgi/pipelines/yascp/conf/extra_confs/sanger/base.conf --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 &

# get process PID
sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
7 changes: 4 additions & 3 deletions conf/base.conf
@@ -13,6 +13,7 @@ params{
rsync_to_web_file="${launchDir}/yascp/bin/rsync_to_web.sh"
profile='normal_run'
citeseq=false
estimate_and_provide_informative_snps_for_deconvolution=false
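//# (assumption based on this commit's other changes: when true, the pipeline derives an informative SNP panel for deconvolution; cf. DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION)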
//# These are default parameters that can be overwritten to run in a different mode.
//# Here we have listed the default parameters used when running without any extra input.
tmpdir = "${launchDir}/work"
@@ -244,8 +245,8 @@ process {
cpus = 1
maxForks=4
errorStrategy = 'retry'
memory = '' // set to null '' as already specified in clusterOptions
time = { 12.h * task.attempt }
memory = 20.GB // now set explicitly rather than left to clusterOptions
time = { 6.h * task.attempt }
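// with errorStrategy 'retry', task.attempt increments on each retry, so a timed-out task gets 12.h on its second attempt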
}

withName:cluster_validate_resolution_keras{
@@ -329,7 +330,7 @@ process {
withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{
cpus = 5
time = { 5.h * task.attempt }
memory = { 30.GB * task.attempt }
memory = { 60.GB * task.attempt }
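// memory also scales with the attempt: 60 GB on the first try, 120 GB on a retry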
}

withName: VIREO{
182 changes: 5 additions & 177 deletions conf/extra_confs/sanger/base.conf
@@ -8,187 +8,15 @@
----------------------------------------------------------------------------------------
*/

params{
input = 'existing_cellbender'
rsync_to_web_file="${launchDir}/yascp/bin/rsync_to_web.sh"
profile='normal_run'
//# These are default parameters that can be overwritten to run in a different mode.
//# Here we have listed the default parameters used when running without any extra input.
tmpdir = "${launchDir}/work"
cohorts_to_drop_from_GT_Relatednes_check=''
hard_filters_file = "no_file__file_sample_qc" //# This may point to the sample_qc.yml input which will apply hard filters to the merged cells.
hard_filters_drop = false //#This indicates whether we want to drop the cells that fail hard filters of just flag them
encrypt = false
write_h5=false
cellbender_location="${launchDir}/results"
skip_handover = false
RUN='default'
skip_qc=false
skip_merge=false
just_reports=false
add_donor_metadata = false
cellex_cluster_markers=false
mem1= 12000
copy_mode = "rellink"
split_bam = false
existing_cellsnp=''
existing_vireo=''
skip_preprocessing{
value=false
gt_match_file="" //We provide this if we want to exclude particular samples matched to a certain GT cohort from the adaptive qc
gt_match_based_adaptive_qc_exclusion_pattern = '' //We run the adaptive QC on these patterns independently, regardless of the assigned cell type.
file__anndata_merged = ''
file__cells_filtered = ''
}
genotype_phenotype_mapping_file =''
extra_sample_metadata = ''
use_phenotype_ids_for_gt_match = true //#if false this will keep the genotype ids; to use this, set genotype_phenotype_mapping_file to the path of a csv whose first column contains genotype ids and whose second column contains the phenotype ids to replace them with.
run_celltype_assignment = true
cluster_validate_resolution_keras = false
input_tables_column_delimiter = '\t'
output_dir = outdir= "${launchDir}/results"
do_deconvolution = true
split_bam = false
run_multiplet = true
utilise_gpu = true
split_ad_per_bach = false
cellbender_resolution_to_use='0pt1'
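// '0pt1' is 0.1 written filesystem-safe; assumed to select which CellBender FPR output is taken forward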
reference_assembly_fasta_dir = "https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly"
webtransfer = false
project_name = 'Cardinal_pilots'
run_with_genotype_input=false

eQTL{
eqtl_container = 'https://yascp.cog.sanger.ac.uk/public/singularity_images/eqtl_26_10_2022.img'
aggregation_collumn='Azimuth:predicted.celltype.l2'
n_min_cells = '5' // The minimum number of cells per individual to use.
n_min_individ = '30' //Do not select fewer than 25 since this may result in a permutation issue with tensorqtl
aggregation_method = 'dMean,dSum'
}

genotype_input {
posterior_assignment = false
subset_genotypes = false
full_vcf_file = ''
}

cellsnp {
run = true
remove_workdir = false
copy_mode = "rellink"
vcf_candidate_snps = "https://yascp.cog.sanger.ac.uk/public/cellsnp/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf.gz"
description = """// this list of candidate SNPs for cellSNP comes from link at https://github.com/single-cell-genetics/cellSNP
// i.e., https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf.gz/download"""
min_maf = "0.1"
min_count = "60"
p = "20"
}

vireo {
run = true
remove_workdir = false
copy_mode = "rellink"
run_gtmatch_aposteriori = true
}

plot_donor_ncells {
run = false
remove_workdir = false
copy_mode = "rellink"
plotnine_dpi = "100"
}

souporcell {
run = true
use_raw_barcodes = false
remove_workdir = false
copy_mode = "rellink"
reference_fasta = "https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly/genome.fa"
}


plot_souporcell_vs_vireo {
run = false
remove_workdir = false
copy_mode = "rellink"
}

cellsnp_recapture ='1'
split_h5ad_per_donor {
run = true
remove_workdir = false
copy_mode = "rellink"
input_h5_genome_version = "GRCh38"
print_modules_version = "True"
plot_n_cells_per_vireo_donor = "True"
write_donor_level_filtered_cells_h5 = "True"
plotnine_dpi = "100"
anndata_compression_level = "6"
}

}

process {
cache = 'lenient'

cpus = { 1 * task.attempt }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
queue = { task.attempt > 3 ? 'long' : 'normal' }
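// queue is picked per attempt: 'normal' for the first three attempts, 'long' after that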
containerOptions = " --cleanenv --containall -B "+params.tmpdir+":/tmp --env NUMBA_CACHE_DIR='"+params.tmpdir+"' --env MPLCONFIGDIR='"+params.tmpdir+"'"

errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
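// the listed exit codes trigger a retry: 137/143 are SIGKILL/SIGTERM (typically memory or scheduler kills), 134 SIGABRT, 139 SIGSEGV; any other failure lets running tasks finish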
maxRetries = 1
maxErrors = '-1'

// Process-specific resource requirements
// NOTE - Please try and re-use the labels below as much as possible.
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// # The queues differ between institutions, so please change them according to the run times.

withLabel:process_low {
queue = { task.attempt > 3 ? 'long' : 'normal' }
}
withLabel:medium_cpus {
cpus = { 2 * task.attempt }
memory = { 36.GB * task.attempt }
}

withLabel:process_medium {
queue = { task.attempt > 2 ? 'long' : 'normal' }
}
withLabel:process_medium_single_CPU {
queue = { task.attempt > 1 ? 'long' : 'normal' }
}
withLabel:many_cores_small_mem {
queue = { task.attempt > 1 ? 'long' : 'normal' }
}

withLabel:process_high {
queue = 'long'
}
withLabel:process_long {
queue = 'long'
}
withLabel:process_extralong {
queue = 'basement'
}


executor = 'lsf'
queue = { task.time < 20.m ? 'small' : task.time < 12.h ? 'normal' : task.time < 48.h ? 'long' : task.time < 168.h ? 'week' : 'basement' }
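// route by requested walltime: under 20m 'small', under 12h 'normal', under 48h 'long', under 168h 'week', otherwise 'basement'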
withLabel: gpu {
cpus = 1
maxForks=4

errorStrategy = 'retry'
queue = { task.attempt > 1 ? 'gpu-huge' : 'gpu-normal' }
clusterOptions = { "-M "+params.mem1*task.attempt+" -R 'select[ngpus>0 && mem>="+params.mem1*task.attempt+"] rusage[ngpus_physical=1.00,mem="+params.mem1*task.attempt+"] span[ptile=1]' -gpu 'mode=exclusive_process'" }
memory = '' // set to null '' as already specified in clusterOptions

time = { check_max( 12.h * task.attempt, 'time' ) }
clusterOptions = { "-M "+task.memory.toMega()+" -R 'select[ngpus>0 && mem>="+task.memory.toMega()+"] rusage[ngpus_physical=1.00,mem="+task.memory.toMega()+"] span[ptile=1]' -gpu 'mode=exclusive_process'" }
queue = { task.time > 12.h ? 'gpu-basement' : task.memory.toMega() > 80.GB ? 'gpu-huge' : 'gpu-normal' }
containerOptions = {
workflow.containerEngine == "singularity" ? '--containall --cleanenv --nv -B /tmp':
workflow.containerEngine == "singularity" ? '--containall --cleanenv --nv':
( workflow.containerEngine == "docker" ? '--gpus all': null )
}
}
2 changes: 1 addition & 1 deletion conf/test.conf
@@ -22,7 +22,7 @@ params {
vireo_with_gt=false // Vireo can run both with and without genotypes; here we define which mode to run it in.
posterior_assignment = false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed.
subset_genotypes = false
tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs.tsv" //this is a panel of vcf files that we want to compare the genotypes with
tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compare the genotypes with
}
hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defines what hard filters we want to use to flag/drop the cells

2 changes: 1 addition & 1 deletion conf/test_full.conf
@@ -22,7 +22,7 @@ params {
vireo_with_gt=false // Vireo can run both with and without genotypes; here we define which mode to run it in.
posterior_assignment = false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed.
subset_genotypes = false
tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs.tsv" //this is a panel of vcf files that we want to compare the genotypes with
tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compare the genotypes with
}
hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defines what hard filters we want to use to flag/drop the cells

2 changes: 1 addition & 1 deletion subworkflows/local/retrieve_recourses.nf
@@ -47,7 +47,7 @@ process RETRIEVE_RECOURSES_TEST_DATASET{
// }

if (params.profile=='test_full'){
get_full_test_data = 'mkdir full_test_dataset && cd full_test_dataset && wget https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/smaller_dataset.tar.gz && tar -xf smaller_dataset.tar.gz && rm smaller_dataset.tar.gz'
get_full_test_data = 'mkdir full_test_dataset && cd full_test_dataset && wget https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/smaller_dataset2.tar.gz -O smaller_dataset.tar.gz && tar -xf smaller_dataset.tar.gz && rm smaller_dataset.tar.gz'
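// wget -O pins the local filename to smaller_dataset.tar.gz, so the tar -xf and rm steps work unchanged for the v2 tarball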

}else{
get_full_test_data = ""
2 changes: 2 additions & 0 deletions subworkflows/main_deconvolution.nf
@@ -75,8 +75,10 @@ workflow main_deconvolution {
merged_expected_genotypes2 = merged_expected_genotypes.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps))
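// combine() pairs every pooled-genotype tuple with the candidate-SNP VCF path, so each pool carries the VCF alongside its merged genotypes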
// merged_expected_genotypes2.subscribe { println "merged_expected_genotypes2: $it" }
GT_MATCH_POOL_IBD(SUBSET_WORKF.out.samplename_subsetvcf_ibd,'Withing_expected','Expected')

DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION(merged_expected_genotypes2)
cellsnp_panels = DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION.out.cellsnp_pool_panel

informative_uninformative_sites = DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION.out.informative_uninformative_sites

// // If we have selected that we want to use all the genotypes as an input in the VCF file we will use the output of the MERGE_GENOTYPES_IN_ONE_VCF_SUBSET
