test dataset improved
maxozo committed Oct 24, 2023
1 parent caa236f commit 840a4e8
Showing 9 changed files with 17 additions and 186 deletions.
2 changes: 1 addition & 1 deletion assets/deploy_scripts/nohup_start_nextflow_lsf.sh
@@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")"
export RUN_ID="${PWD##*/}"
mkdir $PWD/work || echo 'exists'
mkdir $PWD/work/tmp || echo 'exists'
echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 &
echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger -c /software/hgi/pipelines/yascp/conf/extra_confs/sanger/base.conf -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 &

# get process PID
sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
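# pgrep -f matches the full command line of the backgrounded Nextflow run;
# the dashes are escaped so the pattern itself is not parsed as a pgrep option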
@@ -21,7 +21,7 @@ export RUN_ID="${PWD##*/}"
# export TEMP=$PWD/tmp
# export TMP_DIR=$PWD/tmp

echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 &
echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 &

# get process PID
sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
2 changes: 1 addition & 1 deletion assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh
@@ -16,7 +16,7 @@ parentdir="$(dirname "$CWD1")"
export RUN_ID="${PWD##*/}"
mkdir $PWD/work || echo 'exists'
mkdir $PWD/work/tmp || echo 'exists'
echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 &
echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp -profile sanger,test -c /software/hgi/pipelines/yascp/conf/extra_confs/sanger/base.conf --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 &

# get process PID
sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")
7 changes: 4 additions & 3 deletions conf/base.conf
@@ -13,6 +13,7 @@ params{
rsync_to_web_file="${launchDir}/yascp/bin/rsync_to_web.sh"
profile='normal_run'
citeseq=false
estimate_and_provide_informative_snps_for_deconvolution=false
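//# (assumption based on this commit's other changes: when true, the pipeline derives an informative SNP panel for deconvolution; cf. DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION)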
//# These are default parameters that can be overwritten to run in a different mode.
//# Here we have listed the default parameters used when running without any extra input.
tmpdir = "${launchDir}/work"
@@ -244,8 +245,8 @@ process {
cpus = 1
maxForks=4
errorStrategy = 'retry'
memory = '' // set to null '' as already specified in clusterOptions
time = { 12.h * task.attempt }
memory = 20.GB // now set explicitly rather than left to clusterOptions
time = { 6.h * task.attempt }
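// with errorStrategy 'retry', task.attempt increments on each retry, so a timed-out task gets 12.h on its second attempt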
}

withName:cluster_validate_resolution_keras{
@@ -329,7 +330,7 @@ process {
withName: DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION{
cpus = 5
time = { 5.h * task.attempt }
memory = { 30.GB * task.attempt }
memory = { 60.GB * task.attempt }
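// memory also scales with the attempt: 60 GB on the first try, 120 GB on a retry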
}

withName: VIREO{
182 changes: 5 additions & 177 deletions conf/extra_confs/sanger/base.conf
@@ -8,187 +8,15 @@
----------------------------------------------------------------------------------------
*/

params{
input = 'existing_cellbender'
rsync_to_web_file="${launchDir}/yascp/bin/rsync_to_web.sh"
profile='normal_run'
//# These are default parameters that can be overwritten to run in a different mode.
//# Here we have listed the default parameters used when running without any extra input.
tmpdir = "${launchDir}/work"
cohorts_to_drop_from_GT_Relatednes_check=''
hard_filters_file = "no_file__file_sample_qc" //# This may point to the sample_qc.yml input which will apply hard filters to the merged cells.
hard_filters_drop = false //#This indicates whether we want to drop the cells that fail hard filters of just flag them
encrypt = false
write_h5=false
cellbender_location="${launchDir}/results"
skip_handover = false
RUN='default'
skip_qc=false
skip_merge=false
just_reports=false
add_donor_metadata = false
cellex_cluster_markers=false
mem1= 12000
copy_mode = "rellink"
split_bam = false
existing_cellsnp=''
existing_vireo=''
skip_preprocessing{
value=false
gt_match_file="" //We provide this if we want to exclude particular samples matched to a certain GT cohort from the adaptive qc
gt_match_based_adaptive_qc_exclusion_pattern = '' //We run the adaptive QC on these patterns independently, regardless of the assigned cell type.
file__anndata_merged = ''
file__cells_filtered = ''
}
genotype_phenotype_mapping_file =''
extra_sample_metadata = ''
use_phenotype_ids_for_gt_match = true //#if false this will keep the genotype ids; to use this, set genotype_phenotype_mapping_file to the path of a csv whose first column contains genotype ids and whose second column contains the phenotype ids to replace them with.
run_celltype_assignment = true
cluster_validate_resolution_keras = false
input_tables_column_delimiter = '\t'
output_dir = outdir= "${launchDir}/results"
do_deconvolution = true
split_bam = false
run_multiplet = true
utilise_gpu = true
split_ad_per_bach = false
cellbender_resolution_to_use='0pt1'
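// '0pt1' is 0.1 written filesystem-safe; assumed to select which CellBender FPR output is taken forward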
reference_assembly_fasta_dir = "https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly"
webtransfer = false
project_name = 'Cardinal_pilots'
run_with_genotype_input=false

eQTL{
eqtl_container = 'https://yascp.cog.sanger.ac.uk/public/singularity_images/eqtl_26_10_2022.img'
aggregation_collumn='Azimuth:predicted.celltype.l2'
n_min_cells = '5' // The minimum number of cells per individual to use.
n_min_individ = '30' //Do not select fewer than 25 since this may result in a permutation issue with tensorqtl
aggregation_method = 'dMean,dSum'
}

genotype_input {
posterior_assignment = false
subset_genotypes = false
full_vcf_file = ''
}

cellsnp {
run = true
remove_workdir = false
copy_mode = "rellink"
vcf_candidate_snps = "https://yascp.cog.sanger.ac.uk/public/cellsnp/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf.gz"
description = """// this list of candidate SNPs for cellSNP comes from link at https://github.com/single-cell-genetics/cellSNP
// i.e., https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf.gz/download"""
min_maf = "0.1"
min_count = "60"
p = "20"
}

vireo {
run = true
remove_workdir = false
copy_mode = "rellink"
run_gtmatch_aposteriori = true
}

plot_donor_ncells {
run = false
remove_workdir = false
copy_mode = "rellink"
plotnine_dpi = "100"
}

souporcell {
run = true
use_raw_barcodes = false
remove_workdir = false
copy_mode = "rellink"
reference_fasta = "https://yascp.cog.sanger.ac.uk/public/10x_reference_assembly/genome.fa"
}


plot_souporcell_vs_vireo {
run = false
remove_workdir = false
copy_mode = "rellink"
}

cellsnp_recapture ='1'
split_h5ad_per_donor {
run = true
remove_workdir = false
copy_mode = "rellink"
input_h5_genome_version = "GRCh38"
print_modules_version = "True"
plot_n_cells_per_vireo_donor = "True"
write_donor_level_filtered_cells_h5 = "True"
plotnine_dpi = "100"
anndata_compression_level = "6"
}

}

process {
cache = 'lenient'

cpus = { 1 * task.attempt }
memory = { 6.GB * task.attempt }
time = { 4.h * task.attempt }
queue = { task.attempt > 3 ? 'long' : 'normal' }
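// queue is picked per attempt: 'normal' for the first three attempts, 'long' after that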
containerOptions = " --cleanenv --containall -B "+params.tmpdir+":/tmp --env NUMBA_CACHE_DIR='"+params.tmpdir+"' --env MPLCONFIGDIR='"+params.tmpdir+"'"

errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
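// the listed exit codes trigger a retry: 137/143 are SIGKILL/SIGTERM (typically memory or scheduler kills), 134 SIGABRT, 139 SIGSEGV; any other failure lets running tasks finish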
maxRetries = 1
maxErrors = '-1'

// Process-specific resource requirements
// NOTE - Please try and re-use the labels below as much as possible.
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// # The queues differ between institutions, so please change them according to the run times.

withLabel:process_low {
queue = { task.attempt > 3 ? 'long' : 'normal' }
}
withLabel:medium_cpus {
cpus = { 2 * task.attempt }
memory = { 36.GB * task.attempt }
}

withLabel:process_medium {
queue = { task.attempt > 2 ? 'long' : 'normal' }
}
withLabel:process_medium_single_CPU {
queue = { task.attempt > 1 ? 'long' : 'normal' }
}
withLabel:many_cores_small_mem {
queue = { task.attempt > 1 ? 'long' : 'normal' }
}

withLabel:process_high {
queue = 'long'
}
withLabel:process_long {
queue = 'long'
}
withLabel:process_extralong {
queue = 'basement'
}


executor = 'lsf'
queue = { task.time < 20.m ? 'small' : task.time < 12.h ? 'normal' : task.time < 48.h ? 'long' : task.time < 168.h ? 'week' : 'basement' }
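// route by requested walltime: under 20m 'small', under 12h 'normal', under 48h 'long', under 168h 'week', otherwise 'basement'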
withLabel: gpu {
cpus = 1
maxForks=4

errorStrategy = 'retry'
queue = { task.attempt > 1 ? 'gpu-huge' : 'gpu-normal' }
clusterOptions = { "-M "+params.mem1*task.attempt+" -R 'select[ngpus>0 && mem>="+params.mem1*task.attempt+"] rusage[ngpus_physical=1.00,mem="+params.mem1*task.attempt+"] span[ptile=1]' -gpu 'mode=exclusive_process'" }
memory = '' // set to null '' as already specified in clusterOptions

time = { check_max( 12.h * task.attempt, 'time' ) }
clusterOptions = { "-M "+task.memory.toMega()+" -R 'select[ngpus>0 && mem>="+task.memory.toMega()+"] rusage[ngpus_physical=1.00,mem="+task.memory.toMega()+"] span[ptile=1]' -gpu 'mode=exclusive_process'" }
queue = { task.time > 12.h ? 'gpu-basement' : task.memory.toMega() > 80.GB ? 'gpu-huge' : 'gpu-normal' }
containerOptions = {
workflow.containerEngine == "singularity" ? '--containall --cleanenv --nv -B /tmp':
workflow.containerEngine == "singularity" ? '--containall --cleanenv --nv':
( workflow.containerEngine == "docker" ? '--gpus all': null )
}
}
2 changes: 1 addition & 1 deletion conf/test.conf
@@ -22,7 +22,7 @@ params {
vireo_with_gt=false // Vireo can run both with and without genotypes; here we define which mode to run it in.
posterior_assignment = false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed.
subset_genotypes = false
tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs.tsv" //this is a panel of vcf files that we want to compare the genotypes with
tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compare the genotypes with
}
hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defines what hard filters we want to use to flag/drop the cells

2 changes: 1 addition & 1 deletion conf/test_full.conf
@@ -22,7 +22,7 @@ params {
vireo_with_gt=false // Vireo can run both with and without genotypes; here we define which mode to run it in.
posterior_assignment = false //if this is set to true, we will perform the genotype donor matching after the deconvolution is performed.
subset_genotypes = false
tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs.tsv" //this is a panel of vcf files that we want to compare the genotypes with
tsv_donor_panel_vcfs = "https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/vcf_inputs_v2.tsv" //this is a panel of vcf files that we want to compare the genotypes with
}
hard_filters_file = "${projectDir}/sample_input/sample_qc.yml" //this file defines what hard filters we want to use to flag/drop the cells

2 changes: 1 addition & 1 deletion subworkflows/local/retrieve_recourses.nf
@@ -47,7 +47,7 @@ process RETRIEVE_RECOURSES_TEST_DATASET{
// }

if (params.profile=='test_full'){
get_full_test_data = 'mkdir full_test_dataset && cd full_test_dataset && wget https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/smaller_dataset.tar.gz && tar -xf smaller_dataset.tar.gz && rm smaller_dataset.tar.gz'
get_full_test_data = 'mkdir full_test_dataset && cd full_test_dataset && wget https://yascp.cog.sanger.ac.uk/public/test_datasets/full_test_dataset/smaller_dataset2.tar.gz -O smaller_dataset.tar.gz && tar -xf smaller_dataset.tar.gz && rm smaller_dataset.tar.gz'
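// wget -O pins the local filename to smaller_dataset.tar.gz, so the tar -xf and rm steps work unchanged for the v2 tarball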

}else{
get_full_test_data = ""
2 changes: 2 additions & 0 deletions subworkflows/main_deconvolution.nf
@@ -75,8 +75,10 @@ workflow main_deconvolution {
merged_expected_genotypes2 = merged_expected_genotypes.combine(Channel.fromPath(params.cellsnp.vcf_candidate_snps))
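// combine() pairs every pooled-genotype tuple with the candidate-SNP VCF path, so each pool carries the VCF alongside its merged genotypes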
// merged_expected_genotypes2.subscribe { println "merged_expected_genotypes2: $it" }
GT_MATCH_POOL_IBD(SUBSET_WORKF.out.samplename_subsetvcf_ibd,'Withing_expected','Expected')

DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION(merged_expected_genotypes2)
cellsnp_panels = DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION.out.cellsnp_pool_panel

informative_uninformative_sites = DYNAMIC_DONOR_EXCLUSIVE_SNP_SELECTION.out.informative_uninformative_sites

// // If we have selected that we want to use all the genotypes as an input in the VCF file we will use the output of the MERGE_GENOTYPES_IN_ONE_VCF_SUBSET
