merge

wtsi-hgi · Nov 30, 2023 · 9061c80 · 9061c80
2 parents 5b61ac1 + edca4bf
commit 9061c80
Show file tree

Hide file tree

Showing 31 changed files with 4,315 additions and 1,279 deletions.
diff --git a/assets/deploy_scripts/bsub.sh b/assets/deploy_scripts/bsub.sh
@@ -21,5 +21,5 @@ if ["$varname" = ''];
 fi
 sample="$RUN_ID"
 echo -e "\n Submitting yascp (https://github.com/wtsi-hgi/yascp) with input file $INPUT_FILE"
-bsub -R'select[mem>8000] rusage[mem=8000]' -J $sample -n 1 -M 8000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf.sh $INPUT_FILE
+bsub -R'select[mem>8000] rusage[mem=8000]' -J $sample -n 1 -M 8000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf.sh $INPUT_FILE
 echo "Submitted job can be killed with: bkill -J $sample"
diff --git a/assets/deploy_scripts/bsub__removeWork.sh b/assets/deploy_scripts/bsub__removeWork.sh
@@ -5,5 +5,5 @@ INPUT_FILE=$1
 export RUN_ID="${PWD##*/}"
 sample="$RUN_ID.yascp"
 echo "Cleaning the work directory (https://github.com/wtsi-hgi/yascp) with input file $INPUT_FILE by using '-entry WORK_DIR_REMOVAL --remove_work_dir' "
-bsub -R'select[mem>4000] rusage[mem=4000]' -J $sample -n 1 -M 4000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh $INPUT_FILE
+bsub -R'select[mem>4000] rusage[mem=4000]' -J $sample -n 1 -M 4000 -o $sample.o -e $sample.e -q long bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh $INPUT_FILE
 echo "Submitted job can be killed with: bkill -J $sample"
diff --git a/assets/deploy_scripts/bsub_test.sh b/assets/deploy_scripts/bsub_test.sh
@@ -25,5 +25,5 @@ fi
 
 sample="$RUN_ID.yascp"
 echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in test mode withsample OneK1k dataset"
-bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_test -n 1 -M 4000 -o yascp_test.o -e yascp_test.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh
+bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_test -n 1 -M 4000 -o yascp_test.o -e yascp_test.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh
 echo "Submitted job can be killed with: bkill -J yascp_test"
diff --git a/assets/deploy_scripts/bsub_test_celltypes.sh b/assets/deploy_scripts/bsub_test_celltypes.sh
@@ -25,5 +25,5 @@ fi
 
 sample="$RUN_ID.yascp"
 echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in JUST_CELLTYPES mode with input file $INPUT_FILE"
-bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_celltypes -n 1 -M 4000 -o yascp_celltypes.o -e yascp_celltypes.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.2/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh  $INPUT_FILE
+bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_celltypes -n 1 -M 4000 -o yascp_celltypes.o -e yascp_celltypes.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh  $INPUT_FILE
 echo "Submitted job can be killed with: bkill -J yascp_celltypes"
diff --git a/assets/deploy_scripts/bsub_test_recluster.sh b/assets/deploy_scripts/bsub_test_recluster.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+#
+# Submit a yascp (https://github.com/wtsi-hgi/yascp) JUST_RECLUSTER run via bsub.
+#
+# Usage: bsub_test_recluster.sh INPUT_FILE
+#   INPUT_FILE  forwarded to nohup_start_nextflow_lsf_recluster.sh
+#
+# RUN_ID is the name of the current directory and the temp-dir variables
+# point at ./work/tmp, so run this from the run directory.
+
+INPUT_FILE=$1
+export RUN_ID="${PWD##*/}"
+
+# export SINGULARITY_CACHEDIR='/software/hgi/containers/yascp'
+
+export NXF_OPTS="-Xms5G -Xmx5G"
+export SINGULARITY_TMPDIR="$PWD/work/tmp"
+export TEMP="$PWD/work/tmp"
+export TMP_DIR="$PWD/work/tmp"
+
+echo 'press ENTER to NOT fetch containers, otherwise provide writable path:'
+read -r varname
+
+# An empty answer selects the shared container cache; anything else is taken
+# as the directory the containers are fetched into.
+if [[ -z "$varname" ]]; then
+    export NXF_SINGULARITY_CACHEDIR='/software/hgi/containers/yascp'
+    export SINGULARITY_DISABLE_CACHE=0
+else
+    echo "Yascp Will fetch the containers and place them in $varname"
+    export NXF_SINGULARITY_CACHEDIR="$varname"
+fi
+
+echo -e "\nSubmitting yascp (https://github.com/wtsi-hgi/yascp) in JUST_RECLUSTER mode with input file $INPUT_FILE"
+bsub -R'select[mem>4000] rusage[mem=4000]' -J yascp_cluster -n 1 -M 4000 -o yascp_cluster.o -e yascp_cluster.e -q normal bash /software/hgi/pipelines/yascp_versions/yascp_v1.3__work/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh "$INPUT_FILE"
+echo "Submitted job can be killed with: bkill -J yascp_cluster"
diff --git a/assets/deploy_scripts/input_setups/recluster_profile.nf b/assets/deploy_scripts/input_setups/recluster_profile.nf
@@ -0,0 +1,141 @@
+// Parameter and process overrides for the JUST_RECLUSTER entry point.
+// Passed to Nextflow with -c by nohup_start_nextflow_lsf_recluster.sh, ahead of
+// the user's own config file.
+
+params {
+
+    lisi{
+        run_process=true
+    }
+    replace_genotype_ids=false
+    write_h5=true
+    cluster_validate_resolution_keras = true
+    // run_celltype_assignment = true
+    project_name = 'T_Cell_Bio_Response'
+    filter_outliers = false
+    extra_sample_metadata =""
+    output_dir = outdir= "${launchDir}/recluster_resolutions"
+    cellex_cluster_markers=true
+    cluster_markers = false
+    normalise_andata = false
+    skip_handover = true
+    // output_dir = outdir= "${launchDir}/results"
+    // run_celltype_assignment=true
+    split_ad_per_bach=true //if not splitting the celltype assignment will be run on full tranche
+    // input_data_table = "$outdir/handover/Summary_plots/$RUN_ID/Fetch Pipeline/Input/input_table.tsv"
+    // cellbender_location="${output_dir}/nf-preprocessing/cellbender" //!!!!! if cellbender is run already then can skip this by selecting  input = 'existing_cellbender' instead input = 'cellbender'
+    // existing_cellsnp="${output_dir}/cellsnp"
+    // NOTE(review): the two values below are hard-coded absolute paths under a
+    // tmp_projects directory; presumably the user's config overrides them - confirm.
+    cellbender_location="/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/qc/results_11_09_2023/nf-preprocessing/cellbender" //!!!!! if cellbender is run already then can skip this by selecting  input = 'existing_cellbender' instead input = 'cellbender'
+    existing_cellsnp="/lustre/scratch123/hgi/teams/hgi/mo11/tmp_projects/harriet/qc/results/cellsnp"
+
+    skip_preprocessing = true
+    // file__anndata_merged = '/lustre/scratch126/humgen/projects/sc-eqtl-ibd/analysis/harriet_analysis/230313_hb58_yascp_analysis/231114_h5ad_files_for_MCC/231120_TCs_only_regressed_counts_HVGs.h5ad'
+
+    harmony{
+        run_process= true
+    }
+    umap{
+        run_process = true
+        colors_quantitative{
+            description = 'Comma separated string of quantitative variables that will be used to color points.'
+            value = 'n_cells,total_counts,pct_counts_gene_group__mito_transcript,prob_doublet,pct_counts_gene_group__ribo_rna,Azimuth:predicted.celltype.l2.score,Azimuth:mapping.score,log10_ngenes_by_count'
+        }
+        colors_categorical{
+            description = 'Comma separated string of categorical variables that will be used to color points.'
+            value = 'cell_passes_qc,cell_passes_qc-per:Azimuth:L0_predicted.celltype.l2,experiment_id,Azimuth:predicted.celltype.l2,Celltypist:Immune_All_Low:predicted_labels,Celltypist:Immune_All_High:predicted_labels,donor_id'
+        }
+    }
+
+    mads_categories ='pct_counts_gene_group__mito_transcript,pct_counts_gene_group__mito_protein,pct_counts_gene_group__ribo_protein,pct_counts_gene_group__ribo_rna,total_counts,n_genes_by_counts,log10_ngenes_by_count'
+    // hard_filters_file       = "${projectDir}/../sample_qc.yml"
+    // hard_filters_drop = false //#This indicates whether we want to drop the cells that fail hard filters of just flag them
+
+    cluster{
+        description = """Parameters for clustering. All pairwise combinations of
+        method and resolution will be performed."""
+        number_neighbors{
+            description = """Number of neighbors. If <= 0, uses number of unique
+            experiment_id."""
+            value = 15
+        }
+        methods{
+            description = 'Clustering method. Valid options [leiden|louvain].'
+            value = 'leiden'
+        }
+        resolutions{
+            description = 'Clustering resolution.'
+            value = [0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
+        }
+
+        variables_boxplot{
+            description = 'Generate boxplots of these variables for each cluster.'
+            value ='n_cells,total_counts,pct_counts_gene_group__mito_transcript'
+        }
+
+        known_markers{
+            run_process = false
+            description = """Files with markers that will be used to generate
+            dotplots. Each marker file should be the full path and have the
+            following columns: cell_type, hgnc_symbol. The following columns
+            are optional: p_value_adj. Use "" for a single entry in the
+            file_id and file value to indicate no plots."""
+            value = [
+                [ file_id: 'SmillieCS_31348891', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon/SmillieCS-31348891/database.tsv' ],
+                [ file_id: 'ParikhK_30814735', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon/ParikhK-30814735/database.tsv' ],
+                [ file_id: 'JamesKR_32066951', file: '/lustre/scratch119/humgen/projects/sc-eqtl-ibd/data/marker_gene_db-raw_data/database/celltypes/colon-immune/JamesKR-32066951/database.tsv' ]
+            ]
+        }
+    }
+    bbknn{
+        run_process = true
+    }
+
+    celltype_assignment{
+        run_celltype_assignment=false
+        run_azimuth=true
+        run_keras=false
+        run_celltypist=true
+    }
+    reduced_dims{
+        vars_to_regress{
+            value = ''
+        }
+    }
+
+}
+
+// Per-process overrides (container options, fork limits, memory).
+process {
+
+    withName: plot_distributions{
+        containerOptions = "--containall --cleanenv --workdir /tmp -B /tmp"
+    }
+
+    withName: cellex_cluster_markers{
+        maxForks=7
+        memory = 300.GB
+    }
+
+    withName: GATHER_DATA{
+        maxForks=7
+        memory = 100.GB
+    }
+    withName: LISI{
+        maxForks=7
+        memory = 300.GB
+    }
+    withName: cluster_validate_resolution_keras{
+        memory = 300.GB
+    }
+
+    withName: umap_calculate_and_plot{
+        memory = 300.GB
+    }
+
+    withName: sccaf_assess_clustering{
+        memory = 300.GB
+    }
+
+}
diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf.sh
@@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")"
 export RUN_ID="${PWD##*/}"
 mkdir $PWD/work || echo 'exists'
 mkdir $PWD/work/tmp || echo 'exists'
-echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
+echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -c $INPUT_FILE --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
 
 # get process PID 
 sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")

diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf__removeWork.sh
@@ -21,7 +21,7 @@ export RUN_ID="${PWD##*/}"
 # export TEMP=$PWD/tmp
 # export TMP_DIR=$PWD/tmp
 
-echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger  -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 & 
+echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger  -c $INPUT_FILE --nf_ci_loc $PWD -entry WORK_DIR_REMOVAL --remove_work_dir -resume > nextflow.nohup.log 2>&1 & 
 
 # get process PID 
 sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")

diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_celltypes.sh
@@ -17,7 +17,7 @@ parentdir="$(dirname "$CWD1")"
 export RUN_ID="${PWD##*/}"
 mkdir $PWD/work || echo 'exists'
 mkdir $PWD/work/tmp || echo 'exists'
-echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger -entry JUST_CELLTYPES -c $INPUT_FILE  --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
+echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger -entry JUST_CELLTYPES -c $INPUT_FILE  --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
 
 # get process PID 
 sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")

diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_recluster.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+#
+# Start the yascp JUST_RECLUSTER entry point with Nextflow in the background.
+# Output goes to ./nextflow.nohup.log and the PID to ./nextflow.nohup.PID.txt.
+#
+# Usage: nohup_start_nextflow_lsf_recluster.sh [INPUT_FILE]
+#   INPUT_FILE  passed to Nextflow as an extra -c file after recluster_profile.nf
+
+INPUT_FILE=$1
+pipeline_dir=/software/hgi/pipelines/yascp_versions/yascp_v1.3__work
+
+# keep the previous run's log under a timestamped name
+dt=$(date +"%Y_%m_%d_%T")
+if [[ -f nextflow.nohup.log ]]; then
+    cp nextflow.nohup.log "./nextflow.nohup_$dt.log2"
+else
+    echo 'first time running'
+fi
+
+# clean up previous run files
+rm -f -- *.log
+rm -f nextflow.nohup.PID.txt
+
+# start Nextflow in background:
+export NXF_OPTS="-Xms5G -Xmx5G"
+
+export RUN_ID="${PWD##*/}"
+mkdir -p "$PWD/work/tmp"
+
+# Add the second -c only when a file was given; otherwise -c would be
+# followed directly by --nf_ci_loc.
+extra_config=()
+if [[ -n "$INPUT_FILE" ]]; then
+    extra_config=(-c "$INPUT_FILE")
+fi
+echo "$RUN_ID" | nextflow run "$pipeline_dir" -profile sanger -entry JUST_RECLUSTER -c "$pipeline_dir/assets/deploy_scripts/input_setups/recluster_profile.nf" "${extra_config[@]}" --nf_ci_loc "$PWD" -resume > nextflow.nohup.log 2>&1 &
+
+# get process PID
+# RUN_DIR is not assigned in this script; fall back to the --nf_ci_loc value
+# used above so the pattern does not end right after the option name.
+sleep 1
+PID=$(pgrep -f "\\-\\-nf_ci_loc ${RUN_DIR:-$PWD}")
+export PID
+echo "$PID" > nextflow.nohup.PID.txt
+echo "Nextflow PID is $PID (saved in ./nextflow.nohup.PID.txt)"
+echo "kill with \"kill $PID\""
+echo "check logs files nextflow.nohup.log and .nextflow.log"
diff --git a/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh b/assets/deploy_scripts/nohup_start_nextflow_lsf_test.sh
@@ -16,7 +16,7 @@ parentdir="$(dirname "$CWD1")"
 export RUN_ID="${PWD##*/}"
 mkdir $PWD/work || echo 'exists'
 mkdir $PWD/work/tmp || echo 'exists'
-echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.2 -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
+echo $RUN_ID | nextflow run /software/hgi/pipelines/yascp_versions/yascp_v1.3__work -profile sanger,test --nf_ci_loc $PWD -resume > nextflow.nohup.log 2>&1 & 
 
 # get process PID 
 sleep 1 && export PID=$(pgrep -f "\\-\\-nf_ci_loc $RUN_DIR")

diff --git a/bin/0026-plot_filtered_cells.py b/bin/0026-plot_filtered_cells.py
@@ -67,13 +67,16 @@ def main():
     # Check if any difference between before and after filters.	If not,
     # return early.
     df_after_filters = df[df.filter_type.isin(['after_filters'])]
-    filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[
-        df_after_filters.experiment_id,
-        'n_cells_left_in_adata'
-    ].values
-    if all(filt):
-        print("No difference detected before and after filters. No plots.")
-        return()
+    try:
+        filt = df_after_filters.n_cells_left_in_adata == df_before_filters.loc[
+            df_after_filters.experiment_id,
+            'n_cells_left_in_adata'
+        ].values
+        if all(filt):
+            print("No difference detected before and after filters. No plots.")
+            return()
+    except:
+            return()
 
     # Set some plotting parameters
     plt_height = 16  # 1.5 * df.experiment_id.nunique()

diff --git a/bin/0028-plot_predicted_sex.py b/bin/0028-plot_predicted_sex.py
@@ -60,7 +60,10 @@ def main():
 
     # Load the AnnData file
     adata = sc.read_h5ad(filename=options.h5)
-
+    try:
+        adata.X=adata.layers['counts']
+    except:
+        _='counts may be already set'
     # If we have a flag for cells that pass QC then filter down to them
     if 'cell_passes_qc' in adata.obs:
         adata = adata[adata.obs['cell_passes_qc'], :]

diff --git a/bin/0030-estimate_pca_elbow.py b/bin/0030-estimate_pca_elbow.py
@@ -78,7 +78,10 @@ def main():
 
     # Read in the dataframe
     adata = sc.read_h5ad(filename=options.h5)
-
+    try:
+        adata.X=adata.layers['counts']
+    except:
+        _='counts may be already set'
     kneedle_dict = {}
     output_dict = {}