From fdd8dc0790eea403a3a730f5cca2ebf0943a4235 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 25 Oct 2024 15:04:32 +0100 Subject: [PATCH 001/159] Update main.nf - adding wf process names --- main.nf | 306 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) diff --git a/main.nf b/main.nf index 8b137891..099c6710 100644 --- a/main.nf +++ b/main.nf @@ -1 +1,307 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// Define inputs as channels +Channel.fromPath('genemeta_data.txt').set { genemeta } +Channel.fromPath('genes_data.txt').set { genes } +Channel.fromPath('barcodes_data.txt').set { barcodes } +Channel.fromPath('matrix_data.txt').set { matrix } +Channel.fromPath('cellmeta_data.txt').set { cellmeta } +Channel.value('X_pca').set { pca_param } +Channel.value('NO_CELLTYPE_FIELD').set { celltype_field_param } +Channel.value('').set { batch_variable } +Channel.value(['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']).set { perplexity_values } +Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']).set { resolution_values } + +/* + * Column_rearrange_1: Only keeps the specified columns and removes header + */ +process Column_rearrange_1 { + input: + + output: + + script: + """ + """ +} + +/* + * Column_rearrange_2: Only keeps the specified columns and removes header + */ +process Column_rearrange_2 { + // Set the output file + input: + + output: + + script: + """ + """ +} + +/* + * mergeGeneFiles: Merges gene file with genemeta on column 1, and keeps column1 and 4 + */ +process mergeGeneFiles { + input: + + output: + + script: + """ + """ +} + +process scanpy_read_10x { + input: + + output: + + script: + """ + """ +} + +process scanpy_filter_cells { + input: + + output: + + script: + """ + """ +} + +process scanpy_filter_genes { + input: + + output: + + script: + """ + """ +} + +process normalise_data { + input: + + output: + + script: + """ + """ +} + +process normalise_data_internal 
{ + input: + + output: + + script: + """ + """ +} + +process find_variable_genes { + input: + + output: + + script: + """ + """ +} + +process run_pca { + input: + + output: + + script: + """ + """ +} + +process harmony_batch { + input: + + output: + + script: + """ + """ +} + +process neighbours { + input: + + output: + + script: + """ + """ +} + +process neighbours_for_umap { + input: + + output: + + script: + """ + """ +} + +process normalise_data { + input: + + output: + + script: + """ + """ +} + +process find_clusters { + input: + + output: + + script: + """ + """ +} + +process meta_vars { + input: + + output: + + script: + """ + """ +} + +process clustering_slotnames { + input: + + output: + + script: + """ + """ +} + +process merge_group_slotnames { + input: + + output: + + script: + """ + """ +} + +process merge_collections { + input: + + output: + + script: + """ + """ +} + +process build_list { + input: + + output: + + script: + """ + """ +} + +process find_markers { + input: + + output: + + script: + """ + """ +} + +process filtered_cellgroup_markers { + input: + + output: + + script: + """ + """ +} + +process run_umap { + input: + + output: + + script: + """ + """ +} + +process run_tsne { + input: + + output: + + script: + """ + """ +} + +process filter_failed_umap { + input: + + output: + + script: + """ + """ +} + +process filer_failed_tsne { + input: + + output: + + script: + """ + """ +} + +process merge_embeddings { + input: + + output: + + script: + """ + """ +} + + + +process make_project_file { + input: + + output: + + script: + """ + """ +} From fb359c80142ce5cacd2b80a44d6b7ab5528f9681 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 25 Oct 2024 15:16:05 +0100 Subject: [PATCH 002/159] Update main.nf - populate Column_rearrange processes --- main.nf | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/main.nf b/main.nf index 099c6710..6b4752d5 100644 --- a/main.nf +++ b/main.nf @@ -18,12 +18,27 @@ 
Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']).s * Column_rearrange_1: Only keeps the specified columns and removes header */ process Column_rearrange_1 { + // Set the output file input: + path genemeta + val col output: + path 'filtered_genemeta.txt' script: """ + # Find the column number of the specified gene_id column name + col_num=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col\$" | cut -d: -f1) + + # If column is found, extract it; otherwise, raise an error + if [[ -z "\$col_num" ]]; then + echo "Error: Column '$col' not found in $genemeta" >&2 + exit 1 + fi + + # Extract the gene_id column (without the header) + tail -n +2 "$genemeta" | cut -f\$col_num > filtered_genemeta.txt """ } @@ -33,11 +48,27 @@ process Column_rearrange_1 { process Column_rearrange_2 { // Set the output file input: + path genemeta + val col1 + val col2 output: + path 'filtered_genemeta_2.txt' script: """ + # Find the column number of the specified gene_id column name + col_num_1=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col1\$" | cut -d: -f1) + col_num_2=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col2\$" | cut -d: -f1) + + # If either column is not found, raise an error + if [[ -z "\$col1_num" || -z "\$col2_num" ]]; then + echo "Error: Column '$col1' or '$col2' not found in $genemeta" >&2 + exit 1 + fi + + # Extract the gene_id column (without the header) + tail -n +2 "$genemeta" | cut -f\$col1_num,\$col2_num > filtered_genemeta.txt """ } From 47d6662c71f8413f89d2cdf27182bb56c7da21bc Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 25 Oct 2024 15:16:46 +0100 Subject: [PATCH 003/159] Update main.nf - populate mergeGeneFiles --- main.nf | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/main.nf b/main.nf index 6b4752d5..010c3599 100644 --- a/main.nf +++ b/main.nf @@ -77,11 +77,20 @@ process Column_rearrange_2 { */ process mergeGeneFiles { input: + path gene from params.genes + path filtered_genemeta from 
Column_rearrange_1.out output: + path params.output script: """ + # Sort both files by the first column for join compatibility + sort -k1,1 "$gene" > sorted_gene.txt + sort -k1,1 filtered_genemeta.txt > sorted_genemeta.txt + + # Perform a left join to keep all data from gene file + join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > ${params.output} """ } From f87d49b8c58b90b329aa0a8ddad4a8dba79d924c Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Fri, 25 Oct 2024 15:47:31 +0100 Subject: [PATCH 004/159] add nextflow.config for Slurm --- nextflow.config | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/nextflow.config b/nextflow.config index 8b137891..1de8fc38 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1 +1,20 @@ +process { + executor='slurm' + queue="$SCXA_HPC_QUEUE" + clusterOptions="$SCXA_HPC_OPTIONS" + time = '7 d' + memory = '4 GB' + queueSize=500 + exitReadTimeout='100000 sec' + pollInterval = '5sec' +} +conda { + cacheDir = "$SCXA_WORKFLOW_ROOT/envs" + createTimeout = "30 min" + useMamba = true +} + +params { + +} From 7169e98d309a7a7b722723e8864927db4a21c32b Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 25 Oct 2024 16:02:58 +0100 Subject: [PATCH 005/159] Update main.nf - moving input channels to workflow --- main.nf | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 010c3599..8dfb3fd8 100644 --- a/main.nf +++ b/main.nf @@ -77,20 +77,20 @@ process Column_rearrange_2 { */ process mergeGeneFiles { input: - path gene from params.genes - path filtered_genemeta from Column_rearrange_1.out + path gene + path filtered_genemeta output: path params.output script: """ - # Sort both files by the first column for join compatibility - sort -k1,1 "$gene" > sorted_gene.txt - sort -k1,1 filtered_genemeta.txt > sorted_genemeta.txt - - # Perform a left join to keep all data from gene file - join -a 1 -e 'NA' -t '\t' 
sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > ${params.output} + # Sort both files by the first column for join compatibility + sort -k1,1 "$gene" > sorted_gene.txt + sort -k1,1 filtered_genemeta.txt > sorted_genemeta.txt + + # Perform a left join to keep all data from gene file + join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > ${params.output} """ } @@ -98,6 +98,10 @@ process scanpy_read_10x { input: output: + path anndata + + conda: + script: """ @@ -345,3 +349,23 @@ process make_project_file { """ """ } + +workflow { + + // Create input channel (single file via CLI parameter) + genemeta = Channel.fromPath('genemeta_data.txt') + genes = Channel.fromPath('genes_data.txt') + barcodes = Channel.fromPath('barcodes_data.txt') + matrix = Channel.fromPath('matrix_data.txt') + cellmeta = Channel.fromPath('cellmeta_data.txt') + pca_param = Channel.value('X_pca') + celltype_field_param = Channel.value('NO_CELLTYPE_FIELD') + batch_variable = Channel.value('') + perplexity_values = Channel.value(['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']) + resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']) + + + // Create index file for input BAM file + Column_rearrange_1(genemeta, "gene_id") + Column_rearrange_2(genemeta, "gene_id", "gene_name") +} From b7de1405b452bcbf8551dac9adff71f05bc04784 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 25 Oct 2024 16:04:20 +0100 Subject: [PATCH 006/159] Update main.nf - deleting redundant input channels --- main.nf | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/main.nf b/main.nf index 8dfb3fd8..bc04c954 100644 --- a/main.nf +++ b/main.nf @@ -2,18 +2,6 @@ nextflow.enable.dsl=2 -// Define inputs as channels -Channel.fromPath('genemeta_data.txt').set { genemeta } -Channel.fromPath('genes_data.txt').set { genes } -Channel.fromPath('barcodes_data.txt').set { barcodes } -Channel.fromPath('matrix_data.txt').set { matrix } 
-Channel.fromPath('cellmeta_data.txt').set { cellmeta } -Channel.value('X_pca').set { pca_param } -Channel.value('NO_CELLTYPE_FIELD').set { celltype_field_param } -Channel.value('').set { batch_variable } -Channel.value(['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']).set { perplexity_values } -Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']).set { resolution_values } - /* * Column_rearrange_1: Only keeps the specified columns and removes header */ From aab1b63898336acc128e3890beca4b06c786036d Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 25 Oct 2024 16:08:53 +0100 Subject: [PATCH 007/159] Update main.nf - adds wf step for mergeGeneFile --- main.nf | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index bc04c954..1bbea5cf 100644 --- a/main.nf +++ b/main.nf @@ -56,7 +56,7 @@ process Column_rearrange_2 { fi # Extract the gene_id column (without the header) - tail -n +2 "$genemeta" | cut -f\$col1_num,\$col2_num > filtered_genemeta.txt + tail -n +2 "$genemeta" | cut -f\$col1_num,\$col2_num > filtered_genemeta_2.txt """ } @@ -75,7 +75,7 @@ process mergeGeneFiles { """ # Sort both files by the first column for join compatibility sort -k1,1 "$gene" > sorted_gene.txt - sort -k1,1 filtered_genemeta.txt > sorted_genemeta.txt + sort -k1,1 "$filtered_genemeta" > sorted_genemeta.txt # Perform a left join to keep all data from gene file join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > ${params.output} @@ -356,4 +356,8 @@ workflow { // Create index file for input BAM file Column_rearrange_1(genemeta, "gene_id") Column_rearrange_2(genemeta, "gene_id", "gene_name") + mergeGeneFiles( + genes, + Column_rearrange_2.out + ) } From 681aeb5439d850299f40a46b4c816e50857a8c7c Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 25 Oct 2024 16:15:59 +0100 Subject: [PATCH 008/159] Update main.nf - adds workflow step for scanpy-read-10x --- main.nf | 23 
+++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/main.nf b/main.nf index 1bbea5cf..a9e83bdb 100644 --- a/main.nf +++ b/main.nf @@ -84,6 +84,11 @@ process mergeGeneFiles { process scanpy_read_10x { input: + path matrix + path mergeGeneFiles.out + path barcodes + path cellmeta + path genemeta output: path anndata @@ -93,6 +98,17 @@ process scanpy_read_10x { script: """ + ln -s $matrix matrix.mtx + ln -s $genes genes.tsv + ln -s $barcodes barcodes.tsv + + scanpy-read-10x --input-10x-mtx ./ \ + --var-names 'gene_ids' \ + --extra-obs $cellmeta \ + --extra-var $genemeta \ + --show-obj stdout \ + --output-format anndata \ + $anndata """ } @@ -360,4 +376,11 @@ workflow { genes, Column_rearrange_2.out ) + scanpy_read_10x( + matrix, + mergeGeneFiles.out, + barcodes, + cellmeta, + genemeta + ) } From 4cec90b286ad8d9a243decbd26f6bba799649235 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 25 Oct 2024 16:35:05 +0100 Subject: [PATCH 009/159] Update main.nf - fixing typo --- main.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index a9e83bdb..74c9fa8b 100644 --- a/main.nf +++ b/main.nf @@ -50,13 +50,13 @@ process Column_rearrange_2 { col_num_2=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col2\$" | cut -d: -f1) # If either column is not found, raise an error - if [[ -z "\$col1_num" || -z "\$col2_num" ]]; then + if [[ -z "\$col_num_1" || -z "\$col_num_2" ]]; then echo "Error: Column '$col1' or '$col2' not found in $genemeta" >&2 exit 1 fi # Extract the gene_id column (without the header) - tail -n +2 "$genemeta" | cut -f\$col1_num,\$col2_num > filtered_genemeta_2.txt + tail -n +2 "$genemeta" | cut -f\$col_num_1,\$col_num_2 > filtered_genemeta_2.txt """ } @@ -357,11 +357,11 @@ process make_project_file { workflow { // Create input channel (single file via CLI parameter) - genemeta = Channel.fromPath('genemeta_data.txt') - genes = Channel.fromPath('genes_data.txt') - barcodes = 
Channel.fromPath('barcodes_data.txt') + genemeta = Channel.fromPath('gene_metadata.tsv') + genes = Channel.fromPath('genes.tsv') + barcodes = Channel.fromPath('barcodes_data.tsv') matrix = Channel.fromPath('matrix_data.txt') - cellmeta = Channel.fromPath('cellmeta_data.txt') + cellmeta = Channel.fromPath('cell_metadata.tsv') pca_param = Channel.value('X_pca') celltype_field_param = Channel.value('NO_CELLTYPE_FIELD') batch_variable = Channel.value('') From 9842c6a2c8caef565d2e8eecfdc04de82e5f114c Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 25 Oct 2024 16:46:47 +0100 Subject: [PATCH 010/159] Update main.nf - fixes input file names, process mergeGeneFiles and scanpy_read_10x --- main.nf | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/main.nf b/main.nf index 74c9fa8b..1f854078 100644 --- a/main.nf +++ b/main.nf @@ -69,7 +69,7 @@ process mergeGeneFiles { path filtered_genemeta output: - path params.output + path 'merged_genemeta.tsv' script: """ @@ -78,29 +78,26 @@ process mergeGeneFiles { sort -k1,1 "$filtered_genemeta" > sorted_genemeta.txt # Perform a left join to keep all data from gene file - join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > ${params.output} + join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > merged_genemeta.tsv """ } process scanpy_read_10x { input: path matrix - path mergeGeneFiles.out + path genes path barcodes path cellmeta path genemeta output: - path anndata - - conda: - + path 'anndata.h5ad' script: """ - ln -s $matrix matrix.mtx - ln -s $genes genes.tsv - ln -s $barcodes barcodes.tsv + #ln -s $matrix matrix.mtx + #ln -s $genes genes.tsv + #ln -s $barcodes barcodes.tsv scanpy-read-10x --input-10x-mtx ./ \ --var-names 'gene_ids' \ @@ -108,7 +105,7 @@ process scanpy_read_10x { --extra-var $genemeta \ --show-obj stdout \ --output-format anndata \ - $anndata + 'anndata.h5ad' """ } @@ -359,8 +356,8 @@ workflow { // Create input channel 
(single file via CLI parameter) genemeta = Channel.fromPath('gene_metadata.tsv') genes = Channel.fromPath('genes.tsv') - barcodes = Channel.fromPath('barcodes_data.tsv') - matrix = Channel.fromPath('matrix_data.txt') + barcodes = Channel.fromPath('barcodes.tsv') + matrix = Channel.fromPath('matrix.mtx') cellmeta = Channel.fromPath('cell_metadata.tsv') pca_param = Channel.value('X_pca') celltype_field_param = Channel.value('NO_CELLTYPE_FIELD') From c23bb3abe5554afc4dcb068f0974f86b577dcad3 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 09:56:13 +0000 Subject: [PATCH 011/159] removes duplicated process and renames a file --- main.nf | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/main.nf b/main.nf index 1f854078..9797f532 100644 --- a/main.nf +++ b/main.nf @@ -199,16 +199,6 @@ process neighbours_for_umap { """ } -process normalise_data { - input: - - output: - - script: - """ - """ -} - process find_clusters { input: @@ -354,7 +344,7 @@ process make_project_file { workflow { // Create input channel (single file via CLI parameter) - genemeta = Channel.fromPath('gene_metadata.tsv') + genemeta = Channel.fromPath('genes_metadata.tsv') genes = Channel.fromPath('genes.tsv') barcodes = Channel.fromPath('barcodes.tsv') matrix = Channel.fromPath('matrix.mtx') From 2b039d0a5cd2bdc305cf71032181d85cd645e4ea Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 10:15:53 +0000 Subject: [PATCH 012/159] populate scanpy_filter_cells process --- main.nf | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/main.nf b/main.nf index 9797f532..16cc8512 100644 --- a/main.nf +++ b/main.nf @@ -111,11 +111,21 @@ process scanpy_read_10x { process scanpy_filter_cells { input: + path anndata output: + path 'filtered_cell_anndata.h5ad' script: """ + scanpy-filter-cells --gene-name 'gene_symbols' \ + --param 'c:n_counts' 750.0 1000000000.0 \ + --param 'c:pct_counts_mito' 0.0 0.35 \ + --category 'c:predicted_doublet' 'False' \ 
+ --input-format 'anndata' input.h5 \ + --show-obj stdout \ + --output-format anndata 'filtered_cell_anndata.h5ad' \ + --export-mtx ./ """ } @@ -370,4 +380,7 @@ workflow { cellmeta, genemeta ) + scanpy_filter_cells( + scanpy_read_10x.out + ) } From 32731d0d29f4a678d8a941b92ae4de62276a94a9 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 10:21:37 +0000 Subject: [PATCH 013/159] replaces anndata file name with var --- main.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 16cc8512..8683e9da 100644 --- a/main.nf +++ b/main.nf @@ -112,6 +112,7 @@ process scanpy_read_10x { process scanpy_filter_cells { input: path anndata + path genes output: path 'filtered_cell_anndata.h5ad' @@ -122,7 +123,7 @@ process scanpy_filter_cells { --param 'c:n_counts' 750.0 1000000000.0 \ --param 'c:pct_counts_mito' 0.0 0.35 \ --category 'c:predicted_doublet' 'False' \ - --input-format 'anndata' input.h5 \ + --input-format 'anndata' $anndata \ --show-obj stdout \ --output-format anndata 'filtered_cell_anndata.h5ad' \ --export-mtx ./ @@ -380,7 +381,4 @@ workflow { cellmeta, genemeta ) - scanpy_filter_cells( - scanpy_read_10x.out - ) } From 9de5090e757f1fb71e1d0e2ab90a71835e50d5ce Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 10:29:07 +0000 Subject: [PATCH 014/159] Populate scanpy_filter_genes process --- main.nf | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/main.nf b/main.nf index 8683e9da..fbf54bda 100644 --- a/main.nf +++ b/main.nf @@ -132,11 +132,23 @@ process scanpy_filter_cells { process scanpy_filter_genes { input: + path anndata + path genes output: + path 'filtered_gene_anndata.h5ad' script: """ + scanpy-filter-genes \ + --param 'g:n_cells' 3.0 1000000000.0 \ + --subset 'g:index' \ + $genes \ + --input-format 'anndata' $anndata \ + --show-obj stdout \ + --output-format anndata \ + filtered_gene_anndata.h5ad' \ + --export-mtx ./ """ } @@ -381,4 +393,8 @@ workflow { cellmeta, 
genemeta ) + scanpy_filter_cells( + scanpy_read_10x.out, + Column_rearrange_1.out[0] + ) } From 772d8414144a0001678c23549f2fd945cfb1b7f1 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 10:31:13 +0000 Subject: [PATCH 015/159] Adds missing quote --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index fbf54bda..23bfbd29 100644 --- a/main.nf +++ b/main.nf @@ -147,7 +147,7 @@ process scanpy_filter_genes { --input-format 'anndata' $anndata \ --show-obj stdout \ --output-format anndata \ - filtered_gene_anndata.h5ad' \ + 'filtered_gene_anndata.h5ad' \ --export-mtx ./ """ } From f8ae27e07bd114b86c3bc78ed40689b7657b67bb Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 10:36:11 +0000 Subject: [PATCH 016/159] Populate normalise_data process --- main.nf | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 23bfbd29..5954def9 100644 --- a/main.nf +++ b/main.nf @@ -102,7 +102,7 @@ process scanpy_read_10x { scanpy-read-10x --input-10x-mtx ./ \ --var-names 'gene_ids' \ --extra-obs $cellmeta \ - --extra-var $genemeta \ + --extra-var $genemeta \ --show-obj stdout \ --output-format anndata \ 'anndata.h5ad' @@ -123,9 +123,9 @@ process scanpy_filter_cells { --param 'c:n_counts' 750.0 1000000000.0 \ --param 'c:pct_counts_mito' 0.0 0.35 \ --category 'c:predicted_doublet' 'False' \ - --input-format 'anndata' $anndata \ + --input-format 'anndata' $anndata \ --show-obj stdout \ - --output-format anndata 'filtered_cell_anndata.h5ad' \ + --output-format anndata 'filtered_cell_anndata.h5ad' \ --export-mtx ./ """ } @@ -147,18 +147,28 @@ process scanpy_filter_genes { --input-format 'anndata' $anndata \ --show-obj stdout \ --output-format anndata \ - 'filtered_gene_anndata.h5ad' \ + 'filtered_gene_anndata.h5ad' \ --export-mtx ./ """ } process normalise_data { input: + path anndata output: + path 'normalised_anndata.h5ad' script: """ + 
scanpy-normalise-data \ + --no-log-transform \ + --normalize-to '1000000.0' \ + --input-format 'anndata' $anndata \ + --show-obj stdout \ + --output-format anndata \ + 'normalised_anndata.h5ad' \ + --export-mtx ./ """ } @@ -392,9 +402,12 @@ workflow { barcodes, cellmeta, genemeta - ) + ) scanpy_filter_cells( scanpy_read_10x.out, Column_rearrange_1.out[0] - ) + ) + normalise_data( + scanpy_filter_cells.out + ) } From c02a8c4506094448731dcf028c7c2310782b434d Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 10:50:10 +0000 Subject: [PATCH 017/159] Populate normalise_data_internal process --- main.nf | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/main.nf b/main.nf index 5954def9..82549357 100644 --- a/main.nf +++ b/main.nf @@ -174,11 +174,19 @@ process normalise_data { process normalise_data_internal { input: + path anndata output: + path 'normalised_internal_anndata.h5ad' script: """ + scanpy-normalise-data \ + --normalize-to '1000000.0' \ + --input-format 'anndata' $anndata \ + --show-obj stdout \ + --output-format anndata \ + 'normalised_internal_anndata.h5ad' """ } @@ -410,4 +418,7 @@ workflow { normalise_data( scanpy_filter_cells.out ) + normalise_internal_data( + scanpy_filter_cells.out + ) } From 813c7ac46d2473658a133b285d8fd371478a201b Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 11:00:43 +0000 Subject: [PATCH 018/159] Populate find_variable_genes process --- main.nf | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/main.nf b/main.nf index 82549357..1b3a6376 100644 --- a/main.nf +++ b/main.nf @@ -192,11 +192,23 @@ process normalise_data_internal { process find_variable_genes { input: + path anndata output: + path 'variable_genes.h5ad' script: """ + scanpy-find-variable-genes \ + --flavor 'seurat' \ + --mean-limits 0.0125 1000000000.0 \ + --disp-limits 0.5 50.0 \ + --span 0.3 \ + --n-bins '20' \ + --input-format 'anndata' \ + $anndata \ + --show-obj stdout \ + --output-format anndata 
'variable_genes.h5ad' """ } @@ -421,4 +433,7 @@ workflow { normalise_internal_data( scanpy_filter_cells.out ) + find_variable_genes( + normalise_internal_data.out + ) } From 418610e3d85ad967371febbca813da5fd333444f Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 11:51:23 +0000 Subject: [PATCH 019/159] Populate run_PCA process --- main.nf | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/main.nf b/main.nf index 1b3a6376..227ba679 100644 --- a/main.nf +++ b/main.nf @@ -214,11 +214,22 @@ process find_variable_genes { process run_pca { input: + path anndata output: + path 'PCA.h5ad' script: """ + scanpy-run-pca \ + --no-zero-center \ + --svd-solver 'arpack' \ + --random-state '1234' \ + --input-format 'anndata' \ + $anndata \ + --show-obj stdout \ + --output-format anndata \ + 'PCA.h5ad' """ } @@ -436,4 +447,7 @@ workflow { find_variable_genes( normalise_internal_data.out ) + run_pca( + find_variable_genes.out + ) } From e3e74b0132c6635679c4d1fc856351a5f9999bbd Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 12:00:32 +0000 Subject: [PATCH 020/159] Populate harmony_batch process --- main.nf | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 227ba679..cae4698d 100644 --- a/main.nf +++ b/main.nf @@ -235,11 +235,16 @@ process run_pca { process harmony_batch { input: - + path anndata output: + path 'harmony.h5ad' script: """ + echo "No batch variables passed, simply passing original input as output unchanged." 
+ + cp $anndata 'harmony.h5ad' + """ } @@ -450,4 +455,7 @@ workflow { run_pca( find_variable_genes.out ) + harmony_batch( + run_pca.out + ) } From 9a23c62ef7832aa87439c428ff68fcc57d874e7b Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 12:15:03 +0000 Subject: [PATCH 021/159] Populate neighbours process --- main.nf | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index cae4698d..26d12335 100644 --- a/main.nf +++ b/main.nf @@ -250,11 +250,26 @@ process harmony_batch { process neighbours { input: - + path anndata + val pca_param output: + path 'neighbours.h5ad' script: """ + scanpy-neighbors \ + --n-neighbors 15 \ + --method 'umap' \ + --metric 'euclidean' \ + --random-state '0' \ + --use-rep $pca_param \ + --n-pcs '50' \ + --input-format 'anndata' \ + $anndata \ + --show-obj stdout \ + --output-format anndata \ + 'neighbours.h5ad' + """ } @@ -458,4 +473,8 @@ workflow { harmony_batch( run_pca.out ) + neighbours( + harmony_batch.out, + pca_param + ) } From e5e82e312c04e23c8cc968d7377a4db5cd314f6c Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 14:10:09 +0000 Subject: [PATCH 022/159] Applies batch_variable --- main.nf | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 26d12335..317fe2bf 100644 --- a/main.nf +++ b/main.nf @@ -193,18 +193,27 @@ process normalise_data_internal { process find_variable_genes { input: path anndata + val batch_variable output: path 'variable_genes.h5ad' script: """ + if [[ -z "\$batch_variable" ]]; then + batch_variable = "--batch-key $batch_variable" + else + batch_variable = "" + fi + + scanpy-find-variable-genes \ --flavor 'seurat' \ --mean-limits 0.0125 1000000000.0 \ --disp-limits 0.5 50.0 \ --span 0.3 \ --n-bins '20' \ + $batch_variable \ --input-format 'anndata' \ $anndata \ --show-obj stdout \ @@ -236,14 +245,27 @@ process run_pca { process harmony_batch { input: path 
anndata + val batch_variable output: path 'harmony.h5ad' script: """ - echo "No batch variables passed, simply passing original input as output unchanged." + if [[ -z "\$batch_variable" ]]; then + scanpy-integrate harmony \ + --batch-key $batch_variable \ + --basis 'X_pca' \ + --adjusted-basis 'X_pca_harmony' \ + --input-format 'anndata' \ + $anndata \ + --show-obj stdout \ + --output-format anndata \ + 'harmony.h5ad' + else + echo "No batch variables passed, simply passing original input as output unchanged." - cp $anndata 'harmony.h5ad' + cp $anndata 'harmony.h5ad' + fi """ } @@ -465,13 +487,15 @@ workflow { scanpy_filter_cells.out ) find_variable_genes( - normalise_internal_data.out + normalise_internal_data.out, + batch_variable ) run_pca( find_variable_genes.out ) harmony_batch( - run_pca.out + run_pca.out, + batch_variable ) neighbours( harmony_batch.out, From 458a7b4c4c7bd0157571919a97eb4fb0a78c18f9 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 15:41:52 +0000 Subject: [PATCH 023/159] Populate neighbours_for_umap process --- main.nf | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 317fe2bf..da73e715 100644 --- a/main.nf +++ b/main.nf @@ -297,11 +297,28 @@ process neighbours { process neighbours_for_umap { input: - + path anndata + val n_neighbours output: - + path 'neighbours_*.h5ad' script: """ + for i in $sample + do + scanpy-neighbors \ + --n-neighbors \$i \ + --method 'umap' \ + --metric 'euclidean' \ + --random-state '0' \ + --use-rep $pca_param \ + --n-pcs '50' \ + --input-format 'anndata' \ + $anndata \ + --show-obj stdout \ + --output-format anndata \ + 'neighbours\$i.h5ad' + done + """ } @@ -460,6 +477,7 @@ workflow { batch_variable = Channel.value('') perplexity_values = Channel.value(['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']) resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']) + 
neighbor_values = Channel.value("10 100 15 20 25 3 30 5 50") // Create index file for input BAM file @@ -501,4 +519,8 @@ workflow { harmony_batch.out, pca_param ) + neighbours_for_umap( + harmony_batch.out, + pca_param + ) } From 72c747309ea74858471d5f60ff45bca42be86fcf Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 15:54:15 +0000 Subject: [PATCH 024/159] removes random comment --- main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/main.nf b/main.nf index da73e715..e8e1ea58 100644 --- a/main.nf +++ b/main.nf @@ -480,7 +480,6 @@ workflow { neighbor_values = Channel.value("10 100 15 20 25 3 30 5 50") - // Create index file for input BAM file Column_rearrange_1(genemeta, "gene_id") Column_rearrange_2(genemeta, "gene_id", "gene_name") mergeGeneFiles( From 963203526aaec97ac2e36a37c83f07d0afd3ee29 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 15:55:06 +0000 Subject: [PATCH 025/159] Fixes lint --- main.nf | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index e8e1ea58..1b6147c5 100644 --- a/main.nf +++ b/main.nf @@ -480,8 +480,15 @@ workflow { neighbor_values = Channel.value("10 100 15 20 25 3 30 5 50") - Column_rearrange_1(genemeta, "gene_id") - Column_rearrange_2(genemeta, "gene_id", "gene_name") + Column_rearrange_1( + genemeta, + "gene_id" + ) + Column_rearrange_2( + genemeta, + "gene_id", + "gene_name" + ) mergeGeneFiles( genes, Column_rearrange_2.out From 527236102c575cea1d0ab1caeee09d62c1d4d5e0 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 16:01:35 +0000 Subject: [PATCH 026/159] Fixes input for neighbours_for_umap --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 1b6147c5..625c4011 100644 --- a/main.nf +++ b/main.nf @@ -527,6 +527,7 @@ workflow { ) neighbours_for_umap( harmony_batch.out, - pca_param + neighbor_values ) + } From fae78f8549ac1dfc64a740f8978f55b37c3f568d Mon Sep 17 00:00:00 2001 From: 
Anil Thanki Date: Mon, 28 Oct 2024 16:20:39 +0000 Subject: [PATCH 027/159] Populate run_tsne process and fixes typo in neighbours_for_umap --- main.nf | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 625c4011..0bfce888 100644 --- a/main.nf +++ b/main.nf @@ -303,7 +303,7 @@ process neighbours_for_umap { path 'neighbours_*.h5ad' script: """ - for i in $sample + for i in $n_neighbours do scanpy-neighbors \ --n-neighbors \$i \ @@ -414,12 +414,32 @@ process run_umap { process run_tsne { input: - + path anndata + val pca_param + val perplexity_values output: - + path 'neighbours_*.h5ad' script: """ - """ + for i in $perplexity_values + do + scanpy-run-tsne \ + --use-rep $pca_param \ + --export-embedding embeddings.tsv \ + --perplexity \$i \ + --key-added 'perplexity_\$i' \ + --early-exaggeration '12.0' \ + --learning-rate '400.0' \ + --no-fast-tsne \ + --random-state 1234 \ + --input-format 'anndata' \ + $anndata \ + --show-obj stdout \ + --output-format anndata \ + 'tsne\$i.h5ad' + # Not sure if following is needed + # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv + done } process filter_failed_umap { @@ -475,7 +495,7 @@ workflow { pca_param = Channel.value('X_pca') celltype_field_param = Channel.value('NO_CELLTYPE_FIELD') batch_variable = Channel.value('') - perplexity_values = Channel.value(['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']) + perplexity_values = Channel.value("1 5 10 15 20 25 30 35 40 45 50") resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']) neighbor_values = Channel.value("10 100 15 20 25 3 30 5 50") @@ -529,5 +549,9 @@ workflow { harmony_batch.out, neighbor_values ) - + run_tsne( + harmony_batch.out, + pca_param, + perplexity_values + ) } From e960f6e46510ef57e4b5882684dcf86d78518727 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 28 Oct 2024 16:40:52 +0000 Subject: [PATCH 028/159] Populate 
run_UMAP process --- main.nf | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 0bfce888..7cfbfac9 100644 --- a/main.nf +++ b/main.nf @@ -404,11 +404,33 @@ process filtered_cellgroup_markers { process run_umap { input: - + path anndata output: - + path 'umap_*.h5ad' script: """ + for i in $anndata + do + scanpy-run-umap \ + --neighbors-key 'neighbors_\$i' \ + --key-added 'neighbors_\$i' \ + --export-embedding embeddings.tsv \ + --n-components 2 \ + --min-dist 0.5 \ + --spread 1.0 \ + --alpha 1.0 \ + --gamma 1.0 \ + --negative-sample-rate 5 \ + --random-state 0 \ + --init-pos 'spectral' \ + --input-format 'anndata' \ + \$i \ + --show-obj stdout \ + --output-format anndata \ + 'umap_\$i.h5ad' + # Not sure if following is needed + # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv + done """ } @@ -418,7 +440,7 @@ process run_tsne { val pca_param val perplexity_values output: - path 'neighbours_*.h5ad' + path 'tsne_*.h5ad' script: """ for i in $perplexity_values @@ -436,7 +458,7 @@ process run_tsne { $anndata \ --show-obj stdout \ --output-format anndata \ - 'tsne\$i.h5ad' + 'tsne_\$i.h5ad' # Not sure if following is needed # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv done From 834271558e7281de403f86b04a60089426d1bf8a Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 30 Oct 2024 11:34:34 +0000 Subject: [PATCH 029/159] renames normalise_internal_data and adds container tag --- main.nf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7cfbfac9..9389342c 100644 --- a/main.nf +++ b/main.nf @@ -93,6 +93,8 @@ process scanpy_read_10x { output: path 'anndata.h5ad' + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + script: """ #ln -s $matrix matrix.mtx @@ -172,7 +174,7 @@ process normalise_data { """ } -process normalise_data_internal { +process normalise_internal_data { input: path anndata @@ -462,6 +464,7 @@ 
process run_tsne { # Not sure if following is needed # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv done + """ } process filter_failed_umap { From 3cc4066145a275bfe08f0a987710bf61b5b05072 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 31 Oct 2024 11:10:04 +0000 Subject: [PATCH 030/159] Update main.nf - fixing join command for needed output --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 9389342c..e43d7ea1 100644 --- a/main.nf +++ b/main.nf @@ -78,7 +78,7 @@ process mergeGeneFiles { sort -k1,1 "$filtered_genemeta" > sorted_genemeta.txt # Perform a left join to keep all data from gene file - join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > merged_genemeta.tsv + join -a 1 -t \$'\t' -o 0,1.2,2.2 sorted_gene.txt sorted_genemeta.txt | cut -f1,3 > merged_genemeta.tsv """ } From 0d014f1b642306ccf78549ac1aabd3250926dfc4 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 31 Oct 2024 11:11:31 +0000 Subject: [PATCH 031/159] Update main.nf - sylink genes.tsv locally --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index e43d7ea1..0232ecd0 100644 --- a/main.nf +++ b/main.nf @@ -83,6 +83,8 @@ process mergeGeneFiles { } process scanpy_read_10x { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path matrix path genes @@ -93,12 +95,10 @@ process scanpy_read_10x { output: path 'anndata.h5ad' - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' - script: """ #ln -s $matrix matrix.mtx - #ln -s $genes genes.tsv + ln -s $genes genes.tsv #ln -s $barcodes barcodes.tsv scanpy-read-10x --input-10x-mtx ./ \ From 279b43c67dd84f4dd4e6ce405492f211494fea8a Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 31 Oct 2024 11:15:36 +0000 Subject: [PATCH 032/159] Update main.nf - adding container tag and commenting param in filter_cell --- main.nf | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 
deletions(-) diff --git a/main.nf b/main.nf index 0232ecd0..953c6b4f 100644 --- a/main.nf +++ b/main.nf @@ -83,7 +83,7 @@ process mergeGeneFiles { } process scanpy_read_10x { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: path matrix @@ -112,6 +112,8 @@ process scanpy_read_10x { } process scanpy_filter_cells { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata path genes @@ -124,7 +126,7 @@ process scanpy_filter_cells { scanpy-filter-cells --gene-name 'gene_symbols' \ --param 'c:n_counts' 750.0 1000000000.0 \ --param 'c:pct_counts_mito' 0.0 0.35 \ - --category 'c:predicted_doublet' 'False' \ + # --category 'c:predicted_doublet' 'False' \ # commenting temporary as error attribute not found --input-format 'anndata' $anndata \ --show-obj stdout \ --output-format anndata 'filtered_cell_anndata.h5ad' \ @@ -133,6 +135,8 @@ process scanpy_filter_cells { } process scanpy_filter_genes { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata path genes @@ -155,6 +159,8 @@ process scanpy_filter_genes { } process normalise_data { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata From f874a37f56b00b167fbc9ecbc32be2a7553ced85 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 31 Oct 2024 11:16:39 +0000 Subject: [PATCH 033/159] Update nextflow.config - adding singularity param --- nextflow.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nextflow.config b/nextflow.config index 1de8fc38..e6c5c1be 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,3 +1,5 @@ +singularity.enabled = true + process { executor='slurm' queue="$SCXA_HPC_QUEUE" From 33f70da311ea3e218521ac164540a33c8ceb846d Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 31 Oct 2024 11:22:28 +0000 Subject: [PATCH 034/159] Update main.nf - removing comment --- main.nf | 
1 - 1 file changed, 1 deletion(-) diff --git a/main.nf b/main.nf index 953c6b4f..56230709 100644 --- a/main.nf +++ b/main.nf @@ -126,7 +126,6 @@ process scanpy_filter_cells { scanpy-filter-cells --gene-name 'gene_symbols' \ --param 'c:n_counts' 750.0 1000000000.0 \ --param 'c:pct_counts_mito' 0.0 0.35 \ - # --category 'c:predicted_doublet' 'False' \ # commenting temporary as error attribute not found --input-format 'anndata' $anndata \ --show-obj stdout \ --output-format anndata 'filtered_cell_anndata.h5ad' \ From 1c924fba4b5006d3c4edfb415a9b64ea7a494397 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 31 Oct 2024 11:56:47 +0000 Subject: [PATCH 035/159] Update main.nf - adds container info fixes `batch_variable` --- main.nf | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 56230709..561a7e19 100644 --- a/main.nf +++ b/main.nf @@ -180,6 +180,8 @@ process normalise_data { } process normalise_internal_data { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata @@ -198,6 +200,8 @@ process normalise_internal_data { } process find_variable_genes { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata val batch_variable @@ -207,8 +211,9 @@ process find_variable_genes { script: """ - if [[ -z "\$batch_variable" ]]; then - batch_variable = "--batch-key $batch_variable" + batch_variable_tag="" + if [[ -z "$batch_variable" ]]; then + batch_variable_tag="--batch-key $batch_variable" else batch_variable = "" fi @@ -220,7 +225,7 @@ process find_variable_genes { --disp-limits 0.5 50.0 \ --span 0.3 \ --n-bins '20' \ - $batch_variable \ + \$batch_variable_tag \ --input-format 'anndata' \ $anndata \ --show-obj stdout \ @@ -229,6 +234,8 @@ process find_variable_genes { } process run_pca { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata @@ -250,6 +257,8 @@ process run_pca { } process 
harmony_batch { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata val batch_variable @@ -258,7 +267,7 @@ process harmony_batch { script: """ - if [[ -z "\$batch_variable" ]]; then + if [[ -z "$batch_variable" ]]; then scanpy-integrate harmony \ --batch-key $batch_variable \ --basis 'X_pca' \ @@ -278,6 +287,8 @@ process harmony_batch { } process neighbours { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata val pca_param @@ -303,6 +314,8 @@ process neighbours { } process neighbours_for_umap { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata val n_neighbours @@ -410,6 +423,8 @@ process filtered_cellgroup_markers { } process run_umap { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata output: @@ -442,6 +457,8 @@ process run_umap { } process run_tsne { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path anndata val pca_param From 48f5f8d8cfe522090391ab85431f39a1c57f3b96 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 31 Oct 2024 14:04:41 +0000 Subject: [PATCH 036/159] Update main.nf - fixes `batch_variable` condition, adds pca_param in neighbors_for_umap --- main.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 561a7e19..3f1e1788 100644 --- a/main.nf +++ b/main.nf @@ -212,10 +212,8 @@ process find_variable_genes { script: """ batch_variable_tag="" - if [[ -z "$batch_variable" ]]; then + if [[ -n "$batch_variable" ]]; then batch_variable_tag="--batch-key $batch_variable" - else - batch_variable = "" fi @@ -267,7 +265,7 @@ process harmony_batch { script: """ - if [[ -z "$batch_variable" ]]; then + if [[ -n "$batch_variable" ]]; then scanpy-integrate harmony \ --batch-key $batch_variable \ --basis 'X_pca' \ @@ -319,6 +317,7 @@ process neighbours_for_umap { input: path anndata val n_neighbours + val 
pca_param output: path 'neighbours_*.h5ad' script: @@ -336,7 +335,7 @@ process neighbours_for_umap { $anndata \ --show-obj stdout \ --output-format anndata \ - 'neighbours\$i.h5ad' + 'neighbours_\$i.h5ad' done """ @@ -594,7 +593,8 @@ workflow { ) neighbours_for_umap( harmony_batch.out, - neighbor_values + neighbor_values, + pca_param ) run_tsne( harmony_batch.out, From 3f3fecb677a382137de9d1d7f06e714039d075e2 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:08:52 +0000 Subject: [PATCH 037/159] edit process run_umap --- main.nf | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 3f1e1788..0dfd42e7 100644 --- a/main.nf +++ b/main.nf @@ -425,16 +425,14 @@ process run_umap { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: - path anndata + each anndata output: path 'umap_*.h5ad' script: """ - for i in $anndata - do scanpy-run-umap \ - --neighbors-key 'neighbors_\$i' \ - --key-added 'neighbors_\$i' \ + --neighbors-key 'neighbors_\$anndata' \ + --key-added 'neighbors_\$anndata' \ --export-embedding embeddings.tsv \ --n-components 2 \ --min-dist 0.5 \ @@ -445,13 +443,13 @@ process run_umap { --random-state 0 \ --init-pos 'spectral' \ --input-format 'anndata' \ - \$i \ + \$anndata \ --show-obj stdout \ --output-format anndata \ - 'umap_\$i.h5ad' + 'umap_\$anndata.h5ad' # Not sure if following is needed # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv - done + """ } From a94177aa33543da04e8d9d415ffe63ed60717c2d Mon Sep 17 00:00:00 2001 From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:15:47 +0000 Subject: [PATCH 038/159] edit process run_tsne --- main.nf | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index 0dfd42e7..ed3a52d5 100644 --- a/main.nf +++ b/main.nf @@ -459,18 +459,16 @@ process run_tsne { input: path anndata val 
pca_param - val perplexity_values + each perplexity_values output: path 'tsne_*.h5ad' script: """ - for i in $perplexity_values - do scanpy-run-tsne \ --use-rep $pca_param \ --export-embedding embeddings.tsv \ - --perplexity \$i \ - --key-added 'perplexity_\$i' \ + --perplexity \$perplexity_values \ + --key-added 'perplexity_\$perplexity_values' \ --early-exaggeration '12.0' \ --learning-rate '400.0' \ --no-fast-tsne \ @@ -479,10 +477,9 @@ process run_tsne { $anndata \ --show-obj stdout \ --output-format anndata \ - 'tsne_\$i.h5ad' + 'tsne_\$perplexity_values.h5ad' # Not sure if following is needed # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv - done """ } From 12efb956f234248730866c7ed8c2b33ef59e5bec Mon Sep 17 00:00:00 2001 From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:43:51 +0000 Subject: [PATCH 039/159] edit process neighbours_for_umap and add Dynamic Memory Allocation --- main.nf | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index ed3a52d5..f1a19232 100644 --- a/main.nf +++ b/main.nf @@ -314,18 +314,20 @@ process neighbours { process neighbours_for_umap { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + errorStrategy { task.exitStatus in 137..140 ? 
'retry' : 'terminate' } + memory { 4.GB * task.attempt } + maxRetries 3 + input: path anndata - val n_neighbours + each n_neighbours val pca_param output: path 'neighbours_*.h5ad' script: """ - for i in $n_neighbours - do scanpy-neighbors \ - --n-neighbors \$i \ + --n-neighbors \$n_neighbours \ --method 'umap' \ --metric 'euclidean' \ --random-state '0' \ @@ -335,8 +337,7 @@ process neighbours_for_umap { $anndata \ --show-obj stdout \ --output-format anndata \ - 'neighbours_\$i.h5ad' - done + 'neighbours_\${n_neighbours}.h5ad' """ } From 86de8ade73fca5ff1d9de44b1033ddf31cd63179 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 10:43:02 +0000 Subject: [PATCH 040/159] Update main.nf - fixes parallel run for neighbours_for_umap --- main.nf | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index f1a19232..77ffa6c1 100644 --- a/main.nf +++ b/main.nf @@ -2,6 +2,8 @@ nextflow.enable.dsl=2 +params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50'] + /* * Column_rearrange_1: Only keeps the specified columns and removes header */ @@ -319,15 +321,14 @@ process neighbours_for_umap { maxRetries 3 input: - path anndata - each n_neighbours + tuple path(anndata), val(n_neighbours) val pca_param output: - path 'neighbours_*.h5ad' + path "neighbours_${n_neighbours}.h5ad" script: """ scanpy-neighbors \ - --n-neighbors \$n_neighbours \ + --n-neighbors $n_neighbours \ --method 'umap' \ --metric 'euclidean' \ --random-state '0' \ @@ -337,7 +338,7 @@ process neighbours_for_umap { $anndata \ --show-obj stdout \ --output-format anndata \ - 'neighbours_\${n_neighbours}.h5ad' + 'neighbours_${n_neighbours}.h5ad' """ } @@ -539,7 +540,7 @@ workflow { batch_variable = Channel.value('') perplexity_values = Channel.value("1 5 10 15 20 25 30 35 40 45 50") resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']) - neighbor_values = Channel.value("10 100 15 20 25 3 30 5 
50") + neighbors_ch = channel.fromList(params.neighbor_values) Column_rearrange_1( @@ -588,8 +589,7 @@ workflow { pca_param ) neighbours_for_umap( - harmony_batch.out, - neighbor_values, + harmony_batch.out.combine(neighbors_ch), pca_param ) run_tsne( From 8bf90e9814d81f51682162b61ae7423e67ac3a52 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 15:10:02 +0000 Subject: [PATCH 041/159] Update main.nf - fixes process run_tsne --- main.nf | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/main.nf b/main.nf index 77ffa6c1..c88c6e2c 100644 --- a/main.nf +++ b/main.nf @@ -3,7 +3,7 @@ nextflow.enable.dsl=2 params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50'] - +params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50'] /* * Column_rearrange_1: Only keeps the specified columns and removes header */ @@ -459,18 +459,17 @@ process run_tsne { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: - path anndata + tuple path(anndata), val(perplexity_values) val pca_param - each perplexity_values output: - path 'tsne_*.h5ad' + path 'tsne_${perplexity_values}.h5ad' script: """ scanpy-run-tsne \ --use-rep $pca_param \ --export-embedding embeddings.tsv \ - --perplexity \$perplexity_values \ - --key-added 'perplexity_\$perplexity_values' \ + --perplexity $perplexity_values \ + --key-added 'perplexity_$perplexity_values' \ --early-exaggeration '12.0' \ --learning-rate '400.0' \ --no-fast-tsne \ @@ -479,7 +478,7 @@ process run_tsne { $anndata \ --show-obj stdout \ --output-format anndata \ - 'tsne_\$perplexity_values.h5ad' + 'tsne_${perplexity_values}.h5ad' # Not sure if following is needed # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv """ @@ -541,7 +540,7 @@ workflow { perplexity_values = Channel.value("1 5 10 15 20 25 30 35 40 45 50") resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']) neighbors_ch 
= channel.fromList(params.neighbor_values) - + perplexity_ch = channel.fromList(params.perplexity_values) Column_rearrange_1( genemeta, @@ -593,8 +592,7 @@ workflow { pca_param ) run_tsne( - harmony_batch.out, - pca_param, - perplexity_values + harmony_batch.out.combine(perplexity_ch), + pca_param ) } From b4d4ca2161230f703fc44c62ed13e20045528356 Mon Sep 17 00:00:00 2001 From: fg_atlas Date: Mon, 4 Nov 2024 15:14:17 +0000 Subject: [PATCH 042/159] fixes output run-tsne --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index c88c6e2c..968e694a 100644 --- a/main.nf +++ b/main.nf @@ -462,7 +462,7 @@ process run_tsne { tuple path(anndata), val(perplexity_values) val pca_param output: - path 'tsne_${perplexity_values}.h5ad' + path "tsne_${perplexity_values}.h5ad" script: """ scanpy-run-tsne \ From fbf688e5d34f82f8dbec95792d60c58b95db822f Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 15:29:53 +0000 Subject: [PATCH 043/159] Update main.nf - populate find_clusters process --- main.nf | 50 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/main.nf b/main.nf index 968e694a..2b999dda 100644 --- a/main.nf +++ b/main.nf @@ -4,6 +4,8 @@ nextflow.enable.dsl=2 params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50'] params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50'] +params.resolution_values = ['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0'] + /* * Column_rearrange_1: Only keeps the specified columns and removes header */ @@ -327,29 +329,43 @@ process neighbours_for_umap { path "neighbours_${n_neighbours}.h5ad" script: """ - scanpy-neighbors \ - --n-neighbors $n_neighbours \ - --method 'umap' \ - --metric 'euclidean' \ - --random-state '0' \ - --use-rep $pca_param \ - --n-pcs '50' \ - --input-format 'anndata' \ - $anndata \ - --show-obj stdout \ - --output-format anndata \ - 
'neighbours_${n_neighbours}.h5ad' + scanpy-neighbors \ + --n-neighbors $n_neighbours \ + --method 'umap' \ + --metric 'euclidean' \ + --random-state '0' \ + --use-rep $pca_param \ + --n-pcs '50' \ + --input-format 'anndata' \ + $anndata \ + --show-obj stdout \ + --output-format anndata \ + 'neighbours_${n_neighbours}.h5ad' """ } process find_clusters { - input: + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: + tuple path(anndata), val(resolution) output: - + path "clusters_${resolution}.h5ad" script: """ + scanpy-find-cluster louvain \ + --neighbors-key 'neighbors' \ + --key-added 'louvain_resolution_${resolution}' \ + --resolution ${resolution} \ + --random-state '1234' \ + --directed \ + --export-cluster output.tsv \ + --input-format 'anndata' \ + input.h5 \ + --show-obj stdout \ + --output-format anndata \ + 'clusters_${resolution}.h5ad' """ } @@ -537,10 +553,9 @@ workflow { pca_param = Channel.value('X_pca') celltype_field_param = Channel.value('NO_CELLTYPE_FIELD') batch_variable = Channel.value('') - perplexity_values = Channel.value("1 5 10 15 20 25 30 35 40 45 50") - resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']) neighbors_ch = channel.fromList(params.neighbor_values) perplexity_ch = channel.fromList(params.perplexity_values) + resolution_ch = channel.fromList(params.resolution_values) Column_rearrange_1( genemeta, @@ -595,4 +610,7 @@ workflow { harmony_batch.out.combine(perplexity_ch), pca_param ) + find_clusters( + neighbours.out.combine(resolution_ch) + ) } From ad54b02e22809937289af596c4ed10a852e80218 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 15:32:49 +0000 Subject: [PATCH 044/159] Update main.nf - fixes input for find_clusters --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 2b999dda..1a9ef1fe 100644 --- a/main.nf +++ b/main.nf @@ -362,7 +362,7 @@ process find_clusters { --directed \ 
--export-cluster output.tsv \ --input-format 'anndata' \ - input.h5 \ + $anndata \ --show-obj stdout \ --output-format anndata \ 'clusters_${resolution}.h5ad' From 310c5b228dd1f94bf24a8edbb22584a5782215a9 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Mon, 4 Nov 2024 16:15:10 +0000 Subject: [PATCH 045/159] add nf-core linting github action --- .github/workflows/nextflow-linter.yaml | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/nextflow-linter.yaml diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml new file mode 100644 index 00000000..5f7c0291 --- /dev/null +++ b/.github/workflows/nextflow-linter.yaml @@ -0,0 +1,29 @@ +name: nf-core linting + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install nf-core tools + run: | + python -m pip install --upgrade pip + pip install nf-core + + - name: Run nf-core lint + run: nf-core lint . 
From 8e459d1be669b0f63b65b47de8e5ed1c9648e922 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Mon, 4 Nov 2024 16:21:11 +0000 Subject: [PATCH 046/159] fix lint command --- .github/workflows/nextflow-linter.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml index 5f7c0291..73ac5b2f 100644 --- a/.github/workflows/nextflow-linter.yaml +++ b/.github/workflows/nextflow-linter.yaml @@ -12,18 +12,18 @@ jobs: lint: runs-on: ubuntu-latest steps: - - name: Checkout repository + - name: checkout repository uses: actions/checkout@v2 - - name: Set up Python + - name: set up Python uses: actions/setup-python@v2 with: python-version: '3.x' - - name: Install nf-core tools + - name: install nf-core tools run: | python -m pip install --upgrade pip pip install nf-core - - name: Run nf-core lint - run: nf-core lint . + - name: run nf-core lint + run: nf-core pipelines lint From cc5ea04c6f2871351470a4c9bbca06d25026d05b Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Mon, 4 Nov 2024 16:27:40 +0000 Subject: [PATCH 047/159] add java and nextflow to github action --- .github/workflows/nextflow-linter.yaml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml index 73ac5b2f..f55b4e79 100644 --- a/.github/workflows/nextflow-linter.yaml +++ b/.github/workflows/nextflow-linter.yaml @@ -15,15 +15,31 @@ jobs: - name: checkout repository uses: actions/checkout@v2 + - uses: actions/setup-java@v2 + with: + distribution: 'adopt' + java-version: '11' + + - name: install Nextflow + run: | + wget -qO- https://get.nextflow.io | bash + chmod +x nextflow + mkdir -p $HOME/.local/bin + mv nextflow $HOME/.local/bin/ + echo "$HOME/.local/bin" >> $GITHUB_PATH + - name: set up Python uses: actions/setup-python@v2 with: python-version: '3.x' - + - name: install nf-core tools run: | 
python -m pip install --upgrade pip pip install nf-core + - name: check Nextflow version + run: nextflow -version + - name: run nf-core lint run: nf-core pipelines lint From 16519762eaa5ce62f5d965ab5a0846ae2ddf05df Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 16:32:32 +0000 Subject: [PATCH 048/159] Update main.nf - adds more params and logs --- main.nf | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 1a9ef1fe..73138dbb 100644 --- a/main.nf +++ b/main.nf @@ -2,10 +2,29 @@ nextflow.enable.dsl=2 +params.celltype_field = 'NO_CELLTYPE_FIELD' params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50'] params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50'] params.resolution_values = ['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0'] - +params.slotname = "louvain_resolution" +params.clustering_slotname = params.resolution_values.collect { params.slotname + "_" + it } +params.merged_group_slotname = params.clustering_slotname + params.celltype_field + + +log.info """ +=============================== +WORKFLOW PARAMETER VALUES +=============================== +celltype_field: ${params.celltype_field} +neighbor_values: ${params.neighbor_values} +perplexity_values: ${params.perplexity_values} +resolution_values: ${params.resolution_values} +slotname: ${params.slotname} +clustering_slotname: ${params.clustering_slotname} +merged_group_slotname: ${params.merged_group_slotname} +=============================== +""" + /* * Column_rearrange_1: Only keeps the specified columns and removes header */ @@ -551,7 +570,6 @@ workflow { matrix = Channel.fromPath('matrix.mtx') cellmeta = Channel.fromPath('cell_metadata.tsv') pca_param = Channel.value('X_pca') - celltype_field_param = Channel.value('NO_CELLTYPE_FIELD') batch_variable = Channel.value('') neighbors_ch = channel.fromList(params.neighbor_values) perplexity_ch = 
channel.fromList(params.perplexity_values) From 192ba0c0bc3fe9efee0c86bc9bddaea6eebfde2a Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 16:33:16 +0000 Subject: [PATCH 049/159] Update nextflow.config Co-authored-by: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> --- nextflow.config | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index e6c5c1be..6dce3ed4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,4 +1,3 @@ -singularity.enabled = true process { executor='slurm' @@ -11,6 +10,11 @@ process { pollInterval = '5sec' } +singularity { + enabled = true + cacheDir = "$SCXA_SINGULARITY_CACHE" +} + conda { cacheDir = "$SCXA_WORKFLOW_ROOT/envs" createTimeout = "30 min" From 2a26c09fe70ca05eb52ad1a4fb66ec4b7ea8b13e Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Mon, 4 Nov 2024 16:39:22 +0000 Subject: [PATCH 050/159] Mov away from nf-core pipeline check, we only want to check the syntax in main.nf --- .github/workflows/nextflow-linter.yaml | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml index f55b4e79..e9f5b597 100644 --- a/.github/workflows/nextflow-linter.yaml +++ b/.github/workflows/nextflow-linter.yaml @@ -28,18 +28,8 @@ jobs: mv nextflow $HOME/.local/bin/ echo "$HOME/.local/bin" >> $GITHUB_PATH - - name: set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.x' - - - name: install nf-core tools - run: | - python -m pip install --upgrade pip - pip install nf-core - - name: check Nextflow version run: nextflow -version - - name: run nf-core lint - run: nf-core pipelines lint + - name: check syntax in main.nf + run: nextflow -quiet validate main.nf From 5591c4488eaddd5e09cf48411ae0944884b22884 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 16:44:16 +0000 Subject: [PATCH 051/159] Update main.nf - ignores failed run and keeps only 
successful run --- main.nf | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 73138dbb..ca488947 100644 --- a/main.nf +++ b/main.nf @@ -459,6 +459,8 @@ process filtered_cellgroup_markers { } process run_umap { + errorStrategy 'ignore' + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: @@ -491,6 +493,8 @@ process run_umap { } process run_tsne { + errorStrategy 'ignore' + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: @@ -624,10 +628,19 @@ workflow { harmony_batch.out.combine(neighbors_ch), pca_param ) - run_tsne( + TNSEs_ch = run_tsne( harmony_batch.out.combine(perplexity_ch), pca_param ) + TNSEs_ch + .filter { it.exitStatus == 0 } + + UMAPs_ch = run_tsne( + neighbours_for_umap.out, + pca_param + ) + UMAPs_ch + .filter { it.exitStatus == 0 } find_clusters( neighbours.out.combine(resolution_ch) ) From 1900933bc5e23ff88eba31cd3ff33c771f5c354a Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Mon, 4 Nov 2024 16:47:24 +0000 Subject: [PATCH 052/159] fix nextflow syntax check --- .github/workflows/nextflow-linter.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml index e9f5b597..91fdc140 100644 --- a/.github/workflows/nextflow-linter.yaml +++ b/.github/workflows/nextflow-linter.yaml @@ -32,4 +32,4 @@ jobs: run: nextflow -version - name: check syntax in main.nf - run: nextflow -quiet validate main.nf + run: nextflow main.nf -preview || (echo "Syntax check failed. Please review the error message above." 
&& exit 1) From f184945589fa455158428890d72cf62887d82a66 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 16:47:51 +0000 Subject: [PATCH 053/159] Update main.nf --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index ca488947..dd86df5b 100644 --- a/main.nf +++ b/main.nf @@ -635,7 +635,7 @@ workflow { TNSEs_ch .filter { it.exitStatus == 0 } - UMAPs_ch = run_tsne( + UMAPs_ch = run_umap( neighbours_for_umap.out, pca_param ) From e0966f9436d45b273694e53fb0d0b4d00b29dbb9 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 16:49:41 +0000 Subject: [PATCH 054/159] Update main.nf --- main.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/main.nf b/main.nf index dd86df5b..5a01ac07 100644 --- a/main.nf +++ b/main.nf @@ -636,8 +636,7 @@ workflow { .filter { it.exitStatus == 0 } UMAPs_ch = run_umap( - neighbours_for_umap.out, - pca_param + neighbours_for_umap.out ) UMAPs_ch .filter { it.exitStatus == 0 } From 27359f69b2192e08de713bdd19b1f65b04f995c0 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 16:52:05 +0000 Subject: [PATCH 055/159] Update main.nf - comments ignore error --- main.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 5a01ac07..f3d728ca 100644 --- a/main.nf +++ b/main.nf @@ -459,7 +459,7 @@ process filtered_cellgroup_markers { } process run_umap { - errorStrategy 'ignore' + //errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' @@ -493,7 +493,7 @@ process run_umap { } process run_tsne { - errorStrategy 'ignore' + //errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' @@ -632,14 +632,14 @@ workflow { harmony_batch.out.combine(perplexity_ch), pca_param ) - TNSEs_ch - .filter { it.exitStatus == 0 } + //TNSEs_ch + // .filter { it.exitStatus == 0 } UMAPs_ch = run_umap( neighbours_for_umap.out ) - UMAPs_ch - .filter { 
it.exitStatus == 0 } + //UMAPs_ch + // .filter { it.exitStatus == 0 } find_clusters( neighbours.out.combine(resolution_ch) ) From 44b0e31ecb3f6635656953e81701a91848eda095 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 16:54:11 +0000 Subject: [PATCH 056/159] Update main.nf - fixed run_umap --- main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index f3d728ca..879f1672 100644 --- a/main.nf +++ b/main.nf @@ -470,8 +470,8 @@ process run_umap { script: """ scanpy-run-umap \ - --neighbors-key 'neighbors_\$anndata' \ - --key-added 'neighbors_\$anndata' \ + --neighbors-key 'neighbors_$anndata' \ + --key-added 'neighbors_$anndata' \ --export-embedding embeddings.tsv \ --n-components 2 \ --min-dist 0.5 \ @@ -482,10 +482,10 @@ process run_umap { --random-state 0 \ --init-pos 'spectral' \ --input-format 'anndata' \ - \$anndata \ + $anndata \ --show-obj stdout \ --output-format anndata \ - 'umap_\$anndata.h5ad' + 'umap_$anndata.h5ad' # Not sure if following is needed # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv From 79b2127ad02d35276455955562dce43292b5e0f5 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 16:56:52 +0000 Subject: [PATCH 057/159] Update main.nf - fixes run_umap --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 879f1672..928d08b5 100644 --- a/main.nf +++ b/main.nf @@ -464,7 +464,7 @@ process run_umap { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: - each anndata + path anndata output: path 'umap_*.h5ad' script: @@ -636,7 +636,7 @@ workflow { // .filter { it.exitStatus == 0 } UMAPs_ch = run_umap( - neighbours_for_umap.out + neighbours_for_umap.out.flatten() ) //UMAPs_ch // .filter { it.exitStatus == 0 } From a8319c1ed2fff54358ad9e10bbfa0e8fa17d08a4 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 16:58:19 +0000 Subject: [PATCH 058/159] Update main.nf - fixes 
run_umap --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 928d08b5..a23ed4a2 100644 --- a/main.nf +++ b/main.nf @@ -466,7 +466,7 @@ process run_umap { input: path anndata output: - path 'umap_*.h5ad' + path "umap_*.h5ad" script: """ scanpy-run-umap \ @@ -485,7 +485,7 @@ process run_umap { $anndata \ --show-obj stdout \ --output-format anndata \ - 'umap_$anndata.h5ad' + 'umap_${anndata}.h5ad' # Not sure if following is needed # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv From 06c13b60a28fa27d282a710884cb2aeb0d277d6b Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Mon, 4 Nov 2024 16:59:47 +0000 Subject: [PATCH 059/159] add simplified ci config --- .github/workflows/ci.config | 15 +++++++++++++++ .github/workflows/nextflow-linter.yaml | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/ci.config diff --git a/.github/workflows/ci.config b/.github/workflows/ci.config new file mode 100644 index 00000000..27cc59ae --- /dev/null +++ b/.github/workflows/ci.config @@ -0,0 +1,15 @@ + +process { + executor='slurm' + time = '7 d' + memory = '4 GB' +} + +singularity { + enabled = true +} + +conda { + createTimeout = "30 min" + useMamba = true +} diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml index 91fdc140..9ce16f71 100644 --- a/.github/workflows/nextflow-linter.yaml +++ b/.github/workflows/nextflow-linter.yaml @@ -32,4 +32,4 @@ jobs: run: nextflow -version - name: check syntax in main.nf - run: nextflow main.nf -preview || (echo "Syntax check failed. Please review the error message above." && exit 1) + run: nextflow main.nf -c .github/workflows/ci.config -preview || (echo "Syntax check failed. Please review the error message above." 
&& exit 1) From 8a50a175266a947a07471055e0d154ab6c05eda3 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Mon, 4 Nov 2024 17:03:16 +0000 Subject: [PATCH 060/159] disable CI for now to avoid noise --- .github/workflows/nextflow-linter.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml index 9ce16f71..6bcc9eec 100644 --- a/.github/workflows/nextflow-linter.yaml +++ b/.github/workflows/nextflow-linter.yaml @@ -1,12 +1,12 @@ name: nf-core linting on: - push: - branches: - - main - pull_request: - branches: - - main +# push: +# branches: +# - main +# pull_request: +# branches: +# - main jobs: lint: From 19c66583ca8502c727701b6be10f0a68d64b1032 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 17:05:20 +0000 Subject: [PATCH 061/159] Update main.nf - adds parameter to neighbours_for_umap --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index a23ed4a2..b022ab56 100644 --- a/main.nf +++ b/main.nf @@ -350,6 +350,7 @@ process neighbours_for_umap { """ scanpy-neighbors \ --n-neighbors $n_neighbours \ + --key-added 'neighbors_n_neighbors_${n_neighbours}' --method 'umap' \ --metric 'euclidean' \ --random-state '0' \ From 216a89ffce8f362b841c6d1287280d0d507fd046 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 17:07:12 +0000 Subject: [PATCH 062/159] Update main.nf - fixes neighbours_for_umap --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index b022ab56..a1096b6d 100644 --- a/main.nf +++ b/main.nf @@ -350,7 +350,7 @@ process neighbours_for_umap { """ scanpy-neighbors \ --n-neighbors $n_neighbours \ - --key-added 'neighbors_n_neighbors_${n_neighbours}' + --key-added 'neighbors_n_neighbors_${n_neighbours}' \ --method 'umap' \ --metric 'euclidean' \ --random-state '0' \ From aa20bd6341dfaaea4ff02a111d7df8fbf6851a61 Mon Sep 17 00:00:00 2001 From: Anil Thanki
Date: Mon, 4 Nov 2024 17:16:12 +0000 Subject: [PATCH 063/159] Update main.nf - fixes neighbor spelling as per script name --- main.nf | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/main.nf b/main.nf index a1096b6d..884f50c8 100644 --- a/main.nf +++ b/main.nf @@ -307,14 +307,14 @@ process harmony_batch { """ } -process neighbours { +process neighbors { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: path anndata val pca_param output: - path 'neighbours.h5ad' + path 'neighbors.h5ad' script: """ @@ -329,12 +329,12 @@ process neighbours { $anndata \ --show-obj stdout \ --output-format anndata \ - 'neighbours.h5ad' + 'neighbors.h5ad' """ } -process neighbours_for_umap { +process neighbors_for_umap { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' } @@ -342,15 +342,15 @@ process neighbours_for_umap { maxRetries 3 input: - tuple path(anndata), val(n_neighbours) + tuple path(anndata), val(n_neighbors) val pca_param output: - path "neighbours_${n_neighbours}.h5ad" + path "neighbors_${n_neighbors}.h5ad" script: """ scanpy-neighbors \ - --n-neighbors $n_neighbours \ - --key-added 'neighbors_n_neighbors_${n_neighbours}' \ + --n-neighbors $n_neighbors \ + --key-added 'neighbors_n_neighbors_${n_neighbors}' \ --method 'umap' \ --metric 'euclidean' \ --random-state '0' \ @@ -360,7 +360,7 @@ process neighbours_for_umap { $anndata \ --show-obj stdout \ --output-format anndata \ - 'neighbours_${n_neighbours}.h5ad' + 'neighbors_${n_neighbors}.h5ad' """ } @@ -621,11 +621,11 @@ workflow { run_pca.out, batch_variable ) - neighbours( + neighbors( harmony_batch.out, pca_param ) - neighbours_for_umap( + neighbors_for_umap( harmony_batch.out.combine(neighbors_ch), pca_param ) @@ -637,11 +637,11 @@ workflow { // .filter { it.exitStatus == 0 } UMAPs_ch = run_umap( - neighbours_for_umap.out.flatten() + 
neighbors_for_umap.out.flatten() ) //UMAPs_ch // .filter { it.exitStatus == 0 } find_clusters( - neighbours.out.combine(resolution_ch) + neighbors.out.combine(resolution_ch) ) } From 5b5a48b848eb23c9ec115dc84cfb47c5a516aa68 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Mon, 4 Nov 2024 17:37:52 +0000 Subject: [PATCH 064/159] Update main.nf - fixes run_umap for neighbor key --- main.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 884f50c8..d9573cb9 100644 --- a/main.nf +++ b/main.nf @@ -470,8 +470,9 @@ process run_umap { path "umap_*.h5ad" script: """ - scanpy-run-umap \ - --neighbors-key 'neighbors_$anndata' \ + n_neighbor="${anndata/.h5ad/}" + scanpy-run-umap \ + --neighbors-key 'neighbors_\$n_neighbor' \ --key-added 'neighbors_$anndata' \ --export-embedding embeddings.tsv \ --n-components 2 \ From cf621e5b3ce659221822a1e11e601852ccee76bb Mon Sep 17 00:00:00 2001 From: fg_atlas Date: Tue, 5 Nov 2024 10:12:42 +0000 Subject: [PATCH 065/159] fixes umap --- main.nf | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index d9573cb9..27d6a63e 100644 --- a/main.nf +++ b/main.nf @@ -470,10 +470,12 @@ process run_umap { path "umap_*.h5ad" script: """ - n_neighbor="${anndata/.h5ad/}" - scanpy-run-umap \ - --neighbors-key 'neighbors_\$n_neighbor' \ - --key-added 'neighbors_$anndata' \ + VAR="$anndata" + n_number="\${VAR%.h5ad}" + echo \$n_number + scanpy-run-umap \ + --neighbors-key "neighbors_n_\${n_number}" \ + --key-added "neighbors_\${n_number}" \ --export-embedding embeddings.tsv \ --n-components 2 \ --min-dist 0.5 \ @@ -487,7 +489,7 @@ process run_umap { $anndata \ --show-obj stdout \ --output-format anndata \ - 'umap_${anndata}.h5ad' + "umap_\${n_number}.h5ad" # Not sure if following is needed # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv From 806953a97ecd35b5ce643ea573b731220f91ace9 Mon Sep 17 00:00:00 2001 From: fg_atlas Date: Tue, 5 Nov 2024 12:50:17 
+0000 Subject: [PATCH 066/159] populates find_markers process --- main.nf | 35 ++++++++++++++++++++++++++++++++--- nextflow.config | 7 ------- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/main.nf b/main.nf index 27d6a63e..e89e0f06 100644 --- a/main.nf +++ b/main.nf @@ -10,7 +10,6 @@ params.slotname = "louvain_resolution" params.clustering_slotname = params.resolution_values.collect { params.slotname + "_" + it } params.merged_group_slotname = params.clustering_slotname + params.celltype_field - log.info """ =============================== WORKFLOW PARAMETER VALUES @@ -440,12 +439,28 @@ process build_list { } process find_markers { + errorStrategy 'ignore' + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: - + tuple path(anndata), val(merged_group_slotname) output: - + path "markers_${merged_group_slotname}.h5ad" script: """ + scanpy-find-markers \ + --save diffexp.tsv \ + --n-genes '100' \ + --groupby '${merged_group_slotname}' \ + --key-added 'markers_${merged_group_slotname}' \ + --method 'wilcoxon' \ + --use-raw \ + --reference 'rest' \ + --filter-params 'min_in_group_fraction:0.0,max_out_group_fraction:1.0,min_fold_change:1.0' \ + --input-format 'anndata' \ + $anndata \ + --show-obj stdout \ + --output-format anndata \ + 'markers_${merged_group_slotname}.h5ad' """ } @@ -582,6 +597,7 @@ workflow { neighbors_ch = channel.fromList(params.neighbor_values) perplexity_ch = channel.fromList(params.perplexity_values) resolution_ch = channel.fromList(params.resolution_values) + merged_group_slotname_ch = Channel.fromList(params.merged_group_slotname) Column_rearrange_1( genemeta, @@ -647,4 +663,17 @@ workflow { find_clusters( neighbors.out.combine(resolution_ch) ) + + // Combine the outputs of find_clusters and neighbors processes + combined_outputs = find_clusters.out.mix(neighbors.out) + + processed_files = combined_outputs.map { file -> + // Extract the sample number from the file name + def sampleNumber = 
file.baseName.replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field) + [file, sampleNumber] // Create a tuple with sample number and file + } + + find_markers( + processed_files + ) } diff --git a/nextflow.config b/nextflow.config index 6dce3ed4..46bba393 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,8 +1,6 @@ - process { executor='slurm' queue="$SCXA_HPC_QUEUE" - clusterOptions="$SCXA_HPC_OPTIONS" time = '7 d' memory = '4 GB' queueSize=500 @@ -12,15 +10,10 @@ process { singularity { enabled = true - cacheDir = "$SCXA_SINGULARITY_CACHE" } conda { - cacheDir = "$SCXA_WORKFLOW_ROOT/envs" createTimeout = "30 min" useMamba = true } -params { - -} From 8f32220988afe10a088e41192295d9ab8f6fa45b Mon Sep 17 00:00:00 2001 From: fg_atlas Date: Tue, 5 Nov 2024 13:32:30 +0000 Subject: [PATCH 067/159] revert config changes --- nextflow.config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nextflow.config b/nextflow.config index 46bba393..e09dcc5e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,6 +1,7 @@ process { executor='slurm' queue="$SCXA_HPC_QUEUE" + clusterOptions="$SCXA_HPC_OPTIONS" time = '7 d' memory = '4 GB' queueSize=500 @@ -10,10 +11,15 @@ process { singularity { enabled = true + cacheDir = "$SCXA_SINGULARITY_CACHE" } conda { + cacheDir = "$SCXA_WORKFLOW_ROOT/envs" createTimeout = "30 min" useMamba = true } +params { + +} From bbfc882f3cae16d72a41417e71347aba0aad8200 Mon Sep 17 00:00:00 2001 From: fg_atlas Date: Tue, 5 Nov 2024 14:43:23 +0000 Subject: [PATCH 068/159] adds python script for final project process --- scripts/final_project.py | 616 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 616 insertions(+) create mode 100644 scripts/final_project.py diff --git a/scripts/final_project.py b/scripts/final_project.py new file mode 100644 index 00000000..0430187b --- /dev/null +++ b/scripts/final_project.py @@ -0,0 +1,616 @@ + +import scanpy as sc +import anndata +from numpy import all 
+import logging + +adata = sc.read('input.h5') + + +gene_name = 'index' +qc_vars = list() + + + +gene_names = getattr(adata.var, gene_name) + + +ad_s = sc.read('r_source.h5') +if not all(adata.obs.index.isin(ad_s.obs.index)): + logging.error("Specified object for .raw must contain all .obs from main object.") + sys.exit(1) +else: + adata.raw = ad_s[adata.obs.index] +del ad_s + +ad_s = sc.read('x_source_0.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + if "filtered" == '': + logging.error("%sth destination layer for %sth X source not specified" % ("0", "0")) + sys.exit(1) + adata.layers["filtered"] = ad_s.X +else: + logging.error("X source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('x_source_1.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + if "normalised" == '': + logging.error("%sth destination layer for %sth X source not specified" % ("1", "1")) + sys.exit(1) + adata.layers["normalised"] = ad_s.X +else: + logging.error("X source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s + + +ad_s = sc.read('obs_source_0.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.obs: + suffix = "_0" + + adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] + if k_to_copy in ad_s.uns.keys(): + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Observation source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('obs_source_1.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) + for k_to_copy in keys_to_copy: + 
suffix='' + if k_to_copy in adata.obs: + suffix = "_1" + + adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] + if k_to_copy in ad_s.uns.keys(): + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Observation source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('obs_source_2.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.obs: + suffix = "_2" + + adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] + if k_to_copy in ad_s.uns.keys(): + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Observation source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('obs_source_3.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.obs: + suffix = "_3" + + adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] + if k_to_copy in ad_s.uns.keys(): + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Observation source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('obs_source_4.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.obs: + suffix = "_4" + + adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] + if k_to_copy in ad_s.uns.keys(): + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Observation source 4 AnnData file is not compatible to be merged to main 
AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('obs_source_5.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.obs: + suffix = "_5" + + adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] + if k_to_copy in ad_s.uns.keys(): + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Observation source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('obs_source_6.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.obs: + suffix = "_6" + + adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] + if k_to_copy in ad_s.uns.keys(): + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Observation source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('obs_source_7.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.obs: + suffix = "_7" + + adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] + if k_to_copy in ad_s.uns.keys(): + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Observation source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('obs_source_8.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in 
adata.obs: + suffix = "_8" + + adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] + if k_to_copy in ad_s.uns.keys(): + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Observation source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s + + +ad_s = sc.read('embedding_source_0.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_0" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_0" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_1.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_1" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_1" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_2.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_2" + 
adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_2" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_3.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_3" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_3" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_4.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_4" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_4" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_5.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in 
keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_5" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_5" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_6.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_6" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_6" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_7.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_7" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_7" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_8.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + 
keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_8" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_8" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_9.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_9" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_9" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 9 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_10.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_10" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_10" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 10 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_11.h5') 
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_11" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_11" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 11 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_12.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_12" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_12" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 12 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_13.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_13" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_13" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 13 AnnData file is not compatible to be merged to main AnnData file, 
different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_14.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_14" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_14" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 14 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_15.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_15" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_15" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + logging.error("Embedding source 15 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('embedding_source_16.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_16" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = "_16" + adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] +else: + 
logging.error("Embedding source 16 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s + +ad_s = sc.read('uns_source_0.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.uns: + suffix="_0" + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Uns source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('uns_source_1.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.uns: + suffix="_1" + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Uns source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('uns_source_2.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.uns: + suffix="_2" + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Uns source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('uns_source_3.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.uns: + suffix="_3" + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Uns source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del 
ad_s +ad_s = sc.read('uns_source_4.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.uns: + suffix="_4" + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Uns source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('uns_source_5.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.uns: + suffix="_5" + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Uns source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('uns_source_6.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.uns: + suffix="_6" + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Uns source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('uns_source_7.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.uns: + suffix="_7" + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Uns source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s +ad_s = sc.read('uns_source_8.h5') +if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in 
ad_s.uns.keys() if "marker" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.uns: + suffix="_8" + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] +else: + logging.error("Uns source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) +del ad_s + + +if len(qc_vars) > 0: + pct_top = [50] + sc.pp.calculate_qc_metrics(adata, qc_vars=qc_vars, percent_top=pct_top, inplace=True) + +if 'n_genes' not in adata.obs.columns: + sc.pp.filter_cells(adata, min_genes=0) +if 'n_counts' not in adata.obs.columns: + sc.pp.filter_cells(adata, min_counts=0) +if 'n_cells' not in adata.var.columns: + sc.pp.filter_genes(adata, min_cells=0) +if 'n_counts' not in adata.var.columns: + sc.pp.filter_genes(adata, min_counts=0) + +adata.write('output.h5', compression='gzip') + \ No newline at end of file From b9b11890a4223d8c1d331a8676362cb9ce6257b0 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 5 Nov 2024 14:52:53 +0000 Subject: [PATCH 069/159] Update nextflow-linter.yaml - adding back changes pushed by @pmadrgal --- .github/workflows/nextflow-linter.yaml | 44 +++++++++++++++++++------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml index 6bcc9eec..7b68f59b 100644 --- a/.github/workflows/nextflow-linter.yaml +++ b/.github/workflows/nextflow-linter.yaml @@ -1,20 +1,17 @@ name: nf-core linting - on: -# push: -# branches: -# - main -# pull_request: -# branches: -# - main + push: + branches: + - main + pull_request: + branches: + - main jobs: lint: runs-on: ubuntu-latest steps: - - name: checkout repository - uses: actions/checkout@v2 - + - uses: actions/checkout@v2 - uses: actions/setup-java@v2 with: distribution: 'adopt' @@ -28,8 +25,31 @@ jobs: mv nextflow $HOME/.local/bin/ echo "$HOME/.local/bin" >> $GITHUB_PATH + - name: set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: 
install nf-core tools + run: | + python -m pip install --upgrade pip + pip install nf-core + - name: check Nextflow version run: nextflow -version - - name: check syntax in main.nf - run: nextflow main.nf -c .github/workflows/ci.config -preview || (echo "Syntax check failed. Please review the error message above." && exit 1) + # https://nf-co.re/tools/docs/latest/pipeline_lint_tests/ + - name: create .nf-core.yml + run: | + cat << EOF > .nf-core.yml + repository_type: pipeline + lint: + actions_awsfulltest: False + actions_awstest: False + multiqc_config: False + schema_lint: False + schema_params: False + EOF + + - name: run nf-core lint + run: nf-core pipelines lint --dir . From d2b4a38ec8fb0fe1d938a1dcdd9f3ca8c0631684 Mon Sep 17 00:00:00 2001 From: fg_atlas Date: Tue, 5 Nov 2024 15:17:56 +0000 Subject: [PATCH 070/159] Fixed filter_gene and filter_cell connections --- main.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index e89e0f06..fc09298c 100644 --- a/main.nf +++ b/main.nf @@ -138,7 +138,6 @@ process scanpy_filter_cells { input: path anndata - path genes output: path 'filtered_cell_anndata.h5ad' @@ -621,13 +620,16 @@ workflow { ) scanpy_filter_cells( scanpy_read_10x.out, + ) + scanpy_filter_genes( + scanpy_filter_cells.out, Column_rearrange_1.out[0] ) normalise_data( - scanpy_filter_cells.out + scanpy_filter_genes.out ) normalise_internal_data( - scanpy_filter_cells.out + scanpy_filter_genes.out ) find_variable_genes( normalise_internal_data.out, From 4b90e9f69f435d4e61ff58f29f15ea9d0611e03d Mon Sep 17 00:00:00 2001 From: fg_atlas Date: Tue, 5 Nov 2024 16:43:28 +0000 Subject: [PATCH 071/159] changes permission --- scripts/final_project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 scripts/final_project.py diff --git a/scripts/final_project.py b/scripts/final_project.py old mode 100644 new mode 100755 index 0430187b..0153f87b --- a/scripts/final_project.py +++ 
b/scripts/final_project.py @@ -613,4 +613,4 @@ sc.pp.filter_genes(adata, min_counts=0) adata.write('output.h5', compression='gzip') - \ No newline at end of file + From 4961d5f0c15c0295ae53c25286f8c48be3ebf4b5 Mon Sep 17 00:00:00 2001 From: fg_atlas Date: Tue, 5 Nov 2024 16:46:10 +0000 Subject: [PATCH 072/159] populate make_project_file process --- main.nf | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index fc09298c..28920649 100644 --- a/main.nf +++ b/main.nf @@ -575,11 +575,34 @@ process merge_embeddings { process make_project_file { input: - + path neighbors + path scanpy_read_10x + path filter_genes + path normalise_data + path find_markers + path TNSEs_mix_UMAPs output: - + path "output.h5" script: """ + ln -s $neighbors input.h5 + ln -s $scanpy_read_10x r_source.h5 + ln -s '$filter_genes' x_source_0.h5 + ln -s '$normalise_data' x_source_1.h5 + count=0 + for i in $find_markers + do + ln -s "\${i}" obs_source_\${count}.h5 + ln -s "\${i}" uns_source_\${count}.h5 + ((count++)) + done + count=0 + for i in $TNSEs_mix_UMAPs + do + ln -s "\${i}" embedding_source_\${count}.h5 + ((count++)) + done + python scripts/final_project.py """ } @@ -678,4 +701,12 @@ workflow { find_markers( processed_files ) + make_project_file( + neighbors.out, + scanpy_read_10x.out, + scanpy_filter_genes.out, + normalise_data.out, + find_markers.out.collect(), + TNSEs_ch.mix(UMAPs_ch).collect() + ) } From d02f1a32ad19bebe38ff4f79d2bb0ddf8e972256 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 5 Nov 2024 17:09:48 +0000 Subject: [PATCH 073/159] Update final_project.py - removing redundant code --- scripts/final_project.py | 336 ++++----------------------------------- 1 file changed, 29 insertions(+), 307 deletions(-) diff --git a/scripts/final_project.py b/scripts/final_project.py index 0153f87b..a14edb69 100755 --- a/scripts/final_project.py +++ b/scripts/final_project.py @@ -3,6 +3,7 @@ import anndata from numpy 
import all import logging +import os adata = sc.read('input.h5') @@ -13,7 +14,8 @@ gene_names = getattr(adata.var, gene_name) - +# Define the directory containing your source files +source_dir = '.' # Adjust to the appropriate path ad_s = sc.read('r_source.h5') if not all(adata.obs.index.isin(ad_s.obs.index)): @@ -182,312 +184,32 @@ del ad_s -ad_s = sc.read('embedding_source_0.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_0" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_0" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_1.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_1" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_1" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_2.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_2" - 
adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_2" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_3.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_3" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_3" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_4.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_4" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_4" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_5.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in 
keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_5" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_5" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_6.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_6" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_6" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_7.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_7" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_7" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_8.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - 
keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_8" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_8" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_9.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_9" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_9" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 9 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_10.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_10" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_10" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 10 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_11.h5') 
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_11" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_11" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 11 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_12.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_12" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_12" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 12 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_13.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_13" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_13" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 13 AnnData file is not compatible to be merged to main AnnData file, 
different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_14.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_14" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_14" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 14 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_15.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_15" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_15" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - logging.error("Embedding source 15 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('embedding_source_16.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_16" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = "_16" - adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy] -else: - 
logging.error("Embedding source 16 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s + + +embedding_sources = [file for file in os.listdir(source_dir) if file.startswith('embedding_source_') and file.endswith('.h5')] + +for idx, embedding_file in enumerate(sorted(embedding_sources)): + ad_s = sc.read(os.path.join(source_dir, embedding_file)) + if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + # Copy tsne embeddings + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = f"_{idx}" + adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy] + # Copy umap embeddings + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm + suffix = f"_{idx}" + adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy] + else: + logging.error(f"Embedding source {idx} AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) + del ad_s + ad_s = sc.read('uns_source_0.h5') if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): From a010ff54f02bb0c7c41fb33cb7775bd4f299366a Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 5 Nov 2024 17:18:36 +0000 Subject: [PATCH 074/159] Update final_project.py - removes redundant code for uns_source files --- scripts/final_project.py | 150 ++++++++------------------------------- 1 file changed, 28 insertions(+), 122 deletions(-) diff --git a/scripts/final_project.py b/scripts/final_project.py index a14edb69..b7871e9c 100755 --- a/scripts/final_project.py +++ b/scripts/final_project.py @@ -191,134 +191,40 @@ for idx, embedding_file in enumerate(sorted(embedding_sources)): ad_s = sc.read(os.path.join(source_dir, embedding_file)) if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - # Copy tsne embeddings - 
keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm: - suffix = f"_{idx}" - adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy] - # Copy umap embeddings - keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) - for k_to_copy in keys_to_copy: - suffix = '' - if k_to_copy in adata.obsm - suffix = f"_{idx}" - adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy] + # Copy tsne embeddings + keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm: + suffix = f"_{idx}" + adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy] + # Copy umap embeddings + keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) + for k_to_copy in keys_to_copy: + suffix = '' + if k_to_copy in adata.obsm + suffix = f"_{idx}" + adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy] else: logging.error(f"Embedding source {idx} AnnData file is not compatible to be merged to main AnnData file, different cell names.") sys.exit(1) del ad_s -ad_s = sc.read('uns_source_0.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.uns: - suffix="_0" - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Uns source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('uns_source_1.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.uns: - suffix="_1" - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Uns source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - 
sys.exit(1) -del ad_s -ad_s = sc.read('uns_source_2.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.uns: - suffix="_2" - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Uns source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('uns_source_3.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.uns: - suffix="_3" - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Uns source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('uns_source_4.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.uns: - suffix="_4" - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Uns source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('uns_source_5.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.uns: - suffix="_5" - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Uns source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('uns_source_6.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = 
(k for k in ad_s.uns.keys() if "marker" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.uns: - suffix="_6" - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Uns source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('uns_source_7.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.uns: - suffix="_7" - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Uns source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('uns_source_8.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.uns: - suffix="_8" - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Uns source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s +uns_sources = [file for file in os.listdir(source_dir) if file.startswith('uns_source_') and file.endswith('.h5')] +for idx, uns_file in enumerate(sorted(uns_sources)): + ad_s = sc.read(os.path.join(source_dir, uns_file)) + if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k) + for k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.uns: + suffix=f"_{idx}" + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] + else: + logging.error(f"Uns source {idx} AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) + del ad_s if len(qc_vars) > 0: From eb5a6909df7d28d6fd52c3bf5cd66c85be841a33 Mon 
Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 6 Nov 2024 10:24:27 +0000 Subject: [PATCH 075/159] Update final_project.py - removes redundant obs_source operations --- scripts/final_project.py | 154 +++++---------------------------------- 1 file changed, 17 insertions(+), 137 deletions(-) diff --git a/scripts/final_project.py b/scripts/final_project.py index b7871e9c..b2e7a9c3 100755 --- a/scripts/final_project.py +++ b/scripts/final_project.py @@ -46,144 +46,24 @@ sys.exit(1) del ad_s +obs_sources = [file for file in os.listdir(source_dir) if file.startswith('obs_source_') and file.endswith('.h5')] -ad_s = sc.read('obs_source_0.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.obs: - suffix = "_0" - - adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] - if k_to_copy in ad_s.uns.keys(): - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Observation source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('obs_source_1.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.obs: - suffix = "_1" - - adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] - if k_to_copy in ad_s.uns.keys(): - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Observation source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('obs_source_2.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.obs: - suffix 
= "_2" - - adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] - if k_to_copy in ad_s.uns.keys(): - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Observation source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('obs_source_3.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.obs: - suffix = "_3" - - adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] - if k_to_copy in ad_s.uns.keys(): - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Observation source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('obs_source_4.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.obs: - suffix = "_4" - - adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] - if k_to_copy in ad_s.uns.keys(): - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Observation source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('obs_source_5.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.obs: - suffix = "_5" - - adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] - if k_to_copy in ad_s.uns.keys(): - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Observation source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - 
sys.exit(1) -del ad_s -ad_s = sc.read('obs_source_6.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.obs: - suffix = "_6" - - adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] - if k_to_copy in ad_s.uns.keys(): - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Observation source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('obs_source_7.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.obs: - suffix = "_7" - - adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] - if k_to_copy in ad_s.uns.keys(): - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Observation source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s -ad_s = sc.read('obs_source_8.h5') -if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): - keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) - for k_to_copy in keys_to_copy: - suffix='' - if k_to_copy in adata.obs: - suffix = "_8" - - adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] - if k_to_copy in ad_s.uns.keys(): - adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] -else: - logging.error("Observation source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.") - sys.exit(1) -del ad_s - - +for idx, obs_source_file in enumerate(sorted(obs_sources)): + ad_s = sc.read(os.path.join(source_dir, obs_source_file)) + if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names): + keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k) + for 
k_to_copy in keys_to_copy: + suffix='' + if k_to_copy in adata.obs: + suffix = f"_{idx}" + + adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]] + if k_to_copy in ad_s.uns.keys(): + adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy] + else: + logging.error(f"Observation source {idx} AnnData file is not compatible to be merged to main AnnData file, different cell names.") + sys.exit(1) + del ad_s embedding_sources = [file for file in os.listdir(source_dir) if file.startswith('embedding_source_') and file.endswith('.h5')] From 814df0bbe81659dc7aa97d7d8dd529f34bd951a4 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 6 Nov 2024 10:32:37 +0000 Subject: [PATCH 076/159] Update main.nf - adds container in make_project_file process --- main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.nf b/main.nf index 28920649..06fb79a2 100644 --- a/main.nf +++ b/main.nf @@ -574,6 +574,8 @@ process merge_embeddings { process make_project_file { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: path neighbors path scanpy_read_10x From 376ad3a93dcb208bea9e9950bf7a691efb1c5005 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 6 Nov 2024 11:50:55 +0000 Subject: [PATCH 077/159] Update final_project.py --- scripts/final_project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/final_project.py b/scripts/final_project.py index b2e7a9c3..9616ded0 100755 --- a/scripts/final_project.py +++ b/scripts/final_project.py @@ -82,7 +82,7 @@ keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k) for k_to_copy in keys_to_copy: suffix = '' - if k_to_copy in adata.obsm + if k_to_copy in adata.obsm: suffix = f"_{idx}" adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy] else: From 73eb1675b898562ddae50d81251e9c51786fa6a9 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 6 Nov 2024 15:17:00 +0000 Subject: [PATCH 078/159] Update main.nf - fixinf make_project_file process --- main.nf | 52 
+++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/main.nf b/main.nf index 06fb79a2..885d5560 100644 --- a/main.nf +++ b/main.nf @@ -577,34 +577,36 @@ process make_project_file { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: - path neighbors - path scanpy_read_10x - path filter_genes - path normalise_data - path find_markers - path TNSEs_mix_UMAPs + path neighbors + path scanpy_read_10x + path filter_genes + path normalise_data + path find_markers + path TNSEs_mix_UMAPs output: - path "output.h5" + path "output.h5" script: """ - ln -s $neighbors input.h5 - ln -s $scanpy_read_10x r_source.h5 - ln -s '$filter_genes' x_source_0.h5 - ln -s '$normalise_data' x_source_1.h5 - count=0 - for i in $find_markers - do - ln -s "\${i}" obs_source_\${count}.h5 - ln -s "\${i}" uns_source_\${count}.h5 - ((count++)) - done - count=0 - for i in $TNSEs_mix_UMAPs - do - ln -s "\${i}" embedding_source_\${count}.h5 - ((count++)) - done - python scripts/final_project.py + ln -s $neighbors input.h5 + ln -s $scanpy_read_10x r_source.h5 + ln -s '$filter_genes' x_source_0.h5 + ln -s '$normalise_data' x_source_1.h5 + count=0 + for i in $find_markers + do + ln -sf "\${i}" obs_source_\${count}.h5 + ln -sf "\${i}" uns_source_\${count}.h5 + count=\$((count + 1)) + echo "\${count}" + done + count=0 + for i in $TNSEs_mix_UMAPs + do + ln -sf "\${i}" embedding_source_\${count}.h5 + count=\$((count + 1)) + echo "\${count}" + done + python ${projectDir}/scripts/final_project.py """ } From 9063a30346851ba545eab771118b83a1630780e5 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 6 Nov 2024 15:25:00 +0000 Subject: [PATCH 079/159] Update nextflow.config Co-authored-by: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> --- nextflow.config | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nextflow.config b/nextflow.config index e09dcc5e..2317fb25 100644 --- a/nextflow.config +++ b/nextflow.config @@ 
-7,6 +7,10 @@ process { queueSize=500 exitReadTimeout='100000 sec' pollInterval = '5sec' + // error strategy + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + memory = { 4.GB * 2 ^task.attempt } + maxRetries = 4 } singularity { From 0dc94e597fc94b8cfa0c648fe5ecba32483edfe7 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 6 Nov 2024 15:25:12 +0000 Subject: [PATCH 080/159] Update main.nf Co-authored-by: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> --- main.nf | 3 --- 1 file changed, 3 deletions(-) diff --git a/main.nf b/main.nf index 885d5560..2f5321cb 100644 --- a/main.nf +++ b/main.nf @@ -335,9 +335,6 @@ process neighbors { process neighbors_for_umap { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' - errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' } - memory { 4.GB * task.attempt } - maxRetries 3 input: tuple path(anndata), val(n_neighbors) From b8cb996a3671073c5083790782cffd5cda90ca79 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 6 Nov 2024 20:59:28 +0000 Subject: [PATCH 081/159] Update main.nf - now uses param `dir_path` param for input files --- main.nf | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 2f5321cb..a90e3b48 100644 --- a/main.nf +++ b/main.nf @@ -2,6 +2,7 @@ nextflow.enable.dsl=2 +params.dir_path = "." 
params.celltype_field = 'NO_CELLTYPE_FIELD' params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50'] params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50'] @@ -14,6 +15,7 @@ log.info """ =============================== WORKFLOW PARAMETER VALUES =============================== +EXP dir path: ${params.dir_path} celltype_field: ${params.celltype_field} neighbor_values: ${params.neighbor_values} perplexity_values: ${params.perplexity_values} @@ -610,11 +612,11 @@ process make_project_file { workflow { // Create input channel (single file via CLI parameter) - genemeta = Channel.fromPath('genes_metadata.tsv') - genes = Channel.fromPath('genes.tsv') - barcodes = Channel.fromPath('barcodes.tsv') - matrix = Channel.fromPath('matrix.mtx') - cellmeta = Channel.fromPath('cell_metadata.tsv') + genemeta = Channel.fromPath("${params.dir_path}/genes_metadata.tsv") + genes = Channel.fromPath("${params.dir_path}/genes.tsv") + barcodes = Channel.fromPath("${params.dir_path}/barcodes.tsv") + matrix = Channel.fromPath("${params.dir_path}/matrix.mtx") + cellmeta = Channel.fromPath("${params.dir_path}/cell_metadata.tsv") pca_param = Channel.value('X_pca') batch_variable = Channel.value('') neighbors_ch = channel.fromList(params.neighbor_values) From 9f512a922f686119e18e7c1138efd8c2ece49b07 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 6 Nov 2024 21:10:37 +0000 Subject: [PATCH 082/159] Create data_prep.sh --- scripts/data_prep.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 scripts/data_prep.sh diff --git a/scripts/data_prep.sh b/scripts/data_prep.sh new file mode 100644 index 00000000..df486fb2 --- /dev/null +++ b/scripts/data_prep.sh @@ -0,0 +1,19 @@ +if [ -z "$SCXA_WORKFLOW_ROOT" ]; then + echo "Variable SCXA_WORKFLOW_ROOT is not defined or empty. Please load SC env." + echo "Exiting..." 
+ exit 1; +fi + +EXP_ID=$1 + +echo "Creating ${pwd}/${EXP_ID} directory" +mkdir ${EXP_ID} +cd ${EXP_ID} + +echo "Copying data to ${pwd}/${EXP_ID}" +cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/${EXP_ID}.cell_metadata.tsv cell_metadata.tsv +cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/genes.tsv.gz . && gunzip genes.tsv.gz +cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/matrix.mtx.gz . && gunzip matrix.mtx.gz +cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/barcodes.tsv.gz . && gunzip barcodes.tsv.gz +cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/reference/gene_annotation.txt genes_metadata.tsv +echo "Copying data for ${EXP_ID} finished" From 2000f3002e9bbc6a842379d97ccb17637d24e349 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 6 Nov 2024 21:20:10 +0000 Subject: [PATCH 083/159] Update data_prep.sh - fixing log and force unzip --- scripts/data_prep.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/data_prep.sh b/scripts/data_prep.sh index df486fb2..6a38866c 100644 --- a/scripts/data_prep.sh +++ b/scripts/data_prep.sh @@ -6,14 +6,14 @@ fi EXP_ID=$1 -echo "Creating ${pwd}/${EXP_ID} directory" -mkdir ${EXP_ID} -cd ${EXP_ID} +echo "Creating $(pwd)/${EXP_ID} directory" +mkdir -p $(pwd)/${EXP_ID} +cd $(pwd)/${EXP_ID} echo "Copying data to ${pwd}/${EXP_ID}" cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/${EXP_ID}.cell_metadata.tsv cell_metadata.tsv -cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/genes.tsv.gz . && gunzip genes.tsv.gz -cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/matrix.mtx.gz . && gunzip matrix.mtx.gz -cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/barcodes.tsv.gz . && gunzip barcodes.tsv.gz +cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/genes.tsv.gz . 
&& gunzip -f genes.tsv.gz +cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/matrix.mtx.gz . && gunzip -f matrix.mtx.gz +cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/barcodes.tsv.gz . && gunzip -f barcodes.tsv.gz cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/reference/gene_annotation.txt genes_metadata.tsv echo "Copying data for ${EXP_ID} finished" From ee8b177088bfa5a6cb17838171798cd3d41821aa Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 7 Nov 2024 10:08:11 +0000 Subject: [PATCH 084/159] Update main.nf - adds results dir --- main.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/main.nf b/main.nf index a90e3b48..69b92841 100644 --- a/main.nf +++ b/main.nf @@ -3,6 +3,7 @@ nextflow.enable.dsl=2 params.dir_path = "." +params.result_dir_path = params.dir_path + "/results" params.celltype_field = 'NO_CELLTYPE_FIELD' params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50'] params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50'] @@ -16,6 +17,7 @@ log.info """ WORKFLOW PARAMETER VALUES =============================== EXP dir path: ${params.dir_path} +Results results_dir_path: ${params.result_dir_path} celltype_field: ${params.celltype_field} neighbor_values: ${params.neighbor_values} perplexity_values: ${params.perplexity_values} @@ -573,6 +575,8 @@ process merge_embeddings { process make_project_file { + publishDir params.result_dir_path, mode: 'copy' + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: From 47b6395d4a82839fa321fa3c5b3dedfa083f9807 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 7 Nov 2024 10:23:10 +0000 Subject: [PATCH 085/159] Update nextflow.config - temp commenting lines from config file --- nextflow.config | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/nextflow.config b/nextflow.config index 2317fb25..fb2e6015 100644 --- a/nextflow.config +++ 
b/nextflow.config @@ -1,29 +1,29 @@ process { executor='slurm' queue="$SCXA_HPC_QUEUE" - clusterOptions="$SCXA_HPC_OPTIONS" + // clusterOptions="$SCXA_HPC_OPTIONS" time = '7 d' memory = '4 GB' queueSize=500 exitReadTimeout='100000 sec' pollInterval = '5sec' // error strategy - errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } - memory = { 4.GB * 2 ^task.attempt } - maxRetries = 4 + // errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + // memory = { 4.GB * 2 ^task.attempt } + // maxRetries = 4 } singularity { enabled = true - cacheDir = "$SCXA_SINGULARITY_CACHE" + // cacheDir = "$SCXA_SINGULARITY_CACHE" } conda { - cacheDir = "$SCXA_WORKFLOW_ROOT/envs" + // cacheDir = "$SCXA_WORKFLOW_ROOT/envs" createTimeout = "30 min" useMamba = true } -params { - -} +// params { +// +// } From 82408ae41275272384030143fd8f911d300b78ef Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Thu, 7 Nov 2024 11:38:44 +0000 Subject: [PATCH 086/159] Add draft processes and conditional for scrublet --- main.nf | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/main.nf b/main.nf index 69b92841..7752c664 100644 --- a/main.nf +++ b/main.nf @@ -108,6 +108,38 @@ process mergeGeneFiles { """ } +process scanpy_multiplet_scrublet { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + + input: + path anndata + val batch_variable + + output: + path 'anndata.h5ad' + + script: + """ + echo $batch_variable > scanpy_multiplet_scrublet.test + cp $anndata anndata.h5ad + """ +} + +process scanpy_plot_scrublet { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + + input: + path anndata + + output: + path 'scanpy_plot_scrublet.test' + + script: + """ + echo $anndata > scanpy_plot_scrublet.test + """ +} + process scanpy_read_10x { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' @@ -648,6 +680,27 @@ workflow { cellmeta, genemeta ) + + if ( 
params.is_droplet ) { + SCRUBLET_ch = scanpy_multiplet_scrublet( + scanpy_read_10x.out, + batch_variable + ) + scanpy_plot_scrublet( + SCRUBLET_ch + ) + scanpy_filter_cells( + SCRUBLET_ch, + Column_rearrange_1.out[0] + ) + } + else { + scanpy_filter_cells( + scanpy_read_10x.out, + Column_rearrange_1.out[0] + ) + } + scanpy_filter_cells( scanpy_read_10x.out, ) From 1b5847db20dd6cf40dd8d83301e3895fe5387506 Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Thu, 7 Nov 2024 11:43:50 +0000 Subject: [PATCH 087/159] Add param technology, change conditional --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7752c664..eaf43afa 100644 --- a/main.nf +++ b/main.nf @@ -2,6 +2,7 @@ nextflow.enable.dsl=2 +params.technology = "plate" params.dir_path = "." params.result_dir_path = params.dir_path + "/results" params.celltype_field = 'NO_CELLTYPE_FIELD' @@ -681,7 +682,7 @@ workflow { genemeta ) - if ( params.is_droplet ) { + if ( params.technology == "droplet" ) { SCRUBLET_ch = scanpy_multiplet_scrublet( scanpy_read_10x.out, batch_variable From 2b37608f5d1774313b186d7307d6198999f68ef5 Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Thu, 7 Nov 2024 11:53:17 +0000 Subject: [PATCH 088/159] Correct workflow --- main.nf | 63 ++++++++++++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/main.nf b/main.nf index eaf43afa..85688c97 100644 --- a/main.nf +++ b/main.nf @@ -109,64 +109,64 @@ process mergeGeneFiles { """ } -process scanpy_multiplet_scrublet { +process scanpy_read_10x { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: - path anndata - val batch_variable + path matrix + path genes + path barcodes + path cellmeta + path genemeta output: path 'anndata.h5ad' script: """ - echo $batch_variable > scanpy_multiplet_scrublet.test - cp $anndata anndata.h5ad + #ln -s $matrix matrix.mtx + ln -s $genes genes.tsv + #ln -s $barcodes barcodes.tsv + + 
scanpy-read-10x --input-10x-mtx ./ \ + --var-names 'gene_ids' \ + --extra-obs $cellmeta \ + --extra-var $genemeta \ + --show-obj stdout \ + --output-format anndata \ + 'anndata.h5ad' """ } -process scanpy_plot_scrublet { +process scanpy_multiplet_scrublet { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: path anndata + val batch_variable output: - path 'scanpy_plot_scrublet.test' + path 'anndata.h5ad' script: """ - echo $anndata > scanpy_plot_scrublet.test + echo $batch_variable > scanpy_multiplet_scrublet.test + cp $anndata anndata.h5ad """ } -process scanpy_read_10x { +process scanpy_plot_scrublet { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: - path matrix - path genes - path barcodes - path cellmeta - path genemeta + path anndata output: - path 'anndata.h5ad' + path 'scanpy_plot_scrublet.test' script: """ - #ln -s $matrix matrix.mtx - ln -s $genes genes.tsv - #ln -s $barcodes barcodes.tsv - - scanpy-read-10x --input-10x-mtx ./ \ - --var-names 'gene_ids' \ - --extra-obs $cellmeta \ - --extra-var $genemeta \ - --show-obj stdout \ - --output-format anndata \ - 'anndata.h5ad' + echo $anndata > scanpy_plot_scrublet.test """ } @@ -691,20 +691,15 @@ workflow { SCRUBLET_ch ) scanpy_filter_cells( - SCRUBLET_ch, - Column_rearrange_1.out[0] + SCRUBLET_ch ) } else { scanpy_filter_cells( - scanpy_read_10x.out, - Column_rearrange_1.out[0] + scanpy_read_10x.out ) } - - scanpy_filter_cells( - scanpy_read_10x.out, - ) + scanpy_filter_genes( scanpy_filter_cells.out, Column_rearrange_1.out[0] From 014601d295214c9b78663477518844115c10c7a2 Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Thu, 7 Nov 2024 13:46:46 +0000 Subject: [PATCH 089/159] Rename scrublet process output --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 85688c97..325d5f9e 100644 --- a/main.nf +++ b/main.nf @@ -146,12 +146,12 @@ process scanpy_multiplet_scrublet { val batch_variable output: - 
path 'anndata.h5ad' + path 'scrublet.h5ad' script: """ echo $batch_variable > scanpy_multiplet_scrublet.test - cp $anndata anndata.h5ad + cp $anndata scrublet.h5ad """ } @@ -699,7 +699,7 @@ workflow { scanpy_read_10x.out ) } - + scanpy_filter_genes( scanpy_filter_cells.out, Column_rearrange_1.out[0] From 02501d9fe3f8263bba8e91daeb1818630893f349 Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Fri, 8 Nov 2024 14:49:43 +0000 Subject: [PATCH 090/159] Add real scrublet commands --- main.nf | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 325d5f9e..b57bf0bb 100644 --- a/main.nf +++ b/main.nf @@ -150,8 +150,12 @@ process scanpy_multiplet_scrublet { script: """ - echo $batch_variable > scanpy_multiplet_scrublet.test - cp $anndata scrublet.h5ad + scanpy-cli multiplet scrublet \ + --input-format 'anndata' \ + --output-format 'anndata_h5ad' \ + --batch-key "${params.batch_variable}" \ + $anndata \ + scrublet.h5ad """ } @@ -162,11 +166,16 @@ process scanpy_plot_scrublet { path anndata output: - path 'scanpy_plot_scrublet.test' + path 'scrublet.png' script: """ - echo $anndata > scanpy_plot_scrublet.test + scanpy-cli plot scrublet \ + --input-format "anndata" \ + --scale-hist-obs "linear" \ + --scale-hist-sim "linear" \ + $anndata \ + scrublet.png """ } From 8b74c493e6fa16ec0ca6dfc5abb809a2f6054779 Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Fri, 8 Nov 2024 14:58:46 +0000 Subject: [PATCH 091/159] Initialise batch variable, correct scrublet output format --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index b57bf0bb..d7096e8a 100644 --- a/main.nf +++ b/main.nf @@ -3,6 +3,7 @@ nextflow.enable.dsl=2 params.technology = "plate" +params.batch_variable = "" params.dir_path = "." 
params.result_dir_path = params.dir_path + "/results" params.celltype_field = 'NO_CELLTYPE_FIELD' @@ -152,7 +153,7 @@ process scanpy_multiplet_scrublet { """ scanpy-cli multiplet scrublet \ --input-format 'anndata' \ - --output-format 'anndata_h5ad' \ + --output-format 'anndata' \ --batch-key "${params.batch_variable}" \ $anndata \ scrublet.h5ad From ddb3916ac2488aba30fdf67bac4164530811384d Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Tue, 12 Nov 2024 11:33:56 +0000 Subject: [PATCH 092/159] Allow scrublet to execute without a batch variable --- main.nf | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index d7096e8a..9528b3e3 100644 --- a/main.nf +++ b/main.nf @@ -151,12 +151,20 @@ process scanpy_multiplet_scrublet { script: """ - scanpy-cli multiplet scrublet \ - --input-format 'anndata' \ - --output-format 'anndata' \ - --batch-key "${params.batch_variable}" \ - $anndata \ - scrublet.h5ad + if [ "${params.batch_variable}" -eq "" ]; do + scanpy-cli multiplet scrublet \ + --input-format 'anndata' \ + --output-format 'anndata' \ + $anndata \ + scrublet.h5ad + else + scanpy-cli multiplet scrublet \ + --input-format 'anndata' \ + --output-format 'anndata' \ + --batch-key "${params.batch_variable}" \ + $anndata \ + scrublet.h5ad + fi """ } From 3ec2fe6f6d53685cd245ce98eb33fbb45e9c3106 Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Tue, 12 Nov 2024 11:57:57 +0000 Subject: [PATCH 093/159] bash correction --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 9528b3e3..e2903e4b 100644 --- a/main.nf +++ b/main.nf @@ -151,7 +151,7 @@ process scanpy_multiplet_scrublet { script: """ - if [ "${params.batch_variable}" -eq "" ]; do + if [ "${params.batch_variable}" -eq "" ]; then scanpy-cli multiplet scrublet \ --input-format 'anndata' \ --output-format 'anndata' \ From 55feac593cdc5c052864703aacd5898f61373937 Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Tue, 12 
Nov 2024 12:09:09 +0000 Subject: [PATCH 094/159] correct if expression --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index e2903e4b..2f7f35e6 100644 --- a/main.nf +++ b/main.nf @@ -151,7 +151,7 @@ process scanpy_multiplet_scrublet { script: """ - if [ "${params.batch_variable}" -eq "" ]; then + if [ -z "${params.batch_variable}" ]; then scanpy-cli multiplet scrublet \ --input-format 'anndata' \ --output-format 'anndata' \ From 12f2024a38316c2f713c37483edab0d0b7c0d69f Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Wed, 13 Nov 2024 10:41:29 +0000 Subject: [PATCH 095/159] Filter predicted doublets if applicable --- main.nf | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 2f7f35e6..62748979 100644 --- a/main.nf +++ b/main.nf @@ -193,6 +193,7 @@ process scanpy_filter_cells { input: path anndata + val category output: path 'filtered_cell_anndata.h5ad' @@ -205,7 +206,8 @@ process scanpy_filter_cells { --input-format 'anndata' $anndata \ --show-obj stdout \ --output-format anndata 'filtered_cell_anndata.h5ad' \ - --export-mtx ./ + --export-mtx ./ \ + $category """ } @@ -709,12 +711,14 @@ workflow { SCRUBLET_ch ) scanpy_filter_cells( - SCRUBLET_ch + SCRUBLET_ch, + "--category predicted_doublet False" ) } else { scanpy_filter_cells( - scanpy_read_10x.out + scanpy_read_10x.out, + "" ) } From 130486f2b69ff37100bead332ae42b544617db93 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 15:28:47 +0000 Subject: [PATCH 096/159] Update main.nf - adds `restore_unscaled` process - not tested --- main.nf | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 62748979..3c9513eb 100644 --- a/main.nf +++ b/main.nf @@ -491,6 +491,21 @@ process build_list { """ } +process restore_unscaled { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: + tuple path(anndata), 
path(normalise_internal_data) + output: + path "restore_unscaled_output_${merged_group_slotname}.h5" + script: + """ + ln -s $anndata input.h5 + ln -s $normalise_internal_data r_source.h5 + python ${projectDir}/scripts/restore_unscaled.py + mv output.h5 'restore_unscaled_output_${merged_group_slotname}.h5' + """ +} + process find_markers { errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' @@ -770,15 +785,24 @@ workflow { // Combine the outputs of find_clusters and neighbors processes combined_outputs = find_clusters.out.mix(neighbors.out) - processed_files = combined_outputs.map { file -> - // Extract the sample number from the file name - def sampleNumber = file.baseName.replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field) - [file, sampleNumber] // Create a tuple with sample number and file + if ( params.technology == "droplet" ) { + restore_unscaled + combined_outputs.combine(normalise_internal_data.out) + ) + restore_unscaled_files = restore_unscaled.out.map { file -> + // Extract the sample number from the file name + def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field) + [file, sampleNumber] // Create a tuple with sample number and file + } + find_markers( + restore_unscaled_files + ) + } + else { + find_markers( + processed_files + ) } - - find_markers( - processed_files - ) make_project_file( neighbors.out, scanpy_read_10x.out, From 2e19d39b59ca9b2af3b6685b78134d8e4ef0b18d Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 15:32:07 +0000 Subject: [PATCH 097/159] Create resource_unscalled.py --- scripts/resource_unscalled.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 scripts/resource_unscalled.py diff --git a/scripts/resource_unscalled.py b/scripts/resource_unscalled.py new file mode 100644 index 00000000..9e5eef92 --- 
/dev/null +++ b/scripts/resource_unscalled.py @@ -0,0 +1,33 @@ +import scanpy as sc +import anndata +from numpy import all +import logging + +adata = sc.read('input.h5') + +gene_name = 'index' +qc_vars = list() +gene_names = getattr(adata.var, gene_name) + +ad_s = sc.read('r_source.h5') +if not all(adata.obs.index.isin(ad_s.obs.index)): + logging.error("Specified object for .raw must contain all .obs from main object.") + sys.exit(1) +else: + adata.raw = ad_s[adata.obs.index] +del ad_s + +if len(qc_vars) > 0: + pct_top = [50] + sc.pp.calculate_qc_metrics(adata, qc_vars=qc_vars, percent_top=pct_top, inplace=True) + +if 'n_genes' not in adata.obs.columns: + sc.pp.filter_cells(adata, min_genes=0) +if 'n_counts' not in adata.obs.columns: + sc.pp.filter_cells(adata, min_counts=0) +if 'n_cells' not in adata.var.columns: + sc.pp.filter_genes(adata, min_cells=0) +if 'n_counts' not in adata.var.columns: + sc.pp.filter_genes(adata, min_counts=0) + +adata.write('output.h5', compression='gzip') From b5e37d1a22838ed792926530e1001f537b616fd2 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 15:44:24 +0000 Subject: [PATCH 098/159] Update main.nf - adds missing `(` --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 3c9513eb..f902e722 100644 --- a/main.nf +++ b/main.nf @@ -786,7 +786,7 @@ workflow { combined_outputs = find_clusters.out.mix(neighbors.out) if ( params.technology == "droplet" ) { - restore_unscaled + restore_unscaled ( combined_outputs.combine(normalise_internal_data.out) ) restore_unscaled_files = restore_unscaled.out.map { file -> From 6bbf4cb320b25d218b04105eb245d93a1a326fd1 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 15:49:30 +0000 Subject: [PATCH 099/159] Update main.nf - adding `processed_file` back --- main.nf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.nf b/main.nf index f902e722..dac1fa8b 100644 --- a/main.nf +++ b/main.nf @@ -799,6 +799,11 @@ workflow { ) 
} else { + processed_files = combined_outputs.map { file -> + // Extract the sample number from the file name + def sampleNumber = file.baseName.replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field) + [file, sampleNumber] // Create a tuple with sample number and file + find_markers( processed_files ) From 22d37283c06575db2a9406715cb4171b20e7b1ba Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 15:53:01 +0000 Subject: [PATCH 100/159] Update main.nf - fixes `restore_unscaled` --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index dac1fa8b..ff6ec312 100644 --- a/main.nf +++ b/main.nf @@ -496,13 +496,13 @@ process restore_unscaled { input: tuple path(anndata), path(normalise_internal_data) output: - path "restore_unscaled_output_${merged_group_slotname}.h5" + path "restore_unscaled_output_${anndata}.h5" script: """ ln -s $anndata input.h5 ln -s $normalise_internal_data r_source.h5 python ${projectDir}/scripts/restore_unscaled.py - mv output.h5 'restore_unscaled_output_${merged_group_slotname}.h5' + mv output.h5 'restore_unscaled_output_${anndata}.h5' """ } From 77b50c2699786c938c4de95801bb3485390c82e6 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 15:58:03 +0000 Subject: [PATCH 101/159] Update main.nf - addsmissin `}` --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index ff6ec312..d7598a1d 100644 --- a/main.nf +++ b/main.nf @@ -803,7 +803,7 @@ workflow { // Extract the sample number from the file name def sampleNumber = file.baseName.replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field) [file, sampleNumber] // Create a tuple with sample number and file - + } find_markers( processed_files ) From 9fd836123c8f131ae9fb17f9e36fca1ba18205aa Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 16:05:09 +0000 Subject: [PATCH 102/159] Rename 
resource_unscalled.py to restore_unscaled.py --- scripts/{resource_unscalled.py => restore_unscaled.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{resource_unscalled.py => restore_unscaled.py} (100%) diff --git a/scripts/resource_unscalled.py b/scripts/restore_unscaled.py similarity index 100% rename from scripts/resource_unscalled.py rename to scripts/restore_unscaled.py From 75a57b0830749c9f0347c983658964303b21ca6d Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 16:39:00 +0000 Subject: [PATCH 103/159] Update main.nf - fixed mapping for `find_marker` after `restore_unscaled` --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index d7598a1d..bb9f8b65 100644 --- a/main.nf +++ b/main.nf @@ -791,7 +791,7 @@ workflow { ) restore_unscaled_files = restore_unscaled.out.map { file -> // Extract the sample number from the file name - def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field) + def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', '').replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field).replaceFirst('neighbors',params.celltype_field).replaceFirst('.h5ad','') [file, sampleNumber] // Create a tuple with sample number and file } find_markers( From 2b539b7ad76ab51ebfe6db3bed7f355af4c9f751 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 16:57:48 +0000 Subject: [PATCH 104/159] Update README.md --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 06bcde9e..81348687 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,16 @@ # scxa-tertiary-workflow Tertiary component for SCXA workflows + +# How to run workflow for tertiary analysis +## Prepare data +``` +bash scripts/data_prep.sh +``` +## Run for plate +``` +nextflow run main.nf --slurm -resume --dir_path +``` +## 
Run for droplet +``` +nextflow run main.nf --slurm -resume --dir_path --technology droplet +``` From db56ad6e7a5b9056b60661eb2fce0fea6749864b Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 20:48:19 +0000 Subject: [PATCH 105/159] Update main.nf - adds publishDir options.. --- main.nf | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index bb9f8b65..34358882 100644 --- a/main.nf +++ b/main.nf @@ -212,6 +212,8 @@ process scanpy_filter_cells { } process scanpy_filter_genes { + publishDir params.result_dir_path, mode: 'copy', pattern: '(matrix\\.mtx|barcodes\\.tsv|genes\\.tsv)' + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: @@ -236,6 +238,7 @@ process scanpy_filter_genes { } process normalise_data { + publishDir params.result_dir_path, mode: 'copy', pattern: '(matrix\\.mtx|barcodes\\.tsv|genes\\.tsv)' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: @@ -418,6 +421,7 @@ process neighbors_for_umap { } process find_clusters { + publishDir params.result_dir_path, mode: 'copy', pattern: '(clusters\\.tsv)' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: @@ -438,6 +442,7 @@ process find_clusters { --show-obj stdout \ --output-format anndata \ 'clusters_${resolution}.h5ad' + && mv 'output.tsv' 'clusters_${resolution}.tsv' """ } @@ -507,6 +512,7 @@ process restore_unscaled { } process find_markers { + publishDir params.result_dir_path, mode: 'copy', pattern: '(markers_\\.tsv)' errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: @@ -516,7 +522,7 @@ process find_markers { script: """ scanpy-find-markers \ - --save diffexp.tsv \ + --save 'markers_${merged_group_slotname}.tsv' \ --n-genes '100' \ --groupby '${merged_group_slotname}' \ --key-added 'markers_${merged_group_slotname}' \ @@ -543,6 +549,7 @@ process filtered_cellgroup_markers { } process run_umap { + publishDir 
params.result_dir_path, mode: 'copy', pattern: '(embeddings\\.tsv)' //errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' @@ -574,12 +581,13 @@ process run_umap { --output-format anndata \ "umap_\${n_number}.h5ad" # Not sure if following is needed - # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv + # && mv 'embeddings_neighbors_n_neighbors_${n_number}.tsv' embeddings.tsv """ } process run_tsne { + publishDir params.result_dir_path, mode: 'copy', pattern: '(embeddings_perplexity\\.tsv)' //errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' @@ -606,7 +614,7 @@ process run_tsne { --output-format anndata \ 'tsne_${perplexity_values}.h5ad' # Not sure if following is needed - # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv + && mv 'embeddings_perplexity_${perplexity_values}.tsv' embeddings.tsv """ } From 96881dd1788bd2301bb0ac43b9969e8459cb738c Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 20:53:25 +0000 Subject: [PATCH 106/159] Update main.nf - fixes err --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 34358882..55656ad2 100644 --- a/main.nf +++ b/main.nf @@ -581,7 +581,7 @@ process run_umap { --output-format anndata \ "umap_\${n_number}.h5ad" # Not sure if following is needed - # && mv 'embeddings_neighbors_n_neighbors_${n_number}.tsv' embeddings.tsv + # && mv "embeddings_neighbors_n_neighbors_\${n_number}.tsv" embeddings.tsv """ } From 281875b5b49f974d15ec6ad076de8e8b24ee4135 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 21:02:37 +0000 Subject: [PATCH 107/159] Update main.nf - fixes err --- main.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 55656ad2..835e6537 100644 --- a/main.nf +++ b/main.nf @@ -442,7 +442,8 @@ process find_clusters { --show-obj stdout \ --output-format anndata \ 'clusters_${resolution}.h5ad' - && mv 
'output.tsv' 'clusters_${resolution}.tsv' + + mv 'output.tsv' 'clusters_${resolution}.tsv' """ } @@ -614,7 +615,7 @@ process run_tsne { --output-format anndata \ 'tsne_${perplexity_values}.h5ad' # Not sure if following is needed - && mv 'embeddings_perplexity_${perplexity_values}.tsv' embeddings.tsv + # && mv 'embeddings_perplexity_${perplexity_values}.tsv' embeddings.tsv """ } From 6a2b7366cffc360af316f131b3d237fa8b5fe869 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 13 Nov 2024 21:05:53 +0000 Subject: [PATCH 108/159] Update main.nf - adds publishdir for scrublet plot --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 835e6537..e2b446cf 100644 --- a/main.nf +++ b/main.nf @@ -169,6 +169,7 @@ process scanpy_multiplet_scrublet { } process scanpy_plot_scrublet { + publishDir params.result_dir_path, mode: 'copy', pattern: '(scrublet.png)' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: From 6b19a64ba9a4b84c7dfef8726bc43b62dcd99f55 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 14:31:25 +0000 Subject: [PATCH 109/159] Update main.nf - redirects output to `publishDir` --- main.nf | 58 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/main.nf b/main.nf index e2b446cf..fe25fd42 100644 --- a/main.nf +++ b/main.nf @@ -213,8 +213,9 @@ process scanpy_filter_cells { } process scanpy_filter_genes { - publishDir params.result_dir_path, mode: 'copy', pattern: '(matrix\\.mtx|barcodes\\.tsv|genes\\.tsv)' - + publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx' + publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv' + publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: @@ -223,6 +224,9 @@ process scanpy_filter_genes { 
output: path 'filtered_gene_anndata.h5ad' + path 'matrix.mtx' + path 'barcodes.tsv' + path 'genes.tsv' script: """ @@ -239,14 +243,19 @@ process scanpy_filter_genes { } process normalise_data { - publishDir params.result_dir_path, mode: 'copy', pattern: '(matrix\\.mtx|barcodes\\.tsv|genes\\.tsv)' + publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'matrix.mtx' + publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'barcodes.tsv' + publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'genes.tsv' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: path anndata output: - path 'normalised_anndata.h5ad' + path 'normalised_anndata.h5ad' + path 'matrix.mtx' + path 'barcodes.tsv' + path 'genes.tsv' script: """ @@ -422,13 +431,15 @@ process neighbors_for_umap { } process find_clusters { - publishDir params.result_dir_path, mode: 'copy', pattern: '(clusters\\.tsv)' + publishDir "${params.result_dir_path}/find_clusters", mode: 'copy', pattern: 'clusters_*.tsv' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: tuple path(anndata), val(resolution) output: path "clusters_${resolution}.h5ad" + path "clusters_${resolution}.tsv" + script: """ scanpy-find-cluster louvain \ @@ -500,10 +511,13 @@ process build_list { process restore_unscaled { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + input: - tuple path(anndata), path(normalise_internal_data) + tuple path(anndata), path(normalise_internal_data + output: path "restore_unscaled_output_${anndata}.h5" + script: """ ln -s $anndata input.h5 @@ -514,13 +528,17 @@ process restore_unscaled { } process find_markers { - publishDir params.result_dir_path, mode: 'copy', pattern: '(markers_\\.tsv)' + publishDir "${params.result_dir_path}/find_markers", mode: 'copy', pattern: 'markers_*.tsv' errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + 
input: tuple path(anndata), val(merged_group_slotname) + output: path "markers_${merged_group_slotname}.h5ad" + path "markers_${merged_group_slotname}.tsv" + script: """ scanpy-find-markers \ @@ -551,15 +569,18 @@ process filtered_cellgroup_markers { } process run_umap { - publishDir params.result_dir_path, mode: 'copy', pattern: '(embeddings\\.tsv)' + publishDir "${params.result_dir_path}/run_umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv' //errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: path anndata + output: path "umap_*.h5ad" + path "embeddings_neighbors_neighbors_*.tsv" + script: """ VAR="$anndata" @@ -589,7 +610,7 @@ process run_umap { } process run_tsne { - publishDir params.result_dir_path, mode: 'copy', pattern: '(embeddings_perplexity\\.tsv)' + publishDir "${params.result_dir_path}/run_tsne", mode: 'copy', pattern: 'embeddings_perplexity_*\\.tsv' //errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' @@ -597,8 +618,11 @@ process run_tsne { input: tuple path(anndata), val(perplexity_values) val pca_param + output: path "tsne_${perplexity_values}.h5ad" + path "embeddings_perplexity_${perplexity_values}.tsv" + script: """ scanpy-run-tsne \ @@ -752,10 +776,10 @@ workflow { Column_rearrange_1.out[0] ) normalise_data( - scanpy_filter_genes.out + scanpy_filter_genes.out[0] ) normalise_internal_data( - scanpy_filter_genes.out + scanpy_filter_genes.out[0] ) find_variable_genes( normalise_internal_data.out, @@ -779,13 +803,13 @@ workflow { TNSEs_ch = run_tsne( harmony_batch.out.combine(perplexity_ch), pca_param - ) + )[0] //TNSEs_ch // .filter { it.exitStatus == 0 } UMAPs_ch = run_umap( neighbors_for_umap.out.flatten() - ) + )[0] //UMAPs_ch // .filter { it.exitStatus == 0 } find_clusters( @@ -793,7 +817,7 @@ workflow { ) // Combine the outputs of find_clusters and neighbors processes - combined_outputs = find_clusters.out.mix(neighbors.out) + 
combined_outputs = find_clusters.out[0].mix(neighbors.out) if ( params.technology == "droplet" ) { restore_unscaled ( @@ -821,9 +845,9 @@ workflow { make_project_file( neighbors.out, scanpy_read_10x.out, - scanpy_filter_genes.out, - normalise_data.out, - find_markers.out.collect(), + scanpy_filter_genes.out[0], + normalise_data.out[0], + find_markers.out[0].collect(), TNSEs_ch.mix(UMAPs_ch).collect() ) } From 040250cd31af812d5229edb24e497ede4daf8aee Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 14:35:18 +0000 Subject: [PATCH 110/159] Update main.nf --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index fe25fd42..40c9f337 100644 --- a/main.nf +++ b/main.nf @@ -19,6 +19,7 @@ log.info """ WORKFLOW PARAMETER VALUES =============================== EXP dir path: ${params.dir_path} +Selected technology: ${params.technology} Results results_dir_path: ${params.result_dir_path} celltype_field: ${params.celltype_field} neighbor_values: ${params.neighbor_values} From 82e330308d06c24ec7d5007efda549ea941ac80e Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 14:50:24 +0000 Subject: [PATCH 111/159] Update main.nf - removed filter_failed_umap/tsne as errorStrategy 'ignore' works --- main.nf | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/main.nf b/main.nf index 40c9f337..2b82e14d 100644 --- a/main.nf +++ b/main.nf @@ -571,7 +571,8 @@ process filtered_cellgroup_markers { process run_umap { publishDir "${params.result_dir_path}/run_umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv' - //errorStrategy 'ignore' + + errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' @@ -612,7 +613,8 @@ process run_umap { process run_tsne { publishDir "${params.result_dir_path}/run_tsne", mode: 'copy', pattern: 'embeddings_perplexity_*\\.tsv' - //errorStrategy 'ignore' + + errorStrategy 'ignore' container 
'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' @@ -645,26 +647,6 @@ process run_tsne { """ } -process filter_failed_umap { - input: - - output: - - script: - """ - """ -} - -process filer_failed_tsne { - input: - - output: - - script: - """ - """ -} - process merge_embeddings { input: From 73314b99592d22b59fed1e99e97141fb2656b58f Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 14:52:00 +0000 Subject: [PATCH 112/159] Update main.nf - `merge_embeddings` removed as `TNSEs_ch.mix(UMAPs_ch).collect()` does the same thing --- main.nf | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/main.nf b/main.nf index 2b82e14d..e2962da3 100644 --- a/main.nf +++ b/main.nf @@ -647,18 +647,6 @@ process run_tsne { """ } -process merge_embeddings { - input: - - output: - - script: - """ - """ -} - - - process make_project_file { publishDir params.result_dir_path, mode: 'copy' From 7f9162f40cf12546d602065e9c5fe016ddd78eea Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 14:54:10 +0000 Subject: [PATCH 113/159] Update main.nf - remove `filtered_cellgroup_markers` as `errorStrategy 'ignore'` does the same thing --- main.nf | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/main.nf b/main.nf index e2962da3..18a391d9 100644 --- a/main.nf +++ b/main.nf @@ -559,16 +559,6 @@ process find_markers { """ } -process filtered_cellgroup_markers { - input: - - output: - - script: - """ - """ -} - process run_umap { publishDir "${params.result_dir_path}/run_umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv' From 81b6e238be399ed9ab20687cc1379b1b1318396b Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 14:57:53 +0000 Subject: [PATCH 114/159] Update main.nf - `merge_collections` and `build_list` removed as it is done by Nextflow operators --- main.nf | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/main.nf b/main.nf index 18a391d9..2c8b9e79 100644 --- a/main.nf +++ b/main.nf @@ -490,26 +490,6
@@ process merge_group_slotnames { """ } -process merge_collections { - input: - - output: - - script: - """ - """ -} - -process build_list { - input: - - output: - - script: - """ - """ -} - process restore_unscaled { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' From 7c4c139e9eea1feb7ae539ec16579101b79f0d98 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 14:59:23 +0000 Subject: [PATCH 115/159] Update main.nf - removed `clustering_slotnames` and `merge_group_slotnames`, now done by Nextflow operators --- main.nf | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/main.nf b/main.nf index 2c8b9e79..e483a123 100644 --- a/main.nf +++ b/main.nf @@ -470,26 +470,6 @@ process meta_vars { """ } -process clustering_slotnames { - input: - - output: - - script: - """ - """ -} - -process merge_group_slotnames { - input: - - output: - - script: - """ - """ -} - process restore_unscaled { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' From c540cdf54d1435c37e36d4e9c470ed518bf7cecf Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 15:08:14 +0000 Subject: [PATCH 116/159] Update main.nf - removed `meta_vars`, as it was Galaxy-specific functionality --- main.nf | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/main.nf b/main.nf index e483a123..d8ff92b8 100644 --- a/main.nf +++ b/main.nf @@ -460,16 +460,6 @@ process find_clusters { """ } -process meta_vars { - input: - - output: - - script: - """ - """ -} - process restore_unscaled { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' From bce8436f83476192ca15277cb609ca44ff433ec8 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 15:20:15 +0000 Subject: [PATCH 117/159] Update main.nf - removes hard-coded name --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index d8ff92b8..49a1a0f1 100644 --- a/main.nf +++ b/main.nf @@ -736,7 +736,7 @@ workflow { )
restore_unscaled_files = restore_unscaled.out.map { file -> // Extract the sample number from the file name - def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', '').replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field).replaceFirst('neighbors',params.celltype_field).replaceFirst('.h5ad','') + def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', '').replaceFirst('clusters', params.slotname).replaceFirst('neighbors',params.celltype_field).replaceFirst('.h5ad','') [file, sampleNumber] // Create a tuple with sample number and file } find_markers( @@ -746,7 +746,7 @@ workflow { else { processed_files = combined_outputs.map { file -> // Extract the sample number from the file name - def sampleNumber = file.baseName.replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field) + def sampleNumber = file.baseName.replaceFirst('clusters', , params.slotname).replaceFirst('neighbors',params.celltype_field) [file, sampleNumber] // Create a tuple with sample number and file } find_markers( From be5fcece40b67aa4ab296d36933d152638fb6d71 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 15:40:41 +0000 Subject: [PATCH 118/159] Update main.nf - adds missing `)` --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 49a1a0f1..8b390df6 100644 --- a/main.nf +++ b/main.nf @@ -464,7 +464,7 @@ process restore_unscaled { container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: - tuple path(anndata), path(normalise_internal_data + tuple path(anndata), path(normalise_internal_data) output: path "restore_unscaled_output_${anndata}.h5" From bbcb763f787aa708e618ac5863f2e6907249269c Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 16:15:57 +0000 Subject: [PATCH 119/159] Update main.nf - adds `output_dir` param --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/main.nf b/main.nf index 8b390df6..17e0d507 100644 --- a/main.nf +++ b/main.nf @@ -5,7 +5,7 @@ nextflow.enable.dsl=2 params.technology = "plate" params.batch_variable = "" params.dir_path = "." -params.result_dir_path = params.dir_path + "/results" +params.result_dir_path = params.output_path ?: params.dir_path + "/results" params.celltype_field = 'NO_CELLTYPE_FIELD' params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50'] params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50'] From e81f6bc905d0fac8c3a950eb86e42563898561b1 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Fri, 15 Nov 2024 16:17:42 +0000 Subject: [PATCH 120/159] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 81348687..f9495845 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,11 @@ bash scripts/data_prep.sh ``` ## Run for plate ``` -nextflow run main.nf --slurm -resume --dir_path +nextflow run main.nf --slurm -resume --dir_path [--output_path ] ``` ## Run for droplet ``` -nextflow run main.nf --slurm -resume --dir_path --technology droplet +nextflow run main.nf --slurm -resume --dir_path --technology droplet [--output_path ] ``` + +If `[--output_path ]` is not specified results will be `/results` dir. 
From f32e5252dea9263446cadfce201a5f8f6fcbec56 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 19 Nov 2024 14:59:51 +0000 Subject: [PATCH 121/159] Update data_prep.sh - adds optional output dir --- scripts/data_prep.sh | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/scripts/data_prep.sh b/scripts/data_prep.sh index 6a38866c..eb9c2433 100644 --- a/scripts/data_prep.sh +++ b/scripts/data_prep.sh @@ -1,19 +1,39 @@ +#!/usr/bin/env bash + +# This is EMBL-EBI specific script to fetch data from workflow root and put it in a place for downstream workflow to use + if [ -z "$SCXA_WORKFLOW_ROOT" ]; then echo "Variable SCXA_WORKFLOW_ROOT is not defined or empty. Please load SC env." echo "Exiting..." exit 1; fi +if [ -z "$1" ]; then + echo "Experiment ID is not provided. Please provide EXP ID" + echo "bash data_prep.sh [output path]" + echo "Exiting..." + exit 1; +fi + EXP_ID=$1 -echo "Creating $(pwd)/${EXP_ID} directory" -mkdir -p $(pwd)/${EXP_ID} -cd $(pwd)/${EXP_ID} +outdir="$(pwd)" + +if [ "$2" ]; then + outdir=$2 +fi + + +echo "Creating ${outdir}/${EXP_ID} directory" +mkdir -p ${outdir}/${EXP_ID} +cd ${outdir}/${EXP_ID} + +echo "Copying data to ${outdir}/${EXP_ID}" -echo "Copying data to ${pwd}/${EXP_ID}" cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/${EXP_ID}.cell_metadata.tsv cell_metadata.tsv cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/genes.tsv.gz . && gunzip -f genes.tsv.gz cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/matrix.mtx.gz . && gunzip -f matrix.mtx.gz cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/barcodes.tsv.gz . 
&& gunzip -f barcodes.tsv.gz cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/reference/gene_annotation.txt genes_metadata.tsv + echo "Copying data for ${EXP_ID} finished" From 6c088d2ade2500f4be747fbf74f882806eac4d3c Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 19 Nov 2024 15:02:21 +0000 Subject: [PATCH 122/159] Update README.md - updates read me --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f9495845..76d6284c 100644 --- a/README.md +++ b/README.md @@ -4,15 +4,15 @@ Tertiary component for SCXA workflows # How to run workflow for tertiary analysis ## Prepare data ``` -bash scripts/data_prep.sh +bash scripts/data_prep.sh [output path] ``` ## Run for plate ``` -nextflow run main.nf --slurm -resume --dir_path [--output_path ] +nextflow run main.nf --slurm -resume --dir_path [--output_path ] ``` ## Run for droplet ``` -nextflow run main.nf --slurm -resume --dir_path --technology droplet [--output_path ] +nextflow run main.nf --slurm -resume --dir_path --technology droplet [--output_path ] ``` -If `[--output_path ]` is not specified results will be `/results` dir. +If `[--output_path ]` is not specified results will be `/results` dir. 
From ea27bbab1b72734addbe314c31bae7403027dcf1 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 19 Nov 2024 16:03:29 +0000 Subject: [PATCH 123/159] Update main.nf - removes redundant comma --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 17e0d507..ca178879 100644 --- a/main.nf +++ b/main.nf @@ -746,7 +746,7 @@ workflow { else { processed_files = combined_outputs.map { file -> // Extract the sample number from the file name - def sampleNumber = file.baseName.replaceFirst('clusters', , params.slotname).replaceFirst('neighbors',params.celltype_field) + def sampleNumber = file.baseName.replaceFirst('clusters', params.slotname).replaceFirst('neighbors',params.celltype_field) [file, sampleNumber] // Create a tuple with sample number and file } find_markers( From ffd644328aaebf8f6807620501806d3d2c759edc Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 19 Nov 2024 16:50:43 +0000 Subject: [PATCH 124/159] Update nextflow.config - adds config for reporting --- nextflow.config | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/nextflow.config b/nextflow.config index fb2e6015..62618776 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,6 +24,24 @@ conda { useMamba = true } +timeline { + enabled = true + file = "${params.results_dir_path}/timeline.html" + overwrite = true +} + +trace { + enabled = true + file = "${params.results_dir_path}/trace.txt" + overwrite = true +} + +report { + enabled = true + file = "${params.results_dir_path}/report.html" + overwrite = true +} + // params { // // } From 23f0b6c954414c545ef8452e361ee50e28b1e1c6 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 20 Nov 2024 08:57:12 +0000 Subject: [PATCH 125/159] Update nextflow.config - generates report in `result_dir_path` without specified in params --- nextflow.config | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/nextflow.config b/nextflow.config index 
62618776..e0bd55d1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,24 +24,26 @@ conda { useMamba = true } -timeline { +params { + dir_path = "." + output_path = null + result_dir_path = "${params.output_path ?: params.dir_path + '/results'}" +} + +trace { enabled = true - file = "${params.results_dir_path}/timeline.html" + file = "${params.result_dir_path}/trace.txt" overwrite = true } -trace { +timeline { enabled = true - file = "${params.results_dir_path}/trace.txt" + file = "${params.result_dir_path}/timeline.html" overwrite = true } report { enabled = true - file = "${params.results_dir_path}/report.html" + file = "${params.result_dir_path}/report.html" overwrite = true } - -// params { -// -// } From 7ed1533262068d1b3012eaa164956fe346cad23e Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 20 Nov 2024 09:14:58 +0000 Subject: [PATCH 126/159] Update main.nf - making `batch_variable` and `pca_param` as params. --- main.nf | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index ca178879..745fd5a0 100644 --- a/main.nf +++ b/main.nf @@ -4,6 +4,7 @@ nextflow.enable.dsl=2 params.technology = "plate" params.batch_variable = "" +params.pca_param = "X_pca" params.dir_path = "." 
params.result_dir_path = params.output_path ?: params.dir_path + "/results" params.celltype_field = 'NO_CELLTYPE_FIELD' @@ -634,8 +635,6 @@ workflow { barcodes = Channel.fromPath("${params.dir_path}/barcodes.tsv") matrix = Channel.fromPath("${params.dir_path}/matrix.mtx") cellmeta = Channel.fromPath("${params.dir_path}/cell_metadata.tsv") - pca_param = Channel.value('X_pca') - batch_variable = Channel.value('') neighbors_ch = channel.fromList(params.neighbor_values) perplexity_ch = channel.fromList(params.perplexity_values) resolution_ch = channel.fromList(params.resolution_values) @@ -665,7 +664,7 @@ workflow { if ( params.technology == "droplet" ) { SCRUBLET_ch = scanpy_multiplet_scrublet( scanpy_read_10x.out, - batch_variable + params.batch_variable ) scanpy_plot_scrublet( SCRUBLET_ch @@ -694,26 +693,26 @@ workflow { ) find_variable_genes( normalise_internal_data.out, - batch_variable + params.batch_variable ) run_pca( find_variable_genes.out ) harmony_batch( run_pca.out, - batch_variable + params.batch_variable ) neighbors( harmony_batch.out, - pca_param + params.pca_param ) neighbors_for_umap( harmony_batch.out.combine(neighbors_ch), - pca_param + params.pca_param ) TNSEs_ch = run_tsne( harmony_batch.out.combine(perplexity_ch), - pca_param + params.pca_param )[0] //TNSEs_ch // .filter { it.exitStatus == 0 } From 52f75b3c0cb8426ca71c29fbe5a8c3b72edd4e3b Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 20 Nov 2024 09:21:40 +0000 Subject: [PATCH 127/159] Update main.nf - rename `pca_param` variable as `representation` --- main.nf | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/main.nf b/main.nf index 745fd5a0..4b9c5d89 100644 --- a/main.nf +++ b/main.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl=2 params.technology = "plate" params.batch_variable = "" -params.pca_param = "X_pca" +params.representation = "X_pca" params.dir_path = "." 
params.result_dir_path = params.output_path ?: params.dir_path + "/results" params.celltype_field = 'NO_CELLTYPE_FIELD' @@ -382,7 +382,7 @@ process neighbors { input: path anndata - val pca_param + val representation output: path 'neighbors.h5ad' @@ -393,7 +393,7 @@ process neighbors { --method 'umap' \ --metric 'euclidean' \ --random-state '0' \ - --use-rep $pca_param \ + --use-rep $representation \ --n-pcs '50' \ --input-format 'anndata' \ $anndata \ @@ -410,7 +410,7 @@ process neighbors_for_umap { input: tuple path(anndata), val(n_neighbors) - val pca_param + val representation output: path "neighbors_${n_neighbors}.h5ad" script: @@ -421,7 +421,7 @@ process neighbors_for_umap { --method 'umap' \ --metric 'euclidean' \ --random-state '0' \ - --use-rep $pca_param \ + --use-rep $representation \ --n-pcs '50' \ --input-format 'anndata' \ $anndata \ @@ -561,7 +561,7 @@ process run_tsne { input: tuple path(anndata), val(perplexity_values) - val pca_param + val representation output: path "tsne_${perplexity_values}.h5ad" @@ -570,7 +570,7 @@ process run_tsne { script: """ scanpy-run-tsne \ - --use-rep $pca_param \ + --use-rep $representation \ --export-embedding embeddings.tsv \ --perplexity $perplexity_values \ --key-added 'perplexity_$perplexity_values' \ @@ -704,15 +704,15 @@ workflow { ) neighbors( harmony_batch.out, - params.pca_param + params.representation ) neighbors_for_umap( harmony_batch.out.combine(neighbors_ch), - params.pca_param + params.representation ) TNSEs_ch = run_tsne( harmony_batch.out.combine(perplexity_ch), - params.pca_param + params.representation )[0] //TNSEs_ch // .filter { it.exitStatus == 0 } From deb55066da223f1d31a2759b46b000ebac99cb39 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 20 Nov 2024 10:39:32 +0000 Subject: [PATCH 128/159] Update main.nf - renames output to match existing pipeline --- main.nf | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/main.nf b/main.nf index 
4b9c5d89..d7a54db9 100644 --- a/main.nf +++ b/main.nf @@ -215,9 +215,9 @@ process scanpy_filter_cells { } process scanpy_filter_genes { - publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx' - publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv' - publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv' + publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx' + publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv' + publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: @@ -245,9 +245,9 @@ process scanpy_filter_genes { } process normalise_data { - publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'matrix.mtx' - publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'barcodes.tsv' - publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'genes.tsv' + publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'matrix.mtx' + publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'barcodes.tsv' + publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'genes.tsv' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: @@ -433,7 +433,7 @@ process neighbors_for_umap { } process find_clusters { - publishDir "${params.result_dir_path}/find_clusters", mode: 'copy', pattern: 'clusters_*.tsv' + publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_*.tsv' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' input: @@ -480,7 +480,7 @@ process restore_unscaled { } process find_markers { - publishDir 
"${params.result_dir_path}/find_markers", mode: 'copy', pattern: 'markers_*.tsv' + publishDir "${params.result_dir_path}/markers", mode: 'copy', pattern: 'markers_*.tsv' errorStrategy 'ignore' container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' @@ -511,7 +511,7 @@ process find_markers { } process run_umap { - publishDir "${params.result_dir_path}/run_umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv' + publishDir "${params.result_dir_path}/umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv' errorStrategy 'ignore' @@ -553,7 +553,7 @@ process run_umap { } process run_tsne { - publishDir "${params.result_dir_path}/run_tsne", mode: 'copy', pattern: 'embeddings_perplexity_*\\.tsv' + publishDir "${params.result_dir_path}/tsne", mode: 'copy', pattern: 'embeddings_perplexity_*\\.tsv' errorStrategy 'ignore' @@ -601,7 +601,7 @@ process make_project_file { path find_markers path TNSEs_mix_UMAPs output: - path "output.h5" + path "project.h5ad" script: """ ln -s $neighbors input.h5 @@ -624,6 +624,7 @@ process make_project_file { echo "\${count}" done python ${projectDir}/scripts/final_project.py + mv output.h5 project.h5ad """ } From 831982cc06225f334e428587388da0827598ea01 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 20 Nov 2024 11:04:26 +0000 Subject: [PATCH 129/159] Update main.nf - parameterising container --- main.nf | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/main.nf b/main.nf index d7a54db9..7942fbfa 100644 --- a/main.nf +++ b/main.nf @@ -2,6 +2,7 @@ nextflow.enable.dsl=2 +params.scanpy_container = "quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0" params.technology = "plate" params.batch_variable = "" params.representation = "X_pca" @@ -113,7 +114,7 @@ process mergeGeneFiles { } process scanpy_read_10x { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path matrix @@ 
-142,7 +143,7 @@ process scanpy_read_10x { } process scanpy_multiplet_scrublet { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -172,7 +173,7 @@ process scanpy_multiplet_scrublet { process scanpy_plot_scrublet { publishDir params.result_dir_path, mode: 'copy', pattern: '(scrublet.png)' - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -192,7 +193,7 @@ process scanpy_plot_scrublet { } process scanpy_filter_cells { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -218,7 +219,7 @@ process scanpy_filter_genes { publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx' publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv' publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv' - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -248,7 +249,7 @@ process normalise_data { publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'matrix.mtx' publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'barcodes.tsv' publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'genes.tsv' - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -273,7 +274,7 @@ process normalise_data { } process normalise_internal_data { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -293,7 +294,7 @@ process normalise_internal_data { } process find_variable_genes { - container 
'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -325,7 +326,7 @@ process find_variable_genes { } process run_pca { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -348,7 +349,7 @@ process run_pca { } process harmony_batch { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -378,7 +379,7 @@ process harmony_batch { } process neighbors { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -405,7 +406,7 @@ process neighbors { } process neighbors_for_umap { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: @@ -434,7 +435,7 @@ process neighbors_for_umap { process find_clusters { publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_*.tsv' - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: tuple path(anndata), val(resolution) @@ -462,7 +463,7 @@ process find_clusters { } process restore_unscaled { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: tuple path(anndata), path(normalise_internal_data) @@ -482,7 +483,7 @@ process restore_unscaled { process find_markers { publishDir "${params.result_dir_path}/markers", mode: 'copy', pattern: 'markers_*.tsv' errorStrategy 'ignore' - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: tuple path(anndata), val(merged_group_slotname) @@ -515,7 +516,7 @@ process run_umap { errorStrategy 'ignore' - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path anndata @@ -557,7 +558,7 
@@ process run_tsne { errorStrategy 'ignore' - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: tuple path(anndata), val(perplexity_values) @@ -591,7 +592,7 @@ process run_tsne { process make_project_file { publishDir params.result_dir_path, mode: 'copy' - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_container input: path neighbors From 9c05d8e501bf7d17a75d131c55e630dd7680d1ac Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 20 Nov 2024 11:05:09 +0000 Subject: [PATCH 130/159] Update main.nf - rename `scanpy_container` to `scanpy_scripts_container` --- main.nf | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/main.nf b/main.nf index 7942fbfa..02163de6 100644 --- a/main.nf +++ b/main.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 -params.scanpy_container = "quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0" +params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0" params.technology = "plate" params.batch_variable = "" params.representation = "X_pca" @@ -114,7 +114,7 @@ process mergeGeneFiles { } process scanpy_read_10x { - container params.scanpy_container + container params.scanpy_scripts_container input: path matrix @@ -143,7 +143,7 @@ process scanpy_read_10x { } process scanpy_multiplet_scrublet { - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -173,7 +173,7 @@ process scanpy_multiplet_scrublet { process scanpy_plot_scrublet { publishDir params.result_dir_path, mode: 'copy', pattern: '(scrublet.png)' - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -193,7 +193,7 @@ process scanpy_plot_scrublet { } process scanpy_filter_cells { - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -219,7 +219,7 
@@ process scanpy_filter_genes { publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx' publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv' publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv' - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -249,7 +249,7 @@ process normalise_data { publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'matrix.mtx' publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'barcodes.tsv' publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'genes.tsv' - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -274,7 +274,7 @@ process normalise_data { } process normalise_internal_data { - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -294,7 +294,7 @@ process normalise_internal_data { } process find_variable_genes { - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -326,7 +326,7 @@ process find_variable_genes { } process run_pca { - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -349,7 +349,7 @@ process run_pca { } process harmony_batch { - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -379,7 +379,7 @@ process harmony_batch { } process neighbors { - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -406,7 +406,7 @@ process neighbors { } process neighbors_for_umap { - container params.scanpy_container + container params.scanpy_scripts_container input: @@ -435,7 +435,7 @@ process neighbors_for_umap { process find_clusters { 
publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_*.tsv' - container params.scanpy_container + container params.scanpy_scripts_container input: tuple path(anndata), val(resolution) @@ -463,7 +463,7 @@ process find_clusters { } process restore_unscaled { - container params.scanpy_container + container params.scanpy_scripts_container input: tuple path(anndata), path(normalise_internal_data) @@ -483,7 +483,7 @@ process restore_unscaled { process find_markers { publishDir "${params.result_dir_path}/markers", mode: 'copy', pattern: 'markers_*.tsv' errorStrategy 'ignore' - container params.scanpy_container + container params.scanpy_scripts_container input: tuple path(anndata), val(merged_group_slotname) @@ -516,7 +516,7 @@ process run_umap { errorStrategy 'ignore' - container params.scanpy_container + container params.scanpy_scripts_container input: path anndata @@ -558,7 +558,7 @@ process run_tsne { errorStrategy 'ignore' - container params.scanpy_container + container params.scanpy_scripts_container input: tuple path(anndata), val(perplexity_values) @@ -592,7 +592,7 @@ process run_tsne { process make_project_file { publishDir params.result_dir_path, mode: 'copy' - container params.scanpy_container + container params.scanpy_scripts_container input: path neighbors From 05d3e7506067c596faea062eba7aab5bc1b6815e Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 20 Nov 2024 11:05:50 +0000 Subject: [PATCH 131/159] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76d6284c..0fc91ff7 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,11 @@ bash scripts/data_prep.sh [output path] ``` ## Run for plate ``` -nextflow run main.nf --slurm -resume --dir_path [--output_path ] +nextflow run main.nf --slurm -resume --dir_path [--output_path ] [--scanpy_scripts_container ] ``` ## Run for droplet ``` -nextflow run main.nf --slurm -resume --dir_path --technology droplet 
[--output_path ] +nextflow run main.nf --slurm -resume --dir_path --technology droplet [--output_path ] [--scanpy_scripts_container ] ``` If `[--output_path ]` is not specified results will be `/results` dir. From 290d17466d1b8161eeba567e30be9a015bc9f1b6 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 20 Nov 2024 11:36:00 +0000 Subject: [PATCH 132/159] Update main.nf - rename output dir --- main.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 02163de6..f54bc2b1 100644 --- a/main.nf +++ b/main.nf @@ -216,9 +216,9 @@ process scanpy_filter_cells { } process scanpy_filter_genes { - publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx' - publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv' - publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv' + publishDir "${params.result_dir_path}/matrices/raw_filtered", mode: 'copy', pattern: 'matrix.mtx' + publishDir "${params.result_dir_path}/matrices/raw_filtered", mode: 'copy', pattern: 'barcodes.tsv' + publishDir "${params.result_dir_path}/matrices/raw_filtered", mode: 'copy', pattern: 'genes.tsv' container params.scanpy_scripts_container input: @@ -246,9 +246,9 @@ process scanpy_filter_genes { } process normalise_data { - publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'matrix.mtx' - publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'barcodes.tsv' - publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'genes.tsv' + publishDir "${params.result_dir_path}/matrices/filtered_normalised", mode: 'copy', pattern: 'matrix.mtx' + publishDir "${params.result_dir_path}/matrices/filtered_normalised", mode: 'copy', pattern: 'barcodes.tsv' + publishDir "${params.result_dir_path}/matrices/filtered_normalised", mode: 
'copy', pattern: 'genes.tsv' container params.scanpy_scripts_container input: From 021411d4a2974eeea633715b20e7a262e49df495 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 28 Nov 2024 11:19:29 +0000 Subject: [PATCH 133/159] Update main.nf --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index f54bc2b1..dc5c7568 100644 --- a/main.nf +++ b/main.nf @@ -164,7 +164,7 @@ process scanpy_multiplet_scrublet { scanpy-cli multiplet scrublet \ --input-format 'anndata' \ --output-format 'anndata' \ - --batch-key "${params.batch_variable}" \ + --batch-key "$batch_variable" \ $anndata \ scrublet.h5ad fi From 4a2af1fd39f91b51db53ac174632f5ac708994f8 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 28 Nov 2024 11:20:59 +0000 Subject: [PATCH 134/159] Update main.nf --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index dc5c7568..62e09146 100644 --- a/main.nf +++ b/main.nf @@ -154,7 +154,7 @@ process scanpy_multiplet_scrublet { script: """ - if [ -z "${params.batch_variable}" ]; then + if [ -z "$batch_variable" ]; then scanpy-cli multiplet scrublet \ --input-format 'anndata' \ --output-format 'anndata' \ From 5d568d97d2f79ffa2647a943d9c278ac25c6cf3d Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Fri, 29 Nov 2024 15:59:38 +0000 Subject: [PATCH 135/159] make scripts executable --- scripts/data_prep.sh | 0 scripts/restore_unscaled.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/data_prep.sh mode change 100644 => 100755 scripts/restore_unscaled.py diff --git a/scripts/data_prep.sh b/scripts/data_prep.sh old mode 100644 new mode 100755 diff --git a/scripts/restore_unscaled.py b/scripts/restore_unscaled.py old mode 100644 new mode 100755 From cc29b1fcf54736aa1520722d96e61511f6b97d08 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 3 Dec 2024 15:09:25 +0000 Subject: [PATCH 136/159] Update main.nf - renaming tsne and umap tsvs --- 
main.nf | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 62e09146..309bf6dc 100644 --- a/main.nf +++ b/main.nf @@ -512,7 +512,7 @@ process find_markers { } process run_umap { - publishDir "${params.result_dir_path}/umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv' + publishDir "${params.result_dir_path}/umap", mode: 'copy', pattern: 'umap_n_neighbors_*.tsv' errorStrategy 'ignore' @@ -546,15 +546,14 @@ process run_umap { $anndata \ --show-obj stdout \ --output-format anndata \ - "umap_\${n_number}.h5ad" - # Not sure if following is needed - # && mv "embeddings_neighbors_n_neighbors_\${n_number}.tsv" embeddings.tsv + "umap_\${n_number}.h5ad" \ + && mv "embeddings_neighbors_neighbors_\${n_number}.tsv" umap_n_neighbors_\${n_number}.tsv """ } process run_tsne { - publishDir "${params.result_dir_path}/tsne", mode: 'copy', pattern: 'embeddings_perplexity_*\\.tsv' + publishDir "${params.result_dir_path}/tsne", mode: 'copy', pattern: 'tsne_perplexity_*\\.tsv' errorStrategy 'ignore' @@ -583,9 +582,8 @@ process run_tsne { $anndata \ --show-obj stdout \ --output-format anndata \ - 'tsne_${perplexity_values}.h5ad' - # Not sure if following is needed - # && mv 'embeddings_perplexity_${perplexity_values}.tsv' embeddings.tsv + 'tsne_${perplexity_values}.h5ad' \ + && mv 'embeddings_perplexity_${perplexity_values}.tsv' 'tsne_perplexity_${perplexity_values}.tsv' """ } From 9b89d11321800f261ef1ef207f5dd2f4e1eae856 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 3 Dec 2024 15:58:16 +0000 Subject: [PATCH 137/159] Update main.nf - updates output names --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 309bf6dc..0a48ec6b 100644 --- a/main.nf +++ b/main.nf @@ -523,7 +523,7 @@ process run_umap { output: path "umap_*.h5ad" - path "embeddings_neighbors_neighbors_*.tsv" + path "umap_n_neighbors_*.tsv" script: """ @@ -547,7 +547,7 @@ process run_umap { 
--show-obj stdout \ --output-format anndata \ "umap_\${n_number}.h5ad" \ - && mv "embeddings_neighbors_neighbors_\${n_number}.tsv" umap_n_neighbors_\${n_number}.tsv + && mv 'embeddings_neighbors_neighbors_\${n_number}.tsv' 'umap_n_neighbors_\${n_number}.tsv' """ } @@ -565,7 +565,7 @@ process run_tsne { output: path "tsne_${perplexity_values}.h5ad" - path "embeddings_perplexity_${perplexity_values}.tsv" + path "tsne_perplexity_${perplexity_values}.tsv" script: """ From 2d4e7e314a77992de53cd3e0b6d60f625006d9db Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 3 Dec 2024 16:39:30 +0000 Subject: [PATCH 138/159] Update main.nf --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 0a48ec6b..efef95c7 100644 --- a/main.nf +++ b/main.nf @@ -547,7 +547,7 @@ process run_umap { --show-obj stdout \ --output-format anndata \ "umap_\${n_number}.h5ad" \ - && mv 'embeddings_neighbors_neighbors_\${n_number}.tsv' 'umap_n_neighbors_\${n_number}.tsv' + && mv "embeddings_neighbors_\${n_number}.tsv" umap_n_\${n_number}.tsv """ } From de3c4f4b8bdc770e76c23fa3184fd2bf9196b1c9 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 3 Dec 2024 19:52:29 +0000 Subject: [PATCH 139/159] Update main.nf - updates cluster file name to match existing and parsing script --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index efef95c7..c0beb618 100644 --- a/main.nf +++ b/main.nf @@ -434,7 +434,7 @@ process neighbors_for_umap { } process find_clusters { - publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_*.tsv' + publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_resolution_*.tsv' container params.scanpy_scripts_container input: @@ -458,7 +458,7 @@ process find_clusters { --output-format anndata \ 'clusters_${resolution}.h5ad' - mv 'output.tsv' 'clusters_${resolution}.tsv' + mv 'output.tsv' 'clusters_resolution_${resolution}.tsv' """ } @@ 
-507,7 +507,7 @@ process find_markers { $anndata \ --show-obj stdout \ --output-format anndata \ - 'markers_${merged_group_slotname}.h5ad' + 'markers_${merged_group_slotname}.h5ad' \ """ } From c300446f340fd22aad5ac47e59d7c02413a4c543 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 3 Dec 2024 20:38:44 +0000 Subject: [PATCH 140/159] Update main.nf - fixes cluster output --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index c0beb618..4d31fd07 100644 --- a/main.nf +++ b/main.nf @@ -441,7 +441,7 @@ process find_clusters { tuple path(anndata), val(resolution) output: path "clusters_${resolution}.h5ad" - path "clusters_${resolution}.tsv" + path "clusters_resolution_${resolution}.tsv" script: """ From 651119a3e797322b897f2983153b8f1cba781b67 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Wed, 4 Dec 2024 21:48:39 +0000 Subject: [PATCH 141/159] Update main.nf - changes marker file name --- main.nf | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 4d31fd07..bb89d4c2 100644 --- a/main.nf +++ b/main.nf @@ -490,10 +490,15 @@ process find_markers { output: path "markers_${merged_group_slotname}.h5ad" - path "markers_${merged_group_slotname}.tsv" + path "markers_*.tsv" script: """ + VAR="$merged_group_slotname" + PREFIX={params.slotname} + n_number="\${VAR/_\$PREFIX/}" + echo \$n_number + scanpy-find-markers \ --save 'markers_${merged_group_slotname}.tsv' \ --n-genes '100' \ @@ -508,6 +513,7 @@ process find_markers { --show-obj stdout \ --output-format anndata \ 'markers_${merged_group_slotname}.h5ad' \ + && 'markers_${merged_group_slotname}.tsv' 'markers_\${n_number}.tsv' """ } From a69967fc4458f89617ce0a9f1c726a6ca07bd7e5 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 5 Dec 2024 08:05:12 +0000 Subject: [PATCH 142/159] Update main.nf - ads log --- main.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index bb89d4c2..813feccb 
100644 --- a/main.nf +++ b/main.nf @@ -496,6 +496,8 @@ process find_markers { """ VAR="$merged_group_slotname" PREFIX={params.slotname} + echo \$VAR + echo \$PREFIX n_number="\${VAR/_\$PREFIX/}" echo \$n_number @@ -513,7 +515,7 @@ process find_markers { --show-obj stdout \ --output-format anndata \ 'markers_${merged_group_slotname}.h5ad' \ - && 'markers_${merged_group_slotname}.tsv' 'markers_\${n_number}.tsv' + && mv 'markers_${merged_group_slotname}.tsv' 'markers_\${n_number}.tsv' """ } From af972a3a1a68e80d46f639e31f5f40bc95b2bf7b Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 5 Dec 2024 09:42:33 +0000 Subject: [PATCH 143/159] Update main.nf - fixes marker rename --- main.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index 813feccb..89fc3e65 100644 --- a/main.nf +++ b/main.nf @@ -495,11 +495,11 @@ process find_markers { script: """ VAR="$merged_group_slotname" - PREFIX={params.slotname} - echo \$VAR - echo \$PREFIX - n_number="\${VAR/_\$PREFIX/}" - echo \$n_number + PREFIX="${params.slotname}_" + echo \$VAR + echo \$PREFIX + n_number="\${VAR#\$PREFIX}" + echo \$n_number scanpy-find-markers \ --save 'markers_${merged_group_slotname}.tsv' \ @@ -514,8 +514,8 @@ process find_markers { $anndata \ --show-obj stdout \ --output-format anndata \ - 'markers_${merged_group_slotname}.h5ad' \ - && mv 'markers_${merged_group_slotname}.tsv' 'markers_\${n_number}.tsv' + "markers_${merged_group_slotname}.h5ad" \ + && mv "markers_${merged_group_slotname}.tsv" "markers_\${n_number}.tsv" """ } From df76b9f65bf856870013c5f73b94069324a8f325 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Thu, 5 Dec 2024 13:31:56 +0000 Subject: [PATCH 144/159] Update main.nf - changes marker tsv name --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 89fc3e65..eaadf080 100644 --- a/main.nf +++ b/main.nf @@ -515,7 +515,7 @@ process find_markers { --show-obj stdout \ --output-format anndata 
\ "markers_${merged_group_slotname}.h5ad" \ - && mv "markers_${merged_group_slotname}.tsv" "markers_\${n_number}.tsv" + && mv "markers_${merged_group_slotname}.tsv" "markers_resolution_\${n_number}.tsv" """ } From 98ae08ee30553438266632aef60623f09cb60a3e Mon Sep 17 00:00:00 2001 From: Iris Diana Yu <17606346+irisdianauy@users.noreply.github.com> Date: Fri, 6 Dec 2024 18:05:57 +0800 Subject: [PATCH 145/159] Update main.nf - correct key added in run_umap --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index eaadf080..b50b1168 100644 --- a/main.nf +++ b/main.nf @@ -540,7 +540,7 @@ process run_umap { echo \$n_number scanpy-run-umap \ --neighbors-key "neighbors_n_\${n_number}" \ - --key-added "neighbors_\${n_number}" \ + --key-added "neighbors_n_\${n_number}" \ --export-embedding embeddings.tsv \ --n-components 2 \ --min-dist 0.5 \ From 6cb9451c59d7fb84268ab9aeb9e0d1752e9a8aef Mon Sep 17 00:00:00 2001 From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> Date: Fri, 6 Dec 2024 10:49:52 +0000 Subject: [PATCH 146/159] add _n in embeddings_neighbors_n_ --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index b50b1168..3b187b32 100644 --- a/main.nf +++ b/main.nf @@ -555,7 +555,7 @@ process run_umap { --show-obj stdout \ --output-format anndata \ "umap_\${n_number}.h5ad" \ - && mv "embeddings_neighbors_\${n_number}.tsv" umap_n_\${n_number}.tsv + && mv "embeddings_neighbors_n_\${n_number}.tsv" umap_n_\${n_number}.tsv """ } From c24f6e1a93f8a2f5fd0201abbebb78fb07c99736 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 10 Dec 2024 12:16:43 +0000 Subject: [PATCH 147/159] Update main.nf - adding log info --- main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.nf b/main.nf index 3b187b32..50bd5cfd 100644 --- a/main.nf +++ b/main.nf @@ -30,6 +30,8 @@ resolution_values: ${params.resolution_values} slotname: ${params.slotname} clustering_slotname: 
${params.clustering_slotname} merged_group_slotname: ${params.merged_group_slotname} +batch_variable: ${params.batch_variable} +representation: ${params.representation} =============================== """ From 0d3248c93e790eeb9f311842bf77ac2516f16e39 Mon Sep 17 00:00:00 2001 From: Anil Thanki Date: Tue, 10 Dec 2024 12:30:39 +0000 Subject: [PATCH 148/159] Update main.nf - filter cell process n_counts based on technology --- main.nf | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 50bd5cfd..40017769 100644 --- a/main.nf +++ b/main.nf @@ -206,8 +206,13 @@ process scanpy_filter_cells { script: """ - scanpy-filter-cells --gene-name 'gene_symbols' \ - --param 'c:n_counts' 750.0 1000000000.0 \ + n_counts=1500 + if [[ -n "$category" ]]; then + n_counts=750 + fi + + scanpy-filter-cells --gene-name 'gene_symbols' \ + --param 'c:n_counts' \$n_counts 1000000000.0 \ --param 'c:pct_counts_mito' 0.0 0.35 \ --input-format 'anndata' $anndata \ --show-obj stdout \ From 7e61db3922a1cdd51875450d9e794466606e61ab Mon Sep 17 00:00:00 2001 From: Iris Diana Yu Date: Wed, 11 Dec 2024 09:14:29 +0000 Subject: [PATCH 149/159] Add process scale_data --- main.nf | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 40017769..5bb76ccc 100644 --- a/main.nf +++ b/main.nf @@ -332,6 +332,26 @@ process find_variable_genes { """ } +process scale_data { + container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + + input: + path anndata + + output: + path 'scaled_anndata.h5ad' + + script: + """ + scanpy-scale-data \ + --input-format "anndata" \ + --output-format "anndata" \ + $anndata \ + 'scaled_anndata.h5ad' + + """ +} + process run_pca { container params.scanpy_scripts_container @@ -710,9 +730,21 @@ workflow { normalise_internal_data.out, params.batch_variable ) - run_pca( - find_variable_genes.out - ) + + if ( params.technology == "droplet" ) { + scale_data( + 
find_variable_genes.out + ) + run_pca( + scale_data.out + ) + } + else { + run_pca( + find_variable_genes.out + ) + } + harmony_batch( run_pca.out, params.batch_variable From 28064664eb6df7e73cc6e2401a615ef1f79d700b Mon Sep 17 00:00:00 2001 From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> Date: Thu, 12 Dec 2024 12:12:17 +0000 Subject: [PATCH 150/159] reorder params.neighbor_values --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 5bb76ccc..1f22ad11 100644 --- a/main.nf +++ b/main.nf @@ -9,7 +9,7 @@ params.representation = "X_pca" params.dir_path = "." params.result_dir_path = params.output_path ?: params.dir_path + "/results" params.celltype_field = 'NO_CELLTYPE_FIELD' -params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50'] +params.neighbor_values = ['3', '5', '10', '15', '20', '25', '30', '50', '100'] params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50'] params.resolution_values = ['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0'] params.slotname = "louvain_resolution" From d420d3994344e110aa503d6b9a927913b535b906 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:29:55 +0000 Subject: [PATCH 151/159] Update main.nf --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 1f22ad11..b3104978 100644 --- a/main.nf +++ b/main.nf @@ -562,6 +562,7 @@ process run_umap { script: """ + PYTHONIOENCODING=utf-8 VAR="$anndata" n_number="\${VAR%.h5ad}" echo \$n_number From 38766ea3d8c3ccde8f854ec0d5800aee027d4c04 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Thu, 12 Dec 2024 15:43:08 +0000 Subject: [PATCH 152/159] set env variable PYTHONIOENCODING = 'utf-8' for all processes --- main.nf | 6 +++--- nextflow.config | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index b3104978..109a6cbc 100644 --- a/main.nf +++ 
b/main.nf @@ -558,11 +558,11 @@ process run_umap { output: path "umap_*.h5ad" - path "umap_n_neighbors_*.tsv" + path "umap_n_neighbors_*.tsv" script: """ - PYTHONIOENCODING=utf-8 + echo \$PYTHONIOENCODING VAR="$anndata" n_number="\${VAR%.h5ad}" echo \$n_number @@ -659,7 +659,7 @@ process make_project_file { echo "\${count}" done python ${projectDir}/scripts/final_project.py - mv output.h5 project.h5ad + mv output.h5 project.h5ad """ } diff --git a/nextflow.config b/nextflow.config index e0bd55d1..8170742f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -7,6 +7,7 @@ process { queueSize=500 exitReadTimeout='100000 sec' pollInterval = '5sec' + env.PYTHONIOENCODING = 'utf-8' // error strategy // errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } // memory = { 4.GB * 2 ^task.attempt } From 18a690e0d8f5379c5091d4cd0c295707069bc5cc Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Thu, 12 Dec 2024 16:13:10 +0000 Subject: [PATCH 153/159] fix env PYTHONIOENCODING = 'utf-8' --- nextflow.config | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 8170742f..9a1f491e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,3 +1,7 @@ +env { + PYTHONIOENCODING = 'utf-8' +} + process { executor='slurm' queue="$SCXA_HPC_QUEUE" @@ -7,7 +11,6 @@ process { queueSize=500 exitReadTimeout='100000 sec' pollInterval = '5sec' - env.PYTHONIOENCODING = 'utf-8' // error strategy // errorStrategy = { task.exitStatus in 137..140 ? 
'retry' : 'terminate' } // memory = { 4.GB * 2 ^task.attempt } From 8aece46179d1ffb6da871d658594a305904d6cf1 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> Date: Thu, 12 Dec 2024 16:33:33 +0000 Subject: [PATCH 154/159] Update main.nf --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 109a6cbc..d959fa6f 100644 --- a/main.nf +++ b/main.nf @@ -562,6 +562,7 @@ process run_umap { script: """ + export PYTHONIOENCODING='utf-8' echo \$PYTHONIOENCODING VAR="$anndata" n_number="\${VAR%.h5ad}" From 931231157fbcfbcf082d2ff83b7223cc22965a63 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Fri, 13 Dec 2024 15:53:31 +0000 Subject: [PATCH 155/159] ensure PYTHONIOENCODING='utf-8' on each scanpy-script command --- main.nf | 31 +++++++++++++++++++++++++------ nextflow.config | 4 ---- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/main.nf b/main.nf index d959fa6f..a630d414 100644 --- a/main.nf +++ b/main.nf @@ -14,7 +14,7 @@ params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', params.resolution_values = ['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0'] params.slotname = "louvain_resolution" params.clustering_slotname = params.resolution_values.collect { params.slotname + "_" + it } -params.merged_group_slotname = params.clustering_slotname + params.celltype_field +params.merged_group_slotname = params.clustering_slotname.collect { it + params.celltype_field } log.info """ =============================== @@ -133,6 +133,8 @@ process scanpy_read_10x { #ln -s $matrix matrix.mtx ln -s $genes genes.tsv #ln -s $barcodes barcodes.tsv + + export PYTHONIOENCODING='utf-8' scanpy-read-10x --input-10x-mtx ./ \ --var-names 'gene_ids' \ @@ -156,6 +158,7 @@ process scanpy_multiplet_scrublet { script: """ + export PYTHONIOENCODING='utf-8' if [ -z "$batch_variable" ]; then scanpy-cli multiplet scrublet \ --input-format 'anndata' \ @@ -185,6 +188,7 @@ process 
scanpy_plot_scrublet { script: """ + export PYTHONIOENCODING='utf-8' scanpy-cli plot scrublet \ --input-format "anndata" \ --scale-hist-obs "linear" \ @@ -206,11 +210,12 @@ process scanpy_filter_cells { script: """ - n_counts=1500 - if [[ -n "$category" ]]; then - n_counts=750 - fi + n_counts=1500 + if [[ -n "$category" ]]; then + n_counts=750 + fi + export PYTHONIOENCODING='utf-8' scanpy-filter-cells --gene-name 'gene_symbols' \ --param 'c:n_counts' \$n_counts 1000000000.0 \ --param 'c:pct_counts_mito' 0.0 0.35 \ @@ -240,6 +245,7 @@ process scanpy_filter_genes { script: """ + export PYTHONIOENCODING='utf-8' scanpy-filter-genes \ --param 'g:n_cells' 3.0 1000000000.0 \ --subset 'g:index' \ @@ -269,6 +275,7 @@ process normalise_data { script: """ + export PYTHONIOENCODING='utf-8' scanpy-normalise-data \ --no-log-transform \ --normalize-to '1000000.0' \ @@ -291,6 +298,7 @@ process normalise_internal_data { script: """ + export PYTHONIOENCODING='utf-8' scanpy-normalise-data \ --normalize-to '1000000.0' \ --input-format 'anndata' $anndata \ @@ -317,7 +325,7 @@ process find_variable_genes { batch_variable_tag="--batch-key $batch_variable" fi - + export PYTHONIOENCODING='utf-8' scanpy-find-variable-genes \ --flavor 'seurat' \ --mean-limits 0.0125 1000000000.0 \ @@ -343,6 +351,7 @@ process scale_data { script: """ + export PYTHONIOENCODING='utf-8' scanpy-scale-data \ --input-format "anndata" \ --output-format "anndata" \ @@ -363,6 +372,7 @@ process run_pca { script: """ + export PYTHONIOENCODING='utf-8' scanpy-run-pca \ --no-zero-center \ --svd-solver 'arpack' \ @@ -386,6 +396,7 @@ process harmony_batch { script: """ + export PYTHONIOENCODING='utf-8' if [[ -n "$batch_variable" ]]; then scanpy-integrate harmony \ --batch-key $batch_variable \ @@ -416,6 +427,7 @@ process neighbors { script: """ + export PYTHONIOENCODING='utf-8' scanpy-neighbors \ --n-neighbors 15 \ --method 'umap' \ @@ -443,6 +455,7 @@ process neighbors_for_umap { path "neighbors_${n_neighbors}.h5ad" script: 
""" + export PYTHONIOENCODING='utf-8' scanpy-neighbors \ --n-neighbors $n_neighbors \ --key-added 'neighbors_n_neighbors_${n_neighbors}' \ @@ -472,6 +485,7 @@ process find_clusters { script: """ + export PYTHONIOENCODING='utf-8' scanpy-find-cluster louvain \ --neighbors-key 'neighbors' \ --key-added 'louvain_resolution_${resolution}' \ @@ -500,6 +514,7 @@ process restore_unscaled { script: """ + export PYTHONIOENCODING='utf-8' ln -s $anndata input.h5 ln -s $normalise_internal_data r_source.h5 python ${projectDir}/scripts/restore_unscaled.py @@ -528,6 +543,8 @@ process find_markers { n_number="\${VAR#\$PREFIX}" echo \$n_number + export PYTHONIOENCODING='utf-8' + scanpy-find-markers \ --save 'markers_${merged_group_slotname}.tsv' \ --n-genes '100' \ @@ -606,6 +623,7 @@ process run_tsne { script: """ + export PYTHONIOENCODING='utf-8' scanpy-run-tsne \ --use-rep $representation \ --export-embedding embeddings.tsv \ @@ -640,6 +658,7 @@ process make_project_file { path "project.h5ad" script: """ + export PYTHONIOENCODING='utf-8' ln -s $neighbors input.h5 ln -s $scanpy_read_10x r_source.h5 ln -s '$filter_genes' x_source_0.h5 diff --git a/nextflow.config b/nextflow.config index 9a1f491e..e0bd55d1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,7 +1,3 @@ -env { - PYTHONIOENCODING = 'utf-8' -} - process { executor='slurm' queue="$SCXA_HPC_QUEUE" From 27503b59c0a31611252556aafd4d844f75eb33f3 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Wed, 18 Dec 2024 11:54:44 +0000 Subject: [PATCH 156/159] upgrade scanpy-scripts --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index a630d414..afe544ef 100644 --- a/main.nf +++ b/main.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 -params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0" +params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.9.301--pyhdfd78af_0" params.technology = "plate" params.batch_variable = "" 
params.representation = "X_pca" @@ -341,7 +341,7 @@ process find_variable_genes { } process scale_data { - container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0' + container params.scanpy_scripts_container input: path anndata From ecdddb71572a905862ee8e17b9a3f17a75f5bd76 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Wed, 18 Dec 2024 14:12:45 +0000 Subject: [PATCH 157/159] pin production version of scanpy-scripts --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index afe544ef..54176bd0 100644 --- a/main.nf +++ b/main.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 -params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.9.301--pyhdfd78af_0" +params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.1.2--pypyhdfd78af_1" params.technology = "plate" params.batch_variable = "" params.representation = "X_pca" From c2a285200fe85c5ecd0252167f7587deec223a8b Mon Sep 17 00:00:00 2001 From: Pedro Madrigal Date: Thu, 19 Dec 2024 17:54:39 +0000 Subject: [PATCH 158/159] upgrade s. 
scripts --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 54176bd0..afe544ef 100644 --- a/main.nf +++ b/main.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 -params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.1.2--pypyhdfd78af_1" +params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.9.301--pyhdfd78af_0" params.technology = "plate" params.batch_variable = "" params.representation = "X_pca" From ef03cdacd9d843c36e707a935d6147deb3691d06 Mon Sep 17 00:00:00 2001 From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com> Date: Fri, 20 Dec 2024 15:33:08 +0000 Subject: [PATCH 159/159] specify a Singularity cache directory --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index e0bd55d1..9f892942 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,7 +15,7 @@ process { singularity { enabled = true - // cacheDir = "$SCXA_SINGULARITY_CACHE" + cacheDir = "$SCXA_SINGULARITY_CACHE" } conda {