From fdd8dc0790eea403a3a730f5cca2ebf0943a4235 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 25 Oct 2024 15:04:32 +0100
Subject: [PATCH 001/159] Update main.nf - adding wf process names

---
 main.nf | 306 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 306 insertions(+)

diff --git a/main.nf b/main.nf
index 8b137891..099c6710 100644
--- a/main.nf
+++ b/main.nf
@@ -1 +1,307 @@
+#!/usr/bin/env nextflow
 
+nextflow.enable.dsl=2
+
+// Define inputs as channels
+Channel.fromPath('genemeta_data.txt').set { genemeta }
+Channel.fromPath('genes_data.txt').set { genes }
+Channel.fromPath('barcodes_data.txt').set { barcodes }
+Channel.fromPath('matrix_data.txt').set { matrix }
+Channel.fromPath('cellmeta_data.txt').set { cellmeta }
+Channel.value('X_pca').set { pca_param }
+Channel.value('NO_CELLTYPE_FIELD').set { celltype_field_param }
+Channel.value('').set { batch_variable }
+Channel.value(['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']).set { perplexity_values }
+Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']).set { resolution_values }
+
+/*
+ * Column_rearrange_1: Only keeps the specified columns and removes header
+ */
+process Column_rearrange_1 {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+/*
+ * Column_rearrange_2: Only keeps the specified columns and removes header
+ */
+process Column_rearrange_2 {
+    // Set the output file
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+/*
+ * mergeGeneFiles: Merges gene file with genemeta on column 1, and keeps column1 and 4
+ */
+process mergeGeneFiles {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process scanpy_read_10x {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process scanpy_filter_cells {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process scanpy_filter_genes {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process normalise_data {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process normalise_data_internal {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process find_variable_genes {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process run_pca {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process harmony_batch {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process neighbours {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process neighbours_for_umap {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process normalise_data {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process find_clusters {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process meta_vars {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process clustering_slotnames {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process merge_group_slotnames {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process merge_collections {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process build_list {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process find_markers {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process filtered_cellgroup_markers {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process run_umap {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process run_tsne {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process filter_failed_umap {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process filer_failed_tsne {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+process merge_embeddings {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}
+
+
+
+process make_project_file {
+    input:
+
+    output:
+
+    script:
+    """
+    """
+}

From fb359c80142ce5cacd2b80a44d6b7ab5528f9681 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 25 Oct 2024 15:16:05 +0100
Subject: [PATCH 002/159] Update main.nf - populate Column_rearrange processes

---
 main.nf | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/main.nf b/main.nf
index 099c6710..6b4752d5 100644
--- a/main.nf
+++ b/main.nf
@@ -18,12 +18,27 @@ Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']).s
  * Column_rearrange_1: Only keeps the specified columns and removes header
  */
 process Column_rearrange_1 {
+    // Set the output file
     input:
+      path genemeta
+      val col
 
     output:
+      path 'filtered_genemeta.txt'
 
     script:
     """
+      # Find the column number of the specified gene_id column name
+      col_num=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col\$" | cut -d: -f1)
+  
+      # If column is found, extract it; otherwise, raise an error
+      if [[ -z "\$col_num" ]]; then
+          echo "Error: Column '$col' not found in $genemeta" >&2
+          exit 1
+      fi
+  
+      # Extract the gene_id column (without the header)
+      tail -n +2 "$genemeta" | cut -f\$col_num > filtered_genemeta.txt
     """
 }
 
@@ -33,11 +48,27 @@ process Column_rearrange_1 {
 process Column_rearrange_2 {
     // Set the output file
     input:
+      path genemeta
+      val col1
+      val col2
 
     output:
+      path 'filtered_genemeta_2.txt'
 
     script:
     """
+      # Find the column number of the specified gene_id column name
+      col_num_1=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col1\$" | cut -d: -f1)
+      col_num_2=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col2\$" | cut -d: -f1)
+  
+      # If either column is not found, raise an error
+      if [[ -z "\$col1_num" || -z "\$col2_num" ]]; then
+          echo "Error: Column '$col1' or '$col2' not found in $genemeta" >&2
+          exit 1
+      fi
+  
+      # Extract the gene_id column (without the header)
+      tail -n +2 "$genemeta" | cut -f\$col1_num,\$col2_num > filtered_genemeta.txt
     """
 }
 

From 47d6662c71f8413f89d2cdf27182bb56c7da21bc Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 25 Oct 2024 15:16:46 +0100
Subject: [PATCH 003/159] Update main.nf - populate mergeGeneFiles

---
 main.nf | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/main.nf b/main.nf
index 6b4752d5..010c3599 100644
--- a/main.nf
+++ b/main.nf
@@ -77,11 +77,20 @@ process Column_rearrange_2 {
  */
 process mergeGeneFiles {
     input:
+      path gene from params.genes
+      path filtered_genemeta from Column_rearrange_1.out
 
     output:
+      path params.output
 
     script:
     """
+      # Sort both files by the first column for join compatibility
+      sort -k1,1 "$gene" > sorted_gene.txt
+      sort -k1,1 filtered_genemeta.txt > sorted_genemeta.txt
+  
+      # Perform a left join to keep all data from gene file
+      join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > ${params.output} 
     """
 }
 

From f87d49b8c58b90b329aa0a8ddad4a8dba79d924c Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Fri, 25 Oct 2024 15:47:31 +0100
Subject: [PATCH 004/159] add nextflow.config for Slurm

---
 nextflow.config | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index 8b137891..1de8fc38 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1 +1,20 @@
+process {
+    executor='slurm'
+    queue="$SCXA_HPC_QUEUE"
+    clusterOptions="$SCXA_HPC_OPTIONS"
+    time = '7 d'
+    memory = '4 GB'
+    queueSize=500
+    exitReadTimeout='100000 sec'
+    pollInterval = '5sec'
+}
 
+conda {
+    cacheDir = "$SCXA_WORKFLOW_ROOT/envs"
+    createTimeout = "30 min"
+    useMamba = true
+}
+
+params {
+
+}

From 7169e98d309a7a7b722723e8864927db4a21c32b Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 25 Oct 2024 16:02:58 +0100
Subject: [PATCH 005/159] Update main.nf - moving input channels to workflow

---
 main.nf | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/main.nf b/main.nf
index 010c3599..8dfb3fd8 100644
--- a/main.nf
+++ b/main.nf
@@ -77,20 +77,20 @@ process Column_rearrange_2 {
  */
 process mergeGeneFiles {
     input:
-      path gene from params.genes
-      path filtered_genemeta from Column_rearrange_1.out
+      path gene
+      path filtered_genemeta
 
     output:
       path params.output
 
     script:
     """
-      # Sort both files by the first column for join compatibility
-      sort -k1,1 "$gene" > sorted_gene.txt
-      sort -k1,1 filtered_genemeta.txt > sorted_genemeta.txt
-  
-      # Perform a left join to keep all data from gene file
-      join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > ${params.output} 
+        # Sort both files by the first column for join compatibility
+        sort -k1,1 "$gene" > sorted_gene.txt
+        sort -k1,1 filtered_genemeta.txt > sorted_genemeta.txt
+        
+        # Perform a left join to keep all data from gene file
+        join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > ${params.output} 
     """
 }
 
@@ -98,6 +98,10 @@ process scanpy_read_10x {
     input:
 
     output:
+        path anndata
+
+    conda:
+        
 
     script:
     """
@@ -345,3 +349,23 @@ process make_project_file {
     """
     """
 }
+
+workflow {
+
+    // Create input channel (single file via CLI parameter)
+    genemeta = Channel.fromPath('genemeta_data.txt')
+    genes = Channel.fromPath('genes_data.txt')
+    barcodes = Channel.fromPath('barcodes_data.txt')
+    matrix = Channel.fromPath('matrix_data.txt')
+    cellmeta = Channel.fromPath('cellmeta_data.txt')
+    pca_param = Channel.value('X_pca')
+    celltype_field_param = Channel.value('NO_CELLTYPE_FIELD')
+    batch_variable = Channel.value('')
+    perplexity_values = Channel.value(['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50'])
+    resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0'])
+
+
+    // Create index file for input BAM file
+    Column_rearrange_1(genemeta, "gene_id")
+    Column_rearrange_2(genemeta, "gene_id", "gene_name")
+}

From b7de1405b452bcbf8551dac9adff71f05bc04784 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 25 Oct 2024 16:04:20 +0100
Subject: [PATCH 006/159] Update main.nf - deleting redundant input channels

---
 main.nf | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/main.nf b/main.nf
index 8dfb3fd8..bc04c954 100644
--- a/main.nf
+++ b/main.nf
@@ -2,18 +2,6 @@
 
 nextflow.enable.dsl=2
 
-// Define inputs as channels
-Channel.fromPath('genemeta_data.txt').set { genemeta }
-Channel.fromPath('genes_data.txt').set { genes }
-Channel.fromPath('barcodes_data.txt').set { barcodes }
-Channel.fromPath('matrix_data.txt').set { matrix }
-Channel.fromPath('cellmeta_data.txt').set { cellmeta }
-Channel.value('X_pca').set { pca_param }
-Channel.value('NO_CELLTYPE_FIELD').set { celltype_field_param }
-Channel.value('').set { batch_variable }
-Channel.value(['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']).set { perplexity_values }
-Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']).set { resolution_values }
-
 /*
  * Column_rearrange_1: Only keeps the specified columns and removes header
  */

From aab1b63898336acc128e3890beca4b06c786036d Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 25 Oct 2024 16:08:53 +0100
Subject: [PATCH 007/159] Update main.nf - adds wf step for mergeGeneFiles

---
 main.nf | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index bc04c954..1bbea5cf 100644
--- a/main.nf
+++ b/main.nf
@@ -56,7 +56,7 @@ process Column_rearrange_2 {
       fi
   
       # Extract the gene_id column (without the header)
-      tail -n +2 "$genemeta" | cut -f\$col1_num,\$col2_num > filtered_genemeta.txt
+      tail -n +2 "$genemeta" | cut -f\$col1_num,\$col2_num > filtered_genemeta_2.txt
     """
 }
 
@@ -75,7 +75,7 @@ process mergeGeneFiles {
     """
         # Sort both files by the first column for join compatibility
         sort -k1,1 "$gene" > sorted_gene.txt
-        sort -k1,1 filtered_genemeta.txt > sorted_genemeta.txt
+        sort -k1,1 "$filtered_genemeta" > sorted_genemeta.txt
         
         # Perform a left join to keep all data from gene file
         join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > ${params.output} 
@@ -356,4 +356,8 @@ workflow {
     // Create index file for input BAM file
     Column_rearrange_1(genemeta, "gene_id")
     Column_rearrange_2(genemeta, "gene_id", "gene_name")
+    mergeGeneFiles(
+        genes,
+        Column_rearrange_2.out
+    )
 }

From 681aeb5439d850299f40a46b4c816e50857a8c7c Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 25 Oct 2024 16:15:59 +0100
Subject: [PATCH 008/159] Update main.nf - adds workflow step for
 scanpy-read-10x

---
 main.nf | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/main.nf b/main.nf
index 1bbea5cf..a9e83bdb 100644
--- a/main.nf
+++ b/main.nf
@@ -84,6 +84,11 @@ process mergeGeneFiles {
 
 process scanpy_read_10x {
     input:
+        path matrix
+        path mergeGeneFiles.out
+        path barcodes
+        path cellmeta
+        path genemeta
 
     output:
         path anndata
@@ -93,6 +98,17 @@ process scanpy_read_10x {
 
     script:
     """
+        ln -s $matrix matrix.mtx
+        ln -s $genes genes.tsv
+        ln -s $barcodes barcodes.tsv
+        
+        scanpy-read-10x --input-10x-mtx ./ \
+        --var-names 'gene_ids' \
+        --extra-obs $cellmeta \
+        --extra-var $genemeta  \
+        --show-obj stdout \
+        --output-format anndata \
+        $anndata
     """
 }
 
@@ -360,4 +376,11 @@ workflow {
         genes,
         Column_rearrange_2.out
     )
+    scanpy_read_10x(
+        matrix,
+        mergeGeneFiles.out,
+        barcodes,
+        cellmeta,
+        genemeta
+    )  
 }

From 4cec90b286ad8d9a243decbd26f6bba799649235 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 25 Oct 2024 16:35:05 +0100
Subject: [PATCH 009/159] Update main.nf - fixing typo

---
 main.nf | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index a9e83bdb..74c9fa8b 100644
--- a/main.nf
+++ b/main.nf
@@ -50,13 +50,13 @@ process Column_rearrange_2 {
       col_num_2=\$(head -n1 "$genemeta" | tr '\\t' '\\n' | grep -n "^$col2\$" | cut -d: -f1)
   
       # If either column is not found, raise an error
-      if [[ -z "\$col1_num" || -z "\$col2_num" ]]; then
+      if [[ -z "\$col_num_1" || -z "\$col_num_2" ]]; then
           echo "Error: Column '$col1' or '$col2' not found in $genemeta" >&2
           exit 1
       fi
   
       # Extract the gene_id column (without the header)
-      tail -n +2 "$genemeta" | cut -f\$col1_num,\$col2_num > filtered_genemeta_2.txt
+      tail -n +2 "$genemeta" | cut -f\$col_num_1,\$col_num_2 > filtered_genemeta_2.txt
     """
 }
 
@@ -357,11 +357,11 @@ process make_project_file {
 workflow {
 
     // Create input channel (single file via CLI parameter)
-    genemeta = Channel.fromPath('genemeta_data.txt')
-    genes = Channel.fromPath('genes_data.txt')
-    barcodes = Channel.fromPath('barcodes_data.txt')
+    genemeta = Channel.fromPath('gene_metadata.tsv')
+    genes = Channel.fromPath('genes.tsv')
+    barcodes = Channel.fromPath('barcodes_data.tsv')
     matrix = Channel.fromPath('matrix_data.txt')
-    cellmeta = Channel.fromPath('cellmeta_data.txt')
+    cellmeta = Channel.fromPath('cell_metadata.tsv')
     pca_param = Channel.value('X_pca')
     celltype_field_param = Channel.value('NO_CELLTYPE_FIELD')
     batch_variable = Channel.value('')

From 9842c6a2c8caef565d2e8eecfdc04de82e5f114c Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 25 Oct 2024 16:46:47 +0100
Subject: [PATCH 010/159] Update main.nf - fixes input file names, process
 mergeGeneFiles and scanpy_read_10x

---
 main.nf | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/main.nf b/main.nf
index 74c9fa8b..1f854078 100644
--- a/main.nf
+++ b/main.nf
@@ -69,7 +69,7 @@ process mergeGeneFiles {
       path filtered_genemeta
 
     output:
-      path params.output
+      path 'merged_genemeta.tsv'
 
     script:
     """
@@ -78,29 +78,26 @@ process mergeGeneFiles {
         sort -k1,1 "$filtered_genemeta" > sorted_genemeta.txt
         
         # Perform a left join to keep all data from gene file
-        join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > ${params.output} 
+        join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > merged_genemeta.tsv
     """
 }
 
 process scanpy_read_10x {
     input:
         path matrix
-        path mergeGeneFiles.out
+        path genes
         path barcodes
         path cellmeta
         path genemeta
 
     output:
-        path anndata
-
-    conda:
-        
+        path 'anndata.h5ad'
 
     script:
     """
-        ln -s $matrix matrix.mtx
-        ln -s $genes genes.tsv
-        ln -s $barcodes barcodes.tsv
+        #ln -s $matrix matrix.mtx
+        #ln -s $genes genes.tsv
+        #ln -s $barcodes barcodes.tsv
         
         scanpy-read-10x --input-10x-mtx ./ \
         --var-names 'gene_ids' \
@@ -108,7 +105,7 @@ process scanpy_read_10x {
         --extra-var $genemeta  \
         --show-obj stdout \
         --output-format anndata \
-        $anndata
+        'anndata.h5ad'
     """
 }
 
@@ -359,8 +356,8 @@ workflow {
     // Create input channel (single file via CLI parameter)
     genemeta = Channel.fromPath('gene_metadata.tsv')
     genes = Channel.fromPath('genes.tsv')
-    barcodes = Channel.fromPath('barcodes_data.tsv')
-    matrix = Channel.fromPath('matrix_data.txt')
+    barcodes = Channel.fromPath('barcodes.tsv')
+    matrix = Channel.fromPath('matrix.mtx')
     cellmeta = Channel.fromPath('cell_metadata.tsv')
     pca_param = Channel.value('X_pca')
     celltype_field_param = Channel.value('NO_CELLTYPE_FIELD')

From c23bb3abe5554afc4dcb068f0974f86b577dcad3 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 09:56:13 +0000
Subject: [PATCH 011/159] removes duplicated process and renames a file

---
 main.nf | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/main.nf b/main.nf
index 1f854078..9797f532 100644
--- a/main.nf
+++ b/main.nf
@@ -199,16 +199,6 @@ process neighbours_for_umap {
     """
 }
 
-process normalise_data {
-    input:
-
-    output:
-
-    script:
-    """
-    """
-}
-
 process find_clusters {
     input:
 
@@ -354,7 +344,7 @@ process make_project_file {
 workflow {
 
     // Create input channel (single file via CLI parameter)
-    genemeta = Channel.fromPath('gene_metadata.tsv')
+    genemeta = Channel.fromPath('genes_metadata.tsv')
     genes = Channel.fromPath('genes.tsv')
     barcodes = Channel.fromPath('barcodes.tsv')
     matrix = Channel.fromPath('matrix.mtx')

From 2b039d0a5cd2bdc305cf71032181d85cd645e4ea Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 10:15:53 +0000
Subject: [PATCH 012/159] populate scanpy_filter_cells process

---
 main.nf | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/main.nf b/main.nf
index 9797f532..16cc8512 100644
--- a/main.nf
+++ b/main.nf
@@ -111,11 +111,21 @@ process scanpy_read_10x {
 
 process scanpy_filter_cells {
     input:
+        path anndata
 
     output:
+        path 'filtered_cell_anndata.h5ad'
 
     script:
     """
+        scanpy-filter-cells --gene-name 'gene_symbols' \
+        --param 'c:n_counts' 750.0 1000000000.0 \
+        --param 'c:pct_counts_mito' 0.0 0.35 \
+        --category 'c:predicted_doublet' 'False' \
+        --input-format 'anndata' input.h5  \
+        --show-obj stdout \
+        --output-format anndata 'filtered_cell_anndata.h5ad'  \
+        --export-mtx ./
     """
 }
 
@@ -370,4 +380,7 @@ workflow {
         cellmeta,
         genemeta
     )  
+    scanpy_filter_cells(
+        scanpy_read_10x.out
+    )  
 }

From 32731d0d29f4a678d8a941b92ae4de62276a94a9 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 10:21:37 +0000
Subject: [PATCH 013/159] replaces anndata file name with var

---
 main.nf | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index 16cc8512..8683e9da 100644
--- a/main.nf
+++ b/main.nf
@@ -112,6 +112,7 @@ process scanpy_read_10x {
 process scanpy_filter_cells {
     input:
         path anndata
+        path genes
 
     output:
         path 'filtered_cell_anndata.h5ad'
@@ -122,7 +123,7 @@ process scanpy_filter_cells {
         --param 'c:n_counts' 750.0 1000000000.0 \
         --param 'c:pct_counts_mito' 0.0 0.35 \
         --category 'c:predicted_doublet' 'False' \
-        --input-format 'anndata' input.h5  \
+        --input-format 'anndata' $anndata  \
         --show-obj stdout \
         --output-format anndata 'filtered_cell_anndata.h5ad'  \
         --export-mtx ./
@@ -380,7 +381,4 @@ workflow {
         cellmeta,
         genemeta
     )  
-    scanpy_filter_cells(
-        scanpy_read_10x.out
-    )  
 }

From 9de5090e757f1fb71e1d0e2ab90a71835e50d5ce Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 10:29:07 +0000
Subject: [PATCH 014/159] Populate scanpy_filter_genes process

---
 main.nf | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/main.nf b/main.nf
index 8683e9da..fbf54bda 100644
--- a/main.nf
+++ b/main.nf
@@ -132,11 +132,23 @@ process scanpy_filter_cells {
 
 process scanpy_filter_genes {
     input:
+        path anndata
+        path genes
 
     output:
+        path 'filtered_gene_anndata.h5ad'
 
     script:
     """
+        scanpy-filter-genes \
+        --param 'g:n_cells' 3.0 1000000000.0 \
+        --subset 'g:index' \
+        $genes \
+        --input-format 'anndata' $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        filtered_gene_anndata.h5ad'  \
+        --export-mtx ./
     """
 }
 
@@ -381,4 +393,8 @@ workflow {
         cellmeta,
         genemeta
     )  
+    scanpy_filter_cells(
+        scanpy_read_10x.out,
+        Column_rearrange_1.out[0]
+    )  
 }

From 772d8414144a0001678c23549f2fd945cfb1b7f1 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 10:31:13 +0000
Subject: [PATCH 015/159] Adds missing quote

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index fbf54bda..23bfbd29 100644
--- a/main.nf
+++ b/main.nf
@@ -147,7 +147,7 @@ process scanpy_filter_genes {
         --input-format 'anndata' $anndata \
         --show-obj stdout \
         --output-format anndata \
-        filtered_gene_anndata.h5ad'  \
+        'filtered_gene_anndata.h5ad'  \
         --export-mtx ./
     """
 }

From f8ae27e07bd114b86c3bc78ed40689b7657b67bb Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 10:36:11 +0000
Subject: [PATCH 016/159] Populate normalise_data process

---
 main.nf | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index 23bfbd29..5954def9 100644
--- a/main.nf
+++ b/main.nf
@@ -102,7 +102,7 @@ process scanpy_read_10x {
         scanpy-read-10x --input-10x-mtx ./ \
         --var-names 'gene_ids' \
         --extra-obs $cellmeta \
-        --extra-var $genemeta  \
+        --extra-var $genemeta \
         --show-obj stdout \
         --output-format anndata \
         'anndata.h5ad'
@@ -123,9 +123,9 @@ process scanpy_filter_cells {
         --param 'c:n_counts' 750.0 1000000000.0 \
         --param 'c:pct_counts_mito' 0.0 0.35 \
         --category 'c:predicted_doublet' 'False' \
-        --input-format 'anndata' $anndata  \
+        --input-format 'anndata' $anndata \
         --show-obj stdout \
-        --output-format anndata 'filtered_cell_anndata.h5ad'  \
+        --output-format anndata 'filtered_cell_anndata.h5ad' \
         --export-mtx ./
     """
 }
@@ -147,18 +147,28 @@ process scanpy_filter_genes {
         --input-format 'anndata' $anndata \
         --show-obj stdout \
         --output-format anndata \
-        'filtered_gene_anndata.h5ad'  \
+        'filtered_gene_anndata.h5ad' \
         --export-mtx ./
     """
 }
 
 process normalise_data {
     input:
+        path anndata
 
     output:
+        path 'normalised_anndata.h5ad'
 
     script:
     """
+        scanpy-normalise-data \
+        --no-log-transform \
+        --normalize-to '1000000.0' \
+        --input-format 'anndata' $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'normalised_anndata.h5ad' \
+        --export-mtx ./
     """
 }
 
@@ -392,9 +402,12 @@ workflow {
         barcodes,
         cellmeta,
         genemeta
-    )  
+    )
     scanpy_filter_cells(
         scanpy_read_10x.out,
         Column_rearrange_1.out[0]
-    )  
+    )
+    normalise_data(
+        scanpy_filter_cells.out
+    )
 }

From c02a8c4506094448731dcf028c7c2310782b434d Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 10:50:10 +0000
Subject: [PATCH 017/159] Populate normalise_data_internal process

---
 main.nf | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/main.nf b/main.nf
index 5954def9..82549357 100644
--- a/main.nf
+++ b/main.nf
@@ -174,11 +174,19 @@ process normalise_data {
 
 process normalise_data_internal {
     input:
+        path anndata
 
     output:
+        path 'normalised_internal_anndata.h5ad'
 
     script:
     """
+        scanpy-normalise-data \
+        --normalize-to '1000000.0' \
+        --input-format 'anndata' $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'normalised_internal_anndata.h5ad' 
     """
 }
 
@@ -410,4 +418,7 @@ workflow {
     normalise_data(
         scanpy_filter_cells.out
     )
+    normalise_internal_data(
+        scanpy_filter_cells.out
+    )
 }

From 813c7ac46d2473658a133b285d8fd371478a201b Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 11:00:43 +0000
Subject: [PATCH 018/159] Populate find_variable_genes process

---
 main.nf | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/main.nf b/main.nf
index 82549357..1b3a6376 100644
--- a/main.nf
+++ b/main.nf
@@ -192,11 +192,23 @@ process normalise_data_internal {
 
 process find_variable_genes {
     input:
+        path anndata
 
     output:
+        path 'variable_genes.h5ad'
 
     script:
     """
+        scanpy-find-variable-genes \
+        --flavor 'seurat' \
+        --mean-limits 0.0125 1000000000.0 \
+        --disp-limits 0.5 50.0 \
+        --span 0.3 \
+        --n-bins '20' \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata 'variable_genes.h5ad'
     """
 }
 
@@ -421,4 +433,7 @@ workflow {
     normalise_internal_data(
         scanpy_filter_cells.out
     )
+    find_variable_genes(
+        normalise_internal_data.out
+    )
 }

From 418610e3d85ad967371febbca813da5fd333444f Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 11:51:23 +0000
Subject: [PATCH 019/159] Populate run_PCA process

---
 main.nf | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/main.nf b/main.nf
index 1b3a6376..227ba679 100644
--- a/main.nf
+++ b/main.nf
@@ -214,11 +214,22 @@ process find_variable_genes {
 
 process run_pca {
     input:
+        path anndata
 
     output:
+        path 'PCA.h5ad'
 
     script:
     """
+        scanpy-run-pca \
+        --no-zero-center \
+        --svd-solver 'arpack' \
+        --random-state '1234' \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'PCA.h5ad'
     """
 }
 
@@ -436,4 +447,7 @@ workflow {
     find_variable_genes(
         normalise_internal_data.out
     )
+    run_pca(
+        find_variable_genes.out
+    )
 }

From e3e74b0132c6635679c4d1fc856351a5f9999bbd Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 12:00:32 +0000
Subject: [PATCH 020/159] Populate harmony_batch process

---
 main.nf | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 227ba679..cae4698d 100644
--- a/main.nf
+++ b/main.nf
@@ -235,11 +235,16 @@ process run_pca {
 
 process harmony_batch {
     input:
-
+        path anndata
     output:
+        path 'harmony.h5ad'
 
     script:
     """
+        echo "No batch variables passed, simply passing original input as output unchanged."
+
+        cp $anndata 'harmony.h5ad'
+
     """
 }
 
@@ -450,4 +455,7 @@ workflow {
     run_pca(
         find_variable_genes.out
     )
+    harmony_batch(
+        run_pca.out
+    )
 }

From 9a23c62ef7832aa87439c428ff68fcc57d874e7b Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 12:15:03 +0000
Subject: [PATCH 021/159] Populate neighbours process

---
 main.nf | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index cae4698d..26d12335 100644
--- a/main.nf
+++ b/main.nf
@@ -250,11 +250,26 @@ process harmony_batch {
 
 process neighbours {
     input:
-
+        path anndata
+        val pca_param
     output:
+        path 'neighbours.h5ad'
 
     script:
     """
+        scanpy-neighbors \
+        --n-neighbors 15 \
+        --method 'umap' \
+        --metric 'euclidean' \
+        --random-state '0' \
+        --use-rep $pca_param \
+        --n-pcs '50' \
+        --input-format 'anndata' \
+        $anndata \
+        --show-obj stdout \
+        --output-format anndata \
+        'neighbours.h5ad'
+
     """
 }
 
@@ -458,4 +473,8 @@ workflow {
     harmony_batch(
         run_pca.out
     )
+    neighbours(
+        harmony_batch.out,
+        pca_param
+    )
 }

From e5e82e312c04e23c8cc968d7377a4db5cd314f6c Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 14:10:09 +0000
Subject: [PATCH 022/159] Applies batch_variable

---
 main.nf | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index 26d12335..317fe2bf 100644
--- a/main.nf
+++ b/main.nf
@@ -193,18 +193,27 @@ process normalise_data_internal {
 process find_variable_genes {
     input:
         path anndata
+        val batch_variable
 
     output:
         path 'variable_genes.h5ad'
 
     script:
     """
+        if [[ -z "\$batch_variable" ]]; then
+            batch_variable = "--batch-key $batch_variable"
+        else
+            batch_variable = ""
+        fi
+
+
         scanpy-find-variable-genes \
         --flavor 'seurat' \
         --mean-limits 0.0125 1000000000.0 \
         --disp-limits 0.5 50.0 \
         --span 0.3 \
         --n-bins '20' \
+        $batch_variable \
         --input-format 'anndata' \
         $anndata \
         --show-obj stdout \
@@ -236,14 +245,27 @@ process run_pca {
 process harmony_batch {
     input:
         path anndata
+        val batch_variable
     output:
         path 'harmony.h5ad'
 
     script:
     """
-        echo "No batch variables passed, simply passing original input as output unchanged."
+        if [[ -z "\$batch_variable" ]]; then
+            scanpy-integrate harmony \
+            --batch-key $batch_variable \
+            --basis 'X_pca' \
+            --adjusted-basis 'X_pca_harmony' \
+            --input-format 'anndata' \
+            $anndata \
+            --show-obj stdout \
+            --output-format anndata \
+            'harmony.h5ad'
+        else
+            echo "No batch variables passed, simply passing original input as output unchanged."
 
-        cp $anndata 'harmony.h5ad'
+            cp $anndata 'harmony.h5ad'
+        fi
 
     """
 }
@@ -465,13 +487,15 @@ workflow {
         scanpy_filter_cells.out
     )
     find_variable_genes(
-        normalise_internal_data.out
+        normalise_internal_data.out,
+        batch_variable
     )
     run_pca(
         find_variable_genes.out
     )
     harmony_batch(
-        run_pca.out
+        run_pca.out,
+        batch_variable
     )
     neighbours(
         harmony_batch.out,

From 458a7b4c4c7bd0157571919a97eb4fb0a78c18f9 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 15:41:52 +0000
Subject: [PATCH 023/159] Populate neighbours_for_umap process

---
 main.nf | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 317fe2bf..da73e715 100644
--- a/main.nf
+++ b/main.nf
@@ -297,11 +297,28 @@ process neighbours {
 
 process neighbours_for_umap {
     input:
-
+        path anndata
+        val n_neighbours
     output:
-
+        path 'neighbours_*.h5ad'
     script:
     """
+        for i in $sample
+        do
+            scanpy-neighbors \
+                --n-neighbors \$i \
+                --method 'umap' \
+                --metric 'euclidean' \
+                --random-state '0' \
+                --use-rep $pca_param \
+                --n-pcs '50' \
+                --input-format 'anndata' \
+                $anndata \
+                --show-obj stdout \
+                --output-format anndata \
+                'neighbours\$i.h5ad'
+        done
+
     """
 }
 
@@ -460,6 +477,7 @@ workflow {
     batch_variable = Channel.value('')
     perplexity_values = Channel.value(['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50'])
     resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0'])
+    neighbor_values = Channel.value("10 100 15 20 25 3 30 5 50")
 
 
     // Create index file for input BAM file
@@ -501,4 +519,8 @@ workflow {
         harmony_batch.out,
         pca_param
     )
+    neighbours_for_umap(
+        harmony_batch.out,
+        pca_param
+    )
 }

From 72c747309ea74858471d5f60ff45bca42be86fcf Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 15:54:15 +0000
Subject: [PATCH 024/159] removes random comment

---
 main.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/main.nf b/main.nf
index da73e715..e8e1ea58 100644
--- a/main.nf
+++ b/main.nf
@@ -480,7 +480,6 @@ workflow {
     neighbor_values = Channel.value("10 100 15 20 25 3 30 5 50")
 
 
-    // Create index file for input BAM file
     Column_rearrange_1(genemeta, "gene_id")
     Column_rearrange_2(genemeta, "gene_id", "gene_name")
     mergeGeneFiles(

From 963203526aaec97ac2e36a37c83f07d0afd3ee29 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 15:55:06 +0000
Subject: [PATCH 025/159] Fixes lint

---
 main.nf | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index e8e1ea58..1b6147c5 100644
--- a/main.nf
+++ b/main.nf
@@ -480,8 +480,15 @@ workflow {
     neighbor_values = Channel.value("10 100 15 20 25 3 30 5 50")
 
 
-    Column_rearrange_1(genemeta, "gene_id")
-    Column_rearrange_2(genemeta, "gene_id", "gene_name")
+    Column_rearrange_1(
+        genemeta, 
+        "gene_id"
+    )
+    Column_rearrange_2(
+        genemeta, 
+        "gene_id", 
+        "gene_name"
+    )
     mergeGeneFiles(
         genes,
         Column_rearrange_2.out

From 527236102c575cea1d0ab1caeee09d62c1d4d5e0 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 16:01:35 +0000
Subject: [PATCH 026/159] Fixes input for neighbours_for_umap

---
 main.nf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 1b6147c5..625c4011 100644
--- a/main.nf
+++ b/main.nf
@@ -527,6 +527,7 @@ workflow {
     )
     neighbours_for_umap(
         harmony_batch.out,
-        pca_param
+        neighbor_values
     )
+
 }

From fae78f8549ac1dfc64a740f8978f55b37c3f568d Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 16:20:39 +0000
Subject: [PATCH 027/159] Populate run_tsne process and fixes typo in
 neighbours_for_umap

---
 main.nf | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index 625c4011..0bfce888 100644
--- a/main.nf
+++ b/main.nf
@@ -303,7 +303,7 @@ process neighbours_for_umap {
         path 'neighbours_*.h5ad'
     script:
     """
-        for i in $sample
+        for i in $n_neighbours
         do
             scanpy-neighbors \
                 --n-neighbors \$i \
@@ -414,12 +414,32 @@ process run_umap {
 
 process run_tsne {
     input:
-
+        path anndata
+        val pca_param
+        val perplexity_values
     output:
-
+        path 'neighbours_*.h5ad'
     script:
     """
-    """
+        for i in $perplexity_values
+        do
+            scanpy-run-tsne \
+            --use-rep $pca_param \
+            --export-embedding embeddings.tsv \
+            --perplexity \$i \
+            --key-added 'perplexity_\$i' \
+            --early-exaggeration '12.0' \
+            --learning-rate '400.0' \
+            --no-fast-tsne \
+            --random-state 1234  \
+            --input-format 'anndata' \
+            $anndata \
+            --show-obj stdout \
+            --output-format anndata \
+            'tsne\$i.h5ad'
+            # Not sure if following is needed
+            # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv
+        done
 }
 
 process filter_failed_umap {
@@ -475,7 +495,7 @@ workflow {
     pca_param = Channel.value('X_pca')
     celltype_field_param = Channel.value('NO_CELLTYPE_FIELD')
     batch_variable = Channel.value('')
-    perplexity_values = Channel.value(['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50'])
+    perplexity_values = Channel.value("1 5 10 15 20 25 30 35 40 45 50")
     resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0'])
     neighbor_values = Channel.value("10 100 15 20 25 3 30 5 50")
 
@@ -529,5 +549,9 @@ workflow {
         harmony_batch.out,
         neighbor_values
     )
-
+    run_tsne(
+        harmony_batch.out,
+        pca_param,
+        perplexity_values
+    )
 }

From e960f6e46510ef57e4b5882684dcf86d78518727 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 28 Oct 2024 16:40:52 +0000
Subject: [PATCH 028/159] Populate run_UMAP process

---
 main.nf | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index 0bfce888..7cfbfac9 100644
--- a/main.nf
+++ b/main.nf
@@ -404,11 +404,33 @@ process filtered_cellgroup_markers {
 
 process run_umap {
     input:
-
+        path anndata
     output:
-
+        path 'umap_*.h5ad'
     script:
     """
+        for i in $anndata
+        do
+            scanpy-run-umap \
+            --neighbors-key 'neighbors_\$i' \
+            --key-added 'neighbors_\$i' \
+            --export-embedding embeddings.tsv \
+            --n-components 2 \
+            --min-dist 0.5 \
+            --spread 1.0 \
+            --alpha 1.0 \
+            --gamma 1.0 \
+            --negative-sample-rate 5 \
+            --random-state 0 \
+            --init-pos 'spectral' \
+            --input-format 'anndata' \
+            \$i \
+            --show-obj stdout \
+            --output-format anndata \
+            'umap_\$i.h5ad'  
+            # Not sure if following is needed
+            # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv
+        done
     """
 }
 
@@ -418,7 +440,7 @@ process run_tsne {
         val pca_param
         val perplexity_values
     output:
-        path 'neighbours_*.h5ad'
+        path 'tsne_*.h5ad'
     script:
     """
         for i in $perplexity_values
@@ -436,7 +458,7 @@ process run_tsne {
             $anndata \
             --show-obj stdout \
             --output-format anndata \
-            'tsne\$i.h5ad'
+            'tsne_\$i.h5ad'
             # Not sure if following is needed
             # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv
         done

From 834271558e7281de403f86b04a60089426d1bf8a Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 30 Oct 2024 11:34:34 +0000
Subject: [PATCH 029/159] renames normalise_internal_data and adds container
 tag

---
 main.nf | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 7cfbfac9..9389342c 100644
--- a/main.nf
+++ b/main.nf
@@ -93,6 +93,8 @@ process scanpy_read_10x {
     output:
         path 'anndata.h5ad'
 
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     script:
     """
         #ln -s $matrix matrix.mtx
@@ -172,7 +174,7 @@ process normalise_data {
     """
 }
 
-process normalise_data_internal {
+process normalise_internal_data {
     input:
         path anndata
 
@@ -462,6 +464,7 @@ process run_tsne {
             # Not sure if following is needed
             # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv
         done
+    """
 }
 
 process filter_failed_umap {

From 3cc4066145a275bfe08f0a987710bf61b5b05072 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 31 Oct 2024 11:10:04 +0000
Subject: [PATCH 030/159] Update main.nf - fixing join command for needed
 output

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 9389342c..e43d7ea1 100644
--- a/main.nf
+++ b/main.nf
@@ -78,7 +78,7 @@ process mergeGeneFiles {
         sort -k1,1 "$filtered_genemeta" > sorted_genemeta.txt
         
         # Perform a left join to keep all data from gene file
-        join -a 1 -e 'NA' -t '\t' sorted_gene.txt sorted_genemeta.txt | cut -f1,4 > merged_genemeta.tsv
+        join -a 1 -t \$'\t' -o 0,1.2,2.2 sorted_gene.txt sorted_genemeta.txt | cut -f1,3 > merged_genemeta.tsv
     """
 }
 

From 0d014f1b642306ccf78549ac1aabd3250926dfc4 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 31 Oct 2024 11:11:31 +0000
Subject: [PATCH 031/159] Update main.nf - symlink genes.tsv locally

---
 main.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index e43d7ea1..0232ecd0 100644
--- a/main.nf
+++ b/main.nf
@@ -83,6 +83,8 @@ process mergeGeneFiles {
 }
 
 process scanpy_read_10x {
+        container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    
     input:
         path matrix
         path genes
@@ -93,12 +95,10 @@ process scanpy_read_10x {
     output:
         path 'anndata.h5ad'
 
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
-
     script:
     """
         #ln -s $matrix matrix.mtx
-        #ln -s $genes genes.tsv
+        ln -s $genes genes.tsv
         #ln -s $barcodes barcodes.tsv
         
         scanpy-read-10x --input-10x-mtx ./ \

From 279b43c67dd84f4dd4e6ce405492f211494fea8a Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 31 Oct 2024 11:15:36 +0000
Subject: [PATCH 032/159] Update main.nf - adding container tag and commenting
 param in filter_cell

---
 main.nf | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 0232ecd0..953c6b4f 100644
--- a/main.nf
+++ b/main.nf
@@ -83,7 +83,7 @@ process mergeGeneFiles {
 }
 
 process scanpy_read_10x {
-        container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:
         path matrix
@@ -112,6 +112,8 @@ process scanpy_read_10x {
 }
 
 process scanpy_filter_cells {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    
     input:
         path anndata
         path genes
@@ -124,7 +126,7 @@ process scanpy_filter_cells {
         scanpy-filter-cells --gene-name 'gene_symbols' \
         --param 'c:n_counts' 750.0 1000000000.0 \
         --param 'c:pct_counts_mito' 0.0 0.35 \
-        --category 'c:predicted_doublet' 'False' \
+        # --category 'c:predicted_doublet' 'False' \ # commenting temporary as error attribute not found
         --input-format 'anndata' $anndata \
         --show-obj stdout \
         --output-format anndata 'filtered_cell_anndata.h5ad' \
@@ -133,6 +135,8 @@ process scanpy_filter_cells {
 }
 
 process scanpy_filter_genes {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     input:
         path anndata
         path genes
@@ -155,6 +159,8 @@ process scanpy_filter_genes {
 }
 
 process normalise_data {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     input:
         path anndata
 

From f874a37f56b00b167fbc9ecbc32be2a7553ced85 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 31 Oct 2024 11:16:39 +0000
Subject: [PATCH 033/159] Update nextflow.config - adding singularity param

---
 nextflow.config | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index 1de8fc38..e6c5c1be 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,3 +1,5 @@
+singularity.enabled = true
+
 process {
     executor='slurm'
     queue="$SCXA_HPC_QUEUE"

From 33f70da311ea3e218521ac164540a33c8ceb846d Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 31 Oct 2024 11:22:28 +0000
Subject: [PATCH 034/159] Update main.nf - removing comment

---
 main.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/main.nf b/main.nf
index 953c6b4f..56230709 100644
--- a/main.nf
+++ b/main.nf
@@ -126,7 +126,6 @@ process scanpy_filter_cells {
         scanpy-filter-cells --gene-name 'gene_symbols' \
         --param 'c:n_counts' 750.0 1000000000.0 \
         --param 'c:pct_counts_mito' 0.0 0.35 \
-        # --category 'c:predicted_doublet' 'False' \ # commenting temporary as error attribute not found
         --input-format 'anndata' $anndata \
         --show-obj stdout \
         --output-format anndata 'filtered_cell_anndata.h5ad' \

From 1c924fba4b5006d3c4edfb415a9b64ea7a494397 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 31 Oct 2024 11:56:47 +0000
Subject: [PATCH 035/159] Update main.nf - adds container info, fixes
 `batch_variable`

---
 main.nf | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index 56230709..561a7e19 100644
--- a/main.nf
+++ b/main.nf
@@ -180,6 +180,8 @@ process normalise_data {
 }
 
 process normalise_internal_data {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    
     input:
         path anndata
 
@@ -198,6 +200,8 @@ process normalise_internal_data {
 }
 
 process find_variable_genes {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     input:
         path anndata
         val batch_variable
@@ -207,8 +211,9 @@ process find_variable_genes {
 
     script:
     """
-        if [[ -z "\$batch_variable" ]]; then
-            batch_variable = "--batch-key $batch_variable"
+        batch_variable_tag=""
+        if [[ -z "$batch_variable" ]]; then
+            batch_variable_tag="--batch-key $batch_variable"
         else
             batch_variable = ""
         fi
@@ -220,7 +225,7 @@ process find_variable_genes {
         --disp-limits 0.5 50.0 \
         --span 0.3 \
         --n-bins '20' \
-        $batch_variable \
+        \$batch_variable_tag \
         --input-format 'anndata' \
         $anndata \
         --show-obj stdout \
@@ -229,6 +234,8 @@ process find_variable_genes {
 }
 
 process run_pca {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     input:
         path anndata
 
@@ -250,6 +257,8 @@ process run_pca {
 }
 
 process harmony_batch {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     input:
         path anndata
         val batch_variable
@@ -258,7 +267,7 @@ process harmony_batch {
 
     script:
     """
-        if [[ -z "\$batch_variable" ]]; then
+        if [[ -z "$batch_variable" ]]; then
             scanpy-integrate harmony \
             --batch-key $batch_variable \
             --basis 'X_pca' \
@@ -278,6 +287,8 @@ process harmony_batch {
 }
 
 process neighbours {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     input:
         path anndata
         val pca_param
@@ -303,6 +314,8 @@ process neighbours {
 }
 
 process neighbours_for_umap {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     input:
         path anndata
         val n_neighbours
@@ -410,6 +423,8 @@ process filtered_cellgroup_markers {
 }
 
 process run_umap {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    
     input:
         path anndata
     output:
@@ -442,6 +457,8 @@ process run_umap {
 }
 
 process run_tsne {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    
     input:
         path anndata
         val pca_param

From 48f5f8d8cfe522090391ab85431f39a1c57f3b96 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 31 Oct 2024 14:04:41 +0000
Subject: [PATCH 036/159] Update main.nf - fixes `batch_variable` condition,
 adds pca_param in neighbors_for_umap

---
 main.nf | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index 561a7e19..3f1e1788 100644
--- a/main.nf
+++ b/main.nf
@@ -212,10 +212,8 @@ process find_variable_genes {
     script:
     """
         batch_variable_tag=""
-        if [[ -z "$batch_variable" ]]; then
+        if [[ -n "$batch_variable" ]]; then
             batch_variable_tag="--batch-key $batch_variable"
-        else
-            batch_variable = ""
         fi
 
 
@@ -267,7 +265,7 @@ process harmony_batch {
 
     script:
     """
-        if [[ -z "$batch_variable" ]]; then
+        if [[ -n "$batch_variable" ]]; then
             scanpy-integrate harmony \
             --batch-key $batch_variable \
             --basis 'X_pca' \
@@ -319,6 +317,7 @@ process neighbours_for_umap {
     input:
         path anndata
         val n_neighbours
+        val pca_param
     output:
         path 'neighbours_*.h5ad'
     script:
@@ -336,7 +335,7 @@ process neighbours_for_umap {
                 $anndata \
                 --show-obj stdout \
                 --output-format anndata \
-                'neighbours\$i.h5ad'
+                'neighbours_\$i.h5ad'
         done
 
     """
@@ -594,7 +593,8 @@ workflow {
     )
     neighbours_for_umap(
         harmony_batch.out,
-        neighbor_values
+        neighbor_values,
+        pca_param
     )
     run_tsne(
         harmony_batch.out,

From 3f3fecb677a382137de9d1d7f06e714039d075e2 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
Date: Fri, 1 Nov 2024 15:08:52 +0000
Subject: [PATCH 037/159] edit process run_umap

---
 main.nf | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/main.nf b/main.nf
index 3f1e1788..0dfd42e7 100644
--- a/main.nf
+++ b/main.nf
@@ -425,16 +425,14 @@ process run_umap {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:
-        path anndata
+        each anndata
     output:
         path 'umap_*.h5ad'
     script:
     """
-        for i in $anndata
-        do
             scanpy-run-umap \
-            --neighbors-key 'neighbors_\$i' \
-            --key-added 'neighbors_\$i' \
+            --neighbors-key 'neighbors_\$anndata' \
+            --key-added 'neighbors_\$anndata' \
             --export-embedding embeddings.tsv \
             --n-components 2 \
             --min-dist 0.5 \
@@ -445,13 +443,13 @@ process run_umap {
             --random-state 0 \
             --init-pos 'spectral' \
             --input-format 'anndata' \
-            \$i \
+            \$anndata \
             --show-obj stdout \
             --output-format anndata \
-            'umap_\$i.h5ad'  
+            'umap_\$anndata.h5ad'  
             # Not sure if following is needed
             # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv
-        done
+
     """
 }
 

From a94177aa33543da04e8d9d415ffe63ed60717c2d Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
Date: Fri, 1 Nov 2024 15:15:47 +0000
Subject: [PATCH 038/159] edit process run_tsne

---
 main.nf | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/main.nf b/main.nf
index 0dfd42e7..ed3a52d5 100644
--- a/main.nf
+++ b/main.nf
@@ -459,18 +459,16 @@ process run_tsne {
     input:
         path anndata
         val pca_param
-        val perplexity_values
+        each perplexity_values
     output:
         path 'tsne_*.h5ad'
     script:
     """
-        for i in $perplexity_values
-        do
             scanpy-run-tsne \
             --use-rep $pca_param \
             --export-embedding embeddings.tsv \
-            --perplexity \$i \
-            --key-added 'perplexity_\$i' \
+            --perplexity \$perplexity_values \
+            --key-added 'perplexity_\$perplexity_values' \
             --early-exaggeration '12.0' \
             --learning-rate '400.0' \
             --no-fast-tsne \
@@ -479,10 +477,9 @@ process run_tsne {
             $anndata \
             --show-obj stdout \
             --output-format anndata \
-            'tsne_\$i.h5ad'
+            'tsne_\$perplexity_values.h5ad'
             # Not sure if following is needed
             # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv
-        done
     """
 }
 

From 12efb956f234248730866c7ed8c2b33ef59e5bec Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
Date: Fri, 1 Nov 2024 16:43:51 +0000
Subject: [PATCH 039/159] edit process neighbours_for_umap

and add Dynamic Memory Allocation
---
 main.nf | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index ed3a52d5..f1a19232 100644
--- a/main.nf
+++ b/main.nf
@@ -314,18 +314,20 @@ process neighbours {
 process neighbours_for_umap {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
+    errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+    memory { 4.GB * task.attempt }
+    maxRetries 3
+
     input:
         path anndata
-        val n_neighbours
+        each n_neighbours
         val pca_param
     output:
         path 'neighbours_*.h5ad'
     script:
     """
-        for i in $n_neighbours
-        do
             scanpy-neighbors \
-                --n-neighbors \$i \
+                --n-neighbors \$n_neighbours \
                 --method 'umap' \
                 --metric 'euclidean' \
                 --random-state '0' \
@@ -335,8 +337,7 @@ process neighbours_for_umap {
                 $anndata \
                 --show-obj stdout \
                 --output-format anndata \
-                'neighbours_\$i.h5ad'
-        done
+                'neighbours_\${n_neighbours}.h5ad'
 
     """
 }

From 86de8ade73fca5ff1d9de44b1033ddf31cd63179 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 10:43:02 +0000
Subject: [PATCH 040/159] Update main.nf - fixes parallel run for
 neighbours_for_umap

---
 main.nf | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/main.nf b/main.nf
index f1a19232..77ffa6c1 100644
--- a/main.nf
+++ b/main.nf
@@ -2,6 +2,8 @@
 
 nextflow.enable.dsl=2
 
+params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50']
+
 /*
  * Column_rearrange_1: Only keeps the specified columns and removes header
  */
@@ -319,15 +321,14 @@ process neighbours_for_umap {
     maxRetries 3
 
     input:
-        path anndata
-        each n_neighbours
+        tuple path(anndata), val(n_neighbours)
         val pca_param
     output:
-        path 'neighbours_*.h5ad'
+        path "neighbours_${n_neighbours}.h5ad"
     script:
     """
             scanpy-neighbors \
-                --n-neighbors \$n_neighbours \
+                --n-neighbors $n_neighbours \
                 --method 'umap' \
                 --metric 'euclidean' \
                 --random-state '0' \
@@ -337,7 +338,7 @@ process neighbours_for_umap {
                 $anndata \
                 --show-obj stdout \
                 --output-format anndata \
-                'neighbours_\${n_neighbours}.h5ad'
+                'neighbours_${n_neighbours}.h5ad'
 
     """
 }
@@ -539,7 +540,7 @@ workflow {
     batch_variable = Channel.value('')
     perplexity_values = Channel.value("1 5 10 15 20 25 30 35 40 45 50")
     resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0'])
-    neighbor_values = Channel.value("10 100 15 20 25 3 30 5 50")
+    neighbors_ch = channel.fromList(params.neighbor_values)
 
 
     Column_rearrange_1(
@@ -588,8 +589,7 @@ workflow {
         pca_param
     )
     neighbours_for_umap(
-        harmony_batch.out,
-        neighbor_values,
+        harmony_batch.out.combine(neighbors_ch),
         pca_param
     )
     run_tsne(

From 8bf90e9814d81f51682162b61ae7423e67ac3a52 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 15:10:02 +0000
Subject: [PATCH 041/159] Update main.nf - fixes process run_tsne

---
 main.nf | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/main.nf b/main.nf
index 77ffa6c1..c88c6e2c 100644
--- a/main.nf
+++ b/main.nf
@@ -3,7 +3,7 @@
 nextflow.enable.dsl=2
 
 params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50']
-
+params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']
 /*
  * Column_rearrange_1: Only keeps the specified columns and removes header
  */
@@ -459,18 +459,17 @@ process run_tsne {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:
-        path anndata
+        tuple path(anndata), val(perplexity_values)
         val pca_param
-        each perplexity_values
     output:
-        path 'tsne_*.h5ad'
+        path 'tsne_${perplexity_values}.h5ad'
     script:
     """
             scanpy-run-tsne \
             --use-rep $pca_param \
             --export-embedding embeddings.tsv \
-            --perplexity \$perplexity_values \
-            --key-added 'perplexity_\$perplexity_values' \
+            --perplexity $perplexity_values \
+            --key-added 'perplexity_$perplexity_values' \
             --early-exaggeration '12.0' \
             --learning-rate '400.0' \
             --no-fast-tsne \
@@ -479,7 +478,7 @@ process run_tsne {
             $anndata \
             --show-obj stdout \
             --output-format anndata \
-            'tsne_\$perplexity_values.h5ad'
+            'tsne_${perplexity_values}.h5ad'
             # Not sure if following is needed
             # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv
     """
@@ -541,7 +540,7 @@ workflow {
     perplexity_values = Channel.value("1 5 10 15 20 25 30 35 40 45 50")
     resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0'])
     neighbors_ch = channel.fromList(params.neighbor_values)
-
+    perplexity_ch = channel.fromList(params.perplexity_values)
 
     Column_rearrange_1(
         genemeta, 
@@ -593,8 +592,7 @@ workflow {
         pca_param
     )
     run_tsne(
-        harmony_batch.out,
-        pca_param,
-        perplexity_values
+        harmony_batch.out.combine(perplexity_ch),
+        pca_param
     )
 }

From b4d4ca2161230f703fc44c62ed13e20045528356 Mon Sep 17 00:00:00 2001
From: fg_atlas <fg_atlas@codon-slurm-login-02.ebi.ac.uk>
Date: Mon, 4 Nov 2024 15:14:17 +0000
Subject: [PATCH 042/159] fixes output run-tsne

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index c88c6e2c..968e694a 100644
--- a/main.nf
+++ b/main.nf
@@ -462,7 +462,7 @@ process run_tsne {
         tuple path(anndata), val(perplexity_values)
         val pca_param
     output:
-        path 'tsne_${perplexity_values}.h5ad'
+        path "tsne_${perplexity_values}.h5ad"
     script:
     """
             scanpy-run-tsne \

From fbf688e5d34f82f8dbec95792d60c58b95db822f Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 15:29:53 +0000
Subject: [PATCH 043/159] Update main.nf - populate find_clusters process

---
 main.nf | 50 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/main.nf b/main.nf
index 968e694a..2b999dda 100644
--- a/main.nf
+++ b/main.nf
@@ -4,6 +4,8 @@ nextflow.enable.dsl=2
 
 params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50']
 params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']
+params.resolution_values = ['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']
+    
 /*
  * Column_rearrange_1: Only keeps the specified columns and removes header
  */
@@ -327,29 +329,43 @@ process neighbours_for_umap {
         path "neighbours_${n_neighbours}.h5ad"
     script:
     """
-            scanpy-neighbors \
-                --n-neighbors $n_neighbours \
-                --method 'umap' \
-                --metric 'euclidean' \
-                --random-state '0' \
-                --use-rep $pca_param \
-                --n-pcs '50' \
-                --input-format 'anndata' \
-                $anndata \
-                --show-obj stdout \
-                --output-format anndata \
-                'neighbours_${n_neighbours}.h5ad'
+        scanpy-neighbors \
+            --n-neighbors $n_neighbours \
+            --method 'umap' \
+            --metric 'euclidean' \
+            --random-state '0' \
+            --use-rep $pca_param \
+            --n-pcs '50' \
+            --input-format 'anndata' \
+            $anndata \
+            --show-obj stdout \
+            --output-format anndata \
+            'neighbours_${n_neighbours}.h5ad'
 
     """
 }
 
 process find_clusters {
-    input:
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
+    input:
+        tuple path(anndata), val(resolution)
     output:
-
+        path "clusters_${resolution}.h5ad"
     script:
     """
+        scanpy-find-cluster louvain \
+        --neighbors-key 'neighbors' \
+        --key-added 'louvain_resolution_${resolution}' \
+        --resolution ${resolution} \
+        --random-state '1234' \
+        --directed \
+        --export-cluster output.tsv \
+        --input-format 'anndata' \
+        input.h5 \
+        --show-obj stdout \
+        --output-format anndata \
+        'clusters_${resolution}.h5ad'
     """
 }
 
@@ -537,10 +553,9 @@ workflow {
     pca_param = Channel.value('X_pca')
     celltype_field_param = Channel.value('NO_CELLTYPE_FIELD')
     batch_variable = Channel.value('')
-    perplexity_values = Channel.value("1 5 10 15 20 25 30 35 40 45 50")
-    resolution_values = Channel.value(['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0'])
     neighbors_ch = channel.fromList(params.neighbor_values)
     perplexity_ch = channel.fromList(params.perplexity_values)
+    resolution_ch = channel.fromList(params.resolution_values)
 
     Column_rearrange_1(
         genemeta, 
@@ -595,4 +610,7 @@ workflow {
         harmony_batch.out.combine(perplexity_ch),
         pca_param
     )
+    find_clusters(
+        neighbours.out.combine(resolution_ch)
+    )
 }

From ad54b02e22809937289af596c4ed10a852e80218 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 15:32:49 +0000
Subject: [PATCH 044/159] Update main.nf - fixes input for find_clusters

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 2b999dda..1a9ef1fe 100644
--- a/main.nf
+++ b/main.nf
@@ -362,7 +362,7 @@ process find_clusters {
         --directed \
         --export-cluster output.tsv \
         --input-format 'anndata' \
-        input.h5 \
+        $anndata \
         --show-obj stdout \
         --output-format anndata \
         'clusters_${resolution}.h5ad'

From 310c5b228dd1f94bf24a8edbb22584a5782215a9 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Mon, 4 Nov 2024 16:15:10 +0000
Subject: [PATCH 045/159] add nf-core linting github action

---
 .github/workflows/nextflow-linter.yaml | 29 ++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 .github/workflows/nextflow-linter.yaml

diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml
new file mode 100644
index 00000000..5f7c0291
--- /dev/null
+++ b/.github/workflows/nextflow-linter.yaml
@@ -0,0 +1,29 @@
+name: nf-core linting
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+      
+      - name: Install nf-core tools
+        run: |
+          python -m pip install --upgrade pip
+          pip install nf-core
+
+      - name: Run nf-core lint
+        run: nf-core lint .

From 8e459d1be669b0f63b65b47de8e5ed1c9648e922 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Mon, 4 Nov 2024 16:21:11 +0000
Subject: [PATCH 046/159] fix lint command

---
 .github/workflows/nextflow-linter.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml
index 5f7c0291..73ac5b2f 100644
--- a/.github/workflows/nextflow-linter.yaml
+++ b/.github/workflows/nextflow-linter.yaml
@@ -12,18 +12,18 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - name: Checkout repository
+      - name: checkout repository
         uses: actions/checkout@v2
 
-      - name: Set up Python
+      - name: set up Python
         uses: actions/setup-python@v2
         with:
           python-version: '3.x'
       
-      - name: Install nf-core tools
+      - name: install nf-core tools
         run: |
           python -m pip install --upgrade pip
           pip install nf-core
 
-      - name: Run nf-core lint
-        run: nf-core lint .
+      - name: run nf-core lint
+        run: nf-core pipelines lint

From cc5ea04c6f2871351470a4c9bbca06d25026d05b Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Mon, 4 Nov 2024 16:27:40 +0000
Subject: [PATCH 047/159] add java and nextflow to github action

---
 .github/workflows/nextflow-linter.yaml | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml
index 73ac5b2f..f55b4e79 100644
--- a/.github/workflows/nextflow-linter.yaml
+++ b/.github/workflows/nextflow-linter.yaml
@@ -15,15 +15,31 @@ jobs:
       - name: checkout repository
         uses: actions/checkout@v2
 
+      - uses: actions/setup-java@v2
+        with:
+          distribution: 'adopt'
+          java-version: '11'
+
+      - name: install Nextflow
+        run: |
+          wget -qO- https://get.nextflow.io | bash
+          chmod +x nextflow
+          mkdir -p $HOME/.local/bin
+          mv nextflow $HOME/.local/bin/
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
       - name: set up Python
         uses: actions/setup-python@v2
         with:
           python-version: '3.x'
-      
+
       - name: install nf-core tools
         run: |
           python -m pip install --upgrade pip
           pip install nf-core
 
+      - name: check Nextflow version
+        run: nextflow -version
+
       - name: run nf-core lint
         run: nf-core pipelines lint

From 16519762eaa5ce62f5d965ab5a0846ae2ddf05df Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 16:32:32 +0000
Subject: [PATCH 048/159] Update main.nf - adds more params and logs

---
 main.nf | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 1a9ef1fe..73138dbb 100644
--- a/main.nf
+++ b/main.nf
@@ -2,10 +2,29 @@
 
 nextflow.enable.dsl=2
 
+params.celltype_field = 'NO_CELLTYPE_FIELD'
 params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50']
 params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']
 params.resolution_values = ['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']
-    
+params.slotname = "louvain_resolution"
+params.clustering_slotname = params.resolution_values.collect { params.slotname + "_" + it }
+params.merged_group_slotname = params.clustering_slotname + params.celltype_field
+
+
+log.info """
+===============================
+WORKFLOW PARAMETER VALUES
+===============================
+celltype_field: ${params.celltype_field}
+neighbor_values: ${params.neighbor_values}
+perplexity_values: ${params.perplexity_values}
+resolution_values: ${params.resolution_values}
+slotname: ${params.slotname}
+clustering_slotname: ${params.clustering_slotname}
+merged_group_slotname: ${params.merged_group_slotname}
+===============================
+"""
+
 /*
  * Column_rearrange_1: Only keeps the specified columns and removes header
  */
@@ -551,7 +570,6 @@ workflow {
     matrix = Channel.fromPath('matrix.mtx')
     cellmeta = Channel.fromPath('cell_metadata.tsv')
     pca_param = Channel.value('X_pca')
-    celltype_field_param = Channel.value('NO_CELLTYPE_FIELD')
     batch_variable = Channel.value('')
     neighbors_ch = channel.fromList(params.neighbor_values)
     perplexity_ch = channel.fromList(params.perplexity_values)

From 192ba0c0bc3fe9efee0c86bc9bddaea6eebfde2a Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 16:33:16 +0000
Subject: [PATCH 049/159] Update nextflow.config

Co-authored-by: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
---
 nextflow.config | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index e6c5c1be..6dce3ed4 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,4 +1,3 @@
-singularity.enabled = true
 
 process {
     executor='slurm'
@@ -11,6 +10,11 @@ process {
     pollInterval = '5sec'
 }
 
+singularity {
+    enabled = true
+    cacheDir = "$SCXA_SINGULARITY_CACHE"
+}
+
 conda {
     cacheDir = "$SCXA_WORKFLOW_ROOT/envs"
     createTimeout = "30 min"

From 2a26c09fe70ca05eb52ad1a4fb66ec4b7ea8b13e Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Mon, 4 Nov 2024 16:39:22 +0000
Subject: [PATCH 050/159] Move away from nf-core pipeline check, we only want to
 check the syntax in main.nf

---
 .github/workflows/nextflow-linter.yaml | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml
index f55b4e79..e9f5b597 100644
--- a/.github/workflows/nextflow-linter.yaml
+++ b/.github/workflows/nextflow-linter.yaml
@@ -28,18 +28,8 @@ jobs:
           mv nextflow $HOME/.local/bin/
           echo "$HOME/.local/bin" >> $GITHUB_PATH
 
-      - name: set up Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: '3.x'
-
-      - name: install nf-core tools
-        run: |
-          python -m pip install --upgrade pip
-          pip install nf-core
-
       - name: check Nextflow version
         run: nextflow -version
 
-      - name: run nf-core lint
-        run: nf-core pipelines lint
+      - name: check syntax in main.nf
+        run: nextflow -quiet validate main.nf

From 5591c4488eaddd5e09cf48411ae0944884b22884 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 16:44:16 +0000
Subject: [PATCH 051/159] Update main.nf - ignores failed run and keeps only
 successful run

---
 main.nf | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 73138dbb..ca488947 100644
--- a/main.nf
+++ b/main.nf
@@ -459,6 +459,8 @@ process filtered_cellgroup_markers {
 }
 
 process run_umap {
+    errorStrategy 'ignore'
+
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:
@@ -491,6 +493,8 @@ process run_umap {
 }
 
 process run_tsne {
+    errorStrategy 'ignore'
+    
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:
@@ -624,10 +628,19 @@ workflow {
         harmony_batch.out.combine(neighbors_ch),
         pca_param
     )
-    run_tsne(
+    TNSEs_ch = run_tsne(
         harmony_batch.out.combine(perplexity_ch),
         pca_param
     )
+    TNSEs_ch
+        .filter { it.exitStatus == 0 }
+
+    UMAPs_ch = run_tsne(
+        neighbours_for_umap.out,
+        pca_param
+    )
+    UMAPs_ch
+        .filter { it.exitStatus == 0 }
     find_clusters(
         neighbours.out.combine(resolution_ch)
     )

From 1900933bc5e23ff88eba31cd3ff33c771f5c354a Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Mon, 4 Nov 2024 16:47:24 +0000
Subject: [PATCH 052/159] fix nextflow syntax check

---
 .github/workflows/nextflow-linter.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml
index e9f5b597..91fdc140 100644
--- a/.github/workflows/nextflow-linter.yaml
+++ b/.github/workflows/nextflow-linter.yaml
@@ -32,4 +32,4 @@ jobs:
         run: nextflow -version
 
       - name: check syntax in main.nf
-        run: nextflow -quiet validate main.nf
+        run: nextflow main.nf -preview || (echo "Syntax check failed. Please review the error message above." && exit 1)

From f184945589fa455158428890d72cf62887d82a66 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 16:47:51 +0000
Subject: [PATCH 053/159] Update main.nf

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index ca488947..dd86df5b 100644
--- a/main.nf
+++ b/main.nf
@@ -635,7 +635,7 @@ workflow {
     TNSEs_ch
         .filter { it.exitStatus == 0 }
 
-    UMAPs_ch = run_tsne(
+    UMAPs_ch = run_umap(
         neighbours_for_umap.out,
         pca_param
     )

From e0966f9436d45b273694e53fb0d0b4d00b29dbb9 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 16:49:41 +0000
Subject: [PATCH 054/159] Update main.nf

---
 main.nf | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index dd86df5b..5a01ac07 100644
--- a/main.nf
+++ b/main.nf
@@ -636,8 +636,7 @@ workflow {
         .filter { it.exitStatus == 0 }
 
     UMAPs_ch = run_umap(
-        neighbours_for_umap.out,
-        pca_param
+        neighbours_for_umap.out
     )
     UMAPs_ch
         .filter { it.exitStatus == 0 }

From 27359f69b2192e08de713bdd19b1f65b04f995c0 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 16:52:05 +0000
Subject: [PATCH 055/159] Update main.nf - comments out errorStrategy 'ignore' and exitStatus filters

---
 main.nf | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index 5a01ac07..f3d728ca 100644
--- a/main.nf
+++ b/main.nf
@@ -459,7 +459,7 @@ process filtered_cellgroup_markers {
 }
 
 process run_umap {
-    errorStrategy 'ignore'
+    //errorStrategy 'ignore'
 
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
@@ -493,7 +493,7 @@ process run_umap {
 }
 
 process run_tsne {
-    errorStrategy 'ignore'
+    //errorStrategy 'ignore'
     
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
@@ -632,14 +632,14 @@ workflow {
         harmony_batch.out.combine(perplexity_ch),
         pca_param
     )
-    TNSEs_ch
-        .filter { it.exitStatus == 0 }
+    //TNSEs_ch
+    //    .filter { it.exitStatus == 0 }
 
     UMAPs_ch = run_umap(
         neighbours_for_umap.out
     )
-    UMAPs_ch
-        .filter { it.exitStatus == 0 }
+    //UMAPs_ch
+   //     .filter { it.exitStatus == 0 }
     find_clusters(
         neighbours.out.combine(resolution_ch)
     )

From 44b0e31ecb3f6635656953e81701a91848eda095 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 16:54:11 +0000
Subject: [PATCH 056/159] Update main.nf - fixed run_umap

---
 main.nf | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index f3d728ca..879f1672 100644
--- a/main.nf
+++ b/main.nf
@@ -470,8 +470,8 @@ process run_umap {
     script:
     """
             scanpy-run-umap \
-            --neighbors-key 'neighbors_\$anndata' \
-            --key-added 'neighbors_\$anndata' \
+            --neighbors-key 'neighbors_$anndata' \
+            --key-added 'neighbors_$anndata' \
             --export-embedding embeddings.tsv \
             --n-components 2 \
             --min-dist 0.5 \
@@ -482,10 +482,10 @@ process run_umap {
             --random-state 0 \
             --init-pos 'spectral' \
             --input-format 'anndata' \
-            \$anndata \
+            $anndata \
             --show-obj stdout \
             --output-format anndata \
-            'umap_\$anndata.h5ad'  
+            'umap_$anndata.h5ad'  
             # Not sure if following is needed
             # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv
 

From 79b2127ad02d35276455955562dce43292b5e0f5 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 16:56:52 +0000
Subject: [PATCH 057/159] Update main.nf - fixes run_umap

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 879f1672..928d08b5 100644
--- a/main.nf
+++ b/main.nf
@@ -464,7 +464,7 @@ process run_umap {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:
-        each anndata
+        path anndata
     output:
         path 'umap_*.h5ad'
     script:
@@ -636,7 +636,7 @@ workflow {
     //    .filter { it.exitStatus == 0 }
 
     UMAPs_ch = run_umap(
-        neighbours_for_umap.out
+        neighbours_for_umap.out.flatten()
     )
     //UMAPs_ch
    //     .filter { it.exitStatus == 0 }

From a8319c1ed2fff54358ad9e10bbfa0e8fa17d08a4 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 16:58:19 +0000
Subject: [PATCH 058/159] Update main.nf - fixes run_umap

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 928d08b5..a23ed4a2 100644
--- a/main.nf
+++ b/main.nf
@@ -466,7 +466,7 @@ process run_umap {
     input:
         path anndata
     output:
-        path 'umap_*.h5ad'
+        path "umap_*.h5ad"
     script:
     """
             scanpy-run-umap \
@@ -485,7 +485,7 @@ process run_umap {
             $anndata \
             --show-obj stdout \
             --output-format anndata \
-            'umap_$anndata.h5ad'  
+            'umap_${anndata}.h5ad'  
             # Not sure if following is needed
             # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv
 

From 06c13b60a28fa27d282a710884cb2aeb0d277d6b Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Mon, 4 Nov 2024 16:59:47 +0000
Subject: [PATCH 059/159] add simplified ci config

---
 .github/workflows/ci.config            | 15 +++++++++++++++
 .github/workflows/nextflow-linter.yaml |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ci.config

diff --git a/.github/workflows/ci.config b/.github/workflows/ci.config
new file mode 100644
index 00000000..27cc59ae
--- /dev/null
+++ b/.github/workflows/ci.config
@@ -0,0 +1,15 @@
+
+process {
+    executor='slurm'
+    time = '7 d'
+    memory = '4 GB'
+}
+
+singularity {
+    enabled = true
+}
+
+conda {
+    createTimeout = "30 min"
+    useMamba = true
+}
diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml
index 91fdc140..9ce16f71 100644
--- a/.github/workflows/nextflow-linter.yaml
+++ b/.github/workflows/nextflow-linter.yaml
@@ -32,4 +32,4 @@ jobs:
         run: nextflow -version
 
       - name: check syntax in main.nf
-        run: nextflow main.nf -preview || (echo "Syntax check failed. Please review the error message above." && exit 1)
+        run: nextflow main.nf -c .github/workflows/ci.config -preview || (echo "Syntax check failed. Please review the error message above." && exit 1)

From 8a50a175266a947a07471055e0d154ab6c05eda3 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Mon, 4 Nov 2024 17:03:16 +0000
Subject: [PATCH 060/159] disable CI for now to avoid noise

---
 .github/workflows/nextflow-linter.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml
index 9ce16f71..6bcc9eec 100644
--- a/.github/workflows/nextflow-linter.yaml
+++ b/.github/workflows/nextflow-linter.yaml
@@ -1,12 +1,12 @@
 name: nf-core linting
 
 on:
-  push:
-    branches:
-      - main
-  pull_request:
-    branches:
-      - main
+#  push:
+#    branches:
+#      - main
+#  pull_request:
+#    branches:
+#      - main
 
 jobs:
   lint:

From 19c66583ca8502c727701b6be10f0a68d64b1032 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 17:05:20 +0000
Subject: [PATCH 061/159] Update main.nf - adds parameter to
 neighbours_for_umap

---
 main.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.nf b/main.nf
index a23ed4a2..b022ab56 100644
--- a/main.nf
+++ b/main.nf
@@ -350,6 +350,7 @@ process neighbours_for_umap {
     """
         scanpy-neighbors \
             --n-neighbors $n_neighbours \
+            --key-added 'neighbors_n_neighbors_${n_neighbours}' 
             --method 'umap' \
             --metric 'euclidean' \
             --random-state '0' \

From 216a89ffce8f362b841c6d1287280d0d507fd046 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 17:07:12 +0000
Subject: [PATCH 062/159] Update main.nf - fixes neighbours_for_umap

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index b022ab56..a1096b6d 100644
--- a/main.nf
+++ b/main.nf
@@ -350,7 +350,7 @@ process neighbours_for_umap {
     """
         scanpy-neighbors \
             --n-neighbors $n_neighbours \
-            --key-added 'neighbors_n_neighbors_${n_neighbours}' 
+            --key-added 'neighbors_n_neighbors_${n_neighbours}' \
             --method 'umap' \
             --metric 'euclidean' \
             --random-state '0' \

From aa20bd6341dfaaea4ff02a111d7df8fbf6851a61 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 17:16:12 +0000
Subject: [PATCH 063/159] Update main.nf - fixes neighbor spelling as per
 script name

---
 main.nf | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/main.nf b/main.nf
index a1096b6d..884f50c8 100644
--- a/main.nf
+++ b/main.nf
@@ -307,14 +307,14 @@ process harmony_batch {
     """
 }
 
-process neighbours {
+process neighbors {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
         path anndata
         val pca_param
     output:
-        path 'neighbours.h5ad'
+        path 'neighbors.h5ad'
 
     script:
     """
@@ -329,12 +329,12 @@ process neighbours {
         $anndata \
         --show-obj stdout \
         --output-format anndata \
-        'neighbours.h5ad'
+        'neighbors.h5ad'
 
     """
 }
 
-process neighbours_for_umap {
+process neighbors_for_umap {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
@@ -342,15 +342,15 @@ process neighbours_for_umap {
     maxRetries 3
 
     input:
-        tuple path(anndata), val(n_neighbours)
+        tuple path(anndata), val(n_neighbors)
         val pca_param
     output:
-        path "neighbours_${n_neighbours}.h5ad"
+        path "neighbors_${n_neighbors}.h5ad"
     script:
     """
         scanpy-neighbors \
-            --n-neighbors $n_neighbours \
-            --key-added 'neighbors_n_neighbors_${n_neighbours}' \
+            --n-neighbors $n_neighbors \
+            --key-added 'neighbors_n_neighbors_${n_neighbors}' \
             --method 'umap' \
             --metric 'euclidean' \
             --random-state '0' \
@@ -360,7 +360,7 @@ process neighbours_for_umap {
             $anndata \
             --show-obj stdout \
             --output-format anndata \
-            'neighbours_${n_neighbours}.h5ad'
+            'neighbors_${n_neighbors}.h5ad'
 
     """
 }
@@ -621,11 +621,11 @@ workflow {
         run_pca.out,
         batch_variable
     )
-    neighbours(
+    neighbors(
         harmony_batch.out,
         pca_param
     )
-    neighbours_for_umap(
+    neighbors_for_umap(
         harmony_batch.out.combine(neighbors_ch),
         pca_param
     )
@@ -637,11 +637,11 @@ workflow {
     //    .filter { it.exitStatus == 0 }
 
     UMAPs_ch = run_umap(
-        neighbours_for_umap.out.flatten()
+        neighbors_for_umap.out.flatten()
     )
     //UMAPs_ch
    //     .filter { it.exitStatus == 0 }
     find_clusters(
-        neighbours.out.combine(resolution_ch)
+        neighbors.out.combine(resolution_ch)
     )
 }

From 5b5a48b848eb23c9ec115dc84cfb47c5a516aa68 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Mon, 4 Nov 2024 17:37:52 +0000
Subject: [PATCH 064/159] Update main.nf - fixes run_umap for neighbor key

---
 main.nf | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 884f50c8..d9573cb9 100644
--- a/main.nf
+++ b/main.nf
@@ -470,8 +470,9 @@ process run_umap {
         path "umap_*.h5ad"
     script:
     """
-            scanpy-run-umap \
-            --neighbors-key 'neighbors_$anndata' \
+        n_neighbor="${anndata/.h5ad/}"
+        scanpy-run-umap \
+            --neighbors-key 'neighbors_\$n_neighbor' \
             --key-added 'neighbors_$anndata' \
             --export-embedding embeddings.tsv \
             --n-components 2 \

From cf621e5b3ce659221822a1e11e601852ccee76bb Mon Sep 17 00:00:00 2001
From: fg_atlas <fg_atlas@codon-slurm-login-04.ebi.ac.uk>
Date: Tue, 5 Nov 2024 10:12:42 +0000
Subject: [PATCH 065/159] fixes umap

---
 main.nf | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index d9573cb9..27d6a63e 100644
--- a/main.nf
+++ b/main.nf
@@ -470,10 +470,12 @@ process run_umap {
         path "umap_*.h5ad"
     script:
     """
-        n_neighbor="${anndata/.h5ad/}"
-        scanpy-run-umap \
-            --neighbors-key 'neighbors_\$n_neighbor' \
-            --key-added 'neighbors_$anndata' \
+	VAR="$anndata"
+	n_number="\${VAR%.h5ad}"
+	echo \$n_number
+	scanpy-run-umap \
+            --neighbors-key "neighbors_n_\${n_number}" \
+            --key-added "neighbors_\${n_number}" \
             --export-embedding embeddings.tsv \
             --n-components 2 \
             --min-dist 0.5 \
@@ -487,7 +489,7 @@ process run_umap {
             $anndata \
             --show-obj stdout \
             --output-format anndata \
-            'umap_${anndata}.h5ad'  
+            "umap_\${n_number}.h5ad"  
             # Not sure if following is needed
             # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv
 

From 806953a97ecd35b5ce643ea573b731220f91ace9 Mon Sep 17 00:00:00 2001
From: fg_atlas <fg_atlas@codon-slurm-login-04.ebi.ac.uk>
Date: Tue, 5 Nov 2024 12:50:17 +0000
Subject: [PATCH 066/159] populates find_markers process

---
 main.nf         | 35 ++++++++++++++++++++++++++++++++---
 nextflow.config |  7 -------
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/main.nf b/main.nf
index 27d6a63e..e89e0f06 100644
--- a/main.nf
+++ b/main.nf
@@ -10,7 +10,6 @@ params.slotname = "louvain_resolution"
 params.clustering_slotname = params.resolution_values.collect { params.slotname + "_" + it }
 params.merged_group_slotname = params.clustering_slotname + params.celltype_field
 
-
 log.info """
 ===============================
 WORKFLOW PARAMETER VALUES
@@ -440,12 +439,28 @@ process build_list {
 }
 
 process find_markers {
+    errorStrategy 'ignore'
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     input:
-
+	tuple path(anndata), val(merged_group_slotname)
     output:
-
+	path "markers_${merged_group_slotname}.h5ad"
     script:
     """
+	scanpy-find-markers \
+	--save diffexp.tsv \
+	--n-genes '100' \
+	--groupby '${merged_group_slotname}' \
+	--key-added 'markers_${merged_group_slotname}' \
+	--method 'wilcoxon' \
+	--use-raw  \
+	--reference 'rest' \
+	--filter-params 'min_in_group_fraction:0.0,max_out_group_fraction:1.0,min_fold_change:1.0'  \
+	--input-format 'anndata' \
+	$anndata  \
+	--show-obj stdout \
+	--output-format anndata \
+	'markers_${merged_group_slotname}.h5ad'
     """
 }
 
@@ -582,6 +597,7 @@ workflow {
     neighbors_ch = channel.fromList(params.neighbor_values)
     perplexity_ch = channel.fromList(params.perplexity_values)
     resolution_ch = channel.fromList(params.resolution_values)
+    merged_group_slotname_ch = Channel.fromList(params.merged_group_slotname)
 
     Column_rearrange_1(
         genemeta, 
@@ -647,4 +663,17 @@ workflow {
     find_clusters(
         neighbors.out.combine(resolution_ch)
     )
+
+    // Combine the outputs of find_clusters and neighbors processes
+    combined_outputs = find_clusters.out.mix(neighbors.out)
+
+    processed_files = combined_outputs.map { file ->
+        // Extract the sample number from the file name
+        def sampleNumber = file.baseName.replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field)
+        [file, sampleNumber] // Create a tuple with sample number and file
+    }
+
+    find_markers(
+	processed_files
+    )
 }
diff --git a/nextflow.config b/nextflow.config
index 6dce3ed4..46bba393 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,8 +1,6 @@
-
 process {
     executor='slurm'
     queue="$SCXA_HPC_QUEUE"
-    clusterOptions="$SCXA_HPC_OPTIONS"
     time = '7 d'
     memory = '4 GB'
     queueSize=500
@@ -12,15 +10,10 @@ process {
 
 singularity {
     enabled = true
-    cacheDir = "$SCXA_SINGULARITY_CACHE"
 }
 
 conda {
-    cacheDir = "$SCXA_WORKFLOW_ROOT/envs"
     createTimeout = "30 min"
     useMamba = true
 }
 
-params {
-
-}

From 8f32220988afe10a088e41192295d9ab8f6fa45b Mon Sep 17 00:00:00 2001
From: fg_atlas <fg_atlas@codon-slurm-login-04.ebi.ac.uk>
Date: Tue, 5 Nov 2024 13:32:30 +0000
Subject: [PATCH 067/159] revert config changes

---
 nextflow.config | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index 46bba393..e09dcc5e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,6 +1,7 @@
 process {
     executor='slurm'
     queue="$SCXA_HPC_QUEUE"
+    clusterOptions="$SCXA_HPC_OPTIONS"
     time = '7 d'
     memory = '4 GB'
     queueSize=500
@@ -10,10 +11,15 @@ process {
 
 singularity {
     enabled = true
+    cacheDir = "$SCXA_SINGULARITY_CACHE"
 }
 
 conda {
+    cacheDir = "$SCXA_WORKFLOW_ROOT/envs"
     createTimeout = "30 min"
     useMamba = true
 }
 
+params {
+
+}

From bbfc882f3cae16d72a41417e71347aba0aad8200 Mon Sep 17 00:00:00 2001
From: fg_atlas <fg_atlas@codon-slurm-login-02.ebi.ac.uk>
Date: Tue, 5 Nov 2024 14:43:23 +0000
Subject: [PATCH 068/159] adds python script for final project process

---
 scripts/final_project.py | 616 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 616 insertions(+)
 create mode 100644 scripts/final_project.py

diff --git a/scripts/final_project.py b/scripts/final_project.py
new file mode 100644
index 00000000..0430187b
--- /dev/null
+++ b/scripts/final_project.py
@@ -0,0 +1,616 @@
+import sys
+import scanpy as sc
+import anndata
+from numpy import all
+import logging
+
+adata = sc.read('input.h5')
+
+# gene_name: adata.var attribute read below as gene names; qc_vars: QC variables (an empty list skips calculate_qc_metrics)
+gene_name = 'index'
+qc_vars = list()
+
+
+
+gene_names = getattr(adata.var, gene_name)
+
+
+ad_s = sc.read('r_source.h5')
+if not all(adata.obs.index.isin(ad_s.obs.index)):
+  logging.error("Specified object for .raw must contain all .obs from main object.")
+  sys.exit(1)
+else:
+  adata.raw = ad_s[adata.obs.index]
+del ad_s
+
+ad_s = sc.read('x_source_0.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  if "filtered" == '':
+    logging.error("%sth destination layer for %sth X source not specified" % ("0", "0"))
+    sys.exit(1)
+  adata.layers["filtered"] = ad_s.X
+else:
+  logging.error("X source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('x_source_1.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  if "normalised" == '':
+    logging.error("%sth destination layer for %sth X source not specified" % ("1", "1"))
+    sys.exit(1)
+  adata.layers["normalised"] = ad_s.X
+else:
+  logging.error("X source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+
+
+ad_s = sc.read('obs_source_0.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.obs:
+        suffix = "_0"
+
+    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
+    if k_to_copy in ad_s.uns.keys():
+      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Observation source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('obs_source_1.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.obs:
+        suffix = "_1"
+
+    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
+    if k_to_copy in ad_s.uns.keys():
+      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Observation source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('obs_source_2.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.obs:
+        suffix = "_2"
+
+    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
+    if k_to_copy in ad_s.uns.keys():
+      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Observation source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('obs_source_3.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.obs:
+        suffix = "_3"
+
+    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
+    if k_to_copy in ad_s.uns.keys():
+      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Observation source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('obs_source_4.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.obs:
+        suffix = "_4"
+
+    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
+    if k_to_copy in ad_s.uns.keys():
+      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Observation source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('obs_source_5.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.obs:
+        suffix = "_5"
+
+    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
+    if k_to_copy in ad_s.uns.keys():
+      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Observation source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('obs_source_6.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.obs:
+        suffix = "_6"
+
+    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
+    if k_to_copy in ad_s.uns.keys():
+      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Observation source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('obs_source_7.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.obs:
+        suffix = "_7"
+
+    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
+    if k_to_copy in ad_s.uns.keys():
+      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Observation source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('obs_source_8.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.obs:
+        suffix = "_8"
+
+    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
+    if k_to_copy in ad_s.uns.keys():
+      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Observation source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+
+
+ad_s = sc.read('embedding_source_0.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_0"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_0"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_1.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_1"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_1"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_2.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_2"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_2"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_3.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_3"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_3"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_4.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_4"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_4"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_5.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_5"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_5"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_6.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_6"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_6"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_7.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_7"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_7"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_8.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_8"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_8"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_9.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_9"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_9"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 9 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_10.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_10"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_10"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 10 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_11.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_11"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_11"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 11 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_12.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_12"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_12"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 12 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_13.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_13"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_13"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 13 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_14.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_14"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_14"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 14 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_15.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_15"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_15"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 15 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('embedding_source_16.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_16"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+  for k_to_copy in keys_to_copy:
+    suffix = ''
+    if k_to_copy in adata.obsm:
+        suffix = "_16"
+    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
+else:
+  logging.error("Embedding source 16 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+
+ad_s = sc.read('uns_source_0.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.uns:
+        suffix="_0"
+    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Uns source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('uns_source_1.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.uns:
+        suffix="_1"
+    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Uns source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('uns_source_2.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.uns:
+        suffix="_2"
+    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Uns source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('uns_source_3.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.uns:
+        suffix="_3"
+    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Uns source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('uns_source_4.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.uns:
+        suffix="_4"
+    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Uns source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('uns_source_5.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.uns:
+        suffix="_5"
+    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Uns source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('uns_source_6.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.uns:
+        suffix="_6"
+    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Uns source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('uns_source_7.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.uns:
+        suffix="_7"
+    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Uns source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+ad_s = sc.read('uns_source_8.h5')
+if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+  for k_to_copy in keys_to_copy:
+    suffix=''
+    if k_to_copy in adata.uns:
+        suffix="_8"
+    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+else:
+  logging.error("Uns source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+  sys.exit(1)
+del ad_s
+
+
+if len(qc_vars) > 0:
+    pct_top = [50]
+    sc.pp.calculate_qc_metrics(adata, qc_vars=qc_vars, percent_top=pct_top, inplace=True)
+
+if 'n_genes' not in adata.obs.columns:
+    sc.pp.filter_cells(adata, min_genes=0)
+if 'n_counts' not in adata.obs.columns:
+    sc.pp.filter_cells(adata, min_counts=0)
+if 'n_cells' not in adata.var.columns:
+    sc.pp.filter_genes(adata, min_cells=0)
+if 'n_counts' not in adata.var.columns:
+    sc.pp.filter_genes(adata, min_counts=0)
+
+adata.write('output.h5', compression='gzip')
+    
\ No newline at end of file

From b9b11890a4223d8c1d331a8676362cb9ce6257b0 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 5 Nov 2024 14:52:53 +0000
Subject: [PATCH 069/159] Update nextflow-linter.yaml - adding back changes
 pushed by @pmadrgal

---
 .github/workflows/nextflow-linter.yaml | 44 +++++++++++++++++++-------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/nextflow-linter.yaml b/.github/workflows/nextflow-linter.yaml
index 6bcc9eec..7b68f59b 100644
--- a/.github/workflows/nextflow-linter.yaml
+++ b/.github/workflows/nextflow-linter.yaml
@@ -1,20 +1,17 @@
 name: nf-core linting
-
 on:
-#  push:
-#    branches:
-#      - main
-#  pull_request:
-#    branches:
-#      - main
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
 
 jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - name: checkout repository
-        uses: actions/checkout@v2
-
+      - uses: actions/checkout@v2
       - uses: actions/setup-java@v2
         with:
           distribution: 'adopt'
@@ -28,8 +25,31 @@ jobs:
           mv nextflow $HOME/.local/bin/
           echo "$HOME/.local/bin" >> $GITHUB_PATH
 
+      - name: set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+
+      - name: install nf-core tools
+        run: |
+          python -m pip install --upgrade pip
+          pip install nf-core
+          
       - name: check Nextflow version
         run: nextflow -version
 
-      - name: check syntax in main.nf
-        run: nextflow main.nf -c .github/workflows/ci.config -preview || (echo "Syntax check failed. Please review the error message above." && exit 1)
+      # https://nf-co.re/tools/docs/latest/pipeline_lint_tests/
+      - name: create .nf-core.yml
+        run: |
+          cat << EOF > .nf-core.yml
+          repository_type: pipeline
+          lint:
+            actions_awsfulltest: False
+            actions_awstest: False
+            multiqc_config: False
+            schema_lint: False
+            schema_params: False
+          EOF
+
+      - name: run nf-core lint
+        run: nf-core pipelines lint --dir .

From d2b4a38ec8fb0fe1d938a1dcdd9f3ca8c0631684 Mon Sep 17 00:00:00 2001
From: fg_atlas <fg_atlas@codon-slurm-login-02.ebi.ac.uk>
Date: Tue, 5 Nov 2024 15:17:56 +0000
Subject: [PATCH 070/159] Fixed filter_gene and filter_cell connections

---
 main.nf | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index e89e0f06..fc09298c 100644
--- a/main.nf
+++ b/main.nf
@@ -138,7 +138,6 @@ process scanpy_filter_cells {
     
     input:
         path anndata
-        path genes
 
     output:
         path 'filtered_cell_anndata.h5ad'
@@ -621,13 +620,16 @@ workflow {
     )
     scanpy_filter_cells(
         scanpy_read_10x.out,
+    )
+    scanpy_filter_genes(
+        scanpy_filter_cells.out,
         Column_rearrange_1.out[0]
     )
     normalise_data(
-        scanpy_filter_cells.out
+        scanpy_filter_genes.out
     )
     normalise_internal_data(
-        scanpy_filter_cells.out
+        scanpy_filter_genes.out
     )
     find_variable_genes(
         normalise_internal_data.out,

From 4b90e9f69f435d4e61ff58f29f15ea9d0611e03d Mon Sep 17 00:00:00 2001
From: fg_atlas <fg_atlas@codon-slurm-login-02.ebi.ac.uk>
Date: Tue, 5 Nov 2024 16:43:28 +0000
Subject: [PATCH 071/159] changes permission

---
 scripts/final_project.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 mode change 100644 => 100755 scripts/final_project.py

diff --git a/scripts/final_project.py b/scripts/final_project.py
old mode 100644
new mode 100755
index 0430187b..0153f87b
--- a/scripts/final_project.py
+++ b/scripts/final_project.py
@@ -613,4 +613,4 @@
     sc.pp.filter_genes(adata, min_counts=0)
 
 adata.write('output.h5', compression='gzip')
-    
\ No newline at end of file
+    

From 4961d5f0c15c0295ae53c25286f8c48be3ebf4b5 Mon Sep 17 00:00:00 2001
From: fg_atlas <fg_atlas@codon-slurm-login-02.ebi.ac.uk>
Date: Tue, 5 Nov 2024 16:46:10 +0000
Subject: [PATCH 072/159] populate make_project_file process

---
 main.nf | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index fc09298c..28920649 100644
--- a/main.nf
+++ b/main.nf
@@ -575,11 +575,34 @@ process merge_embeddings {
 
 process make_project_file {
     input:
-
+	path neighbors
+	path scanpy_read_10x
+	path filter_genes
+	path normalise_data
+	path find_markers
+	path TNSEs_mix_UMAPs
     output:
-
+	path "output.h5"
     script:
     """
+	ln -s $neighbors input.h5
+	ln -s $scanpy_read_10x r_source.h5
+		ln -s '$filter_genes' x_source_0.h5
+		ln -s '$normalise_data' x_source_1.h5
+	count=0
+	for i in $find_markers
+	do
+		ln -s "\${i}" obs_source_\${count}.h5
+		ln -s "\${i}" uns_source_\${count}.h5
+		count=\$((count + 1)) # not ((count++)): that returns status 1 when count is 0 and aborts under bash -e
+	done
+	count=0
+	for i in $TNSEs_mix_UMAPs
+	do
+		ln -s "\${i}" embedding_source_\${count}.h5
+		count=\$((count + 1)) # see note in the loop above
+	done
+	python ${projectDir}/scripts/final_project.py # tasks run in their own work dir, so the script path must be absolute
     """
 }
 
@@ -678,4 +701,12 @@ workflow {
     find_markers(
 	processed_files
     )
+    make_project_file(
+	neighbors.out,
+	scanpy_read_10x.out,
+	scanpy_filter_genes.out,
+	normalise_data.out,
+	find_markers.out.collect(),
+	TNSEs_ch.mix(UMAPs_ch).collect()
+    )
 }

From d02f1a32ad19bebe38ff4f79d2bb0ddf8e972256 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 5 Nov 2024 17:09:48 +0000
Subject: [PATCH 073/159] Update final_project.py - removing redundant code

---
 scripts/final_project.py | 336 ++++-----------------------------------
 1 file changed, 29 insertions(+), 307 deletions(-)

diff --git a/scripts/final_project.py b/scripts/final_project.py
index 0153f87b..a14edb69 100755
--- a/scripts/final_project.py
+++ b/scripts/final_project.py
@@ -3,6 +3,7 @@
 import anndata
 from numpy import all
 import logging
+import os
 
 adata = sc.read('input.h5')
 
@@ -13,7 +14,8 @@
 
 
 gene_names = getattr(adata.var, gene_name)
-
+# Define the directory containing your source files
+source_dir = '.'  # Adjust to the appropriate path
 
 ad_s = sc.read('r_source.h5')
 if not all(adata.obs.index.isin(ad_s.obs.index)):
@@ -182,312 +184,32 @@
 del ad_s
 
 
-ad_s = sc.read('embedding_source_0.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_0"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_0"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_1.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_1"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_1"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_2.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_2"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_2"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_3.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_3"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_3"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_4.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_4"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_4"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_5.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_5"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_5"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_6.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_6"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_6"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_7.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_7"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_7"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_8.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_8"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_8"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_9.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_9"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_9"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 9 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_10.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_10"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_10"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 10 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_11.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_11"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_11"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 11 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_12.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_12"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_12"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 12 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_13.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_13"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_13"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 13 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_14.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_14"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_14"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 14 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_15.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_15"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_15"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 15 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('embedding_source_16.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_16"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-  keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-  for k_to_copy in keys_to_copy:
-    suffix = ''
-    if k_to_copy in adata.obsm:
-        suffix = "_16"
-    adata.obsm[k_to_copy+suffix] = ad_s.obsm[k_to_copy]
-else:
-  logging.error("Embedding source 16 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
+
+
+embedding_sources = [file for file in os.listdir(source_dir) if file.startswith('embedding_source_') and file.endswith('.h5')]
+
+for idx, embedding_file in enumerate(sorted(embedding_sources)):
+    ad_s = sc.read(os.path.join(source_dir, embedding_file))
+    if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+        # Copy tsne embeddings
+        keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+        for k_to_copy in keys_to_copy:
+          suffix = ''
+          if k_to_copy in adata.obsm:
+            suffix = f"_{idx}"
+          adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy]
+        # Copy umap embeddings
+        keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+        for k_to_copy in keys_to_copy:
+          suffix = ''
+          if k_to_copy in adata.obsm
+            suffix = f"_{idx}"
+          adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy]
+    else:
+        logging.error(f"Embedding source {idx} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+        sys.exit(1)
+    del ad_s
+
 
 ad_s = sc.read('uns_source_0.h5')
 if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):

From a010ff54f02bb0c7c41fb33cb7775bd4f299366a Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 5 Nov 2024 17:18:36 +0000
Subject: [PATCH 074/159] Update final_project.py - removes redundant code for
 uns_source files

---
 scripts/final_project.py | 150 ++++++++-------------------------------
 1 file changed, 28 insertions(+), 122 deletions(-)

diff --git a/scripts/final_project.py b/scripts/final_project.py
index a14edb69..b7871e9c 100755
--- a/scripts/final_project.py
+++ b/scripts/final_project.py
@@ -191,134 +191,40 @@
 for idx, embedding_file in enumerate(sorted(embedding_sources)):
     ad_s = sc.read(os.path.join(source_dir, embedding_file))
     if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-        # Copy tsne embeddings
-        keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
-        for k_to_copy in keys_to_copy:
-          suffix = ''
-          if k_to_copy in adata.obsm:
-            suffix = f"_{idx}"
-          adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy]
-        # Copy umap embeddings
-        keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
-        for k_to_copy in keys_to_copy:
-          suffix = ''
-          if k_to_copy in adata.obsm
-            suffix = f"_{idx}"
-          adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy]
+          # Copy tsne embeddings
+          keys_to_copy = (k for k in ad_s.obsm.keys() if "tsne" in k)
+          for k_to_copy in keys_to_copy:
+              suffix = ''
+              if k_to_copy in adata.obsm:
+                  suffix = f"_{idx}"
+              adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy]
+          # Copy umap embeddings
+          keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
+          for k_to_copy in keys_to_copy:
+              suffix = ''
+              if k_to_copy in adata.obsm
+                  suffix = f"_{idx}"
+              adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy]
     else:
         logging.error(f"Embedding source {idx} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
         sys.exit(1)
     del ad_s
 
 
-ad_s = sc.read('uns_source_0.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.uns:
-        suffix="_0"
-    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Uns source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('uns_source_1.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.uns:
-        suffix="_1"
-    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Uns source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('uns_source_2.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.uns:
-        suffix="_2"
-    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Uns source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('uns_source_3.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.uns:
-        suffix="_3"
-    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Uns source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('uns_source_4.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.uns:
-        suffix="_4"
-    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Uns source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('uns_source_5.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.uns:
-        suffix="_5"
-    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Uns source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('uns_source_6.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.uns:
-        suffix="_6"
-    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Uns source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('uns_source_7.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.uns:
-        suffix="_7"
-    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Uns source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('uns_source_8.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.uns:
-        suffix="_8"
-    adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Uns source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
+uns_sources = [file for file in os.listdir(source_dir) if file.startswith('uns_source_') and file.endswith('.h5')]
+for idx, uns_file in enumerate(sorted(uns_sources)):
+  ad_s = sc.read(os.path.join(source_dir, uns_file))
+  if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+    keys_to_copy = (k for k in ad_s.uns.keys() if "marker" in k)
+    for k_to_copy in keys_to_copy:
+      suffix=''
+      if k_to_copy in adata.uns:
+          suffix=f"_{idx}"
+      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+  else:
+    logging.error(f"Uns source {idx} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+    sys.exit(1)
+  del ad_s
 
 
 if len(qc_vars) > 0:

From eb5a6909df7d28d6fd52c3bf5cd66c85be841a33 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 6 Nov 2024 10:24:27 +0000
Subject: [PATCH 075/159] Update final_project.py - removes redundant
 obs_source operations

---
 scripts/final_project.py | 154 +++++----------------------------------
 1 file changed, 17 insertions(+), 137 deletions(-)

diff --git a/scripts/final_project.py b/scripts/final_project.py
index b7871e9c..b2e7a9c3 100755
--- a/scripts/final_project.py
+++ b/scripts/final_project.py
@@ -46,144 +46,24 @@
   sys.exit(1)
 del ad_s
 
+obs_sources = [file for file in os.listdir(source_dir) if file.startswith('obs_source_') and file.endswith('.h5')]
 
-ad_s = sc.read('obs_source_0.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.obs:
-        suffix = "_0"
-
-    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
-    if k_to_copy in ad_s.uns.keys():
-      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Observation source 0 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('obs_source_1.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.obs:
-        suffix = "_1"
-
-    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
-    if k_to_copy in ad_s.uns.keys():
-      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Observation source 1 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('obs_source_2.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.obs:
-        suffix = "_2"
-
-    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
-    if k_to_copy in ad_s.uns.keys():
-      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Observation source 2 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('obs_source_3.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.obs:
-        suffix = "_3"
-
-    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
-    if k_to_copy in ad_s.uns.keys():
-      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Observation source 3 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('obs_source_4.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.obs:
-        suffix = "_4"
-
-    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
-    if k_to_copy in ad_s.uns.keys():
-      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Observation source 4 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('obs_source_5.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.obs:
-        suffix = "_5"
-
-    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
-    if k_to_copy in ad_s.uns.keys():
-      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Observation source 5 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('obs_source_6.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.obs:
-        suffix = "_6"
-
-    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
-    if k_to_copy in ad_s.uns.keys():
-      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Observation source 6 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('obs_source_7.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.obs:
-        suffix = "_7"
-
-    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
-    if k_to_copy in ad_s.uns.keys():
-      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Observation source 7 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-ad_s = sc.read('obs_source_8.h5')
-if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
-  keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
-  for k_to_copy in keys_to_copy:
-    suffix=''
-    if k_to_copy in adata.obs:
-        suffix = "_8"
-
-    adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
-    if k_to_copy in ad_s.uns.keys():
-      adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
-else:
-  logging.error("Observation source 8 AnnData file is not compatible to be merged to main AnnData file, different cell names.")
-  sys.exit(1)
-del ad_s
-
-
+for idx, obs_source_file in enumerate(sorted(obs_sources)):
+    ad_s = sc.read(os.path.join(source_dir, obs_source_file))
+    if adata.n_obs == ad_s.n_obs and all(adata.obs_names == ad_s.obs_names):
+      keys_to_copy = (k for k in ad_s.obs.keys() if "louvain" in k)
+      for k_to_copy in keys_to_copy:
+        suffix=''
+        if k_to_copy in adata.obs:
+            suffix = f"_{idx}"
+    
+        adata.obs[[k_to_copy+suffix]] = ad_s.obs[[k_to_copy]]
+        if k_to_copy in ad_s.uns.keys():
+          adata.uns[k_to_copy+suffix] = ad_s.uns[k_to_copy]
+    else:
+      logging.error(f"Observation source {idx} AnnData file is not compatible to be merged to main AnnData file, different cell names.")
+      sys.exit(1)
+    del ad_s
 
 
 embedding_sources = [file for file in os.listdir(source_dir) if file.startswith('embedding_source_') and file.endswith('.h5')]

From 814df0bbe81659dc7aa97d7d8dd529f34bd951a4 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 6 Nov 2024 10:32:37 +0000
Subject: [PATCH 076/159] Update main.nf - adds container in make_project_file
 process

---
 main.nf | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/main.nf b/main.nf
index 28920649..06fb79a2 100644
--- a/main.nf
+++ b/main.nf
@@ -574,6 +574,8 @@ process merge_embeddings {
 
 
 process make_project_file {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     input:
 	path neighbors
 	path scanpy_read_10x

From 376ad3a93dcb208bea9e9950bf7a691efb1c5005 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 6 Nov 2024 11:50:55 +0000
Subject: [PATCH 077/159] Update final_project.py

---
 scripts/final_project.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/final_project.py b/scripts/final_project.py
index b2e7a9c3..9616ded0 100755
--- a/scripts/final_project.py
+++ b/scripts/final_project.py
@@ -82,7 +82,7 @@
           keys_to_copy = (k for k in ad_s.obsm.keys() if "umap" in k)
           for k_to_copy in keys_to_copy:
               suffix = ''
-              if k_to_copy in adata.obsm
+              if k_to_copy in adata.obsm:
                   suffix = f"_{idx}"
               adata.obsm[k_to_copy + suffix] = ad_s.obsm[k_to_copy]
     else:

From 73eb1675b898562ddae50d81251e9c51786fa6a9 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 6 Nov 2024 15:17:00 +0000
Subject: [PATCH 078/159] Update main.nf - fixing make_project_file process

---
 main.nf | 52 +++++++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/main.nf b/main.nf
index 06fb79a2..885d5560 100644
--- a/main.nf
+++ b/main.nf
@@ -577,34 +577,36 @@ process make_project_file {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
-	path neighbors
-	path scanpy_read_10x
-	path filter_genes
-	path normalise_data
-	path find_markers
-	path TNSEs_mix_UMAPs
+        path neighbors
+        path scanpy_read_10x
+        path filter_genes
+        path normalise_data
+        path find_markers
+        path TNSEs_mix_UMAPs
     output:
-	path "output.h5"
+        path "output.h5"
     script:
     """
-	ln -s $neighbors input.h5
-	ln -s $scanpy_read_10x r_source.h5
-		ln -s '$filter_genes' x_source_0.h5
-		ln -s '$normalise_data' x_source_1.h5
-	count=0
-	for i in $find_markers
-	do
-		ln -s "\${i}" obs_source_\${count}.h5
-		ln -s "\${i}" uns_source_\${count}.h5
-		((count++))
-	done
-	count=0
-	for i in $TNSEs_mix_UMAPs
-	do
-		ln -s "\${i}" embedding_source_\${count}.h5
-		((count++))
-	done
-	python scripts/final_project.py
+        ln -s $neighbors input.h5
+        ln -s $scanpy_read_10x r_source.h5
+        ln -s '$filter_genes' x_source_0.h5
+        ln -s '$normalise_data' x_source_1.h5
+        count=0
+        for i in $find_markers
+        do
+                ln -sf "\${i}" obs_source_\${count}.h5
+                ln -sf "\${i}" uns_source_\${count}.h5
+                count=\$((count + 1))
+                echo "\${count}"
+        done
+        count=0
+        for i in $TNSEs_mix_UMAPs
+        do
+                ln -sf "\${i}" embedding_source_\${count}.h5
+                count=\$((count + 1))
+                echo "\${count}"
+        done
+        python ${projectDir}/scripts/final_project.py
     """
 }
 

From 9063a30346851ba545eab771118b83a1630780e5 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 6 Nov 2024 15:25:00 +0000
Subject: [PATCH 079/159] Update nextflow.config

Co-authored-by: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
---
 nextflow.config | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index e09dcc5e..2317fb25 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -7,6 +7,10 @@ process {
     queueSize=500
     exitReadTimeout='100000 sec'
     pollInterval = '5sec'
+    // error strategy
+    errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+    memory = { 4.GB * 2 ^task.attempt }
+    maxRetries = 4
 }
 
 singularity {

From 0dc94e597fc94b8cfa0c648fe5ecba32483edfe7 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 6 Nov 2024 15:25:12 +0000
Subject: [PATCH 080/159] Update main.nf

Co-authored-by: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
---
 main.nf | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/main.nf b/main.nf
index 885d5560..2f5321cb 100644
--- a/main.nf
+++ b/main.nf
@@ -335,9 +335,6 @@ process neighbors {
 process neighbors_for_umap {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
-    errorStrategy { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
-    memory { 4.GB * task.attempt }
-    maxRetries 3
 
     input:
         tuple path(anndata), val(n_neighbors)

From b8cb996a3671073c5083790782cffd5cda90ca79 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 6 Nov 2024 20:59:28 +0000
Subject: [PATCH 081/159] Update main.nf - now uses the `dir_path` param for
 input files

---
 main.nf | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/main.nf b/main.nf
index 2f5321cb..a90e3b48 100644
--- a/main.nf
+++ b/main.nf
@@ -2,6 +2,7 @@
 
 nextflow.enable.dsl=2
 
+params.dir_path = "."
 params.celltype_field = 'NO_CELLTYPE_FIELD'
 params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50']
 params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']
@@ -14,6 +15,7 @@ log.info """
 ===============================
 WORKFLOW PARAMETER VALUES
 ===============================
+EXP dir path: ${params.dir_path}
 celltype_field: ${params.celltype_field}
 neighbor_values: ${params.neighbor_values}
 perplexity_values: ${params.perplexity_values}
@@ -610,11 +612,11 @@ process make_project_file {
 workflow {
 
     // Create input channel (single file via CLI parameter)
-    genemeta = Channel.fromPath('genes_metadata.tsv')
-    genes = Channel.fromPath('genes.tsv')
-    barcodes = Channel.fromPath('barcodes.tsv')
-    matrix = Channel.fromPath('matrix.mtx')
-    cellmeta = Channel.fromPath('cell_metadata.tsv')
+    genemeta = Channel.fromPath("${params.dir_path}/genes_metadata.tsv")
+    genes = Channel.fromPath("${params.dir_path}/genes.tsv")
+    barcodes = Channel.fromPath("${params.dir_path}/barcodes.tsv")
+    matrix = Channel.fromPath("${params.dir_path}/matrix.mtx")
+    cellmeta = Channel.fromPath("${params.dir_path}/cell_metadata.tsv")
     pca_param = Channel.value('X_pca')
     batch_variable = Channel.value('')
     neighbors_ch = channel.fromList(params.neighbor_values)

From 9f512a922f686119e18e7c1138efd8c2ece49b07 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 6 Nov 2024 21:10:37 +0000
Subject: [PATCH 082/159] Create data_prep.sh

---
 scripts/data_prep.sh | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 scripts/data_prep.sh

diff --git a/scripts/data_prep.sh b/scripts/data_prep.sh
new file mode 100644
index 00000000..df486fb2
--- /dev/null
+++ b/scripts/data_prep.sh
@@ -0,0 +1,19 @@
+if [ -z "$SCXA_WORKFLOW_ROOT" ]; then
+    echo "Variable SCXA_WORKFLOW_ROOT is not defined or empty. Please load SC env."
+    echo "Exiting..."
+    exit 1;
+fi
+
+EXP_ID=$1
+
+echo "Creating ${pwd}/${EXP_ID} directory"
+mkdir ${EXP_ID}
+cd ${EXP_ID}
+
+echo "Copying data to ${pwd}/${EXP_ID}"
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/${EXP_ID}.cell_metadata.tsv cell_metadata.tsv
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/genes.tsv.gz . && gunzip genes.tsv.gz
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/matrix.mtx.gz . && gunzip matrix.mtx.gz
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/barcodes.tsv.gz . && gunzip barcodes.tsv.gz
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/reference/gene_annotation.txt genes_metadata.tsv
+echo "Copying data for ${EXP_ID} finished"

From 2000f3002e9bbc6a842379d97ccb17637d24e349 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 6 Nov 2024 21:20:10 +0000
Subject: [PATCH 083/159] Update data_prep.sh - fixing log and force unzip

---
 scripts/data_prep.sh | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/scripts/data_prep.sh b/scripts/data_prep.sh
index df486fb2..6a38866c 100644
--- a/scripts/data_prep.sh
+++ b/scripts/data_prep.sh
@@ -6,14 +6,23 @@ fi
 
 EXP_ID=$1
 
-echo "Creating ${pwd}/${EXP_ID} directory"
-mkdir ${EXP_ID}
-cd ${EXP_ID}
+# The experiment ID is required: every path below is built from it.
+if [ -z "$EXP_ID" ]; then
+    echo "Usage: bash scripts/data_prep.sh <EXP-ID>"
+    exit 1;
+fi
+
+# Create the experiment directory under the current directory and enter it.
+# Abort if cd fails so the copies below never land in the wrong directory.
+echo "Creating $(pwd)/${EXP_ID} directory"
+mkdir -p "$(pwd)/${EXP_ID}"
+cd "$(pwd)/${EXP_ID}" || exit 1
 
-echo "Copying data to ${pwd}/${EXP_ID}"
+# After the cd above, $(pwd) already is the experiment directory.
+echo "Copying data to $(pwd)"
 cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/${EXP_ID}.cell_metadata.tsv cell_metadata.tsv
-cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/genes.tsv.gz . && gunzip genes.tsv.gz
-cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/matrix.mtx.gz . && gunzip matrix.mtx.gz
-cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/barcodes.tsv.gz . && gunzip barcodes.tsv.gz
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/genes.tsv.gz . && gunzip -f genes.tsv.gz
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/matrix.mtx.gz . && gunzip -f matrix.mtx.gz
+cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/barcodes.tsv.gz . && gunzip -f barcodes.tsv.gz
 cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/reference/gene_annotation.txt genes_metadata.tsv
 echo "Copying data for ${EXP_ID} finished"

From ee8b177088bfa5a6cb17838171798cd3d41821aa Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 7 Nov 2024 10:08:11 +0000
Subject: [PATCH 084/159] Update main.nf - adds results dir

---
 main.nf | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/main.nf b/main.nf
index a90e3b48..69b92841 100644
--- a/main.nf
+++ b/main.nf
@@ -3,6 +3,7 @@
 nextflow.enable.dsl=2
 
 params.dir_path = "."
+params.result_dir_path = params.dir_path + "/results"
 params.celltype_field = 'NO_CELLTYPE_FIELD'
 params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50']
 params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']
@@ -16,6 +17,7 @@ log.info """
 WORKFLOW PARAMETER VALUES
 ===============================
 EXP dir path: ${params.dir_path}
+Results results_dir_path: ${params.result_dir_path}
 celltype_field: ${params.celltype_field}
 neighbor_values: ${params.neighbor_values}
 perplexity_values: ${params.perplexity_values}
@@ -573,6 +575,8 @@ process merge_embeddings {
 
 
 process make_project_file {
+    publishDir params.result_dir_path, mode: 'copy'
+
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:

From 47b6395d4a82839fa321fa3c5b3dedfa083f9807 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 7 Nov 2024 10:23:10 +0000
Subject: [PATCH 085/159] Update nextflow.config - temp commenting lines from
 config file

---
 nextflow.config | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 2317fb25..fb2e6015 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,29 +1,29 @@
 process {
     executor='slurm'
     queue="$SCXA_HPC_QUEUE"
-    clusterOptions="$SCXA_HPC_OPTIONS"
+    // clusterOptions="$SCXA_HPC_OPTIONS"
     time = '7 d'
     memory = '4 GB'
     queueSize=500
     exitReadTimeout='100000 sec'
     pollInterval = '5sec'
     // error strategy
-    errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
-    memory = { 4.GB * 2 ^task.attempt }
-    maxRetries = 4
+    // errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
+    // memory = { 4.GB * 2 ^task.attempt }
+    // maxRetries = 4
 }
 
 singularity {
     enabled = true
-    cacheDir = "$SCXA_SINGULARITY_CACHE"
+    // cacheDir = "$SCXA_SINGULARITY_CACHE"
 }
 
 conda {
-    cacheDir = "$SCXA_WORKFLOW_ROOT/envs"
+    // cacheDir = "$SCXA_WORKFLOW_ROOT/envs"
     createTimeout = "30 min"
     useMamba = true
 }
 
-params {
-
-}
+// params {
+//  
+// }

From 82408ae41275272384030143fd8f911d300b78ef Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Thu, 7 Nov 2024 11:38:44 +0000
Subject: [PATCH 086/159] Add draft processes and conditional for scrublet

---
 main.nf | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/main.nf b/main.nf
index 69b92841..7752c664 100644
--- a/main.nf
+++ b/main.nf
@@ -108,6 +108,38 @@ process mergeGeneFiles {
     """
 }
 
+process scanpy_multiplet_scrublet {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    
+    input:
+        path anndata
+        val batch_variable
+
+    output:
+        path 'anndata.h5ad'
+
+    script:
+    """
+        echo $batch_variable > scanpy_multiplet_scrublet.test
+        cp $anndata anndata.h5ad
+    """
+}
+
+process scanpy_plot_scrublet {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    
+    input:
+        path anndata
+
+    output:
+        path 'scanpy_plot_scrublet.test'
+
+    script:
+    """
+        echo $anndata > scanpy_plot_scrublet.test
+    """
+}
+
 process scanpy_read_10x {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
@@ -648,6 +680,27 @@ workflow {
         cellmeta,
         genemeta
     )
+
+    if ( params.is_droplet ) {
+        SCRUBLET_ch = scanpy_multiplet_scrublet(
+            scanpy_read_10x.out,
+            batch_variable
+        )
+        scanpy_plot_scrublet(
+            SCRUBLET_ch
+        )
+        scanpy_filter_cells(
+            SCRUBLET_ch,
+            Column_rearrange_1.out[0]
+        )
+    }
+    else {
+        scanpy_filter_cells(
+            scanpy_read_10x.out,
+            Column_rearrange_1.out[0]
+        )
+    }
+
     scanpy_filter_cells(
         scanpy_read_10x.out,
     )

From 1b5847db20dd6cf40dd8d83301e3895fe5387506 Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Thu, 7 Nov 2024 11:43:50 +0000
Subject: [PATCH 087/159] Add param technology, change conditional

---
 main.nf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 7752c664..eaf43afa 100644
--- a/main.nf
+++ b/main.nf
@@ -2,6 +2,7 @@
 
 nextflow.enable.dsl=2
 
+params.technology = "plate"
 params.dir_path = "."
 params.result_dir_path = params.dir_path + "/results"
 params.celltype_field = 'NO_CELLTYPE_FIELD'
@@ -681,7 +682,7 @@ workflow {
         genemeta
     )
 
-    if ( params.is_droplet ) {
+    if ( params.technology == "droplet" ) {
         SCRUBLET_ch = scanpy_multiplet_scrublet(
             scanpy_read_10x.out,
             batch_variable

From 2b37608f5d1774313b186d7307d6198999f68ef5 Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Thu, 7 Nov 2024 11:53:17 +0000
Subject: [PATCH 088/159] Correct workflow

---
 main.nf | 63 ++++++++++++++++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 34 deletions(-)

diff --git a/main.nf b/main.nf
index eaf43afa..85688c97 100644
--- a/main.nf
+++ b/main.nf
@@ -109,64 +109,64 @@ process mergeGeneFiles {
     """
 }
 
-process scanpy_multiplet_scrublet {
+process scanpy_read_10x {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:
-        path anndata
-        val batch_variable
+        path matrix
+        path genes
+        path barcodes
+        path cellmeta
+        path genemeta
 
     output:
         path 'anndata.h5ad'
 
     script:
     """
-        echo $batch_variable > scanpy_multiplet_scrublet.test
-        cp $anndata anndata.h5ad
+        #ln -s $matrix matrix.mtx
+        ln -s $genes genes.tsv
+        #ln -s $barcodes barcodes.tsv
+        
+        scanpy-read-10x --input-10x-mtx ./ \
+        --var-names 'gene_ids' \
+        --extra-obs $cellmeta \
+        --extra-var $genemeta \
+        --show-obj stdout \
+        --output-format anndata \
+        'anndata.h5ad'
     """
 }
 
-process scanpy_plot_scrublet {
+process scanpy_multiplet_scrublet {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:
         path anndata
+        val batch_variable
 
     output:
-        path 'scanpy_plot_scrublet.test'
+        path 'anndata.h5ad'
 
     script:
     """
-        echo $anndata > scanpy_plot_scrublet.test
+        echo $batch_variable > scanpy_multiplet_scrublet.test
+        cp $anndata anndata.h5ad
     """
 }
 
-process scanpy_read_10x {
+process scanpy_plot_scrublet {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:
-        path matrix
-        path genes
-        path barcodes
-        path cellmeta
-        path genemeta
+        path anndata
 
     output:
-        path 'anndata.h5ad'
+        path 'scanpy_plot_scrublet.test'
 
     script:
     """
-        #ln -s $matrix matrix.mtx
-        ln -s $genes genes.tsv
-        #ln -s $barcodes barcodes.tsv
-        
-        scanpy-read-10x --input-10x-mtx ./ \
-        --var-names 'gene_ids' \
-        --extra-obs $cellmeta \
-        --extra-var $genemeta \
-        --show-obj stdout \
-        --output-format anndata \
-        'anndata.h5ad'
+        echo $anndata > scanpy_plot_scrublet.test
     """
 }
 
@@ -691,20 +691,15 @@ workflow {
             SCRUBLET_ch
         )
         scanpy_filter_cells(
-            SCRUBLET_ch,
-            Column_rearrange_1.out[0]
+            SCRUBLET_ch
         )
     }
     else {
         scanpy_filter_cells(
-            scanpy_read_10x.out,
-            Column_rearrange_1.out[0]
+            scanpy_read_10x.out
         )
     }
-
-    scanpy_filter_cells(
-        scanpy_read_10x.out,
-    )
+    
     scanpy_filter_genes(
         scanpy_filter_cells.out,
         Column_rearrange_1.out[0]

From 014601d295214c9b78663477518844115c10c7a2 Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Thu, 7 Nov 2024 13:46:46 +0000
Subject: [PATCH 089/159] Rename scrublet process output

---
 main.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index 85688c97..325d5f9e 100644
--- a/main.nf
+++ b/main.nf
@@ -146,12 +146,12 @@ process scanpy_multiplet_scrublet {
         val batch_variable
 
     output:
-        path 'anndata.h5ad'
+        path 'scrublet.h5ad'
 
     script:
     """
         echo $batch_variable > scanpy_multiplet_scrublet.test
-        cp $anndata anndata.h5ad
+        cp $anndata scrublet.h5ad
     """
 }
 
@@ -699,7 +699,7 @@ workflow {
             scanpy_read_10x.out
         )
     }
-    
+
     scanpy_filter_genes(
         scanpy_filter_cells.out,
         Column_rearrange_1.out[0]

From 02501d9fe3f8263bba8e91daeb1818630893f349 Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Fri, 8 Nov 2024 14:49:43 +0000
Subject: [PATCH 090/159] Add real scrublet commands

---
 main.nf | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index 325d5f9e..b57bf0bb 100644
--- a/main.nf
+++ b/main.nf
@@ -150,8 +150,12 @@ process scanpy_multiplet_scrublet {
 
     script:
     """
-        echo $batch_variable > scanpy_multiplet_scrublet.test
-        cp $anndata scrublet.h5ad
+        scanpy-cli multiplet scrublet \
+        --input-format 'anndata' \
+        --output-format 'anndata_h5ad' \
+        --batch-key "${params.batch_variable}" \
+        $anndata \
+        scrublet.h5ad
     """
 }
 
@@ -162,11 +166,16 @@ process scanpy_plot_scrublet {
         path anndata
 
     output:
-        path 'scanpy_plot_scrublet.test'
+        path 'scrublet.png'
 
     script:
     """
-        echo $anndata > scanpy_plot_scrublet.test
+        scanpy-cli plot scrublet \
+        --input-format "anndata" \
+        --scale-hist-obs "linear" \
+        --scale-hist-sim "linear" \
+        $anndata \
+        scrublet.png
     """
 }
 

From 8b74c493e6fa16ec0ca6dfc5abb809a2f6054779 Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Fri, 8 Nov 2024 14:58:46 +0000
Subject: [PATCH 091/159] Initialise batch variable, correct scrublet output
 format

---
 main.nf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index b57bf0bb..d7096e8a 100644
--- a/main.nf
+++ b/main.nf
@@ -3,6 +3,7 @@
 nextflow.enable.dsl=2
 
 params.technology = "plate"
+params.batch_variable = ""
 params.dir_path = "."
 params.result_dir_path = params.dir_path + "/results"
 params.celltype_field = 'NO_CELLTYPE_FIELD'
@@ -152,7 +153,7 @@ process scanpy_multiplet_scrublet {
     """
         scanpy-cli multiplet scrublet \
         --input-format 'anndata' \
-        --output-format 'anndata_h5ad' \
+        --output-format 'anndata' \
         --batch-key "${params.batch_variable}" \
         $anndata \
         scrublet.h5ad

From ddb3916ac2488aba30fdf67bac4164530811384d Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Tue, 12 Nov 2024 11:33:56 +0000
Subject: [PATCH 092/159] Allow scrublet to execute without a batch variable

---
 main.nf | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index d7096e8a..9528b3e3 100644
--- a/main.nf
+++ b/main.nf
@@ -151,12 +151,20 @@ process scanpy_multiplet_scrublet {
 
     script:
     """
-        scanpy-cli multiplet scrublet \
-        --input-format 'anndata' \
-        --output-format 'anndata' \
-        --batch-key "${params.batch_variable}" \
-        $anndata \
-        scrublet.h5ad
+        if [ "${params.batch_variable}" -eq "" ]; do
+            scanpy-cli multiplet scrublet \
+            --input-format 'anndata' \
+            --output-format 'anndata' \
+            $anndata \
+            scrublet.h5ad
+        else
+            scanpy-cli multiplet scrublet \
+            --input-format 'anndata' \
+            --output-format 'anndata' \
+            --batch-key "${params.batch_variable}" \
+            $anndata \
+            scrublet.h5ad
+        fi
     """
 }
 

From 3ec2fe6f6d53685cd245ce98eb33fbb45e9c3106 Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Tue, 12 Nov 2024 11:57:57 +0000
Subject: [PATCH 093/159] bash correction

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 9528b3e3..e2903e4b 100644
--- a/main.nf
+++ b/main.nf
@@ -151,7 +151,7 @@ process scanpy_multiplet_scrublet {
 
     script:
     """
-        if [ "${params.batch_variable}" -eq "" ]; do
+        if [ "${params.batch_variable}" -eq "" ]; then
             scanpy-cli multiplet scrublet \
             --input-format 'anndata' \
             --output-format 'anndata' \

From 55feac593cdc5c052864703aacd5898f61373937 Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Tue, 12 Nov 2024 12:09:09 +0000
Subject: [PATCH 094/159] correct if expression

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index e2903e4b..2f7f35e6 100644
--- a/main.nf
+++ b/main.nf
@@ -151,7 +151,7 @@ process scanpy_multiplet_scrublet {
 
     script:
     """
-        if [ "${params.batch_variable}" -eq "" ]; then
+        if [ -z "${params.batch_variable}" ]; then
             scanpy-cli multiplet scrublet \
             --input-format 'anndata' \
             --output-format 'anndata' \

From 12f2024a38316c2f713c37483edab0d0b7c0d69f Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Wed, 13 Nov 2024 10:41:29 +0000
Subject: [PATCH 095/159] Filter predicted doublets if applicable

---
 main.nf | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index 2f7f35e6..62748979 100644
--- a/main.nf
+++ b/main.nf
@@ -193,6 +193,7 @@ process scanpy_filter_cells {
     
     input:
         path anndata
+        val category
 
     output:
         path 'filtered_cell_anndata.h5ad'
@@ -205,7 +206,8 @@ process scanpy_filter_cells {
         --input-format 'anndata' $anndata \
         --show-obj stdout \
         --output-format anndata 'filtered_cell_anndata.h5ad' \
-        --export-mtx ./
+        --export-mtx ./ \
+        $category
     """
 }
 
@@ -709,12 +711,14 @@ workflow {
             SCRUBLET_ch
         )
         scanpy_filter_cells(
-            SCRUBLET_ch
+            SCRUBLET_ch,
+            "--category predicted_doublet False"
         )
     }
     else {
         scanpy_filter_cells(
-            scanpy_read_10x.out
+            scanpy_read_10x.out,
+            ""
         )
     }
 

From 130486f2b69ff37100bead332ae42b544617db93 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 15:28:47 +0000
Subject: [PATCH 096/159] Update main.nf - adds `restore_unscaled` process -
 not tested

---
 main.nf | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/main.nf b/main.nf
index 62748979..3c9513eb 100644
--- a/main.nf
+++ b/main.nf
@@ -491,6 +491,21 @@ process build_list {
     """
 }
 
+process restore_unscaled {
+    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    input:
+	tuple path(anndata), path(normalise_internal_data)
+    output:
+	path "restore_unscaled_output_${merged_group_slotname}.h5"
+    script:
+    """
+	ln -s $anndata input.h5
+	ln -s $normalise_internal_data r_source.h5
+	python ${projectDir}/scripts/restore_unscaled.py
+	mv output.h5 'restore_unscaled_output_${merged_group_slotname}.h5'
+    """
+}
+
 process find_markers {
     errorStrategy 'ignore'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
@@ -770,15 +785,24 @@ workflow {
     // Combine the outputs of find_clusters and neighbors processes
     combined_outputs = find_clusters.out.mix(neighbors.out)
 
-    processed_files = combined_outputs.map { file ->
-        // Extract the sample number from the file name
-        def sampleNumber = file.baseName.replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field)
-        [file, sampleNumber] // Create a tuple with sample number and file
+    if ( params.technology == "droplet" ) {
+        restore_unscaled
+	    combined_outputs.combine(normalise_internal_data.out)
+    	)
+	restore_unscaled_files = restore_unscaled.out.map { file ->
+	    // Extract the sample number from the file name
+	    def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field)
+	    [file, sampleNumber] // Create a tuple with sample number and file
+	}
+	find_markers(
+	    restore_unscaled_files
+    	)
+    }
+    else {
+        find_markers(
+	    processed_files
+    	)
     }
-
-    find_markers(
-	processed_files
-    )
     make_project_file(
 	neighbors.out,
 	scanpy_read_10x.out,

From 2e19d39b59ca9b2af3b6685b78134d8e4ef0b18d Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 15:32:07 +0000
Subject: [PATCH 097/159] Create resource_unscalled.py

---
 scripts/resource_unscalled.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 scripts/resource_unscalled.py

diff --git a/scripts/resource_unscalled.py b/scripts/resource_unscalled.py
new file mode 100644
index 00000000..9e5eef92
--- /dev/null
+++ b/scripts/resource_unscalled.py
@@ -0,0 +1,44 @@
+"""Set ``.raw`` on the AnnData in ``input.h5`` from the AnnData in ``r_source.h5``.
+
+The source object is subset to the cells (``.obs`` index) of the input object and
+the result is written to ``output.h5``. All three paths are relative to the
+current working directory.
+"""
+import scanpy as sc
+import anndata
+from numpy import all
+import logging
+import sys
+
+adata = sc.read('input.h5')
+
+gene_name = 'index'
+qc_vars = list()
+gene_names = getattr(adata.var, gene_name)
+
+ad_s = sc.read('r_source.h5')
+# Every cell of the main object must be present in the source object.
+if not all(adata.obs.index.isin(ad_s.obs.index)):
+  logging.error("Specified object for .raw must contain all .obs from main object.")
+  sys.exit(1)
+else:
+  adata.raw = ad_s[adata.obs.index]
+del ad_s
+
+# qc_vars is empty above, so this branch is not taken as the script stands.
+if len(qc_vars) > 0:
+    pct_top = [50]
+    sc.pp.calculate_qc_metrics(adata, qc_vars=qc_vars, percent_top=pct_top, inplace=True)
+
+# Run each filter with a threshold of 0 only when its summary column is absent
+# (presumably so scanpy adds the column without removing anything -- confirm).
+if 'n_genes' not in adata.obs.columns:
+    sc.pp.filter_cells(adata, min_genes=0)
+if 'n_counts' not in adata.obs.columns:
+    sc.pp.filter_cells(adata, min_counts=0)
+if 'n_cells' not in adata.var.columns:
+    sc.pp.filter_genes(adata, min_cells=0)
+if 'n_counts' not in adata.var.columns:
+    sc.pp.filter_genes(adata, min_counts=0)
+
+adata.write('output.h5', compression='gzip')

From b5e37d1a22838ed792926530e1001f537b616fd2 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 15:44:24 +0000
Subject: [PATCH 098/159] Update main.nf - adds missing `(`

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 3c9513eb..f902e722 100644
--- a/main.nf
+++ b/main.nf
@@ -786,7 +786,7 @@ workflow {
     combined_outputs = find_clusters.out.mix(neighbors.out)
 
     if ( params.technology == "droplet" ) {
-        restore_unscaled
+        restore_unscaled (
 	    combined_outputs.combine(normalise_internal_data.out)
     	)
 	restore_unscaled_files = restore_unscaled.out.map { file ->

From 6bbf4cb320b25d218b04105eb245d93a1a326fd1 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 15:49:30 +0000
Subject: [PATCH 099/159] Update main.nf - adding `processed_file` back

---
 main.nf | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/main.nf b/main.nf
index f902e722..dac1fa8b 100644
--- a/main.nf
+++ b/main.nf
@@ -799,6 +799,11 @@ workflow {
     	)
     }
     else {
+	processed_files = combined_outputs.map { file ->
+         // Extract the sample number from the file name
+         def sampleNumber = file.baseName.replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field)
+         [file, sampleNumber] // Create a tuple with sample number and file
+
         find_markers(
 	    processed_files
     	)

From 22d37283c06575db2a9406715cb4171b20e7b1ba Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 15:53:01 +0000
Subject: [PATCH 100/159] Update main.nf - fixes `restore_unscaled`

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index dac1fa8b..ff6ec312 100644
--- a/main.nf
+++ b/main.nf
@@ -496,13 +496,13 @@ process restore_unscaled {
     input:
 	tuple path(anndata), path(normalise_internal_data)
     output:
-	path "restore_unscaled_output_${merged_group_slotname}.h5"
+	path "restore_unscaled_output_${anndata}.h5"
     script:
     """
 	ln -s $anndata input.h5
 	ln -s $normalise_internal_data r_source.h5
 	python ${projectDir}/scripts/restore_unscaled.py
-	mv output.h5 'restore_unscaled_output_${merged_group_slotname}.h5'
+	mv output.h5 'restore_unscaled_output_${anndata}.h5'
     """
 }
 

From 77b50c2699786c938c4de95801bb3485390c82e6 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 15:58:03 +0000
Subject: [PATCH 101/159] Update main.nf - adds missing `}`

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index ff6ec312..d7598a1d 100644
--- a/main.nf
+++ b/main.nf
@@ -803,7 +803,7 @@ workflow {
          // Extract the sample number from the file name
          def sampleNumber = file.baseName.replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field)
          [file, sampleNumber] // Create a tuple with sample number and file
-
+	}
         find_markers(
 	    processed_files
     	)

From 9fd836123c8f131ae9fb17f9e36fca1ba18205aa Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 16:05:09 +0000
Subject: [PATCH 102/159] Rename resource_unscalled.py to restore_unscaled.py

---
 scripts/{resource_unscalled.py => restore_unscaled.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename scripts/{resource_unscalled.py => restore_unscaled.py} (100%)

diff --git a/scripts/resource_unscalled.py b/scripts/restore_unscaled.py
similarity index 100%
rename from scripts/resource_unscalled.py
rename to scripts/restore_unscaled.py

From 75a57b0830749c9f0347c983658964303b21ca6d Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 16:39:00 +0000
Subject: [PATCH 103/159] Update main.nf - fixed mapping for `find_markers`
 after `restore_unscaled`

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index d7598a1d..bb9f8b65 100644
--- a/main.nf
+++ b/main.nf
@@ -791,7 +791,7 @@ workflow {
     	)
 	restore_unscaled_files = restore_unscaled.out.map { file ->
 	    // Extract the sample number from the file name
-	    def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field)
+	    def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', '').replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field).replaceFirst('neighbors',params.celltype_field).replaceFirst('.h5ad','')
 	    [file, sampleNumber] // Create a tuple with sample number and file
 	}
 	find_markers(

From 2b539b7ad76ab51ebfe6db3bed7f355af4c9f751 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 16:57:48 +0000
Subject: [PATCH 104/159] Update README.md

---
 README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/README.md b/README.md
index 06bcde9e..81348687 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,16 @@
 # scxa-tertiary-workflow
 Tertiary component for SCXA workflows
+
+# How to run workflow for tertiary analysis 
+## Prepare data
+```
+bash scripts/data_prep.sh <EXP-ID>
+```
+## Run for plate
+```
+nextflow run main.nf --slurm -resume --dir_path <EXP-ID>
+```
+## Run for droplet
+```
+nextflow run main.nf --slurm -resume --dir_path <EXP-ID> --technology droplet
+```

From db56ad6e7a5b9056b60661eb2fce0fea6749864b Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 20:48:19 +0000
Subject: [PATCH 105/159] Update main.nf - adds publishDir options..

---
 main.nf | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index bb9f8b65..34358882 100644
--- a/main.nf
+++ b/main.nf
@@ -212,6 +212,8 @@ process scanpy_filter_cells {
 }
 
 process scanpy_filter_genes {
+    publishDir params.result_dir_path, mode: 'copy', pattern: '{matrix.mtx,barcodes.tsv,genes.tsv}'
+
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
@@ -236,6 +238,7 @@ process scanpy_filter_genes {
 }
 
 process normalise_data {
+    publishDir params.result_dir_path, mode: 'copy', pattern: '{matrix.mtx,barcodes.tsv,genes.tsv}'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
@@ -418,6 +421,7 @@ process neighbors_for_umap {
 }
 
 process find_clusters {
+    publishDir params.result_dir_path, mode: 'copy', pattern: 'clusters_*.tsv'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
@@ -438,6 +442,7 @@ process find_clusters {
         --show-obj stdout \
         --output-format anndata \
         'clusters_${resolution}.h5ad'
+	mv 'output.tsv' 'clusters_${resolution}.tsv'
     """
 }
 
@@ -507,6 +512,7 @@ process restore_unscaled {
 }
 
 process find_markers {
+    publishDir params.result_dir_path, mode: 'copy', pattern: 'markers_*.tsv'
     errorStrategy 'ignore'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     input:
@@ -516,7 +522,7 @@ process find_markers {
     script:
     """
 	scanpy-find-markers \
-	--save diffexp.tsv \
+	--save 'markers_${merged_group_slotname}.tsv' \
 	--n-genes '100' \
 	--groupby '${merged_group_slotname}' \
 	--key-added 'markers_${merged_group_slotname}' \
@@ -543,6 +549,7 @@ process filtered_cellgroup_markers {
 }
 
 process run_umap {
+    publishDir params.result_dir_path, mode: 'copy', pattern: '(embeddings\\.tsv)'
     //errorStrategy 'ignore'
 
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
@@ -574,12 +581,13 @@ process run_umap {
             --output-format anndata \
             "umap_\${n_number}.h5ad"  
             # Not sure if following is needed
-            # && mv 'embeddings_neighbors_n_neighbors_100.tsv' embeddings.tsv
+            # && mv 'embeddings_neighbors_n_neighbors_${n_number}.tsv' embeddings.tsv
 
     """
 }
 
 process run_tsne {
+    publishDir params.result_dir_path, mode: 'copy', pattern: '(embeddings_perplexity\\.tsv)'
     //errorStrategy 'ignore'
     
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
@@ -606,7 +614,7 @@ process run_tsne {
             --output-format anndata \
             'tsne_${perplexity_values}.h5ad'
             # Not sure if following is needed
-            # && mv 'embeddings_perplexity_1.tsv' embeddings.tsv
+            && mv 'embeddings_perplexity_${perplexity_values}.tsv' embeddings.tsv
     """
 }
 

From 96881dd1788bd2301bb0ac43b9969e8459cb738c Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 20:53:25 +0000
Subject: [PATCH 106/159] Update main.nf - fixes err

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 34358882..55656ad2 100644
--- a/main.nf
+++ b/main.nf
@@ -581,7 +581,7 @@ process run_umap {
             --output-format anndata \
             "umap_\${n_number}.h5ad"  
             # Not sure if following is needed
-            # && mv 'embeddings_neighbors_n_neighbors_${n_number}.tsv' embeddings.tsv
+            # && mv "embeddings_neighbors_n_neighbors_\${n_number}.tsv" embeddings.tsv
 
     """
 }

From 281875b5b49f974d15ec6ad076de8e8b24ee4135 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 21:02:37 +0000
Subject: [PATCH 107/159] Update main.nf - fixes err

---
 main.nf | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 55656ad2..835e6537 100644
--- a/main.nf
+++ b/main.nf
@@ -442,7 +442,8 @@ process find_clusters {
         --show-obj stdout \
         --output-format anndata \
         'clusters_${resolution}.h5ad'
-	&& mv 'output.tsv' 'clusters_${resolution}.tsv'
+	
+	mv 'output.tsv' 'clusters_${resolution}.tsv'
     """
 }
 
@@ -614,7 +615,7 @@ process run_tsne {
             --output-format anndata \
             'tsne_${perplexity_values}.h5ad'
             # Not sure if following is needed
-            && mv 'embeddings_perplexity_${perplexity_values}.tsv' embeddings.tsv
+            # && mv 'embeddings_perplexity_${perplexity_values}.tsv' embeddings.tsv
     """
 }
 

From 6a2b7366cffc360af316f131b3d237fa8b5fe869 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 13 Nov 2024 21:05:53 +0000
Subject: [PATCH 108/159] Update main.nf - adds publishdir for scrublet plot

---
 main.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.nf b/main.nf
index 835e6537..e2b446cf 100644
--- a/main.nf
+++ b/main.nf
@@ -169,6 +169,7 @@ process scanpy_multiplet_scrublet {
 }
 
 process scanpy_plot_scrublet {
+    publishDir params.result_dir_path, mode: 'copy', pattern: '(scrublet.png)'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:

From 6b19a64ba9a4b84c7dfef8726bc43b62dcd99f55 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 14:31:25 +0000
Subject: [PATCH 109/159] Update main.nf - redirects output to `publishDir`

---
 main.nf | 58 ++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/main.nf b/main.nf
index e2b446cf..fe25fd42 100644
--- a/main.nf
+++ b/main.nf
@@ -213,8 +213,9 @@ process scanpy_filter_cells {
 }
 
 process scanpy_filter_genes {
-    publishDir params.result_dir_path, mode: 'copy', pattern: '(matrix\\.mtx|barcodes\\.tsv|genes\\.tsv)'
-
+    publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx'
+    publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv'
+    publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
@@ -223,6 +224,9 @@ process scanpy_filter_genes {
 
     output:
         path 'filtered_gene_anndata.h5ad'
+        path 'matrix.mtx'
+        path 'barcodes.tsv'
+        path 'genes.tsv'
 
     script:
     """
@@ -239,14 +243,19 @@ process scanpy_filter_genes {
 }
 
 process normalise_data {
-    publishDir params.result_dir_path, mode: 'copy', pattern: '(matrix\\.mtx|barcodes\\.tsv|genes\\.tsv)'
+    publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'matrix.mtx'
+    publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'barcodes.tsv'
+    publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'genes.tsv'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
         path anndata
 
     output:
-        path 'normalised_anndata.h5ad'
+	path 'normalised_anndata.h5ad'
+	path 'matrix.mtx'
+	path 'barcodes.tsv'
+	path 'genes.tsv'
 
     script:
     """
@@ -422,13 +431,15 @@ process neighbors_for_umap {
 }
 
 process find_clusters {
-    publishDir params.result_dir_path, mode: 'copy', pattern: '(clusters\\.tsv)'
+    publishDir "${params.result_dir_path}/find_clusters", mode: 'copy', pattern: 'clusters_*.tsv'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
         tuple path(anndata), val(resolution)
     output:
         path "clusters_${resolution}.h5ad"
+	path "clusters_${resolution}.tsv"
+
     script:
     """
         scanpy-find-cluster louvain \
@@ -500,10 +511,13 @@ process build_list {
 
 process restore_unscaled {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     input:
-	tuple path(anndata), path(normalise_internal_data)
+	tuple path(anndata), path(normalise_internal_data
+
     output:
 	path "restore_unscaled_output_${anndata}.h5"
+
     script:
     """
 	ln -s $anndata input.h5
@@ -514,13 +528,17 @@ process restore_unscaled {
 }
 
 process find_markers {
-    publishDir params.result_dir_path, mode: 'copy', pattern: '(markers_\\.tsv)'
+    publishDir "${params.result_dir_path}/find_markers", mode: 'copy', pattern: 'markers_*.tsv'
     errorStrategy 'ignore'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+
     input:
 	tuple path(anndata), val(merged_group_slotname)
+
     output:
 	path "markers_${merged_group_slotname}.h5ad"
+	path "markers_${merged_group_slotname}.tsv"
+
     script:
     """
 	scanpy-find-markers \
@@ -551,15 +569,18 @@ process filtered_cellgroup_markers {
 }
 
 process run_umap {
-    publishDir params.result_dir_path, mode: 'copy', pattern: '(embeddings\\.tsv)'
+    publishDir "${params.result_dir_path}/run_umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv'
     //errorStrategy 'ignore'
 
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
     input:
         path anndata
+
     output:
         path "umap_*.h5ad"
+	path "embeddings_neighbors_neighbors_*.tsv"
+
     script:
     """
 	VAR="$anndata"
@@ -589,7 +610,7 @@ process run_umap {
 }
 
 process run_tsne {
-    publishDir params.result_dir_path, mode: 'copy', pattern: '(embeddings_perplexity\\.tsv)'
+    publishDir "${params.result_dir_path}/run_tsne", mode: 'copy', pattern: 'embeddings_perplexity_*\\.tsv'
     //errorStrategy 'ignore'
     
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
@@ -597,8 +618,11 @@ process run_tsne {
     input:
         tuple path(anndata), val(perplexity_values)
         val pca_param
+
     output:
         path "tsne_${perplexity_values}.h5ad"
+	path "embeddings_perplexity_${perplexity_values}.tsv"
+
     script:
     """
             scanpy-run-tsne \
@@ -752,10 +776,10 @@ workflow {
         Column_rearrange_1.out[0]
     )
     normalise_data(
-        scanpy_filter_genes.out
+        scanpy_filter_genes.out[0]
     )
     normalise_internal_data(
-        scanpy_filter_genes.out
+        scanpy_filter_genes.out[0]
     )
     find_variable_genes(
         normalise_internal_data.out,
@@ -779,13 +803,13 @@ workflow {
     TNSEs_ch = run_tsne(
         harmony_batch.out.combine(perplexity_ch),
         pca_param
-    )
+    )[0]
     //TNSEs_ch
     //    .filter { it.exitStatus == 0 }
 
     UMAPs_ch = run_umap(
         neighbors_for_umap.out.flatten()
-    )
+    )[0]
     //UMAPs_ch
    //     .filter { it.exitStatus == 0 }
     find_clusters(
@@ -793,7 +817,7 @@ workflow {
     )
 
     // Combine the outputs of find_clusters and neighbors processes
-    combined_outputs = find_clusters.out.mix(neighbors.out)
+    combined_outputs = find_clusters.out[0].mix(neighbors.out)
 
     if ( params.technology == "droplet" ) {
         restore_unscaled (
@@ -821,9 +845,9 @@ workflow {
     make_project_file(
 	neighbors.out,
 	scanpy_read_10x.out,
-	scanpy_filter_genes.out,
-	normalise_data.out,
-	find_markers.out.collect(),
+	scanpy_filter_genes.out[0],
+	normalise_data.out[0],
+	find_markers.out[0].collect(),
 	TNSEs_ch.mix(UMAPs_ch).collect()
     )
 }

From 040250cd31af812d5229edb24e497ede4daf8aee Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 14:35:18 +0000
Subject: [PATCH 110/159] Update main.nf

---
 main.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.nf b/main.nf
index fe25fd42..40c9f337 100644
--- a/main.nf
+++ b/main.nf
@@ -19,6 +19,7 @@ log.info """
 WORKFLOW PARAMETER VALUES
 ===============================
 EXP dir path: ${params.dir_path}
+Selected technology: ${params.technology}
 Results results_dir_path: ${params.result_dir_path}
 celltype_field: ${params.celltype_field}
 neighbor_values: ${params.neighbor_values}

From 82e330308d06c24ec7d5007efda549ea941ac80e Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 14:50:24 +0000
Subject: [PATCH 111/159] Update main.nf - removed filter_failed_umap/tsne as
 errorStrategy 'ignore' works

---
 main.nf | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

diff --git a/main.nf b/main.nf
index 40c9f337..2b82e14d 100644
--- a/main.nf
+++ b/main.nf
@@ -571,7 +571,8 @@ process filtered_cellgroup_markers {
 
 process run_umap {
     publishDir "${params.result_dir_path}/run_umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv'
-    //errorStrategy 'ignore'
+
+    errorStrategy 'ignore'
 
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
@@ -612,7 +613,8 @@ process run_umap {
 
 process run_tsne {
     publishDir "${params.result_dir_path}/run_tsne", mode: 'copy', pattern: 'embeddings_perplexity_*\\.tsv'
-    //errorStrategy 'ignore'
+
+    errorStrategy 'ignore'
     
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
     
@@ -645,26 +647,6 @@ process run_tsne {
     """
 }
 
-process filter_failed_umap {
-    input:
-
-    output:
-
-    script:
-    """
-    """
-}
-
-process filer_failed_tsne {
-    input:
-
-    output:
-
-    script:
-    """
-    """
-}
-
 process merge_embeddings {
     input:
 

From 73314b99592d22b59fed1e99e97141fb2656b58f Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 14:52:00 +0000
Subject: [PATCH 112/159] Update main.nf - `merge_embeddings` removed as
 `TNSEs_ch.mix(UMAPs_ch).collect()` does the same thing

---
 main.nf | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/main.nf b/main.nf
index 2b82e14d..e2962da3 100644
--- a/main.nf
+++ b/main.nf
@@ -647,18 +647,6 @@ process run_tsne {
     """
 }
 
-process merge_embeddings {
-    input:
-
-    output:
-
-    script:
-    """
-    """
-}
-
-
-
 process make_project_file {
     publishDir params.result_dir_path, mode: 'copy'
 

From 7f9162f40cf12546d602065e9c5fe016ddd78eea Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 14:54:10 +0000
Subject: [PATCH 113/159] Update main.nf - remove `filtered_cellgroup_markers` 
 as `errorStrategy 'ignore'` does same thing

---
 main.nf | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/main.nf b/main.nf
index e2962da3..18a391d9 100644
--- a/main.nf
+++ b/main.nf
@@ -559,16 +559,6 @@ process find_markers {
     """
 }
 
-process filtered_cellgroup_markers {
-    input:
-
-    output:
-
-    script:
-    """
-    """
-}
-
 process run_umap {
     publishDir "${params.result_dir_path}/run_umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv'
 

From 81b6e238be399ed9ab20687cc1379b1b1318396b Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 14:57:53 +0000
Subject: [PATCH 114/159] Update main.nf - `merge_collections` and `build_list`
 removed as it's done by Nextflow operators

---
 main.nf | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/main.nf b/main.nf
index 18a391d9..2c8b9e79 100644
--- a/main.nf
+++ b/main.nf
@@ -490,26 +490,6 @@ process merge_group_slotnames {
     """
 }
 
-process merge_collections {
-    input:
-
-    output:
-
-    script:
-    """
-    """
-}
-
-process build_list {
-    input:
-
-    output:
-
-    script:
-    """
-    """
-}
-
 process restore_unscaled {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 

From 7c4c139e9eea1feb7ae539ec16579101b79f0d98 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 14:59:23 +0000
Subject: [PATCH 115/159] Update main.nf - removed `clustering_slotnames` and
 `merge_group_slotnames`, now done by Nextflow operators

---
 main.nf | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/main.nf b/main.nf
index 2c8b9e79..e483a123 100644
--- a/main.nf
+++ b/main.nf
@@ -470,26 +470,6 @@ process meta_vars {
     """
 }
 
-process clustering_slotnames {
-    input:
-
-    output:
-
-    script:
-    """
-    """
-}
-
-process merge_group_slotnames {
-    input:
-
-    output:
-
-    script:
-    """
-    """
-}
-
 process restore_unscaled {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 

From c540cdf54d1435c37e36d4e9c470ed518bf7cecf Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 15:08:14 +0000
Subject: [PATCH 116/159] Update main.nf - removed `meta_vars` as it was
 Galaxy-specific functionality

---
 main.nf | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/main.nf b/main.nf
index e483a123..d8ff92b8 100644
--- a/main.nf
+++ b/main.nf
@@ -460,16 +460,6 @@ process find_clusters {
     """
 }
 
-process meta_vars {
-    input:
-
-    output:
-
-    script:
-    """
-    """
-}
-
 process restore_unscaled {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 

From bce8436f83476192ca15277cb609ca44ff433ec8 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 15:20:15 +0000
Subject: [PATCH 117/159] Update main.nf - removes hard coded name

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index d8ff92b8..49a1a0f1 100644
--- a/main.nf
+++ b/main.nf
@@ -736,7 +736,7 @@ workflow {
     	)
 	restore_unscaled_files = restore_unscaled.out.map { file ->
 	    // Extract the sample number from the file name
-	    def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', '').replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field).replaceFirst('neighbors',params.celltype_field).replaceFirst('.h5ad','')
+	    def sampleNumber = file.baseName.replaceFirst('restore_unscaled_output_', '').replaceFirst('clusters', params.slotname).replaceFirst('neighbors',params.celltype_field).replaceFirst('.h5ad','')
 	    [file, sampleNumber] // Create a tuple with sample number and file
 	}
 	find_markers(
@@ -746,7 +746,7 @@ workflow {
     else {
 	processed_files = combined_outputs.map { file ->
          // Extract the sample number from the file name
-         def sampleNumber = file.baseName.replaceFirst('clusters_', 'louvain_resolution_').replaceFirst('neighbors',params.celltype_field)
+         def sampleNumber = file.baseName.replaceFirst('clusters', , params.slotname).replaceFirst('neighbors',params.celltype_field)
          [file, sampleNumber] // Create a tuple with sample number and file
 	}
         find_markers(

From be5fcece40b67aa4ab296d36933d152638fb6d71 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 15:40:41 +0000
Subject: [PATCH 118/159] Update main.nf - adds missing `)`

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 49a1a0f1..8b390df6 100644
--- a/main.nf
+++ b/main.nf
@@ -464,7 +464,7 @@ process restore_unscaled {
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
-	tuple path(anndata), path(normalise_internal_data
+	tuple path(anndata), path(normalise_internal_data)
 
     output:
 	path "restore_unscaled_output_${anndata}.h5"

From bbcb763f787aa708e618ac5863f2e6907249269c Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 16:15:57 +0000
Subject: [PATCH 119/159] Update main.nf - adds `output_dir` param

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 8b390df6..17e0d507 100644
--- a/main.nf
+++ b/main.nf
@@ -5,7 +5,7 @@ nextflow.enable.dsl=2
 params.technology = "plate"
 params.batch_variable = ""
 params.dir_path = "."
-params.result_dir_path = params.dir_path + "/results"
+params.result_dir_path = params.output_path ?: params.dir_path + "/results"
 params.celltype_field = 'NO_CELLTYPE_FIELD'
 params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50']
 params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']

From e81f6bc905d0fac8c3a950eb86e42563898561b1 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Fri, 15 Nov 2024 16:17:42 +0000
Subject: [PATCH 120/159] Update README.md

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 81348687..f9495845 100644
--- a/README.md
+++ b/README.md
@@ -8,9 +8,11 @@ bash scripts/data_prep.sh <EXP-ID>
 ```
 ## Run for plate
 ```
-nextflow run main.nf --slurm -resume --dir_path <EXP-ID>
+nextflow run main.nf --slurm -resume --dir_path <EXP-ID> [--output_path <PATH>]
 ```
 ## Run for droplet
 ```
-nextflow run main.nf --slurm -resume --dir_path <EXP-ID> --technology droplet
+nextflow run main.nf --slurm -resume --dir_path <EXP-ID> --technology droplet [--output_path <PATH>]
 ```
+
+If `[--output_path <PATH>]` is not specified results will be `<EXP-ID>/results` dir. 

From f32e5252dea9263446cadfce201a5f8f6fcbec56 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 19 Nov 2024 14:59:51 +0000
Subject: [PATCH 121/159] Update data_prep.sh - adds optional output dir

---
 scripts/data_prep.sh | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/scripts/data_prep.sh b/scripts/data_prep.sh
index 6a38866c..eb9c2433 100644
--- a/scripts/data_prep.sh
+++ b/scripts/data_prep.sh
@@ -1,19 +1,39 @@
+#!/usr/bin/env bash
+
+# This is EMBL-EBI specific script to fetch data from workflow root and put it in a place for downstream workflow to use 
+
 if [ -z "$SCXA_WORKFLOW_ROOT" ]; then
     echo "Variable SCXA_WORKFLOW_ROOT is not defined or empty. Please load SC env."
     echo "Exiting..."
     exit 1;
 fi
 
+if [ -z "$1" ]; then
+    echo "Experiment ID is not provided. Please provide EXP ID"
+    echo "bash data_prep.sh <EXP-ID> [output path]"
+    echo "Exiting..."
+    exit 1;
+fi
+
 EXP_ID=$1
 
-echo "Creating $(pwd)/${EXP_ID} directory"
-mkdir -p $(pwd)/${EXP_ID}
-cd $(pwd)/${EXP_ID}
+outdir="$(pwd)"
+
+if [ "$2" ]; then
+    outdir=$2
+fi
+
+
+echo "Creating ${outdir}/${EXP_ID} directory"
+mkdir -p ${outdir}/${EXP_ID}
+cd ${outdir}/${EXP_ID}
+
+echo "Copying data to ${outdir}/${EXP_ID}"
 
-echo "Copying data to ${pwd}/${EXP_ID}"
 cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/${EXP_ID}.cell_metadata.tsv cell_metadata.tsv
 cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/genes.tsv.gz . && gunzip -f genes.tsv.gz
 cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/matrix.mtx.gz . && gunzip -f matrix.mtx.gz
 cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/filtered_normalised/barcodes.tsv.gz . && gunzip -f barcodes.tsv.gz
 cp ${SCXA_WORKFLOW_ROOT}/results/${EXP_ID}/*/bundle/reference/gene_annotation.txt genes_metadata.tsv
+
 echo "Copying data for ${EXP_ID} finished"

From 6c088d2ade2500f4be747fbf74f882806eac4d3c Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 19 Nov 2024 15:02:21 +0000
Subject: [PATCH 122/159] Update README.md - updates read me

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f9495845..76d6284c 100644
--- a/README.md
+++ b/README.md
@@ -4,15 +4,15 @@ Tertiary component for SCXA workflows
 # How to run workflow for tertiary analysis 
 ## Prepare data
 ```
-bash scripts/data_prep.sh <EXP-ID>
+bash scripts/data_prep.sh <EXP-ID> [output path]
 ```
 ## Run for plate
 ```
-nextflow run main.nf --slurm -resume --dir_path <EXP-ID> [--output_path <PATH>]
+nextflow run main.nf --slurm -resume --dir_path <EXP-ID with path> [--output_path <PATH>]
 ```
 ## Run for droplet
 ```
-nextflow run main.nf --slurm -resume --dir_path <EXP-ID> --technology droplet [--output_path <PATH>]
+nextflow run main.nf --slurm -resume --dir_path <EXP-ID with path> --technology droplet [--output_path <PATH>]
 ```
 
-If `[--output_path <PATH>]` is not specified results will be `<EXP-ID>/results` dir. 
+If `[--output_path <PATH>]` is not specified results will be `<EXP-ID with path>/results` dir. 

From ea27bbab1b72734addbe314c31bae7403027dcf1 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 19 Nov 2024 16:03:29 +0000
Subject: [PATCH 123/159] Update main.nf - removes redundant comma

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 17e0d507..ca178879 100644
--- a/main.nf
+++ b/main.nf
@@ -746,7 +746,7 @@ workflow {
     else {
 	processed_files = combined_outputs.map { file ->
          // Extract the sample number from the file name
-         def sampleNumber = file.baseName.replaceFirst('clusters', , params.slotname).replaceFirst('neighbors',params.celltype_field)
+         def sampleNumber = file.baseName.replaceFirst('clusters', params.slotname).replaceFirst('neighbors',params.celltype_field)
          [file, sampleNumber] // Create a tuple with sample number and file
 	}
         find_markers(

From ffd644328aaebf8f6807620501806d3d2c759edc Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 19 Nov 2024 16:50:43 +0000
Subject: [PATCH 124/159] Update nextflow.config - adds config for reporting

---
 nextflow.config | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/nextflow.config b/nextflow.config
index fb2e6015..62618776 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -24,6 +24,24 @@ conda {
     useMamba = true
 }
 
+timeline {
+    enabled = true
+    file = "${params.results_dir_path}/timeline.html"
+    overwrite = true
+}
+
+trace {
+    enabled = true
+    file = "${params.results_dir_path}/trace.txt"
+    overwrite = true
+}
+
+report {
+    enabled = true
+    file = "${params.results_dir_path}/report.html"
+    overwrite = true
+}
+
 // params {
 //  
 // }

From 23f0b6c954414c545ef8452e361ee50e28b1e1c6 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 20 Nov 2024 08:57:12 +0000
Subject: [PATCH 125/159] Update nextflow.config - generates reports in
 `result_dir_path` without it being specified in params

---
 nextflow.config | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 62618776..e0bd55d1 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -24,24 +24,26 @@ conda {
     useMamba = true
 }
 
-timeline {
+params {
+    dir_path = "."
+    output_path = null
+    result_dir_path = "${params.output_path ?: params.dir_path + '/results'}"
+}
+
+trace {
     enabled = true
-    file = "${params.results_dir_path}/timeline.html"
+    file = "${params.result_dir_path}/trace.txt"
     overwrite = true
 }
 
-trace {
+timeline {
     enabled = true
-    file = "${params.results_dir_path}/trace.txt"
+    file = "${params.result_dir_path}/timeline.html"
     overwrite = true
 }
 
 report {
     enabled = true
-    file = "${params.results_dir_path}/report.html"
+    file = "${params.result_dir_path}/report.html"
     overwrite = true
 }
-
-// params {
-//  
-// }

From 7ed1533262068d1b3012eaa164956fe346cad23e Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 20 Nov 2024 09:14:58 +0000
Subject: [PATCH 126/159] Update main.nf - making `batch_variable` and
 `pca_param` as params.

---
 main.nf | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/main.nf b/main.nf
index ca178879..745fd5a0 100644
--- a/main.nf
+++ b/main.nf
@@ -4,6 +4,7 @@ nextflow.enable.dsl=2
 
 params.technology = "plate"
 params.batch_variable = ""
+params.pca_param = "X_pca"
 params.dir_path = "."
 params.result_dir_path = params.output_path ?: params.dir_path + "/results"
 params.celltype_field = 'NO_CELLTYPE_FIELD'
@@ -634,8 +635,6 @@ workflow {
     barcodes = Channel.fromPath("${params.dir_path}/barcodes.tsv")
     matrix = Channel.fromPath("${params.dir_path}/matrix.mtx")
     cellmeta = Channel.fromPath("${params.dir_path}/cell_metadata.tsv")
-    pca_param = Channel.value('X_pca')
-    batch_variable = Channel.value('')
     neighbors_ch = channel.fromList(params.neighbor_values)
     perplexity_ch = channel.fromList(params.perplexity_values)
     resolution_ch = channel.fromList(params.resolution_values)
@@ -665,7 +664,7 @@ workflow {
     if ( params.technology == "droplet" ) {
         SCRUBLET_ch = scanpy_multiplet_scrublet(
             scanpy_read_10x.out,
-            batch_variable
+            params.batch_variable
         )
         scanpy_plot_scrublet(
             SCRUBLET_ch
@@ -694,26 +693,26 @@ workflow {
     )
     find_variable_genes(
         normalise_internal_data.out,
-        batch_variable
+        params.batch_variable
     )
     run_pca(
         find_variable_genes.out
     )
     harmony_batch(
         run_pca.out,
-        batch_variable
+        params.batch_variable
     )
     neighbors(
         harmony_batch.out,
-        pca_param
+        params.pca_param
     )
     neighbors_for_umap(
         harmony_batch.out.combine(neighbors_ch),
-        pca_param
+        params.pca_param
     )
     TNSEs_ch = run_tsne(
         harmony_batch.out.combine(perplexity_ch),
-        pca_param
+        params.pca_param
     )[0]
     //TNSEs_ch
     //    .filter { it.exitStatus == 0 }

From 52f75b3c0cb8426ca71c29fbe5a8c3b72edd4e3b Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 20 Nov 2024 09:21:40 +0000
Subject: [PATCH 127/159] Update main.nf - rename `pca_param` variable as
 `representation`

---
 main.nf | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/main.nf b/main.nf
index 745fd5a0..4b9c5d89 100644
--- a/main.nf
+++ b/main.nf
@@ -4,7 +4,7 @@ nextflow.enable.dsl=2
 
 params.technology = "plate"
 params.batch_variable = ""
-params.pca_param = "X_pca"
+params.representation = "X_pca"
 params.dir_path = "."
 params.result_dir_path = params.output_path ?: params.dir_path + "/results"
 params.celltype_field = 'NO_CELLTYPE_FIELD'
@@ -382,7 +382,7 @@ process neighbors {
 
     input:
         path anndata
-        val pca_param
+        val representation
     output:
         path 'neighbors.h5ad'
 
@@ -393,7 +393,7 @@ process neighbors {
         --method 'umap' \
         --metric 'euclidean' \
         --random-state '0' \
-        --use-rep $pca_param \
+        --use-rep $representation \
         --n-pcs '50' \
         --input-format 'anndata' \
         $anndata \
@@ -410,7 +410,7 @@ process neighbors_for_umap {
 
     input:
         tuple path(anndata), val(n_neighbors)
-        val pca_param
+        val representation
     output:
         path "neighbors_${n_neighbors}.h5ad"
     script:
@@ -421,7 +421,7 @@ process neighbors_for_umap {
             --method 'umap' \
             --metric 'euclidean' \
             --random-state '0' \
-            --use-rep $pca_param \
+            --use-rep $representation \
             --n-pcs '50' \
             --input-format 'anndata' \
             $anndata \
@@ -561,7 +561,7 @@ process run_tsne {
     
     input:
         tuple path(anndata), val(perplexity_values)
-        val pca_param
+        val representation
 
     output:
         path "tsne_${perplexity_values}.h5ad"
@@ -570,7 +570,7 @@ process run_tsne {
     script:
     """
             scanpy-run-tsne \
-            --use-rep $pca_param \
+            --use-rep $representation \
             --export-embedding embeddings.tsv \
             --perplexity $perplexity_values \
             --key-added 'perplexity_$perplexity_values' \
@@ -704,15 +704,15 @@ workflow {
     )
     neighbors(
         harmony_batch.out,
-        params.pca_param
+        params.representation
     )
     neighbors_for_umap(
         harmony_batch.out.combine(neighbors_ch),
-        params.pca_param
+        params.representation
     )
     TNSEs_ch = run_tsne(
         harmony_batch.out.combine(perplexity_ch),
-        params.pca_param
+        params.representation
     )[0]
     //TNSEs_ch
     //    .filter { it.exitStatus == 0 }

From deb55066da223f1d31a2759b46b000ebac99cb39 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 20 Nov 2024 10:39:32 +0000
Subject: [PATCH 128/159] Update main.nf - renames output to match existing
 pipeline

---
 main.nf | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/main.nf b/main.nf
index 4b9c5d89..d7a54db9 100644
--- a/main.nf
+++ b/main.nf
@@ -215,9 +215,9 @@ process scanpy_filter_cells {
 }
 
 process scanpy_filter_genes {
-    publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx'
-    publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv'
-    publishDir "${params.result_dir_path}/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv'
+    publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx'
+    publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv'
+    publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
@@ -245,9 +245,9 @@ process scanpy_filter_genes {
 }
 
 process normalise_data {
-    publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'matrix.mtx'
-    publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'barcodes.tsv'
-    publishDir "${params.result_dir_path}/normalise_data", mode: 'copy', pattern: 'genes.tsv'
+    publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'matrix.mtx'
+    publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'barcodes.tsv'
+    publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'genes.tsv'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
@@ -433,7 +433,7 @@ process neighbors_for_umap {
 }
 
 process find_clusters {
-    publishDir "${params.result_dir_path}/find_clusters", mode: 'copy', pattern: 'clusters_*.tsv'
+    publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_*.tsv'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
     input:
@@ -480,7 +480,7 @@ process restore_unscaled {
 }
 
 process find_markers {
-    publishDir "${params.result_dir_path}/find_markers", mode: 'copy', pattern: 'markers_*.tsv'
+    publishDir "${params.result_dir_path}/markers", mode: 'copy', pattern: 'markers_*.tsv'
     errorStrategy 'ignore'
     container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
 
@@ -511,7 +511,7 @@ process find_markers {
 }
 
 process run_umap {
-    publishDir "${params.result_dir_path}/run_umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv'
+    publishDir "${params.result_dir_path}/umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv'
 
     errorStrategy 'ignore'
 
@@ -553,7 +553,7 @@ process run_umap {
 }
 
 process run_tsne {
-    publishDir "${params.result_dir_path}/run_tsne", mode: 'copy', pattern: 'embeddings_perplexity_*\\.tsv'
+    publishDir "${params.result_dir_path}/tsne", mode: 'copy', pattern: 'embeddings_perplexity_*\\.tsv'
 
     errorStrategy 'ignore'
     
@@ -601,7 +601,7 @@ process make_project_file {
         path find_markers
         path TNSEs_mix_UMAPs
     output:
-        path "output.h5"
+        path "project.h5ad"
     script:
     """
         ln -s $neighbors input.h5
@@ -624,6 +624,7 @@ process make_project_file {
                 echo "\${count}"
         done
         python ${projectDir}/scripts/final_project.py
+	mv output.h5 project.h5ad
     """
 }
 

From 831982cc06225f334e428587388da0827598ea01 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 20 Nov 2024 11:04:26 +0000
Subject: [PATCH 129/159] Update main.nf - parameterising container

---
 main.nf | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/main.nf b/main.nf
index d7a54db9..7942fbfa 100644
--- a/main.nf
+++ b/main.nf
@@ -2,6 +2,7 @@
 
 nextflow.enable.dsl=2
 
+params.scanpy_container = "quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0"
 params.technology = "plate"
 params.batch_variable = ""
 params.representation = "X_pca"
@@ -113,7 +114,7 @@ process mergeGeneFiles {
 }
 
 process scanpy_read_10x {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
     
     input:
         path matrix
@@ -142,7 +143,7 @@ process scanpy_read_10x {
 }
 
 process scanpy_multiplet_scrublet {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
     
     input:
         path anndata
@@ -172,7 +173,7 @@ process scanpy_multiplet_scrublet {
 
 process scanpy_plot_scrublet {
     publishDir params.result_dir_path, mode: 'copy', pattern: '(scrublet.png)'
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
     
     input:
         path anndata
@@ -192,7 +193,7 @@ process scanpy_plot_scrublet {
 }
 
 process scanpy_filter_cells {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
     
     input:
         path anndata
@@ -218,7 +219,7 @@ process scanpy_filter_genes {
     publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx'
     publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv'
     publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv'
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
     input:
         path anndata
@@ -248,7 +249,7 @@ process normalise_data {
     publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'matrix.mtx'
     publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'barcodes.tsv'
     publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'genes.tsv'
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
     input:
         path anndata
@@ -273,7 +274,7 @@ process normalise_data {
 }
 
 process normalise_internal_data {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
     
     input:
         path anndata
@@ -293,7 +294,7 @@ process normalise_internal_data {
 }
 
 process find_variable_genes {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
     input:
         path anndata
@@ -325,7 +326,7 @@ process find_variable_genes {
 }
 
 process run_pca {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
     input:
         path anndata
@@ -348,7 +349,7 @@ process run_pca {
 }
 
 process harmony_batch {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
     input:
         path anndata
@@ -378,7 +379,7 @@ process harmony_batch {
 }
 
 process neighbors {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
     input:
         path anndata
@@ -405,7 +406,7 @@ process neighbors {
 }
 
 process neighbors_for_umap {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
 
     input:
@@ -434,7 +435,7 @@ process neighbors_for_umap {
 
 process find_clusters {
     publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_*.tsv'
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
     input:
         tuple path(anndata), val(resolution)
@@ -462,7 +463,7 @@ process find_clusters {
 }
 
 process restore_unscaled {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
     input:
 	tuple path(anndata), path(normalise_internal_data)
@@ -482,7 +483,7 @@ process restore_unscaled {
 process find_markers {
     publishDir "${params.result_dir_path}/markers", mode: 'copy', pattern: 'markers_*.tsv'
     errorStrategy 'ignore'
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
     input:
 	tuple path(anndata), val(merged_group_slotname)
@@ -515,7 +516,7 @@ process run_umap {
 
     errorStrategy 'ignore'
 
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
     
     input:
         path anndata
@@ -557,7 +558,7 @@ process run_tsne {
 
     errorStrategy 'ignore'
     
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
     
     input:
         tuple path(anndata), val(perplexity_values)
@@ -591,7 +592,7 @@ process run_tsne {
 process make_project_file {
     publishDir params.result_dir_path, mode: 'copy'
 
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_container
 
     input:
         path neighbors

From 9c05d8e501bf7d17a75d131c55e630dd7680d1ac Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 20 Nov 2024 11:05:09 +0000
Subject: [PATCH 130/159] Update main.nf - rename `scanpy_container` to
 `scanpy_scripts_container`

---
 main.nf | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/main.nf b/main.nf
index 7942fbfa..02163de6 100644
--- a/main.nf
+++ b/main.nf
@@ -2,7 +2,7 @@
 
 nextflow.enable.dsl=2
 
-params.scanpy_container = "quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0"
+params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0"
 params.technology = "plate"
 params.batch_variable = ""
 params.representation = "X_pca"
@@ -114,7 +114,7 @@ process mergeGeneFiles {
 }
 
 process scanpy_read_10x {
-    container params.scanpy_container
+    container params.scanpy_scripts_container
     
     input:
         path matrix
@@ -143,7 +143,7 @@ process scanpy_read_10x {
 }
 
 process scanpy_multiplet_scrublet {
-    container params.scanpy_container
+    container params.scanpy_scripts_container
     
     input:
         path anndata
@@ -173,7 +173,7 @@ process scanpy_multiplet_scrublet {
 
 process scanpy_plot_scrublet {
     publishDir params.result_dir_path, mode: 'copy', pattern: '(scrublet.png)'
-    container params.scanpy_container
+    container params.scanpy_scripts_container
     
     input:
         path anndata
@@ -193,7 +193,7 @@ process scanpy_plot_scrublet {
 }
 
 process scanpy_filter_cells {
-    container params.scanpy_container
+    container params.scanpy_scripts_container
     
     input:
         path anndata
@@ -219,7 +219,7 @@ process scanpy_filter_genes {
     publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx'
     publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv'
     publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv'
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
     input:
         path anndata
@@ -249,7 +249,7 @@ process normalise_data {
     publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'matrix.mtx'
     publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'barcodes.tsv'
     publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'genes.tsv'
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
     input:
         path anndata
@@ -274,7 +274,7 @@ process normalise_data {
 }
 
 process normalise_internal_data {
-    container params.scanpy_container
+    container params.scanpy_scripts_container
     
     input:
         path anndata
@@ -294,7 +294,7 @@ process normalise_internal_data {
 }
 
 process find_variable_genes {
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
     input:
         path anndata
@@ -326,7 +326,7 @@ process find_variable_genes {
 }
 
 process run_pca {
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
     input:
         path anndata
@@ -349,7 +349,7 @@ process run_pca {
 }
 
 process harmony_batch {
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
     input:
         path anndata
@@ -379,7 +379,7 @@ process harmony_batch {
 }
 
 process neighbors {
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
     input:
         path anndata
@@ -406,7 +406,7 @@ process neighbors {
 }
 
 process neighbors_for_umap {
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
 
     input:
@@ -435,7 +435,7 @@ process neighbors_for_umap {
 
 process find_clusters {
     publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_*.tsv'
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
     input:
         tuple path(anndata), val(resolution)
@@ -463,7 +463,7 @@ process find_clusters {
 }
 
 process restore_unscaled {
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
     input:
 	tuple path(anndata), path(normalise_internal_data)
@@ -483,7 +483,7 @@ process restore_unscaled {
 process find_markers {
     publishDir "${params.result_dir_path}/markers", mode: 'copy', pattern: 'markers_*.tsv'
     errorStrategy 'ignore'
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
     input:
 	tuple path(anndata), val(merged_group_slotname)
@@ -516,7 +516,7 @@ process run_umap {
 
     errorStrategy 'ignore'
 
-    container params.scanpy_container
+    container params.scanpy_scripts_container
     
     input:
         path anndata
@@ -558,7 +558,7 @@ process run_tsne {
 
     errorStrategy 'ignore'
     
-    container params.scanpy_container
+    container params.scanpy_scripts_container
     
     input:
         tuple path(anndata), val(perplexity_values)
@@ -592,7 +592,7 @@ process run_tsne {
 process make_project_file {
     publishDir params.result_dir_path, mode: 'copy'
 
-    container params.scanpy_container
+    container params.scanpy_scripts_container
 
     input:
         path neighbors

From 05d3e7506067c596faea062eba7aab5bc1b6815e Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 20 Nov 2024 11:05:50 +0000
Subject: [PATCH 131/159] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 76d6284c..0fc91ff7 100644
--- a/README.md
+++ b/README.md
@@ -8,11 +8,11 @@ bash scripts/data_prep.sh <EXP-ID> [output path]
 ```
 ## Run for plate
 ```
-nextflow run main.nf --slurm -resume --dir_path <EXP-ID with path> [--output_path <PATH>]
+nextflow run main.nf --slurm -resume --dir_path <EXP-ID with path> [--output_path <PATH>]  [--scanpy_scripts_container <container_id>]
 ```
 ## Run for droplet
 ```
-nextflow run main.nf --slurm -resume --dir_path <EXP-ID with path> --technology droplet [--output_path <PATH>]
+nextflow run main.nf --slurm -resume --dir_path <EXP-ID with path> --technology droplet [--output_path <PATH>] [--scanpy_scripts_container <container_id>]
 ```
 
 If `[--output_path <PATH>]` is not specified results will be `<EXP-ID with path>/results` dir. 

From 290d17466d1b8161eeba567e30be9a015bc9f1b6 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 20 Nov 2024 11:36:00 +0000
Subject: [PATCH 132/159] Update main.nf - rename output dir

---
 main.nf | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/main.nf b/main.nf
index 02163de6..f54bc2b1 100644
--- a/main.nf
+++ b/main.nf
@@ -216,9 +216,9 @@ process scanpy_filter_cells {
 }
 
 process scanpy_filter_genes {
-    publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'matrix.mtx'
-    publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'barcodes.tsv'
-    publishDir "${params.result_dir_path}/matrices/scanpy_filter_genes", mode: 'copy', pattern: 'genes.tsv'
+    publishDir "${params.result_dir_path}/matrices/raw_filtered", mode: 'copy', pattern: 'matrix.mtx'
+    publishDir "${params.result_dir_path}/matrices/raw_filtered", mode: 'copy', pattern: 'barcodes.tsv'
+    publishDir "${params.result_dir_path}/matrices/raw_filtered", mode: 'copy', pattern: 'genes.tsv'
     container params.scanpy_scripts_container
 
     input:
@@ -246,9 +246,9 @@ process scanpy_filter_genes {
 }
 
 process normalise_data {
-    publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'matrix.mtx'
-    publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'barcodes.tsv'
-    publishDir "${params.result_dir_path}/matrices/normalise_data", mode: 'copy', pattern: 'genes.tsv'
+    publishDir "${params.result_dir_path}/matrices/filtered_normalised", mode: 'copy', pattern: 'matrix.mtx'
+    publishDir "${params.result_dir_path}/matrices/filtered_normalised", mode: 'copy', pattern: 'barcodes.tsv'
+    publishDir "${params.result_dir_path}/matrices/filtered_normalised", mode: 'copy', pattern: 'genes.tsv'
     container params.scanpy_scripts_container
 
     input:

From 021411d4a2974eeea633715b20e7a262e49df495 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 28 Nov 2024 11:19:29 +0000
Subject: [PATCH 133/159] Update main.nf

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index f54bc2b1..dc5c7568 100644
--- a/main.nf
+++ b/main.nf
@@ -164,7 +164,7 @@ process scanpy_multiplet_scrublet {
             scanpy-cli multiplet scrublet \
             --input-format 'anndata' \
             --output-format 'anndata' \
-            --batch-key "${params.batch_variable}" \
+            --batch-key "$batch_variable" \
             $anndata \
             scrublet.h5ad
         fi

From 4a2af1fd39f91b51db53ac174632f5ac708994f8 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 28 Nov 2024 11:20:59 +0000
Subject: [PATCH 134/159] Update main.nf

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index dc5c7568..62e09146 100644
--- a/main.nf
+++ b/main.nf
@@ -154,7 +154,7 @@ process scanpy_multiplet_scrublet {
 
     script:
     """
-        if [ -z "${params.batch_variable}" ]; then
+        if [ -z "$batch_variable" ]; then
             scanpy-cli multiplet scrublet \
             --input-format 'anndata' \
             --output-format 'anndata' \

From 5d568d97d2f79ffa2647a943d9c278ac25c6cf3d Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Fri, 29 Nov 2024 15:59:38 +0000
Subject: [PATCH 135/159] make scripts executable

---
 scripts/data_prep.sh        | 0
 scripts/restore_unscaled.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 scripts/data_prep.sh
 mode change 100644 => 100755 scripts/restore_unscaled.py

diff --git a/scripts/data_prep.sh b/scripts/data_prep.sh
old mode 100644
new mode 100755
diff --git a/scripts/restore_unscaled.py b/scripts/restore_unscaled.py
old mode 100644
new mode 100755

From cc29b1fcf54736aa1520722d96e61511f6b97d08 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 3 Dec 2024 15:09:25 +0000
Subject: [PATCH 136/159] Update main.nf - renaming tsne and umap tsvs

---
 main.nf | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/main.nf b/main.nf
index 62e09146..309bf6dc 100644
--- a/main.nf
+++ b/main.nf
@@ -512,7 +512,7 @@ process find_markers {
 }
 
 process run_umap {
-    publishDir "${params.result_dir_path}/umap", mode: 'copy', pattern: 'embeddings_neighbors_neighbors_*.tsv'
+    publishDir "${params.result_dir_path}/umap", mode: 'copy', pattern: 'umap_n_neighbors_*.tsv'
 
     errorStrategy 'ignore'
 
@@ -546,15 +546,14 @@ process run_umap {
             $anndata \
             --show-obj stdout \
             --output-format anndata \
-            "umap_\${n_number}.h5ad"  
-            # Not sure if following is needed
-            # && mv "embeddings_neighbors_n_neighbors_\${n_number}.tsv" embeddings.tsv
+            "umap_\${n_number}.h5ad" \
+            && mv "embeddings_neighbors_neighbors_\${n_number}.tsv" umap_n_neighbors_\${n_number}.tsv
 
     """
 }
 
 process run_tsne {
-    publishDir "${params.result_dir_path}/tsne", mode: 'copy', pattern: 'embeddings_perplexity_*\\.tsv'
+    publishDir "${params.result_dir_path}/tsne", mode: 'copy', pattern: 'tsne_perplexity_*\\.tsv'
 
     errorStrategy 'ignore'
     
@@ -583,9 +582,8 @@ process run_tsne {
             $anndata \
             --show-obj stdout \
             --output-format anndata \
-            'tsne_${perplexity_values}.h5ad'
-            # Not sure if following is needed
-            # && mv 'embeddings_perplexity_${perplexity_values}.tsv' embeddings.tsv
+            'tsne_${perplexity_values}.h5ad' \
+            && mv 'embeddings_perplexity_${perplexity_values}.tsv' 'tsne_perplexity_${perplexity_values}.tsv'
     """
 }
 

From 9b89d11321800f261ef1ef207f5dd2f4e1eae856 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 3 Dec 2024 15:58:16 +0000
Subject: [PATCH 137/159] Update main.nf - updates output names

---
 main.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index 309bf6dc..0a48ec6b 100644
--- a/main.nf
+++ b/main.nf
@@ -523,7 +523,7 @@ process run_umap {
 
     output:
         path "umap_*.h5ad"
-	path "embeddings_neighbors_neighbors_*.tsv"
+	path "umap_n_neighbors_*.tsv"
 
     script:
     """
@@ -547,7 +547,7 @@ process run_umap {
             --show-obj stdout \
             --output-format anndata \
             "umap_\${n_number}.h5ad" \
-            && mv "embeddings_neighbors_neighbors_\${n_number}.tsv" umap_n_neighbors_\${n_number}.tsv
+            && mv 'embeddings_neighbors_neighbors_\${n_number}.tsv' 'umap_n_neighbors_\${n_number}.tsv'
 
     """
 }
@@ -565,7 +565,7 @@ process run_tsne {
 
     output:
         path "tsne_${perplexity_values}.h5ad"
-	path "embeddings_perplexity_${perplexity_values}.tsv"
+	path "tsne_perplexity_${perplexity_values}.tsv"
 
     script:
     """

From 2d4e7e314a77992de53cd3e0b6d60f625006d9db Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 3 Dec 2024 16:39:30 +0000
Subject: [PATCH 138/159] Update main.nf

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 0a48ec6b..efef95c7 100644
--- a/main.nf
+++ b/main.nf
@@ -547,7 +547,7 @@ process run_umap {
             --show-obj stdout \
             --output-format anndata \
             "umap_\${n_number}.h5ad" \
-            && mv 'embeddings_neighbors_neighbors_\${n_number}.tsv' 'umap_n_neighbors_\${n_number}.tsv'
+            && mv "embeddings_neighbors_\${n_number}.tsv" umap_n_\${n_number}.tsv
 
     """
 }

From de3c4f4b8bdc770e76c23fa3184fd2bf9196b1c9 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 3 Dec 2024 19:52:29 +0000
Subject: [PATCH 139/159] Update main.nf - updates cluster file name to match
 existing and parsing script

---
 main.nf | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index efef95c7..c0beb618 100644
--- a/main.nf
+++ b/main.nf
@@ -434,7 +434,7 @@ process neighbors_for_umap {
 }
 
 process find_clusters {
-    publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_*.tsv'
+    publishDir "${params.result_dir_path}/clusters", mode: 'copy', pattern: 'clusters_resolution_*.tsv'
     container params.scanpy_scripts_container
 
     input:
@@ -458,7 +458,7 @@ process find_clusters {
         --output-format anndata \
         'clusters_${resolution}.h5ad'
 	
-	mv 'output.tsv' 'clusters_${resolution}.tsv'
+	mv 'output.tsv' 'clusters_resolution_${resolution}.tsv'
     """
 }
 
@@ -507,7 +507,7 @@ process find_markers {
 	$anndata  \
 	--show-obj stdout \
 	--output-format anndata \
-	'markers_${merged_group_slotname}.h5ad'
+	'markers_${merged_group_slotname}.h5ad' \
     """
 }
 

From c300446f340fd22aad5ac47e59d7c02413a4c543 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 3 Dec 2024 20:38:44 +0000
Subject: [PATCH 140/159] Update main.nf - fixes cluster output

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index c0beb618..4d31fd07 100644
--- a/main.nf
+++ b/main.nf
@@ -441,7 +441,7 @@ process find_clusters {
         tuple path(anndata), val(resolution)
     output:
         path "clusters_${resolution}.h5ad"
-	path "clusters_${resolution}.tsv"
+	path "clusters_resolution_${resolution}.tsv"
 
     script:
     """

From 651119a3e797322b897f2983153b8f1cba781b67 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Wed, 4 Dec 2024 21:48:39 +0000
Subject: [PATCH 141/159] Update main.nf - changes marker file name

---
 main.nf | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 4d31fd07..bb89d4c2 100644
--- a/main.nf
+++ b/main.nf
@@ -490,10 +490,15 @@ process find_markers {
 
     output:
 	path "markers_${merged_group_slotname}.h5ad"
-	path "markers_${merged_group_slotname}.tsv"
+	path "markers_*.tsv"
 
     script:
     """
+	VAR="$merged_group_slotname"
+	PREFIX={params.slotname}
+	n_number="\${VAR/_\$PREFIX/}"
+	echo \$n_number
+
 	scanpy-find-markers \
 	--save 'markers_${merged_group_slotname}.tsv' \
 	--n-genes '100' \
@@ -508,6 +513,7 @@ process find_markers {
 	--show-obj stdout \
 	--output-format anndata \
 	'markers_${merged_group_slotname}.h5ad' \
+	&& 'markers_${merged_group_slotname}.tsv' 'markers_\${n_number}.tsv'
     """
 }
 

From a69967fc4458f89617ce0a9f1c726a6ca07bd7e5 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 5 Dec 2024 08:05:12 +0000
Subject: [PATCH 142/159] Update main.nf - adds log

---
 main.nf | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index bb89d4c2..813feccb 100644
--- a/main.nf
+++ b/main.nf
@@ -496,6 +496,8 @@ process find_markers {
     """
 	VAR="$merged_group_slotname"
 	PREFIX={params.slotname}
+	echo \$VAR
+	echo \$PREFIX
 	n_number="\${VAR/_\$PREFIX/}"
 	echo \$n_number
 
@@ -513,7 +515,7 @@ process find_markers {
 	--show-obj stdout \
 	--output-format anndata \
 	'markers_${merged_group_slotname}.h5ad' \
-	&& 'markers_${merged_group_slotname}.tsv' 'markers_\${n_number}.tsv'
+	&& mv 'markers_${merged_group_slotname}.tsv' 'markers_\${n_number}.tsv'
     """
 }
 

From af972a3a1a68e80d46f639e31f5f40bc95b2bf7b Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 5 Dec 2024 09:42:33 +0000
Subject: [PATCH 143/159] Update main.nf - fixes marker rename

---
 main.nf | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/main.nf b/main.nf
index 813feccb..89fc3e65 100644
--- a/main.nf
+++ b/main.nf
@@ -495,11 +495,11 @@ process find_markers {
     script:
     """
 	VAR="$merged_group_slotname"
-	PREFIX={params.slotname}
-	echo \$VAR
-	echo \$PREFIX
-	n_number="\${VAR/_\$PREFIX/}"
-	echo \$n_number
+        PREFIX="${params.slotname}_"
+        echo \$VAR
+        echo \$PREFIX
+        n_number="\${VAR#\$PREFIX}"
+        echo \$n_number
 
 	scanpy-find-markers \
 	--save 'markers_${merged_group_slotname}.tsv' \
@@ -514,8 +514,8 @@ process find_markers {
 	$anndata  \
 	--show-obj stdout \
 	--output-format anndata \
-	'markers_${merged_group_slotname}.h5ad' \
-	&& mv 'markers_${merged_group_slotname}.tsv' 'markers_\${n_number}.tsv'
+	"markers_${merged_group_slotname}.h5ad" \
+        && mv "markers_${merged_group_slotname}.tsv" "markers_\${n_number}.tsv"
     """
 }
 

From df76b9f65bf856870013c5f73b94069324a8f325 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Thu, 5 Dec 2024 13:31:56 +0000
Subject: [PATCH 144/159] Update main.nf - changes marker tsv name

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 89fc3e65..eaadf080 100644
--- a/main.nf
+++ b/main.nf
@@ -515,7 +515,7 @@ process find_markers {
 	--show-obj stdout \
 	--output-format anndata \
 	"markers_${merged_group_slotname}.h5ad" \
-        && mv "markers_${merged_group_slotname}.tsv" "markers_\${n_number}.tsv"
+        && mv "markers_${merged_group_slotname}.tsv" "markers_resolution_\${n_number}.tsv"
     """
 }
 

From 98ae08ee30553438266632aef60623f09cb60a3e Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <17606346+irisdianauy@users.noreply.github.com>
Date: Fri, 6 Dec 2024 18:05:57 +0800
Subject: [PATCH 145/159] Update main.nf - correct key added in run_umap

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index eaadf080..b50b1168 100644
--- a/main.nf
+++ b/main.nf
@@ -540,7 +540,7 @@ process run_umap {
 	echo \$n_number
 	scanpy-run-umap \
             --neighbors-key "neighbors_n_\${n_number}" \
-            --key-added "neighbors_\${n_number}" \
+            --key-added "neighbors_n_\${n_number}" \
             --export-embedding embeddings.tsv \
             --n-components 2 \
             --min-dist 0.5 \

From 6cb9451c59d7fb84268ab9aeb9e0d1752e9a8aef Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
Date: Fri, 6 Dec 2024 10:49:52 +0000
Subject: [PATCH 146/159] add _n in embeddings_neighbors_n_

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index b50b1168..3b187b32 100644
--- a/main.nf
+++ b/main.nf
@@ -555,7 +555,7 @@ process run_umap {
             --show-obj stdout \
             --output-format anndata \
             "umap_\${n_number}.h5ad" \
-            && mv "embeddings_neighbors_\${n_number}.tsv" umap_n_\${n_number}.tsv
+            && mv "embeddings_neighbors_n_\${n_number}.tsv" umap_n_\${n_number}.tsv
 
     """
 }

From c24f6e1a93f8a2f5fd0201abbebb78fb07c99736 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 10 Dec 2024 12:16:43 +0000
Subject: [PATCH 147/159] Update main.nf - adding log info

---
 main.nf | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/main.nf b/main.nf
index 3b187b32..50bd5cfd 100644
--- a/main.nf
+++ b/main.nf
@@ -30,6 +30,8 @@ resolution_values: ${params.resolution_values}
 slotname: ${params.slotname}
 clustering_slotname: ${params.clustering_slotname}
 merged_group_slotname: ${params.merged_group_slotname}
+batch_variable: ${params.batch_variable}
+representation: ${params.representation}
 ===============================
 """
 

From 0d3248c93e790eeb9f311842bf77ac2516f16e39 Mon Sep 17 00:00:00 2001
From: Anil Thanki <thanki.anil@gmail.com>
Date: Tue, 10 Dec 2024 12:30:39 +0000
Subject: [PATCH 148/159] Update main.nf - filter cell process n_counts based
 on technology

---
 main.nf | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 50bd5cfd..40017769 100644
--- a/main.nf
+++ b/main.nf
@@ -206,8 +206,13 @@ process scanpy_filter_cells {
 
     script:
     """
-        scanpy-filter-cells --gene-name 'gene_symbols' \
-        --param 'c:n_counts' 750.0 1000000000.0 \
+        n_counts=1500
+	if [[ -n "$category" ]]; then
+            n_counts=750
+        fi
+
+	scanpy-filter-cells --gene-name 'gene_symbols' \
+        --param 'c:n_counts' \$n_counts 1000000000.0 \
         --param 'c:pct_counts_mito' 0.0 0.35 \
         --input-format 'anndata' $anndata \
         --show-obj stdout \

From 7e61db3922a1cdd51875450d9e794466606e61ab Mon Sep 17 00:00:00 2001
From: Iris Diana Yu <irisyu@ebi.ac.uk>
Date: Wed, 11 Dec 2024 09:14:29 +0000
Subject: [PATCH 149/159] Add process scale_data

---
 main.nf | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index 40017769..5bb76ccc 100644
--- a/main.nf
+++ b/main.nf
@@ -332,6 +332,26 @@ process find_variable_genes {
     """
 }
 
+process scale_data {
+    // Runs scanpy-scale-data on the input AnnData file and emits 'scaled_anndata.h5ad'.
+    container params.scanpy_scripts_container
+
+    input:
+        path anndata
+
+    output:
+        path 'scaled_anndata.h5ad'
+
+    script:
+    """
+        scanpy-scale-data \
+        --input-format "anndata" \
+        --output-format "anndata" \
+        $anndata \
+        'scaled_anndata.h5ad'
+    """
+}
+
 process run_pca {
     container params.scanpy_scripts_container
 
@@ -710,9 +730,21 @@ workflow {
         normalise_internal_data.out,
         params.batch_variable
     )
-    run_pca(
-        find_variable_genes.out
-    )
+
+    if ( params.technology == "droplet" ) {
+        scale_data(
+            find_variable_genes.out
+        )
+        run_pca(
+            scale_data.out
+        )
+    }
+    else {
+        run_pca(
+            find_variable_genes.out
+        )
+    }
+
     harmony_batch(
         run_pca.out,
         params.batch_variable

From 28064664eb6df7e73cc6e2401a615ef1f79d700b Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
Date: Thu, 12 Dec 2024 12:12:17 +0000
Subject: [PATCH 150/159] reorder params.neighbor_values

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 5bb76ccc..1f22ad11 100644
--- a/main.nf
+++ b/main.nf
@@ -9,7 +9,7 @@ params.representation = "X_pca"
 params.dir_path = "."
 params.result_dir_path = params.output_path ?: params.dir_path + "/results"
 params.celltype_field = 'NO_CELLTYPE_FIELD'
-params.neighbor_values = ['10', '100', '15', '20', '25', '3', '30', '5', '50']
+params.neighbor_values = ['3', '5', '10', '15', '20', '25', '30', '50', '100']
 params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40', '45', '50']
 params.resolution_values = ['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']
 params.slotname = "louvain_resolution"

From d420d3994344e110aa503d6b9a927913b535b906 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
Date: Thu, 12 Dec 2024 15:29:55 +0000
Subject: [PATCH 151/159] Update main.nf - set PYTHONIOENCODING in run_umap

---
 main.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.nf b/main.nf
index 1f22ad11..b3104978 100644
--- a/main.nf
+++ b/main.nf
@@ -562,6 +562,7 @@ process run_umap {
 
     script:
     """
+        PYTHONIOENCODING=utf-8
 	VAR="$anndata"
 	n_number="\${VAR%.h5ad}"
 	echo \$n_number

From 38766ea3d8c3ccde8f854ec0d5800aee027d4c04 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Thu, 12 Dec 2024 15:43:08 +0000
Subject: [PATCH 152/159] set env variable PYTHONIOENCODING = 'utf-8' for all
 processes

---
 main.nf         | 6 +++---
 nextflow.config | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/main.nf b/main.nf
index b3104978..109a6cbc 100644
--- a/main.nf
+++ b/main.nf
@@ -558,11 +558,11 @@ process run_umap {
 
     output:
         path "umap_*.h5ad"
-	path "umap_n_neighbors_*.tsv"
+        path "umap_n_neighbors_*.tsv"
 
     script:
     """
-        PYTHONIOENCODING=utf-8
+	echo \$PYTHONIOENCODING
 	VAR="$anndata"
 	n_number="\${VAR%.h5ad}"
 	echo \$n_number
@@ -659,7 +659,7 @@ process make_project_file {
                 echo "\${count}"
         done
         python ${projectDir}/scripts/final_project.py
-	mv output.h5 project.h5ad
+        mv output.h5 project.h5ad
     """
 }
 
diff --git a/nextflow.config b/nextflow.config
index e0bd55d1..8170742f 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -7,6 +7,7 @@ process {
     queueSize=500
     exitReadTimeout='100000 sec'
     pollInterval = '5sec'
+    env.PYTHONIOENCODING = 'utf-8'
     // error strategy
     // errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
     // memory = { 4.GB * 2 ^task.attempt }

From 18a690e0d8f5379c5091d4cd0c295707069bc5cc Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Thu, 12 Dec 2024 16:13:10 +0000
Subject: [PATCH 153/159] fix env PYTHONIOENCODING = 'utf-8'

---
 nextflow.config | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index 8170742f..9a1f491e 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,3 +1,7 @@
+env {
+    PYTHONIOENCODING = 'utf-8'
+}
+
 process {
     executor='slurm'
     queue="$SCXA_HPC_QUEUE"
@@ -7,7 +11,6 @@ process {
     queueSize=500
     exitReadTimeout='100000 sec'
     pollInterval = '5sec'
-    env.PYTHONIOENCODING = 'utf-8'
     // error strategy
     // errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }
     // memory = { 4.GB * 2 ^task.attempt }

From 8aece46179d1ffb6da871d658594a305904d6cf1 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
Date: Thu, 12 Dec 2024 16:33:33 +0000
Subject: [PATCH 154/159] Update main.nf - export PYTHONIOENCODING in run_umap

---
 main.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/main.nf b/main.nf
index 109a6cbc..d959fa6f 100644
--- a/main.nf
+++ b/main.nf
@@ -562,6 +562,7 @@ process run_umap {
 
     script:
     """
+	export PYTHONIOENCODING='utf-8'
 	echo \$PYTHONIOENCODING
 	VAR="$anndata"
 	n_number="\${VAR%.h5ad}"

From 931231157fbcfbcf082d2ff83b7223cc22965a63 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Fri, 13 Dec 2024 15:53:31 +0000
Subject: [PATCH 155/159] ensure PYTHONIOENCODING='utf-8' on each scanpy-script
 command

---
 main.nf         | 31 +++++++++++++++++++++++++------
 nextflow.config |  4 ----
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/main.nf b/main.nf
index d959fa6f..a630d414 100644
--- a/main.nf
+++ b/main.nf
@@ -14,7 +14,7 @@ params.perplexity_values = ['1', '5', '10', '15', '20', '25', '30', '35', '40',
 params.resolution_values = ['0.1', '0.3', '0.5', '0.7', '1.0', '2.0', '3.0', '4.0', '5.0']
 params.slotname = "louvain_resolution"
 params.clustering_slotname = params.resolution_values.collect { params.slotname + "_" + it }
-params.merged_group_slotname = params.clustering_slotname + params.celltype_field
+params.merged_group_slotname = params.clustering_slotname.collect { it + params.celltype_field }
 
 log.info """
 ===============================
@@ -133,6 +133,8 @@ process scanpy_read_10x {
         #ln -s $matrix matrix.mtx
         ln -s $genes genes.tsv
         #ln -s $barcodes barcodes.tsv
+
+        export PYTHONIOENCODING='utf-8'
         
         scanpy-read-10x --input-10x-mtx ./ \
         --var-names 'gene_ids' \
@@ -156,6 +158,7 @@ process scanpy_multiplet_scrublet {
 
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         if [ -z "$batch_variable" ]; then
             scanpy-cli multiplet scrublet \
             --input-format 'anndata' \
@@ -185,6 +188,7 @@ process scanpy_plot_scrublet {
 
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         scanpy-cli plot scrublet \
         --input-format "anndata" \
         --scale-hist-obs "linear" \
@@ -206,11 +210,12 @@ process scanpy_filter_cells {
 
     script:
     """
-        n_counts=1500
-	if [[ -n "$category" ]]; then
-            n_counts=750
-        fi
+    n_counts=1500
+    if [[ -n "$category" ]]; then
+        n_counts=750
+    fi
 
+    export PYTHONIOENCODING='utf-8'
 	scanpy-filter-cells --gene-name 'gene_symbols' \
         --param 'c:n_counts' \$n_counts 1000000000.0 \
         --param 'c:pct_counts_mito' 0.0 0.35 \
@@ -240,6 +245,7 @@ process scanpy_filter_genes {
 
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         scanpy-filter-genes \
         --param 'g:n_cells' 3.0 1000000000.0 \
         --subset 'g:index' \
@@ -269,6 +275,7 @@ process normalise_data {
 
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         scanpy-normalise-data \
         --no-log-transform \
         --normalize-to '1000000.0' \
@@ -291,6 +298,7 @@ process normalise_internal_data {
 
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         scanpy-normalise-data \
         --normalize-to '1000000.0' \
         --input-format 'anndata' $anndata \
@@ -317,7 +325,7 @@ process find_variable_genes {
             batch_variable_tag="--batch-key $batch_variable"
         fi
 
-
+        export PYTHONIOENCODING='utf-8'
         scanpy-find-variable-genes \
         --flavor 'seurat' \
         --mean-limits 0.0125 1000000000.0 \
@@ -343,6 +351,7 @@ process scale_data {
 
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         scanpy-scale-data \
         --input-format "anndata" \
         --output-format "anndata" \
@@ -363,6 +372,7 @@ process run_pca {
 
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         scanpy-run-pca \
         --no-zero-center \
         --svd-solver 'arpack' \
@@ -386,6 +396,7 @@ process harmony_batch {
 
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         if [[ -n "$batch_variable" ]]; then
             scanpy-integrate harmony \
             --batch-key $batch_variable \
@@ -416,6 +427,7 @@ process neighbors {
 
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         scanpy-neighbors \
         --n-neighbors 15 \
         --method 'umap' \
@@ -443,6 +455,7 @@ process neighbors_for_umap {
         path "neighbors_${n_neighbors}.h5ad"
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         scanpy-neighbors \
             --n-neighbors $n_neighbors \
             --key-added 'neighbors_n_neighbors_${n_neighbors}' \
@@ -472,6 +485,7 @@ process find_clusters {
 
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         scanpy-find-cluster louvain \
         --neighbors-key 'neighbors' \
         --key-added 'louvain_resolution_${resolution}' \
@@ -500,6 +514,7 @@ process restore_unscaled {
 
     script:
     """
+	export PYTHONIOENCODING='utf-8'
 	ln -s $anndata input.h5
 	ln -s $normalise_internal_data r_source.h5
 	python ${projectDir}/scripts/restore_unscaled.py
@@ -528,6 +543,8 @@ process find_markers {
         n_number="\${VAR#\$PREFIX}"
         echo \$n_number
 
+    export PYTHONIOENCODING='utf-8'
+
 	scanpy-find-markers \
 	--save 'markers_${merged_group_slotname}.tsv' \
 	--n-genes '100' \
@@ -606,6 +623,7 @@ process run_tsne {
 
     script:
     """
+            export PYTHONIOENCODING='utf-8'
             scanpy-run-tsne \
             --use-rep $representation \
             --export-embedding embeddings.tsv \
@@ -640,6 +658,7 @@ process make_project_file {
         path "project.h5ad"
     script:
     """
+        export PYTHONIOENCODING='utf-8'
         ln -s $neighbors input.h5
         ln -s $scanpy_read_10x r_source.h5
         ln -s '$filter_genes' x_source_0.h5
diff --git a/nextflow.config b/nextflow.config
index 9a1f491e..e0bd55d1 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,7 +1,3 @@
-env {
-    PYTHONIOENCODING = 'utf-8'
-}
-
 process {
     executor='slurm'
     queue="$SCXA_HPC_QUEUE"

From 27503b59c0a31611252556aafd4d844f75eb33f3 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Wed, 18 Dec 2024 11:54:44 +0000
Subject: [PATCH 156/159] upgrade scanpy-scripts

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index a630d414..afe544ef 100644
--- a/main.nf
+++ b/main.nf
@@ -2,7 +2,7 @@
 
 nextflow.enable.dsl=2
 
-params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0"
+params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.9.301--pyhdfd78af_0"
 params.technology = "plate"
 params.batch_variable = ""
 params.representation = "X_pca"
@@ -341,7 +341,7 @@ process find_variable_genes {
 }
 
 process scale_data {
-    container 'quay.io/biocontainers/scanpy-scripts:1.1.6--pypyhdfd78af_0'
+    container params.scanpy_scripts_container
 
     input:
         path anndata

From ecdddb71572a905862ee8e17b9a3f17a75f5bd76 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Wed, 18 Dec 2024 14:12:45 +0000
Subject: [PATCH 157/159] pin production version of scanpy-scripts

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index afe544ef..54176bd0 100644
--- a/main.nf
+++ b/main.nf
@@ -2,7 +2,7 @@
 
 nextflow.enable.dsl=2
 
-params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.9.301--pyhdfd78af_0"
+params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.1.2--pypyhdfd78af_1"
 params.technology = "plate"
 params.batch_variable = ""
 params.representation = "X_pca"

From c2a285200fe85c5ecd0252167f7587deec223a8b Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <pmadrigal@ebi.ac.uk>
Date: Thu, 19 Dec 2024 17:54:39 +0000
Subject: [PATCH 158/159] upgrade scanpy-scripts to 1.9.301

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 54176bd0..afe544ef 100644
--- a/main.nf
+++ b/main.nf
@@ -2,7 +2,7 @@
 
 nextflow.enable.dsl=2
 
-params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.1.2--pypyhdfd78af_1"
+params.scanpy_scripts_container = "quay.io/biocontainers/scanpy-scripts:1.9.301--pyhdfd78af_0"
 params.technology = "plate"
 params.batch_variable = ""
 params.representation = "X_pca"

From ef03cdacd9d843c36e707a935d6147deb3691d06 Mon Sep 17 00:00:00 2001
From: Pedro Madrigal <8195212+pmb59@users.noreply.github.com>
Date: Fri, 20 Dec 2024 15:33:08 +0000
Subject: [PATCH 159/159] specify a Singularity cache directory

---
 nextflow.config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow.config b/nextflow.config
index e0bd55d1..9f892942 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -15,7 +15,7 @@ process {
 
 singularity {
     enabled = true
-    // cacheDir = "$SCXA_SINGULARITY_CACHE"
+    cacheDir = "$SCXA_SINGULARITY_CACHE"
 }
 
 conda {