From 72d0df74641acb9a582ac9938a95f3d696253f06 Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pablo.moreno@astrazeneca.com>
Date: Thu, 24 Aug 2023 14:50:06 +0100
Subject: [PATCH 1/8] First working versions of scanpy qc metrics script

---
 .../scanpy/scripts/sc_qc_metrics.py           | 323 ++++++++++++++++++
 1 file changed, 323 insertions(+)
 create mode 100644 tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py

diff --git a/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py b/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py
new file mode 100644
index 00000000..6834b463
--- /dev/null
+++ b/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py
@@ -0,0 +1,323 @@
+import argparse
+
+import matplotlib.pyplot as plt
+import scanpy as sc
+import seaborn as sns
+
+
+def main():
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(
+        description="Generate quality control metrics for single-cell RNA-seq."
+    )
+    parser.add_argument("adata_file", type=str, help="Path to AnnData object file")
+    parser.add_argument(
+        "sample_field", type=str, help="Field in the obs for the sample identifier"
+    )
+    parser.add_argument(
+        "--output_format",
+        type=str,
+        default="pdf",
+        choices=["pdf", "png"],
+        help="Output format of the plots (default: pdf)",
+    )
+    parser.add_argument(
+        "--plot_size",
+        type=float,
+        nargs=2,
+        metavar=("width", "height"),
+        help="Size of the plots",
+    )
+    parser.add_argument(
+        "--percent_mito_field",
+        type=str,
+        default="pct_counts_mito",
+        help="Field in the obs for the percentage of mitochondrial genes",
+    )
+    parser.add_argument(
+        "--percent_ribo_field",
+        type=str,
+        default="pct_counts_ribo",
+        help="Field in the obs for the percentage of ribosomal genes",
+    )
+    parser.add_argument(
+        "--ribo_field",
+        type=str,
+        default="ribo",
+        help="Field in the var for marking ribosomal genes",
+    )
+    parser.add_argument(
+        "--mito_field",
+        type=str,
+        default="mito",
+        help="Field in the var for marking mitochondrial genes",
+    )
+    parser.add_argument(
+        "--doublet_score_field",
+        type=str,
+        default="doublet_score",
+        help="Field in the obs for the doublet score",
+    )
+    args = parser.parse_args()
+
+    # Load AnnData object
+    adata = sc.read(args.adata_file)
+
+    # Set plot size if provided
+    if args.plot_size:
+        sc.settings.figsize = tuple(args.plot_size)
+
+    # Set output format
+    sc.settings.set_figure_params(format=args.output_format)
+
+    run_quality_control = False
+    if "n_genes_by_counts" not in adata.obs.columns:
+        run_quality_control = True
+    if "n_counts" not in adata.obs.columns:
+        run_quality_control = True
+
+    qc_vars = []
+    # calculate mitochondrial genes if not provided
+    if args.percent_mito_field not in adata.obs.columns:
+        qc_vars.append(args.mito_field)
+    # calculate ribo metrics if not provided
+    if args.percent_ribo_field not in adata.obs.columns:
+        qc_vars.append(args.ribo_field)
+
+    if len(qc_vars) > 0 or run_quality_control:
+        sc.pp.calculate_qc_metrics(
+            adata,
+            qc_vars=qc_vars,
+            log1p=True,
+            inplace=True,
+        )
+        adata.obs["n_counts"] = adata.obs["total_counts"]
+        adata.obs["n_genes"] = adata.obs["n_genes_by_counts"]
+        adata.var["n_counts"] = adata.var["total_counts"]
+        adata.var["n_cells"] = adata.var["n_cells_by_counts"]
+    # General quality for whole dataset
+    plt.figure()
+    ax = sc.pl.violin(
+        adata,
+        [
+            "n_genes_by_counts",
+            "total_counts",
+            args.percent_mito_field,
+            args.percent_ribo_field,
+        ],
+        jitter=False,
+        multi_panel=True,
+        show=False,
+    )
+    # ax.set_title("General QC")
+    plt.savefig(f"general.{args.output_format}", bbox_inches="tight")
+    plt.close()
+
+    # Generate quality control plots
+    generate_violin_plots(
+        adata, args.sample_field, args.percent_mito_field, format=args.output_format
+    )
+    generate_scatter_plot(
+        adata,
+        args.sample_field,
+        percent_mito_field=args.percent_mito_field,
+    )
+    if args.doublet_score_field in adata.obs.columns:
+        generate_doublet_plot(
+            adata,
+            args.sample_field,
+            double_score_field=args.doublet_score_field,
+            format=args.output_format,
+        )
+    else:
+        print("Doublet score field provided not in adata.obs.columns, skipping plot.")
+    generate_complexity_plot(adata, args.sample_field, format=args.output_format)
+    # generate_scatter_by_sample(
+    #     adata,
+    #     sample_field=args.sample_field,
+    #     format=args.output_format,
+    #     percent_mito_field=args.percent_mito_field,
+    # )
+
+
+def generate_violin_plots(
+    adata,
+    sample_field,
+    percent_mito_field="percent_mito",
+    format="pdf",
+    gene_symbols_field="gene_symbols",
+):
+    # Number of counts per cell
+    plt.figure()
+    ax = plt.gca()
+    sc.pl.violin(
+        adata,
+        "n_counts",
+        ax=ax,
+        groupby=sample_field,
+        title="Number of Counts per Cell (Separated by Sample)",
+        show=False,
+    )
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
+    plt.savefig(f"n_counts_per_cell.{format}", bbox_inches="tight")
+    plt.close()
+
+    # Number of genes per cell
+    plt.figure()
+    ax = plt.gca()
+    sc.pl.violin(
+        adata,
+        "n_genes",
+        groupby=sample_field,
+        ax=ax,
+        title="Number of Genes per Cell (Separated by Sample)",
+        show=False
+        # show=True,
+        # save="_n_genes_per_cell",
+    )
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
+    plt.savefig(f"n_genes_per_cell.{format}", bbox_inches="tight")
+    plt.close()
+
+    # Percentage of mitochondrial genes per cell
+    plt.figure()
+    ax = plt.gca()
+    sc.pl.violin(
+        adata,
+        percent_mito_field,
+        groupby=sample_field,
+        ax=ax,
+        title="Percentage of Mitochondrial Genes per Cell (Separated by Sample)",
+        show=False,
+    )
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
+    plt.savefig(f"percent_mito_per_cell.{format}", bbox_inches="tight")
+    plt.close()
+
+    # highest expressed genes per cell
+    plt.figure()
+    ax = sc.pl.highest_expr_genes(
+        adata, n_top=30, gene_symbols=gene_symbols_field, show=False
+    )
+    # set title of ax
+    ax.set_title(f"Highest expressed genes per cell (Separated by Sample)")
+    plt.savefig(f"highest_expr_genes.{format}", bbox_inches="tight")
+    plt.close()
+
+    for sample in adata.obs[sample_field].unique():
+        plt.figure()
+        ax = sc.pl.highest_expr_genes(
+            adata[adata.obs[sample_field] == sample],
+            n_top=30,
+            gene_symbols=gene_symbols_field,
+            show=False,
+        )
+        ax.set_title(f"Highest expressed genes {sample}")
+        # sanitise sample for filename
+        sample_fn = sample.replace(" ", "_")
+        # generate filename based on sample for plot
+        plt.savefig(f"highest_expr_genes_{sample_fn}.{format}", bbox_inches="tight")
+        plt.close()
+
+
+def generate_scatter_plot(
+    adata,
+    sample_field,
+    percent_mito_field="percent_mito",
+):
+    # Scatter plot of UMIs vs genes detected
+    plt.figure()
+    sc.pl.scatter(
+        adata,
+        x="n_counts",
+        y="n_genes",
+        color=sample_field,
+        title="UMIs vs Genes Detected (Separated by Sample)",
+        save="_umi_vs_genes_detected",
+        show=False,
+    )
+    plt.close()
+
+    # UMIs vs genes detected scatterplot, colored by mitochondrial gene ratio
+    plt.figure()
+    sc.pl.scatter(
+        adata,
+        x="n_counts",
+        y="n_genes",
+        color=percent_mito_field,
+        title="UMIs vs Genes Detected (Colored by Mitochondrial Gene Ratio)",
+        save="_umi_vs_genes_detected_colored_by_mito",
+        show=False,
+    )
+    plt.close()
+
+
+def generate_scatter_by_sample(
+    adata, sample_field, percent_mito_field="percent_mito", format="pdf"
+):
+    """Save stacked per-sample scatter plots of n_counts vs n_genes."""
+    sample_ids = adata.obs[sample_field].unique()
+    num_samples = len(sample_ids)
+    # NOTE(review): sc.pl.scatter gets no ax=, so it may not draw into the subplots made here -- confirm
+    plt.figure(figsize=(10, 6 * num_samples))
+    for idx, sample_id in enumerate(sample_ids, 1):
+        plt.subplot(num_samples, 1, idx)
+        adata_sample = adata[adata.obs[sample_field] == sample_id]
+        sc.pl.scatter(
+            adata_sample,
+            x="n_counts",
+            y="n_genes",
+            title=f"Sample {sample_id}: UMI vs Genes Detected",
+            color=percent_mito_field,
+            show=False,
+        )
+        plt.title(f"Sample {sample_id}: UMI vs Genes Detected")
+        plt.xlabel("UMIs")
+        plt.ylabel("Genes Detected")
+    plt.savefig(f"n_counts_n_genes_by_sample.{format}")
+    plt.close()
+
+
+def generate_doublet_plot(
+    adata,
+    sample_field,
+    double_score_field="doublet_score",
+    format="pdf",
+):
+    """Save a per-sample violin plot of the doublet score column."""
+    plt.figure()
+    ax = plt.gca()
+    sc.pl.violin(
+        adata,
+        double_score_field,
+        groupby=sample_field,
+        ax=ax,
+        title="Doublet score distribution (Separated by Sample)",
+        # drawn on `ax` without showing; the file is written explicitly below
+        show=False,
+    )
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
+    plt.savefig(f"doublet_ratio.{format}", bbox_inches="tight")
+    plt.close()
+
+
+def generate_complexity_plot(adata, sample_field, format="pdf"):
+    """Save a per-sample violin plot of ``log1p_n_genes_by_counts``."""
+    plt.figure()
+    ax = plt.gca()
+    sc.pl.violin(
+        adata,
+        "log1p_n_genes_by_counts",
+        groupby=sample_field,
+        ax=ax,
+        # the plotted column is log1p(genes detected), not log10 genes per UMI
+        title="Complexity Distribution (log1p Genes Detected)",
+        show=False,
+    )
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
+    plt.savefig(f"complexity_distribution.{format}", bbox_inches="tight")
+    plt.close()
+
+
+if __name__ == "__main__":
+    main()

From 54bf7c26f985b0f28c568841d86d04e041b9154b Mon Sep 17 00:00:00 2001
From: ajroura22 <adria.roura1@gmail.com>
Date: Mon, 6 Nov 2023 14:55:55 +0000
Subject: [PATCH 2/8] initial commit for the scan qc wrapper

---
 .../scanpy/scanpy-qc-plots.xml                | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml

diff --git a/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml b/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
new file mode 100644
index 00000000..f97c5d4e
--- /dev/null
+++ b/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
@@ -0,0 +1,69 @@
+<?xml version="1.0"?>
+<tool id="scRNAseq_qc_tool" name="scRNAseq Quality Control Tool" version="1.0.0" engine="python" hidden="false">
+    <description>Generate quality control metrics for single-cell RNA-seq data.</description>
+    <requirements>
+        <requirement type="package" version="1.1.3">scanpy-scripts</requirement>
+    </requirements>
+    <command detect_errors="exit_code">
+        <![CDATA[
+        #!/bin/bash
+        python $__tool_directory__/scripts/sc_qc_metrics.py "$adata_file" "$sample_field" \
+            --output_format "$output_format" \
+            --plot_size "$plot_size" \
+            --percent_mito_field "$percent_mito_field" \
+            --percent_ribo_field "$percent_ribo_field" \
+            --ribo_field "$ribo_field" \
+            --mito_field "$mito_field" \
+            --doublet_score_field "$doublet_score_field"
+        ]]>
+    </command>
+    <inputs>
+        <param type="data" format="txt" name="adata_file" label="AnnData object file" />
+        <param type="text" name="sample_field" label="Sample Field" />
+        <param type="select" name="output_format" label="Output Format">
+            <option value="pdf">PDF</option>
+            <option value="png">PNG</option>
+        </param>
+        <param type="text" name="plot_size" label="Plot Size (Width Height)" />
+        <param type="text" name="percent_mito_field" label="Mitochondrial Gene Field" />
+        <param type="text" name="percent_ribo_field" label="Ribosomal Gene Field" />
+        <param type="text" name="ribo_field" label="Ribo Field" />
+        <param type="text" name="mito_field" label="Mito Field" />
+        <param type="text" name="doublet_score_field" label="Doublet Score Field" />
+    </inputs>
+    <outputs>
+        <data name="general_qc_plots" format="pdf" label="General QC Plots" />
+        <!-- Add more output parameters as needed -->
+    </outputs>
+
+    <tests>
+        <!-- Test Case 1: Basic Test -->
+        <test>
+            <param name="adata_file" value="test-data/adata.h5ad" />
+            <param name="sample_field" value="sample_id" />
+            <param name="output_format" value="pdf" />
+            <output name="general_qc_plots" file="expected_output/general_qc_plots.pdf" />
+        </test>
+        <!-- Add more test cases as needed -->
+    </tests>
+    <!-- Help text shown on the Galaxy tool form -->
+
+    <help>
+        <![CDATA[
+        This tool generates quality control metrics for single-cell RNA-seq data using the provided Python script.
+        Input parameters:
+        - AnnData object file: Path to the AnnData object file.
+        - Sample Field: Field in the obs for the sample identifier.
+        - Output Format: Output format of the plots (PDF or PNG).
+        - Plot Size: Size of the plots (optional, provide as "width height").
+        - Mitochondrial Gene Field: Field in the obs for the percentage of mitochondrial genes.
+        - Ribosomal Gene Field: Field in the obs for the percentage of ribosomal genes.
+        - Ribo Field: Field in the var for marking ribosomal genes.
+        - Mito Field: Field in the var for marking mitochondrial genes.
+        - Doublet Score Field: Field in the obs for the doublet score.
+        Output:
+        - General QC Plots: PDF file containing general quality control plots.
+        ]]>
+    </help>
+</tool>
+

From b4b2b8c68b6d111b884ea3df9657da8a183779ef Mon Sep 17 00:00:00 2001
From: ajroura22 <adria.roura1@gmail.com>
Date: Tue, 21 Nov 2023 14:14:12 +0000
Subject: [PATCH 3/8] testing the xml

---
 .../scanpy/scanpy-qc-plots.xml                | 51 ++++++++++++-------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml b/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
index f97c5d4e..9373c695 100644
--- a/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
+++ b/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
@@ -1,30 +1,41 @@
 <?xml version="1.0"?>
-<tool id="scRNAseq_qc_tool" name="scRNAseq Quality Control Tool" version="1.0.0" engine="python" hidden="false">
+<tool id="scRNAseq_qc_tool" name="scRNAseq Quality Control Tool" version="1.0.0" hidden="false">
     <description>Generate quality control metrics for single-cell RNA-seq data.</description>
-    <requirements>
-        <requirement type="package" version="1.1.3">scanpy-scripts</requirement>
-    </requirements>
+    <macros>
+        <import>scanpy_macros2.xml</import>
+      </macros>
+      <expand macro="requirements"/>
     <command detect_errors="exit_code">
         <![CDATA[
         #!/bin/bash
-        python $__tool_directory__/scripts/sc_qc_metrics.py "$adata_file" "$sample_field" \
-            --output_format "$output_format" \
-            --plot_size "$plot_size" \
-            --percent_mito_field "$percent_mito_field" \
-            --percent_ribo_field "$percent_ribo_field" \
-            --ribo_field "$ribo_field" \
-            --mito_field "$mito_field" \
-            --doublet_score_field "$doublet_score_field"
+        python $__tool_directory__/scripts/sc_qc_metrics.py "$adata_file" "$sample_field" 
+            --output_format "$output_format" 
+            --plot_size "$plot_size" 
+            #if $percent_mito_field:
+                --percent_mito_field '$percent_mito_field'
+            #end if 
+            #if $percent_ribo_field:
+                --percent_ribo_field '$percent_ribo_field'
+            #end if 
+            #if $ribo_field:
+                --ribo_field '$ribo_field'
+            #end if 
+            #if $mito_field:
+                --mito_field '$mito_field'
+            #end if 
+            #if $doublet_score_field:
+                --doublet_score_field '$doublet_score_field'
+            #end if 
         ]]>
     </command>
     <inputs>
-        <param type="data" format="txt" name="adata_file" label="AnnData object file" />
+        <param type="data" format="h5ad,h5" name="adata_file" label="AnnData object file" />
         <param type="text" name="sample_field" label="Sample Field" />
         <param type="select" name="output_format" label="Output Format">
             <option value="pdf">PDF</option>
             <option value="png">PNG</option>
         </param>
-        <param type="text" name="plot_size" label="Plot Size (Width Height)" />
+        <param type="text" name="plot_size" label="Plot Size (Width Height)" value="10,10"/>
         <param type="text" name="percent_mito_field" label="Mitochondrial Gene Field" />
         <param type="text" name="percent_ribo_field" label="Ribosomal Gene Field" />
         <param type="text" name="ribo_field" label="Ribo Field" />
@@ -39,10 +50,14 @@
     <tests>
         <!-- Test Case 1: Basic Test -->
         <test>
-            <param name="adata_file" value="test-data/adata.h5ad" />
-            <param name="sample_field" value="sample_id" />
+            <param name="adata_file" value="anndata_ops_raw.h5ad" />
+            <param name="sample_field" value="louvain" />
             <param name="output_format" value="pdf" />
-            <output name="general_qc_plots" file="expected_output/general_qc_plots.pdf" />
+            <output name="general_qc_plots" >
+                <assert_contents>
+                    <has_size value="100000" delta="1000"/>
+                </assert_contents>
+            </output>
         </test>
         <!-- Add more test cases as needed -->
     </tests>
@@ -65,5 +80,7 @@
         - General QC Plots: PDF file containing general quality control plots.
         ]]>
     </help>
+    <expand macro="citations"/>
+
 </tool>
 

From c808a19adfc55837572c046b05e76513d644be88 Mon Sep 17 00:00:00 2001
From: ajroura22 <adria.roura1@gmail.com>
Date: Tue, 21 Nov 2023 14:24:08 +0000
Subject: [PATCH 4/8] improved tests

---
 tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml b/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
index 9373c695..4e769b5a 100644
--- a/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
+++ b/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
@@ -50,7 +50,7 @@
     <tests>
         <!-- Test Case 1: Basic Test -->
         <test>
-            <param name="adata_file" value="anndata_ops_raw.h5ad" />
+            <param name="adata_file" value="anndata_ops_raw.h5" />
             <param name="sample_field" value="louvain" />
             <param name="output_format" value="pdf" />
             <output name="general_qc_plots" >

From 7796e54f886984b431450ce35bda440a79a2713e Mon Sep 17 00:00:00 2001
From: ajroura22 <adria.roura1@gmail.com>
Date: Fri, 10 May 2024 12:37:24 +0000
Subject: [PATCH 5/8] Defining data outputs

---
 .../tertiary-analysis/scanpy/scanpy-qc-plots.xml | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml b/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
index 4e769b5a..a6542c0d 100644
--- a/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
+++ b/tools/tertiary-analysis/scanpy/scanpy-qc-plots.xml
@@ -43,10 +43,22 @@
         <param type="text" name="doublet_score_field" label="Doublet Score Field" />
     </inputs>
     <outputs>
-        <data name="general_qc_plots" format="pdf" label="General QC Plots" />
-        <!-- Add more output parameters as needed -->
+        <data name="general_qc_plots" format="pdf" label="General QC Plots" from_work_dir="general.pdf" />
+        <data name="scatter_umi_vs_genes_detected_colored_by_mito" format="pdf" label="Scatter UMI vs Genes Detected (Colored by Mito)" from_work_dir="figures/scatter_umi_vs_genes_detected_colored_by_mito.pdf" />
+        <data name="scatter_umi_vs_genes_detected" format="pdf" label="Scatter UMI vs Genes Detected" from_work_dir="figures/scatter_umi_vs_genes_detected.pdf" />
+        <data name="doublet_ratio_plot" format="pdf" label="Doublet Ratio Plot" from_work_dir="doublet_ratio.pdf" />
+        <data name="highest_expr_genes" format="pdf" label="Highest Expression Genes Plot" from_work_dir="highest_expr_genes.pdf" />
+        <data name="n_counts_per_cell" format="pdf" label="Counts per Cell Plot" from_work_dir="n_counts_per_cell.pdf" />
+        <data name="n_counts_per_cell_by_sample" format="pdf" label="Counts per Cell by Sample Plot" from_work_dir="n_counts_per_cell_by_sample.pdf" /> <!-- TODO(review): no savefig for this file is visible in sc_qc_metrics.py; confirm -->
+        <data name="n_genes_per_cell" format="pdf" label="Genes per Cell Plot" from_work_dir="n_genes_per_cell.pdf" />
+        <data name="percent_mito_per_cell" format="pdf" label="Percent Mitochondrial per Cell Plot" from_work_dir="percent_mito_per_cell.pdf" />
+        <!-- The script writes one highest_expr_genes_<sample>.pdf per sample into the job working directory -->
+        <collection name="highest_expr_genes_per_sample" type="list" label="Highest expressed genes per sample">
+            <discover_datasets pattern="highest_expr_genes_(?P&lt;designation&gt;.+)\.pdf$" format="pdf" visible="false"/>
+        </collection>
     </outputs>
 
+
     <tests>
         <!-- Test Case 1: Basic Test -->
         <test>

From 873197a4b7a9cf2507c47d1c2dbb2896d5e359aa Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pablo.moreno@astrazeneca.com>
Date: Mon, 14 Oct 2024 13:17:26 +0100
Subject: [PATCH 6/8] Improvements to qc script

---
 .../scanpy/scripts/sc_qc_metrics.py           | 252 ++++++++++++++++--
 1 file changed, 231 insertions(+), 21 deletions(-)

diff --git a/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py b/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py
index 6834b463..ab395c0f 100644
--- a/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py
+++ b/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py
@@ -2,7 +2,8 @@
 
 import matplotlib.pyplot as plt
 import scanpy as sc
-import seaborn as sns
+
+# import seaborn as sns
 
 
 def main():
@@ -12,7 +13,10 @@ def main():
     )
     parser.add_argument("adata_file", type=str, help="Path to AnnData object file")
     parser.add_argument(
-        "sample_field", type=str, help="Field in the obs for the sample identifier"
+        "--sample_field",
+        type=str,
+        default="Sample_ID",
+        help="Field in the obs for the sample identifier"
     )
     parser.add_argument(
         "--output_format",
@@ -28,6 +32,34 @@ def main():
         metavar=("width", "height"),
         help="Size of the plots",
     )
+    # add an argument for general plot title font size
+    parser.add_argument(
+        "--title_font_size",
+        type=int,
+        default=12,
+        help="General plot title font size",
+    )
+    # add an argument for general plot label font size
+    parser.add_argument(
+        "--label_font_size",
+        type=int,
+        default=8,
+        help="General plot label font size",
+    )
+    # add an argument for general plot legend font size
+    parser.add_argument(
+        "--legend_font_size",
+        type=int,
+        default=10,
+        help="General plot legend font size",
+    )
+    # add an argument for the gene symbols field
+    parser.add_argument(
+        "--gene_symbols_field",
+        type=str,
+        default="gene_symbols",
+        help="Field in the var for the gene symbols",
+    )
     parser.add_argument(
         "--percent_mito_field",
         type=str,
@@ -58,6 +90,13 @@ def main():
         default="doublet_score",
         help="Field in the obs for the doublet score",
     )
+    # add an argument for an embedding to plot the cells
+    parser.add_argument(
+        "--embedding",
+        type=str,
+        default=None,
+        help="Embedding to plot the cells",
+    )
     args = parser.parse_args()
 
     # Load AnnData object
@@ -67,8 +106,14 @@ def main():
     if args.plot_size:
         sc.settings.figsize = tuple(args.plot_size)
 
-    # Set output format
-    sc.settings.set_figure_params(format=args.output_format)
+    # set scanpy general plot font size and output format
+    sc.settings.set_figure_params(scanpy=True,
+                                  fontsize=args.label_font_size,
+                                  format=args.output_format)
+    # disable FutureWarning
+    import warnings
+
+    warnings.simplefilter(action="ignore", category=FutureWarning)
 
     run_quality_control = False
     if "n_genes_by_counts" not in adata.obs.columns:
@@ -77,13 +122,30 @@ def main():
         run_quality_control = True
 
     qc_vars = []
+    fields = [
+        "n_genes_by_counts",
+        "total_counts",
+        args.percent_mito_field,
+        args.percent_ribo_field
+    ]
     # calculate mitochondrial genes if not provided
     if args.percent_mito_field not in adata.obs.columns:
         qc_vars.append(args.mito_field)
     # calculate ribo metrics if not provided
+    if args.ribo_field not in adata.var.columns:
+        # create a new column with the name args.ribo_field where genes that
+        # have in the gene symbols field the pattern ^RP[SL] are
+        # marked as true
+        print(f"Creating {args.ribo_field} column")
+        adata.var[args.ribo_field] = adata.var[args.gene_symbols_field].str.contains(
+                "^RP[SL]"
+            )
+        print(f"Number of ribosomal genes: {adata.var[args.ribo_field].sum()}")
     if args.percent_ribo_field not in adata.obs.columns:
         qc_vars.append(args.ribo_field)
 
+    print(f"Calculating QC metrics for {len(qc_vars)} variables")
+
     if len(qc_vars) > 0 or run_quality_control:
         sc.pp.calculate_qc_metrics(
             adata,
@@ -95,16 +157,54 @@ def main():
         adata.obs["n_genes"] = adata.obs["n_genes_by_counts"]
         adata.var["n_counts"] = adata.var["total_counts"]
         adata.var["n_cells"] = adata.var["n_cells_by_counts"]
+
+    # Define thresholds
+    high_umi_threshold = adata.obs['n_counts'].quantile(0.95)  # Top 5% most UMI counts
+    low_umi_threshold = adata.obs['n_counts'].quantile(0.05)   # Bottom 5% least UMI counts
+    high_mito_threshold = adata.obs[args.percent_mito_field].quantile(0.90) # Top 10% pct mitochondrial genes
+
+    from sklearn.linear_model import LinearRegression
+    from sklearn.preprocessing import PolynomialFeatures
+
+    # Polynomial regression to account for curvature in the n_counts vs. n_genes relationship
+    poly = PolynomialFeatures(degree=2)
+    X_poly = poly.fit_transform(adata.obs[['n_counts']])
+    model = LinearRegression()
+    model.fit(X_poly, adata.obs['n_genes'])
+    predicted_counts = model.predict(X_poly)
+
+    # Calculate residuals
+    residuals = adata.obs['n_genes'] - predicted_counts
+    outlier_threshold = residuals.abs().quantile(0.95)  # Top 5% residuals as outliers
+
+    # Initialize diagnosis column
+    adata.obs['auto_diagnosis'] = 'Healthy'
+
+    # Identify outliers
+    outliers = residuals.abs() > outlier_threshold
+    adata.obs.loc[outliers, 'auto_diagnosis'] = 'Outlier'
+
+
+    # Identify stressed/dying/apoptotic cells
+    stressed_cells = (adata.obs['n_counts'] > high_umi_threshold) & (adata.obs[args.percent_mito_field] > high_mito_threshold)
+    adata.obs.loc[stressed_cells, 'auto_diagnosis'] = 'Stressed/Dying/Apoptotic'
+
+    # Identify poor-quality cells
+    poor_quality_cells = (adata.obs['n_counts'] < low_umi_threshold) & (adata.obs[args.percent_mito_field] > high_mito_threshold)
+    adata.obs.loc[poor_quality_cells, 'auto_diagnosis'] = 'Poor-Quality'
+
+    # Print diagnosis summary
+    print(adata.obs['auto_diagnosis'].value_counts())
+    # make a barplot of the auto_diagnosis, omitting the healthy cells from the plot
+    # but writing the number of healthy cells in the title. Plot per sample
+    healthy_cells = adata.obs['auto_diagnosis'] == 'Healthy'
+    healthy_count = healthy_cells.sum()
+
     # General quality for whole dataset
     plt.figure()
     ax = sc.pl.violin(
         adata,
-        [
-            "n_genes_by_counts",
-            "total_counts",
-            args.percent_mito_field,
-            args.percent_ribo_field,
-        ],
+        fields,
         jitter=False,
         multi_panel=True,
         show=False,
@@ -115,11 +215,18 @@ def main():
 
     # Generate quality control plots
     generate_violin_plots(
-        adata, args.sample_field, args.percent_mito_field, format=args.output_format
+        adata, args.sample_field, args.percent_mito_field,
+        args.percent_ribo_field, format=args.output_format
+    )
+    generate_scatter_plot(
+        adata,
+        args.sample_field,
+        percent_mito_field=args.percent_mito_field,
     )
     generate_scatter_plot(
         adata,
         args.sample_field,
+        y='log1p_n_genes_by_counts',
         percent_mito_field=args.percent_mito_field,
     )
     if args.doublet_score_field in adata.obs.columns:
@@ -130,8 +237,24 @@ def main():
             format=args.output_format,
         )
     else:
-        print("Doublet score field provided not in adata.obs.columns, skipping plot.")
+        print(
+            "Doublet score field provided not in adata.obs.columns, " + "skipping plot."
+        )
     generate_complexity_plot(adata, args.sample_field, format=args.output_format)
+
+    if args.embedding:
+        generate_embedding_plot(
+            adata,
+            fields=fields + [args.sample_field, 'auto_diagnosis'],
+            embedding=args.embedding,
+            format=args.output_format,
+        )
+
+    generate_barplot(adata[~healthy_cells],
+                     groups_field=args.sample_field,
+                     props_field='auto_diagnosis',
+                     figure_path='diagnosis_barplot.pdf',
+                     topic_for_title=f"(Total Healthy/Unhealthy cells: {healthy_count}/{adata.n_obs - healthy_count})")
     # generate_scatter_by_sample(
     #     adata,
     #     sample_field=args.sample_field,
@@ -140,10 +263,69 @@ def main():
     # )
 
 
+def generate_barplot(
+    adata, groups_field, props_field, figure_path=None, topic_for_title=None
+):
+    """
+    Generate a proportional bar plot from an AnnData object.
+
+    Parameters:
+    adata (AnnData): The input AnnData object containing the data to plot.
+    groups_field (str): The column in adata.obs to group the data by.
+    props_field (str): The column in adata.obs to plot as proportions.
+    figure_path (str, optional): The path to save the generated figure. If not provided, the figure is not saved.
+    topic_for_title (str, optional): The topic to be used in the figure title, goes after {props_field} proportion of {topic_for_title} per {groups_field}.
+
+    Returns:
+    matplotlib.figure.Figure: The generated bar plot.
+    """
+    props_plot_data = adata.obs[[groups_field, props_field]]
+    # props_plot_data[groups_field] = props_plot_data[groups_field].cat.reorder_categories(['control', '2 days', '7 days', '10 days', '14 days'])
+    # make a 100% stacked bar plot of props_plot_data, plotting phase counts grouped by cell_line_persister
+
+    grouped = props_plot_data.groupby([groups_field, props_field]).size().unstack()
+    # proportions = grouped.div(grouped.sum(axis=1), axis=0)
+    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
+    plt.gca().set_prop_cycle(color=colors[2:5])
+    grouped.plot(kind="bar", stacked=False, figsize=(8, 6))
+    if topic_for_title is not None:
+        plt.title(f"{props_field} cells {topic_for_title}\nper {groups_field}")
+    else:
+        plt.title(f"{props_field} cells\nper {groups_field}")
+    plt.xlabel(groups_field)
+    # plt.xticks(rotation=45)
+    plt.ylabel("Number of cells")
+
+    # save plot to PDF file
+    if figure_path is not None:
+        plt.savefig(figure_path, bbox_inches="tight")
+    return plt.figure()
+
+
+def generate_embedding_plot(
+            adata,
+            fields,
+            embedding,
+            format="pdf"
+        ):
+    # Embedding plot
+    plt.figure()
+    sc.pl.embedding(
+        adata,
+        basis=embedding,
+        color=fields,
+        show=False,
+        ncols=1
+    )
+    plt.savefig(f"embedding_plots.{format}", bbox_inches="tight")
+    plt.close()
+
+
 def generate_violin_plots(
     adata,
     sample_field,
     percent_mito_field="percent_mito",
+    percent_ribo_field="percent_ribo",
     format="pdf",
     gene_symbols_field="gene_symbols",
 ):
@@ -171,7 +353,7 @@ def generate_violin_plots(
         groupby=sample_field,
         ax=ax,
         title="Number of Genes per Cell (Separated by Sample)",
-        show=False
+        show=False,
         # show=True,
         # save="_n_genes_per_cell",
     )
@@ -187,20 +369,35 @@ def generate_violin_plots(
         percent_mito_field,
         groupby=sample_field,
         ax=ax,
-        title="Percentage of Mitochondrial Genes per Cell (Separated by Sample)",
+        title="Percentage of Mitochondrial " + "Genes per Cell (Separated by Sample)",
         show=False,
     )
     ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
     plt.savefig(f"percent_mito_per_cell.{format}", bbox_inches="tight")
     plt.close()
 
+    # Percentage of ribosomal genes per cell
+    plt.figure()
+    ax = plt.gca()
+    sc.pl.violin(
+        adata,
+        percent_ribo_field,
+        groupby=sample_field,
+        ax=ax,
+        title="Percentage of Ribosomal " + "Genes per Cell (Separated by Sample)",
+        show=False,
+    )
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
+    plt.savefig(f"percent_ribo_per_cell.{format}", bbox_inches="tight")
+    plt.close()
+
     # highest expressed genes per cell
     plt.figure()
     ax = sc.pl.highest_expr_genes(
         adata, n_top=30, gene_symbols=gene_symbols_field, show=False
     )
     # set title of ax
-    ax.set_title(f"Highest expressed genes per cell (Separated by Sample)")
+    ax.set_title("Highest expressed genes per cell\nby Sample")
     plt.savefig(f"highest_expr_genes.{format}", bbox_inches="tight")
     plt.close()
 
@@ -223,6 +420,7 @@ def generate_violin_plots(
 def generate_scatter_plot(
     adata,
     sample_field,
+    y="n_genes",
     percent_mito_field="percent_mito",
 ):
     # Scatter plot of UMIs vs genes detected
@@ -230,10 +428,10 @@ def generate_scatter_plot(
     sc.pl.scatter(
         adata,
         x="n_counts",
-        y="n_genes",
+        y=y,
         color=sample_field,
-        title="UMIs vs Genes Detected (Separated by Sample)",
-        save="_umi_vs_genes_detected",
+        title="UMIs vs Genes Detected (by Sample)",
+        save=f"_umi_vs_{y}_detected",
         show=False,
     )
     plt.close()
@@ -243,14 +441,26 @@ def generate_scatter_plot(
     sc.pl.scatter(
         adata,
         x="n_counts",
-        y="n_genes",
+        y=y,
         color=percent_mito_field,
-        title="UMIs vs Genes Detected (Colored by Mitochondrial Gene Ratio)",
-        save="_umi_vs_genes_detected_colored_by_mito",
+        title="UMIs vs Genes Detected (by Mitochondrial Gene Ratio)",
+        save=f"_umi_vs_{y}_detected_colored_by_mito",
         show=False,
     )
     plt.close()
 
+    plt.figure()
+    sc.pl.scatter(
+        adata,
+        x='n_counts',
+        y=y,
+        color='auto_diagnosis',
+        title="UMIs vs Genes Detected (by Mitochondrial Gene Ratio)",
+        save=f"_umi_vs_{y}_detected_colored_by_auto_diagnosis",
+        show=False
+        )
+    plt.close()
+
 
 def generate_scatter_by_sample(
     adata, sample_field, percent_mito_field="percent_mito", format="pdf"

From abddadd05699d10d06d0f6fa2df559ff8102e4dd Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pablo.moreno@astrazeneca.com>
Date: Mon, 14 Oct 2024 13:51:42 +0100
Subject: [PATCH 7/8] Formatting improvements

---
 .../scanpy/scripts/sc_qc_metrics.py           | 56 +++++++++----------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py b/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py
index ab395c0f..75571069 100644
--- a/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py
+++ b/tools/tertiary-analysis/scanpy/scripts/sc_qc_metrics.py
@@ -138,8 +138,8 @@ def main():
         # marked as true
         print(f"Creating {args.ribo_field} column")
         adata.var[args.ribo_field] = adata.var[args.gene_symbols_field].str.contains(
-                "^RP[SL]"
-            )
+            "^RP[SL]"
+        )
         print(f"Number of ribosomal genes: {adata.var[args.ribo_field].sum()}")
     if args.percent_ribo_field not in adata.obs.columns:
         qc_vars.append(args.ribo_field)
@@ -159,14 +159,18 @@ def main():
         adata.var["n_cells"] = adata.var["n_cells_by_counts"]
 
     # Define thresholds
-    high_umi_threshold = adata.obs['n_counts'].quantile(0.95)  # Top 5% most UMI counts
-    low_umi_threshold = adata.obs['n_counts'].quantile(0.05)   # Bottom 5% least UMI counts
-    high_mito_threshold = adata.obs[args.percent_mito_field].quantile(0.90) # Top 10% pct mitochondrial genes
+    # Top 5% most UMI counts
+    high_umi_threshold = adata.obs['n_counts'].quantile(0.95)
+    # Bottom 5% least UMI counts
+    low_umi_threshold = adata.obs['n_counts'].quantile(0.05)
+    # Top 10% pct mitochondrial genes
+    high_mito_threshold = adata.obs[args.percent_mito_field].quantile(0.90)
 
     from sklearn.linear_model import LinearRegression
     from sklearn.preprocessing import PolynomialFeatures
 
-    # Polynomial regression to account for curvature in the n_counts vs. n_genes relationship
+    # Polynomial regression to account for curvature
+    # in the n_counts vs. n_genes relationship
     poly = PolynomialFeatures(degree=2)
     X_poly = poly.fit_transform(adata.obs[['n_counts']])
     model = LinearRegression()
@@ -184,13 +188,14 @@ def main():
     outliers = residuals.abs() > outlier_threshold
     adata.obs.loc[outliers, 'auto_diagnosis'] = 'Outlier'
 
-
     # Identify stressed/dying/apoptotic cells
-    stressed_cells = (adata.obs['n_counts'] > high_umi_threshold) & (adata.obs[args.percent_mito_field] > high_mito_threshold)
+    stressed_cells = (adata.obs['n_counts'] > high_umi_threshold) & \
+        (adata.obs[args.percent_mito_field] > high_mito_threshold)
     adata.obs.loc[stressed_cells, 'auto_diagnosis'] = 'Stressed/Dying/Apoptotic'
 
     # Identify poor-quality cells
-    poor_quality_cells = (adata.obs['n_counts'] < low_umi_threshold) & (adata.obs[args.percent_mito_field] > high_mito_threshold)
+    poor_quality_cells = (adata.obs['n_counts'] < low_umi_threshold) & \
+        (adata.obs[args.percent_mito_field] > high_mito_threshold)
     adata.obs.loc[poor_quality_cells, 'auto_diagnosis'] = 'Poor-Quality'
 
     # Print diagnosis summary
@@ -209,7 +214,7 @@ def main():
         multi_panel=True,
         show=False,
     )
-    # ax.set_title("General QC")
+    plt.suptitle("General QC", y=1.02)
     plt.savefig(f"general.{args.output_format}", bbox_inches="tight")
     plt.close()
 
@@ -254,13 +259,8 @@ def main():
                      groups_field=args.sample_field,
                      props_field='auto_diagnosis',
                      figure_path='diagnosis_barplot.pdf',
-                     topic_for_title=f"(Total Healthy/Unhealthy cells: {healthy_count}/{adata.n_obs - healthy_count})")
-    # generate_scatter_by_sample(
-    #     adata,
-    #     sample_field=args.sample_field,
-    #     format=args.output_format,
-    #     percent_mito_field=args.percent_mito_field,
-    # )
+                     topic_for_title="(Total Healthy/Unhealthy cells: "
+                     f"{healthy_count}/{adata.n_obs - healthy_count})")
 
 
 def generate_barplot(
@@ -273,15 +273,17 @@ def generate_barplot(
     adata (AnnData): The input AnnData object containing the data to plot.
     groups_field (str): The column in adata.obs to group the data by.
     props_field (str): The column in adata.obs to plot as proportions.
-    figure_path (str, optional): The path to save the generated figure. If not provided, the figure is not saved.
-    topic_for_title (str, optional): The topic to be used in the figure title, goes after {props_field} proportion of {topic_for_title} per {groups_field}.
+    figure_path (str, optional): The path to save the generated figure.
+        If not provided, the figure is not saved.
+    topic_for_title (str, optional): Extra text for the figure title,
+        rendered as "{props_field} cells {topic_for_title}" on one line
+        and "per {groups_field}" on the next.
 
     Returns:
     matplotlib.figure.Figure: The generated bar plot.
     """
     props_plot_data = adata.obs[[groups_field, props_field]]
-    # props_plot_data[groups_field] = props_plot_data[groups_field].cat.reorder_categories(['control', '2 days', '7 days', '10 days', '14 days'])
-    # make a 100% stacked bar plot of props_plot_data, plotting phase counts grouped by cell_line_persister
+    # count cells per (group, category) pair for a grouped bar plot
 
     grouped = props_plot_data.groupby([groups_field, props_field]).size().unstack()
     # proportions = grouped.div(grouped.sum(axis=1), axis=0)
@@ -303,11 +305,10 @@ def generate_barplot(
 
 
 def generate_embedding_plot(
-            adata,
-            fields,
-            embedding,
-            format="pdf"
-        ):
+        adata,
+        fields,
+        embedding,
+        format="pdf"):
     # Embedding plot
     plt.figure()
     sc.pl.embedding(
@@ -457,8 +458,7 @@ def generate_scatter_plot(
         color='auto_diagnosis',
-        title="UMIs vs Genes Detected (by Mitochondrial Gene Ratio)",
+        title="UMIs vs Genes Detected (by Auto Diagnosis)",
         save=f"_umi_vs_{y}_detected_colored_by_auto_diagnosis",
-        show=False
-        )
+        show=False)
     plt.close()
 
 

From 03f9197008c5b407a8ebcee29b05d94c91a632f9 Mon Sep 17 00:00:00 2001
From: Pablo Moreno <pablo.moreno@astrazeneca.com>
Date: Fri, 1 Nov 2024 21:16:59 +0000
Subject: [PATCH 8/8] Add categories to shed.yml

---
 tools/tertiary-analysis/scanpy/.shed.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/tertiary-analysis/scanpy/.shed.yml b/tools/tertiary-analysis/scanpy/.shed.yml
index a46e310d..5a45e5ef 100644
--- a/tools/tertiary-analysis/scanpy/.shed.yml
+++ b/tools/tertiary-analysis/scanpy/.shed.yml
@@ -13,6 +13,8 @@ categories:
 - Transcriptomics
 - Sequence Analysis
 - RNA
+- Single Cell
+- Spatial Omics
 auto_tool_repositories:
     name_template: "{{ tool_id }}"
     description_template: "Wrapper for the scanpy-scripts tool suite: {{ tool_name }}"