Skip to content

Commit

Permalink
Decoupler pseudobulk: filter genes per min number cells per contrasts (
Browse files Browse the repository at this point in the history
…#331)

* Intermediate more complex version

* Passing all doctests on a decoupler+scanpy env

* After additional tests with real data

* Passing tests locally

* Bump build and updates doc
  • Loading branch information
pcm32 authored Oct 2, 2024
1 parent 6f7bc53 commit dea8a06
Show file tree
Hide file tree
Showing 3 changed files with 238 additions and 10 deletions.
156 changes: 149 additions & 7 deletions tools/tertiary-analysis/decoupler/decoupler_pseudobulk.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def main(args):
for group in args.adata_obs_fields_to_merge.split(":"):
fields = group.split(",")
check_fields(fields, adata)
adata = merge_adata_obs_fields(fields, adata)
merge_adata_obs_fields(fields, adata)

check_fields([args.groupby, args.sample_key], adata)

Expand All @@ -322,10 +322,10 @@ def main(args):

print("Created pseudo-bulk AnnData, checking if fields still make sense.")
print(
"If this fails this check, it might mean that you asked for factors "
+ "that are not compatible with you sample identifiers (ie. asked for "
+ "phase in the factors, but each sample contains more than one phase, "
+ "try joining fields)"
"If this fails this check, it might mean that you asked for factors \
that are not compatible with you sample identifiers (ie. asked for \
phase in the factors, but each sample contains more than one phase,\
try joining fields)."
)
if factor_fields:
check_fields(
Expand Down Expand Up @@ -374,6 +374,21 @@ def main(args):
min_counts_per_sample_marking=args.min_counts_per_sample_marking,
)

# if contrasts file is provided, produce a file with genes that should be
# filtered for each contrasts
if args.contrasts_file:
contrast_genes_df = identify_genes_to_filter_per_contrast(
contrast_file=args.contrasts_file,
min_perc_cells_expression=args.min_gene_exp_perc_per_cell,
adata=adata,
obs_field=args.groupby
)
contrast_genes_df.to_csv(
f"{args.save_path}/genes_to_filter_by_contrast.tsv",
sep="\t",
index=False,
)


def merge_adata_obs_fields(obs_fields_to_merge, adata):
"""
Expand All @@ -394,7 +409,7 @@ def merge_adata_obs_fields(obs_fields_to_merge, adata):
docstring tests:
>>> import scanpy as sc
>>> ad = sc.datasets.pbmc68k_reduced()
>>> ad = merge_adata_obs_fields(["bulk_labels","louvain"], ad)
>>> merge_adata_obs_fields(["bulk_labels","louvain"], ad)
>>> ad.obs.columns
Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score',
'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'],
Expand All @@ -412,7 +427,115 @@ def merge_adata_obs_fields(obs_fields_to_merge, adata):
adata.obs[field_name] = (
adata.obs[field_name] + "_" + adata.obs[field].astype(str)
)
return adata


def identify_genes_to_filter_per_contrast(
contrast_file, min_perc_cells_expression, adata, obs_field
):
"""
Identify genes to filter per contrast based on expression percentage.
We need those genes to be under the threshold for all conditions
in a contrast to be identified for further filtering. If
one condition has the gene expressed above the threshold, the gene
becomes of interest (it can be highly up or down regulated).
Parameters
----------
contrast_file : str
Path to the contrasts file.
min_perc_cells_expression : float
Minimum percentage of cells that should express a gene.
adata: adata
Original AnnData file
obs_field: str
Field in the AnnData observations where the contrasts are defined.
Returns
-------
None
Examples
--------
>>> import anndata
>>> import pandas as pd
>>> import numpy as np
>>> import os
>>> from io import StringIO
>>> contrast_file = StringIO(f"contrast{os.linesep}condition1-\
condition2{os.linesep}")
>>> min_perc_cells_expression = 30.0
>>> data = {
... 'obs': pd.DataFrame({'condition': ['condition1', 'condition1',
... 'condition2', 'condition2']}),
... 'X': np.array([[1, 0, 0, 0, 0], [0, 0, 2, 2, 0],
... [0, 0, 1, 1, 0], [0, 0, 0, 2, 0]]),
... }
>>> adata = anndata.AnnData(X=data['X'], obs=data['obs'])
>>> df = identify_genes_to_filter_per_contrast(
... contrast_file, min_perc_cells_expression, adata, 'condition'
... ) # doctest:+ELLIPSIS
Identifying genes to filter using ...
>>> df.head() # doctest:+ELLIPSIS
contrast gene
0 condition1-condition2 ...
1 condition1-condition2 ...
"""
import re

# Implement the logic to identify genes to filter per contrast
# This is a placeholder implementation
print(
f"Identifying genes to filter using {contrast_file} "
f"with min expression {min_perc_cells_expression}%"
)
sides_regex = re.compile(r"[\+\-\*\/\(\)\^]+")

contrasts = pd.read_csv(contrast_file, sep="\t")
# Iterate over each line in the contrast file
genes_filter_for_contrast = dict()
for contrast in contrasts.iloc[:, 0]:
conditions = set(sides_regex.split(contrast))
# we want to find the genes that are below the threshold
# of % of cells expressed for ALL the conditions in the
# contrast. It is enough for one of the conditions
# of the contrast to have the genes expressed above
# the threshold of % of cells to be of interest.
for condition in conditions:
# remove any starting or trailing whitespaces from condition
condition = condition.strip()
# check the percentage of cells that express each gene
# Filter the AnnData object based on the obs_field value
adata_filtered = adata[adata.obs[obs_field] == condition]
# Calculate the percentage of cells expressing each gene
gene_expression = (adata_filtered.X > 0).mean(axis=0) * 100
genes_to_filter = set(adata_filtered.var[
gene_expression.transpose() < min_perc_cells_expression
].index.tolist())
# Update the genes_filter_for_contrast dictionary
if contrast in genes_filter_for_contrast.keys():
genes_filter_for_contrast[contrast].intersection_update(
genes_to_filter
)
else:
genes_filter_for_contrast[contrast] = genes_to_filter

# write the genes_filter_for_contrast to pandas dataframe of two columns:
# contrast and gene

# Initialize an empty list to store the expanded pairs
expanded_pairs = []

# Iterate over the dictionary
for contrast, genes in genes_filter_for_contrast.items():
for gene in genes:
expanded_pairs.append((contrast, gene))

# Create the DataFrame
contrast_genes_df = pd.DataFrame(
expanded_pairs, columns=["contrast", "gene"]
)

return contrast_genes_df


if __name__ == "__main__":
Expand Down Expand Up @@ -474,6 +597,25 @@ def merge_adata_obs_fields(obs_fields_to_merge, adata):
default=10,
help="Minimum number of cells for pseudobulk analysis",
)
# add argument for min percentage of cells that should express a gene
parser.add_argument(
"--min_gene_exp_perc_per_cell",
type=float,
default=50,
help="If all the conditions of one side of a contrast express a \
gene in less than this percentage of cells, then the genes \
will be added to a list of genes to ignore for that contrast.\
Requires the contrast file to be provided.",
)
parser.add_argument(
"--contrasts_file",
type=str,
required=False,
help="Contrasts file, a one column tsv with a header, each line \
represents a contrast as a combination of conditions at each \
side of a substraction.",
)

parser.add_argument(
"--save_path", type=str, help="Path to save the plot (optional)"
)
Expand Down
90 changes: 87 additions & 3 deletions tools/tertiary-analysis/decoupler/decoupler_pseudobulk.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy4" profile="20.05">
<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy5" profile="20.05">
<description>aggregates single cell RNA-seq data for running bulk RNA-seq methods</description>
<requirements>
<requirement type="package" version="1.4.0">decoupler</requirement>
Expand Down Expand Up @@ -43,6 +43,10 @@ python '$__tool_directory__/decoupler_pseudobulk.py' $input_file
#if $factor_fields:
--factor_fields '$factor_fields'
#end if
#if $filter_per_contrast.filter == 'yes':
--contrasts_file '$filter_per_contrast.contrasts_file'
--min_gene_exp_perc_per_cell '$filter_per_contrast.min_cells_perc_per_contrast_cond'
#end if
--deseq2_output_path deseq_output_dir
--plot_samples_figsize $plot_samples_figsize
--plot_filtering_figsize $plot_filtering_figsize
Expand All @@ -53,6 +57,18 @@ python '$__tool_directory__/decoupler_pseudobulk.py' $input_file
</environment_variables>
<inputs>
<param type="data" name="input_file" format="data" label="Input AnnData file"/>
<conditional name="filter_per_contrast">
<param name="filter" type="select" label="Produce a list of genes to filter out per contrast?" help="TODO">
<option value="yes">Yes</option>
<option value="no" selected="true">No</option>
</param>
<when value="yes">
<param type="data" name="contrasts_file" format="txt,tabular" label="Contrasts file" help="A file with header and arithmetic operations between existing values in the Groupby column in the AnnData file."/>
<param type="float" name="min_cells_perc_per_contrast_cond" value="20" label="Min. percentage of cells that need to be expressing a gene in any of the conditions of a contrast" help="Genes whose expression across all conditions of a contrast are below this threshold are tagged for removal from the contrast on a separate file"/>
</when>
<when value="no">
</when>
</conditional>
<param type="text" name="adata_obs_fields_to_merge" label="Obs Fields to Merge" optional="true" help="Fields in adata.obs to merge, comma separated (optional). They will be available as field1_field2_field3 in the AnnData Obs dataframe. You can have multiple groups to merge, separated by colon (:)."/>
<param type="text" name="groupby" label="Groupby column" help="The column in adata.obs that defines the groups. Merged columns in the above field are available here."/>
<param type="text" name="sample_key" label="Sample Key column" help="The column in adata.obs that defines the samples. Merged columns in the above field are available here."/>
Expand Down Expand Up @@ -90,6 +106,9 @@ python '$__tool_directory__/decoupler_pseudobulk.py' $input_file
<data name="genes_ignore_per_contrast_field" format="tabular" label="{tool.name} on ${on_string}: Genes to ignore by contrast field" from_work_dir="deseq_output_dir/genes_to_ignore_per_contrast_field.tsv">
<filter>factor_fields</filter>
</data>
<data name="genes_ignore_per_contrast" format="tabular" label="{tool.name} on ${on_string}: Genes to ignore by contrast" from_work_dir="plots_output_dir/genes_to_filter_by_contrast.tsv">
<filter>filter_per_contrast['filter'] == 'yes'</filter>
</data>
</outputs>
<tests>
<test expect_num_outputs="7">
Expand Down Expand Up @@ -147,12 +166,77 @@ python '$__tool_directory__/decoupler_pseudobulk.py' $input_file
</assert_contents>
</output>
</test>
<test expect_num_outputs="8">
<param name="input_file" value="mito_counted_anndata.h5ad"/>
<param name="filter" value="yes"/>
<param name="contrasts_file" value="test_contrasts.txt" ftype="txt"/>
<param name="min_cells_perc_per_contrast_cond" value="25"/>
<param name="adata_obs_fields_to_merge" value="batch,sex:batch,genotype"/>
<param name="groupby" value="batch_sex"/>
<param name="sample_key" value="genotype"/>
<param name="factor_fields" value="genotype,batch_sex"/>
<param name="mode" value="sum"/>
<param name="min_cells" value="10"/>
<param name="produce_plots" value="true"/>
<param name="produce_anndata" value="true"/>
<param name="min_counts" value="10"/>
<param name="min_counts_per_sample" value="50"/>
<param name="min_total_counts" value="1000"/>
<param name="filter_expr" value="true"/>
<param name="plot_samples_figsize" value="10 10"/>
<param name="plot_filtering_figsize" value="10 10"/>
<output name="pbulk_anndata" ftype="h5ad">
<assert_contents>
<has_h5_keys keys="obs/psbulk_n_cells"/>
</assert_contents>
</output>
<output name="count_matrix" ftype="tabular">
<assert_contents>
<has_n_lines n="3620"/>
<has_n_columns n="8"/>
</assert_contents>
</output>
<output name="samples_metadata" ftype="tabular">
<assert_contents>
<has_n_lines n="8"/>
<has_n_columns n="3"/>
</assert_contents>
</output>
<output name="genes_metadata" ftype="tabular">
<assert_contents>
<has_n_lines n="3620"/>
<has_n_columns n="13"/>
</assert_contents>
</output>
<output name="plot_output" ftype="png">
<assert_contents>
<has_size value="31853" delta="3000"/>
</assert_contents>
</output>
<output name="genes_ignore_per_contrast_field" ftype="tabular">
<assert_contents>
<has_n_lines n="5"/>
</assert_contents>
</output>
<output name="filter_by_expr_plot" ftype="png">
<assert_contents>
<has_size value="21656" delta="2000"/>
</assert_contents>
</output>
<output name="genes_ignore_per_contrast" ftype="tabular">
<assert_contents>
<has_n_lines n="35478"/>
</assert_contents>
</output>
</test>
</tests>
<help>
<![CDATA[
This tool performs pseudobulk analysis and filtering using Decoupler-py. Provide the input AnnData file and specify the necessary parameters.
This tool generates a count matrix for pseudo-bulk analysis (to be done with a separate tool like EdgeR or DESeq2) and filtering using Decoupler-py. Provide the input AnnData file and specify the necessary parameters.
- Input AnnData file: The input AnnData file to be processed.
- Contrasts file: optional file with a header and a single column, with arithmetic operations (contrasts definitions) as expected by EdgeR or DESeq2 based on the existing groups in the AnnData. This is optional and only required if you want to get a list of genes to filter out later on based on the percetage of cells that express those genes per contrast's conditions.
- Min % expression per contrast (in at least one condition of the contrast): Percentage of cells (within at least one condition of a contrast) that need to express a gene for that genes not to be marked for later filtering. This requires the contrast file to be provided.
- Obs Fields to Merge: Fields in adata.obs to merge, comma separated (optional).
- Groupby column: The column in adata.obs that defines the groups.
- Sample Key column: The column in adata.obs that defines the samples.
Expand All @@ -167,7 +251,7 @@ python '$__tool_directory__/decoupler_pseudobulk.py' $input_file
- Plot Samples Figsize: Size of the samples plot as a tuple (two arguments).
- Plot Filtering Figsize: Size of the filtering plot as a tuple (two arguments).
The tool will output the filtered AnnData, count matrix, samples metadata, genes metadata (in DESeq2 format), and the pseudobulk plot and filter by expression plot (if enabled).
The tool will output the filtered AnnData, count matrix, samples metadata, genes metadata (in DESeq2 format), and the pseudobulk plot and filter by expression plot (if enabled). Files for filtering genes later on are also generated (to ignore after the DE model).
You can obtain more information about Decoupler pseudobulk at the developers documentation `here <https://decoupler-py.readthedocs.io/en/latest/notebooks/pseudobulk.html>`_ .
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Contrasts
N705_male+N707_male-N703_female

0 comments on commit dea8a06

Please sign in to comment.