Feature Requeset: Add broadpeak support for single-cell cut&tag data? #365

yxwucq · 2024-11-23T07:30:09Z

Hi Dr. Zhang
Thank you great contribution to scalable scATAC analysis tools. Now snap.tl.macs3 only can be used to call narrowPeak, but other omics like many histone modifications are boradPeak. Can you add broadpeak support in snap.tl.macs3? I've tried directly modifing options:

def macs3(
...
options.broad = broad
options.log_broadcutoff = log(broadcutoff, 10) * -1

But there are still conficts in exporting peak to polars.dataframe. For example, in function find_reproducible_peaks of lib.rs, it tries to get attribute of peaks in PeakIO object, but peaks attribute cannot be acessed in BroadPeakIO

    let peaks = get_peaks(peaks)?
        .into_iter()
        .filter(|x| !black.is_overlapped(x))
        .collect::<Vec<_>>();

AttributeError: type object 'MACS3.IO.PeakIO.BroadPeakIO' has no attribute 'peaks'

The text was updated successfully, but these errors were encountered:

yxwucq · 2024-11-23T08:31:14Z

As temporary alternatives, I wrote this function that uses excutable macs3 to call broadpeaks and add peak files back to adata.uns['macs3_broad']

import os
import subprocess
import tempfile
from pathlib import Path
import polars as pl
from typing import Optional, Union, List, Set, Dict
from snapatac2 import AnnData, AnnDataSet
import snapatac2 as snap

def macs3_call_broadpeaks(
    adata: Union[AnnData, AnnDataSet],
    *,
    groupby: Optional[Union[str, List[str]]] = None,
    qvalue: float = 0.05,
    broadcutoff: Optional[float] = 0.1,
    replicate: Optional[Union[str, List[str]]] = None,
    replicate_qvalue: Optional[float] = None,
    max_frag_size: Optional[int] = None,
    selections: Optional[Set[str]] = None,
    nolambda: bool = False,
    shift: int = -100,
    extsize: int = 200,
    min_len: Optional[int] = None,
    blacklist: Optional[Path] = None,
    key_added: str = 'macs3_broad',
    tempdir: Optional[Path] = None,
    inplace: bool = True,
    n_jobs: int = 8,
) -> Optional[Dict[str, 'pl.DataFrame']]:
    """
    Call broad peaks using MACS3.
    
    Parameters
    ----------
    adata : AnnData | AnnDataSet
        The annotated data matrix
    groupby : str | list[str] | None
        The column name(s) in adata.obs for grouping cells
    qvalue : float
        Q-value cutoff for peak detection
    replicate : str | list[str] | None
        Column name(s) in adata.obs for replicates
    replicate_qvalue : float | None
        Q-value cutoff for replicate analysis
    max_frag_size : int | None
        Maximum fragment size
    selections : set[str] | None
        Only process selected groups
    nolambda : bool
        If True, use fixed background lambda
    shift : int
        Shift size
    extsize : int
        Extension size
    min_len : int | None
        Minimum length of peaks
    blacklist : Path | None
        Path to blacklist regions file
    key_added : str
        Key under which to add the peak information
    tempdir : Path | None
        Directory for temporary files
    inplace : bool
        If True, store the result in adata.uns
    n_jobs : int
        Number of parallel jobs
    
    Returns
    -------
    dict[str, polars.DataFrame] | None
        Dictionary of peak DataFrames if inplace=False
    """
    print("Starting MACS3 broad peak calling process...")
    
    # Create temporary directory if not provided
    if tempdir is None:
        temp_dir_obj = tempfile.TemporaryDirectory()
        tempdir = Path(temp_dir_obj.name)
        print(f"Created temporary directory at: {tempdir}")
    else:
        tempdir = Path(tempdir)
        tempdir.mkdir(parents=True, exist_ok=True)
        temp_dir_obj = None
        print(f"Using provided temporary directory: {tempdir}")

    try:
        # Export fragments for each group
        print("\nExporting fragments for processing...")
        fragment_files = snap.ex.export_fragments(
            adata=adata,
            groupby=groupby,
            selections=selections,
            min_frag_length=min_len,
            max_frag_length=max_frag_size,
            out_dir=tempdir,
            suffix='.bed.gz',
            compression='gzip'
        )
        print(f"Successfully exported fragments for {len(fragment_files)} groups")

        results = {}
        
        # Process each group
        total_groups = len(fragment_files)
        for idx, (group_name, fragment_file) in enumerate(fragment_files.items(), 1):
            print(f"\nProcessing group {idx}/{total_groups}: {group_name}")
            output_prefix = tempdir / f"macs3_output_{group_name}"
            
            # Construct MACS3 command
            cmd = [
                'macs3', 'callpeak',
                '-t', str(fragment_file),
                '-f', 'BED',
                '-g', 'hs',  # genome size
                '--broad',
                '-n', str(output_prefix),
                '--outdir', str(tempdir),
                '-q', str(qvalue),
                '--broad-cutoff', str(broadcutoff),
                '--shift', str(shift),
                '--extsize', str(extsize),
                '--nomodel'
            ]
            
            if nolambda:
                cmd.append('--nolambda')
            
            if blacklist:
                cmd.extend(['--blacklist', str(blacklist)])

            print(f"Running MACS3 with parameters:")
            print(f"  - Input file: {fragment_file}")
            print(f"  - Output prefix: {output_prefix}")
            print(f"  - Q-value cutoff: {qvalue}")
            print(f"  - Broad cutoff: {broadcutoff}")
            
            # Run MACS3
            try:
                subprocess.run(cmd, check=True)
                print(f"MACS3 completed successfully for group: {group_name}")
            except subprocess.CalledProcessError as e:
                print(f"Error running MACS3 for group {group_name}: {str(e)}")
                raise

            # Read peaks file
            broad_peaks_file = f"{output_prefix}_peaks.broadPeak"
            if os.path.exists(broad_peaks_file):
                print(f"Reading peaks file: {broad_peaks_file}")
                peaks_df = pl.read_csv(
                    broad_peaks_file,
                    separator='\t',
                    has_header=False,
                    new_columns=['chrom', 'start', 'end', 'name', 'score', 
                                'strand', 'fold_change', 'pvalue', 'qvalue']
                )
                print(f"Found {len(peaks_df)} peaks for group {group_name}")
                results[group_name] = peaks_df
            else:
                print(f"Warning: No peaks file found for group {group_name}")
            
        # Store results
        if inplace:
            print(f"\nStoring results in adata.uns['{key_added}']")
            adata.uns[key_added] = results
            print("Process completed successfully")
            return None
        
        print("\nProcess completed successfully")
        return results

    except Exception as e:
        print(f"\nError occurred during peak calling: {str(e)}")
        raise

    finally:
        # Cleanup temporary directory if we created it
        if temp_dir_obj is not None:
            print("\nCleaning up temporary directory...")
            temp_dir_obj.cleanup()
            print("Cleanup completed")

kaizhang · 2024-11-27T11:07:31Z

Thanks, this is on my to-do list.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Feature Requeset: Add broadpeak support for single-cell cut&tag data? #365

Feature Requeset: Add broadpeak support for single-cell cut&tag data? #365

yxwucq commented Nov 23, 2024

yxwucq commented Nov 23, 2024

kaizhang commented Nov 27, 2024

Feature Requeset: Add broadpeak support for single-cell cut&tag data? #365

Feature Requeset: Add broadpeak support for single-cell cut&tag data? #365

Comments

yxwucq commented Nov 23, 2024

yxwucq commented Nov 23, 2024

kaizhang commented Nov 27, 2024