Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature Requeset: Add broadpeak support for single-cell cut&tag data? #365

Open
yxwucq opened this issue Nov 23, 2024 · 2 comments
Open

Comments

@yxwucq
Copy link

yxwucq commented Nov 23, 2024

Hi Dr. Zhang
Thank you great contribution to scalable scATAC analysis tools. Now snap.tl.macs3 only can be used to call narrowPeak, but other omics like many histone modifications are boradPeak. Can you add broadpeak support in snap.tl.macs3? I've tried directly modifing options:

def macs3(
...
options.broad = broad
options.log_broadcutoff = log(broadcutoff, 10) * -1

But there are still conficts in exporting peak to polars.dataframe. For example, in function find_reproducible_peaks of lib.rs, it tries to get attribute of peaks in PeakIO object, but peaks attribute cannot be acessed in BroadPeakIO

    let peaks = get_peaks(peaks)?
        .into_iter()
        .filter(|x| !black.is_overlapped(x))
        .collect::<Vec<_>>();
AttributeError: type object 'MACS3.IO.PeakIO.BroadPeakIO' has no attribute 'peaks'
@yxwucq
Copy link
Author

yxwucq commented Nov 23, 2024

As temporary alternatives, I wrote this function that uses excutable macs3 to call broadpeaks and add peak files back to adata.uns['macs3_broad']

import os
import subprocess
import tempfile
from pathlib import Path
import polars as pl
from typing import Optional, Union, List, Set, Dict
from snapatac2 import AnnData, AnnDataSet
import snapatac2 as snap

def macs3_call_broadpeaks(
    adata: Union[AnnData, AnnDataSet],
    *,
    groupby: Optional[Union[str, List[str]]] = None,
    qvalue: float = 0.05,
    broadcutoff: Optional[float] = 0.1,
    replicate: Optional[Union[str, List[str]]] = None,
    replicate_qvalue: Optional[float] = None,
    max_frag_size: Optional[int] = None,
    selections: Optional[Set[str]] = None,
    nolambda: bool = False,
    shift: int = -100,
    extsize: int = 200,
    min_len: Optional[int] = None,
    blacklist: Optional[Path] = None,
    key_added: str = 'macs3_broad',
    tempdir: Optional[Path] = None,
    inplace: bool = True,
    n_jobs: int = 8,
) -> Optional[Dict[str, 'pl.DataFrame']]:
    """
    Call broad peaks using MACS3.
    
    Parameters
    ----------
    adata : AnnData | AnnDataSet
        The annotated data matrix
    groupby : str | list[str] | None
        The column name(s) in adata.obs for grouping cells
    qvalue : float
        Q-value cutoff for peak detection
    replicate : str | list[str] | None
        Column name(s) in adata.obs for replicates
    replicate_qvalue : float | None
        Q-value cutoff for replicate analysis
    max_frag_size : int | None
        Maximum fragment size
    selections : set[str] | None
        Only process selected groups
    nolambda : bool
        If True, use fixed background lambda
    shift : int
        Shift size
    extsize : int
        Extension size
    min_len : int | None
        Minimum length of peaks
    blacklist : Path | None
        Path to blacklist regions file
    key_added : str
        Key under which to add the peak information
    tempdir : Path | None
        Directory for temporary files
    inplace : bool
        If True, store the result in adata.uns
    n_jobs : int
        Number of parallel jobs
    
    Returns
    -------
    dict[str, polars.DataFrame] | None
        Dictionary of peak DataFrames if inplace=False
    """
    print("Starting MACS3 broad peak calling process...")
    
    # Create temporary directory if not provided
    if tempdir is None:
        temp_dir_obj = tempfile.TemporaryDirectory()
        tempdir = Path(temp_dir_obj.name)
        print(f"Created temporary directory at: {tempdir}")
    else:
        tempdir = Path(tempdir)
        tempdir.mkdir(parents=True, exist_ok=True)
        temp_dir_obj = None
        print(f"Using provided temporary directory: {tempdir}")

    try:
        # Export fragments for each group
        print("\nExporting fragments for processing...")
        fragment_files = snap.ex.export_fragments(
            adata=adata,
            groupby=groupby,
            selections=selections,
            min_frag_length=min_len,
            max_frag_length=max_frag_size,
            out_dir=tempdir,
            suffix='.bed.gz',
            compression='gzip'
        )
        print(f"Successfully exported fragments for {len(fragment_files)} groups")

        results = {}
        
        # Process each group
        total_groups = len(fragment_files)
        for idx, (group_name, fragment_file) in enumerate(fragment_files.items(), 1):
            print(f"\nProcessing group {idx}/{total_groups}: {group_name}")
            output_prefix = tempdir / f"macs3_output_{group_name}"
            
            # Construct MACS3 command
            cmd = [
                'macs3', 'callpeak',
                '-t', str(fragment_file),
                '-f', 'BED',
                '-g', 'hs',  # genome size
                '--broad',
                '-n', str(output_prefix),
                '--outdir', str(tempdir),
                '-q', str(qvalue),
                '--broad-cutoff', str(broadcutoff),
                '--shift', str(shift),
                '--extsize', str(extsize),
                '--nomodel'
            ]
            
            if nolambda:
                cmd.append('--nolambda')
            
            if blacklist:
                cmd.extend(['--blacklist', str(blacklist)])

            print(f"Running MACS3 with parameters:")
            print(f"  - Input file: {fragment_file}")
            print(f"  - Output prefix: {output_prefix}")
            print(f"  - Q-value cutoff: {qvalue}")
            print(f"  - Broad cutoff: {broadcutoff}")
            
            # Run MACS3
            try:
                subprocess.run(cmd, check=True)
                print(f"MACS3 completed successfully for group: {group_name}")
            except subprocess.CalledProcessError as e:
                print(f"Error running MACS3 for group {group_name}: {str(e)}")
                raise

            # Read peaks file
            broad_peaks_file = f"{output_prefix}_peaks.broadPeak"
            if os.path.exists(broad_peaks_file):
                print(f"Reading peaks file: {broad_peaks_file}")
                peaks_df = pl.read_csv(
                    broad_peaks_file,
                    separator='\t',
                    has_header=False,
                    new_columns=['chrom', 'start', 'end', 'name', 'score', 
                                'strand', 'fold_change', 'pvalue', 'qvalue']
                )
                print(f"Found {len(peaks_df)} peaks for group {group_name}")
                results[group_name] = peaks_df
            else:
                print(f"Warning: No peaks file found for group {group_name}")
            
        # Store results
        if inplace:
            print(f"\nStoring results in adata.uns['{key_added}']")
            adata.uns[key_added] = results
            print("Process completed successfully")
            return None
        
        print("\nProcess completed successfully")
        return results

    except Exception as e:
        print(f"\nError occurred during peak calling: {str(e)}")
        raise

    finally:
        # Cleanup temporary directory if we created it
        if temp_dir_obj is not None:
            print("\nCleaning up temporary directory...")
            temp_dir_obj.cleanup()
            print("Cleanup completed")

@kaizhang
Copy link
Owner

Thanks, this is on my to-do list.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants