diff --git a/notebooks/dnadiffusion.py b/notebooks/dnadiffusion.py index 3f68b328..29384c34 100644 --- a/notebooks/dnadiffusion.py +++ b/notebooks/dnadiffusion.py @@ -1,11 +1,11 @@ import copy +import itertools import math import os import pickle import random -import tempfile from functools import partial -from typing import Any, Dict, List, Optional +from typing import Any import matplotlib.pyplot as plt import numpy as np @@ -118,7 +118,7 @@ def create_sample( conditional_numeric_to_tag: dict, number_of_samples: int = 20, specific_group: bool = False, - group_number: Optional[list] = None, + group_number: list | None = None, cond_weight_to_metric: int = 0, save_timestep_dataframe: bool = False, ): @@ -577,7 +577,8 @@ def __init__( self.init_conv = nn.Conv2d(input_channels, init_dim, (7, 7), padding=3) dims = [init_dim, *(dim * m for m in dim_mults)] - in_out = list(zip(dims[:-1], dims[1:])) + # Materialize as a list (as the replaced zip() expression did): itertools.pairwise() returns a one-shot iterator. + in_out = list(itertools.pairwise(dims)) block_klass = partial(ResnetBlock, groups=resnet_block_groups) # time embeddings @@ -744,8 +745,8 @@ def save_fasta(df: pd.DataFrame, name: str, num_sequences: int, seq_to_subset_co def generate_motifs_and_fastas( - df: pd.DataFrame, name: str, num_sequences: int, subset_list: Optional[List] = None -) -> Dict[str, Any]: + df: pd.DataFrame, name: str, num_sequences: int, subset_list: list | None = None +) -> dict[str, Any]: print("Generating Motifs and Fastas...", name) print("---" * 10) @@ -775,8 +776,8 @@ def generate_motifs_and_fastas( def preprocess_data( input_csv: str, - subset_list: Optional[List] = None, - limit_total_sequences: Optional[int] = None, + subset_list: list | None = None, + limit_total_sequences: int | None = None, number_of_sequences_to_motif_creation: int = 1000, save_output: bool = True, ): @@ -853,7 +854,7 @@ def __getitem__(self, index): def load_data( data_path: str = "K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt", saved_data_path: str = "encode_data_dict.npy", - 
subset_list: List = [ + subset_list: list = [ "GM12878_ENCLB441ZZZ", "hESCT0_ENCLB449ZZZ", "K562_ENCLB843GMH", diff --git a/notebooks/experiments/conditional_diffusion/full_script_version_from_accelerate_notebook/dnadiffusion.py b/notebooks/experiments/conditional_diffusion/full_script_version_from_accelerate_notebook/dnadiffusion.py index f4ceac01..a096732c 100644 --- a/notebooks/experiments/conditional_diffusion/full_script_version_from_accelerate_notebook/dnadiffusion.py +++ b/notebooks/experiments/conditional_diffusion/full_script_version_from_accelerate_notebook/dnadiffusion.py @@ -1,4 +1,5 @@ import copy +import itertools import math import os import random @@ -891,7 +892,8 @@ def __init__( self.init_conv = nn.Conv2d(input_channels, init_dim, (7, 7), padding=3) dims = [init_dim, *(dim * m for m in dim_mults)] - in_out = list(zip(dims[:-1], dims[1:])) + # Materialize as a list (as the replaced zip() expression did): itertools.pairwise() returns a one-shot iterator. + in_out = list(itertools.pairwise(dims)) block_klass = partial(ResnetBlock, groups=resnet_block_groups) # time embeddings diff --git a/pyproject.toml b/pyproject.toml index 3d8e4d8f..bff195c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,47 @@ -[tool.black] -target-version = ["py37"] -line-length = 120 -skip-string-normalization = true +[build-system] +requires = ["hatchling", "hatch-regex-commit"] +build-backend = "hatchling.build" + +[project] +name = "dnadiffusion" +authors = [ + { name = "dnadiffusion", email = "dnadiffusion@pinellolab.org" } +] +description = "Library for probabilistic analysis of protein-protein interaction sequencing data." 
+readme = "README.md" +dynamic = ["version"] +classifiers = [ + "Programming Language :: Python :: 3 :: Only", +] +requires-python = ">=3.10" +dependencies = [ + "accelerate==0.23.0", + "click==8.1.7", + "einops==0.7.0", + "genomepy==0.16.1", + "gimmemotifs==0.18.0", + "gtfparse==1.3.0", + "jupyterlab==4.0.7", + "matplotlib==3.8.0", + "memory-efficient-attention-pytorch==0.1.6", + "pandas==2.1.1", + "pybedtools==0.9.1", + "seaborn==0.13.0", + "sourmash==4.8.4", + "torch==2.1.0", + "torchvision==0.16.0", + "wandb==0.15.12", +] + +[project.scripts] +dnadiffusion = "dnadiffusion.cli:main" + +[project.urls] +Documentation = "https://pinellolab.github.io/DNA-Diffusion" +Source = "https://github.com/pinellolab/DNA-Diffusion" [tool.ruff] -target-version = "py37" +target-version = "py310" line-length = 120 select = [ "A", @@ -54,6 +91,9 @@ exclude = [ [tool.ruff.isort] known-first-party = ["dnadiffusion"] +[tool.ruff.format] +quote-style = "double" + [tool.ruff.flake8-tidy-imports] ban-relative-imports = "all" @@ -94,10 +134,6 @@ module = [ "accelerate", "Bio", "hydra_zen", - "lightning", - "lightning.pytorch.utilities", - "lightning.pytorch.callbacks", - "lightning.pytorch.loggers", "memory_efficient_attention_pytorch", "matplotlib", "matplotlib.pyplot", @@ -124,8 +160,6 @@ path = "src/dnadiffusion/__about__.py" [tool.hatch.envs.default] python = "3.10" dependencies = [ - "black==23.10.1", - "black[jupyter]", "mkdocs-material==9.3.1", "mkdocstrings==0.23.0", "mkdocstrings[python]", @@ -142,55 +176,14 @@ dependencies = [ test = "pytest -rA" test-cov-xml = "pytest -rA --cov-report=xml" lint = [ - "black .", + "ruff format .", "ruff --fix .", # "mypy src/dnadiffusion/", ] lint-check = [ - "black --check .", + "ruff format --check .", "ruff .", # "mypy src/dnadiffusion/", ] docs-serve = "mkdocs serve" docs-build = "mkdocs build" - -[build-system] -requires = ["hatchling", "hatch-regex-commit"] -build-backend = "hatchling.build" - -[project] -name = "dnadiffusion" -authors = [ 
- { name = "dnadiffusion", email = "dnadiffusion@pinellolab.org" } -] -description = "Library for probabilistic analysis of protein-protein interaction sequencing data." -readme = "README.md" -dynamic = ["version"] -classifiers = [ - "Programming Language :: Python :: 3 :: Only", -] -requires-python = ">=3.10" -dependencies = [ - "accelerate==0.23.0", - "click==8.1.7", - "einops==0.7.0", - "genomepy==0.16.1", - "gimmemotifs==0.18.0", - "gtfparse==1.3.0", - "matplotlib==3.8.0", - "memory-efficient-attention-pytorch==0.1.6", - "pandas==2.1.1", - "pybedtools==0.9.1", - "seaborn==0.13.0", - "sourmash==4.8.4", - "torch==2.1.0", - "torchvision==0.16.0", - "wandb==0.15.12", -] - -[project.scripts] -dnadiffusion = "dnadiffusion.cli:main" - -[project.urls] -Documentation = "https://pinellolab.github.io/DNA-Diffusion" -Source = "https://github.com/pinellolab/DNA-Diffusion" diff --git a/src/dnadiffusion/data/dataloader.py b/src/dnadiffusion/data/dataloader.py index 5901374e..618767a1 100644 --- a/src/dnadiffusion/data/dataloader.py +++ b/src/dnadiffusion/data/dataloader.py @@ -1,7 +1,7 @@ import os import pickle import random -from typing import Any, Dict, List, Optional +from typing import Any import matplotlib.pyplot as plt import numpy as np @@ -16,7 +16,7 @@ def load_data( data_path: str = "K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt", saved_data_path: str = "encode_data.pkl", - subset_list: List = [ + subset_list: list = [ "GM12878_ENCLB441ZZZ", "hESCT0_ENCLB449ZZZ", "K562_ENCLB843GMH", @@ -115,8 +115,8 @@ def save_fasta(df: pd.DataFrame, name: str, num_sequences: int, seq_to_subset_co def generate_motifs_and_fastas( - df: pd.DataFrame, name: str, num_sequences: int, subset_list: Optional[List] = None -) -> Dict[str, Any]: + df: pd.DataFrame, name: str, num_sequences: int, subset_list: list | None = None +) -> dict[str, Any]: print("Generating Motifs and Fastas...", name) print("---" * 10) @@ -146,8 +146,8 @@ def generate_motifs_and_fastas( def preprocess_data( 
input_csv: str, - subset_list: Optional[List] = None, - limit_total_sequences: Optional[int] = None, + subset_list: list | None = None, + limit_total_sequences: int | None = None, number_of_sequences_to_motif_creation: int = 1000, save_output: bool = True, ): diff --git a/src/dnadiffusion/data/preprocessing.py b/src/dnadiffusion/data/preprocessing.py index f3648f8e..1cd9cdc5 100644 --- a/src/dnadiffusion/data/preprocessing.py +++ b/src/dnadiffusion/data/preprocessing.py @@ -1,7 +1,6 @@ import os import shutil from pathlib import Path -from typing import List import numpy as np import pandas as pd @@ -13,7 +12,7 @@ def preprocess_data( data_path: Path = DATA_DIR, df_path: str = "/master_dataset.ftr", - cell_list: List = ["K562_ENCLB843GMH", "hESCT0_ENCLB449ZZZ", "HepG2_ENCLB029COU", "GM12878_ENCLB441ZZZ"], + cell_list: list = ["K562_ENCLB843GMH", "hESCT0_ENCLB449ZZZ", "HepG2_ENCLB029COU", "GM12878_ENCLB441ZZZ"], download_data_bool: bool = True, create_master_dataset_bool: bool = True, filter_data_bool: bool = True, diff --git a/src/dnadiffusion/metrics/motif_composition.py b/src/dnadiffusion/metrics/motif_composition.py index 51beb475..4b08e4d8 100644 --- a/src/dnadiffusion/metrics/motif_composition.py +++ b/src/dnadiffusion/metrics/motif_composition.py @@ -1,6 +1,5 @@ import os import re -from typing import Dict import pandas as pd @@ -61,7 +60,7 @@ def motif_composition_matrix( return output_df -def parse_motif_file(file_path: str = f"{DATA_DIR}/JASPAR2020_vertebrates.pfm", download_data: bool = False) -> Dict: +def parse_motif_file(file_path: str = f"{DATA_DIR}/JASPAR2020_vertebrates.pfm", download_data: bool = False) -> dict: """Given a file path to the motif pfm file, return a sorted dictionary of motifs.""" if download_data: # Download JASPAR2020_vertebrates.pfm diff --git a/src/dnadiffusion/models/layers.py b/src/dnadiffusion/models/layers.py index 65a9e815..249453e6 100644 --- a/src/dnadiffusion/models/layers.py +++ b/src/dnadiffusion/models/layers.py @@ 
-107,14 +107,14 @@ def forward(self, x: torch.Tensor, *args, **kwargs): return self.fn(x, *args, **kwargs) + x -def Upsample(dim: int, dim_out: Optional[int]): +def Upsample(dim: int, dim_out: int | None): return nn.Sequential( nn.Upsample(scale_factor=2, mode="nearest"), nn.Conv2d(dim, default(dim_out, dim), 3, padding=1), ) -def Downsample(dim: int, dim_out: Optional[int]): +def Downsample(dim: int, dim_out: int | None): return nn.Conv2d(dim, default(dim_out, dim), 4, 2, 1) @@ -173,7 +173,7 @@ def __init__(self, dim: int, dim_out: int, groups: int = 8) -> None: self.norm = nn.GroupNorm(groups, dim_out) self.act = nn.SiLU() - def forward(self, x: torch.Tensor, scale_shift: Optional[torch.Tensor] = None): + def forward(self, x: torch.Tensor, scale_shift: torch.Tensor | None = None): x = self.proj(x) x = self.norm(x) @@ -186,7 +186,7 @@ def forward(self, x: torch.Tensor, scale_shift: Optional[torch.Tensor] = None): class ResnetBlock(nn.Module): - def __init__(self, dim: int, dim_out: int, *, time_emb_dim: Optional[int], groups: int = 8) -> None: + def __init__(self, dim: int, dim_out: int, *, time_emb_dim: int | None, groups: int = 8) -> None: super().__init__() self.mlp = (nn.Sequential(nn.SiLU(), nn.Linear(time_emb_dim, dim_out * 2))) if exists(time_emb_dim) else None @@ -194,7 +194,7 @@ def __init__(self, dim: int, dim_out: int, *, time_emb_dim: Optional[int], group self.block2 = Block(dim_out, dim_out, groups=groups) self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity() - def forward(self, x: torch.Tensor, time_emb: Optional[torch.Tensor] = None): + def forward(self, x: torch.Tensor, time_emb: torch.Tensor | None = None): scale_shift = None if exists(self.mlp) and exists(time_emb): time_emb = self.mlp(time_emb) diff --git a/src/dnadiffusion/models/unet.py b/src/dnadiffusion/models/unet.py index 4ad060b7..748a5530 100644 --- a/src/dnadiffusion/models/unet.py +++ b/src/dnadiffusion/models/unet.py @@ -1,3 +1,4 @@ +import itertools from 
functools import partial from typing import Optional @@ -32,7 +33,7 @@ def __init__( self.init_conv = nn.Conv2d(input_channels, init_dim, (7, 7), padding=3) dims = [init_dim, *(dim * m for m in dim_mults)] - in_out = list(zip(dims[:-1], dims[1:])) + in_out = list(itertools.pairwise(dims)) block_klass = partial(ResnetBlock, groups=resnet_block_groups) # time embeddings diff --git a/src/dnadiffusion/utils/data_util.py b/src/dnadiffusion/utils/data_util.py index 6689bfce..2f828d0c 100644 --- a/src/dnadiffusion/utils/data_util.py +++ b/src/dnadiffusion/utils/data_util.py @@ -138,7 +138,7 @@ def get_gtf_df(self): return self.df_gtf def geneid2genename(self, gene_list): - '''given a list of geneid convert it to list names + """given a list of geneid convert it to list names ex: usage geneid2genename(["ENSG00000000003", "ENSG00000000004" ]) @@ -148,7 +148,7 @@ def geneid2genename(self, gene_list): return: conversion results gene_id to gene_name list[str] - ''' + """ gtf_df = self.get_gtf_df() dict_conversion = dict(zip(gtf_df["gene_id"].values, gtf_df["gene_name"].values)) return [dict_conversion[g] for g in gene_list] diff --git a/src/dnadiffusion/utils/sample_util.py b/src/dnadiffusion/utils/sample_util.py index 972b4e76..e7b8e47c 100644 --- a/src/dnadiffusion/utils/sample_util.py +++ b/src/dnadiffusion/utils/sample_util.py @@ -14,7 +14,7 @@ def create_sample( cell_types: list, conditional_numeric_to_tag: dict, number_of_samples: int = 1000, - group_number: Optional[list] = None, + group_number: list | None = None, cond_weight_to_metric: int = 0, save_timesteps: bool = False, save_dataframe: bool = False, diff --git a/src/dnadiffusion/utils/train_util.py b/src/dnadiffusion/utils/train_util.py index 25bf7278..f8ce33de 100644 --- a/src/dnadiffusion/utils/train_util.py +++ b/src/dnadiffusion/utils/train_util.py @@ -1,5 +1,5 @@ import copy -from typing import Any, Dict +from typing import Any import torch import torchvision.transforms as T @@ -17,7 +17,7 @@ class TrainLoop: def 
__init__( self, - data: Dict[str, Any], + data: dict[str, Any], model: torch.nn.Module, accelerator: Accelerator, epochs: int = 10000,