Skip to content

Commit

Permalink
replace black formatter with ruff (#247)
Browse files Browse the repository at this point in the history
* remove black formatter and run lint

* resolve deprecated behavior utilizing zip

* add jupyterlab as a dependency
  • Loading branch information
ssenan authored Oct 25, 2023
1 parent 6e9e327 commit 30b0384
Show file tree
Hide file tree
Showing 11 changed files with 80 additions and 85 deletions.
19 changes: 10 additions & 9 deletions notebooks/dnadiffusion.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import copy
import itertools
import math
import os
import pickle
import random
import tempfile
from functools import partial
from typing import Any, Dict, List, Optional
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
Expand Down Expand Up @@ -118,7 +118,7 @@ def create_sample(
conditional_numeric_to_tag: dict,
number_of_samples: int = 20,
specific_group: bool = False,
group_number: Optional[list] = None,
group_number: list | None = None,
cond_weight_to_metric: int = 0,
save_timestep_dataframe: bool = False,
):
Expand Down Expand Up @@ -577,7 +577,8 @@ def __init__(
self.init_conv = nn.Conv2d(input_channels, init_dim, (7, 7), padding=3)
dims = [init_dim, *(dim * m for m in dim_mults)]

in_out = list(zip(dims[:-1], dims[1:]))
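# Previously: in_out = list(zip(dims[:-1], dims[1:]))
# NOTE(review): itertools.pairwise returns a one-shot iterator, not a list.
# Confirm nothing below calls len(in_out) / reversed(in_out) or iterates it
# more than once; if it does, use list(itertools.pairwise(dims)) instead.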
in_out = itertools.pairwise(dims)
block_klass = partial(ResnetBlock, groups=resnet_block_groups)

# time embeddings
Expand Down Expand Up @@ -744,8 +745,8 @@ def save_fasta(df: pd.DataFrame, name: str, num_sequences: int, seq_to_subset_co


def generate_motifs_and_fastas(
df: pd.DataFrame, name: str, num_sequences: int, subset_list: Optional[List] = None
) -> Dict[str, Any]:
df: pd.DataFrame, name: str, num_sequences: int, subset_list: list | None = None
) -> dict[str, Any]:
print("Generating Motifs and Fastas...", name)
print("---" * 10)

Expand Down Expand Up @@ -775,8 +776,8 @@ def generate_motifs_and_fastas(

def preprocess_data(
input_csv: str,
subset_list: Optional[List] = None,
limit_total_sequences: Optional[int] = None,
subset_list: list | None = None,
limit_total_sequences: int | None = None,
number_of_sequences_to_motif_creation: int = 1000,
save_output: bool = True,
):
Expand Down Expand Up @@ -853,7 +854,7 @@ def __getitem__(self, index):
def load_data(
data_path: str = "K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt",
saved_data_path: str = "encode_data_dict.npy",
subset_list: List = [
subset_list: list = [
"GM12878_ENCLB441ZZZ",
"hESCT0_ENCLB449ZZZ",
"K562_ENCLB843GMH",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import copy
import itertools
import math
import os
import random
Expand Down Expand Up @@ -891,7 +892,8 @@ def __init__(
self.init_conv = nn.Conv2d(input_channels, init_dim, (7, 7), padding=3)
dims = [init_dim, *(dim * m for m in dim_mults)]

in_out = list(zip(dims[:-1], dims[1:]))
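# Previously: in_out = list(zip(dims[:-1], dims[1:]))
# NOTE(review): itertools.pairwise returns a one-shot iterator, not a list.
# Confirm nothing below calls len(in_out) / reversed(in_out) or iterates it
# more than once; if it does, use list(itertools.pairwise(dims)) instead.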
in_out = itertools.pairwise(dims)
block_klass = partial(ResnetBlock, groups=resnet_block_groups)

# time embeddings
Expand Down
101 changes: 47 additions & 54 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,47 @@
[tool.black]
target-version = ["py37"]
line-length = 120
skip-string-normalization = true
[build-system]
requires = ["hatchling", "hatch-regex-commit"]
build-backend = "hatchling.build"

[project]
name = "dnadiffusion"
authors = [
{ name = "dnadiffusion", email = "[email protected]" }
]
description = "Library for probabilistic analysis of protein-protein interaction sequencing data."
readme = "README.md"
dynamic = ["version"]
classifiers = [
"Programming Language :: Python :: 3 :: Only",
]
requires-python = ">=3.10"
dependencies = [
"accelerate==0.23.0",
"click==8.1.7",
"einops==0.7.0",
"genomepy==0.16.1",
"gimmemotifs==0.18.0",
"gtfparse==1.3.0",
"jupyterlab==4.0.7",
"matplotlib==3.8.0",
"memory-efficient-attention-pytorch==0.1.6",
"pandas==2.1.1",
"pybedtools==0.9.1",
"seaborn==0.13.0",
"sourmash==4.8.4",
"torch==2.1.0",
"torchvision==0.16.0",
"wandb==0.15.12",
]

[project.scripts]
dnadiffusion = "dnadiffusion.cli:main"

[project.urls]
Documentation = "https://pinellolab.github.io/DNA-Diffusion"
Source = "https://github.com/pinellolab/DNA-Diffusion"

[tool.ruff]
target-version = "py37"
target-version = "py310"
line-length = 120
select = [
"A",
Expand Down Expand Up @@ -54,6 +91,9 @@ exclude = [
[tool.ruff.isort]
known-first-party = ["dnadiffusion"]

[tool.ruff.format]
quote-style = "double"

[tool.ruff.flake8-tidy-imports]
ban-relative-imports = "all"

Expand Down Expand Up @@ -94,10 +134,6 @@ module = [
"accelerate",
"Bio",
"hydra_zen",
"lightning",
"lightning.pytorch.utilities",
"lightning.pytorch.callbacks",
"lightning.pytorch.loggers",
"memory_efficient_attention_pytorch",
"matplotlib",
"matplotlib.pyplot",
Expand All @@ -124,8 +160,6 @@ path = "src/dnadiffusion/__about__.py"
[tool.hatch.envs.default]
python = "3.10"
dependencies = [
"black==23.10.1",
"black[jupyter]",
"mkdocs-material==9.3.1",
"mkdocstrings==0.23.0",
"mkdocstrings[python]",
Expand All @@ -142,55 +176,14 @@ dependencies = [
test = "pytest -rA"
test-cov-xml = "pytest -rA --cov-report=xml"
lint = [
"black .",
"ruff format .",
"ruff --fix .",
# "mypy src/dnadiffusion/",
]
lint-check = [
"black --check .",
"ruff format --check .",
"ruff .",
# "mypy src/dnadiffusion/",
]
docs-serve = "mkdocs serve"
docs-build = "mkdocs build"

[build-system]
requires = ["hatchling", "hatch-regex-commit"]
build-backend = "hatchling.build"

[project]
name = "dnadiffusion"
authors = [
{ name = "dnadiffusion", email = "[email protected]" }
]
description = "Library for probabilistic analysis of protein-protein interaction sequencing data."
readme = "README.md"
dynamic = ["version"]
classifiers = [
"Programming Language :: Python :: 3 :: Only",
]
requires-python = ">=3.10"
dependencies = [
"accelerate==0.23.0",
"click==8.1.7",
"einops==0.7.0",
"genomepy==0.16.1",
"gimmemotifs==0.18.0",
"gtfparse==1.3.0",
"matplotlib==3.8.0",
"memory-efficient-attention-pytorch==0.1.6",
"pandas==2.1.1",
"pybedtools==0.9.1",
"seaborn==0.13.0",
"sourmash==4.8.4",
"torch==2.1.0",
"torchvision==0.16.0",
"wandb==0.15.12",
]

[project.scripts]
dnadiffusion = "dnadiffusion.cli:main"

[project.urls]
Documentation = "https://pinellolab.github.io/DNA-Diffusion"
Source = "https://github.com/pinellolab/DNA-Diffusion"
12 changes: 6 additions & 6 deletions src/dnadiffusion/data/dataloader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import pickle
import random
from typing import Any, Dict, List, Optional
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
Expand All @@ -16,7 +16,7 @@
def load_data(
data_path: str = "K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt",
saved_data_path: str = "encode_data.pkl",
subset_list: List = [
subset_list: list = [
"GM12878_ENCLB441ZZZ",
"hESCT0_ENCLB449ZZZ",
"K562_ENCLB843GMH",
Expand Down Expand Up @@ -115,8 +115,8 @@ def save_fasta(df: pd.DataFrame, name: str, num_sequences: int, seq_to_subset_co


def generate_motifs_and_fastas(
df: pd.DataFrame, name: str, num_sequences: int, subset_list: Optional[List] = None
) -> Dict[str, Any]:
df: pd.DataFrame, name: str, num_sequences: int, subset_list: list | None = None
) -> dict[str, Any]:
print("Generating Motifs and Fastas...", name)
print("---" * 10)

Expand Down Expand Up @@ -146,8 +146,8 @@ def generate_motifs_and_fastas(

def preprocess_data(
input_csv: str,
subset_list: Optional[List] = None,
limit_total_sequences: Optional[int] = None,
subset_list: list | None = None,
limit_total_sequences: int | None = None,
number_of_sequences_to_motif_creation: int = 1000,
save_output: bool = True,
):
Expand Down
3 changes: 1 addition & 2 deletions src/dnadiffusion/data/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os
import shutil
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
Expand All @@ -13,7 +12,7 @@
def preprocess_data(
data_path: Path = DATA_DIR,
df_path: str = "/master_dataset.ftr",
cell_list: List = ["K562_ENCLB843GMH", "hESCT0_ENCLB449ZZZ", "HepG2_ENCLB029COU", "GM12878_ENCLB441ZZZ"],
cell_list: list = ["K562_ENCLB843GMH", "hESCT0_ENCLB449ZZZ", "HepG2_ENCLB029COU", "GM12878_ENCLB441ZZZ"],
download_data_bool: bool = True,
create_master_dataset_bool: bool = True,
filter_data_bool: bool = True,
Expand Down
3 changes: 1 addition & 2 deletions src/dnadiffusion/metrics/motif_composition.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import os
import re
from typing import Dict

import pandas as pd

Expand Down Expand Up @@ -61,7 +60,7 @@ def motif_composition_matrix(
return output_df


def parse_motif_file(file_path: str = f"{DATA_DIR}/JASPAR2020_vertebrates.pfm", download_data: bool = False) -> Dict:
def parse_motif_file(file_path: str = f"{DATA_DIR}/JASPAR2020_vertebrates.pfm", download_data: bool = False) -> dict:
"""Given a file path to the motif pfm file, return a sorted dictionary of motifs."""
if download_data:
# Download JASPAR2020_vertebrates.pfm
Expand Down
10 changes: 5 additions & 5 deletions src/dnadiffusion/models/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,14 @@ def forward(self, x: torch.Tensor, *args, **kwargs):
return self.fn(x, *args, **kwargs) + x


def Upsample(dim: int, dim_out: Optional[int]):
def Upsample(dim: int, dim_out: int | None):
return nn.Sequential(
nn.Upsample(scale_factor=2, mode="nearest"),
nn.Conv2d(dim, default(dim_out, dim), 3, padding=1),
)


def Downsample(dim: int, dim_out: Optional[int]):
def Downsample(dim: int, dim_out: int | None):
return nn.Conv2d(dim, default(dim_out, dim), 4, 2, 1)


Expand Down Expand Up @@ -173,7 +173,7 @@ def __init__(self, dim: int, dim_out: int, groups: int = 8) -> None:
self.norm = nn.GroupNorm(groups, dim_out)
self.act = nn.SiLU()

def forward(self, x: torch.Tensor, scale_shift: Optional[torch.Tensor] = None):
def forward(self, x: torch.Tensor, scale_shift: torch.Tensor | None = None):
x = self.proj(x)
x = self.norm(x)

Expand All @@ -186,15 +186,15 @@ def forward(self, x: torch.Tensor, scale_shift: Optional[torch.Tensor] = None):


class ResnetBlock(nn.Module):
def __init__(self, dim: int, dim_out: int, *, time_emb_dim: Optional[int], groups: int = 8) -> None:
def __init__(self, dim: int, dim_out: int, *, time_emb_dim: int | None, groups: int = 8) -> None:
super().__init__()
self.mlp = (nn.Sequential(nn.SiLU(), nn.Linear(time_emb_dim, dim_out * 2))) if exists(time_emb_dim) else None

self.block1 = Block(dim, dim_out, groups=groups)
self.block2 = Block(dim_out, dim_out, groups=groups)
self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()

def forward(self, x: torch.Tensor, time_emb: Optional[torch.Tensor] = None):
def forward(self, x: torch.Tensor, time_emb: torch.Tensor | None = None):
scale_shift = None
if exists(self.mlp) and exists(time_emb):
time_emb = self.mlp(time_emb)
Expand Down
3 changes: 2 additions & 1 deletion src/dnadiffusion/models/unet.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import itertools
from functools import partial
from typing import Optional

Expand Down Expand Up @@ -32,7 +33,7 @@ def __init__(
self.init_conv = nn.Conv2d(input_channels, init_dim, (7, 7), padding=3)
dims = [init_dim, *(dim * m for m in dim_mults)]

in_out = list(zip(dims[:-1], dims[1:]))
in_out = itertools.pairwise(dims)
block_klass = partial(ResnetBlock, groups=resnet_block_groups)

# time embeddings
Expand Down
4 changes: 2 additions & 2 deletions src/dnadiffusion/utils/data_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def get_gtf_df(self):
return self.df_gtf

def geneid2genename(self, gene_list):
'''given a list of geneid convert it to list names
"""given a list of geneid convert it to list names
ex:
usage
geneid2genename(["ENSG00000000003", "ENSG00000000004" ])
Expand All @@ -148,7 +148,7 @@ def geneid2genename(self, gene_list):
return:
conversion results gene_id to gene_name
list[str]
'''
"""
gtf_df = self.get_gtf_df()
dict_conversion = dict(zip(gtf_df["gene_id"].values, gtf_df["gene_name"].values))
return [dict_conversion[g] for g in gene_list]
Expand Down
2 changes: 1 addition & 1 deletion src/dnadiffusion/utils/sample_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def create_sample(
cell_types: list,
conditional_numeric_to_tag: dict,
number_of_samples: int = 1000,
group_number: Optional[list] = None,
group_number: list | None = None,
cond_weight_to_metric: int = 0,
save_timesteps: bool = False,
save_dataframe: bool = False,
Expand Down
4 changes: 2 additions & 2 deletions src/dnadiffusion/utils/train_util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import copy
from typing import Any, Dict
from typing import Any

import torch
import torchvision.transforms as T
Expand All @@ -17,7 +17,7 @@
class TrainLoop:
def __init__(
self,
data: Dict[str, Any],
data: dict[str, Any],
model: torch.nn.Module,
accelerator: Accelerator,
epochs: int = 10000,
Expand Down

0 comments on commit 30b0384

Please sign in to comment.