Skip to content

Commit

Permalink
Data preprocessing scripts for sequence validation dataset (#207)
Browse files Browse the repository at this point in the history
* add gtfparse and pybedtools to dependencies

* convert validation data notebooks to library

* add validation preprocessing helper utils

* replace "SOURCE" with "TAG"
  • Loading branch information
ssenan authored Sep 15, 2023
1 parent c427ade commit ee483a5
Show file tree
Hide file tree
Showing 3 changed files with 351 additions and 0 deletions.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,11 @@ dependencies = [
"einops==0.6.1",
"genomepy==0.16.1",
"gimmemotifs==0.18.0",
"gtfparse==1.3.0",
"matplotlib==3.7.3",
"memory-efficient-attention-pytorch==0.1.6",
"pandas==2.1.0",
"pybedtools==0.9.1",
"seaborn==0.12.2",
"sourmash==4.8.3",
"torch==2.0.1",
Expand Down
192 changes: 192 additions & 0 deletions src/dnadiffusion/data/validation_preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
import os

import genomepy
import pandas as pd
import pybedtools
from pybedtools import BedTool

from dnadiffusion.utils.data_util import GTFProcessing


def combine_all_seqs(cell_list: list, training_data_path: str, save_output: bool = False) -> pd.DataFrame:
    """Combine the generated sequences from the sample loop with the training dataset.

    For each cell type in ``cell_list`` the current working directory is searched
    for a file whose name contains that cell type; each such file is read as one
    sequence per line and tagged "GENERATED". The training dataset is split by
    (data_label, TAG) groups, reformatted to the same schema, and everything is
    concatenated into a single table.

    Args:
        cell_list (list): Cell types used to generate synthetic sequences, e.g.
            ["GM12878_ENCLB441ZZZ",
             "HepG2_ENCLB029COU",
             "hESCT0_ENCLB449ZZZ",
             "K562_ENCLB843GMH"]
        training_data_path (str): Path to the training dataset, e.g.
            K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt
        save_output (bool, optional): If True, write the combined dataframe to
            "DNA_DIFFUSION_ALL_SEQS.txt" (tab-separated).

    Returns:
        pd.DataFrame: Combined dataframe with columns
            ["SEQUENCE", "CELL_TYPE", "TAG", "ID"].
    """
    # Reading the training dataset used to train the model
    df_train = pd.read_csv(training_data_path, sep="\t")

    frames = []

    for cell in cell_list:
        # Locate the generated-sequences file for this cell type in the cwd
        cell_file_name = next(f for f in os.listdir(os.getcwd()) if cell in f)
        # Second underscore-separated token of the file name is the cell type label
        file_name = cell_file_name.split("_")[1]
        # One raw sequence per line, no header
        df = pd.read_csv(cell_file_name, header=None, names=["SEQUENCE"])
        df["CELL_TYPE"] = file_name.upper()
        df["index_number"] = [str(i) for i in df.index]
        df["TAG"] = "GENERATED"
        df["ID"] = df.apply(lambda x: "_".join([x["index_number"], x["TAG"], x["CELL_TYPE"]]), axis=1)
        # index_number only existed to build the ID
        del df["index_number"]
        frames.append(df)

    for k, df in df_train.groupby(["data_label", "TAG"]):
        data_in, tag_in = k

        # Removing replicate ID from tag
        tag_in = (tag_in.split("_")[0]).upper()
        data_in = data_in.upper()
        print(f"Processing {data_in} {tag_in}")
        # NOTE: in the training file the TAG column holds the cell type and
        # data_label holds the train/validation tag, hence the column swap below.
        df_slice = df[["sequence", "TAG", "data_label"]].copy()
        df_slice.columns = ["SEQUENCE", "CELL_TYPE", "TAG"]
        df_slice["TAG"] = df_slice["TAG"].apply(lambda x: x.upper())
        # Upper-case the cell type and strip the replicate suffix (e.g. "K562_rep1" -> "K562")
        df_slice["CELL_TYPE"] = df_slice["CELL_TYPE"].apply(lambda x: x.upper().split("_")[0])
        df_slice["index_number"] = [str(i) for i in df_slice.index]
        df_slice["ID"] = df_slice.apply(lambda x: "_".join([x["index_number"], x["TAG"], x["CELL_TYPE"]]), axis=1)
        del df_slice["index_number"]
        frames.append(df_slice)

    # Combine the generated sequences with the training dataset
    df_combined = pd.concat(frames)
    df_combined.columns = ["SEQUENCE", "CELL_TYPE", "TAG", "ID"]
    if save_output:
        df_combined.to_csv("DNA_DIFFUSION_ALL_SEQS.txt", sep="\t")
    return df_combined


def validation_table(
    training_data_path: str,
    generated_data_path: str,
    promoter_path: str = "gencode.v43.annotation.gtf",
    add_bp_interval: int = 1000,
    num_sequences: int = 10000,
    num_filter_sequences: int = 5000,
    download_genome: bool = False,
    download_gtf_gene_annotation: bool = False,
    genome_path: str = "hg38",
    save_output: bool = False,
) -> pd.DataFrame:
    """Consolidate training, generated, random and promoter sequences into one table.

    Args:
        training_data_path (str): Path to the training dataset used during model training
        generated_data_path (str): Path to the generated sequences from sample loop
        promoter_path (str, optional): Path to the GTF annotation used for promoters
        add_bp_interval (int, optional): Number of base pairs added on each side of the TSS
        num_sequences (int, optional): Number of random sequences to generate
        num_filter_sequences (int, optional): Number of random sequences to filter down to
        download_genome (bool, optional): Download the hg38 genome if not already installed
        download_gtf_gene_annotation (bool, optional): Download the gtf gene annotation if not already installed
        genome_path (str, optional): Directory holding the hg38 genome
        save_output (bool, optional): Write the final table to "DNA_DIFFUSION_VALIDATION_TABLE.txt"

    Returns:
        pd.DataFrame: Combined table with columns
            ["chrom", "start", "end", "ID", "CELL_TYPE", "SEQUENCE", "TAG"]
    """

    if download_genome:
        genomepy.install_genome("hg38", "UCSC", genome_path)
    if download_gtf_gene_annotation:
        os.system(
            "wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.annotation.gtf.gz"
        )
        os.system("gunzip gencode.v43.annotation.gtf.gz")

    # Random background sequences
    print("Generating random sequences dataframe")
    bed_tool = BedTool()
    random_seqs = bed_tool.random(l=200, n=num_sequences, genome="hg38", seed=1)
    random_seqs = random_seqs.to_dataframe()
    # BUGFIX: keep chr1-chr22, chrX and chrY. The previous list stopped at
    # chr21 and used bare "X"/"Y", which never match BedTool's "chrX"/"chrY".
    chromosomes = ["chr" + str(i) for i in range(1, 23)] + ["chrX", "chrY"]
    random_seqs = random_seqs[random_seqs["chrom"].isin(chromosomes)].copy()
    filtered_random_seqs = BedTool.from_dataframe(random_seqs).sequence(f"{genome_path}/hg38/hg38.fa")
    # Attach the fasta sequences (skip ">" header lines; drop the trailing empty line)
    random_seqs["SEQUENCE"] = [x.upper() for x in open(filtered_random_seqs.seqfn).read().split("\n") if ">" not in x][
        :-1
    ]
    # Drop sequences containing undetermined bases and trim to the requested count
    random_seqs = random_seqs[random_seqs["SEQUENCE"].apply(lambda x: "N" not in x)]
    random_seqs = random_seqs.head(num_filter_sequences)
    random_seqs["ID"] = random_seqs.apply(lambda x: f"{x['chrom']}_{x['start']!s}_{x['end']!s}_random", axis=1)
    random_seqs["CELL_TYPE"] = "NO"
    random_seqs["TAG"] = "RANDOM_GENOME_REGIONS"
    random_seqs = random_seqs[["chrom", "start", "end", "ID", "CELL_TYPE", "SEQUENCE", "TAG"]]

    # Promoter sequences
    print("Generating promoter sequences dataframe")
    gtf = GTFProcessing(promoter_path)
    df_gtf = gtf.get_gtf_df()
    df_gtf_filtered = df_gtf.query("feature == 'transcript' and gene_type == 'protein_coding' ").drop_duplicates(
        "gene_name"
    )
    # TSS is the interval start on the "+" strand and the interval end on "-"
    df_gtf_filtered["tss_position"] = df_gtf_filtered.apply(
        lambda x: x["start"] if x["strand"] == "+" else x["end"], axis=1
    )
    df_gtf_filtered["start"] = df_gtf_filtered["tss_position"] - add_bp_interval
    df_gtf_filtered["end"] = df_gtf_filtered["tss_position"] + add_bp_interval
    # BUGFIX: build the BedTool from the BED6-shaped frame (chr/start/end as the
    # first three columns). The raw GTF frame has source/feature in columns 2-3
    # and is not valid BED input; the bed6 frame was previously computed but
    # then discarded.
    df_gtf_bed = gtf.df_to_df_bed(df_gtf_filtered)
    p_seqs = BedTool.from_dataframe(df_gtf_bed).sequence(f"{genome_path}/hg38/hg38.fa")
    # Rename columns and add the metadata expected in the final table
    df_gtf_filtered["chrom"] = df_gtf_filtered["chr"]
    df_gtf_filtered["ID"] = df_gtf_filtered.apply(
        lambda x: f"{x['chrom']}_{x['start']!s}_{x['end']!s}_promoter", axis=1
    )
    df_gtf_filtered["CELL_TYPE"] = "NO"
    df_gtf_filtered["TAG"] = "PROMOTERS"
    df_gtf_filtered["SEQUENCE"] = [x.upper() for x in open(p_seqs.seqfn).read().split("\n") if ">" not in x][:-1]
    df_gtf_filtered = df_gtf_filtered[["chrom", "start", "end", "ID", "CELL_TYPE", "SEQUENCE", "TAG"]]

    # Reading the training dataset used to train the model
    print("Generating training dataset dataframe")
    df_train = pd.read_csv(training_data_path, sep="\t")
    df_train_balanced = pd.concat([v for k, v in df_train.groupby("TAG")])
    # BUGFIX: the interval midpoint is start + (end - start) // 2. The old
    # expression start + (end - start) evaluated to `end`, shifting every
    # 200 bp window 100 bp downstream of the true center.
    df_train_balanced["coord_center"] = df_train_balanced["start"] + (
        (df_train_balanced["end"] - df_train_balanced["start"]) // 2
    )
    df_train_balanced["start"] = df_train_balanced["coord_center"] - 100
    df_train_balanced["end"] = df_train_balanced["coord_center"] + 100
    # Selecting only the columns we need
    df_train_balanced = df_train_balanced[["chr", "start", "end", "dhs_id", "TAG", "sequence", "data_label"]]
    df_train_balanced.columns = ["chrom", "start", "end", "ID", "CELL_TYPE", "SEQUENCE", "TAG"]

    # Reading the generated sequences
    print("Generating synthetic sequences dataframe")
    df_generated = pd.read_csv(generated_data_path, sep="\t").query("TAG == 'GENERATED'")
    df_generated_balanced = pd.concat([v for k, v in df_generated.groupby("CELL_TYPE")])
    # Synthetic sequences have no genomic coordinates
    df_generated_balanced["chrom"] = "NO"
    df_generated_balanced["start"] = "NO"
    df_generated_balanced["end"] = "NO"
    df_generated_balanced = df_generated_balanced[["chrom", "start", "end", "ID", "CELL_TYPE", "SEQUENCE", "TAG"]]
    # Strip the replicate suffix from the cell type (e.g. "K562_rep1" -> "K562")
    df_generated_balanced["CELL_TYPE"] = df_generated_balanced["CELL_TYPE"].apply(lambda x: x.split("_")[0])

    # Combining all the dataframes
    print("Combining all the dataframes")
    df_final = pd.concat([df_train_balanced, df_generated_balanced, df_gtf_filtered, random_seqs], ignore_index=True)
    if save_output:
        df_final.to_csv("DNA_DIFFUSION_VALIDATION_TABLE.txt", sep="\t", index=None)
    return df_final
157 changes: 157 additions & 0 deletions src/dnadiffusion/utils/data_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import gtfparse
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import HTML, display
from tqdm import tqdm


class SEQ_EXTRACT:
    """Thin wrapper around a tab-separated sequence table.

    Loads the table once at construction and offers filtered views by
    TAG / CELL_TYPE.
    """

    def __init__(self, data):
        # Read the full table up front; all queries operate on this frame.
        self.data = pd.read_csv(data, sep="\t")

    def extract_seq(self, tag, cell_type):
        """Return a copy of the rows whose TAG and CELL_TYPE both match."""
        selected = (self.data["TAG"] == tag) & (self.data["CELL_TYPE"] == cell_type)
        return self.data[selected].copy()

    def __repr__(self):
        # Side effect: render the per-(TAG, CELL_TYPE) row counts in the notebook.
        display(self.data.groupby(["TAG", "CELL_TYPE"]).count())
        return "Data structure"


class GTFProcessing:
    # Adapted from https://github.com/LucasSilvaFerreira/GTFProcessing
    """Helper for loading and querying GTF gene-annotation files.

    The GTF is parsed once with ``gtfparse`` at construction; the first
    column is renamed to "chr" and an "interval_lenght" (sic) column is
    added. All other methods operate on that cached dataframe.
    """

    def __init__(self, gtf_file_name):
        # Path to the GTF file on disk
        self.gtf_file_name = gtf_file_name
        # Parse once and cache; all accessors read this frame
        self.df_gtf = self.__load_gtf__()
        self.change_columns_name()
        self.__add_interval_lenght()

    @staticmethod
    def remove_dup_columns(frame):
        """Return *frame* with duplicate-named columns dropped (first occurrence kept)."""
        keep_names = set()
        keep_icols = []
        for icol, name in enumerate(frame.columns):
            if name not in keep_names:
                keep_names.add(name)
                keep_icols.append(icol)
        return frame.iloc[:, keep_icols]

    def __load_gtf__(self):
        """Parse the GTF file, expanding the attribute column into separate columns."""
        print("loading gtf_file...")
        gtf = gtfparse.parse_gtf_and_expand_attributes(self.gtf_file_name)
        return gtf

    def change_columns_name(self):
        """Rename the first column (seqname) to "chr"; other columns are kept."""
        print(self.df_gtf.columns)
        self.df_gtf.columns = ["chr"] + self.df_gtf.columns.values.tolist()[1:]

    def get_gtf_df(self):
        """Return the cached annotation dataframe.

        return: pandas dataframe
        """

        return self.df_gtf

    def geneid2genename(self, gene_list):
        '''Convert a list of gene ids to gene names.

        ex:
        usage
        geneid2genename(["ENSG00000000003", "ENSG00000000004" ])
        Parameters:
            gene_list : list[str]
        return:
            conversion results gene_id to gene_name
            list[str]
        Raises KeyError if an id is absent from the annotation.
        '''
        gtf_df = self.get_gtf_df()
        # Build an id -> name lookup from the annotation table
        dict_conversion = dict(zip(gtf_df["gene_id"].values, gtf_df["gene_name"].values))
        return [dict_conversion[g] for g in gene_list]

    def __add_interval_lenght(self):
        # NOTE(review): "lenght" typo kept intentionally — the column name is
        # part of the dataframe schema downstream code may rely on.
        self.df_gtf["interval_lenght"] = self.df_gtf.end - self.df_gtf.start

    @staticmethod
    def get_first_exon_df(gtf_df):
        """Group genes by transcript id and returns a df with first exon relative to strand"""
        # exon_number == '1' is already strand-aware in GENCODE GTFs, so
        # selecting it per transcript yields the first exon on either strand.
        out_new_df = []
        for k, v in gtf_df.query("feature == 'exon' and exon_number == '1' ").groupby("transcript_id"):
            out_new_df.append(v)

        return pd.concat(out_new_df)

    @staticmethod
    def get_last_exon_df(gtf_df):
        """Group genes by transcript id and returns a df with last exon relative to strand"""

        out_new_df = []
        for k, v in tqdm(gtf_df.query("feature == 'exon' ").groupby("transcript_id")):
            # "+" strand: last exon is the one with the largest end coordinate
            if v.iloc[0].strand == "+":
                out_new_df.append(v.sort_values("end", ascending=True).iloc[-1].values)

            # "-" strand: last exon is the one with the smallest start coordinate
            if v.iloc[0].strand == "-":
                out_new_df.append(v.sort_values("start", ascending=True).iloc[0].values)

        return pd.DataFrame(out_new_df, columns=gtf_df.columns)

    @staticmethod
    def df_to_bed(gtf_df, bed_file_name, fourth_position_feature="gene_name", fifth_position_feature="transcript_id"):
        """Save a bed_file using a gtf as reference and returns the bed_file_name string"""

        print(gtf_df[["chr", "start", "end", fourth_position_feature, fifth_position_feature, "strand"]].head())

        # BED6 layout: chrom, start, end, name, score/extra, strand
        gtf_df[["chr", "start", "end", fourth_position_feature, fifth_position_feature, "strand"]].to_csv(
            bed_file_name, sep="\t", header=None, index=None
        )
        return bed_file_name

    @staticmethod
    def df_to_df_bed(gtf_df, fourth_position_feature="gene_name", fifth_position_feature="transcript_id"):
        """Return a BED6-shaped dataframe built from a gtf dataframe (nothing is written to disk)"""

        print(gtf_df[["chr", "start", "end", fourth_position_feature, fifth_position_feature, "strand"]].head())

        return gtf_df[["chr", "start", "end", fourth_position_feature, fifth_position_feature, "strand"]]

    @staticmethod
    def hist_generate(gtf_df, feature="transcript_biotype"):
        """Plot a bar chart of first-exon counts grouped by *feature*.

        ex: GTFProcessing.hist_generate(gtf_to_test.head(1600), 'transcript_biotype')
        """
        # Count one first exon per transcript, bucketed by the chosen feature
        x_axis_feature = GTFProcessing.get_first_exon_df(gtf_df).groupby(feature).count()["start"]
        plt.bar(range(0, x_axis_feature.values.shape[0]), x_axis_feature.values)
        print(x_axis_feature.keys())
        print(x_axis_feature.values)
        plt.xticks(range(0, x_axis_feature.values.shape[0]), (x_axis_feature.keys().values), rotation="vertical")
        plt.title(feature)
        plt.show()

    @staticmethod
    def generate_hist_by_transcript_biotypes(gtf_df):
        """Shortcut for hist_generate(...) with feature='transcript_biotype'."""
        GTFProcessing.hist_generate(gtf_df, feature="transcript_biotype")

    @staticmethod
    def capture_distal_unique_tes(gtf_df):
        """Return one distal transcription-end-site exon per gene_id, strand-aware."""
        return_distal_exon = []
        last_exon_df = GTFProcessing.get_last_exon_df(gtf_df)
        for k, v in tqdm(last_exon_df.groupby("gene_id")):
            # "+" strand: most distal TES has the largest end coordinate
            if v.iloc[0]["strand"] == "+":
                return_distal_exon.append(v.sort_values("end", ascending=False).iloc[0].values.tolist())
            # "-" strand: most distal TES has the smallest start coordinate
            if v.iloc[0]["strand"] == "-":
                return_distal_exon.append(v.sort_values("start", ascending=True).iloc[0].values.tolist())

        df_distal_exon_by_gene_id = pd.DataFrame(return_distal_exon, columns=last_exon_df.columns.values.tolist())
        return df_distal_exon_by_gene_id

    @staticmethod
    def capture_distal_unique_tss(gtf_df):
        """Return one distal transcription-start-site exon per gene_id, strand-aware."""
        return_distal_tss = []
        first_exon_df = GTFProcessing.get_first_exon_df(gtf_df)
        for k, v in tqdm(first_exon_df.groupby("gene_id")):
            # "+" strand: most distal TSS has the smallest start coordinate
            if v.iloc[0]["strand"] == "+":
                return_distal_tss.append(v.sort_values("start", ascending=True).iloc[0].values.tolist())
            # "-" strand: most distal TSS has the largest end coordinate
            if v.iloc[0]["strand"] == "-":
                return_distal_tss.append(v.sort_values("end", ascending=False).iloc[0].values.tolist())

        df_distal_exon_by_gene_id = pd.DataFrame(return_distal_tss, columns=first_exon_df.columns.values.tolist())
        return df_distal_exon_by_gene_id

0 comments on commit ee483a5

Please sign in to comment.