From ee483a5ad30fdeb9875c78ef2102e54e93857d0e Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 15 Sep 2023 18:49:36 -0400 Subject: [PATCH] Data preprocessing scripts for sequence validation dataset (#207) * add gtfparse and pybedtools to dependencies * convert validation data notebooks to library * add validation preprocessing helper utils * replace "SOURCE" with "TAG" --- pyproject.toml | 2 + .../data/validation_preprocessing.py | 192 ++++++++++++++++++ src/dnadiffusion/utils/data_util.py | 157 ++++++++++++++ 3 files changed, 351 insertions(+) create mode 100644 src/dnadiffusion/data/validation_preprocessing.py create mode 100644 src/dnadiffusion/utils/data_util.py diff --git a/pyproject.toml b/pyproject.toml index 62d51707..5906697b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -176,9 +176,11 @@ dependencies = [ "einops==0.6.1", "genomepy==0.16.1", "gimmemotifs==0.18.0", + "gtfparse==1.3.0", "matplotlib==3.7.3", "memory-efficient-attention-pytorch==0.1.6", "pandas==2.1.0", + "pybedtools==0.9.1", "seaborn==0.12.2", "sourmash==4.8.3", "torch==2.0.1", diff --git a/src/dnadiffusion/data/validation_preprocessing.py b/src/dnadiffusion/data/validation_preprocessing.py new file mode 100644 index 00000000..0842db8e --- /dev/null +++ b/src/dnadiffusion/data/validation_preprocessing.py @@ -0,0 +1,192 @@ +import os + +import genomepy +import pandas as pd +import pybedtools +from pybedtools import BedTool + +from dnadiffusion.utils.data_util import GTFProcessing + + +def combine_all_seqs(cell_list: list, training_data_path: str, save_output: bool = False) -> pd.DataFrame: + """A function to take the generated sequences from sample loop and combine them with the training dataset + + Args: + cell_list (list): A list of cell types used to generate synthetic sequences + ["GM12878_ENCLB441ZZZ", + "HepG2_ENCLB029COU", + "hESCT0_ENCLB449ZZZ", + "K562_ENCLB843GMH"] + + training_dataset_path (str): Path to the training dataset + 
K562_hESCT0_HepG2_GM12878_12k_sequences_per_group.txt + + Returns: + pd.DataFrame: A dataframe of the combined synthetic sequences from all four cell types + """ + # Reading the training dataset used to train the model + df_train = pd.read_csv(training_data_path, sep="\t") + + dfs_to_save = {} + generated_files = [] + + for cell in cell_list: + cell_file_name = next(f for f in os.listdir(os.getcwd()) if cell in f) + # Creating variable to save the generated files + file_name = cell_file_name.split("_")[1] + # Reading current generated file + df = pd.read_csv(cell_file_name, header=None, names=["SEQUENCE"]) + # Adding some columns + df["CELL_TYPE"] = file_name.upper() + df["index_number"] = [str(i) for i in df.index] + df["TAG"] = "GENERATED" + df["ID"] = df.apply(lambda x: "_".join([x["index_number"], x["TAG"], x["CELL_TYPE"]]), axis=1) + # Saving the modified dataframe + dfs_to_save["DF_" + file_name] = df + # Remove index column + del df["index_number"] + generated_files.append(df) + + for k, df in df_train.groupby(["data_label", "TAG"]): + data_in, tag_in = k + + # Removing replicate ID from tag + tag_in = (tag_in.split("_")[0]).upper() + data_in = data_in.upper() + print(f"Processing {data_in} {tag_in}") + # Reformatting the dataframe + df_slice = df[["sequence", "TAG", "data_label"]].copy() + df_slice.columns = ["SEQUENCE", "CELL_TYPE", "TAG"] + df_slice["TAG"] = df_slice["TAG"].apply(lambda x: x.upper()) + df_slice["CELL_TYPE"] = df_slice["CELL_TYPE"].apply(lambda x: x.upper()) + df_slice["CELL_TYPE"] = df_slice["CELL_TYPE"].apply(lambda x: x.split("_")[0]) + df_slice["index_number"] = [str(i) for i in df_slice.index] + df_slice["ID"] = df_slice.apply(lambda x: "_".join([x["index_number"], x["TAG"], x["CELL_TYPE"]]), axis=1) + dfs_to_save[data_in.upper() + "_" + tag_in.upper()] = df_slice + # Remove index column + del df_slice["index_number"] + generated_files.append(df_slice) + + # Combine the generated sequences with the training dataset + df_combined = 
pd.concat(generated_files) + # Rename columns + df_combined.columns = ["SEQUENCE", "CELL_TYPE", "TAG", "ID"] + if save_output: + df_combined.to_csv("DNA_DIFFUSION_ALL_SEQS.txt", sep="\t") + return df_combined + + +def validation_table( + training_data_path: str, + generated_data_path: str, + promoter_path: str = "gencode.v43.annotation.gtf", + add_bp_interval: int = 1000, + num_sequences: int = 10000, + num_filter_sequences: int = 5000, + download_genome: bool = False, + download_gtf_gene_annotation: bool = False, + genome_path: str = "hg38", + save_output: bool = False, +) -> pd.DataFrame: + """Consolidates the training dataset, generated sequences, randomized sequences and promoter sequences into a single dataframe + + Args: + training_data_path (str): Path to the training dataset used during model training + generated_data_path (str): Path to the generated sequences from sample loop + promoter_path (str, optional): Path to the promoter sequences + add_bp_interval (int, optional): Number of base pairs to add to the promoter sequences + num_sequences (int, optional): Number of random sequences to generate + num_filter_sequences (int, optional): Number of random sequences to filter down to + download_genome (bool, optional): Download the hg38 genome if not already installed + download_gtf_gene_annotation (bool, optional): Download the gtf gene annotation if not already installed + genome_path (str, optional): Path to the genome + save_output (bool, optional): Save the output dataframe to a file + + Returns: + pd.DataFrame: A dataframe of the combined sequences + """ + + if download_genome: + genomepy.install_genome("hg38", "UCSC", genome_path) + if download_gtf_gene_annotation: + os.system( + "wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_43/gencode.v43.annotation.gtf.gz" + ) + os.system("gunzip gencode.v43.annotation.gtf.gz") + + # Random sequences + print("Generating random sequences dataframe") + bed_tool = BedTool() + random_seqs = 
bed_tool.random(l=200, n=num_sequences, genome="hg38", seed=1) + random_seqs = random_seqs.to_dataframe() + # Filtering chromosomes + chromosomes = ["chr" + str(i) for i in range(1, 23)] + ["chrX", "chrY"] + random_seqs = random_seqs[random_seqs["chrom"].isin(chromosomes)] + filtered_random_seqs = BedTool.from_dataframe(random_seqs).sequence(f"{genome_path}/hg38/hg38.fa") + # Cleaning up the dataframe and renaming columns + random_seqs["SEQUENCE"] = [x.upper() for x in open(filtered_random_seqs.seqfn).read().split("\n") if ">" not in x][ + :-1 + ] + random_seqs = random_seqs[random_seqs["SEQUENCE"].apply(lambda x: "N" not in x)] + random_seqs = random_seqs.head(num_filter_sequences) + random_seqs["ID"] = random_seqs.apply(lambda x: f"{x['chrom']}_{x['start']!s}_{x['end']!s}_random", axis=1) + random_seqs["CELL_TYPE"] = "NO" + random_seqs["TAG"] = "RANDOM_GENOME_REGIONS" + random_seqs = random_seqs[["chrom", "start", "end", "ID", "CELL_TYPE", "SEQUENCE", "TAG"]] + + # Promoter sequences + print("Generating promoter sequences dataframe") + gtf = GTFProcessing(promoter_path) + df_gtf = gtf.get_gtf_df() + df_gtf_filtered = df_gtf.query("feature == 'transcript' and gene_type == 'protein_coding' ").drop_duplicates( + "gene_name" + ) + df_gtf_filtered["tss_position"] = df_gtf_filtered.apply( + lambda x: x["start"] if x["strand"] == "+" else x["end"], axis=1 + ) + df_gtf_filtered["start"] = df_gtf_filtered["tss_position"] - add_bp_interval + df_gtf_filtered["end"] = df_gtf_filtered["tss_position"] + add_bp_interval + df_gtf = gtf.df_to_df_bed(df_gtf_filtered) + # Rename columns + df_gtf_filtered["chrom"] = df_gtf_filtered["chr"] + df_gtf_filtered["ID"] = df_gtf_filtered.apply( + lambda x: f"{x['chrom']}_{x['start']!s}_{x['end']!s}_promoter", axis=1 + ) + df_gtf_filtered["CELL_TYPE"] = "NO" + df_gtf_filtered["TAG"] = "PROMOTERS" + p_seqs = BedTool.from_dataframe(df_gtf_filtered).sequence(f"{genome_path}/hg38/hg38.fa") + df_gtf_filtered["SEQUENCE"] = [x.upper() for x in 
open(p_seqs.seqfn).read().split("\n") if ">" not in x][:-1] + df_gtf_filtered = df_gtf_filtered[["chrom", "start", "end", "ID", "CELL_TYPE", "SEQUENCE", "TAG"]] + + # Reading the training dataset used to train the model + print("Generating training dataset dataframe") + df_train = pd.read_csv(training_data_path, sep="\t") + df_train_balanced = pd.concat([v for k, v in df_train.groupby("TAG")]) + # Adding some metadata columns + df_train_balanced["coord_center"] = df_train_balanced["start"] + ( + df_train_balanced["end"] - df_train_balanced["start"] + ) // 2 + df_train_balanced["start"] = df_train_balanced["coord_center"] - 100 + df_train_balanced["end"] = df_train_balanced["coord_center"] + 100 + # Selecting only the columns we need + df_train_balanced = df_train_balanced[["chr", "start", "end", "dhs_id", "TAG", "sequence", "data_label"]] + df_train_balanced.columns = ["chrom", "start", "end", "ID", "CELL_TYPE", "SEQUENCE", "TAG"] + + # Reading the generated sequences + print("Generating synthetic sequences dataframe") + df_generated = pd.read_csv(generated_data_path, sep="\t").query("TAG == 'GENERATED'") + df_generated_balanced = pd.concat([v for k, v in df_generated.groupby("CELL_TYPE")]) + # Adding some metadata columns + df_generated_balanced["chrom"] = "NO" + df_generated_balanced["start"] = "NO" + df_generated_balanced["end"] = "NO" + df_generated_balanced = df_generated_balanced[["chrom", "start", "end", "ID", "CELL_TYPE", "SEQUENCE", "TAG"]] + df_generated_balanced.columns = ["chrom", "start", "end", "ID", "CELL_TYPE", "SEQUENCE", "TAG"] + df_generated_balanced["CELL_TYPE"] = df_generated_balanced["CELL_TYPE"].apply(lambda x: x.split("_")[0]) + + # Combining all the dataframes + print("Combining all the dataframes") + df_final = pd.concat([df_train_balanced, df_generated_balanced, df_gtf_filtered, random_seqs], ignore_index=True) + if save_output: + df_final.to_csv("DNA_DIFFUSION_VALIDATION_TABLE.txt", sep="\t", index=None) + return df_final diff --git 
a/src/dnadiffusion/utils/data_util.py b/src/dnadiffusion/utils/data_util.py new file mode 100644 index 00000000..df8289dd --- /dev/null +++ b/src/dnadiffusion/utils/data_util.py @@ -0,0 +1,157 @@ +import gtfparse +import matplotlib.pyplot as plt +import pandas as pd +from IPython.display import HTML, display +from tqdm import tqdm + + +class SEQ_EXTRACT: + def __init__(self, data): + self.data = pd.read_csv(data, sep="\t") + + def extract_seq(self, tag, cell_type): + return self.data.query(f'TAG == "{tag}" and CELL_TYPE == "{cell_type}" ').copy() + + def __repr__(self): + display(self.data.groupby(["TAG", "CELL_TYPE"]).count()) + return "Data structure" + + +class GTFProcessing: + # https://github.com/LucasSilvaFerreira/GTFProcessing + def __init__(self, gtf_file_name): + self.gtf_file_name = gtf_file_name + self.df_gtf = self.__load_gtf__() + self.change_columns_name() + self.__add_interval_lenght() + + @staticmethod + def remove_dup_columns(frame): + keep_names = set() + keep_icols = [] + for icol, name in enumerate(frame.columns): + if name not in keep_names: + keep_names.add(name) + keep_icols.append(icol) + return frame.iloc[:, keep_icols] + + def __load_gtf__(self): + print("loading gtf_file...") + gtf = gtfparse.parse_gtf_and_expand_attributes(self.gtf_file_name) + return gtf + + def change_columns_name(self): + print(self.df_gtf.columns) + self.df_gtf.columns = ["chr"] + self.df_gtf.columns.values.tolist()[1:] + + def get_gtf_df(self): + """ + return: pandas dataframe + """ + + return self.df_gtf + + def geneid2genename(self, gene_list): + '''given a list of geneid convert it to list names + ex: + usage + geneid2genename(["ENSG00000000003", "ENSG00000000004" ]) + Parameters: + gene_list : list[str] + + return: + conversion results gene_id to gene_name + list[str] + ''' + gtf_df = self.get_gtf_df() + dict_conversion = dict(zip(gtf_df["gene_id"].values, gtf_df["gene_name"].values)) + return [dict_conversion[g] for g in gene_list] + + def 
__add_interval_lenght(self): + self.df_gtf["interval_lenght"] = self.df_gtf.end - self.df_gtf.start + + @staticmethod + def get_first_exon_df(gtf_df): + """Group genes by transcript id and returns a df with first exon relative to strand""" + out_new_df = [] + for k, v in gtf_df.query("feature == 'exon' and exon_number == '1' ").groupby("transcript_id"): + out_new_df.append(v) + + return pd.concat(out_new_df) + + @staticmethod + def get_last_exon_df(gtf_df): + """Group genes by transcript id and returns a df with last exon relative to strand""" + + out_new_df = [] + for k, v in tqdm(gtf_df.query("feature == 'exon' ").groupby("transcript_id")): + if v.iloc[0].strand == "+": + out_new_df.append(v.sort_values("end", ascending=True).iloc[-1].values) + + # print v.sort_values('exon_number').iloc[0] + if v.iloc[0].strand == "-": + out_new_df.append(v.sort_values("start", ascending=True).iloc[0].values) + + return pd.DataFrame(out_new_df, columns=gtf_df.columns) + + @staticmethod + def df_to_bed(gtf_df, bed_file_name, fourth_position_feature="gene_name", fifth_position_feature="transcript_id"): + """Save a bed_file using a gtf as reference and returns the bed_file_name string""" + + print(gtf_df[["chr", "start", "end", fourth_position_feature, fifth_position_feature, "strand"]].head()) + + gtf_df[["chr", "start", "end", fourth_position_feature, fifth_position_feature, "strand"]].to_csv( + bed_file_name, sep="\t", header=None, index=None + ) + return bed_file_name + + @staticmethod + def df_to_df_bed(gtf_df, fourth_position_feature="gene_name", fifth_position_feature="transcript_id"): + """Returns a df with a bed6 format using a gtf as reference (nothing is written to disk)""" + + print(gtf_df[["chr", "start", "end", fourth_position_feature, fifth_position_feature, "strand"]].head()) + + return gtf_df[["chr", "start", "end", fourth_position_feature, fifth_position_feature, "strand"]] + + @staticmethod + def hist_generate(gtf_df, feature="transcript_biotype"): + """ + ex: 
GTFProcessing.hist_generate(gtf_to_test.head(1600), 'transcript_biotype') + """ + x_axis_feature = GTFProcessing.get_first_exon_df(gtf_df).groupby(feature).count()["start"] + plt.bar(range(0, x_axis_feature.values.shape[0]), x_axis_feature.values) + print(x_axis_feature.keys()) + print(x_axis_feature.values) + plt.xticks(range(0, x_axis_feature.values.shape[0]), (x_axis_feature.keys().values), rotation="vertical") + plt.title(feature) + plt.show() + + @staticmethod + def generate_hist_by_transcript_biotypes(gtf_df): + GTFProcessing.hist_generate(gtf_df, feature="transcript_biotype") + + @staticmethod + def capture_distal_unique_tes(gtf_df): + return_distal_exon = [] + last_exon_df = GTFProcessing.get_last_exon_df(gtf_df) + for k, v in tqdm(last_exon_df.groupby("gene_id")): + if v.iloc[0]["strand"] == "+": + return_distal_exon.append(v.sort_values("end", ascending=False).iloc[0].values.tolist()) + if v.iloc[0]["strand"] == "-": + return_distal_exon.append(v.sort_values("start", ascending=True).iloc[0].values.tolist()) + + df_distal_exon_by_gene_id = pd.DataFrame(return_distal_exon, columns=last_exon_df.columns.values.tolist()) + return df_distal_exon_by_gene_id + + @staticmethod + def capture_distal_unique_tss(gtf_df): + return_distal_tss = [] + first_exon_df = GTFProcessing.get_first_exon_df(gtf_df) + for k, v in tqdm(first_exon_df.groupby("gene_id")): + if v.iloc[0]["strand"] == "+": + return_distal_tss.append(v.sort_values("start", ascending=True).iloc[0].values.tolist()) + if v.iloc[0]["strand"] == "-": + return_distal_tss.append(v.sort_values("end", ascending=False).iloc[0].values.tolist()) + + df_distal_exon_by_gene_id = pd.DataFrame(return_distal_tss, columns=first_exon_df.columns.values.tolist()) + return df_distal_exon_by_gene_id