From 8b3802cd8d007b536fc7b5cd4dce84dfcf3ce66c Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Wed, 24 Apr 2024 16:22:57 +0200 Subject: [PATCH 01/28] Remove inefficient for loop --- scripts/mergeRepeats.R | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/scripts/mergeRepeats.R b/scripts/mergeRepeats.R index 36414d1..7f8ab3e 100644 --- a/scripts/mergeRepeats.R +++ b/scripts/mergeRepeats.R @@ -94,15 +94,12 @@ filteredRepeatsOut$length <- abs(filteredRepeatsOut$End - filteredRepeatsOut$Sta if (lowend == "yes") { filteredRepeatsOut <- filteredRepeatsOut[filteredRepeatsOut$length > 100,] } -filteredRepeatsOut2 <- filteredRepeatsOut[,1:6] -for (i in 1:length(filteredRepeatsOut2$Start)) { - if (filteredRepeatsOut2$End[i] < filteredRepeatsOut2$Start[i]) { - start <- filteredRepeatsOut2$End[i] - end <- filteredRepeatsOut2$Start[i] - filteredRepeatsOut2$Start[i] <- start - filteredRepeatsOut2$End[i] <- end - } -} + +# if end coordinate is before start, switch +filteredRepeatsOut2 <- filteredRepeatsOut[,1:6] %>% + mutate(Start = ifelse(End < Start, End, Start), + End = ifelse(End < Start, Start, End)) + write.table(filteredRepeatsOut2, file = filtBed, quote = FALSE, row.names = FALSE, sep = "\t", col.names = FALSE) From 1acd84dc16f76f80de3044c1a21b48e61363a047 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Wed, 24 Apr 2024 16:28:59 +0200 Subject: [PATCH 02/28] Update to evaluate nested repeats --- scripts/filteringOverlappingRepeats.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/filteringOverlappingRepeats.R b/scripts/filteringOverlappingRepeats.R index f89570f..6448aad 100644 --- a/scripts/filteringOverlappingRepeats.R +++ b/scripts/filteringOverlappingRepeats.R @@ -23,10 +23,10 @@ input <- read.gff(gff.in) # cut overlapping regions in half input %<>% arrange(seqid, start) %>% - mutate(new.start = case_when(seqid == lag(seqid) & start < lag(end) ~ as.integer((start + ((lag(end) - start)/2)) + 1), + mutate(new.start = case_when(seqid == lag(seqid) & start < lag(end) & end > lag(end) ~ as.integer((start + ((lag(end) - start)/2)) + 1), seqid == lag(seqid) & start == lag(end) ~ as.integer(start + 1), .default = start), - new.end = case_when(seqid == lead(seqid) & end > lead(start) ~ as.integer((end - (end - lead(start))/2)), + new.end = case_when(seqid == lead(seqid) & end > lead(start) & end < lead(end) ~ as.integer((end - (end - lead(start))/2)), seqid == lead(seqid) & end == lead(start) ~ as.integer(end), .default = end)) %>% mutate(start = new.start, From d57d7b4945876930f166f21ea704162cfac29cdd Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:28:10 +0200 Subject: [PATCH 03/28] Create divergence_calc.py --- scripts/divergenceCalc/divergence_calc.py | 223 ++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 scripts/divergenceCalc/divergence_calc.py diff --git a/scripts/divergenceCalc/divergence_calc.py b/scripts/divergenceCalc/divergence_calc.py new file mode 100644 index 0000000..aedb0e7 --- /dev/null +++ b/scripts/divergenceCalc/divergence_calc.py @@ -0,0 +1,223 @@ +import os +from os.path import exists, getsize +import sys +import argparse +import pandas as pd +import multiprocessing +import pybedtools +import subprocess +import shlex +import shutil +from Bio import AlignIO, SeqIO +from math import log, sqrt +from functools import partial +from time import time +from re import sub + +parser = argparse.ArgumentParser() +parser.add_argument('-l', '--repeat_library', type=str, required=True, + help='repeat_library') +parser.add_argument('-i', '--in_gff', type=str, required=True, + help='Path to gff') +parser.add_argument('-g', '--genome', type=str, required=True, + help='Path to genome') +parser.add_argument('-o', '--out_gff', type=str, required=True, + help='Output gff') +parser.add_argument('-tmp', '--temp_dir', type=str, default='tmp/', + help='Temporary directory') +parser.add_argument('-t', '--cores', type=int, default=4, + help='Number of cores') +parser.add_argument('-k', '--timeout', type=int, default=30, + help='Seconds after which water will be cancelled and repeat treated as unalignable') + +args = parser.parse_args() + +def file_check(repeat_library, in_gff, genome, out_gff, temp_dir): + if(exists(repeat_library) == False or exists(in_gff) == False or exists(genome) == False): + sys.exit('Files not found. Requires the repeat library, path to the genome, and path to gff containing coordinates and corresponding repeat files') + if(exists(temp_dir) == False): + os.mkdir(temp_dir) + if(exists(temp_dir+"/qseqs") == False): + os.mkdir(temp_dir+"/qseqs") + if(exists(temp_dir+"/split_library/") == False): + os.mkdir(temp_dir+"/split_library/") + +def splitter(in_seq, temp_dir): + with open(in_seq, 'r') as handle: + for record in SeqIO.parse(handle, "fasta"): + repeat_name = record.name.split(sep="#")[0] + repeat_name = repeat_name.lower() + file_name = (temp_dir+"/split_library/"+repeat_name+".fasta") + SeqIO.write(record, file_name, "fasta-2line") + +def parse_gff(in_gff): + gff = pd.read_table(in_gff, header = None, names=['seqnames', 'tool', 'repeat_class', 'start', 'end', 'score', 'strand', 'phase', 'metadata']) + simple_gff = gff[gff['repeat_class'].str.contains('Simple_repeat|Satellite|Low_complexity')].reset_index() + gff = gff[~gff['repeat_class'].str.contains('Simple_repeat|Satellite|Low_complexity')].reset_index() + gff['metadata_tmp'] = gff['metadata'].str.replace(';SHORTTE.*', '', regex=True) + gff[['tstart', 'tend', 'repeat_family']] = gff['metadata_tmp'].str.split(';', n=3, expand=True) + gff = gff.drop(columns = ['metadata_tmp', 'tstart', 'tend']) + gff['repeat_family'] = gff['repeat_family'].str.replace('ID=', '', regex=True) + gff['repeat_family'] = gff['repeat_family'].str.lower() + return(gff, simple_gff) + +def file_name_generator(): + import random + import string + file_name = ''.join(random.sample(string.ascii_letters, 12))+'.tmp' + return(file_name) + +def Kimura80(qseq, sseq): + """ + Calculations adapted from https://github.com/kgori/python_tools_on_github/blob/master/pairwise_distances.py + """ + # define transitions, transversions, matches + transitions = [ "AG", "GA", "CT", "TC"] + transversions = [ "AC", "CA", "AT", "TA", + "GC", "CG", "GT", "TG" ] + matches = [ "AA", "GG", "CC", "TT"] + # set counters to 0 + m,ts,tv=0,0,0 + # count transitions, transversions, matches + for i, j in zip(qseq, sseq): + if i+j in matches: m+=1 + if i+j in transitions: ts+=1 + if i+j in transversions: tv+=1 + # count number of bp which align (excludes gaps, Ns) + aln_len = m + ts + tv + # calculate p and q + p = ts/aln_len + q = tv/aln_len + + # calculate Kimura distance + Kimura_dist = -0.5 * log((1 - 2*p - q) * sqrt( 1 - 2*q )) + + return(Kimura_dist) + +def outer_func(genome_path, temp_dir, timeoutSeconds, gff): + generated_name = file_name_generator() + holder_file_name = temp_dir+generated_name + failed_file_name = temp_dir+"failed_"+generated_name + with open(holder_file_name, 'w') as tmp_out: + header = list(gff.columns.values)[1:] + ["Kimura"] + header = "\t".join(header)+"\n" + tmp_out.write(header) + for row in gff.iterrows(): + # Set index + idx = row[0] + # Set scaffold, coordinates, strand, repeat family + seqnames, start, end, strand, repeat_family = row[1]['seqnames'], str(row[1]['start'] - 1), str(row[1]['end']), row[1]['strand'], row[1]['repeat_family'] + # Create BED string for BEDtools + bed_str = " ".join([seqnames, start, end, ".", ".", strand]) + # Set path for query sequence + query_path = temp_dir+"/qseqs/"+str(idx) + # Create bedtools command and getfasta + a=pybedtools.BedTool(bed_str, from_string=True) + a = a.sequence(fi=genome_path, fo=query_path, s=True) + # Set path to subject sequence + subject_path=temp_dir+"/split_library/"+repeat_family+".fasta" + # Run water, with timeout exception + test_command = shlex.split("water "+query_path+" "+subject_path+" -gapopen 10 -gapextend 0.5 -outfile "+query_path+".water -aformat fasta") + # Run test and kill if it takes more than 10 seconds + alignment_p = subprocess.Popen(test_command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) + try: + alignment_p.wait(timeoutSeconds) + except subprocess.TimeoutExpired: + # if water fails to complete before timeout, kill and move on + with open(failed_file_name, "a") as failed_file: + failed_file.write(seqnames+":"+start+"-"+end+"_"+strand+"_"+repeat_family+"\n") + alignment_p.kill() + if exists(query_path+".water") is False or getsize(query_path+".water") == 0: + # If no alignment is possible, set distances to NA and alignment length to 0 + Kdist = "NA" + os.remove(query_path) + if exists(query_path+".water") is True: + os.remove(query_path+".water") + else: + # Read in alignments + aln = AlignIO.read(query_path+".water", 'fasta') + # Calculate distances based on model + Kdist = Kimura80(str(aln[0].seq).upper(), str(aln[1].seq).upper()) + # Convert numbers to strings + Kdist = str(round(Kdist, 4)) + # Delete temporary files + os.remove(query_path+".water") + os.remove(query_path) + # Make line for temporary file and write to file + tmp_holder = row[1].to_list()[1:] + tmp_holder = "\t".join(str(x) for x in tmp_holder)+"\t"+Kdist+"\n" + tmp_out.write(tmp_holder) + + return(holder_file_name) + +def tmp_out_parser(file_list, simple_gff): + # Loop through results + gff=pd.DataFrame() + for file in file_list: + # read in gff + in_gff = pd.read_csv(file, sep = "\t") + # concatenate gff + gff = pd.concat([gff, in_gff], ignore_index=True) + # Convert numbers to strings for concatenation + gff['Kimura'] = gff['Kimura'].astype(str) + # Convert new data onto metadata + gff['metadata'] = gff['metadata'] + ";KIMURA80=" + gff['Kimura'] + # Remove unnecessary rows + gff = gff.drop(columns = ['Kimura', 'repeat_family']) + # Combine columns, sort and drop unneccessary columns + gff = pd.concat([gff, simple_gff], ignore_index=True) + gff = gff.sort_values(by=['seqnames', 'start']) + gff = gff.reset_index() + gff = gff.drop(columns = ['level_0', 'index']) + + return(gff) + +if __name__ == "__main__": + + start_time = time() + + # check files exist + file_check(args.repeat_library, args.in_gff, args.genome, args.out_gff, args.temp_dir) + + # split library file + print("Splitting repeat library") + splitter(args.repeat_library, args.temp_dir) + + # read in gff and take head + print("Reading in gff") + in_gff, simple_gff = parse_gff(args.in_gff) + + # create as many processes as instructed cores + num_processes = args.cores + + # calculate the chunk size as an integer + chunk_size = int(in_gff.shape[0]/num_processes) + + # break into chunks + chunks = [in_gff.iloc[in_gff.index[i:i + chunk_size]] for i in range(0, in_gff.shape[0], chunk_size)] + + print("Starting calculations") + # Peform calulations in parallel + func = partial(outer_func, args.genome, args.temp_dir, args.timeout) + pool = multiprocessing.Pool(processes=num_processes) + results = pool.map(func, chunks) + pool.close() + pool.join() + print("Finished calculations") + + # Free up memory (necessary with very large gffs and low memory machines) + del chunks + del in_gff + + # Read in temp files, fix metadata, add simple repeats back, and sort + calc_gff = tmp_out_parser(results, simple_gff) + + # write to file + calc_gff.to_csv(args.out_gff, sep = "\t", header = False, index=False) + + # print run time for number of rows + run_time = time() - start_time + print("Total run time for ", len(calc_gff), " rows was ", run_time, " seconds") + + # Delete folder of split library + shutil.rmtree(args.temp_dir+"/split_library/", ignore_errors=True) From aae0c809665699b27cada2304eb476a38e645de6 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:28:46 +0200 Subject: [PATCH 04/28] Create divergence_plot.R --- scripts/divergenceCalc/divergence_plot.R | 151 +++++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 scripts/divergenceCalc/divergence_plot.R diff --git a/scripts/divergenceCalc/divergence_plot.R b/scripts/divergenceCalc/divergence_plot.R new file mode 100644 index 0000000..5e8bf7b --- /dev/null +++ b/scripts/divergenceCalc/divergence_plot.R @@ -0,0 +1,151 @@ +library(optparse) + +option_list <- list( + make_option(c("-s", "--species_name"), default=NA, type = "character", help="Species name (required)"), + make_option(c("-g", "--in_gff"), default=NA, type = "character", help="GFF with Kimura distances (required)"), + make_option(c("-o", "--out_directory"), default=NA, type = "character", help="Directory to write plots to (required)") +) + +opt <- parse_args(OptionParser(option_list=option_list)) + +# Check variables are set +if(is.na(opt$species_name)){ + stop("Species name must be supplied") +} +if(is.na(opt$in_gff)){ + stop("Path to input gff must be supplied") +} +if(is.na(opt$out_directory)){ + stop("Path to output directory must be supplied") +} + +suppressPackageStartupMessages(library(tidyverse)) +suppressPackageStartupMessages(library(plyranges)) +suppressPackageStartupMessages(library(viridis)) +suppressPackageStartupMessages(library(cowplot)) +suppressPackageStartupMessages(library(ggtext)) + +# Created plot title +plot_title <- paste0("Repeat landscape of *", gsub("_", " ", opt$species_name), "*") +title_plot <- ggplot() + labs(title = plot_title) + theme(plot.title = element_markdown(hjust = 0.5)) + theme(panel.background = element_blank()) + +# Read in data, remove repeats which Kimura was not calculated for +divergence_eg_gff <- read_gff(opt$in_gff) %>% + mutate(KIMURA80 = as.double(KIMURA80)) %>% + filter(!is.na(KIMURA80), KIMURA80 <= 0.5) + +# Breakdown classification of repeats +divergence_eg_tes_gff <- divergence_eg_gff %>% + dplyr::mutate(subclass = sub("/.*", "", type), + superfamily = sub("-.*", "", sub(".*/", "", type))) + +# Fix Penelopes +divergence_eg_tes_gff <- divergence_eg_tes_gff %>% + dplyr::mutate(subclass = ifelse(superfamily == "Penelope", "PLE", subclass)) %>% + dplyr::mutate(subclass = ifelse(subclass %in% c("DNA", "LINE", "LTR", "PLE", "RC", "SINE", "Unknown"), subclass, "Other")) %>% + dplyr::mutate(named_subclass = case_when(subclass == "DNA" ~ "DNA Transposon", + subclass == "LTR" ~ "LTR Retrotransposon", + subclass == "PLE" ~ "Penelope", + subclass == "RC" ~ "Rolling Circle", + .default = subclass)) + +# Sum lengths to create data for plots (remove subclasses not in standard set) +divergence_eg_tes_rounded_for_plot <- divergence_eg_tes_gff %>% + as_tibble() %>% + dplyr::mutate(KIMURA80 = round(x = KIMURA80, digits = 2)) %>% + group_by(named_subclass, KIMURA80) %>% + mutate(KIMURA_SUM = sum(width)) %>% + ungroup() %>% + dplyr::select(subclass, named_subclass, KIMURA80, KIMURA_SUM) %>% + base::unique() %>% + arrange(named_subclass, KIMURA80) + +# Set fill colours +fill_colours <- tibble(subclass = c("DNA", "LINE", "LTR", "PLE", "RC", "SINE", "Other", "Unknown"), + named_subclass = c("DNA Transposon", "LINE", "LTR Retrotransposon", "Penelope", "Rolling Circle", "SINE", "Other", "Unknown"), + fill_colour = c("#E32017", "#0098D4", "#00782A", "#7156A5", "#EE7C0E", "#9B0056", "#F3A9BB", "#A0A5A9")) %>% + filter(subclass %in% divergence_eg_tes_rounded_for_plot$subclass) %>% + arrange(named_subclass) + +# Create and save main plots +kimura_plot <- ggplot(divergence_eg_tes_rounded_for_plot, + aes(x = KIMURA80, y = KIMURA_SUM, fill = named_subclass)) + + geom_col(position = "stack", width = 0.01) + + scale_x_continuous(limits = c(-0.01, 0.51), + expand = c(0,0), name = "Kimura 2-Parameter Distance") + + theme_bw() + + labs(title = plot_title) + theme(plot.title = element_markdown(hjust = 0.5)) + + scale_fill_manual(values = fill_colours$fill_colour, name = "TE Subclass") +subclass_kimura_plot <- kimura_plot + scale_y_continuous(expand = c(0.01,0), name = "Base pairs") +ggsave(plot = subclass_kimura_plot, filename = paste0(opt$out_directory, "/", opt$species_name, "_subclass_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5) +split_subclass_kimura_plot <- kimura_plot + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free") +ggsave(plot = split_subclass_kimura_plot, filename = paste0(opt$out_directory, "/", opt$species_name, "_split_subclass_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5) + +# Perform maths for more divided plot +divergence_eg_tes_rounded_for_superfamily_plot <- divergence_eg_tes_gff %>% + as_tibble() %>% + dplyr::mutate(KIMURA80 = round(x = KIMURA80, digits = 2), + type = sub("-.*", "", type)) %>% + group_by(superfamily, KIMURA80) %>% + mutate(KIMURA_SUM = sum(width)) %>% + ungroup()%>% + dplyr::select(type, subclass, superfamily, KIMURA80, KIMURA_SUM) %>% + base::unique() %>% + arrange(subclass, superfamily, KIMURA80) + +# Split data as necessary +divergence_eg_tes_rounded_for_superfamily_plot <- split(divergence_eg_tes_rounded_for_superfamily_plot, + f = divergence_eg_tes_rounded_for_superfamily_plot$subclass) + +# Create plots of superfamilies of DNA transposons, LINEs, LTR retrotransposons and SINEs +kimura_superfamily_plot_1 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$DNA, + aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + + scale_x_continuous(limits = c(-0.01, 0.51), + expand = c(0,0), name = "") + + theme_bw() + + theme(legend.title=element_blank()) + + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + + facet_grid(subclass~., scales = "free") + + guides(fill=guide_legend(ncol=3)) +kimura_superfamily_plot_2 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LINE, + aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + + scale_x_continuous(limits = c(-0.01, 0.51), + expand = c(0,0), name = "") + + theme_bw() + + theme(legend.title=element_blank()) + + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + + facet_grid(subclass~., scales = "free") + + guides(fill=guide_legend(ncol=3)) + + scale_fill_brewer(palette = "Blues", direction = -1) +kimura_superfamily_plot_3 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LTR, + aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + + scale_x_continuous(limits = c(-0.01, 0.51), + expand = c(0,0), name = "") + + theme_bw() + + theme(legend.title=element_blank()) + + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + + facet_grid(subclass~., scales = "free") + + guides(fill=guide_legend(ncol=3)) + + scale_fill_brewer(palette = "Greens", direction = -1) +kimura_superfamily_plot_4 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$SINE, + aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + + scale_x_continuous(limits = c(-0.01, 0.51), + expand = c(0,0), name = "Kimura 2-Parameter Distance") + + theme_bw() + + theme(legend.title=element_blank()) + + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + + facet_grid(subclass~., scales = "free") + + guides(fill=guide_legend(ncol=3)) + + scale_fill_brewer(palette = "YlOrRd", direction = -1) + +# Combine plots and title +superfamily_kimura_plot <- plot_grid(kimura_superfamily_plot_1, kimura_superfamily_plot_2, kimura_superfamily_plot_3, kimura_superfamily_plot_4, + ncol = 1, align = "v") +superfamily_kimura_plot_titled <- plot_grid(title_plot, superfamily_kimura_plot, ncol = 1, rel_heights = c(1, 30)) + +# Save divided plot +ggsave(plot = superfamily_kimura_plot_titled, filename = paste0(opt$out_directory, "/", opt$species_name, "_superfamily_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5) From 3455194d722c0208b0726224f05c6fa7b2055ae6 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:48:07 +0200 Subject: [PATCH 05/28] Update earlGrey --- earlGrey | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/earlGrey b/earlGrey index 516a78a..dc6a05e 100644 --- a/earlGrey +++ b/earlGrey @@ -3,7 +3,7 @@ usage() { echo " ############################# - earlGrey version 4.1.1 + earlGrey version 4.2.0 Required Parameters: -g == genome.fasta -s == species name @@ -25,12 +25,6 @@ usage() earlGrey -g bombyxMori.fasta -s bombyxMori -o /home/toby/bombyxMori/repeatAnnotation/ -t 16 - - Prerequisites - These must be configured prior to using Earl Grey: - - RepeatMasker (Version 4.1.2) - - Ensure RepeatMasker has been configured with the desired repeat libraries (RepBase and at least Dfam 3.4 are recommended) - - RepeatModeler2 - Queries can be sent to: tobias.baril[at]unine.ch @@ -192,17 +186,6 @@ novoMask() fi } -# Subprocess calcDivRL -# Calculate divergence estimates -calcDivRL() -{ - cd ${OUTDIR}/${species}_RepeatLandscape - genome_size=$(sed -n '4p' ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl | rev | cut -f1,1 -d ':' | rev | sed 's/ bp.*//g; s/ //g') - align_file=$(readlink -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.align) - calcDivergenceFromAlign.pl -s ${species}.divsum $align_file - div_file=$(readlink -f $OUTDIR/${species}_RepeatLandscape/${species}.divsum) -} - # Subprocess rcMergeRepeats # Defragment repeat sequences to adjust for insertion times mergeRep() @@ -237,6 +220,22 @@ charts() fi } +# Subprocess calcDivRL +# Calculate divergence estimates +calcDivRL() +{ + cd ${OUTDIR}/${species}_RepeatLandscape + if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then + python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genome -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum + Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ + mv ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff + else + python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genome -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum + Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ + mv ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff + fi +} + # Subprocess sweepUp # Puts required files into a summary folder sweepUp() @@ -485,7 +484,6 @@ if [ ! -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl ]; then if [ ! -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl ]; then echo "ERROR: RepeatMasker failed, please check logs" && exit 2 fi - calcDivRL sleep 1 else stage="Final masking already complete, skipping..." && runningTea @@ -495,11 +493,6 @@ else fi # Stage 6 - -##TODO -#### find a way to rename variable in table that is less RAM intensive than the current method in python -#### i think this is inside rcMergeRepeatsLoose - if [ ! -f ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed ] ; then stage="Defragmenting Repeats" && runningTea mergeRep @@ -513,6 +506,7 @@ fi # Stage 7 stage="Generating Summary Plots" && runningTea charts +calcDivRL sleep 1 # Stage 8 From cd4c8853e27acf22ab2ade57dd303f11162ffe9f Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:52:22 +0200 Subject: [PATCH 06/28] Update autoPie.R --- scripts/autoPie.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/autoPie.R b/scripts/autoPie.R index d62dd5a..95b5c55 100644 --- a/scripts/autoPie.R +++ b/scripts/autoPie.R @@ -1,7 +1,7 @@ # load libraries -library(tidyverse) -library(data.table) +suppressPackageStartupMessages(library(tidyverse)) +suppressPackageStartupMessages((library(data.table)) # set options From 9cf8bcb3a7d31280e1e4134b495519394c5c6f2b Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:52:54 +0200 Subject: [PATCH 07/28] Update filteringOverlappingRepeats.R --- scripts/filteringOverlappingRepeats.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/filteringOverlappingRepeats.R b/scripts/filteringOverlappingRepeats.R index 6448aad..7e0bf3a 100644 --- a/scripts/filteringOverlappingRepeats.R +++ b/scripts/filteringOverlappingRepeats.R @@ -1,7 +1,7 @@ # load libraries -library(GenomicRanges) -library(ape) -library(tidyverse) +suppressPackageStartupMessages(library(GenomicRanges)) +suppressPackageStartupMessages(library(ape)) +suppressPackageStartupMessages(library(tidyverse)) # set options From 38c5047d26f937fcba63763790f9ccbc84a1058a Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:53:05 +0200 Subject: [PATCH 08/28] Update makeGff.R --- scripts/makeGff.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/makeGff.R b/scripts/makeGff.R index 8908e80..ee979bd 100644 --- a/scripts/makeGff.R +++ b/scripts/makeGff.R @@ -1,4 +1,4 @@ -library(tidyverse) +suppressPackageStartupMessages(library(tidyverse)) args <- commandArgs() print(args) From a43cd4661534684fd0ed8e207b89632a0a3a8d02 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 10:53:20 +0200 Subject: [PATCH 09/28] Update mergeRepeats.R --- scripts/mergeRepeats.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/mergeRepeats.R b/scripts/mergeRepeats.R index 7f8ab3e..51ba09b 100644 --- a/scripts/mergeRepeats.R +++ b/scripts/mergeRepeats.R @@ -1,8 +1,8 @@ -library(tidyverse) -library(plyr) -library(dplyr) -library(magrittr) -library(data.table) +suppressPackageStartupMessages(library(tidyverse)) +suppressPackageStartupMessages(library(plyr)) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(magrittr)) +suppressPackageStartupMessages(library(data.table)) options(scipen = 100, stringsAsFactors = FALSE) ##### From 7b97aaa7924e88be29d60ea2f8169e9b213cec3e Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 11:00:17 +0200 Subject: [PATCH 10/28] Update divergence_plot.R --- scripts/divergenceCalc/divergence_plot.R | 50 ++++++++++++++++++------ 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/scripts/divergenceCalc/divergence_plot.R b/scripts/divergenceCalc/divergence_plot.R index 5e8bf7b..884a463 100644 --- a/scripts/divergenceCalc/divergence_plot.R +++ b/scripts/divergenceCalc/divergence_plot.R @@ -71,15 +71,31 @@ fill_colours <- tibble(subclass = c("DNA", "LINE", "LTR", "PLE", "RC", "SINE", " kimura_plot <- ggplot(divergence_eg_tes_rounded_for_plot, aes(x = KIMURA80, y = KIMURA_SUM, fill = named_subclass)) + geom_col(position = "stack", width = 0.01) + - scale_x_continuous(limits = c(-0.01, 0.51), - expand = c(0,0), name = "Kimura 2-Parameter Distance") + + scale_x_reverse(limits = c(-0.01, 0.51), + expand = c(0,0), name = "Kimura 2-Parameter Distance") + theme_bw() + labs(title = plot_title) + theme(plot.title = element_markdown(hjust = 0.5)) + scale_fill_manual(values = fill_colours$fill_colour, name = "TE Subclass") subclass_kimura_plot <- kimura_plot + scale_y_continuous(expand = c(0.01,0), name = "Base pairs") -ggsave(plot = subclass_kimura_plot, filename = paste0(opt$out_directory, "/", opt$species_name, "_subclass_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5) +ggsave(plot = subclass_kimura_plot, + filename = paste0(opt$out_directory, "/", opt$species_name, "_subclass_div_plot.pdf"), + device = "pdf", + scale = 1, + width = 297, + height = 210, + units = "mm", + dpi = 300, + limitsize = FALSE) split_subclass_kimura_plot <- kimura_plot + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free") -ggsave(plot = split_subclass_kimura_plot, filename = paste0(opt$out_directory, "/", opt$species_name, "_split_subclass_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5) +ggsave(plot = split_subclass_kimura_plot, + filename = paste0(opt$out_directory, "/", opt$species_name, "_split_subclass_div_plot.pdf"), + device = "pdf", + scale = 1, + width = 297, + height = 210, + units = "mm", + dpi = 300, + limitsize = FALSE) # Perform maths for more divided plot divergence_eg_tes_rounded_for_superfamily_plot <- divergence_eg_tes_gff %>% @@ -101,8 +117,8 @@ divergence_eg_tes_rounded_for_superfamily_plot <- split(divergence_eg_tes_rounde kimura_superfamily_plot_1 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$DNA, aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + - scale_x_continuous(limits = c(-0.01, 0.51), - expand = c(0,0), name = "") + + scale_x_reverse(limits = c(-0.01, 0.51), + expand = c(0,0), name = "") + theme_bw() + theme(legend.title=element_blank()) + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + @@ -111,8 +127,8 @@ kimura_superfamily_plot_1 <- ggplot(divergence_eg_tes_rounded_for_superfamily_pl kimura_superfamily_plot_2 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LINE, aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + - scale_x_continuous(limits = c(-0.01, 0.51), - expand = c(0,0), name = "") + + scale_x_reverse(limits = c(-0.01, 0.51), + expand = c(0,0), name = "") + theme_bw() + theme(legend.title=element_blank()) + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + @@ -122,8 +138,8 @@ kimura_superfamily_plot_2 <- ggplot(divergence_eg_tes_rounded_for_superfamily_pl kimura_superfamily_plot_3 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LTR, aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + - scale_x_continuous(limits = c(-0.01, 0.51), - expand = c(0,0), name = "") + + scale_x_reverse(limits = c(-0.01, 0.51), + expand = c(0,0), name = "") + theme_bw() + theme(legend.title=element_blank()) + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + @@ -133,8 +149,8 @@ kimura_superfamily_plot_3 <- ggplot(divergence_eg_tes_rounded_for_superfamily_pl kimura_superfamily_plot_4 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$SINE, aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + - scale_x_continuous(limits = c(-0.01, 0.51), - expand = c(0,0), name = "Kimura 2-Parameter Distance") + + scale_x_reverse(limits = c(-0.01, 0.51), + expand = c(0,0), name = "Kimura 2-Parameter Distance") + theme_bw() + theme(legend.title=element_blank()) + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + @@ -148,4 +164,12 @@ superfamily_kimura_plot <- plot_grid(kimura_superfamily_plot_1, kimura_superfami superfamily_kimura_plot_titled <- plot_grid(title_plot, superfamily_kimura_plot, ncol = 1, rel_heights = c(1, 30)) # Save divided plot -ggsave(plot = superfamily_kimura_plot_titled, filename = paste0(opt$out_directory, "/", opt$species_name, "_superfamily_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5) +ggsave(plot = superfamily_kimura_plot_titled, + filename = paste0(opt$out_directory, "/", opt$species_name, "_superfamily_div_plot.pdf"), + device = "pdf", + scale = 1, + width = 297, + height = 210, + units = "mm", + dpi = 300, + limitsize = FALSE) From aa238ac9331db6778aa8475a70f351c1e3e900ad Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 11:37:35 +0200 Subject: [PATCH 11/28] Update earlGrey --- earlGrey | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/earlGrey b/earlGrey index dc6a05e..16c4c7a 100644 --- a/earlGrey +++ b/earlGrey @@ -66,6 +66,7 @@ prepGenome() genome=${genome}.prep else dict=${genome}.dict + genOrig=$genome genome=${genome}.prep fi } @@ -213,10 +214,8 @@ charts() cd ${OUTDIR}/${species}_summaryFiles/ if [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ]; then ${SCRIPT_DIR}/autoPie.sh -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed -t ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -p ${OUTDIR}/${species}_summaryFiles/${species}.summaryPie.pdf -o ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.txt - Rscript ${SCRIPT_DIR}/autoLand.R $div_file $genome_size $species ${OUTDIR}/${species}_summaryFiles/${species}.repeatLandscape.pdf else ${SCRIPT_DIR}/autoPie.sh -i ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed -t ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -p ${OUTDIR}/${species}_summaryFiles/${species}.summaryPie.pdf -o ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.txt - Rscript ${SCRIPT_DIR}/autoLand.R $div_file $genome_size $species ${OUTDIR}/${species}_summaryFiles/${species}.repeatLandscape.pdf fi } @@ -226,11 +225,11 @@ calcDivRL() { cd ${OUTDIR}/${species}_RepeatLandscape if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then - python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genome -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum + python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ mv ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff else - python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genome -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum + python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ mv ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff fi From accac1c242ddd961d5a8eba0f8dcdb9e85bb36b7 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 11:40:40 +0200 Subject: [PATCH 12/28] Update earlGrey --- earlGrey | 1 + 1 file changed, 1 insertion(+) diff --git a/earlGrey b/earlGrey index 16c4c7a..c41853c 100644 --- a/earlGrey +++ b/earlGrey @@ -63,6 +63,7 @@ prepGenome() mv ${genome}.tmp.dict ${genome}.dict sed -i '/^>/! s/[DVHBPE]/N/g' ${genome}.prep dict=${genome}.dict + genOrig=$genome genome=${genome}.prep else dict=${genome}.dict From f985001be1bc28df3054bb0c9141ebe516fdf701 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 11:46:28 +0200 Subject: [PATCH 13/28] Update earlGrey --- earlGrey | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/earlGrey b/earlGrey index c41853c..b71be36 100644 --- a/earlGrey +++ b/earlGrey @@ -226,13 +226,13 @@ calcDivRL() { cd ${OUTDIR}/${species}_RepeatLandscape if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then - python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum + python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ - mv ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff + mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff else - python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum + python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ - mv ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff + mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff fi } From 38b89379225171a5cd7451c99b8dee00a94c594e Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 11:52:29 +0200 Subject: [PATCH 14/28] Update earlGrey --- earlGrey | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/earlGrey b/earlGrey index b71be36..47c853e 100644 --- a/earlGrey +++ b/earlGrey @@ -227,12 +227,12 @@ calcDivRL() cd ${OUTDIR}/${species}_RepeatLandscape if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum - Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ - mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff + Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ + mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff else python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum - Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ - mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff + Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ + mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff fi } From f3843675e73792f646d4253abdf048bab6fdddc0 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 12:31:19 +0200 Subject: [PATCH 15/28] Update divergence_plot.R --- scripts/divergenceCalc/divergence_plot.R | 49 ++++++++++++++++-------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/scripts/divergenceCalc/divergence_plot.R b/scripts/divergenceCalc/divergence_plot.R index 884a463..138b74d 100644 --- a/scripts/divergenceCalc/divergence_plot.R +++ b/scripts/divergenceCalc/divergence_plot.R @@ -47,6 +47,7 @@ divergence_eg_tes_gff <- divergence_eg_tes_gff %>% subclass == "LTR" ~ "LTR Retrotransposon", subclass == "PLE" ~ "Penelope", subclass == "RC" ~ "Rolling Circle", + subclass == "Unknown" ~ "Unclassified", .default = subclass)) # Sum lengths to create data for plots (remove subclasses not in standard set) @@ -60,23 +61,30 @@ divergence_eg_tes_rounded_for_plot <- divergence_eg_tes_gff %>% base::unique() %>% arrange(named_subclass, KIMURA80) +divergence_eg_tes_rounded_for_plot$named_subclass %<>% + as.factor() %>% + ordered(levels = c("DNA Transposon", "Rolling Circle", "Penelope", "LINE", "SINE", "LTR Retrotransposon", "Other (Simple Repeat, Microsatellite, RNA)", "Unclassified")) + # Set fill colours -fill_colours <- tibble(subclass = c("DNA", "LINE", "LTR", "PLE", "RC", "SINE", "Other", "Unknown"), - named_subclass = c("DNA Transposon", "LINE", "LTR Retrotransposon", "Penelope", "Rolling Circle", "SINE", "Other", "Unknown"), - fill_colour = c("#E32017", "#0098D4", "#00782A", "#7156A5", "#EE7C0E", "#9B0056", "#F3A9BB", "#A0A5A9")) %>% - filter(subclass %in% divergence_eg_tes_rounded_for_plot$subclass) %>% - arrange(named_subclass) +fill_colours <- data.frame(subclass = c("DNA", "RC", "PLE", "LINE", "SINE", "LTR", "Other", "Unknown"), + named_subclass = c("DNA Transposon", "Rolling Circle", "Penelope", "LINE", "SINE", "LTR Retrotransposon", "Other (Simple Repeat, Microsatellite, RNA)", "Unclassified"), + fill_colour = c("#E32017", "#EE7C0E", "#7156A5", "#0098D4", "#9B0056", "#00782A", "#F3A9BB", "#A0A5A9")) %>% + filter(subclass %in% divergence_eg_tes_rounded_for_plot$subclass) + +col <- fill_colours$fill_colour +names(col) <- fill_colours$named_subclass # Create and save main plots kimura_plot <- ggplot(divergence_eg_tes_rounded_for_plot, aes(x = KIMURA80, y = KIMURA_SUM, fill = named_subclass)) + geom_col(position = "stack", width = 0.01) + - scale_x_reverse(limits = c(-0.01, 0.51), - expand = c(0,0), name = "Kimura 2-Parameter Distance") + + scale_x_reverse(expand = c(0,0), name = "Kimura 2-Parameter Distance") + theme_bw() + labs(title = plot_title) + theme(plot.title = element_markdown(hjust = 0.5)) + - scale_fill_manual(values = fill_colours$fill_colour, name = "TE Subclass") + scale_fill_manual(values = col, name = "TE Subclass") + subclass_kimura_plot <- kimura_plot + scale_y_continuous(expand = c(0.01,0), name = "Base pairs") + ggsave(plot = subclass_kimura_plot, filename = paste0(opt$out_directory, "/", opt$species_name, "_subclass_div_plot.pdf"), device = "pdf", @@ -86,7 +94,9 @@ ggsave(plot = subclass_kimura_plot, units = "mm", dpi = 300, limitsize = FALSE) + split_subclass_kimura_plot <- kimura_plot + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free") + ggsave(plot = split_subclass_kimura_plot, filename = paste0(opt$out_directory, "/", opt$species_name, "_split_subclass_div_plot.pdf"), device = "pdf", @@ -117,46 +127,53 @@ divergence_eg_tes_rounded_for_superfamily_plot <- split(divergence_eg_tes_rounde kimura_superfamily_plot_1 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$DNA, aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + - scale_x_reverse(limits = c(-0.01, 0.51), - expand = c(0,0), name = "") + + scale_x_reverse(expand = c(0,0), name = "") + theme_bw() + theme(legend.title=element_blank()) + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free") + guides(fill=guide_legend(ncol=3)) +if (inherits(try(ggplot_build(kimura_superfamily_plot_1)), "try-error")) + kimura_superfamily_plot_1 <- ggplot() + kimura_superfamily_plot_2 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LINE, aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + - scale_x_reverse(limits = c(-0.01, 0.51), - expand = c(0,0), name = "") + + scale_x_reverse(expand = c(0,0), name = "") + theme_bw() + theme(legend.title=element_blank()) + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free") + guides(fill=guide_legend(ncol=3)) + scale_fill_brewer(palette = "Blues", direction = -1) +if (inherits(try(ggplot_build(kimura_superfamily_plot_2)), "try-error")) + kimura_superfamily_plot_2 <- ggplot() + kimura_superfamily_plot_3 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LTR, aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + - scale_x_reverse(limits = c(-0.01, 0.51), - expand = c(0,0), name = "") + + scale_x_reverse(expand = c(0,0), name = "") + theme_bw() + theme(legend.title=element_blank()) + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free") + guides(fill=guide_legend(ncol=3)) + scale_fill_brewer(palette = "Greens", direction = -1) +if (inherits(try(ggplot_build(kimura_superfamily_plot_3)), "try-error")) + kimura_superfamily_plot_3 <- ggplot() + kimura_superfamily_plot_4 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$SINE, aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) + geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) + - scale_x_reverse(limits = c(-0.01, 0.51), - expand = c(0,0), name = "Kimura 2-Parameter Distance") + + scale_x_reverse(expand = c(0,0), name = "Kimura 2-Parameter Distance") + theme_bw() + theme(legend.title=element_blank()) + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free") + guides(fill=guide_legend(ncol=3)) + scale_fill_brewer(palette = "YlOrRd", direction = -1) +if (inherits(try(ggplot_build(kimura_superfamily_plot_4)), "try-error")) + kimura_superfamily_plot_4 <- ggplot() # Combine plots and title superfamily_kimura_plot <- plot_grid(kimura_superfamily_plot_1, kimura_superfamily_plot_2, kimura_superfamily_plot_3, kimura_superfamily_plot_4, From 76a256d9d832844c528137dddf79e3540a8a2399 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 12:35:25 +0200 Subject: [PATCH 16/28] Update earlGrey --- earlGrey | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/earlGrey b/earlGrey index 47c853e..a56daf0 100644 --- a/earlGrey +++ b/earlGrey @@ -227,12 +227,14 @@ calcDivRL() cd ${OUTDIR}/${species}_RepeatLandscape if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum - Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ - mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff + Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \ + mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \ + rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/ else python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum - Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ - mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff + Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \ + mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \ + rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/ fi } From 78173889f1a8089218df689407e2a80bcb6ca911 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 13:08:22 +0200 Subject: [PATCH 17/28] Update earlGrey --- earlGrey | 2 ++ 1 file changed, 2 insertions(+) diff --git a/earlGrey b/earlGrey index a56daf0..7e6fb8b 100644 --- a/earlGrey +++ b/earlGrey @@ -230,11 +230,13 @@ calcDivRL() Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \ mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \ rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/ + cp ${OUTDIR}/${species}_RepeatLandscape/*.pdf ${OUTDIR}/${species}_summaryFiles/ else python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \ mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \ rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/ + cp ${OUTDIR}/${species}_RepeatLandscape/*.pdf ${OUTDIR}/${species}_summaryFiles/ fi } From cc60110e78db3ec268482aa6f359380f2add8eca Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 13:17:57 +0200 Subject: [PATCH 18/28] Update divergence_plot.R --- scripts/divergenceCalc/divergence_plot.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/divergenceCalc/divergence_plot.R b/scripts/divergenceCalc/divergence_plot.R index 138b74d..e992994 100644 --- a/scripts/divergenceCalc/divergence_plot.R +++ b/scripts/divergenceCalc/divergence_plot.R @@ -79,9 +79,10 @@ kimura_plot <- ggplot(divergence_eg_tes_rounded_for_plot, aes(x = KIMURA80, y = KIMURA_SUM, fill = named_subclass)) + geom_col(position = "stack", width = 0.01) + scale_x_reverse(expand = c(0,0), name = "Kimura 2-Parameter Distance") + - theme_bw() + - labs(title = plot_title) + theme(plot.title = element_markdown(hjust = 0.5)) + - scale_fill_manual(values = col, name = "TE Subclass") + theme_classic() + + labs(title = plot_title) + + theme(plot.title = element_markdown(hjust = 0.5)) + + scale_fill_manual(values = col, name = "TE Classification") subclass_kimura_plot <- kimura_plot + scale_y_continuous(expand = c(0.01,0), name = "Base pairs") From b7ef76a9fc5913f1cca8695e0769e685655542c8 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 14:41:06 +0200 Subject: [PATCH 19/28] Update divergence_plot.R --- scripts/divergenceCalc/divergence_plot.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/divergenceCalc/divergence_plot.R b/scripts/divergenceCalc/divergence_plot.R index e992994..66390ba 100644 --- a/scripts/divergenceCalc/divergence_plot.R +++ b/scripts/divergenceCalc/divergence_plot.R @@ -87,7 +87,7 @@ kimura_plot <- ggplot(divergence_eg_tes_rounded_for_plot, subclass_kimura_plot <- kimura_plot + scale_y_continuous(expand = c(0.01,0), name = "Base pairs") ggsave(plot = subclass_kimura_plot, - filename = paste0(opt$out_directory, "/", opt$species_name, "_subclass_div_plot.pdf"), + filename = paste0(opt$out_directory, "/", opt$species_name, "_classification_landscape.pdf"), device = "pdf", scale = 1, width = 297, @@ -99,7 +99,7 @@ ggsave(plot = subclass_kimura_plot, split_subclass_kimura_plot <- kimura_plot + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free") ggsave(plot = split_subclass_kimura_plot, - filename = paste0(opt$out_directory, "/", opt$species_name, "_split_subclass_div_plot.pdf"), + filename = paste0(opt$out_directory, "/", opt$species_name, "_split_class_landscape.pdf"), device = "pdf", scale = 1, width = 297, From f1ae8f6c0074a409f78fc7cc957847a813346167 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 14:54:07 +0200 Subject: [PATCH 20/28] Update mergeRepeats.R --- scripts/mergeRepeats.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/mergeRepeats.R b/scripts/mergeRepeats.R index 51ba09b..0723712 100644 --- a/scripts/mergeRepeats.R +++ b/scripts/mergeRepeats.R @@ -97,8 +97,11 @@ if (lowend == "yes") { # if end coordinate is before start, switch filteredRepeatsOut2 <- filteredRepeatsOut[,1:6] %>% - mutate(Start = ifelse(End < Start, End, Start), - End = ifelse(End < Start, Start, End)) + mutate(nStart = ifelse(End < Start, End, Start), + nEnd = ifelse(End < Start, Start, End)) %>% + mutate(Start = nStart, + End = nEnd) %>% + select(! c(nStart, nEnd)) write.table(filteredRepeatsOut2, file = filtBed, quote = FALSE, row.names = FALSE, sep = "\t", col.names = FALSE) From d8f79408ffeb5aecfd75c8cddadb7a86ae6cdc80 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:10:01 +0200 Subject: [PATCH 21/28] Create build.sh --- conda/build.sh | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 conda/build.sh diff --git a/conda/build.sh b/conda/build.sh new file mode 100644 index 0000000..ac56305 --- /dev/null +++ b/conda/build.sh @@ -0,0 +1,76 @@ +#!/bin/bash +#Based on https://github.com/TobyBaril/EarlGrey/blob/main/configure +set -x + +# Define paths +PACKAGE_HOME=${PREFIX}/share/${PKG_NAME}-${PKG_VERSION}-${PKG_BUILDNUM} +SCRIPT_DIR="${PACKAGE_HOME}/scripts/" + + +# Create directories +mkdir -p ${PREFIX}/bin +mkdir -p ${PACKAGE_HOME} + + +# Put package in share directory +cp -r * ${PACKAGE_HOME}/ + + +# Install SA-SSR (has to be done here because SA-SSR is an ancient repository without releases) +git clone https://github.com/ridgelab/SA-SSR +cd SA-SSR +make +cp bin/sa-ssr ${PREFIX}/bin/ + + +# Fixes to earlGrey executable +sed -i.bak "/CONDA_DEFAULT_ENV/,+4d" ${PACKAGE_HOME}/earlGrey #remove check that conda environment has a specific name + + +# Fixes sed command for executables so that it works on both linux and macos +sed -i.bak "s|sed -i |sed -i.bak |g" ${PACKAGE_HOME}/earlGrey ${SCRIPT_DIR}/rcMergeRepeat* ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh + + +# Remove -pa from RepeatClassifier +sed -i.bak 's/RepeatClassifier -pa ${THREADS} /RepeatClassifier /' ${SCRIPT_DIR}/TEstrainer/TEstrainer + + +# Remove -t parameter from sa-ssr (since multithreading doesn't work on OSX) +sed -i.bak 's/-t ${THREADS} / /' ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh +sed -i.bak 's/-t ${THREADS} / /' ${SCRIPT_DIR}/TEstrainer/TEstrainer + + +# Add SCRIPT_DIR to correct path +sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${PACKAGE_HOME}/earlGrey +sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/rcMergeRepeat* +sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/headSwap.sh +sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/autoPie.sh +sed -i.bak "s|INSERT_FILENAME_HERE|${SCRIPT_DIR}/TEstrainer/scripts/|g" ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh + + +# Set permissions to files +chmod +x ${PACKAGE_HOME}/earlGrey +chmod +x ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh +chmod +x ${SCRIPT_DIR}/* > /dev/null 2>&1 +chmod +x ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7/ltr_finder +chmod a+w ${SCRIPT_DIR}/repeatCraft/example + + +# Extract tRNAdb +tar -zxf ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7/tRNAdb.tar.gz --directory ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7 + + +# Set PERL5LIB upon activate/deactivate +for CHANGE in "activate" "deactivate"; +do + mkdir -p "${PREFIX}/etc/conda/${CHANGE}.d" +done +echo "#!/bin/sh" > "${PREFIX}/etc/conda/activate.d/${PKG_NAME}_activate.sh" +echo "export PERL5LIB=${PREFIX}/share/RepeatMasker/:${PREFIX}/share/RepeatModeler/" >> "${PREFIX}/etc/conda/activate.d/${PKG_NAME}_activate.sh" +echo "#!/bin/sh" > "${PREFIX}/etc/conda/deactivate.d/${PKG_NAME}_deactivate.sh" +echo "unset PERL5LIB" >> "${PREFIX}/etc/conda/deactivate.d/${PKG_NAME}_deactivate.sh" + + +# Put earlGrey executable in bin +cd ${PREFIX}/bin +ln -s ${PACKAGE_HOME}/earlGrey . From 63f5ad91a62fd44c2c091e8cbda5cfa0753167b1 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:11:12 +0200 Subject: [PATCH 22/28] Create meta.yaml --- conda/meta.yaml | 73 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 conda/meta.yaml diff --git a/conda/meta.yaml b/conda/meta.yaml new file mode 100644 index 0000000..2b95cd3 --- /dev/null +++ b/conda/meta.yaml @@ -0,0 +1,73 @@ +{% set name = "EarlGrey" %} +{% set version = "4.1.1" %} +{% set sha256 = "499b39f0887f6b258a0fc7ac8eb4aa1abbc3fbe2e22d412be245c21e2c896381" %} + +package: + name: {{ name|lower }} + version: {{ version }} + +source: + url: https://github.com/TobyBaril/EarlGrey/archive/refs/tags/v{{ version }}.tar.gz + sha256: {{ sha256 }} + +build: + number: 1 + run_exports: + - {{ pin_subpackage('earlgrey', max_pin='x') }} + +requirements: + build: + - make + - {{ compiler('cxx') }} + run: + - python =3.9 + - hmmer + - trf + - cd-hit + - genometools-genometools + - pandas + - ncls =0.0.64 + - pyfaidx + - pyranges + - parallel + - repeatmasker >=4.1.4 + - ltr_retriever + - mafft + - mreps + - ninja-nj + - repeatscout + - recon + - repeatmodeler >=2.0.4 + - bioconductor-genomeinfodb + - bioconductor-genomeinfodbdata + - bioconductor-bsgenome + - bioconductor-plyranges + - r-ape + - r-optparse + - r-tidyverse + - r-plyr + - r-viridis + - r-cowplot + - r-ggtext + - bedtools + - emboss + - pybedtools + +test: + commands: + - earlGrey -h + +about: + home: https://github.com/TobyBaril/EarlGrey + dev_url: https://github.com/TobyBaril/EarlGrey + license: OSL-2.1 + summary: "Earl Grey: A fully automated TE curation and annotation pipeline" + description: | + Earl Grey is a full-automated transposable element (TE) annotation pipeline, + leveraging the most widely-used tools and combining these with a consensus + elongation process (BEAT) to better define de novo consensus sequences when + annotating new genome assemblies. + +extra: + identifiers: + - doi:10.1093/molbev/msae068 From 7d3e0a0ea114454ba01766e8aa27e51119c6dfaf Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:12:31 +0200 Subject: [PATCH 23/28] Create publish_conda.yml --- .github/workflows/publish_conda.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .github/workflows/publish_conda.yml diff --git a/.github/workflows/publish_conda.yml b/.github/workflows/publish_conda.yml new file mode 100644 index 0000000..e849975 --- /dev/null +++ b/.github/workflows/publish_conda.yml @@ -0,0 +1,16 @@ +name: publish_conda + +on: + release: + types: [published] + +jobs: + publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - name: publish-to-conda + uses: maxibor/conda-package-publish-action@v1.1 + with: + subDir: 'conda' + AnacondaToken: ${{ secrets.ANACONDA_TOKEN }} From 7f6726537ba0642496ed699b7a56b2a4389fe21b Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:32:20 +0200 Subject: [PATCH 24/28] Update meta.yaml --- conda/meta.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index 2b95cd3..bbe5e91 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,6 +1,5 @@ {% set name = "EarlGrey" %} -{% set version = "4.1.1" %} -{% set sha256 = "499b39f0887f6b258a0fc7ac8eb4aa1abbc3fbe2e22d412be245c21e2c896381" %} +{% set version = "4.2.0" %} package: name: {{ name|lower }} From 4edd24db61794ca240d8d8532e60ca6dbf393b07 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:38:31 +0200 Subject: [PATCH 25/28] Update meta.yaml --- conda/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index bbe5e91..7cc0bc7 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,5 +1,5 @@ {% set name = "EarlGrey" %} -{% set version = "4.2.0" %} +{% set version = "4.2.0-dev" %} package: name: {{ name|lower }} From cda8c7f545d1004e643a89292a97d79d846b9ba7 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:08:12 +0200 Subject: [PATCH 26/28] Update meta.yaml --- conda/meta.yaml | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index 7cc0bc7..d5528a7 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,18 +1,15 @@ -{% set name = "EarlGrey" %} -{% set version = "4.2.0-dev" %} - package: - name: {{ name|lower }} - version: {{ version }} + name: earlgrey + version: "4.2.0.dev" source: - url: https://github.com/TobyBaril/EarlGrey/archive/refs/tags/v{{ version }}.tar.gz - sha256: {{ sha256 }} + path: . build: - number: 1 - run_exports: - - {{ pin_subpackage('earlgrey', max_pin='x') }} + # Specify the channels in order of priority + channels: + - conda-forge + - bioconda requirements: build: From 55ea7092e7909158e24b17bbd5b0b343e4986c92 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:18:06 +0200 Subject: [PATCH 27/28] Delete conda directory --- conda/build.sh | 76 ------------------------------------------------- conda/meta.yaml | 69 -------------------------------------------- 2 files changed, 145 deletions(-) delete mode 100644 conda/build.sh delete mode 100644 conda/meta.yaml diff --git a/conda/build.sh b/conda/build.sh deleted file mode 100644 index ac56305..0000000 --- a/conda/build.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -#Based on https://github.com/TobyBaril/EarlGrey/blob/main/configure -set -x - -# Define paths -PACKAGE_HOME=${PREFIX}/share/${PKG_NAME}-${PKG_VERSION}-${PKG_BUILDNUM} -SCRIPT_DIR="${PACKAGE_HOME}/scripts/" - - -# Create directories -mkdir -p ${PREFIX}/bin -mkdir -p ${PACKAGE_HOME} - - -# Put package in share directory -cp -r * ${PACKAGE_HOME}/ - - -# Install SA-SSR (has to be done here because SA-SSR is an ancient repository without releases) -git clone https://github.com/ridgelab/SA-SSR -cd SA-SSR -make -cp bin/sa-ssr ${PREFIX}/bin/ - - -# Fixes to earlGrey executable -sed -i.bak "/CONDA_DEFAULT_ENV/,+4d" ${PACKAGE_HOME}/earlGrey #remove check that conda environment has a specific name - - -# Fixes sed command for executables so that it works on both linux and macos -sed -i.bak "s|sed -i |sed -i.bak |g" ${PACKAGE_HOME}/earlGrey ${SCRIPT_DIR}/rcMergeRepeat* ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh - - -# Remove -pa from RepeatClassifier -sed -i.bak 's/RepeatClassifier -pa ${THREADS} /RepeatClassifier /' ${SCRIPT_DIR}/TEstrainer/TEstrainer - - -# Remove -t parameter from sa-ssr (since multithreading doesn't work on OSX) -sed -i.bak 's/-t ${THREADS} / /' ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh -sed -i.bak 's/-t ${THREADS} / /' ${SCRIPT_DIR}/TEstrainer/TEstrainer - - -# Add SCRIPT_DIR to correct path -sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${PACKAGE_HOME}/earlGrey -sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/rcMergeRepeat* -sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/headSwap.sh -sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/autoPie.sh -sed -i.bak "s|INSERT_FILENAME_HERE|${SCRIPT_DIR}/TEstrainer/scripts/|g" ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh - - -# Set permissions to files -chmod +x ${PACKAGE_HOME}/earlGrey -chmod +x ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh -chmod +x ${SCRIPT_DIR}/* > /dev/null 2>&1 -chmod +x ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7/ltr_finder -chmod a+w ${SCRIPT_DIR}/repeatCraft/example - - -# Extract tRNAdb -tar -zxf ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7/tRNAdb.tar.gz --directory ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7 - - -# Set PERL5LIB upon activate/deactivate -for CHANGE in "activate" "deactivate"; -do - mkdir -p "${PREFIX}/etc/conda/${CHANGE}.d" -done -echo "#!/bin/sh" > "${PREFIX}/etc/conda/activate.d/${PKG_NAME}_activate.sh" -echo "export PERL5LIB=${PREFIX}/share/RepeatMasker/:${PREFIX}/share/RepeatModeler/" >> "${PREFIX}/etc/conda/activate.d/${PKG_NAME}_activate.sh" -echo "#!/bin/sh" > "${PREFIX}/etc/conda/deactivate.d/${PKG_NAME}_deactivate.sh" -echo "unset PERL5LIB" >> "${PREFIX}/etc/conda/deactivate.d/${PKG_NAME}_deactivate.sh" - - -# Put earlGrey executable in bin -cd ${PREFIX}/bin -ln -s ${PACKAGE_HOME}/earlGrey . diff --git a/conda/meta.yaml b/conda/meta.yaml deleted file mode 100644 index d5528a7..0000000 --- a/conda/meta.yaml +++ /dev/null @@ -1,69 +0,0 @@ -package: - name: earlgrey - version: "4.2.0.dev" - -source: - path: . - -build: - # Specify the channels in order of priority - channels: - - conda-forge - - bioconda - -requirements: - build: - - make - - {{ compiler('cxx') }} - run: - - python =3.9 - - hmmer - - trf - - cd-hit - - genometools-genometools - - pandas - - ncls =0.0.64 - - pyfaidx - - pyranges - - parallel - - repeatmasker >=4.1.4 - - ltr_retriever - - mafft - - mreps - - ninja-nj - - repeatscout - - recon - - repeatmodeler >=2.0.4 - - bioconductor-genomeinfodb - - bioconductor-genomeinfodbdata - - bioconductor-bsgenome - - bioconductor-plyranges - - r-ape - - r-optparse - - r-tidyverse - - r-plyr - - r-viridis - - r-cowplot - - r-ggtext - - bedtools - - emboss - - pybedtools - -test: - commands: - - earlGrey -h - -about: - home: https://github.com/TobyBaril/EarlGrey - dev_url: https://github.com/TobyBaril/EarlGrey - license: OSL-2.1 - summary: "Earl Grey: A fully automated TE curation and annotation pipeline" - description: | - Earl Grey is a full-automated transposable element (TE) annotation pipeline, - leveraging the most widely-used tools and combining these with a consensus - elongation process (BEAT) to better define de novo consensus sequences when - annotating new genome assemblies. - -extra: - identifiers: - - doi:10.1093/molbev/msae068 From 72a91547fac0fc72203303c7174525a7ed8a1214 Mon Sep 17 00:00:00 2001 From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:18:19 +0200 Subject: [PATCH 28/28] Delete .github/workflows directory --- .github/workflows/publish_conda.yml | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 .github/workflows/publish_conda.yml diff --git a/.github/workflows/publish_conda.yml b/.github/workflows/publish_conda.yml deleted file mode 100644 index e849975..0000000 --- a/.github/workflows/publish_conda.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: publish_conda - -on: - release: - types: [published] - -jobs: - publish: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - name: publish-to-conda - uses: maxibor/conda-package-publish-action@v1.1 - with: - subDir: 'conda' - AnacondaToken: ${{ secrets.ANACONDA_TOKEN }}