From 8b3802cd8d007b536fc7b5cd4dce84dfcf3ce66c Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Wed, 24 Apr 2024 16:22:57 +0200
Subject: [PATCH 01/28] Remove inefficient for loop

---
 scripts/mergeRepeats.R | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/scripts/mergeRepeats.R b/scripts/mergeRepeats.R
index 36414d1..7f8ab3e 100644
--- a/scripts/mergeRepeats.R
+++ b/scripts/mergeRepeats.R
@@ -94,15 +94,12 @@ filteredRepeatsOut$length <- abs(filteredRepeatsOut$End - filteredRepeatsOut$Sta
 if (lowend == "yes") {
   filteredRepeatsOut <- filteredRepeatsOut[filteredRepeatsOut$length > 100,]
 }
-filteredRepeatsOut2 <- filteredRepeatsOut[,1:6]
-for (i in 1:length(filteredRepeatsOut2$Start)) {
-  if (filteredRepeatsOut2$End[i] < filteredRepeatsOut2$Start[i]) {
-    start <- filteredRepeatsOut2$End[i]
-    end <- filteredRepeatsOut2$Start[i]
-    filteredRepeatsOut2$Start[i] <- start
-    filteredRepeatsOut2$End[i] <- end
-  }
-}
+
+# if end coordinate is before start, switch (both new columns are computed from the original values)
+filteredRepeatsOut2 <- filteredRepeatsOut[,1:6]
+filteredRepeatsOut2[, c("Start", "End")] <- list(pmin(filteredRepeatsOut2$Start, filteredRepeatsOut2$End),
+                                                 pmax(filteredRepeatsOut2$Start, filteredRepeatsOut2$End))
+
 write.table(filteredRepeatsOut2, file = filtBed, quote = FALSE, row.names = FALSE, sep = "\t", col.names = FALSE)
 
 

From 1acd84dc16f76f80de3044c1a21b48e61363a047 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Wed, 24 Apr 2024 16:28:59 +0200
Subject: [PATCH 02/28] Update to evaluate nested repeats

---
 scripts/filteringOverlappingRepeats.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/filteringOverlappingRepeats.R b/scripts/filteringOverlappingRepeats.R
index f89570f..6448aad 100644
--- a/scripts/filteringOverlappingRepeats.R
+++ b/scripts/filteringOverlappingRepeats.R
@@ -23,10 +23,10 @@ input <- read.gff(gff.in)
 # cut overlapping regions in half
 input %<>%
   arrange(seqid, start) %>%
-  mutate(new.start = case_when(seqid == lag(seqid) & start < lag(end) ~ as.integer((start + ((lag(end) - start)/2)) + 1),
+  mutate(new.start = case_when(seqid == lag(seqid) & start < lag(end) & end > lag(end) ~ as.integer((start + ((lag(end) - start)/2)) + 1),
                                seqid == lag(seqid) & start == lag(end) ~ as.integer(start + 1),
                                .default = start),
-         new.end = case_when(seqid == lead(seqid) & end > lead(start) ~ as.integer((end - (end - lead(start))/2)),
+         new.end = case_when(seqid == lead(seqid) & end > lead(start) & end < lead(end) ~ as.integer((end - (end - lead(start))/2)),
                              seqid == lead(seqid) & end == lead(start) ~ as.integer(end),
                              .default = end)) %>%
   mutate(start = new.start,

From d57d7b4945876930f166f21ea704162cfac29cdd Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 10:28:10 +0200
Subject: [PATCH 03/28] Create divergence_calc.py

---
 scripts/divergenceCalc/divergence_calc.py | 223 ++++++++++++++++++++++
 1 file changed, 223 insertions(+)
 create mode 100644 scripts/divergenceCalc/divergence_calc.py

diff --git a/scripts/divergenceCalc/divergence_calc.py b/scripts/divergenceCalc/divergence_calc.py
new file mode 100644
index 0000000..aedb0e7
--- /dev/null
+++ b/scripts/divergenceCalc/divergence_calc.py
@@ -0,0 +1,223 @@
+import os
+from os.path import exists, getsize
+import sys
+import argparse
+import pandas as pd
+import multiprocessing
+import pybedtools
+import subprocess
+import shlex
+import shutil
+from Bio import AlignIO, SeqIO
+from math import log, sqrt
+from functools import partial
+from time import time
+from re import sub
+
+# Command-line options as (flags, keyword arguments) pairs, registered in order below
+cli_options = (
+    (('-l', '--repeat_library'), dict(type=str, required=True, help='repeat_library')),
+    (('-i', '--in_gff'), dict(type=str, required=True, help='Path to gff')),
+    (('-g', '--genome'), dict(type=str, required=True, help='Path to genome')),
+    (('-o', '--out_gff'), dict(type=str, required=True, help='Output gff')),
+    (('-tmp', '--temp_dir'), dict(type=str, default='tmp/', help='Temporary directory')),
+    (('-t', '--cores'), dict(type=int, default=4, help='Number of cores')),
+    (('-k', '--timeout'), dict(type=int, default=30,
+                              help='Seconds after which water will be cancelled and repeat treated as unalignable')),
+)
+parser = argparse.ArgumentParser()
+for flags, options in cli_options:
+    parser.add_argument(*flags, **options)
+
+# Parsed at import time; the main block below reads these values
+args = parser.parse_args()
+
+def file_check(repeat_library, in_gff, genome, out_gff, temp_dir):
+    """Exit if an input file is missing, then create temp_dir with its qseqs/ and split_library/ subdirectories (out_gff is accepted but not checked)."""
+    # report exactly which inputs are missing rather than only a generic message
+    missing = [path for path in (repeat_library, in_gff, genome) if not exists(path)]
+    if missing:
+        sys.exit('Files not found. Requires the repeat library, path to the genome, and path to gff containing coordinates and corresponding repeat files. Missing: ' + ', '.join(missing))
+    # makedirs creates missing parent directories and tolerates directories that already exist
+    for sub_dir in ("", "qseqs", "split_library"):
+        os.makedirs(os.path.join(temp_dir, sub_dir), exist_ok=True)
+
+def splitter(in_seq, temp_dir):
+    """Write each record of the FASTA file in_seq to temp_dir/split_library/<name>.fasta, where <name> is the lower-cased record name up to the first '#'."""
+    out_dir = temp_dir+"/split_library/"
+    with open(in_seq, 'r') as library:
+        for entry in SeqIO.parse(library, "fasta"):
+            family = entry.name.split(sep="#")[0].lower()
+            SeqIO.write(entry, out_dir+family+".fasta", "fasta-2line")
+
+def parse_gff(in_gff):
+    """Read a 9-column GFF; return (gff, simple_gff): the rows to align, with a lower-cased repeat_family column, and the Simple_repeat/Satellite/Low_complexity rows set aside."""
+    gff = pd.read_table(in_gff, header = None, names=['seqnames', 'tool', 'repeat_class', 'start', 'end', 'score', 'strand', 'phase', 'metadata'])
+    # split off repeats that are not aligned; reset_index keeps the old row labels in an 'index' column
+    is_simple = gff['repeat_class'].str.contains('Simple_repeat|Satellite|Low_complexity')
+    simple_gff = gff[is_simple].reset_index()
+    gff = gff[~is_simple].reset_index()
+    # take the ID attribute wherever it sits in the metadata, rather than assuming it is the third ';' field
+    gff['repeat_family'] = gff['metadata'].str.extract(r'(?:^|;)ID=([^;]*)', expand=False).str.lower()
+    return(gff, simple_gff)
+
+def file_name_generator():
+    """Return a random temporary file name: 12 distinct ASCII letters followed by '.tmp'."""
+    from random import sample
+    from string import ascii_letters
+    return(''.join(sample(ascii_letters, 12))+'.tmp')
+
+def Kimura80(qseq, sseq):
+    """
+    Kimura 2-parameter distance between two aligned sequences (bases are matched against upper-case pairs).
+    Calculations adapted from https://github.com/kgori/python_tools_on_github/blob/master/pairwise_distances.py
+    Returns float('nan') when no unambiguous bases align or the distance is
+    undefined (log/sqrt argument not positive), instead of raising an exception.
+    """
+    # define transitions, transversions, matches
+    transitions = {"AG", "GA", "CT", "TC"}
+    transversions = {"AC", "CA", "AT", "TA",
+                     "GC", "CG", "GT", "TG"}
+    matches = {"AA", "GG", "CC", "TT"}
+    # set counters to 0
+    m,ts,tv=0,0,0
+    # count transitions, transversions, matches (gaps and Ns fall in no class)
+    for i, j in zip(qseq, sseq):
+        if i+j in matches: m+=1
+        elif i+j in transitions: ts+=1
+        elif i+j in transversions: tv+=1
+    # count number of bp which align (excludes gaps, Ns)
+    aln_len = m + ts + tv
+    if aln_len == 0: return(float("nan"))
+    # calculate p and q, then the two terms inside the log of the Kimura distance
+    p, q = ts/aln_len, tv/aln_len
+    a, b = 1 - 2*p - q, 1 - 2*q
+    return(-0.5 * log(a * sqrt(b)) if a > 0 and b > 0 else float("nan"))
+
+def outer_func(genome_path, temp_dir, timeoutSeconds, gff):
+    """
+    Align every repeat in one chunk of the gff to its library sequence with water
+    and write the rows, plus a Kimura column, to a temporary TSV in temp_dir.
+    Rows that time out are also logged to a 'failed_' file. Returns the TSV path.
+    """
+    generated_name = file_name_generator()
+    # join paths so temp_dir works with or without a trailing slash
+    holder_file_name = os.path.join(temp_dir, generated_name)
+    failed_file_name = os.path.join(temp_dir, "failed_"+generated_name)
+    with open(holder_file_name, 'w') as tmp_out:
+        # drop the first column (the 'index' column when gff comes from parse_gff)
+        header = list(gff.columns.values)[1:] + ["Kimura"]
+        tmp_out.write("\t".join(header)+"\n")
+        for idx, row in gff.iterrows():
+            # Set scaffold, coordinates (start shifted by -1 for BED), strand, repeat family
+            seqnames, start, end, strand, repeat_family = row['seqnames'], str(row['start'] - 1), str(row['end']), row['strand'], row['repeat_family']
+            # Create BED string for BEDtools
+            bed_str = " ".join([seqnames, start, end, ".", ".", strand])
+            # Set paths for query, subject and alignment files
+            query_path = os.path.join(temp_dir, "qseqs", str(idx))
+            subject_path = os.path.join(temp_dir, "split_library", repeat_family+".fasta")
+            water_path = query_path+".water"
+            # Extract the stranded query sequence with bedtools getfasta
+            pybedtools.BedTool(bed_str, from_string=True).sequence(fi=genome_path, fo=query_path, s=True)
+            # Build the water command as a list so paths containing spaces stay intact
+            test_command = ["water", query_path, subject_path, "-gapopen", "10", "-gapextend", "0.5",
+                            "-outfile", water_path, "-aformat", "fasta"]
+            alignment_p = subprocess.Popen(test_command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
+            timed_out = False
+            try:
+                alignment_p.wait(timeoutSeconds)
+            except subprocess.TimeoutExpired:
+                # if water fails to complete before timeout, kill, reap and move on
+                timed_out = True
+                alignment_p.kill()
+                alignment_p.wait()
+                with open(failed_file_name, "a") as failed_file:
+                    failed_file.write(seqnames+":"+start+"-"+end+"_"+strand+"_"+repeat_family+"\n")
+            if timed_out or exists(water_path) is False or getsize(water_path) == 0:
+                # If no alignment is possible (or water was killed), set distance to NA
+                Kdist = "NA"
+            else:
+                # Read in alignment, calculate distance and convert it to a string
+                aln = AlignIO.read(water_path, 'fasta')
+                Kdist = str(round(Kimura80(str(aln[0].seq).upper(), str(aln[1].seq).upper()), 4))
+            # Delete temporary files
+            os.remove(query_path)
+            if exists(water_path):
+                os.remove(water_path)
+            # Make line for temporary file and write to file (again skipping the first column)
+            tmp_holder = row.to_list()[1:]
+            tmp_out.write("\t".join(str(x) for x in tmp_holder)+"\t"+Kdist+"\n")
+
+    return(holder_file_name)
+
+def tmp_out_parser(file_list, simple_gff):
+    """
+    Combine the per-chunk result tables into one gff-shaped DataFrame: append
+    ';KIMURA80=<value>' to the metadata, add the simple repeats back, and sort by
+    seqnames and start. file_list holds paths of tab-separated files with a header.
+    """
+    # Read all results and concatenate once (no repeated copies, no concat onto an empty frame)
+    frames = [pd.read_csv(file, sep = "\t") for file in file_list]
+    parts = [simple_gff]
+    if frames:
+        gff = pd.concat(frames, ignore_index=True)
+        # Convert numbers to strings for concatenation and add the new data onto metadata
+        gff['metadata'] = gff['metadata'] + ";KIMURA80=" + gff['Kimura'].astype(str)
+        # Remove unnecessary columns and put the aligned repeats ahead of the simple ones
+        parts.insert(0, gff.drop(columns = ['Kimura', 'repeat_family']))
+    # with an empty file_list only the simple repeats are passed through
+    gff = pd.concat(parts, ignore_index=True)
+    gff = gff.sort_values(by=['seqnames', 'start'])
+    # reset_index adds 'level_0' because simple_gff already carries an 'index' column; drop both
+    gff = gff.reset_index().drop(columns = ['level_0', 'index'])
+    return(gff)
+
+if __name__ == "__main__":
+
+    start_time = time()
+
+    # check files exist
+    file_check(args.repeat_library, args.in_gff, args.genome, args.out_gff, args.temp_dir)
+
+    # split library file
+    print("Splitting repeat library")
+    splitter(args.repeat_library, args.temp_dir)
+
+    # read in gff and separate the simple repeats from the repeats to align
+    print("Reading in gff")
+    in_gff, simple_gff = parse_gff(args.in_gff)
+
+    # create as many processes as instructed cores (at least one)
+    num_processes = max(1, args.cores)
+
+    # calculate the chunk size, rounded up and never 0 (a step of 0 makes range() below raise)
+    chunk_size = max(1, -(-in_gff.shape[0] // num_processes))
+
+    # break into chunks of consecutive rows
+    chunks = [in_gff.iloc[i:i + chunk_size] for i in range(0, in_gff.shape[0], chunk_size)]
+
+    print("Starting calculations")
+    # Perform calculations in parallel, one temporary results file per chunk
+    func = partial(outer_func, args.genome, args.temp_dir, args.timeout)
+    pool = multiprocessing.Pool(processes=num_processes)
+    results = pool.map(func, chunks)
+    pool.close()
+    pool.join()
+    print("Finished calculations")
+
+    # Free up memory (necessary with very large gffs and low memory machines)
+    del chunks
+    del in_gff
+
+    # Read in temp files, fix metadata, add simple repeats back, and sort
+    calc_gff = tmp_out_parser(results, simple_gff)
+
+    # write to file
+    calc_gff.to_csv(args.out_gff, sep = "\t", header = False, index=False)
+
+    # print run time for number of rows
+    run_time = time() - start_time
+    print("Total run time for ", len(calc_gff), " rows was ", run_time, " seconds")
+
+    # Delete folder of split library
+    shutil.rmtree(args.temp_dir+"/split_library/", ignore_errors=True)

From aae0c809665699b27cada2304eb476a38e645de6 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 10:28:46 +0200
Subject: [PATCH 04/28] Create divergence_plot.R

---
 scripts/divergenceCalc/divergence_plot.R | 151 +++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 scripts/divergenceCalc/divergence_plot.R

diff --git a/scripts/divergenceCalc/divergence_plot.R b/scripts/divergenceCalc/divergence_plot.R
new file mode 100644
index 0000000..5e8bf7b
--- /dev/null
+++ b/scripts/divergenceCalc/divergence_plot.R
@@ -0,0 +1,151 @@
+library(optparse)
+
+# Define command-line options (a default of NA marks an option as not supplied)
+option_list <- list(
+  make_option(c("-s", "--species_name"), default=NA, type = "character", help="Species name (required)"),
+  make_option(c("-g", "--in_gff"), default=NA, type = "character", help="GFF with Kimura distances (required)"),
+  make_option(c("-o", "--out_directory"), default=NA, type = "character", help="Directory to write plots to (required)")
+)
+opt <- parse_args(OptionParser(option_list=option_list))
+
+# Check variables are set: each required option is paired with its error message
+# and tested in order, so the first missing one is reported
+required_opts <- c(species_name = "Species name must be supplied",
+                   in_gff = "Path to input gff must be supplied",
+                   out_directory = "Path to output directory must be supplied")
+for (opt_name in names(required_opts)) {
+  if (is.na(opt[[opt_name]])) {
+    stop(required_opts[[opt_name]])
+  }
+}
+
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(plyranges))
+suppressPackageStartupMessages(library(viridis))
+suppressPackageStartupMessages(library(cowplot))
+suppressPackageStartupMessages(library(ggtext))
+
+# Create plot title (species name wrapped in markdown asterisks, underscores shown as spaces)
+plot_title <- paste0("Repeat landscape of *", gsub("_", " ", opt$species_name), "*")
+title_plot <- ggplot() + labs(title = plot_title) + theme(plot.title = element_markdown(hjust = 0.5)) + theme(panel.background = element_blank())
+
+# Read in data, keep only repeats with a calculated Kimura distance of at most 0.5
+divergence_eg_gff <- read_gff(opt$in_gff) %>%
+  mutate(KIMURA80 = as.double(KIMURA80)) %>%
+  filter(!is.na(KIMURA80), KIMURA80 <= 0.5)
+
+# Break down the repeat type: subclass is the text before the first "/", superfamily the text after the last "/" up to the first "-"
+divergence_eg_tes_gff <- divergence_eg_gff %>%
+  dplyr::mutate(subclass = sub("/.*", "", type),
+                superfamily = sub("-.*", "", sub(".*/", "", type)))
+
+# Give Penelope its own PLE subclass, group subclasses outside the listed set as "Other", and add display names
+divergence_eg_tes_gff <- divergence_eg_tes_gff %>%
+  dplyr::mutate(subclass = ifelse(superfamily == "Penelope", "PLE", subclass)) %>%
+  dplyr::mutate(subclass = ifelse(subclass %in% c("DNA", "LINE", "LTR", "PLE", "RC", "SINE", "Unknown"), subclass, "Other")) %>%
+  dplyr::mutate(named_subclass = case_when(subclass == "DNA" ~ "DNA Transposon",
+                                           subclass == "LTR" ~ "LTR Retrotransposon",
+                                           subclass == "PLE" ~ "Penelope",
+                                           subclass == "RC" ~ "Rolling Circle",
+                                           .default = subclass))
+
+# Sum widths per named subclass and Kimura distance (rounded to 2 decimal places) to create data for plots
+divergence_eg_tes_rounded_for_plot  <- divergence_eg_tes_gff %>%
+  as_tibble() %>%
+  dplyr::mutate(KIMURA80 = round(x = KIMURA80, digits = 2)) %>%
+  group_by(named_subclass, KIMURA80) %>%
+  mutate(KIMURA_SUM = sum(width)) %>%
+  ungroup() %>%
+  dplyr::select(subclass, named_subclass, KIMURA80, KIMURA_SUM) %>%
+  base::unique() %>%
+  arrange(named_subclass, KIMURA80)
+
+# Set fill colours, keeping only subclasses present in the data and ordering rows by named_subclass
+fill_colours <- tibble(subclass = c("DNA", "LINE", "LTR", "PLE", "RC", "SINE", "Other", "Unknown"),
+                       named_subclass = c("DNA Transposon", "LINE", "LTR Retrotransposon", "Penelope", "Rolling Circle", "SINE", "Other", "Unknown"),
+                       fill_colour = c("#E32017", "#0098D4", "#00782A", "#7156A5", "#EE7C0E", "#9B0056", "#F3A9BB", "#A0A5A9")) %>%
+  filter(subclass %in% divergence_eg_tes_rounded_for_plot$subclass) %>%
+  arrange(named_subclass)
+
+# Create and save main plots
+kimura_plot <- ggplot(divergence_eg_tes_rounded_for_plot,
+                      aes(x = KIMURA80, y = KIMURA_SUM, fill = named_subclass)) +
+  geom_col(position = "stack", width = 0.01) +
+  scale_x_continuous(limits = c(-0.01, 0.51),
+                     expand = c(0,0), name = "Kimura 2-Parameter Distance") +
+  theme_bw() +
+  labs(title = plot_title) + theme(plot.title = element_markdown(hjust = 0.5)) +
+  scale_fill_manual(values = fill_colours$fill_colour, name = "TE Subclass")
+subclass_kimura_plot <- kimura_plot + scale_y_continuous(expand = c(0.01,0), name = "Base pairs")
+ggsave(plot = subclass_kimura_plot, filename = paste0(opt$out_directory, "/", opt$species_name, "_subclass_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5)
+split_subclass_kimura_plot <- kimura_plot + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free")
+ggsave(plot = split_subclass_kimura_plot, filename = paste0(opt$out_directory, "/", opt$species_name, "_split_subclass_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5)
+
+# Perform maths for more divided plot
+divergence_eg_tes_rounded_for_superfamily_plot  <- divergence_eg_tes_gff %>%
+  as_tibble() %>%
+  dplyr::mutate(KIMURA80 = round(x = KIMURA80, digits = 2),
+                type = sub("-.*", "", type)) %>%
+  group_by(superfamily, KIMURA80) %>%
+  mutate(KIMURA_SUM = sum(width)) %>%
+  ungroup()%>%
+  dplyr::select(type, subclass, superfamily, KIMURA80, KIMURA_SUM) %>%
+  base::unique() %>%
+  arrange(subclass, superfamily, KIMURA80)
+
+# Split data as necessary
+divergence_eg_tes_rounded_for_superfamily_plot <- split(divergence_eg_tes_rounded_for_superfamily_plot,
+                                                        f = divergence_eg_tes_rounded_for_superfamily_plot$subclass)
+
+# Create plots of superfamilies of DNA transposons, LINEs, LTR retrotransposons and SINEs
+kimura_superfamily_plot_1 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$DNA,
+                                    aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
+  geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
+  scale_x_continuous(limits = c(-0.01, 0.51),
+                     expand = c(0,0), name = "") +
+  theme_bw() +
+  theme(legend.title=element_blank()) +
+  scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
+  facet_grid(subclass~., scales = "free") +
+  guides(fill=guide_legend(ncol=3))
+kimura_superfamily_plot_2 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LINE,
+                                    aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
+  geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
+  scale_x_continuous(limits = c(-0.01, 0.51),
+                     expand = c(0,0), name = "") +
+  theme_bw() +
+  theme(legend.title=element_blank()) +
+  scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
+  facet_grid(subclass~., scales = "free") +
+  guides(fill=guide_legend(ncol=3)) +
+  scale_fill_brewer(palette = "Blues", direction = -1)
+kimura_superfamily_plot_3 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LTR,
+                                    aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
+  geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
+  scale_x_continuous(limits = c(-0.01, 0.51),
+                     expand = c(0,0), name = "") +
+  theme_bw() +
+  theme(legend.title=element_blank()) +
+  scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
+  facet_grid(subclass~., scales = "free") +
+  guides(fill=guide_legend(ncol=3)) +
+  scale_fill_brewer(palette = "Greens", direction = -1)
+kimura_superfamily_plot_4 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$SINE,
+                                    aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
+  geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
+  scale_x_continuous(limits = c(-0.01, 0.51),
+                     expand = c(0,0), name = "Kimura 2-Parameter Distance") +
+  theme_bw() +
+  theme(legend.title=element_blank()) +
+  scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
+  facet_grid(subclass~., scales = "free") +
+  guides(fill=guide_legend(ncol=3)) +
+  scale_fill_brewer(palette = "YlOrRd", direction = -1)
+
+# Combine plots and title
+superfamily_kimura_plot <- plot_grid(kimura_superfamily_plot_1, kimura_superfamily_plot_2, kimura_superfamily_plot_3, kimura_superfamily_plot_4, 
+          ncol = 1, align = "v")
+superfamily_kimura_plot_titled <- plot_grid(title_plot, superfamily_kimura_plot, ncol = 1, rel_heights = c(1, 30))
+
+# Save divided plot
+ggsave(plot = superfamily_kimura_plot_titled, filename = paste0(opt$out_directory, "/", opt$species_name, "_superfamily_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5)

From 3455194d722c0208b0726224f05c6fa7b2055ae6 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 10:48:07 +0200
Subject: [PATCH 05/28] Update earlGrey

---
 earlGrey | 42 ++++++++++++++++++------------------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/earlGrey b/earlGrey
index 516a78a..dc6a05e 100644
--- a/earlGrey
+++ b/earlGrey
@@ -3,7 +3,7 @@
 usage()
 {
 	echo "	#############################
-	earlGrey version 4.1.1
+	earlGrey version 4.2.0
 	Required Parameters:
 		-g == genome.fasta
 		-s == species name
@@ -25,12 +25,6 @@ usage()
 
 	earlGrey -g bombyxMori.fasta -s bombyxMori -o /home/toby/bombyxMori/repeatAnnotation/ -t 16
 
-
-	Prerequisites - These must be configured prior to using Earl Grey:
-		- RepeatMasker (Version 4.1.2)
-		- Ensure RepeatMasker has been configured with the desired repeat libraries (RepBase and at least Dfam 3.4 are recommended)
-		- RepeatModeler2
-
 	Queries can be sent to:
 	tobias.baril[at]unine.ch
 
@@ -192,17 +186,6 @@ novoMask()
 	fi
 }
 
-# Subprocess calcDivRL
-# Calculate divergence estimates
-calcDivRL()
-{
-	cd ${OUTDIR}/${species}_RepeatLandscape
-	genome_size=$(sed -n '4p' ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl | rev | cut -f1,1 -d ':' | rev | sed 's/ bp.*//g; s/ //g')
-	align_file=$(readlink -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.align)
-	calcDivergenceFromAlign.pl -s ${species}.divsum $align_file
-	div_file=$(readlink -f $OUTDIR/${species}_RepeatLandscape/${species}.divsum)
-}
-
 # Subprocess rcMergeRepeats
 # Defragment repeat sequences to adjust for insertion times
 mergeRep()
@@ -237,6 +220,22 @@ charts()
 	fi
 }
 
+# Subprocess calcDivRL
+# Calculate divergence estimates, plot them, then replace the merged gff with the annotated one
+calcDivRL()
+{
+	cd "${OUTDIR}/${species}_RepeatLandscape"
+	divLibrary="${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta"
+	divInGff="${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff"
+	divOutGff="${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff"
+	if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then
+		divLibrary="$latestFile"
+	fi
+	python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l "$divLibrary" -g "$genome" -i "$divInGff" -o "$divOutGff" -t "$ProcNum"
+	Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s "$species" -g "$divOutGff" -o "${OUTDIR}/${species}_RepeatLandscape/"
+	mv "$divOutGff" "$divInGff"
+}
+
 # Subprocess sweepUp
 # Puts required files into a summary folder
 sweepUp()
@@ -485,7 +484,6 @@ if [ ! -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl ]; then
 	if [ ! -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl ]; then
 		echo "ERROR: RepeatMasker failed, please check logs" && exit 2
 	fi
-	calcDivRL
 	sleep 1
 else
 	stage="Final masking already complete, skipping..." && runningTea
@@ -495,11 +493,6 @@ else
 fi
 
 # Stage 6
-
-##TODO
-#### find a way to rename variable in table that is less RAM intensive than the current method in python
-#### i think this is inside rcMergeRepeatsLoose
-
 if [ ! -f ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed ] ; then
 	stage="Defragmenting Repeats" && runningTea
 	mergeRep
@@ -513,6 +506,7 @@ fi
 # Stage 7
 stage="Generating Summary Plots" && runningTea
 charts
+calcDivRL
 sleep 1
 
 # Stage 8

From cd4c8853e27acf22ab2ade57dd303f11162ffe9f Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 10:52:22 +0200
Subject: [PATCH 06/28] Update autoPie.R

---
 scripts/autoPie.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/autoPie.R b/scripts/autoPie.R
index d62dd5a..95b5c55 100644
--- a/scripts/autoPie.R
+++ b/scripts/autoPie.R
@@ -1,7 +1,7 @@
 # load libraries
 
-library(tidyverse)
-library(data.table)
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(data.table))
 
 # set options
 

From 9cf8bcb3a7d31280e1e4134b495519394c5c6f2b Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 10:52:54 +0200
Subject: [PATCH 07/28] Update filteringOverlappingRepeats.R

---
 scripts/filteringOverlappingRepeats.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/filteringOverlappingRepeats.R b/scripts/filteringOverlappingRepeats.R
index 6448aad..7e0bf3a 100644
--- a/scripts/filteringOverlappingRepeats.R
+++ b/scripts/filteringOverlappingRepeats.R
@@ -1,7 +1,7 @@
 # load libraries
-library(GenomicRanges)
-library(ape)
-library(tidyverse)
+suppressPackageStartupMessages(library(GenomicRanges))
+suppressPackageStartupMessages(library(ape))
+suppressPackageStartupMessages(library(tidyverse))
 
 # set options
 

From 38c5047d26f937fcba63763790f9ccbc84a1058a Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 10:53:05 +0200
Subject: [PATCH 08/28] Update makeGff.R

---
 scripts/makeGff.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/makeGff.R b/scripts/makeGff.R
index 8908e80..ee979bd 100644
--- a/scripts/makeGff.R
+++ b/scripts/makeGff.R
@@ -1,4 +1,4 @@
-library(tidyverse)
+suppressPackageStartupMessages(library(tidyverse))
 
 args <- commandArgs()
 print(args)

From a43cd4661534684fd0ed8e207b89632a0a3a8d02 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 10:53:20 +0200
Subject: [PATCH 09/28] Update mergeRepeats.R

---
 scripts/mergeRepeats.R | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/mergeRepeats.R b/scripts/mergeRepeats.R
index 7f8ab3e..51ba09b 100644
--- a/scripts/mergeRepeats.R
+++ b/scripts/mergeRepeats.R
@@ -1,8 +1,8 @@
-library(tidyverse)
-library(plyr)
-library(dplyr)
-library(magrittr)
-library(data.table)
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(plyr))
+suppressPackageStartupMessages(library(dplyr))
+suppressPackageStartupMessages(library(magrittr))
+suppressPackageStartupMessages(library(data.table))
 options(scipen = 100, stringsAsFactors = FALSE)
 
   #####

From 7b97aaa7924e88be29d60ea2f8169e9b213cec3e Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 11:00:17 +0200
Subject: [PATCH 10/28] Update divergence_plot.R

---
 scripts/divergenceCalc/divergence_plot.R | 50 ++++++++++++++++++------
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/scripts/divergenceCalc/divergence_plot.R b/scripts/divergenceCalc/divergence_plot.R
index 5e8bf7b..884a463 100644
--- a/scripts/divergenceCalc/divergence_plot.R
+++ b/scripts/divergenceCalc/divergence_plot.R
@@ -71,15 +71,31 @@ fill_colours <- tibble(subclass = c("DNA", "LINE", "LTR", "PLE", "RC", "SINE", "
 kimura_plot <- ggplot(divergence_eg_tes_rounded_for_plot,
                       aes(x = KIMURA80, y = KIMURA_SUM, fill = named_subclass)) +
   geom_col(position = "stack", width = 0.01) +
-  scale_x_continuous(limits = c(-0.01, 0.51),
-                     expand = c(0,0), name = "Kimura 2-Parameter Distance") +
+  scale_x_reverse(limits = c(-0.01, 0.51),
+                  expand = c(0,0), name = "Kimura 2-Parameter Distance") +
   theme_bw() +
   labs(title = plot_title) + theme(plot.title = element_markdown(hjust = 0.5)) +
   scale_fill_manual(values = fill_colours$fill_colour, name = "TE Subclass")
 subclass_kimura_plot <- kimura_plot + scale_y_continuous(expand = c(0.01,0), name = "Base pairs")
-ggsave(plot = subclass_kimura_plot, filename = paste0(opt$out_directory, "/", opt$species_name, "_subclass_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5)
+ggsave(plot = subclass_kimura_plot, 
+       filename = paste0(opt$out_directory, "/", opt$species_name, "_subclass_div_plot.pdf"),
+       device = "pdf", 
+       scale = 1,
+       width = 297, 
+       height = 210,
+       units = "mm",
+       dpi = 300,
+       limitsize = FALSE)
 split_subclass_kimura_plot <- kimura_plot + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free")
-ggsave(plot = split_subclass_kimura_plot, filename = paste0(opt$out_directory, "/", opt$species_name, "_split_subclass_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5)
+ggsave(plot = split_subclass_kimura_plot, 
+       filename = paste0(opt$out_directory, "/", opt$species_name, "_split_subclass_div_plot.pdf"), 
+       device = "pdf", 
+       scale = 1,
+       width = 297, 
+       height = 210,
+       units = "mm",
+       dpi = 300,
+       limitsize = FALSE)
 
 # Perform maths for more divided plot
 divergence_eg_tes_rounded_for_superfamily_plot  <- divergence_eg_tes_gff %>%
@@ -101,8 +117,8 @@ divergence_eg_tes_rounded_for_superfamily_plot <- split(divergence_eg_tes_rounde
 kimura_superfamily_plot_1 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$DNA,
                                     aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
   geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
-  scale_x_continuous(limits = c(-0.01, 0.51),
-                     expand = c(0,0), name = "") +
+  scale_x_reverse(limits = c(-0.01, 0.51),
+                  expand = c(0,0), name = "") +
   theme_bw() +
   theme(legend.title=element_blank()) +
   scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
@@ -111,8 +127,8 @@ kimura_superfamily_plot_1 <- ggplot(divergence_eg_tes_rounded_for_superfamily_pl
 kimura_superfamily_plot_2 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LINE,
                                     aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
   geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
-  scale_x_continuous(limits = c(-0.01, 0.51),
-                     expand = c(0,0), name = "") +
+  scale_x_reverse(limits = c(-0.01, 0.51),
+                  expand = c(0,0), name = "") +
   theme_bw() +
   theme(legend.title=element_blank()) +
   scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
@@ -122,8 +138,8 @@ kimura_superfamily_plot_2 <- ggplot(divergence_eg_tes_rounded_for_superfamily_pl
 kimura_superfamily_plot_3 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LTR,
                                     aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
   geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
-  scale_x_continuous(limits = c(-0.01, 0.51),
-                     expand = c(0,0), name = "") +
+  scale_x_reverse(limits = c(-0.01, 0.51),
+                  expand = c(0,0), name = "") +
   theme_bw() +
   theme(legend.title=element_blank()) +
   scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
@@ -133,8 +149,8 @@ kimura_superfamily_plot_3 <- ggplot(divergence_eg_tes_rounded_for_superfamily_pl
 kimura_superfamily_plot_4 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$SINE,
                                     aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
   geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
-  scale_x_continuous(limits = c(-0.01, 0.51),
-                     expand = c(0,0), name = "Kimura 2-Parameter Distance") +
+  scale_x_reverse(limits = c(-0.01, 0.51),
+                  expand = c(0,0), name = "Kimura 2-Parameter Distance") +
   theme_bw() +
   theme(legend.title=element_blank()) +
   scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
@@ -148,4 +164,12 @@ superfamily_kimura_plot <- plot_grid(kimura_superfamily_plot_1, kimura_superfami
 superfamily_kimura_plot_titled <- plot_grid(title_plot, superfamily_kimura_plot, ncol = 1, rel_heights = c(1, 30))
 
 # Save divided plot
-ggsave(plot = superfamily_kimura_plot_titled, filename = paste0(opt$out_directory, "/", opt$species_name, "_superfamily_div_plot.pdf"), device = "pdf", width = 12.85, height = 8.5)
+ggsave(plot = superfamily_kimura_plot_titled, 
+       filename = paste0(opt$out_directory, "/", opt$species_name, "_superfamily_div_plot.pdf"), 
+       device = "pdf", 
+       scale = 1,
+       width = 297, 
+       height = 210,
+       units = "mm",
+       dpi = 300,
+       limitsize = FALSE)

From aa238ac9331db6778aa8475a70f351c1e3e900ad Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 11:37:35 +0200
Subject: [PATCH 11/28] Update earlGrey

---
 earlGrey | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/earlGrey b/earlGrey
index dc6a05e..16c4c7a 100644
--- a/earlGrey
+++ b/earlGrey
@@ -66,6 +66,7 @@ prepGenome()
 		genome=${genome}.prep
 	else
 		dict=${genome}.dict
+		genOrig=$genome
 		genome=${genome}.prep
 	fi
 }
@@ -213,10 +214,8 @@ charts()
 	cd ${OUTDIR}/${species}_summaryFiles/
 	if [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ]; then
 		${SCRIPT_DIR}/autoPie.sh -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed -t ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -p ${OUTDIR}/${species}_summaryFiles/${species}.summaryPie.pdf -o ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.txt
-		Rscript ${SCRIPT_DIR}/autoLand.R $div_file $genome_size $species ${OUTDIR}/${species}_summaryFiles/${species}.repeatLandscape.pdf
 	else
 		${SCRIPT_DIR}/autoPie.sh -i ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed -t ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -p ${OUTDIR}/${species}_summaryFiles/${species}.summaryPie.pdf -o ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.txt
-		Rscript ${SCRIPT_DIR}/autoLand.R $div_file $genome_size $species ${OUTDIR}/${species}_summaryFiles/${species}.repeatLandscape.pdf
 	fi
 }
 
@@ -226,11 +225,11 @@ calcDivRL()
 {
 	cd ${OUTDIR}/${species}_RepeatLandscape
 	if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then
-		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genome -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
+		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
 		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/
 		mv ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
 	else
-		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genome -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
+		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
 		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/
 		mv ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
 	fi

From accac1c242ddd961d5a8eba0f8dcdb9e85bb36b7 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 11:40:40 +0200
Subject: [PATCH 12/28] Update earlGrey

---
 earlGrey | 1 +
 1 file changed, 1 insertion(+)

diff --git a/earlGrey b/earlGrey
index 16c4c7a..c41853c 100644
--- a/earlGrey
+++ b/earlGrey
@@ -63,6 +63,7 @@ prepGenome()
 		mv ${genome}.tmp.dict ${genome}.dict
 		sed -i '/^>/! s/[DVHBPE]/N/g' ${genome}.prep
 		dict=${genome}.dict
+		genOrig=$genome
 		genome=${genome}.prep
 	else
 		dict=${genome}.dict

From f985001be1bc28df3054bb0c9141ebe516fdf701 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 11:46:28 +0200
Subject: [PATCH 13/28] Update earlGrey

---
 earlGrey | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/earlGrey b/earlGrey
index c41853c..b71be36 100644
--- a/earlGrey
+++ b/earlGrey
@@ -226,13 +226,13 @@ calcDivRL()
 {
 	cd ${OUTDIR}/${species}_RepeatLandscape
 	if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then
-		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
+		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
 		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/
-		mv ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
+		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
 	else
-		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
+		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
 		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/
-		mv ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
+		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
 	fi
 }
 

From 38b89379225171a5cd7451c99b8dee00a94c594e Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 11:52:29 +0200
Subject: [PATCH 14/28] Update earlGrey

---
 earlGrey | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/earlGrey b/earlGrey
index b71be36..47c853e 100644
--- a/earlGrey
+++ b/earlGrey
@@ -227,12 +227,12 @@ calcDivRL()
 	cd ${OUTDIR}/${species}_RepeatLandscape
 	if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then
 		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
-		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/
-		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
+		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/
+		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
 	else
 		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
-		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/
-		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_RepeatLandscape/${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
+		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/
+		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
 	fi
 }
 

From f3843675e73792f646d4253abdf048bab6fdddc0 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 12:31:19 +0200
Subject: [PATCH 15/28] Update divergence_plot.R

---
 scripts/divergenceCalc/divergence_plot.R | 49 ++++++++++++++++--------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/scripts/divergenceCalc/divergence_plot.R b/scripts/divergenceCalc/divergence_plot.R
index 884a463..138b74d 100644
--- a/scripts/divergenceCalc/divergence_plot.R
+++ b/scripts/divergenceCalc/divergence_plot.R
@@ -47,6 +47,7 @@ divergence_eg_tes_gff <- divergence_eg_tes_gff %>%
                                            subclass == "LTR" ~ "LTR Retrotransposon",
                                            subclass == "PLE" ~ "Penelope",
                                            subclass == "RC" ~ "Rolling Circle",
+                                           subclass == "Unknown" ~ "Unclassified",
                                            .default = subclass))
 
 # Sum lengths to create data for plots (remove subclasses not in standard set)
@@ -60,23 +61,30 @@ divergence_eg_tes_rounded_for_plot  <- divergence_eg_tes_gff %>%
   base::unique() %>%
   arrange(named_subclass, KIMURA80)
 
+divergence_eg_tes_rounded_for_plot$named_subclass %<>% 
+  as.factor() %>%
+  ordered(levels = c("DNA Transposon", "Rolling Circle", "Penelope", "LINE", "SINE", "LTR Retrotransposon", "Other (Simple Repeat, Microsatellite, RNA)", "Unclassified"))
+
 # Set fill colours
-fill_colours <- tibble(subclass = c("DNA", "LINE", "LTR", "PLE", "RC", "SINE", "Other", "Unknown"),
-                       named_subclass = c("DNA Transposon", "LINE", "LTR Retrotransposon", "Penelope", "Rolling Circle", "SINE", "Other", "Unknown"),
-                       fill_colour = c("#E32017", "#0098D4", "#00782A", "#7156A5", "#EE7C0E", "#9B0056", "#F3A9BB", "#A0A5A9")) %>%
-  filter(subclass %in% divergence_eg_tes_rounded_for_plot$subclass) %>%
-  arrange(named_subclass)
+fill_colours <- data.frame(subclass = c("DNA", "RC", "PLE", "LINE", "SINE", "LTR", "Other", "Unknown"),
+                       named_subclass = c("DNA Transposon", "Rolling Circle", "Penelope", "LINE", "SINE", "LTR Retrotransposon", "Other (Simple Repeat, Microsatellite, RNA)", "Unclassified"),
+                       fill_colour = c("#E32017", "#EE7C0E", "#7156A5", "#0098D4", "#9B0056", "#00782A", "#F3A9BB", "#A0A5A9")) %>%
+  filter(subclass %in% divergence_eg_tes_rounded_for_plot$subclass) 
+
+col <- fill_colours$fill_colour
+names(col) <- fill_colours$named_subclass
 
 # Create and save main plots
 kimura_plot <- ggplot(divergence_eg_tes_rounded_for_plot,
                       aes(x = KIMURA80, y = KIMURA_SUM, fill = named_subclass)) +
   geom_col(position = "stack", width = 0.01) +
-  scale_x_reverse(limits = c(-0.01, 0.51),
-                  expand = c(0,0), name = "Kimura 2-Parameter Distance") +
+  scale_x_reverse(expand = c(0,0), name = "Kimura 2-Parameter Distance") +
   theme_bw() +
   labs(title = plot_title) + theme(plot.title = element_markdown(hjust = 0.5)) +
-  scale_fill_manual(values = fill_colours$fill_colour, name = "TE Subclass")
+  scale_fill_manual(values = col, name = "TE Subclass")
+
 subclass_kimura_plot <- kimura_plot + scale_y_continuous(expand = c(0.01,0), name = "Base pairs")
+
 ggsave(plot = subclass_kimura_plot, 
        filename = paste0(opt$out_directory, "/", opt$species_name, "_subclass_div_plot.pdf"),
        device = "pdf", 
@@ -86,7 +94,9 @@ ggsave(plot = subclass_kimura_plot,
        units = "mm",
        dpi = 300,
        limitsize = FALSE)
+
 split_subclass_kimura_plot <- kimura_plot + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free")
+
 ggsave(plot = split_subclass_kimura_plot, 
        filename = paste0(opt$out_directory, "/", opt$species_name, "_split_subclass_div_plot.pdf"), 
        device = "pdf", 
@@ -117,46 +127,53 @@ divergence_eg_tes_rounded_for_superfamily_plot <- split(divergence_eg_tes_rounde
 kimura_superfamily_plot_1 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$DNA,
                                     aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
   geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
-  scale_x_reverse(limits = c(-0.01, 0.51),
-                  expand = c(0,0), name = "") +
+  scale_x_reverse(expand = c(0,0), name = "") +
   theme_bw() +
   theme(legend.title=element_blank()) +
   scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
   facet_grid(subclass~., scales = "free") +
   guides(fill=guide_legend(ncol=3))
+if (inherits(try(ggplot_build(kimura_superfamily_plot_1)), "try-error")) 
+  kimura_superfamily_plot_1 <- ggplot()
+
 kimura_superfamily_plot_2 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LINE,
                                     aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
   geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
-  scale_x_reverse(limits = c(-0.01, 0.51),
-                  expand = c(0,0), name = "") +
+  scale_x_reverse(expand = c(0,0), name = "") +
   theme_bw() +
   theme(legend.title=element_blank()) +
   scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
   facet_grid(subclass~., scales = "free") +
   guides(fill=guide_legend(ncol=3)) +
   scale_fill_brewer(palette = "Blues", direction = -1)
+if (inherits(try(ggplot_build(kimura_superfamily_plot_2)), "try-error")) 
+  kimura_superfamily_plot_2 <- ggplot()
+
 kimura_superfamily_plot_3 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$LTR,
                                     aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
   geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
-  scale_x_reverse(limits = c(-0.01, 0.51),
-                  expand = c(0,0), name = "") +
+  scale_x_reverse(expand = c(0,0), name = "") +
   theme_bw() +
   theme(legend.title=element_blank()) +
   scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
   facet_grid(subclass~., scales = "free") +
   guides(fill=guide_legend(ncol=3)) +
   scale_fill_brewer(palette = "Greens", direction = -1)
+if (inherits(try(ggplot_build(kimura_superfamily_plot_3)), "try-error")) 
+  kimura_superfamily_plot_3 <- ggplot()
+
 kimura_superfamily_plot_4 <- ggplot(divergence_eg_tes_rounded_for_superfamily_plot$SINE,
                                     aes(x = KIMURA80, y = KIMURA_SUM, fill = superfamily)) +
   geom_col(position = "stack", width = 0.01, colour = "black", linewidth = 0.2) +
-  scale_x_reverse(limits = c(-0.01, 0.51),
-                  expand = c(0,0), name = "Kimura 2-Parameter Distance") +
+  scale_x_reverse(expand = c(0,0), name = "Kimura 2-Parameter Distance") +
   theme_bw() +
   theme(legend.title=element_blank()) +
   scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) +
   facet_grid(subclass~., scales = "free") +
   guides(fill=guide_legend(ncol=3)) +
   scale_fill_brewer(palette = "YlOrRd", direction = -1)
+if (inherits(try(ggplot_build(kimura_superfamily_plot_4)), "try-error")) 
+  kimura_superfamily_plot_4 <- ggplot()
 
 # Combine plots and title
 superfamily_kimura_plot <- plot_grid(kimura_superfamily_plot_1, kimura_superfamily_plot_2, kimura_superfamily_plot_3, kimura_superfamily_plot_4, 

From 76a256d9d832844c528137dddf79e3540a8a2399 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 12:35:25 +0200
Subject: [PATCH 16/28] Update earlGrey

---
 earlGrey | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/earlGrey b/earlGrey
index 47c853e..a56daf0 100644
--- a/earlGrey
+++ b/earlGrey
@@ -227,12 +227,14 @@ calcDivRL()
 	cd ${OUTDIR}/${species}_RepeatLandscape
 	if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then
 		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
-		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/
-		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
+		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \
+		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \
+		rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/
 	else
 		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
-		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/
-		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff
+		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \
+		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \
+		rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/
 	fi
 }
 

From 78173889f1a8089218df689407e2a80bcb6ca911 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 13:08:22 +0200
Subject: [PATCH 17/28] Update earlGrey

---
 earlGrey | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/earlGrey b/earlGrey
index a56daf0..7e6fb8b 100644
--- a/earlGrey
+++ b/earlGrey
@@ -230,11 +230,13 @@ calcDivRL()
 		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \
 		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \
 		rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/
+		cp ${OUTDIR}/${species}_RepeatLandscape/*.pdf ${OUTDIR}/${species}_summaryFiles/
 	else
 		python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
 		Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \
 		mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \
 		rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/
+		cp ${OUTDIR}/${species}_RepeatLandscape/*.pdf ${OUTDIR}/${species}_summaryFiles/
 	fi
 }
 

From cc60110e78db3ec268482aa6f359380f2add8eca Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 13:17:57 +0200
Subject: [PATCH 18/28] Update divergence_plot.R

---
 scripts/divergenceCalc/divergence_plot.R | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/divergenceCalc/divergence_plot.R b/scripts/divergenceCalc/divergence_plot.R
index 138b74d..e992994 100644
--- a/scripts/divergenceCalc/divergence_plot.R
+++ b/scripts/divergenceCalc/divergence_plot.R
@@ -79,9 +79,10 @@ kimura_plot <- ggplot(divergence_eg_tes_rounded_for_plot,
                       aes(x = KIMURA80, y = KIMURA_SUM, fill = named_subclass)) +
   geom_col(position = "stack", width = 0.01) +
   scale_x_reverse(expand = c(0,0), name = "Kimura 2-Parameter Distance") +
-  theme_bw() +
-  labs(title = plot_title) + theme(plot.title = element_markdown(hjust = 0.5)) +
-  scale_fill_manual(values = col, name = "TE Subclass")
+  theme_classic() +
+  labs(title = plot_title) + 
+  theme(plot.title = element_markdown(hjust = 0.5)) +
+  scale_fill_manual(values = col, name = "TE Classification")
 
 subclass_kimura_plot <- kimura_plot + scale_y_continuous(expand = c(0.01,0), name = "Base pairs")
 

From b7ef76a9fc5913f1cca8695e0769e685655542c8 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 14:41:06 +0200
Subject: [PATCH 19/28] Update divergence_plot.R

---
 scripts/divergenceCalc/divergence_plot.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/divergenceCalc/divergence_plot.R b/scripts/divergenceCalc/divergence_plot.R
index e992994..66390ba 100644
--- a/scripts/divergenceCalc/divergence_plot.R
+++ b/scripts/divergenceCalc/divergence_plot.R
@@ -87,7 +87,7 @@ kimura_plot <- ggplot(divergence_eg_tes_rounded_for_plot,
 subclass_kimura_plot <- kimura_plot + scale_y_continuous(expand = c(0.01,0), name = "Base pairs")
 
 ggsave(plot = subclass_kimura_plot, 
-       filename = paste0(opt$out_directory, "/", opt$species_name, "_subclass_div_plot.pdf"),
+       filename = paste0(opt$out_directory, "/", opt$species_name, "_classification_landscape.pdf"),
        device = "pdf", 
        scale = 1,
        width = 297, 
@@ -99,7 +99,7 @@ ggsave(plot = subclass_kimura_plot,
 split_subclass_kimura_plot <- kimura_plot + scale_y_continuous(name = "Base pairs", labels = function(x) format(x, scientific = TRUE)) + facet_grid(subclass~., scales = "free")
 
 ggsave(plot = split_subclass_kimura_plot, 
-       filename = paste0(opt$out_directory, "/", opt$species_name, "_split_subclass_div_plot.pdf"), 
+       filename = paste0(opt$out_directory, "/", opt$species_name, "_split_class_landscape.pdf"), 
        device = "pdf", 
        scale = 1,
        width = 297, 

From f1ae8f6c0074a409f78fc7cc957847a813346167 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 14:54:07 +0200
Subject: [PATCH 20/28] Update mergeRepeats.R

---
 scripts/mergeRepeats.R | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/mergeRepeats.R b/scripts/mergeRepeats.R
index 51ba09b..0723712 100644
--- a/scripts/mergeRepeats.R
+++ b/scripts/mergeRepeats.R
@@ -97,8 +97,11 @@ if (lowend == "yes") {
 
 # if end coordinate is before start, switch
 filteredRepeatsOut2 <- filteredRepeatsOut[,1:6] %>%
-  mutate(Start = ifelse(End < Start, End, Start),
-         End = ifelse(End < Start, Start, End))
+  mutate(nStart = ifelse(End < Start, End, Start),
+         nEnd = ifelse(End < Start, Start, End)) %>%
+  mutate(Start = nStart,
+         End = nEnd) %>%
+  select(! c(nStart, nEnd))
 
 write.table(filteredRepeatsOut2, file = filtBed, quote = FALSE, row.names = FALSE, sep = "\t", col.names = FALSE)
 

From d8f79408ffeb5aecfd75c8cddadb7a86ae6cdc80 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:10:01 +0200
Subject: [PATCH 21/28] Create build.sh

---
 conda/build.sh | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 conda/build.sh

diff --git a/conda/build.sh b/conda/build.sh
new file mode 100644
index 0000000..ac56305
--- /dev/null
+++ b/conda/build.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+#Based on https://github.com/TobyBaril/EarlGrey/blob/main/configure
+set -x
+
+# Define paths
+PACKAGE_HOME=${PREFIX}/share/${PKG_NAME}-${PKG_VERSION}-${PKG_BUILDNUM}
+SCRIPT_DIR="${PACKAGE_HOME}/scripts/"
+
+
+# Create directories
+mkdir -p ${PREFIX}/bin
+mkdir -p ${PACKAGE_HOME}
+
+
+# Put package in share directory
+cp -r * ${PACKAGE_HOME}/
+
+
+# Install SA-SSR (has to be done here because SA-SSR is an ancient repository without releases)
+git clone https://github.com/ridgelab/SA-SSR
+cd SA-SSR
+make
+cp bin/sa-ssr ${PREFIX}/bin/
+
+
+# Fixes to earlGrey executable
+sed -i.bak "/CONDA_DEFAULT_ENV/,+4d" ${PACKAGE_HOME}/earlGrey  #remove check that conda environment has a specific name
+
+
+# Fixes sed command for executables so that it works on both linux and macos
+sed -i.bak "s|sed -i |sed -i.bak |g" ${PACKAGE_HOME}/earlGrey ${SCRIPT_DIR}/rcMergeRepeat* ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh
+
+
+# Remove -pa from RepeatClassifier
+sed -i.bak 's/RepeatClassifier -pa ${THREADS} /RepeatClassifier /' ${SCRIPT_DIR}/TEstrainer/TEstrainer
+
+
+# Remove -t parameter from sa-ssr (since multithreading doesn't work on OSX)
+sed -i.bak 's/-t ${THREADS} / /' ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh
+sed -i.bak 's/-t ${THREADS} / /' ${SCRIPT_DIR}/TEstrainer/TEstrainer
+
+
+# Add SCRIPT_DIR to correct path
+sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${PACKAGE_HOME}/earlGrey
+sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/rcMergeRepeat*
+sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/headSwap.sh
+sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/autoPie.sh
+sed -i.bak "s|INSERT_FILENAME_HERE|${SCRIPT_DIR}/TEstrainer/scripts/|g" ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh
+
+
+# Set permissions to files
+chmod +x ${PACKAGE_HOME}/earlGrey
+chmod +x ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh
+chmod +x ${SCRIPT_DIR}/* > /dev/null 2>&1
+chmod +x ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7/ltr_finder
+chmod a+w ${SCRIPT_DIR}/repeatCraft/example
+
+
+# Extract tRNAdb
+tar -zxf ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7/tRNAdb.tar.gz --directory ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7
+
+
+# Set PERL5LIB upon activate/deactivate
+for CHANGE in "activate" "deactivate";
+do
+  mkdir -p "${PREFIX}/etc/conda/${CHANGE}.d"
+done
+echo "#!/bin/sh" > "${PREFIX}/etc/conda/activate.d/${PKG_NAME}_activate.sh"
+echo "export PERL5LIB=${PREFIX}/share/RepeatMasker/:${PREFIX}/share/RepeatModeler/" >> "${PREFIX}/etc/conda/activate.d/${PKG_NAME}_activate.sh"
+echo "#!/bin/sh" > "${PREFIX}/etc/conda/deactivate.d/${PKG_NAME}_deactivate.sh"
+echo "unset PERL5LIB" >> "${PREFIX}/etc/conda/deactivate.d/${PKG_NAME}_deactivate.sh"
+
+
+# Put earlGrey executable in bin
+cd ${PREFIX}/bin
+ln -s ${PACKAGE_HOME}/earlGrey .

From 63f5ad91a62fd44c2c091e8cbda5cfa0753167b1 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:11:12 +0200
Subject: [PATCH 22/28] Create meta.yaml

---
 conda/meta.yaml | 73 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 conda/meta.yaml

diff --git a/conda/meta.yaml b/conda/meta.yaml
new file mode 100644
index 0000000..2b95cd3
--- /dev/null
+++ b/conda/meta.yaml
@@ -0,0 +1,73 @@
+{% set name = "EarlGrey" %}
+{% set version = "4.1.1" %}
+{% set sha256 = "499b39f0887f6b258a0fc7ac8eb4aa1abbc3fbe2e22d412be245c21e2c896381" %}
+
+package:
+  name: {{ name|lower }}
+  version: {{ version }}
+
+source:
+  url: https://github.com/TobyBaril/EarlGrey/archive/refs/tags/v{{ version }}.tar.gz
+  sha256: {{ sha256 }}
+
+build:
+  number: 1
+  run_exports:
+    - {{ pin_subpackage('earlgrey', max_pin='x') }}
+
+requirements:
+  build:
+    - make
+    - {{ compiler('cxx') }}
+  run:
+    - python =3.9
+    - hmmer
+    - trf
+    - cd-hit
+    - genometools-genometools
+    - pandas
+    - ncls =0.0.64
+    - pyfaidx
+    - pyranges 
+    - parallel
+    - repeatmasker >=4.1.4
+    - ltr_retriever
+    - mafft
+    - mreps
+    - ninja-nj
+    - repeatscout
+    - recon
+    - repeatmodeler >=2.0.4
+    - bioconductor-genomeinfodb
+    - bioconductor-genomeinfodbdata
+    - bioconductor-bsgenome
+    - bioconductor-plyranges
+    - r-ape
+    - r-optparse
+    - r-tidyverse
+    - r-plyr
+    - r-viridis
+    - r-cowplot
+    - r-ggtext
+    - bedtools
+    - emboss
+    - pybedtools
+
+test:
+  commands:
+    - earlGrey -h
+
+about:
+  home: https://github.com/TobyBaril/EarlGrey
+  dev_url: https://github.com/TobyBaril/EarlGrey
+  license: OSL-2.1
+  summary: "Earl Grey: A fully automated TE curation and annotation pipeline"
+  description: |
+    Earl Grey is a full-automated transposable element (TE) annotation pipeline,
+    leveraging the most widely-used tools and combining these with a consensus
+    elongation process (BEAT) to better define de novo consensus sequences when
+    annotating new genome assemblies.
+    
+extra:
+  identifiers:
+    - doi:10.1093/molbev/msae068

From 7d3e0a0ea114454ba01766e8aa27e51119c6dfaf Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:12:31 +0200
Subject: [PATCH 23/28] Create publish_conda.yml

---
 .github/workflows/publish_conda.yml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 .github/workflows/publish_conda.yml

diff --git a/.github/workflows/publish_conda.yml b/.github/workflows/publish_conda.yml
new file mode 100644
index 0000000..e849975
--- /dev/null
+++ b/.github/workflows/publish_conda.yml
@@ -0,0 +1,16 @@
+name: publish_conda
+
+on:
+  release:
+    types: [published]
+    
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v1
+    - name: publish-to-conda
+      uses: maxibor/conda-package-publish-action@v1.1
+      with:
+        subDir: 'conda'
+        AnacondaToken: ${{ secrets.ANACONDA_TOKEN }}

From 7f6726537ba0642496ed699b7a56b2a4389fe21b Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:32:20 +0200
Subject: [PATCH 24/28] Bump conda recipe to 4.2.0 and drop sha256 variable

---
 conda/meta.yaml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index 2b95cd3..bbe5e91 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,6 +1,5 @@
 {% set name = "EarlGrey" %}
-{% set version = "4.1.1" %}
-{% set sha256 = "499b39f0887f6b258a0fc7ac8eb4aa1abbc3fbe2e22d412be245c21e2c896381" %}
+{% set version = "4.2.0" %}
 
 package:
   name: {{ name|lower }}

From 4edd24db61794ca240d8d8532e60ca6dbf393b07 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:38:31 +0200
Subject: [PATCH 25/28] Mark conda recipe version as 4.2.0-dev

---
 conda/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index bbe5e91..7cc0bc7 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "EarlGrey" %}
-{% set version = "4.2.0" %}
+{% set version = "4.2.0-dev" %}
 
 package:
   name: {{ name|lower }}

From cda8c7f545d1004e643a89292a97d79d846b9ba7 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Mon, 29 Apr 2024 14:08:12 +0200
Subject: [PATCH 26/28] Switch meta.yaml to local source and fixed version

---
 conda/meta.yaml | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index 7cc0bc7..d5528a7 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,18 +1,15 @@
-{% set name = "EarlGrey" %}
-{% set version = "4.2.0-dev" %}
-
 package:
-  name: {{ name|lower }}
-  version: {{ version }}
+  name: earlgrey
+  version: "4.2.0.dev"
 
 source:
-  url: https://github.com/TobyBaril/EarlGrey/archive/refs/tags/v{{ version }}.tar.gz
-  sha256: {{ sha256 }}
+  path: .
 
 build:
-  number: 1
-  run_exports:
-    - {{ pin_subpackage('earlgrey', max_pin='x') }}
+  # Specify the channels in order of priority
+  channels:
+    - conda-forge
+    - bioconda
 
 requirements:
   build:

From 55ea7092e7909158e24b17bbd5b0b343e4986c92 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Mon, 29 Apr 2024 14:18:06 +0200
Subject: [PATCH 27/28] Delete conda directory

---
 conda/build.sh  | 76 -------------------------------------------------
 conda/meta.yaml | 69 --------------------------------------------
 2 files changed, 145 deletions(-)
 delete mode 100644 conda/build.sh
 delete mode 100644 conda/meta.yaml

diff --git a/conda/build.sh b/conda/build.sh
deleted file mode 100644
index ac56305..0000000
--- a/conda/build.sh
+++ /dev/null
@@ -1,76 +0,0 @@
-#!/bin/bash
-#Based on https://github.com/TobyBaril/EarlGrey/blob/main/configure
-set -x
-
-# Define paths
-PACKAGE_HOME=${PREFIX}/share/${PKG_NAME}-${PKG_VERSION}-${PKG_BUILDNUM}
-SCRIPT_DIR="${PACKAGE_HOME}/scripts/"
-
-
-# Create directories
-mkdir -p ${PREFIX}/bin
-mkdir -p ${PACKAGE_HOME}
-
-
-# Put package in share directory
-cp -r * ${PACKAGE_HOME}/
-
-
-# Install SA-SSR (has to be done here because SA-SSR is an ancient repository without releases)
-git clone https://github.com/ridgelab/SA-SSR
-cd SA-SSR
-make
-cp bin/sa-ssr ${PREFIX}/bin/
-
-
-# Fixes to earlGrey executable
-sed -i.bak "/CONDA_DEFAULT_ENV/,+4d" ${PACKAGE_HOME}/earlGrey  #remove check that conda environment has a specific name
-
-
-# Fixes sed command for executables so that it works on both linux and macos
-sed -i.bak "s|sed -i |sed -i.bak |g" ${PACKAGE_HOME}/earlGrey ${SCRIPT_DIR}/rcMergeRepeat* ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh
-
-
-# Remove -pa from RepeatClassifier
-sed -i.bak 's/RepeatClassifier -pa ${THREADS} /RepeatClassifier /' ${SCRIPT_DIR}/TEstrainer/TEstrainer
-
-
-# Remove -t parameter from sa-ssr (since multithreading doesn't work on OSX)
-sed -i.bak 's/-t ${THREADS} / /' ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh
-sed -i.bak 's/-t ${THREADS} / /' ${SCRIPT_DIR}/TEstrainer/TEstrainer
-
-
-# Add SCRIPT_DIR to correct path
-sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${PACKAGE_HOME}/earlGrey
-sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/rcMergeRepeat*
-sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/headSwap.sh
-sed -i.bak "s|SCRIPT_DIR=.*|SCRIPT_DIR=${SCRIPT_DIR}|g" ${SCRIPT_DIR}/autoPie.sh
-sed -i.bak "s|INSERT_FILENAME_HERE|${SCRIPT_DIR}/TEstrainer/scripts/|g" ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh
-
-
-# Set permissions to files
-chmod +x ${PACKAGE_HOME}/earlGrey
-chmod +x ${SCRIPT_DIR}/TEstrainer/TEstrainer_for_earlGrey.sh
-chmod +x ${SCRIPT_DIR}/* > /dev/null 2>&1
-chmod +x ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7/ltr_finder
-chmod a+w ${SCRIPT_DIR}/repeatCraft/example
-
-
-# Extract tRNAdb
-tar -zxf ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7/tRNAdb.tar.gz --directory ${SCRIPT_DIR}/bin/LTR_FINDER.x86_64-1.0.7
-
-
-# Set PERL5LIB upon activate/deactivate
-for CHANGE in "activate" "deactivate";
-do
-  mkdir -p "${PREFIX}/etc/conda/${CHANGE}.d"
-done
-echo "#!/bin/sh" > "${PREFIX}/etc/conda/activate.d/${PKG_NAME}_activate.sh"
-echo "export PERL5LIB=${PREFIX}/share/RepeatMasker/:${PREFIX}/share/RepeatModeler/" >> "${PREFIX}/etc/conda/activate.d/${PKG_NAME}_activate.sh"
-echo "#!/bin/sh" > "${PREFIX}/etc/conda/deactivate.d/${PKG_NAME}_deactivate.sh"
-echo "unset PERL5LIB" >> "${PREFIX}/etc/conda/deactivate.d/${PKG_NAME}_deactivate.sh"
-
-
-# Put earlGrey executable in bin
-cd ${PREFIX}/bin
-ln -s ${PACKAGE_HOME}/earlGrey .
diff --git a/conda/meta.yaml b/conda/meta.yaml
deleted file mode 100644
index d5528a7..0000000
--- a/conda/meta.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-package:
-  name: earlgrey
-  version: "4.2.0.dev"
-
-source:
-  path: .
-
-build:
-  # Specify the channels in order of priority
-  channels:
-    - conda-forge
-    - bioconda
-
-requirements:
-  build:
-    - make
-    - {{ compiler('cxx') }}
-  run:
-    - python =3.9
-    - hmmer
-    - trf
-    - cd-hit
-    - genometools-genometools
-    - pandas
-    - ncls =0.0.64
-    - pyfaidx
-    - pyranges 
-    - parallel
-    - repeatmasker >=4.1.4
-    - ltr_retriever
-    - mafft
-    - mreps
-    - ninja-nj
-    - repeatscout
-    - recon
-    - repeatmodeler >=2.0.4
-    - bioconductor-genomeinfodb
-    - bioconductor-genomeinfodbdata
-    - bioconductor-bsgenome
-    - bioconductor-plyranges
-    - r-ape
-    - r-optparse
-    - r-tidyverse
-    - r-plyr
-    - r-viridis
-    - r-cowplot
-    - r-ggtext
-    - bedtools
-    - emboss
-    - pybedtools
-
-test:
-  commands:
-    - earlGrey -h
-
-about:
-  home: https://github.com/TobyBaril/EarlGrey
-  dev_url: https://github.com/TobyBaril/EarlGrey
-  license: OSL-2.1
-  summary: "Earl Grey: A fully automated TE curation and annotation pipeline"
-  description: |
-    Earl Grey is a full-automated transposable element (TE) annotation pipeline,
-    leveraging the most widely-used tools and combining these with a consensus
-    elongation process (BEAT) to better define de novo consensus sequences when
-    annotating new genome assemblies.
-    
-extra:
-  identifiers:
-    - doi:10.1093/molbev/msae068

From 72a91547fac0fc72203303c7174525a7ed8a1214 Mon Sep 17 00:00:00 2001
From: Tobias Baril <46785187+TobyBaril@users.noreply.github.com>
Date: Mon, 29 Apr 2024 14:18:19 +0200
Subject: [PATCH 28/28] Delete .github/workflows directory

---
 .github/workflows/publish_conda.yml | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 .github/workflows/publish_conda.yml

diff --git a/.github/workflows/publish_conda.yml b/.github/workflows/publish_conda.yml
deleted file mode 100644
index e849975..0000000
--- a/.github/workflows/publish_conda.yml
+++ /dev/null
@@ -1,16 +0,0 @@
-name: publish_conda
-
-on:
-  release:
-    types: [published]
-    
-jobs:
-  publish:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v1
-    - name: publish-to-conda
-      uses: maxibor/conda-package-publish-action@v1.1
-      with:
-        subDir: 'conda'
-        AnacondaToken: ${{ secrets.ANACONDA_TOKEN }}