Merge pull request #101 from TobyBaril/nextPatch_efficiencyAndNesting
Version 4.2.0 - Increased Efficiency and Patch Fixes
TobyBaril authored Apr 29, 2024
2 parents 6fe5f9e + 72a9154 commit 695a96a
Showing 7 changed files with 462 additions and 48 deletions.
50 changes: 24 additions & 26 deletions earlGrey
@@ -3,7 +3,7 @@
usage()
{
echo " #############################
earlGrey version 4.1.1
earlGrey version 4.2.0
Required Parameters:
-g == genome.fasta
-s == species name
@@ -25,12 +25,6 @@ usage()
earlGrey -g bombyxMori.fasta -s bombyxMori -o /home/toby/bombyxMori/repeatAnnotation/ -t 16
Prerequisites - These must be configured prior to using Earl Grey:
- RepeatMasker (Version 4.1.2)
- Ensure RepeatMasker has been configured with the desired repeat libraries (RepBase and at least Dfam 3.4 are recommended)
- RepeatModeler2
Queries can be sent to:
tobias.baril[at]unine.ch
@@ -69,9 +63,11 @@ prepGenome()
mv ${genome}.tmp.dict ${genome}.dict
sed -i '/^>/! s/[DVHBPE]/N/g' ${genome}.prep
dict=${genome}.dict
genOrig=$genome
genome=${genome}.prep
else
dict=${genome}.dict
genOrig=$genome
genome=${genome}.prep
fi
}
@@ -192,17 +188,6 @@ novoMask()
fi
}

# Subprocess calcDivRL
# Calculate divergence estimates
calcDivRL()
{
cd ${OUTDIR}/${species}_RepeatLandscape
genome_size=$(sed -n '4p' ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl | rev | cut -f1,1 -d ':' | rev | sed 's/ bp.*//g; s/ //g')
align_file=$(readlink -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.align)
calcDivergenceFromAlign.pl -s ${species}.divsum $align_file
div_file=$(readlink -f $OUTDIR/${species}_RepeatLandscape/${species}.divsum)
}

# Subprocess rcMergeRepeats
# Defragment repeat sequences to adjust for insertion times
mergeRep()
@@ -230,10 +215,28 @@ charts()
cd ${OUTDIR}/${species}_summaryFiles/
if [ -f "${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed" ]; then
${SCRIPT_DIR}/autoPie.sh -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed -t ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -p ${OUTDIR}/${species}_summaryFiles/${species}.summaryPie.pdf -o ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.txt
Rscript ${SCRIPT_DIR}/autoLand.R $div_file $genome_size $species ${OUTDIR}/${species}_summaryFiles/${species}.repeatLandscape.pdf
else
${SCRIPT_DIR}/autoPie.sh -i ${OUTDIR}/${species}_mergedRepeats/${species}.filteredRepeats.bed -t ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/$(basename $genome).tbl -p ${OUTDIR}/${species}_summaryFiles/${species}.summaryPie.pdf -o ${OUTDIR}/${species}_summaryFiles/${species}.highLevelCount.txt
Rscript ${SCRIPT_DIR}/autoLand.R $div_file $genome_size $species ${OUTDIR}/${species}_summaryFiles/${species}.repeatLandscape.pdf
fi
}

# Subprocess calcDivRL
# Calculate divergence estimates
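# NOTE: the two branches below differ only in the -l repeat library given to divergence_calc.py ($latestFile when no custom library was requested, otherwise the combined curated library)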
calcDivRL()
{
cd ${OUTDIR}/${species}_RepeatLandscape
if [ -z "$RepSpec" ] && [ -z "$startCust" ]; then
python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l $latestFile -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \
mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \
rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/
cp ${OUTDIR}/${species}_RepeatLandscape/*.pdf ${OUTDIR}/${species}_summaryFiles/
else
python ${SCRIPT_DIR}/divergenceCalc/divergence_calc.py -l ${OUTDIR}/${species}_Curated_Library/${species}_combined_library.fasta -g $genOrig -i ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff -o ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -t $ProcNum
Rscript ${SCRIPT_DIR}/divergenceCalc/divergence_plot.R -s $species -g ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff -o ${OUTDIR}/${species}_RepeatLandscape/ && \
mv ${OUTDIR}/${species}_RepeatLandscape/${species}.filteredRepeats.withDivergence.gff ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.gff && \
rm -rf ${OUTDIR}/${species}_RepeatLandscape/tmp/
cp ${OUTDIR}/${species}_RepeatLandscape/*.pdf ${OUTDIR}/${species}_summaryFiles/
fi
}

@@ -485,7 +488,6 @@ if [ ! -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl ]; then
if [ ! -f ${OUTDIR}/${species}_RepeatMasker_Against_Custom_Library/*.tbl ]; then
echo "ERROR: RepeatMasker failed, please check logs" && exit 2
fi
calcDivRL
sleep 1
else
stage="Final masking already complete, skipping..." && runningTea
@@ -495,11 +497,6 @@ else
fi

# Stage 6

##TODO
#### find a way to rename variable in table that is less RAM intensive than the current method in python
#### i think this is inside rcMergeRepeatsLoose

if [ ! -f ${OUTDIR}/${species}_mergedRepeats/looseMerge/${species}.filteredRepeats.bed ] ; then
stage="Defragmenting Repeats" && runningTea
mergeRep
@@ -513,6 +510,7 @@ fi
# Stage 7
stage="Generating Summary Plots" && runningTea
charts
calcDivRL
sleep 1

# Stage 8
4 changes: 2 additions & 2 deletions scripts/autoPie.R
@@ -1,7 +1,7 @@
# load libraries

library(tidyverse)
library(data.table)
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))

# set options

Expand Down
223 changes: 223 additions & 0 deletions scripts/divergenceCalc/divergence_calc.py
@@ -0,0 +1,223 @@
import os
from os.path import exists, getsize
import sys
import argparse
import pandas as pd
import multiprocessing
import pybedtools
import subprocess
import shlex
import shutil
from Bio import AlignIO, SeqIO
from math import log, sqrt
from functools import partial
from time import time
from re import sub

parser = argparse.ArgumentParser()
parser.add_argument('-l', '--repeat_library', type=str, required=True,
help='repeat_library')
parser.add_argument('-i', '--in_gff', type=str, required=True,
help='Path to gff')
parser.add_argument('-g', '--genome', type=str, required=True,
help='Path to genome')
parser.add_argument('-o', '--out_gff', type=str, required=True,
help='Output gff')
parser.add_argument('-tmp', '--temp_dir', type=str, default='tmp/',
help='Temporary directory')
parser.add_argument('-t', '--cores', type=int, default=4,
help='Number of cores')
parser.add_argument('-k', '--timeout', type=int, default=30,
help='Seconds after which water will be cancelled and repeat treated as unalignable')

args = parser.parse_args()

def file_check(repeat_library, in_gff, genome, out_gff, temp_dir):
if not (exists(repeat_library) and exists(in_gff) and exists(genome)):
sys.exit('Input file(s) not found. Requires the repeat library, the genome, and the GFF of repeat coordinates and annotations')
os.makedirs(temp_dir, exist_ok=True)
os.makedirs(temp_dir+"/qseqs", exist_ok=True)
os.makedirs(temp_dir+"/split_library/", exist_ok=True)

def splitter(in_seq, temp_dir):
with open(in_seq, 'r') as handle:
for record in SeqIO.parse(handle, "fasta"):
repeat_name = record.name.split(sep="#")[0]
repeat_name = repeat_name.lower()
file_name = (temp_dir+"/split_library/"+repeat_name+".fasta")
SeqIO.write(record, file_name, "fasta-2line")

def parse_gff(in_gff):
gff = pd.read_table(in_gff, header = None, names=['seqnames', 'tool', 'repeat_class', 'start', 'end', 'score', 'strand', 'phase', 'metadata'])
simple_gff = gff[gff['repeat_class'].str.contains('Simple_repeat|Satellite|Low_complexity')].reset_index()
gff = gff[~gff['repeat_class'].str.contains('Simple_repeat|Satellite|Low_complexity')].reset_index()
gff['metadata_tmp'] = gff['metadata'].str.replace(';SHORTTE.*', '', regex=True)
gff[['tstart', 'tend', 'repeat_family']] = gff['metadata_tmp'].str.split(';', n=3, expand=True)
gff = gff.drop(columns = ['metadata_tmp', 'tstart', 'tend'])
gff['repeat_family'] = gff['repeat_family'].str.replace('ID=', '', regex=True)
gff['repeat_family'] = gff['repeat_family'].str.lower()
return(gff, simple_gff)
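# Illustrative example (metadata layout inferred from the parsing above):
# a metadata field such as "123;456;ID=Gypsy12;SHORTTE=T" first loses the
# ";SHORTTE..." suffix, then splits on ';' into tstart, tend and repeat_family,
# and repeat_family becomes "gypsy12" after stripping "ID=" and lowercasing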

def file_name_generator():
import random
import string
file_name = ''.join(random.sample(string.ascii_letters, 12))+'.tmp'
return(file_name)

def Kimura80(qseq, sseq):
"""
Calculations adapted from https://github.com/kgori/python_tools_on_github/blob/master/pairwise_distances.py
"""
# define transitions, transversions, matches
transitions = [ "AG", "GA", "CT", "TC"]
transversions = [ "AC", "CA", "AT", "TA",
"GC", "CG", "GT", "TG" ]
matches = [ "AA", "GG", "CC", "TT"]
# set counters to 0
m,ts,tv=0,0,0
# count transitions, transversions, matches
for i, j in zip(qseq, sseq):
if i+j in matches: m+=1
if i+j in transitions: ts+=1
if i+j in transversions: tv+=1
# count number of bp which align (excludes gaps, Ns)
aln_len = m + ts + tv
# calculate p and q
p = ts/aln_len
q = tv/aln_len

# calculate Kimura distance
Kimura_dist = -0.5 * log((1 - 2*p - q) * sqrt( 1 - 2*q ))

return(Kimura_dist)
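# Worked check (illustrative values, not from the original source): comparing
# qseq = "ACGT" with sseq = "ACGA" gives 3 matches and 1 transversion (T/A),
# so aln_len = 4, p = 0/4 = 0.0, q = 1/4 = 0.25, and
# Kimura_dist = -0.5 * log((1 - 0 - 0.25) * sqrt(1 - 0.5)) ≈ 0.3171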

def outer_func(genome_path, temp_dir, timeoutSeconds, gff):
generated_name = file_name_generator()
holder_file_name = temp_dir+generated_name
failed_file_name = temp_dir+"failed_"+generated_name
with open(holder_file_name, 'w') as tmp_out:
header = list(gff.columns.values)[1:] + ["Kimura"]
header = "\t".join(header)+"\n"
tmp_out.write(header)
for row in gff.iterrows():
# Set index
idx = row[0]
# Set scaffold, coordinates, strand, repeat family
seqnames, start, end, strand, repeat_family = row[1]['seqnames'], str(row[1]['start'] - 1), str(row[1]['end']), row[1]['strand'], row[1]['repeat_family']
# Create BED string for BEDtools
bed_str = " ".join([seqnames, start, end, ".", ".", strand])
# Set path for query sequence
query_path = temp_dir+"/qseqs/"+str(idx)
# Build a BED interval and extract the query sequence with BEDtools getfasta
a=pybedtools.BedTool(bed_str, from_string=True)
a = a.sequence(fi=genome_path, fo=query_path, s=True)
# Set path to subject sequence
subject_path=temp_dir+"/split_library/"+repeat_family+".fasta"
# Run water, with timeout exception
test_command = shlex.split("water "+query_path+" "+subject_path+" -gapopen 10 -gapextend 0.5 -outfile "+query_path+".water -aformat fasta")
# Run the alignment and kill it if it runs longer than timeoutSeconds (default 30)
alignment_p = subprocess.Popen(test_command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
try:
alignment_p.wait(timeoutSeconds)
except subprocess.TimeoutExpired:
# if water fails to complete before timeout, kill and move on
with open(failed_file_name, "a") as failed_file:
failed_file.write(seqnames+":"+start+"-"+end+"_"+strand+"_"+repeat_family+"\n")
alignment_p.kill()
if not exists(query_path+".water") or getsize(query_path+".water") == 0:
# If no alignment is possible, set distances to NA and alignment length to 0
Kdist = "NA"
os.remove(query_path)
if exists(query_path+".water"):
os.remove(query_path+".water")
else:
# Read in alignments
aln = AlignIO.read(query_path+".water", 'fasta')
# Calculate distances based on model
Kdist = Kimura80(str(aln[0].seq).upper(), str(aln[1].seq).upper())
# Convert numbers to strings
Kdist = str(round(Kdist, 4))
# Delete temporary files
os.remove(query_path+".water")
os.remove(query_path)
# Make line for temporary file and write to file
tmp_holder = row[1].to_list()[1:]
tmp_holder = "\t".join(str(x) for x in tmp_holder)+"\t"+Kdist+"\n"
tmp_out.write(tmp_holder)

return(holder_file_name)
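# Design note: each worker writes its annotated rows to a uniquely named
# temporary file and returns only that path, so results travel back to the
# parent process as lightweight file names rather than large DataFrames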

def tmp_out_parser(file_list, simple_gff):
# Loop through results
gff=pd.DataFrame()
for file in file_list:
# read in gff
in_gff = pd.read_csv(file, sep = "\t")
# concatenate gff
gff = pd.concat([gff, in_gff], ignore_index=True)
# Convert numbers to strings for concatenation
gff['Kimura'] = gff['Kimura'].astype(str)
# Append the Kimura distance to the metadata column
gff['metadata'] = gff['metadata'] + ";KIMURA80=" + gff['Kimura']
# Remove unnecessary columns
gff = gff.drop(columns = ['Kimura', 'repeat_family'])
# Add the simple repeats back, sort, and drop leftover index columns
gff = pd.concat([gff, simple_gff], ignore_index=True)
gff = gff.sort_values(by=['seqnames', 'start'])
gff = gff.reset_index()
gff = gff.drop(columns = ['level_0', 'index'])

return(gff)

if __name__ == "__main__":

start_time = time()

# check files exist
file_check(args.repeat_library, args.in_gff, args.genome, args.out_gff, args.temp_dir)

# split library file
print("Splitting repeat library")
splitter(args.repeat_library, args.temp_dir)

# read in gff and separate out the simple repeats
print("Reading in gff")
in_gff, simple_gff = parse_gff(args.in_gff)

# create as many processes as instructed cores
num_processes = args.cores

# calculate the chunk size as an integer (minimum 1, so a gff with fewer rows than cores still gives a valid range step)
chunk_size = max(1, int(in_gff.shape[0]/num_processes))

# break into chunks
chunks = [in_gff.iloc[in_gff.index[i:i + chunk_size]] for i in range(0, in_gff.shape[0], chunk_size)]
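# For example, 10,000 rows on 4 cores gives chunk_size 2500 and 4 chunks;
# with 10,001 rows the final step of range() yields a fifth, single-row chunk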

print("Starting calculations")
# Perform calculations in parallel
func = partial(outer_func, args.genome, args.temp_dir, args.timeout)
pool = multiprocessing.Pool(processes=num_processes)
results = pool.map(func, chunks)
pool.close()
pool.join()
print("Finished calculations")

# Free up memory (necessary with very large gffs and low memory machines)
del chunks
del in_gff

# Read in temp files, fix metadata, add simple repeats back, and sort
calc_gff = tmp_out_parser(results, simple_gff)

# write to file
calc_gff.to_csv(args.out_gff, sep = "\t", header = False, index=False)

# print run time for number of rows
run_time = time() - start_time
print("Total run time for ", len(calc_gff), " rows was ", run_time, " seconds")

# Delete folder of split library
shutil.rmtree(args.temp_dir+"/split_library/", ignore_errors=True)
