Skip to content

Commit

Permalink
Merge pull request #6 from andrewjpage/reference_genome_file
Browse files Browse the repository at this point in the history
Reference genome file
  • Loading branch information
andrewjpage authored Jun 14, 2019
2 parents c08167e + e8b3209 commit 2ad8385
Show file tree
Hide file tree
Showing 13 changed files with 140 additions and 45 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.0
2.1.0
1 change: 1 addition & 0 deletions scripts/socru
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ parser.add_argument('--threads', '-t', help='No. of threads to use', type=int,

# Output
parser.add_argument('--output_file', '-o', help='Output filename, defaults to STDOUT', type=str)
parser.add_argument('--output_plot_file', '-p', help='Filename of plot of genome structure', type=str, default = 'genome_structure.pdf')
parser.add_argument('--novel_profiles', '-n', help='Filename for novel profiles', type=str, default = 'profile.txt.novel')
parser.add_argument('--new_fragments', '-f', help='Filename for novel fragments', type=str, default = 'novel_fragments.fa')
parser.add_argument('--top_blast_hits', '-b', help='Filename for top blast hits', type=str)
Expand Down
8 changes: 5 additions & 3 deletions socru/Fragment.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@

class Fragment:
def __init__(self, coords):
def __init__(self, coords, sequence = "", number = 0, reversed_frag = False, dna_A = False):
self.coords = coords
self.sequence = ""
self.number = 0
self.sequence = sequence
self.number = number
self.reversed_frag = reversed_frag
self.dna_A = dna_A

def num_bases(self):
return len(self.sequence)
Expand Down
2 changes: 2 additions & 0 deletions socru/FragmentFiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def fragments_with_largest_first(self):
if m:
reordered_fragments[i].number = m.group(1)
reordered_fragments[i].sequence = reordered_fragments[i].sequence.reverse_complement()
reordered_fragments[i].reversed = True

else:
reordered_fragments[i].number = input_order[i]
Expand All @@ -48,6 +49,7 @@ def create_fragment_fastas(self):
record = [SeqRecord(f.sequence, str(f) , '', '')]
outname = os.path.join(self.output_directory, f.output_filename())
self.output_filenames.append(outname)
f.output_filename = outname
SeqIO.write(record, outname, "fasta")

def split_fragment_order(self):
Expand Down
23 changes: 23 additions & 0 deletions socru/GATProfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,29 @@ def is_profile_in_correct_orientation(self):
# dnaA hasnt been found so really should raise an exception
return True

def reorder_fragment_objects_based_on_fragment_name_array(self, fragment_objects):
reordered_fragment_objects = []
# inefficient but small searches
# need to check for unmatched
for frag_name in self.fragments:
frag_name_orientationless = frag_name
frag_name_reversed = False
m = re.match(r"([\d]+)'", frag_name)
if m:
frag_name_orientationless = m.group(1)
frag_name_reversed = True

for frag_obj in fragment_objects:
if str(frag_obj.number) == str(frag_name_orientationless):
if frag_name_reversed:
frag_obj.reversed_frag = True
else:
frag_obj.reversed_frag = False
reordered_fragment_objects.append(frag_obj)

return reordered_fragment_objects


def orientate_for_dnaA(self):
if not self.is_profile_in_correct_orientation():
self.fragments = self.invert_fragments()
Expand Down
35 changes: 35 additions & 0 deletions socru/PlotProfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import matplotlib.pyplot as plt

class PlotProfile:

def __init__(self, fragments, output_file):
self.fragments = fragments
self.output_file = output_file

def total_bases(self):
return sum([f.num_bases() for f in self.fragments])

# make this look pretty
def create_plot(self):
names = [str(f.number) for f in self.fragments]
size = [int(f.num_bases()) for f in self.fragments]
reversed_fragments = [f.reversed_frag for f in self.fragments]
dna_A = [f.dna_A for f in self.fragments]

for i in range(len(names)):
if dna_A[i]:
names[i] += " - Ori"

my_circle=plt.Circle( (0,0), 0.7, color='white')

piechart = plt.pie(size, labels=names, wedgeprops = { 'linewidth' : 7, 'edgecolor' : 'white' })

# set hatching pattern
for i in range(len(piechart[0])):
if reversed_fragments[i]:
piechart[0][i].set_hatch('+')
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.savefig( self.output_file )


5 changes: 3 additions & 2 deletions socru/ProfileGenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
from socru.DnaA import DnaA

class ProfileGenerator:
def __init__(self, output_directory, num_fragments, dnaa_fasta, threads, prefix = 'GS', output_filename = 'profile.txt', metadata_file_suffix = '.yml',):
def __init__(self, output_directory, num_fragments, dnaa_fasta, threads, input_file, prefix = 'GS', output_filename = 'profile.txt', metadata_file_suffix = '.yml',):
self.output_directory = output_directory
self.output_filename = os.path.join(self.output_directory, output_filename)
self.input_file = input_file
self.prefix = prefix
self.num_fragments = num_fragments
self.dnaa_fasta = dnaa_fasta
Expand Down Expand Up @@ -37,7 +38,7 @@ def write_output_file(self):
def write_metadata_file(self):
d = DnaA(self.dnaa_fasta, self.output_directory, self.threads)
metadata_file = self.output_filename + self.metadata_file_suffix
metadata_content = { 'dnaa_fragment': int(d.fragment_with_dnaa), 'dnaa_forward_orientation': d.forward_orientation }
metadata_content = { 'dnaa_fragment': int(d.fragment_with_dnaa), 'dnaa_forward_orientation': d.forward_orientation, 'reference_genome': self.input_file }

with open(metadata_file, 'w') as yaml_file:
yaml.dump(metadata_content, yaml_file, default_flow_style=False)
Expand Down
34 changes: 26 additions & 8 deletions socru/Socru.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from socru.GATProfile import GATProfile
from socru.TypeGenerator import TypeGenerator
from socru.Schemas import Schemas
from socru.PlotProfile import PlotProfile

class Socru:
def __init__(self,options):
Expand All @@ -32,6 +33,7 @@ def __init__(self,options):
self.new_fragments = options.new_fragments
self.max_bases_from_ends = options.max_bases_from_ends
self.top_blast_hits = options.top_blast_hits
self.output_plot_file = options.output_plot_file
self.dirs_to_cleanup = []
self.top_results = []

Expand Down Expand Up @@ -68,7 +70,7 @@ def output_results(self, input_file, profile_type):
with open(self.output_file, "a+") as output_fh:
output_fh.write(input_file + "\t" + profile_type + "\n")

def run_analysis(self, input_file, p, d):
def find_rrna_boundries(self, input_file):
# run the fasta through barrnap
fd, barrnap_outputfile = mkstemp()
b = Barrnap(input_file, self.threads)
Expand All @@ -78,6 +80,11 @@ def run_analysis(self, input_file, p, d):

boundries = b.read_barrnap_output(barrnap_outputfile)
os.close(fd)
return boundries

# refactor
def run_analysis(self, input_file, p, d):
boundries = self.find_rrna_boundries(input_file)

f = Fasta(input_file, is_circular = self.is_circular)
fragments = f.calc_fragment_coords( boundries)
Expand All @@ -93,27 +100,38 @@ def run_analysis(self, input_file, p, d):
blast = Blast(d.db_prefix, self.threads)

gat_profile = GATProfile(fragments = [])
for fasta_file in ff.output_filenames:
for current_fragment in ff.ordered_fragments:
fasta_file = current_fragment.output_filename

blast_results = blast.run_blast(fasta_file)
fb = FilterBlast(blast_results, self.min_bit_score, self.min_alignment_length)
top_result = fb.return_top_result()
if top_result is None:
gat_profile.fragments.append('?')
fasta_file
current_fragment.number = '?'

with open(fasta_file, "r") as fasta_file_fh:
with open(self.new_fragments, "a+") as newfrag_fh:
newfrag_fh.write(fasta_file_fh.read())
continue
else:
self.top_results.append(top_result)

if top_result.is_forward():
gat_profile.fragments.append( str(top_result.subject))
else:
gat_profile.fragments.append( str(top_result.subject)+ '\'')

current_fragment.number = str(top_result.subject)
if str(p.dnaA_fragment_number) == str(current_fragment.number):
current_fragment.dna_A = True

if top_result.is_forward():
gat_profile.fragments.append( str(top_result.subject))
else:
current_fragment.reversed_frag = True
gat_profile.fragments.append( str(top_result.subject)+ '\'')

gat_profile.orientate_for_dnaA()
reordered_frag_objs = gat_profile.reorder_fragment_objects_based_on_fragment_name_array( ff.ordered_fragments )
pp = PlotProfile(reordered_frag_objs, self.output_plot_file)
pp.create_plot()

# lookup the gat_profile to get the number
tg = TypeGenerator(p, gat_profile)
type_output_string = tg.calculate_type() + "\t" + str(gat_profile)
Expand Down
2 changes: 1 addition & 1 deletion socru/SocruCreate.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def run(self):
ff.create_fragment_fastas()

# create a default profile.txt file
default_profile = ProfileGenerator(self.output_directory, len(ff.ordered_fragments), self.dnaa_fasta, self.threads)
default_profile = ProfileGenerator(self.output_directory, len(ff.ordered_fragments), self.dnaa_fasta, self.threads, self.input_file )
default_profile.write_output_file()

def __del__(self):
Expand Down
27 changes: 0 additions & 27 deletions socru/data/Salmonella_enterica/profile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,30 +25,3 @@ GS Frag_1 Frag_2 Frag_3 Frag_4 Frag_5 Frag_6 Frag_7
17.0 1 2 3 5 4 6 7
17.122 1 7' 6' 4' 5' 3 2'
18.83 1' 7' 5' 3 4 6 2'
GS Frag_1 Frag_2 Frag_3 Frag_4 Frag_5 Frag_6 Frag_7
1.0 1 2 3 4 5 6 7
1.123 1' 7' 6' 5' 4' 3 2'
2.66 1 7' 3 5 6 4 2'
2.67 1' 7' 3 5 6 4 2'
3.67 1' 7' 3 4 6 5 2'
4.0 1 2 3 6 4 5 7
4.1 1' 2 3 6 4 5 7
5.65 1' 7' 2 3 4 5 6
6.66 1 7' 3 5 4 6 2'
6.67 1' 7' 3 5 4 6 2'
7.56 1 2 4' 5' 6' 3 7
8.67 1' 7' 3 4 5 6 2'
9.2 1 3 4 5 6 7 2'
10.74 1 7' 4' 3 5 6 2'
11.0 1 2 3 5 6 4 7
11.1 1' 2 3 5 6 4 7
11.122 1 7' 4' 6' 5' 3 2'
12.1 1' 2 3 4 6 5 7
13.17 1' 5' 2 3 6 4 7
14.0 1 2 3 6 5 4 7
14.1 1' 2 3 6 5 4 7
15.17 1' 2 5' 3 6 4 7
16.33 1' 2 6' 3 5 4 7
17.0 1 2 3 5 4 6 7
17.122 1 7' 6' 4' 5' 3 2'
18.83 1' 7' 5' 3 4 6 2'
39 changes: 39 additions & 0 deletions socru/tests/PlotProfile_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import unittest
import os
import shutil
import filecmp
from socru.PlotProfile import PlotProfile
from socru.Fragment import Fragment

test_modules_dir = os.path.dirname(os.path.realpath(__file__))
data_dir = os.path.join(test_modules_dir, 'data','plot_profile')

class TestPlotProfile(unittest.TestCase):

def test_plot_profile(self):
fragments = []
fragments.append(Fragment([], sequence = "AAAA", number = 1, reversed_frag = False, dna_A = False))
fragments.append(Fragment([], sequence = "AAA", number = 3, reversed_frag = False, dna_A = False))
fragments.append(Fragment([], sequence = "AAAAAAAAAA", number = 2, reversed_frag = False, dna_A = True))
fragments.append(Fragment([], sequence = "A", number = 4, reversed_frag = False, dna_A = False))

p = PlotProfile(fragments, 'plot.pdf')
p.create_plot()
self.assertTrue(os.path.exists('plot.pdf'))
os.remove('plot.pdf')

def test_plot_profile_some_reversed(self):
fragments = []
fragments.append(Fragment([], sequence = "AAAA", number = 1, reversed_frag = True, dna_A = True))
fragments.append(Fragment([], sequence = "AAA", number = 3, reversed_frag = False, dna_A = False))
fragments.append(Fragment([], sequence = "AAAAAAAAAA", number = 2, reversed_frag = True, dna_A = False))
fragments.append(Fragment([], sequence = "A", number = 4, reversed_frag = False, dna_A = False))

p = PlotProfile(fragments, 'plot2.pdf')
p.create_plot()

self.assertTrue(os.path.exists('plot2.pdf'))
os.remove('plot2.pdf')



2 changes: 1 addition & 1 deletion socru/tests/ProfileGenerator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class TestProfileGenerator(unittest.TestCase):

def test_profile_generator(self):
p = ProfileGenerator( os.path.join(data_dir,'database'), 7, os.path.join(data_dir, 'dnaA.fa.gz'), 1)
p = ProfileGenerator( os.path.join(data_dir,'database'), 7, os.path.join(data_dir, 'dnaA.fa.gz'), 1, 'reffile.fa')
p.write_output_file()

self.assertTrue(os.path.exists(os.path.join(data_dir,'database', 'profile.txt')))
Expand Down
5 changes: 3 additions & 2 deletions socru/tests/Socru_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
data_dir = os.path.join(test_modules_dir, 'data','socru')

class TestOptions:
def __init__(self, input_files, species, db_dir, min_bit_score,min_alignment_length, threads, output_file, not_circular, novel_profiles, new_fragments, max_bases_from_ends,top_blast_hits ):
def __init__(self, input_files, species, db_dir, min_bit_score,min_alignment_length, threads, output_file, not_circular, novel_profiles, new_fragments, max_bases_from_ends,top_blast_hits, output_plot_file ):
self.input_files = input_files
self.species = species
self.db_dir = db_dir
Expand All @@ -21,11 +21,12 @@ def __init__(self, input_files, species, db_dir, min_bit_score,min_alignment_len
self.new_fragments = new_fragments
self.max_bases_from_ends = max_bases_from_ends
self.top_blast_hits = top_blast_hits
self.output_plot_file = output_plot_file

class TestSocru(unittest.TestCase):

def test_socru_valid(self):
g = Socru(TestOptions([os.path.join(data_dir, 'test.fa')], 'Salmonella_enterica', None, 1000,1000,1, 'output_file', False, 'novel', 'newfrag.fa', None, 'blast'))
g = Socru(TestOptions([os.path.join(data_dir, 'test.fa')], 'Salmonella_enterica', None, 1000,1000,1, 'output_file', False, 'novel', 'newfrag.fa', None, 'blast', 'output_plot.png'))
g.run()
self.assertTrue(os.path.exists('output_file'))
self.assertTrue(os.path.exists('blast'))
Expand Down

0 comments on commit 2ad8385

Please sign in to comment.