Removed the --save option (just use --prefix to supply the save directory).
oleraj committed Feb 11, 2017
1 parent e51fe44 commit 2137cea
Showing 3 changed files with 40 additions and 53 deletions.
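
For orientation, here is a minimal sketch (not part of the commit) of how a directory-bearing --prefix now supplies the save location; the invocation and paths below are assumed example values, not taken from the diff:

# Hypothetical invocation (illustrative values only):
#   perl convert_reads_to_amino_acid.pl -i reads.fasta -r ref_cds.fasta --prefix results/run1/Convert_reads
# Output paths are built by appending a suffix to $prefix, so "results/run1"
# plays the role the --save directory used to (and must already exist):
my $prefix            = "results/run1/Convert_reads";   # assumed example value
my $nucleotide_report = $prefix . ".nuc.tally.xls";      # results/run1/Convert_reads.nuc.tally.xls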
53 changes: 21 additions & 32 deletions convert_reads_to_amino_acid.pl
@@ -18,7 +18,6 @@
use Getopt::Long;
use Data::Dumper;
use File::Basename;
use Bio::DB::Sam; #http://search.cpan.org/~lds/Bio-SamTools-1.37/lib/Bio/DB/Sam.pm
use Bio::SeqIO;
use Array::Utils qw(:all);
use File::Slurp qw( prepend_file );
@@ -27,12 +26,6 @@
use Bio::Tools::Run::Alignment::Clustalw;
use File::Which;

#use Bio::Perl;
#use Getopt::Std;
#use PostData;
#use Fasta_utils;
#use feature ":5.10"; #allows say (same as print, but adds "\n"), given/when switch, etc.


# Andrew J. Oler, PhD
# Computational Biology Section
@@ -114,21 +107,19 @@
clustalw (if using --clustalw)
Non-standard Perl modules:
Bioperl, including Bio::DB::Sam and Bio::Tools::Run::Alignment::Clustalw
Bioperl, including Bio::Tools::Run::Alignment::Clustalw
Array::Utils
Parallel::Loops
Required Arguments:
-i/--input Input file, comma-delimited. Required.
-i/--input Input FASTA file. Required.
-r/--ref Reference coding (CDS) sequence fasta file. Required. Used to determine
translation frame of reference.
Optional Arguments:
-l/--label Name for sample. Default: Sample_1
--prefix Prefix for output files. Default = Convert_reads
-s/--save Directory in which to save files. Default = pwd. If folder doesn\'t exist, it
will be created.
--prefix Prefix for output files. Default = Convert_reads. If the prefix value
contains a directory, the directory must exist.
-p/--cpu Number of processors to use. Default = 1.
--clustalw Use clustalw to run alignments. Default is to use mafft. *Executables must be
on the PATH.* Speed is about the same for clustalw and mafft (about 1.3x faster
@@ -148,8 +139,8 @@
1. Recommended h_vmem (RAM) value per thread is 6G, although it may be fine with less,
depending on the size and complexity of your input file.
2. If running more than 20,000 sequences, it is recommended to split the file and run each
separately, saving the output into a separate directory, then merging the results with
merge_tally.pl.
separately, saving the output into a single directory, then merging the results with
merge_tally.pl
";


@@ -272,16 +263,14 @@
# Resulting simplifications:
# 1. Move global variables to subroutines, including filehandles for output files (tally.xls, variants, etc.)
# 2. Move some subroutines to module primerid.pm so they can be used by merge_tally.pl as well.
# 2017-02-11
# Removed --save option. Just include any directory in the --prefix. ($prefix can contain absolute or relative path).
# -s/--save Directory in which to save files. Default = pwd. If folder doesn\'t exist, it
# will be created.


unless ($ref && ($input||$ARGV[0]) ){ print STDERR "$usage\n"; exit; } #fasta and gff are required

my $save_dir = $save || Cwd::cwd();
# print $save_dir;
# exit;
unless (-d $save_dir){ mkdir($save_dir) or warn "$!\n" }
# warn "save directory: $save_dir\n";

my $start_time = time;

if ($label =~ m/,/){
@@ -332,15 +321,15 @@



my $unique_seqs_file = get_unique_seqs($input);
my $unique_seqs_file = get_unique_seqs($input, $prefix);

my $nuc_aa_codon_tally = read_input_reads($unique_seqs_file);
my $nuc_aa_codon_tally = read_input_reads($unique_seqs_file, $prefix);
# print Dumper($nuc_aa_codon_tally); exit;
print_reports($nuc_aa_codon_tally, $label, $gene, \@NUC, \@CODON, \@AA, $cds, \%converter, $save_dir, $prefix, $verbose, $debug);
print_reports($nuc_aa_codon_tally, $label, $gene, \@NUC, \@CODON, \@AA, $cds, \%converter, $prefix, $verbose, $debug);

print_merged_report($nuc_aa_codon_tally, $label, $gene, \@CODON, $cds, $save_dir, $prefix);
print_merged_report($nuc_aa_codon_tally, $label, $gene, \@CODON, $cds, $prefix);

print_variants($nuc_aa_codon_tally, $label, $gene, $variant_threshold, $save_dir, $prefix, $start_time, $verbose);
print_variants($nuc_aa_codon_tally, $label, $gene, $variant_threshold, $prefix, $start_time, $verbose);



@@ -350,7 +339,7 @@
#---------------------------------- SUBS -------------------------------------
#-----------------------------------------------------------------------------
sub get_unique_seqs {
my $file = shift;
my ($file, $prefix) = @_;

# Get the unique sequences in a fasta file with counts and a hash with number of times the sequence was found in the file = "count".
# This is designed to speed up the computation significantly, since many of the reads will be identical.
@@ -367,9 +356,9 @@ sub get_unique_seqs {
# ...
# The id has seq number - count ; it's sorted by most common to least common sequence
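# For illustration only (not from the commit), a record in the unique-seqs fasta might
# look like this, assuming the "seq number - count" id convention described above:
#   >1-523
#   ACGT...ACGT   (most common sequence, observed 523 times)
#   >2-310
#   ACGT...ACGT   (second most common, observed 310 times)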

my ($filename,$dir,$ext) = fileparse($file,@SUFFIXES);
#my ($filename,$dir,$ext) = fileparse($file,@SUFFIXES);
# my $unique_seqs_fasta = $dir . $filename . ".uniq" . $ext; # Make it a real file, not a temp file
my $unique_seqs_fasta = $save_dir . "/$filename" . ".uniq" . $ext; # Make it a real file, not a temp file
my $unique_seqs_fasta = $prefix . ".uniq.fasta"; # Make it a real file, not a temp file

#my $check_for_fasta_collapser = which("fasta_collapser.pl"); # returns undef if not found on system.
#if ($check_for_fasta_collapser){
@@ -386,7 +375,7 @@ sub get_unique_seqs {
}
#-----------------------------------------------------------------------------
sub read_input_reads {
my $file = shift;
my ($file, $prefix) = @_;
my $nuc_aa_codon_tally;
print STDERR "Processing file $file ...\n";

@@ -658,11 +647,11 @@ sub read_input_reads {
# Prepare the output files of cleaned reads and full peptides
my ($filename,$dir,$ext) = fileparse($file,@SUFFIXES);

my $newfilename = $save_dir . "/" . $filename . ".cleanreads.txt";
my $newfilename = $prefix . ".cleanreads.txt";
my $clean_reads_fh = open_to_write("$newfilename");
print $clean_reads_fh "#readID|gene|positionForFirstNucleotideBase\tcleanRead\n";

my $fullpeptide = $save_dir . "/" . $filename . ".cleanpeptides.txt";
my $fullpeptide = $prefix . ".cleanpeptides.txt";
# print STDERR "Full peptide alignment will be stored in this file: $fullpeptide\n";
my $pepfh = open_to_write("$fullpeptide");
print $pepfh "#readID|gene|positionForFirstAminoAcid\tcleanPeptide\n";
20 changes: 8 additions & 12 deletions merge_tally.pl
@@ -97,9 +97,7 @@
Optional Arguments:
--prefix Prefix for output files. Default = Merge_convert_reads. Sample id will be
added to the prefix.
--save Directory in which to save files. Default = pwd. If folder doesn\'t exist, it
will be created.
added to the prefix. If the prefix value contains a directory, the directory must exist.
--variant_threshold Minimum threshold frequency for output variant file. Default = 0
(i.e., print all variants). (Note that in calculate_linkage_disequilibrium.pl
there is an option to filter for a variant_threshold as well.)
@@ -118,6 +116,10 @@
# Change log
# 2017-02-04
# Created script, based on convert_reads_to_amino_acid.pl
# 2017-02-11
# Removed --save option. Just include any directory in the --prefix. ($prefix can contain absolute or relative path).
# --save Directory in which to save files. Default = pwd. If folder doesn\'t exist, it
# will be created.


unless ($dir||$ARGV[0]){ print STDERR "$usage\n"; exit; } #fasta and gff are required
@@ -132,12 +134,6 @@
}


my $save_dir = $save || Cwd::cwd();
# print $save_dir;
# exit;
unless (-d $save_dir){ mkdir($save_dir) or warn "$!\n" }
# warn "save directory: $save_dir\n";

my $start_time = time;

my @suffixes = (qw(.bed .bed.gz .bed12 .bed12.gz .txt .txt.gz .BED .BED.gz .BED12 .BED12.gz .fasta .fa .FA .FASTA .FAS .fas), @SUFFIXES); #for fileparse. Feel free to add more accepted extensions. @SUFFIXES comes from aomisc.pm.
@@ -193,9 +189,9 @@

for (my $i = 0; $i < @samples; $i++){
my $prefix_with_sample = $prefix . "." . $samples[$i];
print_reports($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, \@NUC, \@CODON, \@AA, $cds, \%converter, $save_dir, $prefix_with_sample, $verbose, $debug);
print_merged_report($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, \@CODON, $cds, $save_dir, $prefix_with_sample);
print_variants($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, $variant_threshold, $save_dir, $prefix_with_sample, $start_time, $verbose);
print_reports($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, \@NUC, \@CODON, \@AA, $cds, \%converter, $prefix_with_sample, $verbose, $debug);
print_merged_report($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, \@CODON, $cds, $prefix_with_sample);
print_variants($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, $variant_threshold, $prefix_with_sample, $start_time, $verbose);
}
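# For illustration only (assumed example values, not from the commit): with --prefix Merged
# and samples Sample_1 and Sample_2, the loop above writes per-sample files such as
#   Merged.Sample_1.nuc.tally.xls, Merged.Sample_1.merged.tally.xls, Merged.Sample_1.variants.minfreq0.xls
#   Merged.Sample_2.nuc.tally.xls, ...
# A directory in the prefix (e.g. --prefix results/Merged) places them under results/, which must exist.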


20 changes: 11 additions & 9 deletions primerid.pm
@@ -44,7 +44,9 @@ our @EXPORT = qw(
# 2017-02-05
# Created module. Moved some subroutines from convert_reads_to_amino_acid.pl to
# this module so they can be used by merge_tally.pl as well.

# 2017-02-11
# Removed $save_dir as input, using $prefix alone ($prefix can contain absolute
# or relative path).

#-------------------------------------------------------------------------------
#----------------------------------- FUNCTIONS ---------------------------------
@@ -159,16 +161,16 @@ sub make_codon_to_aa_hash {
}
#-----------------------------------------------------------------------------
sub print_reports {
my ($nuc_aa_codon_tally, $label, $gene, $NUC, $CODON, $AA, $cds, $converter, $save_dir, $prefix, $verbose, $debug) = @_;
my ($nuc_aa_codon_tally, $label, $gene, $NUC, $CODON, $AA, $cds, $converter, $prefix, $verbose, $debug) = @_;
# Print Reports to filehandles $nucleotide_report_fh, $codon_report_fh, and $amino_acid_report_fh
# TO DO: Add more documentation about the input data format ($cds, $nuc_aa_codon_tally, etc.)

#name(chr:gene:aminoAcidPosition:consensusAminoAcid) chr gene aminoAcidPosition refAminoAcid consensusAminoAcid Sample coverageDepth numA numC numG numT numN [del SumACGT numNonzeroACGT Ref Nonref]


my $nucleotide_report = $save_dir . "/" . $prefix . ".nuc.tally.xls";
my $codon_report = $save_dir . "/" . $prefix . ".codon.tally.xls";
my $amino_acid_report = $save_dir . "/" . $prefix . ".aa.tally.xls";
my $nucleotide_report = $prefix . ".nuc.tally.xls";
my $codon_report = $prefix . ".codon.tally.xls";
my $amino_acid_report = $prefix . ".aa.tally.xls";

my $nucleotide_report_fh = open_to_write("$nucleotide_report");
my $codon_report_fh = open_to_write("$codon_report");
@@ -371,12 +373,12 @@ sub remove_one_from_tally {
}
#-----------------------------------------------------------------------------
sub print_merged_report {
my ($nuc_aa_codon_tally, $label, $gene, $CODON, $cds, $save_dir, $prefix) = @_;
my ($nuc_aa_codon_tally, $label, $gene, $CODON, $cds, $prefix) = @_;
# Print Reports to filehandles $nucleotide_report_fh, $codon_report_fh, and $amino_acid_report_fh
#name(gene:aminoAcidPosition:consensusAminoAcid) gene nucleotidePosition refNucleotide consensusNucleotide aminoAcidPosition refCodon consensusCodon refAminoAcid consensusAminoAcid Sample coverageDepth topNucleotide secondNucleotide thirdNucleotide otherNucleotide countTopNucleotide countSecondNucleotide countThirdNucleotide countOtherNucleotide topCodon secondCodon thirdCodon otherCodon countTopCodon countSecondCodon countThirdCodon countOtherCodon topAminoAcid secondAminoAcid thirdAminoAcid otherAminoAcid countTopAminoAcid countSecondAminoAcid countThirdAminoAcid countOtherAminoAcid
# $nuc_aa_codon_tally format example: $nuc_aa_codon_tally->{'aa'}->{$pos}->{$aa}++;
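# Illustrative sketch only (not this sub's actual code): given the tally structure above,
# the most frequent ("top") amino acid at an assumed example position could be found with
#   my $pos = 154;   # assumed example position
#   my ($top_aa) = sort { $nuc_aa_codon_tally->{'aa'}->{$pos}->{$b} <=> $nuc_aa_codon_tally->{'aa'}->{$pos}->{$a} }
#                       keys %{ $nuc_aa_codon_tally->{'aa'}->{$pos} };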

my $merged_report = $save_dir . "/" . $prefix . ".merged.tally.xls";
my $merged_report = $prefix . ".merged.tally.xls";

my $merged_report_fh = open_to_write("$merged_report");

@@ -534,7 +536,7 @@ sub print_top_hits_alternative {
}
#-----------------------------------------------------------------------------
sub print_variants {
my ($nuc_aa_codon_tally, $label, $gene, $variant_threshold, $save_dir, $prefix, $start_time, $verbose) = @_;
my ($nuc_aa_codon_tally, $label, $gene, $variant_threshold, $prefix, $start_time, $verbose) = @_;
# Filters the variants (nuc, aa, codon) for those that are above a certain threshold in frequency and then performs linkage disequilibrium in a pairwise manner for all variants at level of nuc, codon, or aa.

# First get the variants that are above a frequency threshold
@@ -547,7 +549,7 @@
# Coverage is the number of reads at this position
# Frequency is count/coverage
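# Illustrative sketch only (not this sub's actual code): the threshold test described above
# amounts to something like
#   my $frequency = $coverage ? $count / $coverage : 0;   # $count, $coverage are assumed per-variant tallies
#   next if $frequency < $variant_threshold;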

my $variants = $save_dir . "/" . $prefix . ".variants.minfreq".$variant_threshold.".xls"; # Maybe add threshold to the name. Or in the header of the file.
my $variants = $prefix . ".variants.minfreq".$variant_threshold.".xls"; # Maybe add threshold to the name. Or in the header of the file.

my $variants_fh = open_to_write("$variants");
