Removed the --save option (just use --prefix to supply the save directory).
oleraj committed Feb 11, 2017
1 parent e51fe44 commit 2137cea
Showing 3 changed files with 40 additions and 53 deletions.
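
For orientation, here is a minimal sketch (not part of the commit) of how a directory-bearing --prefix now supplies the save location; the invocation and paths below are assumed example values, not taken from the diff:

# Hypothetical invocation (illustrative values only):
#   perl convert_reads_to_amino_acid.pl -i reads.fasta -r ref_cds.fasta --prefix results/run1/Convert_reads
# Output paths are built by appending a suffix to $prefix, so "results/run1"
# plays the role the --save directory used to (and must already exist):
my $prefix            = "results/run1/Convert_reads";   # assumed example value
my $nucleotide_report = $prefix . ".nuc.tally.xls";      # results/run1/Convert_reads.nuc.tally.xls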
53 changes: 21 additions & 32 deletions convert_reads_to_amino_acid.pl
@@ -18,7 +18,6 @@
use Getopt::Long;
use Data::Dumper;
use File::Basename;
use Bio::DB::Sam; #http://search.cpan.org/~lds/Bio-SamTools-1.37/lib/Bio/DB/Sam.pm
use Bio::SeqIO;
use Array::Utils qw(:all);
use File::Slurp qw( prepend_file );
@@ -27,12 +26,6 @@
use Bio::Tools::Run::Alignment::Clustalw;
use File::Which;

#use Bio::Perl;
#use Getopt::Std;
#use PostData;
#use Fasta_utils;
#use feature ":5.10"; #allows say (same as print, but adds "\n"), given/when switch, etc.


# Andrew J. Oler, PhD
# Computational Biology Section
@@ -114,21 +107,19 @@
clustalw (if using --clustalw)
Non-standard Perl modules:
Bioperl, including Bio::DB::Sam and Bio::Tools::Run::Alignment::Clustalw
Bioperl, including Bio::Tools::Run::Alignment::Clustalw
Array::Utils
Parallel::Loops
Required Arguments:
-i/--input Input file, comma-delimited. Required.
-i/--input Input FASTA file. Required.
-r/--ref Reference coding (CDS) sequence fasta file. Required. Used to determine
translation frame of reference.
Optional Arguments:
-l/--label Name for sample. Default: Sample_1
--prefix Prefix for output files. Default = Convert_reads
-s/--save Directory in which to save files. Default = pwd. If folder doesn\'t exist, it
will be created.
--prefix Prefix for output files. Default = Convert_reads. If the prefix value
contains a directory, the directory must exist.
-p/--cpu Number of processors to use. Default = 1.
--clustalw Use clustalw to run alignments. Default is to use mafft. *Executables must be
on the PATH.* Speed is about the same for clustalw and mafft (about 1.3x faster
@@ -148,8 +139,8 @@
1. Recommended h_vmem (RAM) value per thread is 6G, although it may be fine with less,
depending on the size and complexity of your input file.
2. If running more than 20,000 sequences, it is recommended to split the file and run each
separately, saving the output into a separate directory, then merging the results with
merge_tally.pl.
separately, saving the output into a single directory, then merging the results with
merge_tally.pl
";


@@ -272,16 +263,14 @@
# Resulting simplifications:
# 1. Move global variables to subroutines, including filehandles for output files (tally.xls, variants, etc.)
# 2. Move some subroutines to module primerid.pm so they can be used by merge_tally.pl as well.
# 2017-02-11
# Removed --save option. Just include any directory in the --prefix. ($prefix can contain absolute or relative path).
# -s/--save Directory in which to save files. Default = pwd. If folder doesn\'t exist, it
# will be created.


unless ($ref && ($input||$ARGV[0]) ){ print STDERR "$usage\n"; exit; } #fasta and gff are required

my $save_dir = $save || Cwd::cwd();
# print $save_dir;
# exit;
unless (-d $save_dir){ mkdir($save_dir) or warn "$!\n" }
# warn "save directory: $save_dir\n";

my $start_time = time;

if ($label =~ m/,/){
@@ -332,15 +321,15 @@



my $unique_seqs_file = get_unique_seqs($input);
my $unique_seqs_file = get_unique_seqs($input, $prefix);

my $nuc_aa_codon_tally = read_input_reads($unique_seqs_file);
my $nuc_aa_codon_tally = read_input_reads($unique_seqs_file, $prefix);
# print Dumper($nuc_aa_codon_tally); exit;
print_reports($nuc_aa_codon_tally, $label, $gene, \@NUC, \@CODON, \@AA, $cds, \%converter, $save_dir, $prefix, $verbose, $debug);
print_reports($nuc_aa_codon_tally, $label, $gene, \@NUC, \@CODON, \@AA, $cds, \%converter, $prefix, $verbose, $debug);

print_merged_report($nuc_aa_codon_tally, $label, $gene, \@CODON, $cds, $save_dir, $prefix);
print_merged_report($nuc_aa_codon_tally, $label, $gene, \@CODON, $cds, $prefix);

print_variants($nuc_aa_codon_tally, $label, $gene, $variant_threshold, $save_dir, $prefix, $start_time, $verbose);
print_variants($nuc_aa_codon_tally, $label, $gene, $variant_threshold, $prefix, $start_time, $verbose);



@@ -350,7 +339,7 @@
#---------------------------------- SUBS -------------------------------------
#-----------------------------------------------------------------------------
sub get_unique_seqs {
my $file = shift;
my ($file, $prefix) = @_;

# Get the unique sequences in a fasta file with counts and a hash with number of times the sequence was found in the file = "count".
# This is designed to speed up the computation significantly, since many of the reads will be identical.
@@ -367,9 +356,9 @@ sub get_unique_seqs {
# ...
# The id has seq number - count ; it's sorted by most common to least common sequence
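# For illustration only (not from the commit), a record in the unique-seqs fasta might
# look like this, assuming the "seq number - count" id convention described above:
#   >1-523
#   ACGT...ACGT   (most common sequence, observed 523 times)
#   >2-310
#   ACGT...ACGT   (second most common, observed 310 times)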

my ($filename,$dir,$ext) = fileparse($file,@SUFFIXES);
#my ($filename,$dir,$ext) = fileparse($file,@SUFFIXES);
# my $unique_seqs_fasta = $dir . $filename . ".uniq" . $ext; # Make it a real file, not a temp file
my $unique_seqs_fasta = $save_dir . "/$filename" . ".uniq" . $ext; # Make it a real file, not a temp file
my $unique_seqs_fasta = $prefix . ".uniq.fasta"; # Make it a real file, not a temp file

#my $check_for_fasta_collapser = which("fasta_collapser.pl"); # returns undef if not found on system.
#if ($check_for_fasta_collapser){
@@ -386,7 +375,7 @@ sub get_unique_seqs {
}
#-----------------------------------------------------------------------------
sub read_input_reads {
my $file = shift;
my ($file, $prefix) = @_;
my $nuc_aa_codon_tally;
print STDERR "Processing file $file ...\n";

@@ -658,11 +647,11 @@ sub read_input_reads {
# Prepare the output files of cleaned reads and full peptides
my ($filename,$dir,$ext) = fileparse($file,@SUFFIXES);

my $newfilename = $save_dir . "/" . $filename . ".cleanreads.txt";
my $newfilename = $prefix . ".cleanreads.txt";
my $clean_reads_fh = open_to_write("$newfilename");
print $clean_reads_fh "#readID|gene|positionForFirstNucleotideBase\tcleanRead\n";

my $fullpeptide = $save_dir . "/" . $filename . ".cleanpeptides.txt";
my $fullpeptide = $prefix . ".cleanpeptides.txt";
# print STDERR "Full peptide alignment will be stored in this file: $fullpeptide\n";
my $pepfh = open_to_write("$fullpeptide");
print $pepfh "#readID|gene|positionForFirstAminoAcid\tcleanPeptide\n";
20 changes: 8 additions & 12 deletions merge_tally.pl
@@ -97,9 +97,7 @@
Optional Arguments:
--prefix Prefix for output files. Default = Merge_convert_reads. Sample id will be
added to the prefix.
--save Directory in which to save files. Default = pwd. If folder doesn\'t exist, it
will be created.
added to the prefix. If the prefix value contains a directory, the directory must exist.
--variant_threshold Minimum threshold frequency for output variant file. Default = 0
(i.e., print all variants). (Note that in calculate_linkage_disequilibrium.pl
there is an option to filter for a variant_threshold as well.)
@@ -118,6 +116,10 @@
# Change log
# 2017-02-04
# Created script, based on convert_reads_to_amino_acid.pl
# 2017-02-11
# Removed --save option. Just include any directory in the --prefix. ($prefix can contain absolute or relative path).
# --save Directory in which to save files. Default = pwd. If folder doesn\'t exist, it
# will be created.


unless ($dir||$ARGV[0]){ print STDERR "$usage\n"; exit; } #fasta and gff are required
@@ -132,12 +134,6 @@
}


my $save_dir = $save || Cwd::cwd();
# print $save_dir;
# exit;
unless (-d $save_dir){ mkdir($save_dir) or warn "$!\n" }
# warn "save directory: $save_dir\n";

my $start_time = time;

my @suffixes = (qw(.bed .bed.gz .bed12 .bed12.gz .txt .txt.gz .BED .BED.gz .BED12 .BED12.gz .fasta .fa .FA .FASTA .FAS .fas), @SUFFIXES); #for fileparse. Feel free to add more accepted extensions. @SUFFIXES comes from aomisc.pm.
@@ -193,9 +189,9 @@

for (my $i = 0; $i < @samples; $i++){
my $prefix_with_sample = $prefix . "." . $samples[$i];
print_reports($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, \@NUC, \@CODON, \@AA, $cds, \%converter, $save_dir, $prefix_with_sample, $verbose, $debug);
print_merged_report($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, \@CODON, $cds, $save_dir, $prefix_with_sample);
print_variants($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, $variant_threshold, $save_dir, $prefix_with_sample, $start_time, $verbose);
print_reports($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, \@NUC, \@CODON, \@AA, $cds, \%converter, $prefix_with_sample, $verbose, $debug);
print_merged_report($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, \@CODON, $cds, $prefix_with_sample);
print_variants($sample_nuc_aa_codon_tally->{$samples[$i]}, $samples[$i], $gene, $variant_threshold, $prefix_with_sample, $start_time, $verbose);
}
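# For illustration only (assumed example values, not from the commit): with --prefix Merged
# and samples Sample_1 and Sample_2, the loop above writes per-sample files such as
#   Merged.Sample_1.nuc.tally.xls, Merged.Sample_1.merged.tally.xls, Merged.Sample_1.variants.minfreq0.xls
#   Merged.Sample_2.nuc.tally.xls, ...
# A directory in the prefix (e.g. --prefix results/Merged) places them under results/, which must exist.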


20 changes: 11 additions & 9 deletions primerid.pm
@@ -44,7 +44,9 @@ our @EXPORT = qw(
# 2017-02-05
# Created module. Moved some subroutines from convert_reads_to_amino_acid.pl to
# this module so they can be used by merge_tally.pl as well.

# 2017-02-11
# Removed $save_dir as input, using $prefix alone ($prefix can contain absolute
# or relative path).

#-------------------------------------------------------------------------------
#----------------------------------- FUNCTIONS ---------------------------------
@@ -159,16 +161,16 @@ sub make_codon_to_aa_hash {
}
#-----------------------------------------------------------------------------
sub print_reports {
my ($nuc_aa_codon_tally, $label, $gene, $NUC, $CODON, $AA, $cds, $converter, $save_dir, $prefix, $verbose, $debug) = @_;
my ($nuc_aa_codon_tally, $label, $gene, $NUC, $CODON, $AA, $cds, $converter, $prefix, $verbose, $debug) = @_;
# Print Reports to filehandles $nucleotide_report_fh, $codon_report_fh, and $amino_acid_report_fh
# TO DO: Add more documentation about the input data format ($cds, $nuc_aa_codon_tally, etc.)

#name(chr:gene:aminoAcidPosition:consensusAminoAcid) chr gene aminoAcidPosition refAminoAcid consensusAminoAcid Sample coverageDepth numA numC numG numT numN [del SumACGT numNonzeroACGT Ref Nonref]


my $nucleotide_report = $save_dir . "/" . $prefix . ".nuc.tally.xls";
my $codon_report = $save_dir . "/" . $prefix . ".codon.tally.xls";
my $amino_acid_report = $save_dir . "/" . $prefix . ".aa.tally.xls";
my $nucleotide_report = $prefix . ".nuc.tally.xls";
my $codon_report = $prefix . ".codon.tally.xls";
my $amino_acid_report = $prefix . ".aa.tally.xls";

my $nucleotide_report_fh = open_to_write("$nucleotide_report");
my $codon_report_fh = open_to_write("$codon_report");
@@ -371,12 +373,12 @@ sub remove_one_from_tally {
}
#-----------------------------------------------------------------------------
sub print_merged_report {
my ($nuc_aa_codon_tally, $label, $gene, $CODON, $cds, $save_dir, $prefix) = @_;
my ($nuc_aa_codon_tally, $label, $gene, $CODON, $cds, $prefix) = @_;
# Print Reports to filehandles $nucleotide_report_fh, $codon_report_fh, and $amino_acid_report_fh
#name(gene:aminoAcidPosition:consensusAminoAcid) gene nucleotidePosition refNucleotide consensusNucleotide aminoAcidPosition refCodon consensusCodon refAminoAcid consensusAminoAcid Sample coverageDepth topNucleotide secondNucleotide thirdNucleotide otherNucleotide countTopNucleotide countSecondNucleotide countThirdNucleotide countOtherNucleotide topCodon secondCodon thirdCodon otherCodon countTopCodon countSecondCodon countThirdCodon countOtherCodon topAminoAcid secondAminoAcid thirdAminoAcid otherAminoAcid countTopAminoAcid countSecondAminoAcid countThirdAminoAcid countOtherAminoAcid
# $nuc_aa_codon_tally format example: $nuc_aa_codon_tally->{'aa'}->{$pos}->{$aa}++;
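# Illustrative sketch only (not this sub's actual code): given the tally structure above,
# the most frequent ("top") amino acid at an assumed example position could be found with
#   my $pos = 154;   # assumed example position
#   my ($top_aa) = sort { $nuc_aa_codon_tally->{'aa'}->{$pos}->{$b} <=> $nuc_aa_codon_tally->{'aa'}->{$pos}->{$a} }
#                       keys %{ $nuc_aa_codon_tally->{'aa'}->{$pos} };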

my $merged_report = $save_dir . "/" . $prefix . ".merged.tally.xls";
my $merged_report = $prefix . ".merged.tally.xls";

my $merged_report_fh = open_to_write("$merged_report");

@@ -534,7 +536,7 @@ sub print_top_hits_alternative {
}
#-----------------------------------------------------------------------------
sub print_variants {
my ($nuc_aa_codon_tally, $label, $gene, $variant_threshold, $save_dir, $prefix, $start_time, $verbose) = @_;
my ($nuc_aa_codon_tally, $label, $gene, $variant_threshold, $prefix, $start_time, $verbose) = @_;
# Filters the variants (nuc, aa, codon) for those that are above a certain threshold in frequency and then performs linkage disequilibrium in a pairwise manner for all variants at level of nuc, codon, or aa.

# First get the variants that are above a frequency threshold
@@ -547,7 +549,7 @@
# Coverage is the number of reads at this position
# Frequency is count/coverage
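# Illustrative sketch only (not this sub's actual code): the threshold test described above
# amounts to something like
#   my $frequency = $coverage ? $count / $coverage : 0;   # $count, $coverage are assumed per-variant tallies
#   next if $frequency < $variant_threshold;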

my $variants = $save_dir . "/" . $prefix . ".variants.minfreq".$variant_threshold.".xls"; # Maybe add threshold to the name. Or in the header of the file.
my $variants = $prefix . ".variants.minfreq".$variant_threshold.".xls"; # Maybe add threshold to the name. Or in the header of the file.

my $variants_fh = open_to_write("$variants");
