mobile_pipeline_stagetwo_version2

#!/usr/bin/perl -w
use strict;

##Author Xiao-Tao Jiang
##Email biofuture.jiang@gmail.com.
use Getopt::Std;
use File::Basename;
use FindBin qw($Bin);

##Locate the directory this program is installed in
our (@dirset,$ublastxdir);
BEGIN {
    # Runs at compile time, so the extra @INC entry is in place before any later "use" is resolved.
    @dirset     = split m{/}, $Bin;
    $ublastxdir = join '/', @dirset;
    unshift(@INC, $ublastxdir . '/bin');
}


##Package globals filled in by Getopt::Std::getopts.
##Note: the list assignment gives only $opt_h the empty string; the others start out undefined.
our ($opt_h, $opt_i, $opt_o, $opt_n, $opt_m, $opt_l, $opt_e, $opt_d, $opt_b) = "";
##Help text. The -n default and the -b description follow what the option handling actually does:
##threads default to 10 and -b takes the path of an existing blastx result.
my  $usage = <<USE;
	Author: JIANG Xiao-Tao
	Modified : 06-04-2018
	Email: biofuture.jiang\@gmail.com
	$0 -i <extracted.fa> -m <meta_data_online.txt> -o <output_prefix> [-n number of threads] [-l length] [-e evalue] [-d identity] [-b blast6out.txt]

	-i the potential arg reads from stage one 
	-m meta data online from stage one 
	-o Output prefix
	-n number of threads used for blastx, default 10
	-l length filtering default 25 aa 
	-e evalue filtering default 1e-7
	-d identity filtering default 80
	-b path to an existing blastx tabular (outfmt 6) result; if given, blastx is skipped and this file is processed directly [default off], useful if user wants to accelerate stage two by running blastx in parallel
	-h print this help information
USE

##Description
##This pipeline is designed to process multisamples ARG identification, this is the part two pipeline

#die " perl $0 <Extracted_fasta> <Meta_data_info> <Catergory> <lenth> <e-value> <identity> <institution name> <email address> <Taskname> <PDF16s> <PDFCELL> <TABLE1> <TABLE2> <TABLE3>\n Authur: Xiao-Tao Jiang\n Email: biofuture.jiang\@gmail.com\n" unless (@ARGV == 14);
#-------------------------------------------------------------------------------------------------
#blastx against ARG database for accurate identification of reads for antibiotic resistance gene

##Parse the command line; every option except -h takes a value.
getopts('i:o:n:m:e:d:l:b:h');
if($opt_h  ||  (!$opt_i) ){
        die "$usage\n";
}
##-m and -o are needed by the later steps: stop here instead of failing on the
##metadata file only after blastx has already been run, or writing prefix-less output files.
for my $need (['-m', $opt_m], ['-o', $opt_o]){
	my ($flag, $value) = @$need;
	die "Missing required option $flag\n$usage\n" unless(defined($value) && length($value));
}
my $efa = $opt_i;
##Default location of the blastx result (replaced later when -b is given)
my $blast6out = "$opt_o.blast6out.txt";
##Bundled database files, resolved relative to the install directory.
##$ARDB_PATH and $ARDBFA point at the same FASTA file.
my $ARDB_PATH = "$ublastxdir/DB/mobileOG1.6.fasta";
my $ARDB_STRUCTURE = "$ublastxdir/DB/mobileOG_1.6_list";
my $ARDBFA = "$ublastxdir/DB/mobileOG1.6.fasta";
##Defaults: a false value (unset, empty or 0) selects the default
$opt_n ||= 10;
$opt_l ||= 25;
$opt_e ||= 1e-7;
$opt_d ||= 80;
$opt_b ||= "";

##Hit filters applied to every blastx record
my $lenmatch = $opt_l;
my $evaluematch = $opt_e;
my $identitymatch = $opt_d;
#my $rlen = 100; ##the pair-end length
##Bundled environment reference tables that the user tables are merged with further down
my $envtable16s        = $ublastxdir . "/DB/update_45_sarg2.0_16s.table_201704.txt";
my $envtablecellnumber = $ublastxdir . "/DB/update_45_sarg2.0_cell.table_201704.txt";

##ppm tables
my $subtypeppm = $opt_o . ".ppm.subtype.txt";
my $typeppm    = $opt_o . ".ppm.type.txt";
my $geneppm    = $opt_o . ".ppm.gene.txt";

##tables normalized against 16S
my $subtype16s      = $opt_o . ".normalize_16s.subtype.tab.txt";
my $type16s         = $opt_o . ".normalize_16s.type.tab.txt";
my $subtypemerge16s = $opt_o . ".normalize_16s.mergesubtype.tab.txt";
my $gene16s         = $opt_o . ".normalize_16s.gene.tab.txt";

##tables normalized against cell number
my $subtypecellnumber      = $opt_o . ".normalize_cellnumber.subtype.tab.txt";
my $typecellnumber         = $opt_o . ".normalize_cellnumber.type.tab.txt";
my $subtypemergecellnumber = $opt_o . ".normalize_cellnumber.mergesubtype.tab.txt";
my $genecellnumber         = $opt_o . ".normalize_cellnumber.gene.txt";

##Start time as a string (scalar-context localtime)
my $begin = localtime;

print "blastx begain";

##If a blastx result is supplied with -b it is used as is and the search is skipped. This lets users
##run blastx in parallel on a cluster for huge data sets and only do the post-processing here.
if($opt_b){
	$blast6out = $opt_b;
}else{
	# List-form system() passes every argument to blastx verbatim, without a shell,
	# so paths containing spaces or shell metacharacters are handled correctly.
	my @blastx = ("$ublastxdir/bin/blastx",
		'-query', $opt_i,
		'-out', $blast6out,
		'-db', $ARDB_PATH,
		'-evalue', $evaluematch,
		'-num_threads', $opt_n,
		'-outfmt', '6',
		'-max_target_seqs', '1');
	# Stop on failure instead of going on to parse a missing or truncated result file.
	system(@blastx) == 0
		or die "blastx failed (status $?, $!): @blastx\n";
}
##process blastx results and the structure information of arg database 
print "process meta data";
##process meta data-------------------------------------------------------------------------------
##Each data line is tab separated. Column 2 is the sample name, column $index the sample category,
##and the last three columns are stored as read number, 16S number and cell number of the sample.
##These three are the denominators of the ppm, 16S and cell-number tables written later.
open(my $metafh, '<', $opt_m) or die "$! HERE META: $opt_m\n";
my %sample2reads;
my %sample216s;
my %sample2cellnumber;
my %sample2cater;

my $headmeta = <$metafh>;	# header line, not used further
my $index = 3;

while(my $metaline = <$metafh>){
	chomp $metaline;
	# A blank line would otherwise register a sample with an empty name and add an empty column to every table.
	next unless $metaline =~ /\S/;
	my @tt = split(/\t/, $metaline);
	$sample2reads{$tt[1]} = $tt[-3];
	$sample216s{$tt[1]} = $tt[-2];
	$sample2cellnumber{$tt[1]} = $tt[-1];
	$sample2cater{$tt[1]} = $tt[$index-1];
}
close $metafh;
print "process ARDB to get the length information";
#process ARDB to get the length information 
##%len maps a sequence id (the header text up to the first whitespace) to its sequence length.
##Sequence lines are summed until the next header, so both one-line and wrapped FASTA records
##are measured correctly.
my %len;
open(my $lenfh, '<', $ARDBFA) or die "$! HEREFA: $ARDBFA\n";
my $curid;
while(my $faline = <$lenfh>){
	# Strip LF or CRLF so a carriage return is never counted as a residue.
	$faline =~ s/\r?\n\z//;
	if($faline =~ /^>/){
		($curid) = split(/\s+/, substr($faline, 1));
		# A header with no following sequence keeps length 0; a repeated id starts over.
		$len{$curid} = 0 if defined $curid;
	}elsif(defined $curid){
		$len{$curid} += length($faline);
	}
}
close $lenfh;

print "process ARDB structure files";
##process ARDB structure files------------------------------------------------------------------- 
##Each row is "<subtype name>\t['id1', 'id2', ...]". The part of the subtype name before the
##first "__" is used as the type. Every listed id is mapped to its type and subtype.
open(my $strufh, '<', $ARDB_STRUCTURE) or die "$!HERESTRUCTURE: $ARDB_STRUCTURE\n";
my %type;
my %subtype;
my %typelist;
my %subtypelist;
my %gene;

scalar(<$strufh>);	# skip the header line
while(my $row = <$strufh>){
	chomp $row;
	my ($subname, $members) = split(/\t/, $row);
	# Blank or name-less rows would otherwise add an empty-named row to the output tables.
	next unless(defined($subname) && length($subname));
	$members = "" unless defined $members;
	my ($typename) = split("__", $subname);

	# Drop the surrounding brackets, then the quotes around each id.
	$members =~ s/^\[//;
	$members =~ s/\]$//;
	for my $id (split(", ", $members)){
		$id =~ s/^\'//;
		$id =~ s/\'$//;
		$subtype{$id} = $subname;
		$type{$id} = $typename;
		$gene{$id} = 1;
	}

	#including all type and subtype, also those without any listed id
	$typelist{$typename} = 1;
	$subtypelist{$subname} = 1;
}
close $strufh;

##parse blast6out results-----------------------------------------------------------------------
##Only the first record of each run of consecutive lines with the same query id is considered.
##A record passing the length / identity / e-value filters is credited to its sample (the query
##id without its trailing "_<digits>") under three keys: type, subtype and the subject id itself.
open(BLAST6, $blast6out) or die "$!HEREBLAST\n";
my %samplehit; #Hash->Hash  sample->type/subtype/subject id->summed aligned fraction
my %samplenum; #Hash->Hash  sample->type/subtype/subject id->number of reads
my $upper="";
HIT: while(defined(my $hitline = <BLAST6>)){
	chomp $hitline;
	my @field = split /\t/, $hitline;
	my $record = $field[0];
	next HIT if($record eq $upper);
	$upper = $record;

	next HIT unless($field[3] >= $lenmatch && $field[2] >= $identitymatch && $field[-2] <= $evaluematch);

	(my $sample = $record) =~ s/\_\d+$//g;
	my $subject = $field[1];
	die "$sample $subject\t $!\n" unless($type{$subject} && $subtype{$subject});
	die "$subject" unless(exists $len{$subject});

	##fraction of the reference sequence covered by this alignment
	my $ratio = $field[3] / $len{$subject};
	my @levels = ($type{$subject}, $subtype{$subject}, $subject);

	# NOTE(review): the two branches are kept apart on purpose. If two of the three keys are
	# equal, the first hit of a sample assigns that key once, while later hits add to it twice.
	if(exists $samplehit{$sample}){
		for my $level (@levels){
			$samplehit{$sample}{$level} += $ratio;
			$samplenum{$sample}{$level} ++;
		}
	}else{
		for my $level (@levels){
			$samplehit{$sample}{$level} = $ratio;
			##for PPM calculation count the read as one
			$samplenum{$sample}{$level} = 1;
		}
	}
}
close BLAST6;

#-------------------------------------------------------16S Normalization-------Cell Number Normalization--------
##Open the nine output tables (ppm / 16S / cell number, each at subtype, type and gene level)
##and write their header lines. The handles stay open for the table-writing loops that follow.
##Three-argument open keeps odd characters in the output prefix from being read as an open mode,
##and each failure message names the file concerned.
open(SUBP,  '>', $subtypeppm)        or die "$! : cannot write $subtypeppm\n";
open(TYPEP, '>', $typeppm)           or die "$! : cannot write $typeppm\n";
open(SUBM,  '>', $subtype16s)        or die "$! : cannot write $subtype16s\n";
open(TYPEM, '>', $type16s)           or die "$! : cannot write $type16s\n";
open(SUBC,  '>', $subtypecellnumber) or die "$! : cannot write $subtypecellnumber\n";
open(TYPEC, '>', $typecellnumber)    or die "$! : cannot write $typecellnumber\n";
open(GENEP, '>', $geneppm)           or die "$! : cannot write $geneppm\n";
open(GENEM, '>', $gene16s)           or die "$! : cannot write $gene16s\n";
open(GENEC, '>', $genecellnumber)    or die "$! : cannot write $genecellnumber\n";

##Titles. The two ppm tables get a title line of their own; in the other tables the title text
##has no newline, so it becomes the first field of the sample header line.
print SUBP "\nSubtype ppm\n";
print TYPEP "Type ppm\n";

print SUBM "ARGs abundance normalization aganist 16S";
print SUBC "ARGs abundance normalization aganist Cell number";
print TYPEM "Type level results";
print TYPEC "Type level results";

print GENEP "Gene to ppm\tSubtype\tType";
print GENEM "Gene to number of 16s copies\tSubtype\tType";
print GENEC "Gene to cell number level\tSutype\tType";

##One tab-prefixed column per sample, in sorted order, identical for every table
my $samplecols = join("", map { "\t$_" } sort keys %sample2reads) . "\n";
for my $tablefh (\*SUBP, \*TYPEP, \*SUBM, \*TYPEM, \*SUBC, \*TYPEC, \*GENEP, \*GENEM, \*GENEC){
	print {$tablefh} $samplecols;
}


## output gene mother table
## One row per known gene id: id, subtype, type, then one value per sample (sorted sample order).
## 16S / cell tables: summed aligned fraction divided by the sample's 16S / cell number.
## ppm table: read count * 1000000 / number of reads of the sample. Missing combinations give 0.
for my $geneid (sort keys %gene){
	my $rowlabel = "$geneid\t$subtype{$geneid}\t$type{$geneid}";
	print GENEP $rowlabel;
	print GENEM $rowlabel;
	print GENEC $rowlabel;
	for my $sample (sort keys %sample2reads){
		my ($per16s, $percell) = (0, 0);
		if(exists $samplehit{$sample}{$geneid}){
			$per16s  = $samplehit{$sample}{$geneid} / $sample216s{$sample};
			$percell = $samplehit{$sample}{$geneid} / $sample2cellnumber{$sample};
		}
		print GENEM "\t$per16s";
		print GENEC "\t$percell";

		my $ppm = exists $samplenum{$sample}{$geneid}
			? $samplenum{$sample}{$geneid} * 1000000 / $sample2reads{$sample}
			: 0;
		print GENEP "\t$ppm";
	}
	print GENEP "\n";
	print GENEM "\n";
	print GENEC "\n";
}
close GENEP;
close GENEM;
close GENEC;


##output subtype mother table
##One row per subtype of the structure file, one column per sample (sorted sample order).
for my $subname (sort keys %subtypelist){
	print SUBP $subname;
	print SUBM $subname;
	print SUBC $subname;

	for my $sample (sort keys %sample2reads){
		if(!exists $samplehit{$sample}{$subname}){
			print SUBM "\t0";
			print SUBC "\t0";
		}else{
			##abundance relative to the 16S number and to the cell number of the sample
			my $hits    = $samplehit{$sample}{$subname};
			my $per16s  = $hits / $sample216s{$sample};
			my $percell = $hits / $sample2cellnumber{$sample};
			print SUBM "\t$per16s";
			print SUBC "\t$percell";
		}

		##reads per million reads of the sample
		if(!exists $samplenum{$sample}{$subname}){
			print SUBP "\t0";
		}else{
			my $ppm = $samplenum{$sample}{$subname} * 1000000 / $sample2reads{$sample};
			print SUBP "\t$ppm";
		}
	}

	print SUBP "\n";
	print SUBM "\n";
	print SUBC "\n";
}
close SUBP;
close SUBM;
close SUBC;

##output type mother table: one row per type, one column per sample (sorted sample order)
for my $typename (sort keys %typelist){
	print TYPEP $typename;
	print TYPEM $typename;
	print TYPEC $typename;

	for my $sample (sort keys %sample2reads){
		my $has_hit = exists $samplehit{$sample}{$typename};
		if($has_hit){
			##abundance relative to the 16S number of the sample
			my $per16s = $samplehit{$sample}{$typename} / $sample216s{$sample};
			print TYPEM "\t$per16s";

			##abundance relative to the cell number of the sample
			my $percell = $samplehit{$sample}{$typename} / $sample2cellnumber{$sample};
			print TYPEC "\t$percell";
		}else{
			print TYPEM "\t0";
			print TYPEC "\t0";
		}

		##reads per million reads of the sample
		my $ppm = exists $samplenum{$sample}{$typename}
			? $samplenum{$sample}{$typename} * 1000000 / $sample2reads{$sample}
			: 0;
		print TYPEP "\t$ppm";
	}

	print TYPEP "\n";
	print TYPEM "\n";
	print TYPEC "\n";
}
close TYPEM;
close TYPEC;
close TYPEP;

#Merge users table with other environment samples ARGs table----------------------------------------
##Joins the 16S-normalized subtype table of the user samples with the bundled environment table,
##row by row on the subtype name. Subtypes missing from the environment table are padded with zeros.
open(my $subtfh, '<', $subtype16s) or die "$! : cannot read $subtype16s\n";
open(my $envfh, '<', $envtable16s) or die "$! : cannot read $envtable16s\n";
open(my $mergefh, '>', $subtypemerge16s) or die "$! : cannot write $subtypemerge16s\n";
#----------------------------------------------------------
##The subtype table is written with title and sample ids on one tab-separated first line, so that
##line is the header. Unconditionally skipping a line here used to discard it, turning the first
##subtype row into the "header" and dropping that subtype from the merge. A first line without
##any tab is still skipped, so a table with a separate title line is read correctly too.
my $h1 = <$subtfh>;
$h1 = <$subtfh> if(defined($h1) && $h1 !~ /\t/);
my $h2 = <$envfh>;
die "No header line found in $subtype16s\n" unless defined $h1;
die "No header line found in $envtable16s\n" unless defined $h2;
chomp($h1); chomp($h2);
##keep everything after the row-label column of the environment header
my @h2 = split(/\t/, $h2, 2);
print $mergefh "$h1\t$h2[1]\n";

##subtype name -> remaining tab-separated values of the row
my %subt;
my %env;
while(my $subrow = <$subtfh>){
	chomp $subrow;
	my @tem = split(/\t/, $subrow, 2);
	$subt{$tem[0]} = $tem[1];
}
close $subtfh;

##$envn ends up as the last index of the value columns of the last environment row read
my $envn = 0;
while(my $envrow = <$envfh>){
	chomp $envrow;
	my @tem = split(/\t/, $envrow, 2);
	$env{$tem[0]} = $tem[1];
	my @ts = split("\t", $tem[1]);
	$envn = $#ts;
}
close $envfh;

##zero padding, one zero per environment column
my $addz = join("\t", (0) x ($envn + 1));

for my $subts (sort keys %subt){
	if(exists $env{$subts}){
		print $mergefh "$subts\t$subt{$subts}\t$env{$subts}\n";
	}else{
		print $mergefh "$subts\t$subt{$subts}\t$addz\n";
	}
}
close $mergefh or die "$! : cannot finish writing $subtypemerge16s\n";
#-------------------------------------------------------------------

#-----------------
##Same merge as for the 16S tables, here for the cell-number normalized subtype table:
##user rows are joined with the bundled environment table on the subtype name, and subtypes
##missing from the environment table are padded with zeros.
open(my $subtcfh, '<', $subtypecellnumber) or die "$! : cannot read $subtypecellnumber\n";
open(my $envcfh, '<', $envtablecellnumber) or die "$! : cannot read $envtablecellnumber\n";
open(my $mergecfh, '>', $subtypemergecellnumber) or die "$! : cannot write $subtypemergecellnumber\n";

##The subtype table is written with title and sample ids on one tab-separated first line, so that
##line is the header. Unconditionally skipping a line here used to discard it, turning the first
##subtype row into the "header" and dropping that subtype from the merge. A first line without
##any tab is still skipped, so a table with a separate title line is read correctly too.
my $h1c = <$subtcfh>;
$h1c = <$subtcfh> if(defined($h1c) && $h1c !~ /\t/);
my $h2c = <$envcfh>;
die "No header line found in $subtypecellnumber\n" unless defined $h1c;
die "No header line found in $envtablecellnumber\n" unless defined $h2c;
chomp($h1c); chomp($h2c);
##keep everything after the row-label column of the environment header
my @h2c = split(/\t/, $h2c, 2);
print $mergecfh "$h1c\t$h2c[1]\n";

##subtype name -> remaining tab-separated values of the row
my %subtc;
my %envc;
while(my $subrowc = <$subtcfh>){
	chomp $subrowc;
	my @tem = split(/\t/, $subrowc, 2);
	$subtc{$tem[0]} = $tem[1];
}
close $subtcfh;

##$envnc ends up as the last index of the value columns of the last environment row read
my $envnc = 0;
while(my $envrowc = <$envcfh>){
	chomp $envrowc;
	my @tem = split(/\t/, $envrowc, 2);
	$envc{$tem[0]} = $tem[1];
	my @ts = split("\t", $tem[1]);
	$envnc = $#ts;
}
close $envcfh;

##zero padding, one zero per environment column
my $addzc = join("\t", (0) x ($envnc + 1));

for my $subtsc (sort keys %subtc){
	if(exists $envc{$subtsc}){
		print $mergecfh "$subtsc\t$subtc{$subtsc}\t$envc{$subtsc}\n";
	}else{
		print $mergecfh "$subtsc\t$subtc{$subtsc}\t$addzc\n";
	}
}
close $mergecfh or die "$! : cannot finish writing $subtypemergecellnumber\n";
#-----------------------------------------------------------------------------------------------------------------

##Names of the final outputs: two PDF figures and three summary text files
my $pdf16s = "$opt_o.16S.pdf";
my $pdfcell = "$opt_o.cell.pdf";
my $normalize16s = "$opt_o.normalize.16s.txt";
my $normalizecell = "$opt_o.normalize.cell.txt";
my $ppmout = "$opt_o.ppm.txt";

##Concatenate the per-level tables into one summary file per normalization (plain text, nothing
##is compressed). Done in Perl rather than through a shell "cat", so an output prefix containing
##spaces or shell metacharacters is safe and every I/O failure is reported.
my @bundles = (
	[$normalize16s,  $subtype16s, $type16s, $subtypemerge16s],
	[$normalizecell, $subtypecellnumber, $typecellnumber, $subtypemergecellnumber],
	[$ppmout,        $typeppm, $subtypeppm],
);
for my $bundle (@bundles){
	# First element is the file to write, the rest are appended to it in order.
	my ($target, @parts) = @$bundle;
	open(my $catout, '>', $target) or die "$! : cannot write $target\n";
	for my $part (@parts){
		open(my $catin, '<', $part) or die "$! : cannot read $part\n";
		print {$catout} $_ while <$catin>;
		close $catin;
	}
	close $catout or die "$! : cannot finish writing $target\n";
}

##------------------------------------------------------------------------------------------------------
##PcoA analysis of existing environment samples and users samples-------------------------------------------------
##Open the R script that both R code templates are written to.
my $rscript = "$opt_o.tmp.R";
open(R, ">$rscript") or die "$! 442 line\n";

##Every field of the merged 16S header after the first is looked up in the metadata categories.
##The categories are collected as double-quoted strings for the R vector built below.
##An unknown name is fatal.
my @name = split(/\t/, $h1);
my @caters = map {
	exists $sample2cater{$_} or die "Wrong $_\n";
	"\"$sample2cater{$_}\"";
} @name[1 .. $#name];
my $ocater = join(", ", @caters);

#print LOG "Start to generate PDF format PcoA figure for user samples and the samples in database\n";
##16S----------------------------------------------------------------------------
##R code for the 16S figure, written to the script opened on handle R.
##Perl interpolates $subtypemerge16s, $ocater and $pdf16s into the text. "\$" and "\\t" are
##escaped so that R receives a literal "$" and "\t".
##The R code reads the merged table, drops all-zero rows, transposes it and drops all-zero
##samples. It then calls vegdist(method="bray") and pco(k=10), and plots the first two axes,
##coloured by the entries of "cols", into the PDF.
##"cols" is the user sample categories followed by a hard-coded list of environment labels,
##presumably one per sample column of the bundled environment table -- TODO confirm.
my $trs = <<RS;

mothertable <- read.table(file="$subtypemerge16s", sep="\\t",header=T,row.names=1,quote = "", stringsAsFactors = FALSE)
#remove zero line
mothertable <- mothertable[which(rowSums(mothertable)!=0),]
mothertable <- t(mothertable)
samples.orig <- rownames(mothertable) # recording original sample names
mothertable <- mothertable[which(rowSums(mothertable)!=0),]
dim(mothertable)

library(vegan)
library(labdsv)
library(ggplot2)
library(scales)
cols <- c($ocater, \"DrinkingwaterSystem\", \"DrinkingwaterSystem\", \"DrinkingwaterSystem\", \"DrinkingwaterSystem\", \"DrinkingwaterSystem\", \"DrinkingwaterSystem\", \"DrinkingwaterSystem\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"SewageTreatmentPlant\", \"Sediment\", \"Sediment\", \"Sediment\", \"LiveStock\", \"LiveStock\", \"LiveStock\", \"LiveStock\", \"LiveStock\", \"LiveStock\", \"LiveStock\", \"LiveStock\", \"LiveStock\", \"LiveStock\", \"LiveStock\", \"LiveStock\", \"Ocean\", \"Ocean\", \"Ocean\")
cols <- cols[match(rownames(mothertable), samples.orig)] #update cols in case certain samples has been filtered out.

pdf("$pdf16s")
vd <- vegdist(mothertable,method="bray")
vd.pco <- pco(vd, k=10)
pcoadata <- data.frame(vd.pco\$points[,1], vd.pco\$points[,2], cols)
EnvironmentType <- pcoadata\$cols
pc1n <- vd.pco\$eig[1]/sum(vd.pco\$eig)
pc2n <- vd.pco\$eig[2]/sum(vd.pco\$eig)
xl <- paste("Pco1(",(pc1n*10000)%/%100,"%)",sep="")
yl <- paste("Pco2(", (pc2n*10000)%/%100,"%)",sep="")
p <- ggplot(pcoadata, aes(x=pcoadata\$vd.pco.points...1., y=pcoadata\$vd.pco.points...2., color=EnvironmentType)) + geom_point(size=5,alpha=.8)
p+theme(legend.position=c(0.2,0.8), panel.border = element_blank(), panel.grid.major = element_blank(), panel.background = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), legend.background = element_rect(fill=alpha('white', 0.2)) ) + labs(fill="Samples Type") + xlab(xl) + ylab(yl)
dev.off()
RS
##Write the 16S part; the handle stays open for the cell-number part.
print R $trs;
#------------------------------------------------------
##Cell number----------------------------------------------------------------------------
##R code for the cell-number figure. It is appended to the same script (handle R) as the 16S
##part; $rscriptc is declared here but not used anywhere in this file.
##Perl interpolates $subtypemergecellnumber and $pdfcell into the text.
##This template loads no libraries and does not define "cols": both come from the 16S part
##that precedes it in the generated script.
##NOTE(review): "cols" was already subset against the samples surviving the 16S filtering.
##If the cell-number table drops a different set of samples, data.frame() may fail on a
##length mismatch -- confirm.
my $rscriptc = "$opt_o.cellnumber.tmp.R";
my $trsc = <<RS;
mothertable <- read.table(file="$subtypemergecellnumber", sep="\\t",header=T,row.names=1,quote = "", stringsAsFactors = FALSE)
#remove zero line
mothertable <- mothertable[which(rowSums(mothertable)!=0),]
mothertable <- t(mothertable)
mothertable <- mothertable[which(rowSums(mothertable)!=0),]
dim(mothertable)
pdf("$pdfcell")
vd <- vegdist(mothertable,method="bray")
vd.pco <- pco(vd, k=10)
pcoadata <- data.frame(vd.pco\$points[,1], vd.pco\$points[,2], cols)
EnvironmentType <- pcoadata\$cols
pc1n <- vd.pco\$eig[1]/sum(vd.pco\$eig)
pc2n <- vd.pco\$eig[2]/sum(vd.pco\$eig)
xl <- paste("Pco1(",(pc1n*10000)%/%100,"%)",sep="")
yl <- paste("Pco2(", (pc2n*10000)%/%100,"%)",sep="")
p <- ggplot(pcoadata, aes(x=pcoadata\$vd.pco.points...1., y=pcoadata\$vd.pco.points...2., color=EnvironmentType)) + geom_point(size=5,alpha=.8)
p+theme(legend.position=c(0.2,0.8), panel.border = element_blank(), panel.grid.major = element_blank(), panel.background = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), legend.background = element_rect(fill=alpha('white', 0.2)) ) + labs(fill="Samples Type") + xlab(xl) + ylab(yl)
dev.off()
RS
##Append the cell-number part and finish the script file.
print R $trsc;
close R;

##Run the generated R script. List-form system() avoids the shell, so a script path with spaces
##or shell metacharacters is passed on unchanged. A failure is only reported, not fatal, because
##all tables have already been written by this point.
system('R', 'CMD', 'BATCH', '--args', $rscript) == 0
	or warn "R CMD BATCH failed for $rscript (status $?, $!); the PCoA figures may be missing\n";
__END__

1;