From ce9a9c6adc7002f4acc90da46abfb5ad145f72ac Mon Sep 17 00:00:00 2001
From: Daniel Portik
Date: Mon, 30 Jan 2023 11:28:28 -0800
Subject: [PATCH] Move scripts

---
 HiFi-MAG-Pipeline/scripts/Checkm-Plot.py      | 138 ++++++++++++++++++
 .../scripts/Concoct-organize-outputs.py       |  38 +++++
 .../scripts/Make-maxbin-depths.py             |  59 ++++++++
 .../scripts/Maxbin2-organize-outputs.py       |  53 +++++++
 4 files changed, 288 insertions(+)
 create mode 100644 HiFi-MAG-Pipeline/scripts/Checkm-Plot.py
 create mode 100644 HiFi-MAG-Pipeline/scripts/Concoct-organize-outputs.py
 create mode 100644 HiFi-MAG-Pipeline/scripts/Make-maxbin-depths.py
 create mode 100644 HiFi-MAG-Pipeline/scripts/Maxbin2-organize-outputs.py

diff --git a/HiFi-MAG-Pipeline/scripts/Checkm-Plot.py b/HiFi-MAG-Pipeline/scripts/Checkm-Plot.py
new file mode 100644
index 0000000..4376ff0
--- /dev/null
+++ b/HiFi-MAG-Pipeline/scripts/Checkm-Plot.py
@@ -0,0 +1,138 @@
+import argparse
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+
+def get_args():
+    """
+    Get arguments from command line with argparse.
+    """
+    parser = argparse.ArgumentParser(
+        prog='Checkm-Plot.py',
+        description="""Plot bin information from CheckM.""")
+
+    parser.add_argument("-i", "--input",
+                        required=True,
+                        help="The o2 format summary file from CheckM.")
+    parser.add_argument("-l", "--label",
+                        required=True,
+                        help="A label for the plot.")
+    parser.add_argument("-c", "--completeness",
+                        required=True,
+                        type=int,
+                        help="Completeness threshold.")
+    parser.add_argument("-m", "--contamination",
+                        required=True,
+                        type=int,
+                        help="Contamination threshold.")
+    parser.add_argument("-g", "--contigs",
+                        required=True,
+                        type=int,
+                        help="Contigs threshold.")
+    parser.add_argument("-o1", "--output1",
+                        required=True,
+                        help="Output plot: joint scatter of all bins.")
+    parser.add_argument("-o2", "--output2",
+                        required=True,
+                        help="Output plot: all bins labeled with contig counts.")
+    parser.add_argument("-o3", "--output3",
+                        required=True,
+                        help="Output plot: all bins labeled by bin name.")
+    parser.add_argument("-o4", "--output4",
+                        required=True,
+                        help="Output plot: filtered bins.")
+    parser.add_argument("-o5", "--output5",
+                        required=True,
+                        help="Output plot: filtered bins labeled with contig counts.")
+    parser.add_argument("-o6", "--output6",
+                        required=True,
+                        help="Output plot: filtered bins labeled by bin name.")
+
+    return parser.parse_args()
+
+
+def create_unfiltered_joint_scatter(df, output):
+    """Save a completeness vs. contamination joint plot of every bin to output."""
+    sns.jointplot(data=df, x='Completeness', y='Contamination', s=50, alpha=0.7, xlim=(-2,102), ylim=(-2,102))
+    plt.savefig("{}".format(output))
+    plt.close()
+
+def create_unfiltered_scatter_bins(df, label, output, completeness, contamination):
+    """Save a scatter plot of every bin, annotating each point with its bin name."""
+    ax = df.plot.scatter(x='Completeness', y='Contamination', s=40, alpha=0.9, xlim=(-2,102), ylim=(-2,102))
+    for i, txt in enumerate(df['Bin Id']):
+        ax.annotate(txt, (df['Completeness'].iat[i], df['Contamination'].iat[i]),
+                    xytext=(df['Completeness'].iat[i] + 0.15, df['Contamination'].iat[i] + 0.15),
+                    fontsize=5)
+    plt.xlabel('Genome Completeness')
+    plt.ylabel('Genome Contamination')
+    plt.title("Unfiltered Genome Bins: {}\n(labeled by bin name)".format(label))
+    plt.savefig("{}".format(output))
+    plt.close()
+
+def create_unfiltered_scatter_contigs(df, label, output, completeness, contamination):
+    """Save a scatter plot of every bin, annotating each point with its contig count."""
+    ax = df.plot.scatter(x='Completeness', y='Contamination', s=40, alpha=0.9, xlim=(-2,102), ylim=(-2,102))
+    for i, txt in enumerate(df['# contigs']):
+        ax.annotate(txt, (df['Completeness'].iat[i], df['Contamination'].iat[i]),
+                    xytext=(df['Completeness'].iat[i] + 0.15, df['Contamination'].iat[i] + 0.15),
+                    fontsize=5)
+    plt.xlabel('Genome Completeness')
+    plt.ylabel('Genome Contamination')
+    plt.title("Unfiltered Genome Bins: {}\n(labeled with numbers of contigs in bins)".format(label))
+    plt.savefig("{}".format(output))
+    plt.close()
+
+def create_filtered_scatter(df, label, output, completeness, contamination):
+    """Save an unlabeled scatter plot of df with axes cropped to the two thresholds."""
+    df.plot.scatter(x='Completeness', y='Contamination', s=40, alpha=0.9, xlim=((completeness-2),102), ylim=(-0.5,(contamination+0.5)))
+    plt.xlabel('Genome Completeness')
+    plt.ylabel('Genome Contamination')
+    plt.title("Filtered Genome Bins: {}".format(label))
+    plt.savefig("{}".format(output))
+    plt.close()
+
+def create_filtered_scatter_contigs(df, label, output, completeness, contamination):
+    """Save a threshold-cropped scatter plot of df, annotated with contig counts."""
+    ax = df.plot.scatter(x='Completeness', y='Contamination', s=40, alpha=0.9, xlim=((completeness-2),102), ylim=(-0.5,(contamination+0.5)))
+    for i, txt in enumerate(df['# contigs']):
+        ax.annotate(txt, (df['Completeness'].iat[i], df['Contamination'].iat[i]),
+                    xytext=(df['Completeness'].iat[i] + 0.1, df['Contamination'].iat[i] + 0.15),
+                    fontsize=6)
+    plt.xlabel('Genome Completeness')
+    plt.ylabel('Genome Contamination')
+    plt.title("Filtered Genome Bins: {}\n(labeled with numbers of contigs in bins)".format(label))
+    plt.savefig("{}".format(output))
+    plt.close()
+
+def create_filtered_scatter_bins(df, label, output, completeness, contamination):
+    """Save a threshold-cropped scatter plot of df, annotated with bin names."""
+    ax = df.plot.scatter(x='Completeness', y='Contamination', s=40, alpha=0.9, xlim=((completeness-2),102), ylim=(-0.5,(contamination+0.5)))
+    for i, txt in enumerate(df['Bin Id']):
+        ax.annotate(txt, (df['Completeness'].iat[i], df['Contamination'].iat[i]),
+                    xytext=(df['Completeness'].iat[i] + 0.1, df['Contamination'].iat[i] + 0.12),
+                    fontsize=5)
+    plt.xlabel('Genome Completeness')
+    plt.ylabel('Genome Contamination')
+    plt.title("Filtered Genome Bins: {}\n(labeled by bin name)".format(label))
+    plt.savefig("{}".format(output))
+    plt.close()
+
+
+def main():
+    """Read the summary table, filter bins by the three thresholds, and write six plots."""
+    args = get_args()
+    df = pd.read_csv(args.input, sep='\t')
+    scatfilt = (df['Completeness'] >= args.completeness) & (df['Contamination'] <= args.contamination) & (df['# contigs'] < args.contigs)
+    filt = df[scatfilt]
+    create_unfiltered_joint_scatter(df, args.output1)
+    create_unfiltered_scatter_contigs(df, args.label, args.output2, args.completeness, args.contamination)
+    create_unfiltered_scatter_bins(df, args.label, args.output3, args.completeness, args.contamination)
+    create_filtered_scatter(filt, args.label, args.output4, args.completeness, args.contamination)
+    create_filtered_scatter_contigs(filt, args.label, args.output5, args.completeness, args.contamination)
+    create_filtered_scatter_bins(filt, args.label, args.output6, args.completeness, args.contamination)
+
+if __name__ == '__main__':
+    main()
+
diff --git a/HiFi-MAG-Pipeline/scripts/Concoct-organize-outputs.py b/HiFi-MAG-Pipeline/scripts/Concoct-organize-outputs.py
new file mode 100644
index 0000000..5ff4f42
--- /dev/null
+++ b/HiFi-MAG-Pipeline/scripts/Concoct-organize-outputs.py
@@ -0,0 +1,38 @@
+import argparse
+import os
+import shutil
+
+def get_args():
+    """
+    Get arguments from command line with argparse.
+    """
+    parser = argparse.ArgumentParser(
+        prog='Concoct-organize-outputs.py',
+        description="""Relabel bin files from concoct.""")
+    parser.add_argument("-i", "--indir",
+                        required=True,
+                        help="Directory containing the concoct bin files.")
+    return parser.parse_args()
+
+def relabel_outputs(indir):
+    """Rename every NAME.fa file inside indir to concoct.NAME.fa (changes the cwd to indir)."""
+    os.chdir(indir)
+    # Skip bins that already carry the prefix so a re-run cannot collapse
+    # them all into a single 'concoct.concoct.fa' file.
+    fasta_files = [f for f in os.listdir('.')
+                   if f.endswith('.fa') and not f.startswith('concoct.')]
+    for f in fasta_files:
+        outname = "concoct.{}.{}".format(f.split('.')[0], f.split('.')[-1])
+        print("Relabeling file: {}".format(f))
+        print("\t{}".format(outname))
+        # The working directory is already indir, so move by bare file name;
+        # re-joining a relative indir here would point at indir/indir.
+        shutil.move(f, outname)
+
+def main():
+    args = get_args()
+    relabel_outputs(args.indir)
+    print("Finished.")
+
+if __name__ == '__main__':
+    main()
diff --git a/HiFi-MAG-Pipeline/scripts/Make-maxbin-depths.py b/HiFi-MAG-Pipeline/scripts/Make-maxbin-depths.py
new file mode 100644
index 0000000..a20888d
--- /dev/null
+++ b/HiFi-MAG-Pipeline/scripts/Make-maxbin-depths.py
@@ -0,0 +1,59 @@
+import argparse
+import os
+
+def get_args():
+    """
+    Get arguments from command line with argparse.
+    """
+    parser = argparse.ArgumentParser(
+        prog='Make-maxbin-depths.py',
+        description="""Filter JGI depths for maxbin.""")
+
+    parser.add_argument("-i", "--infile",
+                        required=True,
+                        help="Path to JGI depth file.")
+    parser.add_argument("-o", "--outfile",
+                        required=True,
+                        help="Name of output depth file.")
+    return parser.parse_args()
+
+def write_new_depth_file(input_depth_file, output_depth_file):
+    """
+    Write the first and third tab-separated columns of the depth file.
+
+    Example input (tab-delimited; the contigName header row is skipped):
+
+    contigName    contigLen    totalAvgDepth    sludge.bam    sludge.bam-var
+    s0.ctg000001c    4.13709e+06    61.7612    61.7612    103.936
+    s1.ctg000002c    2.23849e+06    14.3086    14.3086    33.5774
+    s2.ctg000003l    40246    10.6046    10.6046    38.4902
+    s3.ctg000004l    461151    4.44724    4.44724    3.09768
+    s4.ctg000005l    516089    30.5501    30.5501    65.15
+    s5.ctg000006l    1.6986e+06    8.46604    8.46604    13.8984
+    s2.ctg000007l    24711    0.643378    0.643378    0.229328
+    s6.ctg000008l    31810    35.0132    35.0132    468.885
+    s7.ctg000009c    3.24454e+06    200.377    200.377    1599.71
+    """
+    if os.path.exists(output_depth_file):
+        print("Removing existing version of file.")
+        os.remove(output_depth_file)
+
+    with open(input_depth_file, 'r') as fh_in, open(output_depth_file, 'a') as fh_out:
+        kept_count = 0
+        for line in fh_in:
+            # Skip the header row and blank lines (a blank line has no
+            # third column and would raise IndexError).
+            if line.startswith("contigName") or not line.strip():
+                continue
+            fields = line.rstrip('\r\n').split('\t')
+            fh_out.write("{}\t{}\n".format(fields[0], fields[2]))
+            kept_count += 1
+    print("Parsed {:,} contigs".format(kept_count))
+
+def main():
+    args = get_args()
+    write_new_depth_file(args.infile, args.outfile)
+
+if __name__ == '__main__':
+    main()
+
diff --git a/HiFi-MAG-Pipeline/scripts/Maxbin2-organize-outputs.py b/HiFi-MAG-Pipeline/scripts/Maxbin2-organize-outputs.py
new file mode 100644
index 0000000..5740c76
--- /dev/null
+++ b/HiFi-MAG-Pipeline/scripts/Maxbin2-organize-outputs.py
@@ -0,0 +1,53 @@
+import argparse
+import os
+import shutil
+
+def get_args():
+    """
+    Get arguments from command line with argparse.
+    """
+    parser = argparse.ArgumentParser(
+        prog='Maxbin2-organize-outputs.py',
+        description="""Organize output files from maxbin2.""")
+    parser.add_argument("-s", "--sample",
+                        required=True,
+                        help="The sample name.")
+    parser.add_argument("-o", "--outdir",
+                        required=True,
+                        help="Name of output directory.")
+    return parser.parse_args()
+
+def make_outdir(outdir):
+    """Join outdir onto the current directory, create it if missing, and return the path."""
+    fulldir = os.path.join(os.getcwd(), outdir)
+    if not os.path.exists(fulldir):
+        os.mkdir(fulldir)
+        print("Created directory: {}".format(fulldir))
+    return fulldir
+
+def move_outputs(sample, fulldir):
+    """Move files in the cwd that start with sample into fulldir, renaming .fasta bins."""
+    target_files = [f for f in os.listdir('.') if f.startswith(sample)
+                    and f.endswith(('.seed', '.log', '.marker', '.tar.gz', '.noclass', '.summary', '.tooshort'))]
+    for f in target_files:
+        print("\tMoving file: {}".format(f))
+        shutil.move(f, fulldir)
+
+    fasta_files = [f for f in os.listdir('.') if f.startswith(sample)
+                   and f.endswith('.fasta')]
+    for f in fasta_files:
+        # The bin label is the field just before the extension; indexing from
+        # the end keeps this correct when the sample name itself contains dots.
+        outname = "maxbin.{}.{}".format(f.split('.')[-2], f.split('.')[-1])
+        print("Relabeling file: {}".format(f))
+        print("\t{}".format(outname))
+        shutil.move(f, os.path.join(fulldir, outname))
+
+def main():
+    args = get_args()
+    fulldir = make_outdir(args.outdir)
+    move_outputs(args.sample, fulldir)
+    print("Finished.")
+
+if __name__ == '__main__':
+    main()