[MRG] Change column names in intermediate CSVs. (#133)
* change column names

* remove old notebooks

* fix mistake
ctb authored Jan 11, 2022
1 parent 80fa463 commit 79626e5
Showing 11 changed files with 80 additions and 2,818 deletions.
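In short, the intermediate CSVs switch from acc/ncbi_tax_name columns to ident/display_name, and the code that reads or writes them is updated to match. A minimal sketch of the new access pattern (the file path and values below are hypothetical, not taken from this repository):

    import csv

    # hypothetical per-sample info CSV written by the pipeline
    with open("outputs/gather/SAMPLE.genomes.info.csv", "rt") as fp:
        for row in csv.DictReader(fp):
            ident = row["ident"]                # previously row["acc"]
            display_name = row["display_name"]  # previously row["ncbi_tax_name"]
            print(ident, display_name)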
806 changes: 0 additions & 806 deletions gather-p8808mo11.ipynb

This file was deleted.

805 changes: 0 additions & 805 deletions gather-p8808mo9.ipynb

This file was deleted.

806 changes: 0 additions & 806 deletions gather-podar.ipynb

This file was deleted.

253 changes: 0 additions & 253 deletions gathergram.ipynb

This file was deleted.

68 changes: 0 additions & 68 deletions generate-private-info.py

This file was deleted.

106 changes: 53 additions & 53 deletions genome_grist/conf/Snakefile
@@ -173,8 +173,8 @@ checkpoint gather_reads_wc:


class Checkpoint_GatherResults:
"""Given a pattern containing {acc} and {sample}, this class
will generate the list of {acc} for that {sample}'s gather results.
"""Given a pattern containing {ident} and {sample}, this class
will generate the list of {ident} for that {sample}'s gather results.
Alternatively, you can omit {sample} from the pattern and include the
list of sample names in the second argument to the constructor.
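To make the docstring concrete, here is how the two calling styles appear in rule inputs elsewhere in this Snakefile (patterns taken from hunks later in this diff; outdir and SAMPLES are defined by the workflow):

    # pattern contains both {ident} and {sample}
    Checkpoint_GatherResults(outdir + f"/mapping/{{sample}}.x.{{ident}}.consensus.fa.gz")

    # pattern contains only {ident}; sample names passed explicitly
    Checkpoint_GatherResults("genbank_genomes/{ident}_genomic.fna.gz",
                             samples=SAMPLES)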
@@ -183,20 +183,20 @@ class Checkpoint_GatherResults:
self.pattern = pattern
self.samples = samples

- def get_genome_accs(self, sample):
+ def get_genome_idents(self, sample):
gather_csv = f'{outdir}/gather/{sample}.gather.csv'
assert os.path.exists(gather_csv), "gather output does not exist!?"

- genome_accs = []
+ genome_idents = []
with open(gather_csv, 'rt') as fp:
r = csv.DictReader(fp)
for row in r:
- acc = row['name'].split(' ')[0]
- genome_accs.append(acc)
- print(f'loaded {len(genome_accs)} accessions from {gather_csv}.',
+ ident = row['name'].split(' ')[0]
+ genome_idents.append(ident)
+ print(f'loaded {len(genome_idents)} identifiers from {gather_csv}.',
file=sys.stderr)

- return genome_accs
+ return genome_idents

def __call__(self, w):
# get 'sample' from wildcards?
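As get_genome_idents shows, an identifier is simply the first whitespace-separated token of the gather CSV's 'name' column; roughly (accession and organism name chosen for illustration only):

    row = {"name": "GCF_000005845.2 Escherichia coli str. K-12"}
    ident = row["name"].split(" ")[0]   # -> "GCF_000005845.2"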
@@ -221,9 +221,9 @@ class Checkpoint_GatherResults:
checkpoints.gather_reads_wc.get(**w)

# parse hitlist_genomes,
- genome_accs = self.get_genome_accs(w.sample)
+ genome_idents = self.get_genome_idents(w.sample)

- p = expand(self.pattern, acc=genome_accs, **w)
+ p = expand(self.pattern, ident=genome_idents, **w)

return p

@@ -243,18 +243,18 @@ class ListGatherGenomes(Checkpoint_GatherResults):
private_info = {}
for filename in private_databases_info:
for row in load_csv(filename):
- acc = row['acc']
+ ident = row['ident']

genome_dir = os.path.dirname(row['genome_filename'])
row['genome_filename'] = os.path.normpath(row['genome_filename'])

genome_dir = os.path.dirname(row['genome_filename'])
- info_filename = f'{acc}.info.csv'
+ info_filename = f'{ident}.info.csv'
info_filename = os.path.join(genome_dir, info_filename)

row['info_filename'] = os.path.normpath(info_filename)

- private_info[acc] = row
+ private_info[ident] = row

print(f"Loaded info on {len(private_info)} private genomes.")
return private_info
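For reference, the private-database info CSVs consumed here are expected to carry the renamed columns as well; a sketch of how one row is turned into an info_filename (all names and paths below are hypothetical):

    import os

    row = {"ident": "PRIV_001",
           "display_name": "example private organism",
           "genome_filename": "private/PRIV_001_genomic.fna.gz"}

    genome_dir = os.path.dirname(row["genome_filename"])          # "private"
    info_filename = os.path.join(genome_dir, f"{row['ident']}.info.csv")
    # -> "private/PRIV_001.info.csv"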
@@ -273,18 +273,18 @@ class ListGatherGenomes(Checkpoint_GatherResults):
assert os.path.exists(gather_csv), f"gather output does not exist for {sample}!?"

for row in load_csv(gather_csv):
- acc = row['name'].split(' ')[0]
+ ident = row['name'].split(' ')[0]

# if in private information, use that as genome source.
- if acc in private_info:
- info = private_info[acc]
+ if ident in private_info:
+ info = private_info[ident]
genome_filenames.append(info['genome_filename'])
genome_filenames.append(info['info_filename'])

# genbank: point at genbank_genomes
else:
- genome_filenames.append(f'genbank_genomes/{acc}_genomic.fna.gz')
- genome_filenames.append(f'genbank_genomes/{acc}.info.csv')
+ genome_filenames.append(f'genbank_genomes/{ident}_genomic.fna.gz')
+ genome_filenames.append(f'genbank_genomes/{ident}.info.csv')

return genome_filenames

@@ -293,12 +293,12 @@ class Checkpoint_GenomeFiles(Checkpoint_GatherResults):
checkpoints.copy_sample_genomes_to_output_wc.get(**w)

# parse hitlist_genomes,
- genome_accs = self.get_genome_accs(w.sample)
+ genome_idents = self.get_genome_idents(w.sample)

- if 'acc' in dict(w):
+ if 'ident' in dict(w):
p = expand(self.pattern, **w)
else:
- p = expand(self.pattern, acc=genome_accs, **w)
+ p = expand(self.pattern, ident=genome_idents, **w)

return p

@@ -326,7 +326,7 @@ rule combine_genome_info:
@toplevel
rule download_genbank_genomes:
input:
Checkpoint_GatherResults("genbank_genomes/{acc}_genomic.fna.gz",
Checkpoint_GatherResults("genbank_genomes/{ident}_genomic.fna.gz",
samples=SAMPLES)

@toplevel
@@ -343,8 +343,8 @@ rule map_reads:
@toplevel
rule build_consensus:
input:
Checkpoint_GatherResults(outdir + f"/mapping/{{sample}}.x.{{acc}}.consensus.fa.gz"),
Checkpoint_GatherResults(outdir + f"/leftover/{{sample}}.x.{{acc}}.consensus.fa.gz"),
Checkpoint_GatherResults(outdir + f"/mapping/{{sample}}.x.{{ident}}.consensus.fa.gz"),
Checkpoint_GatherResults(outdir + f"/leftover/{{sample}}.x.{{ident}}.consensus.fa.gz"),

@toplevel
rule summarize_mapping:
@@ -556,10 +556,10 @@ rule count_trimmed_reads_wc:
# map abundtrim reads and produce a bam
rule minimap_wc:
input:
- query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{acc}}_genomic.fna.gz"),
+ query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{ident}}_genomic.fna.gz"),
metagenome = outdir + "/abundtrim/{sample}.abundtrim.fq.gz",
output:
- bam = outdir + "/mapping/{sample}.x.{acc}.bam",
+ bam = outdir + "/mapping/{sample}.x.{ident}.bam",
conda: "env/minimap2.yml"
threads: 4
shell: """
@@ -604,12 +604,12 @@ rule bam_covered_regions_wc:
# calculating SNPs/etc.
rule mpileup_wc:
input:
- query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{acc}}_genomic.fna.gz"),
- bam = outdir + "/{dir}/{sample}.x.{acc}.bam",
+ query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{ident}}_genomic.fna.gz"),
+ bam = outdir + "/{dir}/{sample}.x.{ident}.bam",
output:
- bcf = outdir + "/{dir}/{sample}.x.{acc}.bcf",
- vcf = outdir + "/{dir}/{sample}.x.{acc}.vcf.gz",
- vcfi = outdir + "/{dir}/{sample}.x.{acc}.vcf.gz.csi",
+ bcf = outdir + "/{dir}/{sample}.x.{ident}.bcf",
+ vcf = outdir + "/{dir}/{sample}.x.{ident}.vcf.gz",
+ vcfi = outdir + "/{dir}/{sample}.x.{ident}.vcf.gz.csi",
conda: "env/bcftools.yml"
shell: """
genomefile=$(mktemp -t grist.genome.XXXXXXX)
@@ -623,13 +623,13 @@ rule mpileup_wc:
# build new consensus
rule build_new_consensus_wc:
input:
- vcf = outdir + "/{dir}/{sample}.x.{acc}.vcf.gz",
- query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{acc}}_genomic.fna.gz"),
- regions = outdir + "/{dir}/{sample}.x.{acc}.regions.bed",
+ vcf = outdir + "/{dir}/{sample}.x.{ident}.vcf.gz",
+ query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{ident}}_genomic.fna.gz"),
+ regions = outdir + "/{dir}/{sample}.x.{ident}.regions.bed",
output:
- mask = outdir + "/{dir}/{sample}.x.{acc}.mask.bed",
- genomefile = outdir + "/{dir}/{sample}.x.{acc}.fna.gz.sizes",
- consensus = outdir + "/{dir}/{sample}.x.{acc}.consensus.fa.gz",
+ mask = outdir + "/{dir}/{sample}.x.{ident}.mask.bed",
+ genomefile = outdir + "/{dir}/{sample}.x.{ident}.fna.gz.sizes",
+ consensus = outdir + "/{dir}/{sample}.x.{ident}.consensus.fa.gz",
conda: "env/bcftools.yml"
shell: """
genomefile=$(mktemp -t grist.genome.XXXXXXX)
@@ -645,7 +645,7 @@ rule build_new_consensus_wc:
# summarize depth into a CSV
rule summarize_samtools_depth_wc:
input:
Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{acc}}.depth.txt")
Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt")
output:
csv = f"{outdir}/{{dir}}/{{sample}}.summary.csv"
shell: """
@@ -760,7 +760,7 @@ rule make_mapping_notebook_wc:
rule extract_leftover_reads_wc:
input:
csv = f'{outdir}/gather/{{sample}}.gather.csv',
- mapped = Checkpoint_GatherResults(f"{outdir}/mapping/{{sample}}.x.{{acc}}.mapped.fq.gz"),
+ mapped = Checkpoint_GatherResults(f"{outdir}/mapping/{{sample}}.x.{{ident}}.mapped.fq.gz"),
output:
touch(f"{outdir}/leftover/.leftover.{{sample}}")
conda: "env/sourmash.yml"
@@ -775,15 +775,15 @@ rule extract_leftover_reads_wc:
rule map_leftover_reads_wc:
input:
all_csv = f"{outdir}/mapping/{{sample}}.summary.csv",
- query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{acc}}_genomic.fna.gz"),
+ query = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{ident}}_genomic.fna.gz"),
leftover_reads_flag = f"{outdir}/leftover/.leftover.{{sample}}",
output:
- bam=outdir + "/leftover/{sample}.x.{acc}.bam",
+ bam=outdir + "/leftover/{sample}.x.{ident}.bam",
conda: "env/minimap2.yml"
threads: 4
shell: """
minimap2 -ax sr -t {threads} {input.query} \
- {outdir}/mapping/{wildcards.sample}.x.{wildcards.acc}.leftover.fq.gz | \
+ {outdir}/mapping/{wildcards.sample}.x.{wildcards.ident}.leftover.fq.gz | \
samtools view -b -F 4 - | samtools sort - > {output.bam}
"""

@@ -878,9 +878,9 @@ rule summarize_tax_wc:
# download genbank genome details; make an info.csv file for entry.
rule make_genbank_info_csv:
output:
- csvfile = 'genbank_genomes/{acc}.info.csv'
+ csvfile = 'genbank_genomes/{ident}.info.csv'
shell: """
- python -Werror -m genome_grist.genbank_genomes {wildcards.acc} \
+ python -Werror -m genome_grist.genbank_genomes {wildcards.ident} \
--output {output.csvfile}
"""

@@ -898,33 +898,33 @@ checkpoint copy_sample_genomes_to_output_wc:
# combined info.csv per sample
rule make_combined_info_csv_wc:
input:
- csvs = Checkpoint_GenomeFiles(f'{outdir}/genomes/{{acc}}.info.csv'),
+ csvs = Checkpoint_GenomeFiles(f'{outdir}/genomes/{{ident}}.info.csv'),
output:
genomes_info_csv = f"{outdir}/gather/{{sample}}.genomes.info.csv",
shell: """
python -Werror -m genome_grist.combine_csvs \
- --fields acc,ncbi_tax_name \
+ --fields ident,display_name \
{input.csvs} > {output}
"""

# download actual genomes!
rule download_matching_genome_wc:
input:
- csvfile = ancient('genbank_genomes/{acc}.info.csv')
+ csvfile = ancient('genbank_genomes/{ident}.info.csv')
output:
genome = "genbank_genomes/{acc}_genomic.fna.gz"
genome = "genbank_genomes/{ident}_genomic.fna.gz"
run:
with open(input.csvfile, 'rt') as infp:
r = csv.DictReader(infp)
rows = list(r)
assert len(rows) == 1
row = rows[0]
- acc = row['acc']
- assert wildcards.acc.startswith(acc)
+ ident = row['ident']
+ assert wildcards.ident.startswith(ident)
url = row['genome_url']
- name = row['ncbi_tax_name']
+ name = row['display_name']

print(f"downloading genome for acc {acc}/{name} from NCBI...",
print(f"downloading genome for ident {ident}/{name} from NCBI...",
file=sys.stderr)
with open(output.genome, 'wb') as outfp:
with urllib.request.urlopen(url) as response:
@@ -972,7 +972,7 @@ rule summarize_reads_info_wc:
rule create_sgc_conf_wc:
input:
csv = outdir + "/gather/{sample}.gather.csv",
- queries = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{acc}}_genomic.fna.gz")
+ queries = Checkpoint_GenomeFiles(f"{outdir}/genomes/{{ident}}_genomic.fna.gz")
output:
conf = outdir + "/sgc/{sample}.conf"
run:
12 changes: 6 additions & 6 deletions genome_grist/copy_private_genomes.py
@@ -19,8 +19,8 @@ def main():
args = p.parse_args()

output_fp = open(args.output_csv, 'wt')
- w = csv.DictWriter(output_fp, fieldnames=['acc',
- 'ncbi_tax_name',
+ w = csv.DictWriter(output_fp, fieldnames=['ident',
+ 'display_name',
'genome_filename'])
w.writeheader()

@@ -42,15 +42,15 @@ def main():
break

record_name = record_name.split(' ', 1)
- acc, remainder = record_name
+ ident, remainder = record_name

print(f"read identifer '{acc}' and name '{remainder}'")
print(f"read identifer '{ident}' and name '{remainder}'")

- destfile = os.path.join(args.output_directory, f"{acc}_genomic.fna.gz")
+ destfile = os.path.join(args.output_directory, f"{ident}_genomic.fna.gz")
print(f"copying '{filename}' to '{destfile}'")
shutil.copyfile(filename, destfile)

- w.writerow(dict(acc=acc, ncbi_tax_name=remainder,
+ w.writerow(dict(ident=ident, display_name=remainder,
genome_filename=destfile))
n += 1
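With these changes, a private genome whose first FASTA record were named, say, "PRIV_001 example private organism" would be copied to {output_directory}/PRIV_001_genomic.fna.gz and recorded with a row equivalent to (values hypothetical):

    dict(ident="PRIV_001", display_name="example private organism",
         genome_filename="outputs/private/PRIV_001_genomic.fna.gz")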
