
Commit

Merge pull request #24 from treangenlab/empty-fasta-bug
Distinguish between unmapped and unclassified
kdc10 authored Sep 19, 2024
2 parents 43944a8 + 7c1f51c commit 63b6e69
Showing 2 changed files with 72 additions and 67 deletions.
9 changes: 6 additions & 3 deletions README.md
@@ -97,14 +97,17 @@ Each step of the installation process is expected to take a matter of seconds.
|--output-dir| ./results| directory for output results|
|--output-basename| stem of input_file(s)| basename of all output files saved in output-dir; default utilizes basename from input file(s)|
|--keep-files| FALSE| keep working files in output-dir (alignments [.sam], reads of specified length [.fa])|
|--keep-counts| FALSE| include estimated read counts for each species in output|
|--keep-counts| FALSE| include estimated read counts for each species in output*|
|--keep-read-assignments| FALSE| output .tsv file with read assignment distributions: each row as an input read; each entry as the likelihood it is derived from that taxon (taxid is the column header); each row sums to 1|
|--output-unclassified| FALSE| generate a separate output file of unclassified sequences|
|--output-unclassified| FALSE| generate two additional sequence files of unmapped and unclassified mapped input reads**|
|--threads| 3| number of threads utilized by minimap2|

*Estimated read counts are based on likelihood probabilities and therefore may not be integer values. They are calculated as the product of estimated relative abundance and total classified reads.
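
A minimal sketch of that calculation, using made-up abundances and read totals rather than actual Emu output:

```python
# Made-up values for illustration only.
abundances = {"562": 0.613, "1613": 0.387}  # tax_id -> estimated relative abundance
mapped_reads = 1000
mapped_unclassified = 50                    # mapped, but only to species pruned by the EM step
classified_reads = mapped_reads - mapped_unclassified  # 950

estimated_counts = {tid: ab * classified_reads for tid, ab in abundances.items()}
# estimated_counts ≈ {'562': 582.35, '1613': 367.65} -- counts need not be integers
```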

**Here, "unmapped reads" are reads for which minimap2 produced no alignment to the provided database. "Unclassified mapped reads" are reads that aligned only to database sequences of species that Emu's algorithm presumes are absent from the sample (likely due to low overall abundance).
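
In set terms, the two extra output files correspond to the following (a sketch with hypothetical read IDs, mirroring the logic added to the `emu` script in this commit):

```python
# Hypothetical read IDs for illustration only.
all_reads = {"r1", "r2", "r3", "r4", "r5"}
mapped = {"r1", "r2", "r3", "r4"}           # at least one minimap2 alignment to the database
classified = {"r1", "r2", "r3"}             # assigned to a species retained by the EM algorithm

unmapped = all_reads - mapped               # {'r5'}: no database alignment at all
unclassified_mapped = mapped - classified   # {'r4'}: aligned only to species Emu presumes absent
```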

Note: If you are experiencing heavy RAM consumption, first upgrade minimap2 to at least v2.22. If memory is still an issue, try decreasing the number of secondary alignments evaluated for each read (--N).
Note: Estimated read counts are based on likelihood probabilities and therefore may not be integer values.


### Build Custom Database

130 changes: 66 additions & 64 deletions emu
@@ -77,37 +77,22 @@ def get_align_len(alignment):
return sum(alignment.get_cigar_stats()[0][cigar_op] for cigar_op in CIGAR_OPS_ALL)


def output_unclassified(sam_path, seq_output_path, input_type):
""" Gather list of input sequences that are deemed unclassified by minimap2,
and output as seq_output_path
def output_sequences(in_path, seq_output_path, input_type, keep_ids):
""" Output specified list of sequences from input_file based on sequence id
sam_path (str): path to sam file generated by minimap2
fa_output_path (str): output path for fasta of unclassified sequences
in_path (str): path to input fasta or fastq
seq_output_path (str): output path stem for the fasta/q of selected sequences
input_type (str): fasta or fastq
keep_ids (set): set of sequence id strings
"""
output_records = []
# pylint: disable=maybe-no-member
sam_pysam = pysam.AlignmentFile(sam_path)
if input_type == "fasta":
for alignment in sam_pysam.fetch():
if alignment.reference_name is None:
record = SeqRecord(
Seq(alignment.query_sequence),
id=alignment.query_name,
name=alignment.query_name,
)
output_records.append(record)
SeqIO.write(output_records, "{}.fa".format(seq_output_path), "fasta")
elif input_type == "fastq":
for alignment in sam_pysam.fetch():
if alignment.reference_name is None:
record = SeqRecord(
Seq(alignment.query_sequence),
id=alignment.query_name,
name=alignment.query_name,
letter_annotations={"phred_quality": alignment.query_qualities}
)
output_records.append(record)
SeqIO.write(output_records, "{}.fq".format(seq_output_path), "fastq")
# Filter and write only the sequences with matching read IDs
with open(in_path, "r", encoding="utf-8") as in_file, \
open("{}.{}".format(seq_output_path, input_type), "w", encoding="utf-8") as out_seq_file:
# Parse the FASTA/FASTQ file and filter by read IDs
filtered_sequences = (seq for seq in SeqIO.parse(in_file, input_type) if seq.id in keep_ids)

# Write the filtered sequences to the output file
SeqIO.write(filtered_sequences, out_seq_file, input_type)
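# Hypothetical usage of output_sequences (the path, output stem, and read IDs are
# placeholders for illustration, not values from this commit):
#     unmapped_ids = {"read_0007", "read_0042"}
#     output_sequences("sample.fastq", "results/sample_unmapped", "fastq", unmapped_ids)
# would write results/sample_unmapped.fastq containing only the two listed reads.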


def get_cigar_op_log_probabilities(sam_path):
@@ -177,11 +162,11 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z
dict_longest_align (dict[str]:(int)): dict of max alignment length for each query read
zero_locs(list(int)): list of indices (int) where probability == 0
return ({[str,int]:float}): dict[(query_name,ref_tax_id)]=log(L(query_name|ref_tax_id))
int: unassigned read count
int: assigned read count
set(str): query names of unmapped reads
set(str): query names of mapped reads
"""
# calculate log(L(read|seq)) for all alignments
log_p_rgs, unassigned_set = {}, set()
log_p_rgs, unmapped_set = {}, set()
# pylint: disable=maybe-no-member
sam_filename = pysam.AlignmentFile(sam_path, 'rb')

@@ -206,7 +191,7 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z
log_p_rgs[query_name][1][logprgs_idx] = log_score

else:
unassigned_set.add(alignment.query_name)
unmapped_set.add(alignment.query_name)
else:
for alignment in sam_filename.fetch():
align_len = get_align_len(alignment)
@@ -229,18 +214,18 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z
if log_p_rgs[query_name][1][logprgs_idx] < log_score:
log_p_rgs[query_name][1][logprgs_idx] = log_score
else:
unassigned_set.add(alignment.query_name)
unmapped_set.add(alignment.query_name)

assigned_reads = set(log_p_rgs.keys())
unassigned_reads = unassigned_set - assigned_reads
unassigned_count = len(unassigned_reads)
stdout.write(f"Unassigned read count: {unassigned_count}\n")
mapped_set = set(log_p_rgs.keys())
unmapped_set = unmapped_set - mapped_set
unmapped_count = len(unmapped_set)
stdout.write(f"Unmapped read count: {unmapped_count}\n")

## remove low likelihood alignments?
## remove if p(r|s) < 0.01
#min_p_thresh = math.log(0.01)
#log_p_rgs = {r_map: val for r_map, val in log_p_rgs.items() if val > min_p_thresh}
return log_p_rgs, unassigned_count, len(assigned_reads)
return log_p_rgs, unmapped_set, mapped_set


def expectation_maximization(log_p_rgs, freq):
@@ -305,14 +290,14 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr
"""
n_db = len(db_ids)
n_reads = len(log_p_rgs)
stdout.write("Assigned read count: {}\n".format(n_reads))
stdout.write("Mapped read count: {}\n".format(n_reads))
# check if there are enough reads
if n_reads == 0:
raise ValueError("0 reads assigned")
raise ValueError("0 reads mapped")
freq, counter = dict.fromkeys(db_ids, 1 / n_db), 1

# set output abundance threshold
freq_thresh = 1/n_reads
freq_thresh = 1/(n_reads + 1)
if n_reads > 1000:
freq_thresh = 10/n_reads
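# Illustration with hypothetical read totals: 500 mapped reads give a threshold of
# 1/(500 + 1) ≈ 0.002, while 5,000 mapped reads give 10/5000 = 0.002.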

@@ -375,28 +360,32 @@ def lineage_dict_from_tid(taxid, nodes_dict, names_dict):
return tuple(lineage_list)


def freq_to_lineage_df(freq, tsv_output_path, taxonomy_df, assigned_count,
unassigned_count, counts=False):
def freq_to_lineage_df(freq, tsv_output_path, taxonomy_df, mapped_count,
unmapped_count, mapped_unclassified_count, counts=False):
"""Converts freq to a pandas df where each row contains abundance and tax lineage for
classified species in f.keys(). Stores df as .tsv file in tsv_output_path.
freq{int:float}: dict[species_tax_id]:estimated likelihood species is present in sample
tsv_output_path(str): path to output .tsv file
taxonomy_df(df): pandas df of all db sequence taxonomy with index 'tax_id'
assigned_count(int): number of assigned reads
unassigned_count(int): number of unassigned reads
mapped_count(int): number of mapped reads
unmapped_count(int): number of unmapped reads
mapped_unclassified_count(int): number of reads that mapped but were left unclassified
because they aligned only to low-abundance species
counts(boolean): True if include estimated counts in output .tsv file
returns(df): pandas df with lineage and abundances for values in f
"""
#add tax lineage for values in freq
results_df = pd.DataFrame(zip(list(freq.keys()) + ['unassigned'],
list(freq.values()) + [0]),
results_df = pd.DataFrame(zip(list(freq.keys()) + ['unmapped', 'mapped_unclassified'],
list(freq.values()) + [0, 0]),
columns=["tax_id", "abundance"]).set_index('tax_id')
results_df = results_df.join(taxonomy_df, how='left').reset_index()
#add in the estimated count values for the assigned and unassigned counts
#add in the estimated count values for the mapped and unmapped counts
if counts:
counts_series = pd.concat([(results_df["abundance"] * assigned_count)[:-1],
pd.Series(unassigned_count)], ignore_index=True)
classified_count = mapped_count - mapped_unclassified_count
counts_series = pd.concat([(results_df["abundance"] * classified_count)[:-2],
pd.Series(unmapped_count), pd.Series(mapped_unclassified_count)],
ignore_index=True)
results_df["estimated counts"] = counts_series
results_df.to_csv("{}.tsv".format(tsv_output_path), sep='\t', index=False)
return results_df
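# Illustration with hypothetical inputs: freq = {"562": 0.613, "1613": 0.387},
# mapped_count = 1000, unmapped_count = 20, mapped_unclassified_count = 50, counts = True.
# Then classified_count = 1000 - 50 = 950 and the written .tsv contains (lineage columns omitted):
#   tax_id                abundance   estimated counts
#   562                   0.613       582.35
#   1613                  0.387       367.65
#   unmapped              0           20
#   mapped_unclassified   0           50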
@@ -666,11 +655,15 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False):
df_sample = pd.read_csv(os.path.join(dir_path, file), sep='\t', dtype=str)
df_sample[[metric]] = df_sample[[metric]].apply(pd.to_numeric)
if rank in df_sample.columns and metric in df_sample.columns:
keep_ranks_sample = [value for value in keep_ranks if value in set(df_sample.columns)] #check which keep_ranks are in df_sample
if df_sample.at[len(df_sample.index)-1, 'tax_id'] == 'unassigned':
df_sample.at[len(df_sample.index)-1, rank] = 'unassigned'
df_sample_reduced = df_sample[keep_ranks_sample + [metric]].rename(columns={metric: name})
df_sample_reduced = df_sample_reduced.groupby(keep_ranks_sample, dropna=False).sum().reset_index() #sum metric within df_sample_reduced if same tax lineage
#check which keep_ranks are in df_sample
keep_ranks_sample = [value for value in keep_ranks
if value in set(df_sample.columns)]
if df_sample.at[len(df_sample.index)-1, 'tax_id'] == 'unmapped':
df_sample.at[len(df_sample.index)-1, rank] = 'unmapped'
df_sample_reduced = df_sample[keep_ranks_sample +
[metric]].rename(columns={metric: name})
df_sample_reduced = df_sample_reduced.groupby(keep_ranks_sample, dropna=False)\
.sum().reset_index() #sum metric within df_sample_reduced if same tax lineage
df_sample_reduced = df_sample_reduced.astype(object)
df_sample_reduced[[name]] = df_sample_reduced[[name]].apply(pd.to_numeric)
df_combined_full = pd.merge(df_combined_full, df_sample_reduced, how='outer')
@@ -679,7 +672,8 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False):
if count_table:
filename_suffix = "-counts"
if split_files:
abundance_out_path = os.path.join(dir_path, "emu-combined-abundance-{}{}.tsv".format(rank, filename_suffix))
abundance_out_path = os.path.join(dir_path, "emu-combined-abundance-{}{}.tsv"
.format(rank, filename_suffix))
tax_out_path = os.path.join(dir_path, "emu-combined-taxonomy-{}.tsv".format(rank))
stdout.write("Combined taxonomy table generated: {}\n".format(tax_out_path))
df_combined_full[keep_ranks].to_csv(tax_out_path, sep='\t', index=False)
@@ -693,7 +687,7 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False):
return df_combined_full

if __name__ == "__main__":
__version__ = "3.4.6"
__version__ = "3.5.0"
parser = argparse.ArgumentParser()
parser.add_argument('--version', '-v', action='version', version='%(prog)s v' + __version__)
subparsers = parser.add_subparsers(dest="subparser_name", help='sub-commands')
@@ -772,7 +766,7 @@ if __name__ == "__main__":
help='collapsed taxonomic rank')

combine_parser = subparsers.add_parser("combine-outputs",
help="Combine Emu rel abundance outputs to a single table")
help="Combine Emu rel abundance outputs to a single table")
combine_parser.add_argument(
'dir_path', type=str,
help='path to directory containing Emu output files')
@@ -810,13 +804,17 @@ if __name__ == "__main__":
SAM_FILE = generate_alignments(args.input_file, out_file, args.db)
log_prob_cigar_op, locs_p_cigar_zero, longest_align_dict = \
get_cigar_op_log_probabilities(SAM_FILE)
log_prob_rgs, counts_unassigned, counts_assigned = log_prob_rgs_dict(
log_prob_rgs, set_unmapped, set_mapped = log_prob_rgs_dict(
SAM_FILE, log_prob_cigar_op, longest_align_dict, locs_p_cigar_zero)
f_full, f_set_thresh, read_dist = expectation_maximization_iterations(log_prob_rgs,
db_species_tids,
.01, args.min_abundance)
classified_reads = {read_id for inner_dict in read_dist.values() for read_id in inner_dict}
mapped_unclassified = set_mapped - classified_reads
stdout.write(f"Unclassified mapped read count: {len(mapped_unclassified)}\n")
freq_to_lineage_df(f_full, "{}_rel-abundance".format(out_file), df_taxonomy,
counts_assigned, counts_unassigned, args.keep_counts)
len(set_mapped), len(set_unmapped), len(mapped_unclassified),
args.keep_counts)


# output read assignment distributions as a tsv
@@ -828,15 +826,19 @@ if __name__ == "__main__":
freq_to_lineage_df(
f_set_thresh,
"{}_rel-abundance-threshold-{}".format(out_file, args.min_abundance),
df_taxonomy, counts_assigned, counts_unassigned, args.keep_counts)
df_taxonomy, len(set_mapped), len(set_unmapped), len(mapped_unclassified),
args.keep_counts)

# gather input sequences that are unclassified according to minimap2
# gather input sequences that are unmapped or mapped but unclassified
if args.output_unclassified:
ext = os.path.splitext(args.input_file[0])[-1]
INPUT_FILETYPE = "fasta"
if ext in [".fastq", ".fq"]:
INPUT_FILETYPE = "fastq"
output_unclassified(SAM_FILE, "{}_unclassified".format(out_file), INPUT_FILETYPE)
output_sequences(args.input_file[0], "{}_unmapped".format(out_file), INPUT_FILETYPE,
set_unmapped)
output_sequences(args.input_file[0], "{}_unclassified_mapped".format(out_file),
INPUT_FILETYPE, mapped_unclassified)

# clean up extra file
if not args.keep_files:
