Merge pull request #25 from Illumina/develop

Release 1.3.3
Illumina · Oct 28, 2020 · cb3ecbf · cb3ecbf
2 parents 8117799 + dc4eb37
commit cb3ecbf
Show file tree

Hide file tree

Showing 13 changed files with 568 additions and 361 deletions.
diff --git a/.gitignore b/.gitignore
@@ -55,3 +55,5 @@ docs/_build/
 
 # PyBuilder
 target/
+
+.vscode/
diff --git a/README.md b/README.md
@@ -8,6 +8,8 @@ If you have intensity data files (IDATs) for which GTC files are not currentty a
 & "C:\Program Files\Illumina\AutoConvert 2.0\AutoConvert.exe" path_to_idat_folder path_to_output_folder manifest_file egt_file
 ```
 
+You can also use a newer, cross-platform CLI (Win10, OSX, Linux) to achieve the same functionality: https://support.illumina.com/downloads/iaap-genotyping-cli.html
+
 ## Build instructions
 The IlluminaBeadArrayFiles repository supports building a package with the python distutils module (https://docs.python.org/2/distutils/setupscript.html). To build a source distribution, run the included setup.py script supplying the "sdist" command
 
@@ -23,16 +25,21 @@ The library depends on the availability of the numpy package in the python insta
 
 ## Example usage
 
-> from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype  
-> import sys  
-> gtc_file = "path_to_genotypes.gtc"  
-> manifest_file = "path_to_manifest.bpm"  
-> names = BeadPoolManifest( manifest_file ).names  
-> genotypes = GenotypeCalls( gtc_file ).get_genotypes()     
-> for (locus, genotype) in zip( names, genotypes ):  
+> from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype
+> import sys
+> gtc_file = "path_to_genotypes.gtc"
+> manifest_file = "path_to_manifest.bpm"
+> names = BeadPoolManifest( manifest_file ).names
+> genotypes = GenotypeCalls( gtc_file ).get_genotypes()
+> for (locus, genotype) in zip( names, genotypes ):
 > &nbsp;&nbsp;sys.stdout.write( locus + "," + code2genotype[genotype] + "\n" )
 
-Also, see examples/gtc_final_report.py for an additional example of usage. 
+Also, see examples/* for additional examples of usage.
+These scripts are based on common Genome Studio (https://support.illumina.com/array/array_software/genomestudio.html) reports.
+
+**NOTE:**
+For the DNA summary report, it does not exclude zeroed loci from the cluster file as it does in Genome Studio, so overall call rates may look lower.
+There is an open issue to address this: https://github.com/Illumina/BeadArrayFiles/issues/22
 
 ## GTC File Format
 The specification for the GTC file format is provided in docs/GTC_File_Format_v5.pdf
@@ -56,7 +63,7 @@ Class to encapsulate a normalization transform for conversion of raw intensity d
 Class to encapsulate the meta-data collected in the GTC file for a scanner instrument
 
 ### BeadPoolManifest
-Class for parsing a binary (BPM) manifest file. This class can be used to recover the in-order list of loci names in a given manifest, which is necessary to associate the genotype information present in the GTC file to a specific locus name. 
+Class for parsing a binary (BPM) manifest file. This class can be used to recover the in-order list of loci names in a given manifest, which is necessary to associate the genotype information present in the GTC file to a specific locus name.
 
 ### RefStrand, SourceStrand
 Represents different strand conventions in manifest
@@ -71,11 +78,11 @@ Aggregate information across many samples for a given loci
 Read information from a binary EGT file
 
 ## Compatibility
-This library is compatible with Python 2.7 and was tested with Python 2.7.11
+This library is compatible with Python 3 and was tested with Python 3.8.5
 
 ## License
 
->Copyright (c) 2017, Illumina
+>Copyright (c) 2020, Illumina
 > All rights reserved.
 >
 > Redistribution and use in source and binary forms, with or without
@@ -102,6 +109,3 @@ This library is compatible with Python 2.7 and was tested with Python 2.7.11
 >of the authors and should not be interpreted as representing official policies,
 >either expressed or implied, of the FreeBSD Project.
 
-
-
-
diff --git a/change_log.txt b/change_log.txt
@@ -1,25 +1,29 @@
-1.3.2
-  -Address missing import of functions
-
-1.3.1
-  -Support for version 3 GTC files during locus aggregation
-
-1.3.0
-  -Added new function to assist with aggregation of information across samples for loci (see LocusAggregate)
-  -Added new class to parse binary EGT files
-  -Added example script to generate text report similar to "Locus Summary" report from GenomeStudio
-
-1.2.0
-  -Added function to verify GTC file is complete
-  -Automatic check for GTC file completeness in GenotypeCalls constructor
-  -Switch to numpy data types for floating point calculations (e.g., normalized intensities) for
-    improved consistency with Genome Studio final reports. 
-
-1.1.1
-  -Address issue reading manifest lacking reference strand annotation
-
-1.1.0
-  -Added ability to parse customer and reference strand from manifest
-  -Added ability to generate forward and plus strand nucleotide calls from genotypes
-
+1.3.3
+  -Updated to python3
+  -Added GenomeStudio-style dna_report to examples
+
+1.3.2
+  -Address missing import of functions
+
+1.3.1
+  -Support for version 3 GTC files during locus aggregation
+
+1.3.0
+  -Added new function to assist with aggregation of information across samples for loci (see LocusAggregate)
+  -Added new class to parse binary EGT files
+  -Added example script to generate text report similar to "Locus Summary" report from GenomeStudio
+
+1.2.0
+  -Added function to verify GTC file is complete
+  -Automatic check for GTC file completeness in GenotypeCalls constructor
+  -Switch to numpy data types for floating point calculations (e.g., normalized intensities) for
+    improved consistency with Genome Studio final reports.
+
+1.1.1
+  -Address issue reading manifest lacking reference strand annotation
+
+1.1.0
+  -Added ability to parse customer and reference strand from manifest
+  -Added ability to generate forward and plus strand nucleotide calls from genotypes
+
 1.0.0
diff --git a/examples/dna_summary.py b/examples/dna_summary.py
@@ -0,0 +1,179 @@
+import sys
+import os
+import argparse
+import logging
+from math import isnan
+from numpy import percentile
+from scipy.stats import norm
+
+from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, RefStrand
+
+nan = float("nan")
+
+
+def compute_genotypes(genotypes):
+    """
+    Summarize information about genotype counts for diploid genotyping counting
+
+    Returns:
+        list: genotype counts and frequencies
+    """
+    # Count genotypes
+    no_calls = genotypes.count(0)
+    aa_count = genotypes.count(1)
+    ab_count = genotypes.count(2)
+    bb_count = genotypes.count(3)
+
+    # Get the number of calls (i.e., not no-calls)
+    num_calls = aa_count + ab_count + bb_count
+
+    # Get the call rate
+    if (num_calls + no_calls) > 0: 
+        call_rate = num_calls / float(num_calls + no_calls)
+    else:
+        call_rate = nan
+
+    # Genotype frequencies (as fraction of all calls)
+    if num_calls > 0:
+        # Frequency of AA genotype
+        aa_freq = round(aa_count / float(num_calls),4)
+        # Frequency of AB genotype
+        ab_freq = round(ab_count / float(num_calls),4)
+        # Frequency of BB genotype
+        bb_freq = round(bb_count / float(num_calls),4)
+    else:
+        aa_freq = nan
+        ab_freq = nan
+        bb_freq = nan
+
+    # Computes and return the minor allele frequency. If no calls, will be NaN
+    a_allele_count = aa_count * 2 + ab_count
+    if num_calls > 0:
+        a_frequency = a_allele_count / float(2 * num_calls)
+        minor_freq = round(min(a_frequency, 1.0 - a_frequency),4)
+    else:
+        minor_freq = nan
+    return [no_calls, num_calls, call_rate, aa_freq, ab_freq, bb_freq, minor_freq]
+
+
+
+def ScoreStatistics(scores, percentile):
+    """
+    Capture statistics related to the gencall score distribution
+
+    Args:
+        scores (list(float)): A list of gencall scores
+        percentile (int): percentile to calculate
+            gc_10 : 10th percentile of Gencall score distribution
+            gc_50 : 50th percentile of Gencall score distribution
+        
+    Returns:
+        float
+    """
+
+    num_scores = len(scores)
+    if num_scores == 0:
+        return nan
+
+    idx = int(num_scores*percentile/100)
+    fractional_index = num_scores*percentile/100.0 - idx
+    if fractional_index < 0.5 :
+        idx -= 1
+
+    if idx < 0:
+        return scores[0]
+
+    if idx >= num_scores - 1:
+        return scores[-1]
+
+    x1 = 100 * (idx + 0.5)/float(num_scores)
+    x2 = 100 * (idx + 1 + 0.5)/float(num_scores)
+    y1 = float(scores[idx])
+    y2 = float(scores[idx+1])
+    score_stat = y1 + (y2 - y1) / (x2 - x1) * (percentile - x1)
+    score_stat = round(score_stat,4)
+    return score_stat
+
+
+
+
+def get_logger():
+    # set up log file
+    # create logger
+    logger = logging.getLogger('DNA Report')
+    logger.setLevel(logging.DEBUG)
+
+    # create console handler and set level to debug
+    handler = logging.StreamHandler()
+    handler.setLevel(logging.INFO)
+
+    # create formatter
+    formatter = logging.Formatter(
+        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    # add formatter to ch
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    return logger
+
+
+def driver(gtc_dir, manifest_filename, output_filename, project_name, delim, logger):
+    logger.info("Reading manifest file")
+    bpm = BeadPoolManifest(manifest_filename)
+
+    samples = []
+    logger.info("Initializing genotype data")
+    gtc_files = []
+    for gtc_file in os.listdir(gtc_dir):
+        if gtc_file.endswith(".gtc"):
+            gtc_files.append(os.path.join(gtc_dir, gtc_file))
+
+
+
+
+    logger.info("Generating report")
+    loci = range(len(bpm.normalization_lookups))
+    with open(output_filename, "w") as output_handle:
+        output_handle.write("DNA Report on " + os.path.abspath(output_filename) + "\n")
+        header = [""]
+        header.append("# LOCI = {}".format(len(loci)))
+        header.append("# DNAs = {}".format(len(samples)))
+        header.append("ProjectName = {}".format(project_name))
+        header.append("GenCall Version = NaN")
+        header.append("Low GenCall Score Cutoff = NaN")
+
+        output_handle.write(delim.join(header) + "\n")
+        output_handle.write(delim.join("Row,DNA_ID,#No_Calls,#Calls,Call_Rate,A/A_Freq,A/B_Freq,B/B_Freq,Minor_Freq,50%_GC_Score,10%_GC_Score".split(",")) + "\n")
+        row = 0
+        for gtc_file in gtc_files:
+            row += 1
+            gtc = GenotypeCalls(gtc_file)
+            genotypes = gtc.get_genotypes()
+            scores = gtc.get_genotype_scores()
+            assert len(genotypes) == len(bpm.names)
+            row_data = []
+            row_data.append(row)
+            row_data.append(gtc.get_sample_name())
+            row_data += compute_genotypes(genotypes)
+            row_data.append(ScoreStatistics(scores, 50))
+            row_data.append(ScoreStatistics(scores, 10))
+            output_handle.write(delim.join(map(str, row_data)) + "\n")
+        logger.info("Report generation complete")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        "Generate a DNA report from a directory of GTC files")
+
+    parser.add_argument("gtc_directory", help="Directory containing GTC files")
+    parser.add_argument("manifest_file", help="BPM manifest file")
+    parser.add_argument("output_file", help="Location to write report")
+    parser.add_argument("-p", "--project-name", dest="project_name",
+                        default="Project", help="A project name to report in the output header")
+
+    args = parser.parse_args()
+    logger = get_logger()
+    driver(args.gtc_directory, args.manifest_file, args.output_file, args.project_name, ",", logger)
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/gtc_final_report.py b/examples/gtc_final_report.py
@@ -39,15 +39,16 @@
     output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n")
 
     output_handle.write("[Data]\n")
-    output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward"]) + "\n")
+    output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward", "X",	"Y"]) + "\n")
     for gtc_file in samples:
         sys.stderr.write("Processing " + gtc_file + "\n")
         gtc_file = os.path.join(args.gtc_directory, gtc_file)
         gtc = GenotypeCalls(gtc_file)
         genotypes = gtc.get_genotypes()
         plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands)
         forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands)
+        normalized_intensities = gtc.get_normalized_intensities(manifest.normalization_lookups)
 
         assert len(genotypes) == len(manifest.names)
-        for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes):
-            output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype]) + "\n")
+        for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype, (x_norm, y_norm)) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes, normalized_intensities):
+            output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype, str(x_norm), str(y_norm)]) + "\n")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -55,3 +55,5 @@ docs/_build/

		# PyBuilder
		target/

		.vscode/