diff --git a/.gitignore b/.gitignore index ba74660..b9e72b1 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,5 @@ docs/_build/ # PyBuilder target/ + +.vscode/ \ No newline at end of file diff --git a/README.md b/README.md index c9246bf..d465d3c 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ If you have intensity data files (IDATs) for which GTC files are not currentty a & "C:\Program Files\Illumina\AutoConvert 2.0\AutoConvert.exe" path_to_idat_folder path_to_output_folder manifest_file egt_file ``` +You can also use a newer, cross-platform CLI (Win10, OSX, Linux) to achieve the same functionality: https://support.illumina.com/downloads/iaap-genotyping-cli.html + ## Build instructions The IlluminaBeadArrayFiles repository supports building a package with the python distutils module (https://docs.python.org/2/distutils/setupscript.html). To build a source distribution, run the included setup.py script supplying the "sdist" command @@ -23,16 +25,21 @@ The library depends on the availability of the numpy package in the python insta ## Example usage -> from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype -> import sys -> gtc_file = "path_to_genotypes.gtc" -> manifest_file = "path_to_manifest.bpm" -> names = BeadPoolManifest( manifest_file ).names -> genotypes = GenotypeCalls( gtc_file ).get_genotypes() -> for (locus, genotype) in zip( names, genotypes ): +> from IlluminaBeadArrayFiles import GenotypeCalls, BeadPoolManifest, code2genotype +> import sys +> gtc_file = "path_to_genotypes.gtc" +> manifest_file = "path_to_manifest.bpm" +> names = BeadPoolManifest( manifest_file ).names +> genotypes = GenotypeCalls( gtc_file ).get_genotypes() +> for (locus, genotype) in zip( names, genotypes ): >   sys.stdout.write( locus + "," + code2genotype[genotype] + "\n" ) -Also, see examples/gtc_final_report.py for an additional example of usage. +Also, see examples/* for additional examples of usage. +These scripts are based on common Genome Studio (https://support.illumina.com/array/array_software/genomestudio.html) reports. + +**NOTE:** +For the DNA summary report, it does not exclude zeroed loci from the cluster file as it does in Genome Studio, so overall call rates may look lower. +There is an open issue to address this: https://github.com/Illumina/BeadArrayFiles/issues/22 ## GTC File Format The specification for the GTC file format is provided in docs/GTC_File_Format_v5.pdf @@ -56,7 +63,7 @@ Class to encapsulate a normalization transform for conversion of raw intensity d Class to encapsulate the meta-data collected in the GTC file for a scanner instrument ### BeadPoolManifest -Class for parsing a binary (BPM) manifest file. This class can be used to recover the in-order list of loci names in a given manifest, which is necessary to associate the genotype information present in the GTC file to a specific locus name. +Class for parsing a binary (BPM) manifest file. This class can be used to recover the in-order list of loci names in a given manifest, which is necessary to associate the genotype information present in the GTC file to a specific locus name. ### RefStrand, SourceStrand Represents different strand conventions in manifest @@ -71,11 +78,11 @@ Aggregate information across many samples for a given loci Read information from a binary EGT file ## Compatibility -This library is compatible with Python 2.7 and was tested with Python 2.7.11 +This library is compatible with Python 3 and was tested with Python 3.8.5 ## License ->Copyright (c) 2017, Illumina +>Copyright (c) 2020, Illumina > All rights reserved. > > Redistribution and use in source and binary forms, with or without @@ -102,6 +109,3 @@ This library is compatible with Python 2.7 and was tested with Python 2.7.11 >of the authors and should not be interpreted as representing official policies, >either expressed or implied, of the FreeBSD Project. - - - diff --git a/change_log.txt b/change_log.txt index e1f41ed..4f0fd8c 100644 --- a/change_log.txt +++ b/change_log.txt @@ -1,25 +1,29 @@ -1.3.2 - -Address missing import of functions - -1.3.1 - -Support for version 3 GTC files during locus aggregation - -1.3.0 - -Added new function to assist with aggregation of information across samples for loci (see LocusAggregate) - -Added new class to parse binary EGT files - -Added example script to generate text report similar to "Locus Summary" report from GenomeStudio - -1.2.0 - -Added function to verify GTC file is complete - -Automatic check for GTC file completeness in GenotypeCalls constructor - -Switch to numpy data types for floating point calculations (e.g., normalized intensities) for - improved consistency with Genome Studio final reports. - -1.1.1 - -Address issue reading manifest lacking reference strand annotation - -1.1.0 - -Added ability to parse customer and reference strand from manifest - -Added ability to generate forward and plus strand nucleotide calls from genotypes - +1.3.3 + -Updated to python3 + -Added GenomeStudio-style dna_report to examples + +1.3.2 + -Address missing import of functions + +1.3.1 + -Support for version 3 GTC files during locus aggregation + +1.3.0 + -Added new function to assist with aggregation of information across samples for loci (see LocusAggregate) + -Added new class to parse binary EGT files + -Added example script to generate text report similar to "Locus Summary" report from GenomeStudio + +1.2.0 + -Added function to verify GTC file is complete + -Automatic check for GTC file completeness in GenotypeCalls constructor + -Switch to numpy data types for floating point calculations (e.g., normalized intensities) for + improved consistency with Genome Studio final reports. + +1.1.1 + -Address issue reading manifest lacking reference strand annotation + +1.1.0 + -Added ability to parse customer and reference strand from manifest + -Added ability to generate forward and plus strand nucleotide calls from genotypes + 1.0.0 \ No newline at end of file diff --git a/examples/dna_summary.py b/examples/dna_summary.py new file mode 100644 index 0000000..5844e59 --- /dev/null +++ b/examples/dna_summary.py @@ -0,0 +1,179 @@ +import sys +import os +import argparse +import logging +from math import isnan +from numpy import percentile +from scipy.stats import norm + +from IlluminaBeadArrayFiles import BeadPoolManifest, GenotypeCalls, RefStrand + +nan = float("nan") + + +def compute_genotypes(genotypes): + """ + Summarize information about genotype counts for diploid genotyping counting + + Returns: + list: genotype counts and frequencies + """ + # Count genotypes + no_calls = genotypes.count(0) + aa_count = genotypes.count(1) + ab_count = genotypes.count(2) + bb_count = genotypes.count(3) + + # Get the number of calls (i.e., not no-calls) + num_calls = aa_count + ab_count + bb_count + + # Get the call rate + if (num_calls + no_calls) > 0: + call_rate = num_calls / float(num_calls + no_calls) + else: + call_rate = nan + + # Genotype frequencies (as fraction of all calls) + if num_calls > 0: + # Frequency of AA genotype + aa_freq = round(aa_count / float(num_calls),4) + # Frequency of AB genotype + ab_freq = round(ab_count / float(num_calls),4) + # Frequency of BB genotype + bb_freq = round(bb_count / float(num_calls),4) + else: + aa_freq = nan + ab_freq = nan + bb_freq = nan + + # Computes and return the minor allele frequency. If no calls, will be NaN + a_allele_count = aa_count * 2 + ab_count + if num_calls > 0: + a_frequency = a_allele_count / float(2 * num_calls) + minor_freq = round(min(a_frequency, 1.0 - a_frequency),4) + else: + minor_freq = nan + return [no_calls, num_calls, call_rate, aa_freq, ab_freq, bb_freq, minor_freq] + + + +def ScoreStatistics(scores, percentile): + """ + Capture statistics related to the gencall score distribution + + Args: + scores (list(float)): A list of gencall scores + percentile (int): percentile to calculate + gc_10 : 10th percentile of Gencall score distribution + gc_50 : 50th percentile of Gencall score distribution + + Returns: + float + """ + + num_scores = len(scores) + if num_scores == 0: + return nan + + idx = int(num_scores*percentile/100) + fractional_index = num_scores*percentile/100.0 - idx + if fractional_index < 0.5 : + idx -= 1 + + if idx < 0: + return scores[0] + + if idx >= num_scores - 1: + return scores[-1] + + x1 = 100 * (idx + 0.5)/float(num_scores) + x2 = 100 * (idx + 1 + 0.5)/float(num_scores) + y1 = float(scores[idx]) + y2 = float(scores[idx+1]) + score_stat = y1 + (y2 - y1) / (x2 - x1) * (percentile - x1) + score_stat = round(score_stat,4) + return score_stat + + + + +def get_logger(): + # set up log file + # create logger + logger = logging.getLogger('DNA Report') + logger.setLevel(logging.DEBUG) + + # create console handler and set level to debug + handler = logging.StreamHandler() + handler.setLevel(logging.INFO) + + # create formatter + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s') + + # add formatter to ch + handler.setFormatter(formatter) + logger.addHandler(handler) + return logger + + +def driver(gtc_dir, manifest_filename, output_filename, project_name, delim, logger): + logger.info("Reading manifest file") + bpm = BeadPoolManifest(manifest_filename) + + samples = [] + logger.info("Initializing genotype data") + gtc_files = [] + for gtc_file in os.listdir(gtc_dir): + if gtc_file.endswith(".gtc"): + gtc_files.append(os.path.join(gtc_dir, gtc_file)) + + + + + logger.info("Generating report") + loci = range(len(bpm.normalization_lookups)) + with open(output_filename, "w") as output_handle: + output_handle.write("DNA Report on " + os.path.abspath(output_filename) + "\n") + header = [""] + header.append("# LOCI = {}".format(len(loci))) + header.append("# DNAs = {}".format(len(samples))) + header.append("ProjectName = {}".format(project_name)) + header.append("GenCall Version = NaN") + header.append("Low GenCall Score Cutoff = NaN") + + output_handle.write(delim.join(header) + "\n") + output_handle.write(delim.join("Row,DNA_ID,#No_Calls,#Calls,Call_Rate,A/A_Freq,A/B_Freq,B/B_Freq,Minor_Freq,50%_GC_Score,10%_GC_Score".split(",")) + "\n") + row = 0 + for gtc_file in gtc_files: + row += 1 + gtc = GenotypeCalls(gtc_file) + genotypes = gtc.get_genotypes() + scores = gtc.get_genotype_scores() + assert len(genotypes) == len(bpm.names) + row_data = [] + row_data.append(row) + row_data.append(gtc.get_sample_name()) + row_data += compute_genotypes(genotypes) + row_data.append(ScoreStatistics(scores, 50)) + row_data.append(ScoreStatistics(scores, 10)) + output_handle.write(delim.join(map(str, row_data)) + "\n") + logger.info("Report generation complete") + + +def main(): + parser = argparse.ArgumentParser( + "Generate a DNA report from a directory of GTC files") + + parser.add_argument("gtc_directory", help="Directory containing GTC files") + parser.add_argument("manifest_file", help="BPM manifest file") + parser.add_argument("output_file", help="Location to write report") + parser.add_argument("-p", "--project-name", dest="project_name", + default="Project", help="A project name to report in the output header") + + args = parser.parse_args() + logger = get_logger() + driver(args.gtc_directory, args.manifest_file, args.output_file, args.project_name, ",", logger) + +if __name__ == "__main__": + main() diff --git a/examples/gtc_final_report.py b/examples/gtc_final_report.py index dd4174f..0ad94d0 100644 --- a/examples/gtc_final_report.py +++ b/examples/gtc_final_report.py @@ -39,7 +39,7 @@ output_handle.write(delim.join(["Total Samples", str(len(samples))]) + "\n") output_handle.write("[Data]\n") - output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward"]) + "\n") + output_handle.write(delim.join(["SNP Name", "Sample ID", "Chr", "MapInfo", "Alleles - AB", "Alleles - Plus", "Alleles - Forward", "X", "Y"]) + "\n") for gtc_file in samples: sys.stderr.write("Processing " + gtc_file + "\n") gtc_file = os.path.join(args.gtc_directory, gtc_file) @@ -47,7 +47,8 @@ genotypes = gtc.get_genotypes() plus_strand_genotypes = gtc.get_base_calls_plus_strand(manifest.snps, manifest.ref_strands) forward_strand_genotypes = gtc.get_base_calls_forward_strand(manifest.snps, manifest.source_strands) + normalized_intensities = gtc.get_normalized_intensities(manifest.normalization_lookups) assert len(genotypes) == len(manifest.names) - for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes): - output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype]) + "\n") + for (name, chrom, map_info, genotype, ref_strand_genotype, source_strand_genotype, (x_norm, y_norm)) in zip(manifest.names, manifest.chroms, manifest.map_infos, genotypes, plus_strand_genotypes, forward_strand_genotypes, normalized_intensities): + output_handle.write(delim.join([name, os.path.basename(gtc_file)[:-4], chrom, str(map_info), code2genotype[genotype], ref_strand_genotype, source_strand_genotype, str(x_norm), str(y_norm)]) + "\n") diff --git a/examples/locus_summary.py b/examples/locus_summary.py index 7f9f655..e24bb98 100644 --- a/examples/locus_summary.py +++ b/examples/locus_summary.py @@ -2,316 +2,329 @@ import os import argparse import logging +import pandas as pd from math import isnan -from itertools import izip from numpy import percentile from scipy.stats import norm + from IlluminaBeadArrayFiles import LocusAggregate, BeadPoolManifest, GenotypeCalls, ClusterFile, RefStrand nan = float("nan") class LocusSummary(object): - def __init__(self, genotype_counts, score_stats): - self.genotype_counts = genotype_counts - self.score_stats = score_stats + def __init__(self, genotype_counts, score_stats): + self.genotype_counts = genotype_counts + self.score_stats = score_stats class GenotypeCounts(object): - """ - Summarize information about genotype counts for diploid genotyping counting - """ - - def __init__(self, genotypes): - self.no_calls = 0 - self.aa_count = 0 - self.ab_count = 0 - self.bb_count = 0 - - for genotype in genotypes: - if genotype == 0: - self.no_calls += 1 - elif genotype == 1: - self.aa_count += 1 - elif genotype == 2: - self.ab_count += 1 - elif genotype == 3: - self.bb_count += 1 - - def get_num_calls(self): - """ - Get the number of calls (i.e., not no-calls) - - Returns: - int: The number of calls - """ - return self.aa_count + self.ab_count + self.bb_count - - def get_call_frequency(self): - """ - Get the call rate - - Returns: - float: The frequency of calls - """ - num_calls = self.get_num_calls() - return num_calls / float(num_calls + self.no_calls) if num_calls + self.no_calls > 0 else nan - - def get_aa_frequency(self): - """ - Frequency of AA genotype (as fraction of all calls) - - Returns: - float: AA genotype frequency - """ - return self.aa_count / float(self.get_num_calls()) if self.get_num_calls() > 0 else nan - - def get_ab_frequency(self): - """ - Frequency of AB genotype (as fraction of all calls) - - Returns: - float: AB genotype frequency - """ - return self.ab_count / float(self.get_num_calls()) if self.get_num_calls() > 0 else nan - - def get_bb_frequency(self): - """ - Frequency of BB genotype (as fraction of all calls) - - Returns: - float: BB genotype frequency - """ - return self.bb_count / float(self.get_num_calls()) if self.get_num_calls() > 0 else nan - - def get_minor_frequency(self): - """ - Comoputes and return the minor allele frequency. If no calls, will be NaN - - Returns: - float - """ - a_allele_count = self.aa_count * 2 + self.ab_count - a_frequency = a_allele_count / \ - float(2 * self.get_num_calls()) if self.get_num_calls() > 0 else nan - return min(a_frequency, 1.0 - a_frequency) if not isnan(a_frequency) else nan - - def compute_hardy_weinberg(self): - """ - Computes and returns statistics related to HW equilibrium - - Returns: - (float, float): Het excess and ChiSq 100 statistics, respectively - """ - num_calls = self.get_num_calls() - if num_calls == 0: - return (0.0, 0.0) - - if self.aa_count + self.ab_count == 0 or self.ab_count + self.bb_count == 0: - return (1.0, 0.0) - - num_calls = float(num_calls) - - q = self.get_minor_frequency() - p = 1 - q - - temp = 0.013 / q - k = temp * temp * temp * temp - dh = ((self.ab_count / num_calls + k) / (2 * p * q + k)) - 1 - if dh < 0: - hw = (2 * norm.cdf(dh, 0, 1 / 10.0)) - else: - hw = (2 * (1 - norm.cdf(dh, 0, 1 / 10.0))) - - return (hw, dh) + """ + Summarize information about genotype counts for diploid genotyping counting + """ + + def __init__(self, genotypes): + self.no_calls = 0 + self.aa_count = 0 + self.ab_count = 0 + self.bb_count = 0 + + + for genotype in genotypes: + if genotype == 0: + self.no_calls += 1 + elif genotype == 1: + self.aa_count += 1 + elif genotype == 2: + self.ab_count += 1 + elif genotype == 3: + self.bb_count += 1 + + def get_num_calls(self): + """ + Get the number of calls (i.e., not no-calls) + + Returns: + int: The number of calls + """ + return self.aa_count + self.ab_count + self.bb_count + + def get_call_frequency(self): + """ + Get the call rate + + Returns: + float: The frequency of calls + """ + num_calls = self.get_num_calls() + return num_calls / float(num_calls + self.no_calls) if num_calls + self.no_calls > 0 else nan + + def get_aa_frequency(self): + """ + Frequency of AA genotype (as fraction of all calls) + + Returns: + float: AA genotype frequency + """ + return self.aa_count / float(self.get_num_calls()) if self.get_num_calls() > 0 else nan + + def get_ab_frequency(self): + """ + Frequency of AB genotype (as fraction of all calls) + + Returns: + float: AB genotype frequency + """ + return self.ab_count / float(self.get_num_calls()) if self.get_num_calls() > 0 else nan + + def get_bb_frequency(self): + """ + Frequency of BB genotype (as fraction of all calls) + + Returns: + float: BB genotype frequency + """ + return self.bb_count / float(self.get_num_calls()) if self.get_num_calls() > 0 else nan + + def get_minor_frequency(self): + """ + Comoputes and return the minor allele frequency. If no calls, will be NaN + + Returns: + float + """ + a_allele_count = self.aa_count * 2 + self.ab_count + a_frequency = a_allele_count / \ + float(2 * self.get_num_calls()) if self.get_num_calls() > 0 else nan + return min(a_frequency, 1.0 - a_frequency) if not isnan(a_frequency) else nan + + def compute_hardy_weinberg(self): + """ + Computes and returns statistics related to HW equilibrium + + Returns: + (float, float): Het excess and ChiSq 100 statistics, respectively + """ + num_calls = self.get_num_calls() + if num_calls == 0: + return (0.0, 0.0) + + if self.aa_count + self.ab_count == 0 or self.ab_count + self.bb_count == 0: + return (1.0, 0.0) + + num_calls = float(num_calls) + + q = self.get_minor_frequency() + p = 1 - q + + temp = 0.013 / q + k = temp * temp * temp * temp + dh = ((self.ab_count / num_calls + k) / (2 * p * q + k)) - 1 + if dh < 0: + hw = (2 * norm.cdf(dh, 0, 1 / 10.0)) + else: + hw = (2 * (1 - norm.cdf(dh, 0, 1 / 10.0))) + + return (hw, dh) class ScoreStatistics(object): - """ - Capture statistics related to the gencall score distribution - - Attributes: - gc_10 : 10th percentile of Gencall score distribution - gc_50 : 50th percentile of Gencall score distribution - """ - - def __init__(self, scores, genotypes): - """ - Create new ScoreStatistics object - - Args: - score (list(float)): A list of gencall scores - genotypes (list(int)): A list of genotypes - - Returns: - ScoreStatistics - """ - called_scores = sorted([score for (score, genotype) in zip(scores, genotypes) if genotype != 0]) - self.gc_10 = ScoreStatistics.percentile(called_scores, 10) - self.gc_50 = ScoreStatistics.percentile(called_scores, 50) - - @staticmethod - def percentile(scores, percentile): - """ - Percentile as calculated in GenomeStudio - - Args: - scores (list(float)): list of scores (typically for called genotypes) - percentile (int): percentile to calculate - - Returns: - float - """ - num_scores = len(scores) - if num_scores == 0: - return nan - - idx = int(num_scores*percentile/100) - fractional_index = num_scores*percentile/100.0 - idx - if fractional_index < 0.5 : - idx -= 1 - - if idx < 0: - return scores[0] - - if idx >= num_scores - 1: - return scores[-1] - - x1 = 100 * (idx + 0.5)/float(num_scores) - x2 = 100 * (idx + 1 + 0.5)/float(num_scores) - y1 = float(scores[idx]) - y2 = float(scores[idx+1]) - - return y1 + (y2 - y1) / (x2 - x1) * (percentile - x1) - - - - -def summarize_locus(locus_aggregate): - """ - Generate a locus summary based on aggregated locus information - - Args: - LocusAggregate : Aggregated information for a locus - - Returns - LocusSummary - """ - genotype_counts = GenotypeCounts(locus_aggregate.genotypes) - score_stats = ScoreStatistics(locus_aggregate.scores, locus_aggregate.genotypes) - return LocusSummary(genotype_counts, score_stats) + """ + Capture statistics related to the gencall score distribution + + Attributes: + gc_10 : 10th percentile of Gencall score distribution + gc_50 : 50th percentile of Gencall score distribution + """ + + def __init__(self, scores, genotypes): + """ + Create new ScoreStatistics object + + Args: + score (list(float)): A list of gencall scores + genotypes (list(int)): A list of genotypes + + Returns: + ScoreStatistics + """ + called_scores = sorted([score for (score, genotype) in zip(scores, genotypes) if genotype != 0]) + self.gc_10 = ScoreStatistics.percentile(called_scores, 10) + self.gc_50 = ScoreStatistics.percentile(called_scores, 50) + + @staticmethod + def percentile(scores, percentile): + """ + Percentile as calculated in GenomeStudio + + Args: + scores (list(float)): list of scores (typically for called genotypes) + percentile (int): percentile to calculate + + Returns: + float + """ + num_scores = len(scores) + if num_scores == 0: + return nan + + idx = int(num_scores*percentile/100) + fractional_index = num_scores*percentile/100.0 - idx + if fractional_index < 0.5 : + idx -= 1 + + if idx < 0: + return scores[0] + + if idx >= num_scores - 1: + return scores[-1] + + x1 = 100 * (idx + 0.5)/float(num_scores) + x2 = 100 * (idx + 1 + 0.5)/float(num_scores) + y1 = float(scores[idx]) + y2 = float(scores[idx+1]) + + return y1 + (y2 - y1) / (x2 - x1) * (percentile - x1) + + + +def summarize_locus(snp_wise_genotypes,snp_wise_scores): + """ + Generate a locus summary based on aggregated locus information + + Args: + LocusAggregate : Aggregated information for a locus + + Returns + LocusSummary + """ + genotype_counts = GenotypeCounts(snp_wise_genotypes) + score_stats = ScoreStatistics(snp_wise_scores, snp_wise_genotypes) + return LocusSummary(genotype_counts, score_stats) def get_logger(): - # set up log file - # create logger - logger = logging.getLogger('Locus Summary Report') - logger.setLevel(logging.DEBUG) + # set up log file + # create logger + logger = logging.getLogger('Locus Summary Report') + logger.setLevel(logging.DEBUG) - # create console handler and set level to debug - handler = logging.StreamHandler() - handler.setLevel(logging.INFO) + # create console handler and set level to debug + handler = logging.StreamHandler() + handler.setLevel(logging.INFO) - # create formatter - formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s') + # create formatter + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s') - # add formatter to ch - handler.setFormatter(formatter) - logger.addHandler(handler) - return logger + # add formatter to ch + handler.setFormatter(formatter) + logger.addHandler(handler) + return logger def driver(gtc_dir, manifest_filename, cluster_filename, output_filename, project_name, delim, logger): - logger.info("Reading cluster file") - with open(cluster_filename, "rb") as cluster_handle: - egt = ClusterFile.read_cluster_file(cluster_handle) - - logger.info("Reading manifest file") - bpm = BeadPoolManifest(manifest_filename) - samples = [] - - logger.info("Initializing genotype data") - gtc_files = [] - for gtc_file in os.listdir(gtc_dir): - if gtc_file.endswith(".gtc"): - gtc_files.append(os.path.join(gtc_dir, gtc_file)) - - samples = map(GenotypeCalls, gtc_files) - - logger.info("Generating report") - loci = range(len(bpm.normalization_lookups)) - with open(output_filename, "w") as output_handle: - output_handle.write("Locus Summary on " + - os.path.abspath(output_filename) + "\n") - - header = [""] - header.append("# LOCI = {}".format(len(loci))) - header.append("# DNAs = {}".format(len(samples))) - header.append("ProjectName = {}".format(project_name)) - header.append("GenCall Version = {}".format(egt.gencall_version)) - header.append("Low GenCall Score Cutoff = NaN") - - output_handle.write(delim.join(header) + "\n") - - output_handle.write(delim.join("Row,Locus_Name,Illumicode_Name,#No_Calls,#Calls,Call_Freq,A/A_Freq,A/B_Freq,B/B_Freq,Minor_Freq,Gentrain_Score,50%_GC_Score,10%_GC_Score,Het_Excess_Freq,ChiTest_P100,Cluster_Sep,AA_T_Mean,AA_T_Std,AB_T_Mean,AB_T_Std,BB_T_Mean,BB_T_Std,AA_R_Mean,AA_R_Std,AB_R_Mean,AB_R_Std,BB_R_Mean,BB_R_Std,Plus/Minus Strand".split(",")) + "\n") - for (locus, locus_summary) in izip(loci, LocusAggregate.aggregate_samples(samples, loci, summarize_locus, bpm.normalization_lookups)): - locus_name = bpm.names[locus] - cluster_record = egt.get_record(locus_name) - row_data = [] - row_data.append(locus + 1) - row_data.append(locus_name) - row_data.append(cluster_record.address) - row_data.append(locus_summary.genotype_counts.no_calls) - row_data.append(locus_summary.genotype_counts.get_num_calls()) - row_data.append(locus_summary.genotype_counts.get_call_frequency()) - row_data.append(locus_summary.genotype_counts.get_aa_frequency()) - row_data.append(locus_summary.genotype_counts.get_ab_frequency()) - row_data.append(locus_summary.genotype_counts.get_bb_frequency()) - row_data.append( - locus_summary.genotype_counts.get_minor_frequency()) - row_data.append(cluster_record.cluster_score.total_score) - row_data.append(locus_summary.score_stats.gc_50) - row_data.append(locus_summary.score_stats.gc_10) - - (hw_equilibrium, het_excess) = locus_summary.genotype_counts.compute_hardy_weinberg() - row_data.append(het_excess) - row_data.append(hw_equilibrium) - - row_data.append(cluster_record.cluster_score.cluster_separation) - - for cluster_stats in (cluster_record.aa_cluster_stats, cluster_record.ab_cluster_stats, cluster_record.bb_cluster_stats): - row_data.append(cluster_stats.theta_mean) - row_data.append(cluster_stats.theta_dev) - - for cluster_stats in (cluster_record.aa_cluster_stats, cluster_record.ab_cluster_stats, cluster_record.bb_cluster_stats): - row_data.append(cluster_stats.r_mean) - row_data.append(cluster_stats.r_dev) - - if len(bpm.ref_strands) > 0: - row_data.append(RefStrand.to_string(bpm.ref_strands[locus])) - else: - row_data.append("U") - - output_handle.write(delim.join(map(str, row_data)) + "\n") - logger.info("Report generation complete") + logger.info("Reading cluster file") + with open(cluster_filename, "rb") as cluster_handle: + egt = ClusterFile.read_cluster_file(cluster_handle) + + logger.info("Reading manifest file") + bpm = BeadPoolManifest(manifest_filename) + samples = [] + + logger.info("Initializing genotype data") + gtc_files = [] + for gtc_file in os.listdir(gtc_dir): + if gtc_file.endswith(".gtc"): + gtc_files.append(os.path.join(gtc_dir, gtc_file)) + + samples = map(GenotypeCalls, gtc_files) + + + ls_genotypes = [] + ls_genotype_scores = [] + ls_sample_names = [] + ls_snps = bpm.names + for sample in samples: + genotypes = sample.get_genotypes() + assert len(genotypes) == len(bpm.names) + ls_genotypes.append(genotypes) + ls_genotype_scores.append(sample.get_genotype_scores()) + ls_sample_names.append(sample.get_sample_name()) + + logger.info("Generating report") + loci = range(len(bpm.normalization_lookups)) + row = 0 + with open(output_filename, "w") as output_handle: + output_handle.write("Locus Summary on " + os.path.abspath(output_filename) + "\n") + header = [""] + header.append("# LOCI = {}".format(len(loci))) + header.append("# DNAs = {}".format(len(gtc_files))) + header.append("ProjectName = {}".format(project_name)) + header.append("GenCall Version = {}".format(egt.gencall_version)) + header.append("Low GenCall Score Cutoff = NaN") + + output_handle.write(delim.join(header) + "\n") + + output_handle.write(delim.join("Row,Locus_Name,Illumicode_Name,#No_Calls,#Calls,Call_Freq,A/A_Freq,A/B_Freq,B/B_Freq,Minor_Freq,Gentrain_Score,50%_GC_Score,10%_GC_Score,Het_Excess_Freq,ChiTest_P100,Cluster_Sep,AA_T_Mean,AA_T_Std,AB_T_Mean,AB_T_Std,BB_T_Mean,BB_T_Std,AA_R_Mean,AA_R_Std,AB_R_Mean,AB_R_Std,BB_R_Mean,BB_R_Std,Plus/Minus Strand".split(",")) + "\n") + for i in range(0,len(ls_snps)): + row += 1 + snp_wise_genotypes = [item[i] for item in ls_genotypes] + snp_wise_scores = [item[i] for item in ls_genotype_scores] + locus_summary = summarize_locus(snp_wise_genotypes,snp_wise_scores) + cluster_record = egt.get_record(ls_snps[i]) + row_data = [] + row_data.append(row) + row_data.append(ls_snps[i]) + row_data.append(cluster_record.address) + row_data.append(locus_summary.genotype_counts.no_calls) + row_data.append(locus_summary.genotype_counts.get_num_calls()) + row_data.append(locus_summary.genotype_counts.get_call_frequency()) + row_data.append(locus_summary.genotype_counts.get_aa_frequency()) + row_data.append(locus_summary.genotype_counts.get_ab_frequency()) + row_data.append(locus_summary.genotype_counts.get_bb_frequency()) + row_data.append(locus_summary.genotype_counts.get_minor_frequency()) + row_data.append(cluster_record.cluster_score.total_score) + row_data.append(locus_summary.score_stats.gc_50) + row_data.append(locus_summary.score_stats.gc_10) + + (hw_equilibrium, het_excess) = locus_summary.genotype_counts.compute_hardy_weinberg() + row_data.append(het_excess) + row_data.append(hw_equilibrium) + + row_data.append(cluster_record.cluster_score.cluster_separation) + + for cluster_stats in (cluster_record.aa_cluster_stats, cluster_record.ab_cluster_stats, cluster_record.bb_cluster_stats): + row_data.append(cluster_stats.theta_mean) + row_data.append(cluster_stats.theta_dev) + + for cluster_stats in (cluster_record.aa_cluster_stats, cluster_record.ab_cluster_stats, cluster_record.bb_cluster_stats): + row_data.append(cluster_stats.r_mean) + row_data.append(cluster_stats.r_dev) + + if len(bpm.ref_strands) > 0: + row_data.append(RefStrand.to_string(bpm.ref_strands[i])) + else: + row_data.append("U") + output_handle.write(delim.join(map(str, row_data)) + "\n") + logger.info("Report generation complete") def main(): - parser = argparse.ArgumentParser( - "Generate a locus summary report from a directory of GTC files") + parser = argparse.ArgumentParser( + "Generate a locus summary report from a directory of GTC files") - parser.add_argument("gtc_directory", help="Directory containing GTC files") - parser.add_argument("manifest_file", help="BPM manifest file") - parser.add_argument("cluster_file", help="EGT cluster file") - parser.add_argument("output_file", help="Location to write report") - parser.add_argument("-p", "--project-name", dest="project_name", - default="Project", help="A project name to report in the output header") + parser.add_argument("gtc_directory", help="Directory containing GTC files") + parser.add_argument("manifest_file", help="BPM manifest file") + parser.add_argument("cluster_file", help="EGT cluster file") + parser.add_argument("output_file", help="Location to write report") + parser.add_argument("-p", "--project-name", dest="project_name", + default="Project", help="A project name to report in the output header") - args = parser.parse_args() - logger = get_logger() - driver(args.gtc_directory, args.manifest_file, args.cluster_file, - args.output_file, args.project_name, ",", logger) + args = parser.parse_args() + logger = get_logger() + driver(args.gtc_directory, args.manifest_file, args.cluster_file, + args.output_file, args.project_name, ",", logger) if __name__ == "__main__": - main() + main() diff --git a/module/BeadArrayUtility.py b/module/BeadArrayUtility.py index 1ea6d88..bf1c958 100644 --- a/module/BeadArrayUtility.py +++ b/module/BeadArrayUtility.py @@ -103,6 +103,7 @@ def read_string(handle): num_bytes += 1 total_length += partial_length << (7 * num_bytes) result = handle.read(total_length) + result = result.decode("utf-8") if len(result) < total_length: raise Exception("Failed to read complete string") else: diff --git a/module/BeadPoolManifest.py b/module/BeadPoolManifest.py index dbf6bba..e55ad5d 100644 --- a/module/BeadPoolManifest.py +++ b/module/BeadPoolManifest.py @@ -60,6 +60,7 @@ def __parse_file(self, manifest_file): """ with open(manifest_file, "rb") as manifest_handle: header = manifest_handle.read(3) + header = header.decode("utf-8") if len(header) != 3 or header != "BPM": raise Exception("Invalid BPM format") version = read_byte(manifest_handle) @@ -82,11 +83,11 @@ def __parse_file(self, manifest_file): self.num_loci = read_int(manifest_handle) manifest_handle.seek(4 * self.num_loci, 1) name_lookup = {} - for idx in xrange(self.num_loci): + for idx in range(self.num_loci): self.names.append(read_string(manifest_handle)) name_lookup[self.names[-1]] = idx - for idx in xrange(self.num_loci): + for idx in range(self.num_loci): normalization_id = read_byte(manifest_handle) if normalization_id >= 100: raise Exception( @@ -100,7 +101,7 @@ def __parse_file(self, manifest_file): self.map_infos = [0] * self.num_loci self.ref_strands = [RefStrand.Unknown] * self.num_loci self.source_strands = [SourceStrand.Unknown] * self.num_loci - for idx in xrange(self.num_loci): + for idx in range(self.num_loci): locus_entry = LocusEntry(manifest_handle) self.assay_types[name_lookup[locus_entry.name]] = locus_entry.assay_type self.addresses[name_lookup[locus_entry.name]] = locus_entry.address_a @@ -115,13 +116,15 @@ def __parse_file(self, manifest_file): "Manifest format error: read invalid number of assay entries") all_norm_ids = set() - for locus_idx in xrange(self.num_loci): + for locus_idx in range(self.num_loci): self.normalization_ids[locus_idx] += 100 * \ self.assay_types[locus_idx] + # To mimic the byte-wrapping behavior from GenomeStudio, AutoCall, IAAP take the mod of 256 + self.normalization_ids[locus_idx] %= 256 all_norm_ids.add(self.normalization_ids[locus_idx]) sorted_norm_ids = sorted(all_norm_ids) lookup_dictionary = {} - for idx in xrange(len(sorted_norm_ids)): + for idx in range(len(sorted_norm_ids)): lookup_dictionary[sorted_norm_ids[idx]] = idx self.normalization_lookups = [ lookup_dictionary[normalization_id] for normalization_id in self.normalization_ids] @@ -309,21 +312,21 @@ def __parse_locus_version_6(self, handle): self.source_strand = SourceStrand.from_string( self.ilmn_id.split("_")[-2]) self.name = read_string(handle) - for idx in xrange(3): + for idx in range(3): read_string(handle) handle.read(4) - for idx in xrange(2): + for idx in range(2): read_string(handle) self.snp = read_string(handle) self.chrom = read_string(handle) - for idx in xrange(2): + for idx in range(2): read_string(handle) self.map_info = int(read_string(handle)) - for idx in xrange(2): + for idx in range(2): read_string(handle) self.address_a = read_int(handle) self.address_b = read_int(handle) - for idx in xrange(7): + for idx in range(7): read_string(handle) handle.read(3) self.assay_type = read_byte(handle) diff --git a/module/ClusterFile.py b/module/ClusterFile.py index 7272fd4..b2d510d 100644 --- a/module/ClusterFile.py +++ b/module/ClusterFile.py @@ -1,4 +1,3 @@ -from itertools import izip from .BeadArrayUtility import read_int, read_string, read_byte, read_float class ClusterFile(object): @@ -76,7 +75,7 @@ def read_array(handle, num_entries, read_record): list(value): A list of type value returned by read_record function """ result = [] - for idx in xrange(num_entries): + for idx in range(num_entries): result.append(read_record(handle)) return result @@ -134,7 +133,7 @@ def read_cluster_file(handle): # cluster counts cluster_counts = [] - for idx in xrange(num_records): + for idx in range(num_records): # 3 corresponds to number genotypes (AA, AB, BB) cluster_counts.append(ClusterFile.read_array(handle, 3, read_int)) @@ -143,7 +142,7 @@ def read_cluster_file(handle): assert cluster_record.ab_cluster_stats.N == count_record[1] assert cluster_record.bb_cluster_stats.N == count_record[2] - for (locus_name, address, cluster_record, cluster_score) in izip(loci_names, addresses, cluster_records, cluster_scores): + for (locus_name, address, cluster_record, cluster_score) in zip(loci_names, addresses, cluster_records, cluster_scores): cluster_record.address = address cluster_record.cluster_score = cluster_score result.add_record(locus_name, cluster_record) @@ -220,7 +219,7 @@ def read_record(handle, version): "Unsupported cluster record version " + str(version)) # read through unused fields - for idx in xrange(14): + for idx in range(14): _ = read_float(handle) aa_cluster_stats = ClusterStats( diff --git a/module/GenotypeCalls.py b/module/GenotypeCalls.py index 80b12ec..eb45f20 100644 --- a/module/GenotypeCalls.py +++ b/module/GenotypeCalls.py @@ -116,6 +116,7 @@ def __init__(self, filename, ignore_version=False, check_write_complete=True): self.filename = filename with open(self.filename, "rb") as gtc_handle: identifier = gtc_handle.read(3) + identifier = identifier.decode("utf-8") if identifier != "gtc": raise Exception("GTC format error: bad format identifier") self.version = read_byte(gtc_handle) @@ -129,7 +130,7 @@ def __init__(self, filename, ignore_version=False, check_write_complete=True): # to the lookup # self.toc_table = {} - for toc_idx in xrange(number_toc_entries): + for toc_idx in range(number_toc_entries): (id, offset) = struct.unpack("