diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..8e2c7ae --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,33 @@ +name: CI +on: + push: + pull_request: + +jobs: + ci: + strategy: + fail-fast: true + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + os: [ubuntu-22.04] #, macos-latest, windows-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Test + run: | + PYTHONPATH=$(pwd) python example/filter_dp.py \ + --input_vcf tests/hc_subset.vcf.gz \ + --output_vcf tests/hc_subset_dp.vcf.gz + if [ ! -f tests/hc_subset_dp.vcf.gz ]; then exit 1; fi + if [ ! -f tests/hc_subset_dp.vcf.gz.tbi ]; then exit 1; fi + - name: Test csi + run: | + rm tests/hc_subset.vcf.gz.tbi + PYTHONPATH=$(pwd) VCF_INDEX_TYPE=2 python example/filter_dp.py \ + --input_vcf tests/hc_subset.vcf.gz \ + --output_vcf tests/hc_subset_dp.vcf.gz + if [ ! -f tests/hc_subset_dp.vcf.gz ]; then exit 1; fi + if [ ! -f tests/hc_subset_dp.vcf.gz.csi ]; then exit 1; fi \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1ea80ab..c1f3553 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +tmp/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/tests/hc_subset.vcf.gz b/tests/hc_subset.vcf.gz new file mode 100644 index 0000000..7f903db Binary files /dev/null and b/tests/hc_subset.vcf.gz differ diff --git a/tests/hc_subset.vcf.gz.csi b/tests/hc_subset.vcf.gz.csi new file mode 100644 index 0000000..e657e5f Binary files /dev/null and b/tests/hc_subset.vcf.gz.csi differ diff --git a/tests/hc_subset.vcf.gz.tbi b/tests/hc_subset.vcf.gz.tbi new file mode 100644 index 0000000..ad5d4f0 Binary files /dev/null and b/tests/hc_subset.vcf.gz.tbi differ diff --git a/vcflib/bgzf.py b/vcflib/bgzf.py index 5c51c80..4ad67fa 100644 --- a/vcflib/bgzf.py +++ b/vcflib/bgzf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved +# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved import io import struct import zlib diff --git a/vcflib/compat.py b/vcflib/compat.py index 1197360..a0a7210 100644 --- a/vcflib/compat.py +++ b/vcflib/compat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved +# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved import sys if sys.version_info[0] == 2: diff --git a/vcflib/sharder.py b/vcflib/sharder.py index 3477355..15f6e1a 100644 --- a/vcflib/sharder.py +++ b/vcflib/sharder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved +# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved from abc import ABCMeta, abstractmethod import copy import heapq diff --git a/vcflib/tabix.py b/vcflib/tabix.py index c86544f..df379a0 100644 --- a/vcflib/tabix.py +++ b/vcflib/tabix.py @@ -1,5 +1,6 @@ -# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved +# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved import collections +import os import struct import sys @@ -9,8 +10,7 @@ __all__ = ['Tabix'] class Header(object): - __slots__ = ('magic', 'n_ref', 'format', - 'col_seq', 'col_beg', 'col_end', 'meta', 'skip', 'l_nm') + __slots__ = ('format', 'col_seq', 'col_beg', 'col_end', 'meta', 'skip') def __init__(self, *args): for k,v in zip(self.__slots__, args): setattr(self, k, v) @@ -19,13 +19,12 @@ def __iter__(self): yield getattr(self, k) class Tabix(object): - SHIFTS = (14, 17, 20, 23, 26, 29) - MAXBIN = ((1 << SHIFTS[-1]-SHIFTS[0]+3) - 1) // 7 + 1 - MAGIC = 0x01494254 + TBI_MAGIC = 0x01494254 + CSI_MAGIC = 0x01495343 FMT_GENERIC, FMT_SAM, FMT_VCF, FMT_ZERO_BASED = 0, 1, 2, 0x10000 - def __init__(self, idxf, mode='r'): - self.path = idxf + def __init__(self, path, mode='r'): + self.path = path self.mode = mode if mode[0:1] == 'r': self.load() @@ -37,72 +36,116 @@ def __init__(self, idxf, mode='r'): def load(self): self.indices = collections.OrderedDict() - with bgzf.open(self.path, 'rb') as fp: + if os.path.exists(self.path + '.csi'): + idxf = self.path + '.csi' + magic = self.CSI_MAGIC + else: + idxf = self.path + '.tbi' + magic = self.TBI_MAGIC + + with bgzf.open(idxf, 'rb') as fp: s4 = struct.Struct('> self.SHIFTS[0] - minoff = ci[1][i] if i < len(ci[1]) else ci[1][-1] - for shift in reversed(self.SHIFTS): - bo = ((1 << 29-shift) - 1) // 7 + i = s >> self.min_shift + minoff = ci[1][min(i,len(ci[1])-1)] if ci[1] else 0 + for shift in range(self.max_shift, self.min_shift-3, -3): + bo = ((1 << self.max_shift - shift) - 1) // 7 bs = bo + (s >> shift) be = bo + (e-1 >> shift) - be = min(be, self.MAXBIN-1) + if not ci[1]: + for bi in xrange(bs, bo-1, -1): + b = ci[0].get(bi) + if b is not None: + minoff = max(minoff, b[0]) + break for bi in xrange(bs, be+1): - if bi not in ci[0]: - continue - for chunk in ci[0][bi]: - if chunk[1] > minoff: - ranges.append(chunk) + b = ci[0].get(bi) + if b is not None: + ranges.extend(b[1]) + if minoff > 0: + ranges = [(max(s,minoff), e) for s,e in ranges if e > minoff] return self.merge(ranges, 16) @staticmethod @@ -141,14 +189,37 @@ def merge(ranges, shift): yield p def init(self): - h = Header(self.MAGIC, 0, self.FMT_VCF, 1, 2, 2, ord('#'), 0, 0) - self.header = h + self.magic = self.TBI_MAGIC + self.min_shift = 14 + self.depth = 5 + type = list(map(int, os.getenv('VCF_INDEX_TYPE', '1').split(':'))) + if len(type) > 0 and type[0] == 2: + self.magic = self.CSI_MAGIC + if len(type) > 1: + self.min_shift = type[1] + if len(type) > 2: + self.depth = type[2] + self.max_shift = self.min_shift + self.depth * 3 + self.header = Header(self.FMT_VCF, 1, 2, 2, ord('#'), 0) self.indices = collections.OrderedDict() self.ci = None self.pos = 0 self.end = 0 def add(self, c, s, e, off): + if c is None and s > 0: + # s is the max contig length + shift = self.min_shift + limit = 1 << shift + while s > limit: + limit <<= 1 + shift += 1 + if shift >= 32: + raise RuntimeError('Some contigs are too long') + if shift > self.min_shift + self.depth * 3: + self.magic = self.CSI_MAGIC; + self.depth = (shift - self.min_shift + 2) // 3; + self.max_shift = self.min_shift + self.depth * 3 if self.ci and self.ci[0] != c: self.optimize(self.ci) self.ci = None @@ -159,17 +230,19 @@ def add(self, c, s, e, off): if self.ci: chrom, bins, intvs = self.ci assert chrom == c and s >= self.pos - be = e-1 >> self.SHIFTS[0] + be = e-1 >> self.min_shift if be >= len(intvs): intvs += [self.end] * (be+1 - len(intvs)) bin = 0 - for shift in self.SHIFTS: + for shift in range(self.min_shift, self.max_shift+3, 3): bs, be = s >> shift, e-1 >> shift if bs == be: - bo = ((1 << 29-shift) - 1) // 7 + bo = ((1 << self.max_shift - shift) - 1) // 7 bin = bo + bs break - chunks = bins.setdefault(bin,[]) + b = bins.setdefault(bin,[]) + if not b: b.extend((0, [])) + chunks = b[1] if chunks and chunks[-1][1] == self.end: chunks[-1] = (chunks[-1][0], off) else: @@ -179,25 +252,31 @@ def add(self, c, s, e, off): def optimize(self, ci): bins = ci[1] - for shift in self.SHIFTS[:-1]: - bo = ((1 << 29-shift) - 1) // 7 + for shift in range(self.min_shift, self.max_shift+3, 3): + bo = ((1 << self.max_shift - shift) - 1) // 7 for bin in sorted(bins.keys()): if bin < bo: continue if bin > bo << 3: break - chunks = bins.get(bin) - if chunks is None: + b = bins.get(bin) + if b is None: continue + chunks = b[1] if len(chunks) == 0: del bins[bin] continue bs = chunks[0][0] >> 16 be = chunks[-1][1] >> 16 - if be - bs < 65536: + if be - bs < 65536 and bo > 0: del bins[bin] bin = bin-1 >> 3 - chunks += bins.get(bin,[]) - bins[bin] = list(self.merge(chunks, 16)) + b = bins.setdefault(bin,[]) + if not b: b.extend((0, [])) + b[1] = list(self.merge(chunks + b[1], 16)) + elif ci[2]: + intv = (bin - bo) << (shift - self.min_shift) + intv = min(intv, len(ci[2])-1) + b[0] = ci[2][intv] # vim: ts=4 sw=4 expandtab diff --git a/vcflib/tribble.py b/vcflib/tribble.py index 270d8f5..d2ad88f 100644 --- a/vcflib/tribble.py +++ b/vcflib/tribble.py @@ -1,4 +1,4 @@ -# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved +# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved import bisect import collections import heapq @@ -178,10 +178,8 @@ class TribbleIndex(object): MAX_FEATURES_PER_BIN = 100 MAX_FEATURES_PER_INTERVAL = 600 - def __init__(self, idxf, mode='r'): - if not idxf.endswith('.idx'): - raise ValueError('File name suffix is not .idx') - self.path = idxf + def __init__(self, path, mode='r'): + self.path = path + '.idx' self.mode = mode if mode[0:1] == 'r': self.load() diff --git a/vcflib/vcf.py b/vcflib/vcf.py index a0efe7f..70b6820 100644 --- a/vcflib/vcf.py +++ b/vcflib/vcf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved +# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved import collections import fnmatch import io @@ -63,10 +63,14 @@ def open(self, path, mode): self.fp, self.index = None, None if path.endswith('.gz'): self.fp = bgzf.open(path, mode) - self.index = tabix.Tabix(path + '.tbi', mode) + self.index = tabix.Tabix(path, mode) + elif path == '-': + if mode[0:1] == 'r': + raise RuntimeError('Input vcf cannot be stdin') + self.fp = os.fdopen(1, mode) else: self.fp = io.open(path, mode) - self.index = tribble.TribbleIndex(path + '.idx', mode) + self.index = tribble.TribbleIndex(path, mode) def load_header(self): self.headers = [] @@ -114,7 +118,6 @@ def copy_header(self, src, update=None, remove=None): self.parse_header() def emit_header(self): - assert self.fp.tell() == 0 for line in self.headers: if line.startswith('#CHROM'): continue @@ -125,7 +128,10 @@ def emit_header(self): cols += self.samples line = '\t'.join(cols) self.fp.write(line.encode() + b'\n') - self.index.add(None, 0, 0, self.fp.tell()) + if self.index is not None: + maxlen = max(int(t.get('length',0)) + for c,t in iteritems(self.contigs)) + self.index.add(None, maxlen, 0, self.fp.tell()) def parse_header(self): self.contigs = collections.OrderedDict() @@ -323,7 +329,8 @@ def emit(self, v): if v.line is None: self.format(v) self.fp.write(v.line.encode() + b'\n') - self.index.add(v.chrom, v.pos, v.end, self.fp.tell()) + if self.index is not None: + self.index.add(v.chrom, v.pos, v.end, self.fp.tell()) def cmp_variants(self, v1, v2): v1_contig_idx = list(self.contigs.keys()).index(v1.chrom) @@ -366,10 +373,10 @@ def __accum__(self, tmpf): if i >= 0: end = int(info[i+4:].split(';',1)[0]) self.fp.write(line) - self.index.add(chrom, pos, end, self.fp.tell()) + if self.index is not None: + self.index.add(chrom, pos, end, self.fp.tell()) tfp.close() os.unlink(tmpf) - os.unlink(tmpf + '.idx') class VCFReader(object): def __init__(self, vcf, chrom, start, end): @@ -432,7 +439,6 @@ def __init__(self, vcf, chrom, start, end): self.fp = tempfile.NamedTemporaryFile(mode='wb', suffix='.vcf', delete=False, dir=os.getenv('SENTIEON_TMPDIR')) self.path = self.fp.name - self.index = tribble.TribbleIndex(self.path + '.idx', 'w') self.chrom = chrom self.start = start self.end = end @@ -441,7 +447,7 @@ def __getattr__(self, key): return getattr(self.vcf, key) def emit_header(self): - self.index.add(None, 0, 0, self.fp.tell()) + pass def emit(self, v): if v.chrom != self.chrom or v.pos >= self.end or v.end <= self.start: @@ -449,11 +455,9 @@ def emit(self, v): if v.line is None: self.vcf.format(v) self.fp.write(v.line.encode() + b'\n') - self.index.add(v.chrom, v.pos, v.end, self.fp.tell()) def close(self): self.fp.close() - self.index.save() def __getdata__(self): self.close()