Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for the csi index #2

Merged
merged 2 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# CI workflow: runs the example VCF-filtering script against the bundled test
# data, once with the default tabix (.tbi) index and once with VCF_INDEX_TYPE=2
# to exercise the new CSI (.csi) index support.
# NOTE(review): indentation reconstructed from standard workflow syntax — the
# scraped diff had stripped all leading whitespace.
name: CI
on:
  push:
  pull_request:

jobs:
  ci:
    strategy:
      fail-fast: true
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
        os: [ubuntu-22.04] #, macos-latest, windows-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Test
        # Default index type: expect a .tbi index next to the output VCF.
        run: |
          PYTHONPATH=$(pwd) python example/filter_dp.py \
            --input_vcf tests/hc_subset.vcf.gz \
            --output_vcf tests/hc_subset_dp.vcf.gz
          if [ ! -f tests/hc_subset_dp.vcf.gz ]; then exit 1; fi
          if [ ! -f tests/hc_subset_dp.vcf.gz.tbi ]; then exit 1; fi
      - name: Test csi
        # Remove the input's .tbi so the reader falls back to the .csi index,
        # and request CSI output via VCF_INDEX_TYPE=2.
        run: |
          rm tests/hc_subset.vcf.gz.tbi
          PYTHONPATH=$(pwd) VCF_INDEX_TYPE=2 python example/filter_dp.py \
            --input_vcf tests/hc_subset.vcf.gz \
            --output_vcf tests/hc_subset_dp.vcf.gz
          if [ ! -f tests/hc_subset_dp.vcf.gz ]; then exit 1; fi
          if [ ! -f tests/hc_subset_dp.vcf.gz.csi ]; then exit 1; fi
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
tmp/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
Binary file added tests/hc_subset.vcf.gz
Binary file not shown.
Binary file added tests/hc_subset.vcf.gz.csi
Binary file not shown.
Binary file added tests/hc_subset.vcf.gz.tbi
Binary file not shown.
2 changes: 1 addition & 1 deletion vcflib/bgzf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
import io
import struct
import zlib
Expand Down
2 changes: 1 addition & 1 deletion vcflib/compat.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
import sys

if sys.version_info[0] == 2:
Expand Down
2 changes: 1 addition & 1 deletion vcflib/sharder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
from abc import ABCMeta, abstractmethod
import copy
import heapq
Expand Down
213 changes: 146 additions & 67 deletions vcflib/tabix.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2014-2021 Sentieon Inc. All rights reserved
# Copyright (c) 2014-2024 Sentieon Inc. All rights reserved
import collections
import os
import struct
import sys

Expand All @@ -9,8 +10,7 @@
__all__ = ['Tabix']

class Header(object):
__slots__ = ('magic', 'n_ref', 'format',
'col_seq', 'col_beg', 'col_end', 'meta', 'skip', 'l_nm')
__slots__ = ('format', 'col_seq', 'col_beg', 'col_end', 'meta', 'skip')
def __init__(self, *args):
for k,v in zip(self.__slots__, args):
setattr(self, k, v)
Expand All @@ -19,13 +19,12 @@ def __iter__(self):
yield getattr(self, k)

class Tabix(object):
SHIFTS = (14, 17, 20, 23, 26, 29)
MAXBIN = ((1 << SHIFTS[-1]-SHIFTS[0]+3) - 1) // 7 + 1
MAGIC = 0x01494254
TBI_MAGIC = 0x01494254
CSI_MAGIC = 0x01495343
FMT_GENERIC, FMT_SAM, FMT_VCF, FMT_ZERO_BASED = 0, 1, 2, 0x10000

def __init__(self, idxf, mode='r'):
self.path = idxf
def __init__(self, path, mode='r'):
self.path = path
self.mode = mode
if mode[0:1] == 'r':
self.load()
Expand All @@ -37,72 +36,116 @@ def __init__(self, idxf, mode='r'):
def load(self):
self.indices = collections.OrderedDict()

with bgzf.open(self.path, 'rb') as fp:
if os.path.exists(self.path + '.csi'):
idxf = self.path + '.csi'
magic = self.CSI_MAGIC
else:
idxf = self.path + '.tbi'
magic = self.TBI_MAGIC

with bgzf.open(idxf, 'rb') as fp:
s4 = struct.Struct('<L')
s8 = struct.Struct('<Q')
sh = struct.Struct('<9L')
data = fp.read()
off = 0
h = Header(*sh.unpack_from(data, off)); off += sh.size
if h.magic != self.MAGIC:
sh = struct.Struct('<6L')
data = fp.read(); off = 0
self.magic, = s4.unpack_from(data, off); off += s4.size
if self.magic != magic:
raise RuntimeError('Not a tabix file')
self.header = h
names, l_nm = [], 0
for i in xrange(h.n_ref):
eos = data.find(b'\0', off)
if eos < 0: break
names.append(data[off:eos].decode())
l_nm += eos + 1 - off
off = eos + 1
if h.l_nm != l_nm:
if self.magic == self.TBI_MAGIC:
self.min_shift, self.depth = 14, 5
n_ref, = s4.unpack_from(data, off); off += s4.size
aux = sh.unpack_from(data, off); off += sh.size
l_nm, = s4.unpack_from(data, off); off += s4.size
names = data[off:off+l_nm].split(b'\0'); off += l_nm
else:
self.min_shift, = s4.unpack_from(data, off); off += s4.size
self.depth, = s4.unpack_from(data, off); off += s4.size
l_aux, = s4.unpack_from(data, off); off += s4.size
if l_aux < sh.size + s4.size:
raise RuntimeError('Invalid header')
aux = sh.unpack_from(data, off); off += sh.size
l_nm, = s4.unpack_from(data, off); off += s4.size
names = data[off:off+l_nm].split(b'\0'); off += l_nm
off += l_aux - (sh.size + s4.size + l_nm)
n_ref, = s4.unpack_from(data, off); off += s4.size
if len(names) != n_ref+1 or len(names[-1]) != 0:
raise RuntimeError('Header sequence name length mismatch')
for i in xrange(h.n_ref):
self.header = Header(*aux)
for i in xrange(n_ref):
bins = {}
n_bin, = s4.unpack_from(data, off); off += s4.size
for _ in xrange(n_bin):
bin, = s4.unpack_from(data, off); off += s4.size
if self.magic == self.TBI_MAGIC:
loffset = 0
else:
loffset, = s8.unpack_from(data, off); off += s8.size
chunks = []
n_chunk, = s4.unpack_from(data, off); off += s4.size
for _ in xrange(n_chunk):
s, = s8.unpack_from(data, off); off += s8.size
e, = s8.unpack_from(data, off); off += s8.size
chunks.append((s, e))
bins[bin] = chunks
bins[bin] = (loffset, chunks)
intvs = []
n_intv, = s4.unpack_from(data, off); off += s4.size
for _ in xrange(n_intv):
o, = s8.unpack_from(data, off); off += s8.size
intvs.append(o)
if n_intv == 0:
intvs.append(0)
self.indices[names[i]] = (bins, intvs)
if self.magic == self.TBI_MAGIC:
n_intv, = s4.unpack_from(data, off); off += s4.size
for _ in xrange(n_intv):
o, = s8.unpack_from(data, off); off += s8.size
intvs.append(o)
if n_intv == 0:
intvs.append(0)
self.indices[names[i].decode()] = (bins, intvs)
self.max_shift = self.min_shift + self.depth * 3

def save(self):
if self.header is None:
return
self.add(None, 0, 0, self.end)
h = self.header
h.n_ref = len(self.indices)
nms = b''.join(c.encode()+b'\0' for c,_ in iteritems(self.indices))
h.l_nm = len(nms)
with bgzf.open(self.path, 'wb') as fp:

for ext in ('.tbi', '.csi'):
f = self.path + ext
if os.path.exists(f): os.remove(f)

if self.magic == self.TBI_MAGIC:
idxf = self.path + '.tbi'
else:
idxf = self.path + '.csi'

with bgzf.open(idxf, 'wb') as fp:
s4 = struct.Struct('<L')
s8 = struct.Struct('<Q')
sh = struct.Struct('<9L')
fp.write(sh.pack(*h))
fp.write(nms)
sh = struct.Struct('<6L')
nms = b''.join(c.encode()+b'\0' for c,_ in iteritems(self.indices))
fp.write(s4.pack(self.magic))
if self.magic == self.TBI_MAGIC:
fp.write(s4.pack(len(self.indices)))
fp.write(sh.pack(*self.header))
fp.write(s4.pack(len(nms)))
fp.write(nms)
else:
fp.write(s4.pack(self.min_shift))
fp.write(s4.pack(self.depth))
fp.write(s4.pack(sh.size + s4.size + len(nms)))
fp.write(sh.pack(*self.header))
fp.write(s4.pack(len(nms)))
fp.write(nms)
fp.write(s4.pack(len(self.indices)))
for c, (bins, intvs) in iteritems(self.indices):
fp.write(s4.pack(len(bins)))
for bin in sorted(bins.keys()):
chunks = bins[bin]
loffset, chunks = bins[bin]
fp.write(s4.pack(bin))
if self.magic != self.TBI_MAGIC:
fp.write(s8.pack(loffset))
fp.write(s4.pack(len(chunks)))
for s,e in chunks:
fp.write(s8.pack(s))
fp.write(s8.pack(e))
fp.write(s4.pack(len(intvs)))
for o in intvs:
fp.write(s8.pack(o))
if self.magic == self.TBI_MAGIC:
fp.write(s4.pack(len(intvs)))
for o in intvs:
fp.write(s8.pack(o))
self.header = None

def query(self, c, s, e):
Expand All @@ -111,19 +154,24 @@ def query(self, c, s, e):
if ci is None:
return ranges
s = max(s, 0)
i = s >> self.SHIFTS[0]
minoff = ci[1][i] if i < len(ci[1]) else ci[1][-1]
for shift in reversed(self.SHIFTS):
bo = ((1 << 29-shift) - 1) // 7
i = s >> self.min_shift
minoff = ci[1][min(i,len(ci[1])-1)] if ci[1] else 0
for shift in range(self.max_shift, self.min_shift-3, -3):
bo = ((1 << self.max_shift - shift) - 1) // 7
bs = bo + (s >> shift)
be = bo + (e-1 >> shift)
be = min(be, self.MAXBIN-1)
if not ci[1]:
for bi in xrange(bs, bo-1, -1):
b = ci[0].get(bi)
if b is not None:
minoff = max(minoff, b[0])
break
for bi in xrange(bs, be+1):
if bi not in ci[0]:
continue
for chunk in ci[0][bi]:
if chunk[1] > minoff:
ranges.append(chunk)
b = ci[0].get(bi)
if b is not None:
ranges.extend(b[1])
if minoff > 0:
ranges = [(max(s,minoff), e) for s,e in ranges if e > minoff]
return self.merge(ranges, 16)

@staticmethod
Expand All @@ -141,14 +189,37 @@ def merge(ranges, shift):
yield p

def init(self):
    """Initialize a fresh, empty index for writing.

    Defaults to the classic tabix (.tbi) layout (min_shift=14, depth=5).
    The VCF_INDEX_TYPE environment variable selects the index flavor as
    colon-separated integers ``type[:min_shift[:depth]]``; a leading ``2``
    switches to CSI, whose binning parameters may then be overridden.

    Sets up ``self.header`` (VCF preset: seq/beg/end columns 1/2/2, ``#``
    meta char), an empty ordered contig map, and the running write state
    (``ci``, ``pos``, ``end``).
    """
    self.magic = self.TBI_MAGIC
    self.min_shift = 14
    self.depth = 5
    # Renamed from `type` — the original shadowed the builtin.
    idx_type = list(map(int, os.getenv('VCF_INDEX_TYPE', '1').split(':')))
    if len(idx_type) > 0 and idx_type[0] == 2:
        self.magic = self.CSI_MAGIC
        # CSI allows custom binning; .tbi is fixed at 14/5.
        if len(idx_type) > 1:
            self.min_shift = idx_type[1]
        if len(idx_type) > 2:
            self.depth = idx_type[2]
    # Largest bin shift: each extra depth level adds 3 bits (8 children).
    self.max_shift = self.min_shift + self.depth * 3
    self.header = Header(self.FMT_VCF, 1, 2, 2, ord('#'), 0)
    self.indices = collections.OrderedDict()
    self.ci = None   # current contig's in-progress (chrom, bins, intvs)
    self.pos = 0     # last record start, for sortedness assertion
    self.end = 0     # virtual file offset after the last record written

def add(self, c, s, e, off):
if c is None and s > 0:
# s is the max contig length
shift = self.min_shift
limit = 1 << shift
while s > limit:
limit <<= 1
shift += 1
if shift >= 32:
raise RuntimeError('Some contigs are too long')
if shift > self.min_shift + self.depth * 3:
self.magic = self.CSI_MAGIC;
self.depth = (shift - self.min_shift + 2) // 3;
self.max_shift = self.min_shift + self.depth * 3
if self.ci and self.ci[0] != c:
self.optimize(self.ci)
self.ci = None
Expand All @@ -159,17 +230,19 @@ def add(self, c, s, e, off):
if self.ci:
chrom, bins, intvs = self.ci
assert chrom == c and s >= self.pos
be = e-1 >> self.SHIFTS[0]
be = e-1 >> self.min_shift
if be >= len(intvs):
intvs += [self.end] * (be+1 - len(intvs))
bin = 0
for shift in self.SHIFTS:
for shift in range(self.min_shift, self.max_shift+3, 3):
bs, be = s >> shift, e-1 >> shift
if bs == be:
bo = ((1 << 29-shift) - 1) // 7
bo = ((1 << self.max_shift - shift) - 1) // 7
bin = bo + bs
break
chunks = bins.setdefault(bin,[])
b = bins.setdefault(bin,[])
if not b: b.extend((0, []))
chunks = b[1]
if chunks and chunks[-1][1] == self.end:
chunks[-1] = (chunks[-1][0], off)
else:
Expand All @@ -179,25 +252,31 @@ def add(self, c, s, e, off):

def optimize(self, ci):
bins = ci[1]
for shift in self.SHIFTS[:-1]:
bo = ((1 << 29-shift) - 1) // 7
for shift in range(self.min_shift, self.max_shift+3, 3):
bo = ((1 << self.max_shift - shift) - 1) // 7
for bin in sorted(bins.keys()):
if bin < bo:
continue
if bin > bo << 3:
break
chunks = bins.get(bin)
if chunks is None:
b = bins.get(bin)
if b is None:
continue
chunks = b[1]
if len(chunks) == 0:
del bins[bin]
continue
bs = chunks[0][0] >> 16
be = chunks[-1][1] >> 16
if be - bs < 65536:
if be - bs < 65536 and bo > 0:
del bins[bin]
bin = bin-1 >> 3
chunks += bins.get(bin,[])
bins[bin] = list(self.merge(chunks, 16))
b = bins.setdefault(bin,[])
if not b: b.extend((0, []))
b[1] = list(self.merge(chunks + b[1], 16))
elif ci[2]:
intv = (bin - bo) << (shift - self.min_shift)
intv = min(intv, len(ci[2])-1)
b[0] = ci[2][intv]

# vim: ts=4 sw=4 expandtab
Loading
Loading