diff --git a/.prettierrc b/.prettierrc
index a1b59c6..b6cb800 100644
--- a/.prettierrc
+++ b/.prettierrc
@@ -2,5 +2,6 @@
   "semi": false,
   "trailingComma": "all",
   "singleQuote": true,
-  "arrowParens": "avoid"
+  "arrowParens": "avoid",
+  "proseWrap": "always"
 }
diff --git a/README.md b/README.md
index 5ccf2b4..7ed9a85 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,21 @@
 [![Coverage Status](https://img.shields.io/codecov/c/github/GMOD/bgzf-filehandle/master.svg?style=flat-square)](https://codecov.io/gh/GMOD/bgzf-filehandle/branch/master)
 [![Build Status](https://img.shields.io/github/actions/workflow/status/GMOD/bgzf-filehandle/push.yml?branch=master)](https://github.com/GMOD/bgzf-filehandle/actions)
 
-Transparently read [indexed block-gzipped (BGZF)](http://www.htslib.org/doc/bgzip.html) files, such as those created by bgzip, using coordinates from the uncompressed file. The module is used in @gmod/indexedfasta to read bgzip-indexed fasta files (with gzi index, fai index, and fa).
+Transparently read
+[indexed block-gzipped (BGZF)](http://www.htslib.org/doc/bgzip.html) files, such
+as those created by bgzip, using coordinates from the uncompressed file. The
+module is used in @gmod/indexedfasta to read bgzip-indexed fasta files (with gzi
+index, fai index, and fa).
 
-Users can also use the `unzip` function to unzip bgzip files whole (which pako has trouble with natively)
+Users can also use the `unzip` function to unzip bgzip files whole (which pako
+has trouble with natively)
 
-You can also use the unzipChunk or unzipChunkSlice functions to unzip ranges given by BAI or TBI files for BAM or tabix file formats (which are bgzip based).
+You can also use the unzipChunkSlice function to unzip ranges given by BAI or
+TBI files for BAM or tabix file formats (which are bgzip based).
 
-The `unzip` utility function properly decompresses BGZF chunks in both node and the browser, using `pako` when running in the browser and native `zlib` when running in node.
+The `unzip` utility function properly decompresses BGZF chunks in both node and
+the browser, using `pako` when running in the browser and native `zlib` when
+running in node.
 
 ## Install
 
@@ -54,7 +62,10 @@ const { buffer, dpositions, cpositions } = await unzipChunkSlice(
 
 ## Academic Use
 
-This package was written with funding from the [NHGRI](http://genome.gov) as part of the [JBrowse](http://jbrowse.org) project. If you use it in an academic project that you publish, please cite the most recent JBrowse paper, which will be linked from [jbrowse.org](http://jbrowse.org).
+This package was written with funding from the [NHGRI](http://genome.gov) as
+part of the [JBrowse](http://jbrowse.org) project. If you use it in an academic
+project that you publish, please cite the most recent JBrowse paper, which will
+be linked from [jbrowse.org](http://jbrowse.org).
 
 ## License
diff --git a/package.json b/package.json
index 0869e88..74681ae 100644
--- a/package.json
+++ b/package.json
@@ -35,7 +35,7 @@
     "biojs"
   ],
   "dependencies": {
-    "generic-filehandle2": "^0.0.1",
+    "generic-filehandle2": "^0.0.2",
     "long": "^4.0.0",
     "pako": "^1.0.11"
   },
diff --git a/src/index.ts b/src/index.ts
index 37cb679..8a9d0f0 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,2 +1,2 @@
 export { default as BgzfFilehandle } from './bgzFilehandle'
-export { unzip, unzipChunk, unzipChunkSlice } from './unzip'
+export * from './unzip'
diff --git a/src/unzip.ts b/src/unzip.ts
index e3c5895..cceb630 100644
--- a/src/unzip.ts
+++ b/src/unzip.ts
@@ -15,12 +15,12 @@ interface Chunk {
 // does not properly uncompress bgzf chunks that contain more than one bgzf
 // block, so export an unzip function that uses pako directly if we are running
 // in a browser.
-async function unzip(inputData: Uint8Array) {
-  const blocks = [] as Uint8Array[]
+export async function unzip(inputData: Uint8Array) {
   try {
     let strm
     let pos = 0
     let inflator
+    const blocks = [] as Uint8Array[]
     do {
       const remainingInput = inputData.subarray(pos)
       inflator = new Inflate()
@@ -37,7 +37,7 @@
     return concatUint8Array(blocks)
   } catch (e) {
-    //cleanup error message
+    // return a slightly more informative error message
    if (/incorrect header check/.exec(`${e}`)) {
       throw new Error(
         'problem decompressing block: incorrect gzip header check',
       )
   }
 }
 
-// similar to pakounzip, except it does extra counting
-// to return the positions of compressed and decompressed
-// data offsets
-async function unzipChunk(inputData: Uint8Array) {
-  try {
-    let strm
-    let cpos = 0
-    let dpos = 0
-    const blocks = [] as Uint8Array[]
-    const cpositions = [] as number[]
-    const dpositions = [] as number[]
-    do {
-      const remainingInput = inputData.slice(cpos)
-      const inflator = new Inflate()
-      // @ts-ignore
-      ;({ strm } = inflator)
-      inflator.push(remainingInput, Z_SYNC_FLUSH)
-      if (inflator.err) {
-        throw new Error(inflator.msg)
-      }
-
-      const buffer = inflator.result as Uint8Array
-      blocks.push(buffer)
-
-      cpositions.push(cpos)
-      dpositions.push(dpos)
+// keeps track of the position of compressed blocks in terms of file offsets,
+// and a decompressed equivalent
+//
+// also slices (0,minv.dataPosition) and (maxv.dataPosition,end) off
+export async function unzipChunkSlice(inputData: Uint8Array, chunk: Chunk) {
+  let strm
+  const { minv, maxv } = chunk
+  let cpos = minv.blockPosition
+  let dpos = minv.dataPosition
+  const chunks = [] as Uint8Array[]
+  const cpositions = [] as number[]
+  const dpositions = [] as number[]
+
+  let i = 0
+  do {
+    const remainingInput = inputData.subarray(cpos - minv.blockPosition)
+    const inflator = new Inflate()
+    // @ts-ignore
+    ;({ strm } = inflator)
+    inflator.push(remainingInput, Z_SYNC_FLUSH)
+    if (inflator.err) {
+      throw new Error(inflator.msg)
+    }
 
-      cpos += strm.next_in
-      dpos += buffer.length
-    } while (strm.avail_in)
+    const buffer = inflator.result
+    chunks.push(buffer as Uint8Array)
+    let len = buffer.length
 
-    const buffer = concatUint8Array(blocks)
-    return {
-      buffer,
-      cpositions,
-      dpositions,
+    cpositions.push(cpos)
+    dpositions.push(dpos)
+    if (chunks.length === 1 && minv.dataPosition) {
+      // this is the first chunk, trim it
+      chunks[0] = chunks[0]!.subarray(minv.dataPosition)
+      len = chunks[0].length
     }
-  } catch (e) {
-    //cleanup error message
-    if (/incorrect header check/.exec(`${e}`)) {
-      throw new Error(
-        'problem decompressing block: incorrect gzip header check',
+    const origCpos = cpos
+    cpos += strm.next_in
+    dpos += len
+
+    if (origCpos >= maxv.blockPosition) {
+      // this is the last chunk, trim it and stop decompressing. note if it is
+      // the same block as minv it subtracts that already trimmed part of the
+      // slice length
+      chunks[i] = chunks[i]!.subarray(
+        0,
+        maxv.blockPosition === minv.blockPosition
+          ? maxv.dataPosition - minv.dataPosition + 1
+          : maxv.dataPosition + 1,
       )
-    }
-    throw e
-  }
-}
-
-// similar to unzipChunk above but slices (0,minv.dataPosition) and
-// (maxv.dataPosition,end) off
-async function unzipChunkSlice(inputData: Uint8Array, chunk: Chunk) {
-  try {
-    let strm
-    const { minv, maxv } = chunk
-    let cpos = minv.blockPosition
-    let dpos = minv.dataPosition
-    const chunks = [] as Uint8Array[]
-    const cpositions = [] as number[]
-    const dpositions = [] as number[]
-
-    let i = 0
-    do {
-      const remainingInput = inputData.subarray(cpos - minv.blockPosition)
-      const inflator = new Inflate()
-      // @ts-ignore
-      ;({ strm } = inflator)
-      inflator.push(remainingInput, Z_SYNC_FLUSH)
-      if (inflator.err) {
-        throw new Error(inflator.msg)
-      }
-
-      const buffer = inflator.result
-      chunks.push(buffer as Uint8Array)
-      let len = buffer.length
 
       cpositions.push(cpos)
       dpositions.push(dpos)
-      if (chunks.length === 1 && minv.dataPosition) {
-        // this is the first chunk, trim it
-        chunks[0] = chunks[0]!.subarray(minv.dataPosition)
-        len = chunks[0].length
-      }
-      const origCpos = cpos
-      cpos += strm.next_in
-      dpos += len
-
-      if (origCpos >= maxv.blockPosition) {
-        // this is the last chunk, trim it and stop decompressing
-        // note if it is the same block is minv it subtracts that already
-        // trimmed part of the slice length
-
-        chunks[i] = chunks[i]!.subarray(
-          0,
-          maxv.blockPosition === minv.blockPosition
-            ? maxv.dataPosition - minv.dataPosition + 1
-            : maxv.dataPosition + 1,
-        )
-
-        cpositions.push(cpos)
-        dpositions.push(dpos)
-        break
-      }
-      i++
-    } while (strm.avail_in)
-
-    const buffer = concatUint8Array(chunks)
-
-    return { buffer, cpositions, dpositions }
-  } catch (e) {
-    //cleanup error message
-    if (/incorrect header check/.exec(`${e}`)) {
-      throw new Error(
-        'problem decompressing block: incorrect gzip header check',
-      )
+      break
     }
-    throw e
-  }
-}
+    i++
+  } while (strm.avail_in)
 
-function nodeUnzip() {
-  throw new Error('nodeUnzip not implemented.')
+  return {
+    buffer: concatUint8Array(chunks),
+    cpositions,
+    dpositions,
+  }
 }
-
-export { unzip, unzipChunk, unzipChunkSlice, unzip as pakoUnzip, nodeUnzip }
diff --git a/yarn.lock b/yarn.lock
index 85456c7..7c8ea5b 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -1102,10 +1102,10 @@ function-bind@^1.1.2:
   resolved "https://registry.yarnpkg.com/function-bind/-/function-bind-1.1.2.tgz#2c02d864d97f3ea6c8830c464cbd11ab6eab7a1c"
   integrity sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==
 
-generic-filehandle2@^0.0.1:
-  version "0.0.1"
-  resolved "https://registry.yarnpkg.com/generic-filehandle2/-/generic-filehandle2-0.0.1.tgz#7f26ee54a939ed588d6bdb3a453bb2255ccd2be9"
-  integrity sha512-cySnWoVmNUSkRztAwlghNVAYXUh+VVy/fxn8tT3jZIo8UQEHkYL7ueSUseBZrwqBCq9n06Wp/F4xv2q2/SwYCQ==
+generic-filehandle2@^0.0.2:
+  version "0.0.2"
+  resolved "https://registry.yarnpkg.com/generic-filehandle2/-/generic-filehandle2-0.0.2.tgz#a87839bdefa5cebb8aec6aa1486f904595908dde"
+  integrity sha512-8v2ZOMiJ2uneKpaKNqCTFDBmXai4nihRd8QY9G6vQ9JRcuaDrkwSKxlGM4A0k2X4XNr+X02T6RMv1QPT9fgPEw==
 
 glob-parent@^5.1.2:
   version "5.1.2"
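
For anyone reviewing this change, a minimal sketch of how the two surviving exports fit together after this diff. It assumes the published package name is `@gmod/bgzf-filehandle`, and the `minv`/`maxv` virtual offsets in the `Chunk` literal are hypothetical values chosen for illustration; in real use they would come from a BAI or TBI index.

```ts
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'

// `input` is the raw bytes of a bgzip-compressed file, e.g. read from disk
async function demo(input: Uint8Array) {
  // whole-file decompression, including multi-block BGZF files that pako
  // alone has trouble with
  const whole = await unzip(input)

  // ranged decompression: minv/maxv would normally come from a BAI/TBI
  // index; the offsets here are made up for illustration
  const chunk = {
    minv: { blockPosition: 0, dataPosition: 100 },
    maxv: { blockPosition: 0, dataPosition: 500 },
  }
  const { buffer, cpositions, dpositions } = await unzipChunkSlice(input, chunk)

  // buffer holds the uncompressed bytes of the requested slice, while
  // cpositions/dpositions record each block's starting offset in
  // compressed and uncompressed coordinates
  console.log(whole.length, buffer.length, cpositions, dpositions)
}
```

Since `unzipChunk` and `nodeUnzip` are removed by this diff (and `export * from './unzip'` now re-exports `unzip` and `unzipChunkSlice`), any downstream callers of the removed functions should migrate to `unzipChunkSlice` as above.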