diff --git a/dist/cli.cjs b/dist/cli.cjs index babe0ac..d20f836 100755 --- a/dist/cli.cjs +++ b/dist/cli.cjs @@ -1,5 +1,5 @@ #!/usr/bin/env node -"use strict";var ie=Object.create;var q=Object.defineProperty;var ae=Object.getOwnPropertyDescriptor;var oe=Object.getOwnPropertyNames;var de=Object.getPrototypeOf,le=Object.prototype.hasOwnProperty;var ce=(s,e,t,r)=>{if(e&&typeof e=="object"||typeof e=="function")for(let n of oe(e))!le.call(s,n)&&n!==t&&q(s,n,{get:()=>e[n],enumerable:!(r=ae(e,n))||r.enumerable});return s};var L=(s,e,t)=>(t=s!=null?ie(de(s)):{},ce(e||!s||!s.__esModule?q(t,"default",{value:s,enumerable:!0}):t,s));var U=require("fs"),te=require("path"),H=require("process"),re=L(require("yargs"),1),se=require("yargs/helpers");var B=s=>s.positional("filenames",{describe:"WARC file(s) to index",type:"string",array:!0,demandOption:"true"}).option("fields",{alias:"f",describe:"fields to include in index",type:"string"}),j=s=>s.positional("filenames",{describe:"WARC file(s) to index",type:"string",array:!0,demandOption:"true"}).option("all",{alias:"a",describe:"index all WARC records",type:"boolean"}).option("format",{describe:"output format",choices:["json","cdxj","cdx"],default:"cdxj"}).option("noSurt",{describe:"Use plain urlkey, do not convert to SURT form (Sort-friendly URI Reordering Transform)",type:"boolean"});var X=L(require("pako"),1);function N(s){let e;typeof s=="string"?e=s:s?.length?e=s.reduce((t,r)=>(t+=String.fromCharCode(r),t),""):s?e=s.toString():e="";try{return"__wb_post_data="+btoa(e)}catch{return"__wb_post_data="}}function $(s){return s.replace(/[.*+?^${}()|[\]\\]/g,"\\$&")}function M(s){try{if(!s.startsWith("https:")&&!s.startsWith("http:"))return s;s=s.replace(/^(https?:\/\/)www\d*\./,"$1");let e=s.toLowerCase(),t=new URL(e),n=t.hostname.split(".").reverse().join(",");if(t.port&&(n+=":"+t.port),n+=")",n+=t.pathname,t.search){t.searchParams.sort(),n+=t.search;for(let[a,i]of t.searchParams.entries())if(!i){let o=encodeURIComponent(a),d=new RegExp(`(?<=[&?])${$(a)}=(?=&|$)`);if(!d.exec(e)){let l=a===o?d:new RegExp(`(?<=[&?])${$(o)}=(?=&|$)`);n=n.replace(l,o)}}}return n}catch{return s}}function Q(s){let{method:e,headers:t,postData:r=""}=s;if(e==="GET")return!1;let n=(t.get("content-type")||"").split(";")[0];function a(o){return o instanceof Uint8Array&&(o=new TextDecoder().decode(o)),o}let i="";switch(n){case"application/x-www-form-urlencoded":i=a(r);break;case"application/json":i=z(a(r));break;case"text/plain":try{i=z(a(r),!1)}catch{i=N(r)}break;case"multipart/form-data":{let o=t.get("content-type");if(!o)throw new Error("utils cannot call postToGetURL when missing content-type header");i=pe(a(r),o);break}default:i=N(r)}return i!=null?(s.url=ue(s.url,i,s.method),s.method="GET",s.requestBody=i,!0):!1}function ue(s,e,t){if(!t)return s;let r=s.indexOf("?")>0?"&":"?";return`${s}${r}__wb_method=${t}&${e}`}function he(s,e=!0){if(typeof s=="string")try{s=JSON.parse(s)}catch{s={}}let t=new URLSearchParams,r={},n=i=>t.has(i)?(i in r||(r[i]=1),i+"."+ ++r[i]+"_"):i,a=(i,o="")=>{let d="";if(typeof i=="object"&&!(i instanceof Array))try{for(let[l,h]of Object.entries(i))a(h,l)}catch{i===null&&(d="null")}else if(i instanceof Array)for(let l=0;l{r.done||!r.value?t.close():t.enqueue(r.value)})}})}async readFully(){return await s.readFully(this)}async readline(e=0){let t=await this.readlineRaw(e);return t?G.decode(t):""}async*iterLines(e=0){let t=null;for(;t=await this.readline(e);)yield t}};function ye(s){return!!(s&&Symbol.iterator in Object(s))}function me(s){return!!(s&&Symbol.asyncIterator in Object(s))}var R=class s extends y{constructor(e,t="gzip",r=!1){super(),this.compressed=t,this.opts={raw:t==="deflateRaw"},this.inflator=t?new S(this.opts,this):null;let n;if(me(e))n=e;else if(typeof e=="object"&&"read"in e&&typeof e.read=="function")n=s.fromReadable(e);else if(e instanceof ReadableStream)n=s.fromReadable(e.getReader());else if(ye(e))n=s.fromIter(e);else throw new TypeError("Invalid Stream Source");r?this._sourceIter=this.dechunk(n):this._sourceIter=n[Symbol.asyncIterator](),this.lastValue=null,this.errored=!1,this._savedChunk=null,this._rawOffset=0,this._readOffset=0,this.numChunks=0}async _loadNext(){let e=await this._sourceIter.next();return e.done?null:e.value}async*dechunk(e){let t=e instanceof s?e:new s(e,null),r=-1,n=!0;for(;r!=0;){let a=await t.readlineRaw(64),i=new Uint8Array;if(r=a?parseInt(G.decode(a),16):0,!r||r>2**32){if(Number.isNaN(r)||r>2**32){n||(this.errored=!0),yield a;break}}else if(i=await t.readSize(r),i.length!=r){n?yield a:this.errored=!0,yield i;break}let o=await t.readSize(2);if(o[0]!=13||o[1]!=10){n?yield a:this.errored=!0,yield i,yield o;break}else{if(n=!1,!i||r===0)return;yield i}}yield*t}unread(e){e.length&&(this._readOffset-=e.length,this._savedChunk&&console.log("Already have chunk!"),this._savedChunk=e)}async _next(){if(this._savedChunk){let t=this._savedChunk;return this._savedChunk=null,t}if(this.compressed){let t=this._getNextChunk();if(t)return t}let e=await this._loadNext();for(;this.compressed&&e;){this._push(e);let t=this._getNextChunk(e);if(t)return t;e=await this._loadNext()}return e}_push(e){if(!this.inflator)throw new Error("AsyncIterReader cannot call _push when this.compressed is null");this.lastValue=e,this.inflator.ended&&(this.inflator=new S(this.opts,this)),this.inflator.push(e),this.inflator.err&&this.inflator.ended&&this.compressed==="deflate"&&!this.opts.raw&&this.numChunks===0&&(this.opts.raw=!0,this.compressed="deflateRaw",this.inflator=new S(this.opts,this),this.inflator.push(e))}_getNextChunk(e){if(!this.inflator)throw new Error("AsyncIterReader cannot call _getNextChunk when this.compressed is null");for(;;){if(this.inflator.chunks.length>0)return this.numChunks++,this.inflator.chunks.shift();if(this.inflator.ended){if(this.inflator.err!==0)return this.compressed=null,e;let t=this.inflator.strm.avail_in;if(t&&this.lastValue){this._push(this.lastValue.slice(-t));continue}}return null}}async*[Symbol.asyncIterator](){let e=null;for(;e=await this._next();)this._readOffset+=e.length,yield e}async readlineRaw(e){let t=[],r=0,n=-1,a=null;for await(let i of this){if(e&&r+i.byteLength>e){a=i,n=e-r-1;let o=i.slice(0,n+1).indexOf(10);o>=0&&(n=o);break}if(n=i.indexOf(10),n>=0){a=i;break}t.push(i),r+=i.byteLength}if(a){let[i,o]=g(a,n+1);t.push(i),r+=i.byteLength,this.unread(o)}else if(!t.length)return null;return _(t,r)}async readFully(){return(await this._readOrSkip())[1]}async readSize(e){return(await this._readOrSkip(e))[1]}async skipSize(e){return(await this._readOrSkip(e,!0))[0]}async _readOrSkip(e=-1,t=!1){let r=[],n=0;for await(let a of this){if(e>=0)if(a.length>e){let[i,o]=g(a,e);t||r.push(i),n+=i.byteLength,this.unread(o);break}else if(a.length===e){t||r.push(a),n+=a.byteLength,e=0;break}else e-=a.length;t||r.push(a),n+=a.byteLength}return t?[n,new Uint8Array]:[n,_(r,n)]}getReadOffset(){return this._readOffset}getRawOffset(){return this.compressed?this._rawOffset:this._readOffset}getRawLength(e){return this.compressed?this.inflator.strm.total_in:this._readOffset-e}static fromReadable(e){return{async*[Symbol.asyncIterator](){let r=null;for(;(r=await e.read())&&!r.done;)yield r.value}}}static fromIter(e){return{async*[Symbol.asyncIterator](){for(let r of e)yield r}}}},w=class extends y{constructor(e,t,r=0){super(),this.sourceIter=e,this.length=t,this.limit=t,this.skip=r}setLimitSkip(e,t=0){this.limit=e,this.skip=t}async*[Symbol.asyncIterator](){if(!(this.limit<=0))for await(let e of this.sourceIter){if(this.skip>0)if(e.length>=this.skip){let[,t]=g(e,this.skip);e=t,this.skip=0}else{this.skip-=e.length;continue}if(e.length>this.limit){let[t,r]=g(e,this.limit);e=t,this.sourceIter.unread&&this.sourceIter.unread(r)}if(e.length&&(this.limit-=e.length,yield e),this.limit<=0)break}}async readlineRaw(e){if(this.limit<=0)return null;let t=await this.sourceIter.readlineRaw(e?Math.min(e,this.limit):this.limit);return this.limit-=t?.length||0,t}async skipFully(){let e=this.limit;for(;this.limit>0;)this.limit-=await this.sourceIter.skipSize(this.limit);return e}};var ge=new Uint8Array([13,10]),$e=new Uint8Array([13,10,13,10]),Re=new TextDecoder("utf-8"),x=class{constructor({statusline:e,headers:t}){this.statusline=e,this.headers=t}toString(){let e=[this.statusline];for(let[t,r]of this.headers)e.push(`${t}: ${r}`);return e.join(`\r +"use strict";var ie=Object.create;var q=Object.defineProperty;var ae=Object.getOwnPropertyDescriptor;var oe=Object.getOwnPropertyNames;var de=Object.getPrototypeOf,le=Object.prototype.hasOwnProperty;var ce=(s,e,t,r)=>{if(e&&typeof e=="object"||typeof e=="function")for(let n of oe(e))!le.call(s,n)&&n!==t&&q(s,n,{get:()=>e[n],enumerable:!(r=ae(e,n))||r.enumerable});return s};var L=(s,e,t)=>(t=s!=null?ie(de(s)):{},ce(e||!s||!s.__esModule?q(t,"default",{value:s,enumerable:!0}):t,s));var U=require("fs"),te=require("path"),H=require("process"),re=L(require("yargs"),1),se=require("yargs/helpers");var B=s=>s.positional("filenames",{describe:"WARC file(s) to index",type:"string",array:!0,demandOption:"true"}).option("fields",{alias:"f",describe:"fields to include in index",type:"string"}),N=s=>s.positional("filenames",{describe:"WARC file(s) to index",type:"string",array:!0,demandOption:"true"}).option("all",{alias:"a",describe:"index all WARC records",type:"boolean"}).option("format",{describe:"output format",choices:["json","cdxj","cdx"],default:"cdxj"}).option("noSurt",{describe:"Use plain urlkey, do not convert to SURT form (Sort-friendly URI Reordering Transform)",type:"boolean"});var X=L(require("pako"),1);function $(s){let e;typeof s=="string"?e=s:s?.length?e=s.reduce((t,r)=>(t+=String.fromCharCode(r),t),""):s?e=s.toString():e="";try{return"__wb_post_data="+btoa(e)}catch{return"__wb_post_data="}}function j(s){return s.replace(/[.*+?^${}()|[\]\\]/g,"\\$&")}function M(s){try{if(!s.startsWith("https:")&&!s.startsWith("http:"))return s;s=s.replace(/^(https?:\/\/)www\d*\./,"$1");let e=s.toLowerCase(),t=new URL(e),n=t.hostname.split(".").reverse().join(",");if(t.port&&(n+=":"+t.port),n+=")",n+=t.pathname,t.search){t.searchParams.sort(),n+=t.search;for(let[a,i]of t.searchParams.entries())if(!i){let o=encodeURIComponent(a),d=new RegExp(`(?<=[&?])${j(a)}=(?=&|$)`);if(!d.exec(e)){let l=a===o?d:new RegExp(`(?<=[&?])${j(o)}=(?=&|$)`);n=n.replace(l,o)}}}return n}catch{return s}}function Q(s){let{method:e,headers:t,postData:r=""}=s;if(e==="GET")return!1;let n=(t.get("content-type")||"").split(";")[0];function a(o){return o instanceof Uint8Array&&(o=new TextDecoder().decode(o)),o}let i="";switch(n){case"application/x-www-form-urlencoded":i=a(r);break;case"application/json":i=z(a(r));break;case"text/plain":try{i=z(a(r),!1)}catch{i=$(r)}break;case"multipart/form-data":{let o=t.get("content-type");if(!o)throw new Error("utils cannot call postToGetURL when missing content-type header");i=pe(a(r),o);break}default:i=$(r)}return i!=null?(s.url=ue(s.url,i,s.method),s.method="GET",s.requestBody=i,!0):!1}function ue(s,e,t){if(!t)return s;let r=s.indexOf("?")>0?"&":"?";return`${s}${r}__wb_method=${t}&${e}`}function he(s,e=!0){if(typeof s=="string")try{s=JSON.parse(s)}catch{s={}}let t=new URLSearchParams,r={},n=i=>t.has(i)?(i in r||(r[i]=1),i+"."+ ++r[i]+"_"):i,a=(i,o="")=>{let d="";if(typeof i=="object"&&!(i instanceof Array))try{for(let[l,h]of Object.entries(i))a(h,l)}catch{i===null&&(d="null")}else if(i instanceof Array)for(let l=0;l{r.done||!r.value?t.close():t.enqueue(r.value)})}})}async readFully(){return await s.readFully(this)}async readline(e=0){let t=await this.readlineRaw(e);return t?G.decode(t):""}async*iterLines(e=0){let t=null;for(;t=await this.readline(e);)yield t}};function ye(s){return!!(s&&Symbol.iterator in Object(s))}function me(s){return!!(s&&Symbol.asyncIterator in Object(s))}var R=class s extends y{constructor(e,t="gzip",r=!1){super(),this.compressed=t,this.opts={raw:t==="deflateRaw"},this.inflator=t?new S(this.opts,this):null;let n;if(me(e))n=e;else if(typeof e=="object"&&"read"in e&&typeof e.read=="function")n=s.fromReadable(e);else if(e instanceof ReadableStream)n=s.fromReadable(e.getReader());else if(ye(e))n=s.fromIter(e);else throw new TypeError("Invalid Stream Source");r?this._sourceIter=this.dechunk(n):this._sourceIter=n[Symbol.asyncIterator](),this.lastValue=null,this.errored=!1,this._savedChunk=null,this._rawOffset=0,this._readOffset=0,this.numChunks=0}async _loadNext(){let e=await this._sourceIter.next();return e.done?null:e.value}async*dechunk(e){let t=e instanceof s?e:new s(e,null),r=-1,n=!0;for(;r!=0;){let a=await t.readlineRaw(64),i=new Uint8Array;if(r=a?parseInt(G.decode(a),16):0,!r||r>2**32){if(Number.isNaN(r)||r>2**32){n||(this.errored=!0),yield a;break}}else if(i=await t.readSize(r),i.length!=r){n?yield a:this.errored=!0,yield i;break}let o=await t.readSize(2);if(o[0]!=13||o[1]!=10){n?yield a:this.errored=!0,yield i,yield o;break}else{if(n=!1,!i||r===0)return;yield i}}yield*t}unread(e){e.length&&(this._readOffset-=e.length,this._savedChunk&&console.log("Already have chunk!"),this._savedChunk=e)}async _next(){if(this._savedChunk){let t=this._savedChunk;return this._savedChunk=null,t}if(this.compressed){let t=this._getNextChunk();if(t)return t}let e=await this._loadNext();for(;this.compressed&&e;){this._push(e);let t=this._getNextChunk(e);if(t)return t;e=await this._loadNext()}return e}_push(e){if(!this.inflator)throw new Error("AsyncIterReader cannot call _push when this.compressed is null");this.lastValue=e,this.inflator.ended&&(this.inflator=new S(this.opts,this)),this.inflator.push(e),this.inflator.err&&this.inflator.ended&&this.compressed==="deflate"&&!this.opts.raw&&this.numChunks===0&&(this.opts.raw=!0,this.compressed="deflateRaw",this.inflator=new S(this.opts,this),this.inflator.push(e))}_getNextChunk(e){if(!this.inflator)throw new Error("AsyncIterReader cannot call _getNextChunk when this.compressed is null");for(;;){if(this.inflator.chunks.length>0)return this.numChunks++,this.inflator.chunks.shift();if(this.inflator.ended){if(this.inflator.err!==0)return this.compressed=null,e;let t=this.inflator.strm.avail_in;if(t&&this.lastValue){this._push(this.lastValue.slice(-t));continue}}return null}}async*[Symbol.asyncIterator](){let e=null;for(;e=await this._next();)this._readOffset+=e.length,yield e}async readlineRaw(e){let t=[],r=0,n=-1,a=null;for await(let i of this){if(e&&r+i.byteLength>e){a=i,n=e-r-1;let o=i.slice(0,n+1).indexOf(10);o>=0&&(n=o);break}if(n=i.indexOf(10),n>=0){a=i;break}t.push(i),r+=i.byteLength}if(a){let[i,o]=g(a,n+1);t.push(i),r+=i.byteLength,this.unread(o)}else if(!t.length)return null;return _(t,r)}async readFully(){return(await this._readOrSkip())[1]}async readSize(e){return(await this._readOrSkip(e))[1]}async skipSize(e){return(await this._readOrSkip(e,!0))[0]}async _readOrSkip(e=-1,t=!1){let r=[],n=0;for await(let a of this){if(e>=0)if(a.length>e){let[i,o]=g(a,e);t||r.push(i),n+=i.byteLength,this.unread(o);break}else if(a.length===e){t||r.push(a),n+=a.byteLength,e=0;break}else e-=a.length;t||r.push(a),n+=a.byteLength}return t?[n,new Uint8Array]:[n,_(r,n)]}getReadOffset(){return this._readOffset}getRawOffset(){return this.compressed?this._rawOffset:this._readOffset}getRawLength(e){return this.compressed?this.inflator.strm.total_in:this._readOffset-e}static fromReadable(e){return{async*[Symbol.asyncIterator](){let r=null;for(;(r=await e.read())&&!r.done;)yield r.value}}}static fromIter(e){return{async*[Symbol.asyncIterator](){for(let r of e)yield r}}}},w=class extends y{constructor(e,t,r=0){super(),this.sourceIter=e,this.length=t,this.limit=t,this.skip=r}setLimitSkip(e,t=0){this.limit=e,this.skip=t}async*[Symbol.asyncIterator](){if(!(this.limit<=0))for await(let e of this.sourceIter){if(this.skip>0)if(e.length>=this.skip){let[,t]=g(e,this.skip);e=t,this.skip=0}else{this.skip-=e.length;continue}if(e.length>this.limit){let[t,r]=g(e,this.limit);e=t,this.sourceIter.unread&&this.sourceIter.unread(r)}if(e.length&&(this.limit-=e.length,yield e),this.limit<=0)break}}async readlineRaw(e){if(this.limit<=0)return null;let t=await this.sourceIter.readlineRaw(e?Math.min(e,this.limit):this.limit);return this.limit-=t?.length||0,t}async skipFully(){let e=this.limit;for(;this.limit>0;)this.limit-=await this.sourceIter.skipSize(this.limit);return e}};var ge=new Uint8Array([13,10]),je=new Uint8Array([13,10,13,10]),Re=new TextDecoder("utf-8"),x=class{constructor({statusline:e,headers:t}){this.statusline=e,this.headers=t}toString(){let e=[this.statusline];for(let[t,r]of this.headers)e.push(`${t}: ${r}`);return e.join(`\r `)+`\r `}async*iterSerialize(e){yield e.encode(this.statusline),yield ge;for(let[t,r]of this.headers)yield e.encode(`${t}: ${r}\r `)}_parseResponseStatusLine(){let e=we(this.statusline," ",2);this._protocol=e[0]??"",this._statusCode=e.length>1?Number(e[1]):"",this._statusText=e.length>2?e[2]:""}get statusCode(){return this._statusCode===void 0&&this._parseResponseStatusLine(),this._statusCode}get protocol(){return this._protocol===void 0&&this._parseResponseStatusLine(),this._protocol}get statusText(){return this._statusText===void 0&&this._parseResponseStatusLine(),this._statusText}_parseRequestStatusLine(){let e=this.statusline.split(" ",2);this._method=e[0]??"",this._requestPath=e.length>1?e[1]:""}get method(){return this._method===void 0&&this._parseRequestStatusLine(),this._method}get requestPath(){return this._requestPath===void 0&&this._parseRequestStatusLine(),this._requestPath}},k=class{async parse(e,{headersClass:t,firstLine:r}={headersClass:Map}){let n=r||await e.readline();if(!n)return null;let a=n.trimEnd();if(!a)return null;let i=new t,o=i instanceof Headers,d=await _e(e),l=0,h,b,f,u="",c;for(;l")&&this.warcHeaders.headers.set("WARC-Target-URI",t.slice(1,-1))}async readFully(t=!1){if(this.httpHeaders){if(this.payload&&!this.payload.length)return this.payload;if(this._contentReader&&!t)throw new TypeError("WARC Record decoding already started, but requesting raw payload");if(t&&this.consumed==="raw"&&this.payload)return await this._createDecodingReader([this.payload]).readFully()}return this.payload?this.payload:(t?(this.payload=await super.readFully(),this.consumed="content"):(this.payload=await y.readFully(this._reader),this.consumed="raw"),this.payload)}get reader(){if(this.payload&&!this.payload.length)return V();if(this._contentReader)throw new TypeError("WARC Record decoding already started, but requesting raw payload");return this._reader}get contentReader(){return this.httpHeaders?(this._contentReader||(this._contentReader=this._createDecodingReader(this._reader)),this._contentReader):this._reader}_createDecodingReader(t){if(!this.httpHeaders)throw new Error("WARCRecord cannot call _createDecodingReader when this.httpHeaders === null");let r=this.httpHeaders.headers.get("Content-Encoding"),n=this.httpHeaders.headers.get("Transfer-Encoding"),a=n==="chunked";return!r&&!a&&(r=n),new R(t,r,a)}async readlineRaw(t){if(this.consumed)throw new Error("Record already consumed.. Perhaps a promise was not awaited?");if(this.contentReader instanceof y)return this.contentReader.readlineRaw(t);throw new Error("WARCRecord cannot call readlineRaw on this.contentReader if it does not extend BaseAsyncIterReader")}async contentText(){let t=await this.readFully(!0);return xe.decode(t)}async*[Symbol.asyncIterator](){for await(let t of this.contentReader)if(yield t,this.consumed)throw new Error("Record already consumed.. Perhaps a promise was not awaited?");this.consumed="content"}async skipFully(){if(!this.consumed){if(this._reader instanceof w){let t=await this._reader.skipFully();return this.consumed="skipped",t}throw new Error("WARCRecord cannot call skipFully on this._reader if it is not a LimitReader")}}warcHeader(t){return this.warcHeaders.headers.get(t)}get warcType(){return this.warcHeaders.headers.get("WARC-Type")}get warcTargetURI(){return this.warcHeaders.headers.get("WARC-Target-URI")}get warcDate(){return this.warcHeaders.headers.get("WARC-Date")}get warcRefersToTargetURI(){return this.warcHeaders.headers.get("WARC-Refers-To-Target-URI")}get warcRefersToDate(){return this.warcHeaders.headers.get("WARC-Refers-To-Date")}get warcPayloadDigest(){return this.warcHeaders.headers.get("WARC-Payload-Digest")}get warcBlockDigest(){return this.warcHeaders.headers.get("WARC-Block-Digest")}get warcContentType(){return this.warcHeaders.headers.get("Content-Type")}get warcContentLength(){return Number(this.warcHeaders.headers.get("Content-Length"))}};async function*V(){}var Z=new TextDecoder,P=new Uint8Array([]),W=class s{static async parse(e,t){return new s(e,t).parse()}static iterRecords(e,t){return new s(e,t)[Symbol.asyncIterator]()}constructor(e,{keepHeadersCase:t=!1,parseHttp:r=!0}={}){this._offset=0,this._warcHeadersLength=0,this._headersClass=t?Map:Headers,this._parseHttp=r,e instanceof R?this._reader=e:this._reader=new R(e),this._record=null}async readToNextRecord(){if(!this._reader||!this._record)return P;await this._record.skipFully(),this._reader.compressed&&(this._offset=this._reader.getRawOffset());let e=await this._reader.readlineRaw(),t=0;if(!e)e=P;else{if(t=e.byteLength-1,t===9&&Z.decode(e).startsWith("WARC/"))return e;for(;t>0;){let r=e[t-1];if(r!==10&&r!==13)break;t--}t&&console.warn(`Content-Length Too Small: Record not followed by newline, Remainder Length: ${t}, Offset: ${this._reader.getRawOffset()-e.byteLength}`)}if(this._reader.compressed)await this._reader.skipSize(2),e=P;else{for(e=await this._reader.readlineRaw();e&&e.byteLength===2;)e=await this._reader.readlineRaw();this._offset=this._reader.getRawOffset(),e&&(this._offset-=e.length)}return e}_initRecordReader(e){return new w(this._reader,Number(e.headers.get("Content-Length")||0))}async parse(){let e=await this.readToNextRecord(),t=e?Z.decode(e):"",r=new k,n=await r.parse(this._reader,{firstLine:t,headersClass:this._headersClass});if(!n)return null;this._warcHeadersLength=this._reader.getReadOffset();let a=new T({warcHeaders:n,reader:this._initRecordReader(n)});if(this._record=a,this._parseHttp)switch(a.warcType){case"response":case"request":await this._addHttpHeaders(a,r);break;case"revisit":a.warcContentLength>0&&await this._addHttpHeaders(a,r);break}return a}get offset(){return this._offset}get recordLength(){return this._reader.getRawLength(this._offset)}async*[Symbol.asyncIterator](){let e=null;for(;(e=await this.parse())!==null;)yield e;this._record=null}async _addHttpHeaders(e,t){let r=await t.parse(this._reader,{headersClass:this._headersClass});e.httpHeaders=r;let n=this._reader.getReadOffset()-this._warcHeadersLength;e.reader instanceof w&&e.reader.setLimitSkip(e.warcContentLength-n)}};var Te=["offset","warc-type","warc-target-uri"],O=class{constructor(e={}){this.opts=e,this.fields=e.fields?e.fields.split(","):Te,this.parseHttp=!1}serialize(e){return JSON.stringify(e)+` `}write(e,t){t.write(this.serialize(e))}async writeAll(e,t){for await(let r of this.iterIndex(e))this.write(r,t)}async*iterIndex(e){let t={strictHeaders:!0,parseHttp:this.parseHttp};for(let{filename:r,reader:n}of e){let a=new W(n,t);yield*this.iterRecords(a,r)}}async*iterRecords(e,t){for await(let r of e){await r.skipFully();let n=this.indexRecord(r,e,t);n&&(yield n)}}indexRecord(e,t,r){if(this.filterRecord&&!this.filterRecord(e))return null;let n={},{offset:a,recordLength:i}=t,o={offset:""+a,length:""+i,filename:r};for(let d of this.fields)d in o?n[d]=o[d]:this.setField(d,e,n);return n}setField(e,t,r){let n=this.getField(e,t);n!==null&&(r[e]=n)}getField(e,t){if(e==="http:status")return t.httpHeaders&&(t.warcType==="response"||t.warcType==="revisit")?""+t.httpHeaders.statusCode:null;if(e.startsWith("http:")){if(t.httpHeaders){let r=t.httpHeaders.headers;return r instanceof Map&&(r=new Headers(Object.fromEntries(r))),r.get(e.slice(5))}return null}return t.warcHeaders.headers.get(e)||null}},C=class extends O{constructor(e){super(e);for(let t of this.fields)if(t.startsWith("http:")){this.parseHttp=!0;break}}},We="urlkey,timestamp,url,mime,status,digest,length,offset,filename".split(","),Ue="urlkey,timestamp,url,mime,status,digest,redirect,meta,length,offset,filename".split(","),I=class extends C{constructor(e){switch(super(e),this.includeAll=!!e?.all,this.overrideIndexForAll=!!e?.all,this.fields=We,this.parseHttp=!0,this.noSurt=!!e?.noSurt,this._lastRecord=null,e?.format){case"cdxj":this.serialize=this.serializeCDXJ;break;case"cdx":this.serialize=this.serializeCDX11;break;case"json":default:break}}async*iterRecords(e,t){this._lastRecord=null;for await(let n of e){await n.readFully();let a=this.indexRecord(n,e,t);a&&(yield a)}let r=this.indexRecord(null,e,t);r&&(yield r)}filterRecord(e){if(this.includeAll)return!0;let t=e.warcType;return!(t==="request"||t==="warcinfo"||(t==="metadata"||t==="resource")&&e.warcContentType==="application/warc-fields")}indexRecord(e,t,r){if(this.overrideIndexForAll)return e?super.indexRecord(e,t,r):null;let n=this._lastRecord;if(this._lastRecord=e,e&&(e._offset=t.offset,e._length=t.recordLength),!n)return null;if(!e||n.warcTargetURI!=e.warcTargetURI)return this.indexRecordPair(n,null,t,r);let a=e.warcType,i=n.warcType;return a==="request"&&(i==="response"||i==="revisit")?(this._lastRecord=null,this.indexRecordPair(n,e,t,r)):(a==="response"||a==="revisit")&&i==="request"?(this._lastRecord=null,this.indexRecordPair(e,n,t,r)):this.indexRecordPair(n,null,t,r)}indexRecordPair(e,t,r,n){let a,i,o=e.warcTargetURI||"";if(t?.httpHeaders&&t.httpHeaders.method!=="GET"){let l={url:o,method:t.httpHeaders.method,headers:t.httpHeaders.headers,postData:t.payload};a=l.method,Q(l)&&(i=l.requestBody,e.method=a,e.requestBody=i,o=l.url)}e._urlkey=o;let d=super.indexRecord(e,r,n);return d&&(e._offset!==void 0&&(d.offset=e._offset,d.length=e._length),a&&(d.method=a),i&&(d.requestBody=i)),d}serializeCDXJ(e){let{urlkey:t,timestamp:r}=e;return delete e.urlkey,delete e.timestamp,`${t} ${r} ${JSON.stringify(e)} `}serializeCDX11(e){let t=[];for(let r of Ue)t.push(e[r]!=null?e[r]:"-");return t.join(" ")+` -`}getField(e,t){let r=null;switch(e){case"urlkey":return r=t._urlkey||t.warcTargetURI||null,this.noSurt||r===null?r:M(r);case"timestamp":return r=t.warcDate??"",r.replace(/[-:T]/g,"").slice(0,14);case"url":return t.warcTargetURI;case"mime":switch(t.warcType){case"revisit":return"warc/revisit";case"response":case"request":e="http:content-type";break;default:e="content-type"}return r=super.getField(e,t),r?r.toString().split(";",1)[0]?.trim():null;case"status":return super.getField("http:status",t);case"digest":return r=t.warcPayloadDigest,r?r.split(":",2)[1]:null;default:return null}}};var Y="2.3.0";var Le=1024*128;async function E(s=H.stdout,e){let t=Promise.resolve();return e=e||(0,se.hideBin)(process.argv),(0,re.default)().version(Y).usage("$0 [command]").command({command:"index ",describe:"Index WARC(s)",builder:B,handler:async r=>{t=new C(r).writeAll(ee(r.filenames),s)}}).command({command:"cdx-index ",describe:"CDX(J) Index of WARC(s)",builder:j,handler:async r=>{t=new I(r).writeAll(ee(r.filenames),s)}}).demandCommand(1,"Please specify a command").strictCommands().help().parseAsync(e),t}function ee(s){return s.reduce((e,t)=>{if(!(0,U.lstatSync)(t).isFile())return H.stderr.write(`Skipping ${t}, not a file +`}getField(e,t){let r=null;switch(e){case"urlkey":return r=t._urlkey||t.warcTargetURI||null,this.noSurt||r===null?r:M(r);case"timestamp":return r=t.warcDate??"",r.replace(/[-:T]/g,"").slice(0,14);case"url":return t.warcTargetURI;case"mime":switch(t.warcType){case"revisit":return"warc/revisit";case"response":case"request":e="http:content-type";break;default:e="content-type"}return r=super.getField(e,t),r?r.toString().split(";",1)[0]?.trim():null;case"status":return super.getField("http:status",t);case"digest":return r=t.warcPayloadDigest,r?r.split(":",2)[1]:null;default:return null}}};var Y="2.3.0";var Le=1024*128;async function E(s=H.stdout,e){let t=Promise.resolve();return e=e||(0,se.hideBin)(process.argv),(0,re.default)().version(Y).usage("$0 [command]").command({command:"index ",describe:"Index WARC(s)",builder:B,handler:async r=>{t=new C(r).writeAll(ee(r.filenames),s)}}).command({command:"cdx-index ",describe:"CDX(J) Index of WARC(s)",builder:N,handler:async r=>{t=new I(r).writeAll(ee(r.filenames),s)}}).demandCommand(1,"Please specify a command").strictCommands().help().parseAsync(e),t}function ee(s){return s.reduce((e,t)=>{if(!(0,U.lstatSync)(t).isFile())return H.stderr.write(`Skipping ${t}, not a file `),e;let r=(0,U.createReadStream)(t,{highWaterMark:Le});return t=(0,te.basename)(t),e.push({filename:t,reader:r}),e},[])}E(); diff --git a/dist/cli.js b/dist/cli.js index bb55286..0b9c8c4 100755 --- a/dist/cli.js +++ b/dist/cli.js @@ -1,11 +1,11 @@ #!/usr/bin/env node -import{lstatSync as _e,createReadStream as xe}from"node:fs";import{basename as Ce}from"node:path";import{stdout as be,stderr as Se}from"node:process";import Ie from"yargs";import{hideBin as ke}from"yargs/helpers";var D=n=>n.positional("filenames",{describe:"WARC file(s) to index",type:"string",array:!0,demandOption:"true"}).option("fields",{alias:"f",describe:"fields to include in index",type:"string"}),F=n=>n.positional("filenames",{describe:"WARC file(s) to index",type:"string",array:!0,demandOption:"true"}).option("all",{alias:"a",describe:"index all WARC records",type:"boolean"}).option("format",{describe:"output format",choices:["json","cdxj","cdx"],default:"cdxj"}).option("noSurt",{describe:"Use plain urlkey, do not convert to SURT form (Sort-friendly URI Reordering Transform)",type:"boolean"});import te from"pako";function q(n){let e;typeof n=="string"?e=n:n?.length?e=n.reduce((t,r)=>(t+=String.fromCharCode(r),t),""):n?e=n.toString():e="";try{return"__wb_post_data="+btoa(e)}catch{return"__wb_post_data="}}function B(n){return n.replace(/[.*+?^${}()|[\]\\]/g,"\\$&")}function N(n){try{if(!n.startsWith("https:")&&!n.startsWith("http:"))return n;n=n.replace(/^(https?:\/\/)www\d*\./,"$1");let e=n.toLowerCase(),t=new URL(e),s=t.hostname.split(".").reverse().join(",");if(t.port&&(s+=":"+t.port),s+=")",s+=t.pathname,t.search){t.searchParams.sort(),s+=t.search;for(let[a,i]of t.searchParams.entries())if(!i){let o=encodeURIComponent(a),d=new RegExp(`(?<=[&?])${B(a)}=(?=&|$)`);if(!d.exec(e)){let l=a===o?d:new RegExp(`(?<=[&?])${B(o)}=(?=&|$)`);s=s.replace(l,o)}}}return s}catch{return n}}function $(n){let{method:e,headers:t,postData:r=""}=n;if(e==="GET")return!1;let s=(t.get("content-type")||"").split(";")[0];function a(o){return o instanceof Uint8Array&&(o=new TextDecoder().decode(o)),o}let i="";switch(s){case"application/x-www-form-urlencoded":i=a(r);break;case"application/json":i=j(a(r));break;case"text/plain":try{i=j(a(r),!1)}catch{i=q(r)}break;case"multipart/form-data":{let o=t.get("content-type");if(!o)throw new Error("utils cannot call postToGetURL when missing content-type header");i=ee(a(r),o);break}default:i=q(r)}return i!=null?(n.url=K(n.url,i,n.method),n.method="GET",n.requestBody=i,!0):!1}function K(n,e,t){if(!t)return n;let r=n.indexOf("?")>0?"&":"?";return`${n}${r}__wb_method=${t}&${e}`}function Z(n,e=!0){if(typeof n=="string")try{n=JSON.parse(n)}catch{n={}}let t=new URLSearchParams,r={},s=i=>t.has(i)?(i in r||(r[i]=1),i+"."+ ++r[i]+"_"):i,a=(i,o="")=>{let d="";if(typeof i=="object"&&!(i instanceof Array))try{for(let[l,h]of Object.entries(i))a(h,l)}catch{i===null&&(d="null")}else if(i instanceof Array)for(let l=0;l{r.done||!r.value?t.close():t.enqueue(r.value)})}})}async readFully(){return await n.readFully(this)}async readline(e=0){let t=await this.readlineRaw(e);return t?z.decode(t):""}async*iterLines(e=0){let t=null;for(;t=await this.readline(e);)yield t}};function re(n){return!!(n&&Symbol.iterator in Object(n))}function se(n){return!!(n&&Symbol.asyncIterator in Object(n))}var w=class n extends m{constructor(e,t="gzip",r=!1){super(),this.compressed=t,this.opts={raw:t==="deflateRaw"},this.inflator=t?new I(this.opts,this):null;let s;if(se(e))s=e;else if(typeof e=="object"&&"read"in e&&typeof e.read=="function")s=n.fromReadable(e);else if(e instanceof ReadableStream)s=n.fromReadable(e.getReader());else if(re(e))s=n.fromIter(e);else throw new TypeError("Invalid Stream Source");r?this._sourceIter=this.dechunk(s):this._sourceIter=s[Symbol.asyncIterator](),this.lastValue=null,this.errored=!1,this._savedChunk=null,this._rawOffset=0,this._readOffset=0,this.numChunks=0}async _loadNext(){let e=await this._sourceIter.next();return e.done?null:e.value}async*dechunk(e){let t=e instanceof n?e:new n(e,null),r=-1,s=!0;for(;r!=0;){let a=await t.readlineRaw(64),i=new Uint8Array;if(r=a?parseInt(z.decode(a),16):0,!r||r>2**32){if(Number.isNaN(r)||r>2**32){s||(this.errored=!0),yield a;break}}else if(i=await t.readSize(r),i.length!=r){s?yield a:this.errored=!0,yield i;break}let o=await t.readSize(2);if(o[0]!=13||o[1]!=10){s?yield a:this.errored=!0,yield i,yield o;break}else{if(s=!1,!i||r===0)return;yield i}}yield*t}unread(e){e.length&&(this._readOffset-=e.length,this._savedChunk&&console.log("Already have chunk!"),this._savedChunk=e)}async _next(){if(this._savedChunk){let t=this._savedChunk;return this._savedChunk=null,t}if(this.compressed){let t=this._getNextChunk();if(t)return t}let e=await this._loadNext();for(;this.compressed&&e;){this._push(e);let t=this._getNextChunk(e);if(t)return t;e=await this._loadNext()}return e}_push(e){if(!this.inflator)throw new Error("AsyncIterReader cannot call _push when this.compressed is null");this.lastValue=e,this.inflator.ended&&(this.inflator=new I(this.opts,this)),this.inflator.push(e),this.inflator.err&&this.inflator.ended&&this.compressed==="deflate"&&!this.opts.raw&&this.numChunks===0&&(this.opts.raw=!0,this.compressed="deflateRaw",this.inflator=new I(this.opts,this),this.inflator.push(e))}_getNextChunk(e){if(!this.inflator)throw new Error("AsyncIterReader cannot call _getNextChunk when this.compressed is null");for(;;){if(this.inflator.chunks.length>0)return this.numChunks++,this.inflator.chunks.shift();if(this.inflator.ended){if(this.inflator.err!==0)return this.compressed=null,e;let t=this.inflator.strm.avail_in;if(t&&this.lastValue){this._push(this.lastValue.slice(-t));continue}}return null}}async*[Symbol.asyncIterator](){let e=null;for(;e=await this._next();)this._readOffset+=e.length,yield e}async readlineRaw(e){let t=[],r=0,s=-1,a=null;for await(let i of this){if(e&&r+i.byteLength>e){a=i,s=e-r-1;let o=i.slice(0,s+1).indexOf(10);o>=0&&(s=o);break}if(s=i.indexOf(10),s>=0){a=i;break}t.push(i),r+=i.byteLength}if(a){let[i,o]=R(a,s+1);t.push(i),r+=i.byteLength,this.unread(o)}else if(!t.length)return null;return x(t,r)}async readFully(){return(await this._readOrSkip())[1]}async readSize(e){return(await this._readOrSkip(e))[1]}async skipSize(e){return(await this._readOrSkip(e,!0))[0]}async _readOrSkip(e=-1,t=!1){let r=[],s=0;for await(let a of this){if(e>=0)if(a.length>e){let[i,o]=R(a,e);t||r.push(i),s+=i.byteLength,this.unread(o);break}else if(a.length===e){t||r.push(a),s+=a.byteLength,e=0;break}else e-=a.length;t||r.push(a),s+=a.byteLength}return t?[s,new Uint8Array]:[s,x(r,s)]}getReadOffset(){return this._readOffset}getRawOffset(){return this.compressed?this._rawOffset:this._readOffset}getRawLength(e){return this.compressed?this.inflator.strm.total_in:this._readOffset-e}static fromReadable(e){return{async*[Symbol.asyncIterator](){let r=null;for(;(r=await e.read())&&!r.done;)yield r.value}}}static fromIter(e){return{async*[Symbol.asyncIterator](){for(let r of e)yield r}}}},A=class extends m{constructor(e,t,r=0){super(),this.sourceIter=e,this.length=t,this.limit=t,this.skip=r}setLimitSkip(e,t=0){this.limit=e,this.skip=t}async*[Symbol.asyncIterator](){if(!(this.limit<=0))for await(let e of this.sourceIter){if(this.skip>0)if(e.length>=this.skip){let[,t]=R(e,this.skip);e=t,this.skip=0}else{this.skip-=e.length;continue}if(e.length>this.limit){let[t,r]=R(e,this.limit);e=t,this.sourceIter.unread&&this.sourceIter.unread(r)}if(e.length&&(this.limit-=e.length,yield e),this.limit<=0)break}}async readlineRaw(e){if(this.limit<=0)return null;let t=await this.sourceIter.readlineRaw(e?Math.min(e,this.limit):this.limit);return this.limit-=t?.length||0,t}async skipFully(){let e=this.limit;for(;this.limit>0;)this.limit-=await this.sourceIter.skipSize(this.limit);return e}};var ne=new Uint8Array([13,10]),Fe=new Uint8Array([13,10,13,10]),ie=new TextDecoder("utf-8"),C=class{constructor({statusline:e,headers:t}){this.statusline=e,this.headers=t}toString(){let e=[this.statusline];for(let[t,r]of this.headers)e.push(`${t}: ${r}`);return e.join(`\r +import{lstatSync as _e,createReadStream as xe}from"fs";import{basename as Ce}from"path";import{stdout as be,stderr as Se}from"process";import Ie from"yargs";import{hideBin as ke}from"yargs/helpers";var D=n=>n.positional("filenames",{describe:"WARC file(s) to index",type:"string",array:!0,demandOption:"true"}).option("fields",{alias:"f",describe:"fields to include in index",type:"string"}),F=n=>n.positional("filenames",{describe:"WARC file(s) to index",type:"string",array:!0,demandOption:"true"}).option("all",{alias:"a",describe:"index all WARC records",type:"boolean"}).option("format",{describe:"output format",choices:["json","cdxj","cdx"],default:"cdxj"}).option("noSurt",{describe:"Use plain urlkey, do not convert to SURT form (Sort-friendly URI Reordering Transform)",type:"boolean"});import te from"pako";function q(n){let e;typeof n=="string"?e=n:n?.length?e=n.reduce((t,r)=>(t+=String.fromCharCode(r),t),""):n?e=n.toString():e="";try{return"__wb_post_data="+btoa(e)}catch{return"__wb_post_data="}}function B(n){return n.replace(/[.*+?^${}()|[\]\\]/g,"\\$&")}function $(n){try{if(!n.startsWith("https:")&&!n.startsWith("http:"))return n;n=n.replace(/^(https?:\/\/)www\d*\./,"$1");let e=n.toLowerCase(),t=new URL(e),s=t.hostname.split(".").reverse().join(",");if(t.port&&(s+=":"+t.port),s+=")",s+=t.pathname,t.search){t.searchParams.sort(),s+=t.search;for(let[a,i]of t.searchParams.entries())if(!i){let o=encodeURIComponent(a),d=new RegExp(`(?<=[&?])${B(a)}=(?=&|$)`);if(!d.exec(e)){let l=a===o?d:new RegExp(`(?<=[&?])${B(o)}=(?=&|$)`);s=s.replace(l,o)}}}return s}catch{return n}}function j(n){let{method:e,headers:t,postData:r=""}=n;if(e==="GET")return!1;let s=(t.get("content-type")||"").split(";")[0];function a(o){return o instanceof Uint8Array&&(o=new TextDecoder().decode(o)),o}let i="";switch(s){case"application/x-www-form-urlencoded":i=a(r);break;case"application/json":i=N(a(r));break;case"text/plain":try{i=N(a(r),!1)}catch{i=q(r)}break;case"multipart/form-data":{let o=t.get("content-type");if(!o)throw new Error("utils cannot call postToGetURL when missing content-type header");i=ee(a(r),o);break}default:i=q(r)}return i!=null?(n.url=K(n.url,i,n.method),n.method="GET",n.requestBody=i,!0):!1}function K(n,e,t){if(!t)return n;let r=n.indexOf("?")>0?"&":"?";return`${n}${r}__wb_method=${t}&${e}`}function Z(n,e=!0){if(typeof n=="string")try{n=JSON.parse(n)}catch{n={}}let t=new URLSearchParams,r={},s=i=>t.has(i)?(i in r||(r[i]=1),i+"."+ ++r[i]+"_"):i,a=(i,o="")=>{let d="";if(typeof i=="object"&&!(i instanceof Array))try{for(let[l,h]of Object.entries(i))a(h,l)}catch{i===null&&(d="null")}else if(i instanceof Array)for(let l=0;l{r.done||!r.value?t.close():t.enqueue(r.value)})}})}async readFully(){return await n.readFully(this)}async readline(e=0){let t=await this.readlineRaw(e);return t?z.decode(t):""}async*iterLines(e=0){let t=null;for(;t=await this.readline(e);)yield t}};function re(n){return!!(n&&Symbol.iterator in Object(n))}function se(n){return!!(n&&Symbol.asyncIterator in Object(n))}var w=class n extends m{constructor(e,t="gzip",r=!1){super(),this.compressed=t,this.opts={raw:t==="deflateRaw"},this.inflator=t?new I(this.opts,this):null;let s;if(se(e))s=e;else if(typeof e=="object"&&"read"in e&&typeof e.read=="function")s=n.fromReadable(e);else if(e instanceof ReadableStream)s=n.fromReadable(e.getReader());else if(re(e))s=n.fromIter(e);else throw new TypeError("Invalid Stream Source");r?this._sourceIter=this.dechunk(s):this._sourceIter=s[Symbol.asyncIterator](),this.lastValue=null,this.errored=!1,this._savedChunk=null,this._rawOffset=0,this._readOffset=0,this.numChunks=0}async _loadNext(){let e=await this._sourceIter.next();return e.done?null:e.value}async*dechunk(e){let t=e instanceof n?e:new n(e,null),r=-1,s=!0;for(;r!=0;){let a=await t.readlineRaw(64),i=new Uint8Array;if(r=a?parseInt(z.decode(a),16):0,!r||r>2**32){if(Number.isNaN(r)||r>2**32){s||(this.errored=!0),yield a;break}}else if(i=await t.readSize(r),i.length!=r){s?yield a:this.errored=!0,yield i;break}let o=await t.readSize(2);if(o[0]!=13||o[1]!=10){s?yield a:this.errored=!0,yield i,yield o;break}else{if(s=!1,!i||r===0)return;yield i}}yield*t}unread(e){e.length&&(this._readOffset-=e.length,this._savedChunk&&console.log("Already have chunk!"),this._savedChunk=e)}async _next(){if(this._savedChunk){let t=this._savedChunk;return this._savedChunk=null,t}if(this.compressed){let t=this._getNextChunk();if(t)return t}let e=await this._loadNext();for(;this.compressed&&e;){this._push(e);let t=this._getNextChunk(e);if(t)return t;e=await this._loadNext()}return e}_push(e){if(!this.inflator)throw new Error("AsyncIterReader cannot call _push when this.compressed is null");this.lastValue=e,this.inflator.ended&&(this.inflator=new I(this.opts,this)),this.inflator.push(e),this.inflator.err&&this.inflator.ended&&this.compressed==="deflate"&&!this.opts.raw&&this.numChunks===0&&(this.opts.raw=!0,this.compressed="deflateRaw",this.inflator=new I(this.opts,this),this.inflator.push(e))}_getNextChunk(e){if(!this.inflator)throw new Error("AsyncIterReader cannot call _getNextChunk when this.compressed is null");for(;;){if(this.inflator.chunks.length>0)return this.numChunks++,this.inflator.chunks.shift();if(this.inflator.ended){if(this.inflator.err!==0)return this.compressed=null,e;let t=this.inflator.strm.avail_in;if(t&&this.lastValue){this._push(this.lastValue.slice(-t));continue}}return null}}async*[Symbol.asyncIterator](){let e=null;for(;e=await this._next();)this._readOffset+=e.length,yield e}async readlineRaw(e){let t=[],r=0,s=-1,a=null;for await(let i of this){if(e&&r+i.byteLength>e){a=i,s=e-r-1;let o=i.slice(0,s+1).indexOf(10);o>=0&&(s=o);break}if(s=i.indexOf(10),s>=0){a=i;break}t.push(i),r+=i.byteLength}if(a){let[i,o]=R(a,s+1);t.push(i),r+=i.byteLength,this.unread(o)}else if(!t.length)return null;return x(t,r)}async readFully(){return(await this._readOrSkip())[1]}async readSize(e){return(await this._readOrSkip(e))[1]}async skipSize(e){return(await this._readOrSkip(e,!0))[0]}async _readOrSkip(e=-1,t=!1){let r=[],s=0;for await(let a of this){if(e>=0)if(a.length>e){let[i,o]=R(a,e);t||r.push(i),s+=i.byteLength,this.unread(o);break}else if(a.length===e){t||r.push(a),s+=a.byteLength,e=0;break}else e-=a.length;t||r.push(a),s+=a.byteLength}return t?[s,new Uint8Array]:[s,x(r,s)]}getReadOffset(){return this._readOffset}getRawOffset(){return this.compressed?this._rawOffset:this._readOffset}getRawLength(e){return this.compressed?this.inflator.strm.total_in:this._readOffset-e}static fromReadable(e){return{async*[Symbol.asyncIterator](){let r=null;for(;(r=await e.read())&&!r.done;)yield r.value}}}static fromIter(e){return{async*[Symbol.asyncIterator](){for(let r of e)yield r}}}},A=class extends m{constructor(e,t,r=0){super(),this.sourceIter=e,this.length=t,this.limit=t,this.skip=r}setLimitSkip(e,t=0){this.limit=e,this.skip=t}async*[Symbol.asyncIterator](){if(!(this.limit<=0))for await(let e of this.sourceIter){if(this.skip>0)if(e.length>=this.skip){let[,t]=R(e,this.skip);e=t,this.skip=0}else{this.skip-=e.length;continue}if(e.length>this.limit){let[t,r]=R(e,this.limit);e=t,this.sourceIter.unread&&this.sourceIter.unread(r)}if(e.length&&(this.limit-=e.length,yield e),this.limit<=0)break}}async readlineRaw(e){if(this.limit<=0)return null;let t=await this.sourceIter.readlineRaw(e?Math.min(e,this.limit):this.limit);return this.limit-=t?.length||0,t}async skipFully(){let e=this.limit;for(;this.limit>0;)this.limit-=await this.sourceIter.skipSize(this.limit);return e}};var ne=new Uint8Array([13,10]),Fe=new Uint8Array([13,10,13,10]),ie=new TextDecoder("utf-8"),C=class{constructor({statusline:e,headers:t}){this.statusline=e,this.headers=t}toString(){let e=[this.statusline];for(let[t,r]of this.headers)e.push(`${t}: ${r}`);return e.join(`\r `)+`\r `}async*iterSerialize(e){yield e.encode(this.statusline),yield ne;for(let[t,r]of this.headers)yield e.encode(`${t}: ${r}\r `)}_parseResponseStatusLine(){let e=ae(this.statusline," ",2);this._protocol=e[0]??"",this._statusCode=e.length>1?Number(e[1]):"",this._statusText=e.length>2?e[2]:""}get statusCode(){return this._statusCode===void 0&&this._parseResponseStatusLine(),this._statusCode}get protocol(){return this._protocol===void 0&&this._parseResponseStatusLine(),this._protocol}get statusText(){return this._statusText===void 0&&this._parseResponseStatusLine(),this._statusText}_parseRequestStatusLine(){let e=this.statusline.split(" ",2);this._method=e[0]??"",this._requestPath=e.length>1?e[1]:""}get method(){return this._method===void 0&&this._parseRequestStatusLine(),this._method}get requestPath(){return this._requestPath===void 0&&this._parseRequestStatusLine(),this._requestPath}},T=class{async parse(e,{headersClass:t,firstLine:r}={headersClass:Map}){let s=r||await e.readline();if(!s)return null;let a=s.trimEnd();if(!a)return null;let i=new t,o=i instanceof Headers,d=await de(e),l=0,h,S,f,u="",c;for(;l=0&&h0&&s.push(r.slice(t).join(e)),s}async function oe(n,e){let t=0;for(let r=0;r=n.length){let{value:a}=await e.next();if(!a)break;let i=new Uint8Array(a.length+n.length);i.set(n,0),i.set(a,n.length),n=i}if(n[s+1]===10&&n[s+2]===13&&n[s+3]===10)return[s+3,n];t=s+1}return[-1,n]}async function de(n){let e=[],t=0,r=0,s=null,a=n[Symbol.asyncIterator]();for await(let i of a){if([r,i]=await oe(i,a),r>=0){s=i;break}e.push(i),t+=i.byteLength}if(s){let[i,o]=R(s,r+1);e.push(i),t+=i.byteLength,n.unread(o)}else if(!e.length)return"";return ie.decode(x(e,t))}import le from"uuid-random";var ce=new TextDecoder("utf-8"),ue=new TextEncoder,he="WARC/1.1",M="WARC/1.0",fe="http://netpreserve.org/warc/1.0/revisit/identical-payload-digest",pe="http://netpreserve.org/warc/1.1/revisit/identical-payload-digest",ye={warcinfo:"application/warc-fields",response:"application/http; msgtype=response",revisit:"application/http; msgtype=response",request:"application/http; msgtype=request",metadata:"application/warc-fields"},W=class n extends m{constructor({warcHeaders:t,reader:r}){super();this._offset=0;this._length=0;this.method="";this.requestBody="";this._urlkey="";this.warcHeaders=t,this._reader=r,this._contentReader=null,this.payload=null,this.httpHeaders=null,this.consumed="",this.fixUp()}static create({url:t,date:r,type:s,warcHeaders:a={},filename:i="",httpHeaders:o={},statusline:d="HTTP/1.1 200 OK",warcVersion:l=M,keepHeadersCase:h=!0,refersToUrl:S=void 0,refersToDate:f=void 0}={},u){function c(_){let V=_;return l===M&&(_=_.split(".")[0],_.charAt(V.length-1)!="Z"&&(_+="Z")),_}r=c(r||new Date().toISOString()),a={...a},s==="warcinfo"?i&&(a["WARC-Filename"]=i):t&&(a["WARC-Target-URI"]=t),a["WARC-Date"]=r,s&&(a["WARC-Type"]=s),s==="revisit"&&(a["WARC-Profile"]=l===he?pe:fe,S&&(a["WARC-Refers-To-Target-URI"]=S,a["WARC-Refers-To-Date"]=c(f||new Date().toISOString())));let g=new C({statusline:l,headers:new Map(Object.entries(a))});g.headers.get("WARC-Record-ID")||g.headers.set("WARC-Record-ID",``),g.headers.get("Content-Type")||g.headers.set("Content-Type",s&&ye[s]||"application/octet-stream"),u||(u=Q());let O=new n({warcHeaders:g,reader:u}),E=null,H=[];switch(s){case"response":case"request":case"revisit":H=Object.entries(o),E=h?new Map(H):new Headers(o),(H.length>0||s!=="revisit")&&(O.httpHeaders=new C({statusline:d,headers:E}));break}return O}static createWARCInfo(t={},r){async function*s(){for(let[a,i]of Object.entries(r))yield ue.encode(`${a}: ${i}\r `)}return t.type="warcinfo",n.create(t,s())}getResponseInfo(){let t=this.httpHeaders;return t?{headers:t.headers,status:t.statusCode,statusText:t.statusText}:null}fixUp(){let t=this.warcHeaders.headers.get("WARC-Target-URI");t&&t.startsWith("<")&&t.endsWith(">")&&this.warcHeaders.headers.set("WARC-Target-URI",t.slice(1,-1))}async readFully(t=!1){if(this.httpHeaders){if(this.payload&&!this.payload.length)return this.payload;if(this._contentReader&&!t)throw new TypeError("WARC Record decoding already started, but requesting raw payload");if(t&&this.consumed==="raw"&&this.payload)return await this._createDecodingReader([this.payload]).readFully()}return this.payload?this.payload:(t?(this.payload=await super.readFully(),this.consumed="content"):(this.payload=await m.readFully(this._reader),this.consumed="raw"),this.payload)}get reader(){if(this.payload&&!this.payload.length)return Q();if(this._contentReader)throw new TypeError("WARC Record decoding already started, but requesting raw payload");return this._reader}get contentReader(){return this.httpHeaders?(this._contentReader||(this._contentReader=this._createDecodingReader(this._reader)),this._contentReader):this._reader}_createDecodingReader(t){if(!this.httpHeaders)throw new Error("WARCRecord cannot call _createDecodingReader when this.httpHeaders === null");let r=this.httpHeaders.headers.get("Content-Encoding"),s=this.httpHeaders.headers.get("Transfer-Encoding"),a=s==="chunked";return!r&&!a&&(r=s),new w(t,r,a)}async readlineRaw(t){if(this.consumed)throw new Error("Record already consumed.. Perhaps a promise was not awaited?");if(this.contentReader instanceof m)return this.contentReader.readlineRaw(t);throw new Error("WARCRecord cannot call readlineRaw on this.contentReader if it does not extend BaseAsyncIterReader")}async contentText(){let t=await this.readFully(!0);return ce.decode(t)}async*[Symbol.asyncIterator](){for await(let t of this.contentReader)if(yield t,this.consumed)throw new Error("Record already consumed.. Perhaps a promise was not awaited?");this.consumed="content"}async skipFully(){if(!this.consumed){if(this._reader instanceof A){let t=await this._reader.skipFully();return this.consumed="skipped",t}throw new Error("WARCRecord cannot call skipFully on this._reader if it is not a LimitReader")}}warcHeader(t){return this.warcHeaders.headers.get(t)}get warcType(){return this.warcHeaders.headers.get("WARC-Type")}get warcTargetURI(){return this.warcHeaders.headers.get("WARC-Target-URI")}get warcDate(){return this.warcHeaders.headers.get("WARC-Date")}get warcRefersToTargetURI(){return this.warcHeaders.headers.get("WARC-Refers-To-Target-URI")}get warcRefersToDate(){return this.warcHeaders.headers.get("WARC-Refers-To-Date")}get warcPayloadDigest(){return this.warcHeaders.headers.get("WARC-Payload-Digest")}get warcBlockDigest(){return this.warcHeaders.headers.get("WARC-Block-Digest")}get warcContentType(){return this.warcHeaders.headers.get("Content-Type")}get warcContentLength(){return Number(this.warcHeaders.headers.get("Content-Length"))}};async function*Q(){}var X=new TextDecoder,v=new Uint8Array([]),U=class n{static async parse(e,t){return new n(e,t).parse()}static iterRecords(e,t){return new n(e,t)[Symbol.asyncIterator]()}constructor(e,{keepHeadersCase:t=!1,parseHttp:r=!0}={}){this._offset=0,this._warcHeadersLength=0,this._headersClass=t?Map:Headers,this._parseHttp=r,e instanceof w?this._reader=e:this._reader=new w(e),this._record=null}async readToNextRecord(){if(!this._reader||!this._record)return v;await this._record.skipFully(),this._reader.compressed&&(this._offset=this._reader.getRawOffset());let e=await this._reader.readlineRaw(),t=0;if(!e)e=v;else{if(t=e.byteLength-1,t===9&&X.decode(e).startsWith("WARC/"))return e;for(;t>0;){let r=e[t-1];if(r!==10&&r!==13)break;t--}t&&console.warn(`Content-Length Too Small: Record not followed by newline, Remainder Length: ${t}, Offset: ${this._reader.getRawOffset()-e.byteLength}`)}if(this._reader.compressed)await this._reader.skipSize(2),e=v;else{for(e=await this._reader.readlineRaw();e&&e.byteLength===2;)e=await this._reader.readlineRaw();this._offset=this._reader.getRawOffset(),e&&(this._offset-=e.length)}return e}_initRecordReader(e){return new A(this._reader,Number(e.headers.get("Content-Length")||0))}async parse(){let e=await this.readToNextRecord(),t=e?X.decode(e):"",r=new T,s=await r.parse(this._reader,{firstLine:t,headersClass:this._headersClass});if(!s)return null;this._warcHeadersLength=this._reader.getReadOffset();let a=new W({warcHeaders:s,reader:this._initRecordReader(s)});if(this._record=a,this._parseHttp)switch(a.warcType){case"response":case"request":await this._addHttpHeaders(a,r);break;case"revisit":a.warcContentLength>0&&await this._addHttpHeaders(a,r);break}return a}get offset(){return this._offset}get recordLength(){return this._reader.getRawLength(this._offset)}async*[Symbol.asyncIterator](){let e=null;for(;(e=await this.parse())!==null;)yield e;this._record=null}async _addHttpHeaders(e,t){let r=await t.parse(this._reader,{headersClass:this._headersClass});e.httpHeaders=r;let s=this._reader.getReadOffset()-this._warcHeadersLength;e.reader instanceof A&&e.reader.setLimitSkip(e.warcContentLength-s)}};var me=["offset","warc-type","warc-target-uri"],L=class{constructor(e={}){this.opts=e,this.fields=e.fields?e.fields.split(","):me,this.parseHttp=!1}serialize(e){return JSON.stringify(e)+` -`}write(e,t){t.write(this.serialize(e))}async writeAll(e,t){for await(let r of this.iterIndex(e))this.write(r,t)}async*iterIndex(e){let t={strictHeaders:!0,parseHttp:this.parseHttp};for(let{filename:r,reader:s}of e){let a=new U(s,t);yield*this.iterRecords(a,r)}}async*iterRecords(e,t){for await(let r of e){await r.skipFully();let s=this.indexRecord(r,e,t);s&&(yield s)}}indexRecord(e,t,r){if(this.filterRecord&&!this.filterRecord(e))return null;let s={},{offset:a,recordLength:i}=t,o={offset:""+a,length:""+i,filename:r};for(let d of this.fields)d in o?s[d]=o[d]:this.setField(d,e,s);return s}setField(e,t,r){let s=this.getField(e,t);s!==null&&(r[e]=s)}getField(e,t){if(e==="http:status")return t.httpHeaders&&(t.warcType==="response"||t.warcType==="revisit")?""+t.httpHeaders.statusCode:null;if(e.startsWith("http:")){if(t.httpHeaders){let r=t.httpHeaders.headers;return r instanceof Map&&(r=new Headers(Object.fromEntries(r))),r.get(e.slice(5))}return null}return t.warcHeaders.headers.get(e)||null}},b=class extends L{constructor(e){super(e);for(let t of this.fields)if(t.startsWith("http:")){this.parseHttp=!0;break}}},ge="urlkey,timestamp,url,mime,status,digest,length,offset,filename".split(","),Re="urlkey,timestamp,url,mime,status,digest,redirect,meta,length,offset,filename".split(","),k=class extends b{constructor(e){switch(super(e),this.includeAll=!!e?.all,this.overrideIndexForAll=!!e?.all,this.fields=ge,this.parseHttp=!0,this.noSurt=!!e?.noSurt,this._lastRecord=null,e?.format){case"cdxj":this.serialize=this.serializeCDXJ;break;case"cdx":this.serialize=this.serializeCDX11;break;case"json":default:break}}async*iterRecords(e,t){this._lastRecord=null;for await(let s of e){await s.readFully();let a=this.indexRecord(s,e,t);a&&(yield a)}let r=this.indexRecord(null,e,t);r&&(yield r)}filterRecord(e){if(this.includeAll)return!0;let t=e.warcType;return!(t==="request"||t==="warcinfo"||(t==="metadata"||t==="resource")&&e.warcContentType==="application/warc-fields")}indexRecord(e,t,r){if(this.overrideIndexForAll)return e?super.indexRecord(e,t,r):null;let s=this._lastRecord;if(this._lastRecord=e,e&&(e._offset=t.offset,e._length=t.recordLength),!s)return null;if(!e||s.warcTargetURI!=e.warcTargetURI)return this.indexRecordPair(s,null,t,r);let a=e.warcType,i=s.warcType;return a==="request"&&(i==="response"||i==="revisit")?(this._lastRecord=null,this.indexRecordPair(s,e,t,r)):(a==="response"||a==="revisit")&&i==="request"?(this._lastRecord=null,this.indexRecordPair(e,s,t,r)):this.indexRecordPair(s,null,t,r)}indexRecordPair(e,t,r,s){let a,i,o=e.warcTargetURI||"";if(t?.httpHeaders&&t.httpHeaders.method!=="GET"){let l={url:o,method:t.httpHeaders.method,headers:t.httpHeaders.headers,postData:t.payload};a=l.method,$(l)&&(i=l.requestBody,e.method=a,e.requestBody=i,o=l.url)}e._urlkey=o;let d=super.indexRecord(e,r,s);return d&&(e._offset!==void 0&&(d.offset=e._offset,d.length=e._length),a&&(d.method=a),i&&(d.requestBody=i)),d}serializeCDXJ(e){let{urlkey:t,timestamp:r}=e;return delete e.urlkey,delete e.timestamp,`${t} ${r} ${JSON.stringify(e)} +`}write(e,t){t.write(this.serialize(e))}async writeAll(e,t){for await(let r of this.iterIndex(e))this.write(r,t)}async*iterIndex(e){let t={strictHeaders:!0,parseHttp:this.parseHttp};for(let{filename:r,reader:s}of e){let a=new U(s,t);yield*this.iterRecords(a,r)}}async*iterRecords(e,t){for await(let r of e){await r.skipFully();let s=this.indexRecord(r,e,t);s&&(yield s)}}indexRecord(e,t,r){if(this.filterRecord&&!this.filterRecord(e))return null;let s={},{offset:a,recordLength:i}=t,o={offset:""+a,length:""+i,filename:r};for(let d of this.fields)d in o?s[d]=o[d]:this.setField(d,e,s);return s}setField(e,t,r){let s=this.getField(e,t);s!==null&&(r[e]=s)}getField(e,t){if(e==="http:status")return t.httpHeaders&&(t.warcType==="response"||t.warcType==="revisit")?""+t.httpHeaders.statusCode:null;if(e.startsWith("http:")){if(t.httpHeaders){let r=t.httpHeaders.headers;return r instanceof Map&&(r=new Headers(Object.fromEntries(r))),r.get(e.slice(5))}return null}return t.warcHeaders.headers.get(e)||null}},b=class extends L{constructor(e){super(e);for(let t of this.fields)if(t.startsWith("http:")){this.parseHttp=!0;break}}},ge="urlkey,timestamp,url,mime,status,digest,length,offset,filename".split(","),Re="urlkey,timestamp,url,mime,status,digest,redirect,meta,length,offset,filename".split(","),k=class extends b{constructor(e){switch(super(e),this.includeAll=!!e?.all,this.overrideIndexForAll=!!e?.all,this.fields=ge,this.parseHttp=!0,this.noSurt=!!e?.noSurt,this._lastRecord=null,e?.format){case"cdxj":this.serialize=this.serializeCDXJ;break;case"cdx":this.serialize=this.serializeCDX11;break;case"json":default:break}}async*iterRecords(e,t){this._lastRecord=null;for await(let s of e){await s.readFully();let a=this.indexRecord(s,e,t);a&&(yield a)}let r=this.indexRecord(null,e,t);r&&(yield r)}filterRecord(e){if(this.includeAll)return!0;let t=e.warcType;return!(t==="request"||t==="warcinfo"||(t==="metadata"||t==="resource")&&e.warcContentType==="application/warc-fields")}indexRecord(e,t,r){if(this.overrideIndexForAll)return e?super.indexRecord(e,t,r):null;let s=this._lastRecord;if(this._lastRecord=e,e&&(e._offset=t.offset,e._length=t.recordLength),!s)return null;if(!e||s.warcTargetURI!=e.warcTargetURI)return this.indexRecordPair(s,null,t,r);let a=e.warcType,i=s.warcType;return a==="request"&&(i==="response"||i==="revisit")?(this._lastRecord=null,this.indexRecordPair(s,e,t,r)):(a==="response"||a==="revisit")&&i==="request"?(this._lastRecord=null,this.indexRecordPair(e,s,t,r)):this.indexRecordPair(s,null,t,r)}indexRecordPair(e,t,r,s){let a,i,o=e.warcTargetURI||"";if(t?.httpHeaders&&t.httpHeaders.method!=="GET"){let l={url:o,method:t.httpHeaders.method,headers:t.httpHeaders.headers,postData:t.payload};a=l.method,j(l)&&(i=l.requestBody,e.method=a,e.requestBody=i,o=l.url)}e._urlkey=o;let d=super.indexRecord(e,r,s);return d&&(e._offset!==void 0&&(d.offset=e._offset,d.length=e._length),a&&(d.method=a),i&&(d.requestBody=i)),d}serializeCDXJ(e){let{urlkey:t,timestamp:r}=e;return delete e.urlkey,delete e.timestamp,`${t} ${r} ${JSON.stringify(e)} `}serializeCDX11(e){let t=[];for(let r of Re)t.push(e[r]!=null?e[r]:"-");return t.join(" ")+` -`}getField(e,t){let r=null;switch(e){case"urlkey":return r=t._urlkey||t.warcTargetURI||null,this.noSurt||r===null?r:N(r);case"timestamp":return r=t.warcDate??"",r.replace(/[-:T]/g,"").slice(0,14);case"url":return t.warcTargetURI;case"mime":switch(t.warcType){case"revisit":return"warc/revisit";case"response":case"request":e="http:content-type";break;default:e="content-type"}return r=super.getField(e,t),r?r.toString().split(";",1)[0]?.trim():null;case"status":return super.getField("http:status",t);case"digest":return r=t.warcPayloadDigest,r?r.split(":",2)[1]:null;default:return null}}};var G="2.3.0";var Te=1024*128;async function P(n=be,e){let t=Promise.resolve();return e=e||ke(process.argv),Ie().version(G).usage("$0 [command]").command({command:"index ",describe:"Index WARC(s)",builder:D,handler:async r=>{t=new b(r).writeAll(J(r.filenames),n)}}).command({command:"cdx-index ",describe:"CDX(J) Index of WARC(s)",builder:F,handler:async r=>{t=new k(r).writeAll(J(r.filenames),n)}}).demandCommand(1,"Please specify a command").strictCommands().help().parseAsync(e),t}function J(n){return n.reduce((e,t)=>{if(!_e(t).isFile())return Se.write(`Skipping ${t}, not a file +`}getField(e,t){let r=null;switch(e){case"urlkey":return r=t._urlkey||t.warcTargetURI||null,this.noSurt||r===null?r:$(r);case"timestamp":return r=t.warcDate??"",r.replace(/[-:T]/g,"").slice(0,14);case"url":return t.warcTargetURI;case"mime":switch(t.warcType){case"revisit":return"warc/revisit";case"response":case"request":e="http:content-type";break;default:e="content-type"}return r=super.getField(e,t),r?r.toString().split(";",1)[0]?.trim():null;case"status":return super.getField("http:status",t);case"digest":return r=t.warcPayloadDigest,r?r.split(":",2)[1]:null;default:return null}}};var G="2.3.0";var Te=1024*128;async function P(n=be,e){let t=Promise.resolve();return e=e||ke(process.argv),Ie().version(G).usage("$0 [command]").command({command:"index ",describe:"Index WARC(s)",builder:D,handler:async r=>{t=new b(r).writeAll(J(r.filenames),n)}}).command({command:"cdx-index ",describe:"CDX(J) Index of WARC(s)",builder:F,handler:async r=>{t=new k(r).writeAll(J(r.filenames),n)}}).demandCommand(1,"Please specify a command").strictCommands().help().parseAsync(e),t}function J(n){return n.reduce((e,t)=>{if(!_e(t).isFile())return Se.write(`Skipping ${t}, not a file `),e;let r=xe(t,{highWaterMark:Te});return t=Ce(t),e.push({filename:t,reader:r}),e},[])}P(); diff --git a/dist/node/index.js b/dist/node/index.js index 6f82abb..ac00a17 100644 --- a/dist/node/index.js +++ b/dist/node/index.js @@ -1,2 +1,2 @@ -import I from"node:fs";import{unlink as F}from"node:fs/promises";import{temporaryFile as O}from"tempy";function g(r){if(r instanceof Int8Array||r instanceof Uint8Array||r instanceof Uint8ClampedArray)return new DataView(r.buffer,r.byteOffset,r.byteLength);if(r instanceof ArrayBuffer)return new DataView(r);throw new TypeError("Expected `data` to be an ArrayBuffer, Buffer, Int8Array, Uint8Array or Uint8ClampedArray")}var B="ABCDEFGHIJKLMNOPQRSTUVWXYZ234567",z="0123456789ABCDEFGHIJKLMNOPQRSTUV",v="0123456789ABCDEFGHJKMNPQRSTVWXYZ";function w(r,n,e){e=e||{};let t,s;switch(n){case"RFC3548":case"RFC4648":t=B,s=!0;break;case"RFC4648-HEX":t=z,s=!0;break;case"Crockford":t=v,s=!1;break;default:throw new Error("Unknown base32 variant: "+n)}let i=e.padding!==void 0?e.padding:s,a=g(r),o=0,l=0,f="";for(let m=0;m=5;)f+=t[l>>>o-5&31],o-=5;if(o>0&&(f+=t[l<<5-o&31]),i)for(;f.length%8!==0;)f+="=";return f}import T from"pako";import{createSHA256 as C,createSHA1 as E}from"hash-wasm";import L from"pako";function x(r,n){if(r.length===1)return r[0];let e=new Uint8Array(n),t=0;for(let s of r)e.set(s,t),t+=s.byteLength;return e}var H=new TextDecoder("utf-8"),k=class extends L.Inflate{constructor(e,t){super(e);this.ended=!1;this.chunks=[];this.reader=t}onEnd(e){this.err=e,this.err||(this.reader._rawOffset+=this.strm.total_in)}},d=class r{static async readFully(n){let e=[],t=0;for await(let s of n)e.push(s),t+=s.byteLength;return x(e,t)}getReadableStream(){let n=this[Symbol.asyncIterator]();return new ReadableStream({async pull(e){return n.next().then(t=>{t.done||!t.value?e.close():e.enqueue(t.value)})}})}async readFully(){return await r.readFully(this)}async readline(n=0){let e=await this.readlineRaw(n);return e?H.decode(e):""}async*iterLines(n=0){let e=null;for(;e=await this.readline(n);)yield e}};var S=new Uint8Array([13,10]),_=new Uint8Array([13,10,13,10]),Y=new TextDecoder("utf-8");var U=new TextEncoder,b=class{},h=class extends b{constructor(){super(...arguments);this.buffers=[]}write(e){this.buffers.push(e)}async*readAll(){for(let e of this.buffers)yield e}},p=class r extends d{constructor(e,t={},s=new h){super();this.gzip=!1;this.digestAlgo="";this.digestAlgoPrefix="";this.digestBase32=!1;this.preferPako=!1;this._alreadyDigested=!1;this.blockHasher=null;this.payloadHasher=null;this.httpHeadersBuff=null;this.warcHeadersBuff=null;this.gzip=!!t.gzip,this.record=e;let i=t.digest||{};this.digestAlgo=i.algo||"sha-256",this.digestAlgoPrefix=i.prefix||"sha256:",this.digestBase32=!!i.base32,this.preferPako=!!t.preferPako,r.noComputeDigest(e)&&(this.digestAlgo=""),this.externalBuffer=s}static async serialize(e,t,s=new h){return await new r(e,t,s).readFully()}static noComputeDigest(e){return e.warcType==="revisit"||e.warcType==="warcinfo"||e.warcPayloadDigest&&e.warcBlockDigest}async*[Symbol.asyncIterator](){if(!this.gzip){yield*this.generateRecord();return}if("CompressionStream"in globalThis&&!this.preferPako){let e=new globalThis.CompressionStream("gzip");yield*this.streamCompress(e)}else yield*this.pakoCompress()}async readlineRaw(e){return null}async*pakoCompress(){let e=new T.Deflate({gzip:!0}),t=null;for await(let s of this.generateRecord())for(t&&t.length>0&&e.push(t),t=s;e.chunks.length;)yield e.chunks.shift();t&&e.push(t,!0),yield e.result}async*streamCompress(e){let t=this.generateRecord();new ReadableStream({async pull(o){let l=await t.next();l.done?o.close():o.enqueue(l.value)}}).pipeThrough(e);let i=null,a=e.readable.getReader();for(;(i=await a.read())&&!i.done;)yield i.value}async newHasher(){switch(this.digestAlgo){case"sha-256":return C();case"sha-1":return E();case"":return null;default:return C()}}getDigest(e){return this.digestAlgoPrefix+(this.digestBase32?w(e.digest("binary"),"RFC4648"):e.digest("hex"))}async digestRecord(){let e=this.record;if(this._alreadyDigested)return Number(e.warcHeaders.headers.get("Content-Length"));let t=await this.newHasher(),s=await this.newHasher(),i=0;e.httpHeaders&&(this.httpHeadersBuff=U.encode(e.httpHeaders.toString()+`\r +import I from"fs";import{unlink as F}from"fs/promises";import{temporaryFile as O}from"tempy";function g(r){if(r instanceof Int8Array||r instanceof Uint8Array||r instanceof Uint8ClampedArray)return new DataView(r.buffer,r.byteOffset,r.byteLength);if(r instanceof ArrayBuffer)return new DataView(r);throw new TypeError("Expected `data` to be an ArrayBuffer, Buffer, Int8Array, Uint8Array or Uint8ClampedArray")}var B="ABCDEFGHIJKLMNOPQRSTUVWXYZ234567",z="0123456789ABCDEFGHIJKLMNOPQRSTUV",v="0123456789ABCDEFGHJKMNPQRSTVWXYZ";function w(r,n,e){e=e||{};let t,s;switch(n){case"RFC3548":case"RFC4648":t=B,s=!0;break;case"RFC4648-HEX":t=z,s=!0;break;case"Crockford":t=v,s=!1;break;default:throw new Error("Unknown base32 variant: "+n)}let i=e.padding!==void 0?e.padding:s,a=g(r),o=0,l=0,f="";for(let m=0;m=5;)f+=t[l>>>o-5&31],o-=5;if(o>0&&(f+=t[l<<5-o&31]),i)for(;f.length%8!==0;)f+="=";return f}import T from"pako";import{createSHA256 as C,createSHA1 as E}from"hash-wasm";import L from"pako";function x(r,n){if(r.length===1)return r[0];let e=new Uint8Array(n),t=0;for(let s of r)e.set(s,t),t+=s.byteLength;return e}var H=new TextDecoder("utf-8"),k=class extends L.Inflate{constructor(e,t){super(e);this.ended=!1;this.chunks=[];this.reader=t}onEnd(e){this.err=e,this.err||(this.reader._rawOffset+=this.strm.total_in)}},d=class r{static async readFully(n){let e=[],t=0;for await(let s of n)e.push(s),t+=s.byteLength;return x(e,t)}getReadableStream(){let n=this[Symbol.asyncIterator]();return new ReadableStream({async pull(e){return n.next().then(t=>{t.done||!t.value?e.close():e.enqueue(t.value)})}})}async readFully(){return await r.readFully(this)}async readline(n=0){let e=await this.readlineRaw(n);return e?H.decode(e):""}async*iterLines(n=0){let e=null;for(;e=await this.readline(n);)yield e}};var S=new Uint8Array([13,10]),_=new Uint8Array([13,10,13,10]),Y=new TextDecoder("utf-8");var U=new TextEncoder,b=class{},h=class extends b{constructor(){super(...arguments);this.buffers=[]}write(e){this.buffers.push(e)}async*readAll(){for(let e of this.buffers)yield e}},p=class r extends d{constructor(e,t={},s=new h){super();this.gzip=!1;this.digestAlgo="";this.digestAlgoPrefix="";this.digestBase32=!1;this.preferPako=!1;this._alreadyDigested=!1;this.blockHasher=null;this.payloadHasher=null;this.httpHeadersBuff=null;this.warcHeadersBuff=null;this.gzip=!!t.gzip,this.record=e;let i=t.digest||{};this.digestAlgo=i.algo||"sha-256",this.digestAlgoPrefix=i.prefix||"sha256:",this.digestBase32=!!i.base32,this.preferPako=!!t.preferPako,r.noComputeDigest(e)&&(this.digestAlgo=""),this.externalBuffer=s}static async serialize(e,t,s=new h){return await new r(e,t,s).readFully()}static noComputeDigest(e){return e.warcType==="revisit"||e.warcType==="warcinfo"||e.warcPayloadDigest&&e.warcBlockDigest}async*[Symbol.asyncIterator](){if(!this.gzip){yield*this.generateRecord();return}if("CompressionStream"in globalThis&&!this.preferPako){let e=new globalThis.CompressionStream("gzip");yield*this.streamCompress(e)}else yield*this.pakoCompress()}async readlineRaw(e){return null}async*pakoCompress(){let e=new T.Deflate({gzip:!0}),t=null;for await(let s of this.generateRecord())for(t&&t.length>0&&e.push(t),t=s;e.chunks.length;)yield e.chunks.shift();t&&e.push(t,!0),yield e.result}async*streamCompress(e){let t=this.generateRecord();new ReadableStream({async pull(o){let l=await t.next();l.done?o.close():o.enqueue(l.value)}}).pipeThrough(e);let i=null,a=e.readable.getReader();for(;(i=await a.read())&&!i.done;)yield i.value}async newHasher(){switch(this.digestAlgo){case"sha-256":return C();case"sha-1":return E();case"":return null;default:return C()}}getDigest(e){return this.digestAlgoPrefix+(this.digestBase32?w(e.digest("binary"),"RFC4648"):e.digest("hex"))}async digestRecord(){let e=this.record;if(this._alreadyDigested)return Number(e.warcHeaders.headers.get("Content-Length"));let t=await this.newHasher(),s=await this.newHasher(),i=0;e.httpHeaders&&(this.httpHeadersBuff=U.encode(e.httpHeaders.toString()+`\r `),i+=this.httpHeadersBuff.length,t?.update(this.httpHeadersBuff));for await(let a of e.reader)t?.update(a),s?.update(a),this.externalBuffer.write(a),i+=a.length;return s&&e.warcHeaders.headers.set("WARC-Payload-Digest",this.getDigest(s)),t&&e.warcHeaders.headers.set("WARC-Block-Digest",this.getDigest(t)),e.warcHeaders.headers.set("Content-Length",i.toString()),this.warcHeadersBuff=U.encode(e.warcHeaders.toString()),this._alreadyDigested=!0,i}async*generateRecord(){if(await this.digestRecord(),this.warcHeadersBuff&&(yield this.warcHeadersBuff),yield S,this.httpHeadersBuff&&(yield this.httpHeadersBuff),this.externalBuffer)for await(let e of this.externalBuffer.readAll())yield e;yield _}};var R=1024*1024*2,A=class r extends p{static async serialize(n,e){return await new r(n,e).readFully()}constructor(n,e={}){super(n,e,new y(e.maxMemSize||R))}},y=class extends h{constructor(e=R){super();this.currSize=0;this.fh=null;this.filename="";this.memSize=e}write(e){this.currSize+e.length<=this.memSize?this.buffers.push(e):(this.fh||(this.filename=O(),this.fh=I.createWriteStream(this.filename)),this.fh.write(e)),this.currSize+=e.length}async*readAll(){for(let t of this.buffers)yield t;if(!this.fh)return;await D(this.fh),this.fh=null;let e=I.createReadStream(this.filename);for await(let t of e)yield t;await F(this.filename)}};async function D(r){let n=new Promise(e=>{r.once("finish",()=>e())});return r.end(),n}export{R as DEFAULT_MEM_SIZE,y as TempFileBuffer,A as WARCSerializer}; diff --git a/package.json b/package.json index 1784c88..9f2f36f 100644 --- a/package.json +++ b/package.json @@ -63,8 +63,6 @@ "@types/eslint": "^7.29.0", "@types/jest": "^29.2.3", "@types/node": "^18.11.9", - "@types/pako": "^1.0.0", - "@types/stream-buffers": "^3.0.4", "@types/yargs": "^17.0.17", "@typescript-eslint/eslint-plugin": "^5.43.0", "@typescript-eslint/parser": "^5.43.0", @@ -73,6 +71,7 @@ "eslint-plugin-deprecation": "^1.5.0", "jest": "^29.6.2", "jest-expect-message": "^1.1.3", + "prettier": "^3.3.3", "stream-buffers": "^3.0.2", "ts-jest": "^29.1.1", "tsup": "^7.2.0", diff --git a/yarn.lock b/yarn.lock index b4436e1..d1b0d32 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2997,6 +2997,11 @@ prelude-ls@^1.2.1: resolved "https://registry.yarnpkg.com/prelude-ls/-/prelude-ls-1.2.1.tgz#debc6489d7a6e6b0e7611888cec880337d316396" integrity sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g== +prettier@^3.3.3: + version "3.3.3" + resolved "https://registry.yarnpkg.com/prettier/-/prettier-3.3.3.tgz#30c54fe0be0d8d12e6ae61dbb10109ea00d53105" + integrity sha512-i2tDNA0O5IrMO757lfrdQZCc2jPNDVntV0m/+4whiDfWaTKfMNgR7Qz0NAeGz/nRqF4m5/6CLzbP4/liHt12Ew== + pretty-format@^29.0.0, pretty-format@^29.7.0: version "29.7.0" resolved "https://registry.yarnpkg.com/pretty-format/-/pretty-format-29.7.0.tgz#ca42c758310f365bfa71a0bda0a807160b776812"