Commit

readd
ikreymer committed Aug 22, 2024
1 parent 3a62a88 commit 12f7b26
Showing 18 changed files with 650 additions and 11 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -4,4 +4,3 @@ yarn-error.log
 coverage/
 .DS_Store
 **/*.cts
-dist
11 changes: 11 additions & 0 deletions dist/cli.cjs

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions dist/cli.d.ts
@@ -0,0 +1 @@
#!/usr/bin/env node
11 changes: 11 additions & 0 deletions dist/cli.js

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions dist/index.all.js

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions dist/index.all.js.map

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions dist/index.cjs

Large diffs are not rendered by default.

302 changes: 302 additions & 0 deletions dist/index.d.ts
@@ -0,0 +1,302 @@
import pako from 'pako';
import { IHasher } from 'hash-wasm/dist/lib/WASMInterface.js';
import { WritableStreamBuffer } from 'stream-buffers';

type SourceReader = {
read: Function;
};
type SourceReadable = {
getReader: (...args: any) => {
read: Function;
};
};
type Source = SourceReader | SourceReadable | AsyncIterable<Uint8Array> | Iterable<Uint8Array>;
type StreamResult = {
filename: string;
reader: AsyncIterable<Uint8Array>;
};
type StreamResults = StreamResult[];
type IndexerOffsetLength = {
offset: number;
recordLength: number;
};
type Request = {
method: string;
url: string;
headers: Map<string, string> | Headers;
postData?: Uint8Array | string | undefined | null;
requestBody?: any;
};

declare class NoConcatInflator<T extends BaseAsyncIterReader> extends pako.Inflate {
reader: T;
ended: boolean;
chunks: Uint8Array[];
constructor(options: pako.InflateOptions, reader: T);
onEnd(status: pako.ReturnCodes): void;
}
declare abstract class BaseAsyncIterReader {
static readFully(iter: AsyncIterable<Uint8Array> | Iterable<Uint8Array>): Promise<Uint8Array>;
abstract [Symbol.asyncIterator](): AsyncIterator<Uint8Array>;
getReadableStream(): ReadableStream<any>;
readFully(): Promise<Uint8Array>;
abstract readlineRaw(maxLength?: number): Promise<Uint8Array | null>;
readline(maxLength?: number): Promise<string>;
iterLines(maxLength?: number): AsyncGenerator<string, void, unknown>;
}
type AsyncIterReaderOpts = {
raw: boolean;
};
declare class AsyncIterReader extends BaseAsyncIterReader {
compressed: string | null;
opts: AsyncIterReaderOpts;
inflator: NoConcatInflator<this> | null;
_sourceIter: AsyncIterator<Uint8Array | null>;
lastValue: Uint8Array | null;
errored: boolean;
_savedChunk: Uint8Array | null;
_rawOffset: number;
_readOffset: number;
numChunks: number;
constructor(streamOrIter: Source, compressed?: string | null, dechunk?: boolean);
_loadNext(): Promise<Uint8Array | null>;
dechunk(source: AsyncIterable<Uint8Array>): AsyncIterator<Uint8Array | null>;
unread(chunk: Uint8Array): void;
_next(): Promise<Uint8Array | null>;
_push(value: Uint8Array): void;
_getNextChunk(original?: Uint8Array): Uint8Array | null | undefined;
[Symbol.asyncIterator](): AsyncGenerator<Uint8Array, void, unknown>;
readlineRaw(maxLength?: number): Promise<Uint8Array | null>;
readFully(): Promise<Uint8Array>;
readSize(sizeLimit: number): Promise<Uint8Array>;
skipSize(sizeLimit: number): Promise<number>;
_readOrSkip(sizeLimit?: number, skip?: boolean): Promise<readonly [number, Uint8Array]>;
getReadOffset(): number;
getRawOffset(): number;
getRawLength(prevOffset: number): number;
static fromReadable<Readable extends SourceReader>(source: Readable): {
[Symbol.asyncIterator](): AsyncGenerator<Uint8Array, void, unknown>;
};
static fromIter(source: Iterable<Uint8Array>): {
[Symbol.asyncIterator](): AsyncGenerator<Uint8Array, void, unknown>;
};
}
declare class LimitReader extends BaseAsyncIterReader {
sourceIter: AsyncIterReader;
length: number;
limit: number;
skip: number;
constructor(streamIter: AsyncIterReader, limit: number, skip?: number);
setLimitSkip(limit: number, skip?: number): void;
[Symbol.asyncIterator](): AsyncGenerator<Uint8Array, void, unknown>;
readlineRaw(maxLength?: number): Promise<Uint8Array | null>;
skipFully(): Promise<number>;
}
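
// Illustrative usage sketch, not part of the generated declarations: wrapping an
// async byte source with AsyncIterReader (declared above) and reading it line by
// line via iterLines(). Assumes this build is consumed as the "warcio" package and
// that "gzip" is an accepted value for the optional `compressed` argument.
import { AsyncIterReader } from "warcio";

async function printDecompressedLines(source: AsyncIterable<Uint8Array>): Promise<void> {
  const reader = new AsyncIterReader(source, "gzip");
  for await (const line of reader.iterLines()) {
    console.log(line);
  }
}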

declare class StatusAndHeaders {
statusline: string;
headers: Map<string, string> | Headers;
constructor({ statusline, headers, }: {
statusline: string;
headers: Map<string, string> | Headers;
});
toString(): string;
iterSerialize(encoder: TextEncoder): AsyncGenerator<Uint8Array, void, unknown>;
_protocol: string | undefined;
_statusCode: number | string | undefined;
_statusText: string | undefined;
_parseResponseStatusLine(): void;
get statusCode(): string | number | undefined;
get protocol(): string | undefined;
get statusText(): string | undefined;
_method: string | undefined;
_requestPath: string | undefined;
_parseRequestStatusLine(): void;
get method(): string | undefined;
get requestPath(): string | undefined;
}
declare class StatusAndHeadersParser {
parse(reader: AsyncIterReader, { headersClass, firstLine, }?: {
firstLine?: string;
headersClass: typeof Map | typeof Headers;
}): Promise<StatusAndHeaders | null>;
}
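
// Illustrative usage sketch, not part of the generated declarations: parsing an HTTP
// status line and headers with StatusAndHeadersParser (declared above). The raw bytes
// below are hypothetical; the "warcio" import path is an assumption about packaging.
import { AsyncIterReader, StatusAndHeadersParser } from "warcio";

async function parseStatusAndHeaders(): Promise<void> {
  const raw = new TextEncoder().encode("HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n");
  const reader = new AsyncIterReader([raw]);
  const parsed = await new StatusAndHeadersParser().parse(reader, { headersClass: Map });
  console.log(parsed?.statusCode, parsed?.statusText);
}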

declare const WARC_1_1 = "WARC/1.1";
declare const WARC_1_0 = "WARC/1.0";
type WARCType = "warcinfo" | "response" | "resource" | "request" | "metadata" | "revisit" | "conversion" | "continuation";
type WARCRecordOpts = {
url?: string;
date?: string;
type?: WARCType;
warcHeaders?: Record<string, string>;
filename?: string;
httpHeaders?: HeadersInit;
statusline?: string;
warcVersion?: typeof WARC_1_0 | typeof WARC_1_1;
keepHeadersCase?: boolean;
refersToUrl?: string;
refersToDate?: string;
};
declare class WARCRecord extends BaseAsyncIterReader {
static create({ url, date, type, warcHeaders, filename, httpHeaders, statusline, warcVersion, keepHeadersCase, refersToUrl, refersToDate, }?: WARCRecordOpts, reader?: AsyncIterable<Uint8Array> | Iterable<Uint8Array>): WARCRecord;
static createWARCInfo(opts: WARCRecordOpts | undefined, info: Record<string, string>): WARCRecord;
warcHeaders: StatusAndHeaders;
_reader: AsyncIterable<Uint8Array> | Iterable<Uint8Array>;
_contentReader: BaseAsyncIterReader | null;
payload: Uint8Array | null;
httpHeaders: StatusAndHeaders | null;
consumed: "content" | "raw" | "skipped" | "";
_offset: number | undefined;
_length: number;
method: string | undefined;
requestBody: string;
_urlkey: string;
constructor({ warcHeaders, reader, }: {
warcHeaders: StatusAndHeaders;
reader: AsyncIterable<Uint8Array> | Iterable<Uint8Array>;
});
getResponseInfo(): {
headers: Map<string, string> | Headers;
status: string | number | undefined;
statusText: string | undefined;
} | null;
fixUp(): void;
readFully(isContent?: boolean): Promise<Uint8Array>;
get reader(): AsyncIterable<Uint8Array> | Iterable<Uint8Array>;
get contentReader(): AsyncIterable<Uint8Array> | Iterable<Uint8Array>;
_createDecodingReader(source: Source): AsyncIterReader;
readlineRaw(maxLength?: number): Promise<Uint8Array | null>;
contentText(): Promise<string>;
[Symbol.asyncIterator](): AsyncGenerator<Uint8Array, void, unknown>;
skipFully(): Promise<number | undefined>;
warcHeader(name: string): string | null | undefined;
get warcType(): string | null | undefined;
get warcTargetURI(): string | null | undefined;
get warcDate(): string | null | undefined;
get warcRefersToTargetURI(): string | null | undefined;
get warcRefersToDate(): string | null | undefined;
get warcPayloadDigest(): string | null | undefined;
get warcBlockDigest(): string | null | undefined;
get warcContentType(): string | null | undefined;
get warcContentLength(): number;
}
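
// Illustrative usage sketch, not part of the generated declarations: building a
// "response" record with the static WARCRecord.create() declared above. The URL,
// date, and payload are hypothetical; "warcio" as the import path is an assumption.
import { WARCRecord } from "warcio";

function buildResponseRecord(): WARCRecord {
  const payload = [new TextEncoder().encode("<html>hello</html>")];
  return WARCRecord.create(
    {
      url: "https://example.com/",
      date: "2024-08-22T00:00:00Z",
      type: "response",
      warcVersion: "WARC/1.1",
      statusline: "HTTP/1.1 200 OK",
      httpHeaders: { "Content-Type": "text/html" },
    },
    payload,
  );
}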

type WARCParserOpts = {
keepHeadersCase?: boolean;
parseHttp?: boolean;
};
declare class WARCParser implements IndexerOffsetLength {
static parse(source: Source, options?: WARCParserOpts): Promise<WARCRecord | null>;
static iterRecords(source: Source, options?: WARCParserOpts): AsyncGenerator<WARCRecord, void, unknown>;
_offset: number;
_warcHeadersLength: number;
_headersClass: typeof Map | typeof Headers;
_parseHttp: boolean;
_reader: AsyncIterReader;
_record: WARCRecord | null;
constructor(source: Source, { keepHeadersCase, parseHttp }?: WARCParserOpts);
readToNextRecord(): Promise<Uint8Array | null>;
_initRecordReader(warcHeaders: StatusAndHeaders): LimitReader;
parse(): Promise<WARCRecord | null>;
get offset(): number;
get recordLength(): number;
[Symbol.asyncIterator](): AsyncGenerator<WARCRecord, void, unknown>;
_addHttpHeaders(record: WARCRecord, headersParser: StatusAndHeadersParser): Promise<void>;
}
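
// Illustrative usage sketch, not part of the generated declarations: iterating over
// the records of a WARC file with the static WARCParser.iterRecords() declared above.
// A Node.js Readable satisfies the Source type; "warcio" as the import path is an
// assumption about how this build is published.
import { WARCParser } from "warcio";
import { createReadStream } from "node:fs";

async function listRecords(path: string): Promise<void> {
  const stream = createReadStream(path);
  for await (const record of WARCParser.iterRecords(stream)) {
    console.log(record.warcType, record.warcTargetURI, record.warcDate);
  }
}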

type WARCSerializerOpts = {
gzip?: boolean;
digest?: {
algo?: AlgorithmIdentifier;
prefix?: string;
base32?: boolean;
};
preferPako?: boolean;
};
declare abstract class BaseSerializerBuffer {
abstract write(chunk: Uint8Array): void;
abstract readAll(): AsyncIterable<Uint8Array>;
}
declare class WARCSerializer extends BaseAsyncIterReader {
gzip: boolean;
digestAlgo: AlgorithmIdentifier;
digestAlgoPrefix: string;
digestBase32: boolean;
preferPako: boolean;
record: WARCRecord;
externalBuffer: BaseSerializerBuffer | undefined;
_alreadyDigested: boolean;
blockHasher: IHasher | null;
payloadHasher: IHasher | null;
httpHeadersBuff: Uint8Array | null;
warcHeadersBuff: Uint8Array | null;
static serialize(record: WARCRecord, opts?: WARCSerializerOpts, externalBuffer?: BaseSerializerBuffer): Promise<Uint8Array>;
constructor(record: WARCRecord, opts?: WARCSerializerOpts, externalBuffer?: BaseSerializerBuffer);
static noComputeDigest(record: WARCRecord): string | true | null | undefined;
[Symbol.asyncIterator](): AsyncGenerator<any, void, unknown>;
readlineRaw(maxLength?: number): Promise<Uint8Array | null>;
pakoCompress(): AsyncGenerator<any, void, unknown>;
streamCompress(cs: CompressionStream): AsyncGenerator<Uint8Array, void, unknown>;
newHasher(): Promise<IHasher | null>;
getDigest(hasher: IHasher): string;
digestRecord(): Promise<number>;
generateRecord(): AsyncGenerator<Uint8Array, void, unknown>;
}
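
// Illustrative usage sketch, not part of the generated declarations: serializing a
// record with the static WARCSerializer.serialize() declared above. The digest options
// mirror WARCSerializerOpts; treating "sha-256" as the AlgorithmIdentifier and "warcio"
// as the import path are assumptions.
import { WARCRecord, WARCSerializer } from "warcio";

async function serializeGzipped(record: WARCRecord): Promise<Uint8Array> {
  return await WARCSerializer.serialize(record, {
    gzip: true,
    digest: { algo: "sha-256", prefix: "sha-256:", base32: false },
  });
}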

type IndexCommandArgs = any;
type CdxIndexCommandArgs = any;

declare abstract class BaseIndexer {
opts: Partial<IndexCommandArgs>;
fields: string[];
parseHttp: boolean;
constructor(opts?: Partial<IndexCommandArgs>);
serialize(result: Record<string, any>): string;
write(result: Record<string, any>, out: WritableStreamBuffer | NodeJS.WriteStream): void;
writeAll(files: StreamResults, out: WritableStreamBuffer | NodeJS.WriteStream): Promise<void>;
iterIndex(files: StreamResults): AsyncGenerator<Record<string, any>, void, unknown>;
iterRecords(parser: WARCParser, filename: string): AsyncGenerator<Record<string, any>, void, unknown>;
filterRecord?(record: WARCRecord): boolean;
indexRecord(record: WARCRecord, indexerOffset: IndexerOffsetLength, filename: string): Record<string, any> | null;
setField(field: string, record: WARCRecord, result: Record<string, any>): void;
getField(field: string, record: WARCRecord): string | number | null | undefined;
}
declare class Indexer extends BaseIndexer {
constructor(opts?: Partial<IndexCommandArgs>);
}
interface CDXAndRecord {
cdx: Record<string, any>;
record: WARCRecord;
reqRecord: WARCRecord | null;
}
declare class CDXIndexer extends Indexer {
includeAll: boolean;
overrideIndexForAll: boolean;
noSurt: boolean;
_lastRecord: WARCRecord | null;
constructor(opts?: Partial<CdxIndexCommandArgs>);
iterRecords(parser: WARCParser, filename: string): AsyncGenerator<Record<string, any>, void, unknown>;
filterRecord(record: WARCRecord): boolean;
indexRecord(record: WARCRecord | null, indexOffset: IndexerOffsetLength, filename: string): Record<string, any> | null;
indexRecordPair(record: WARCRecord, reqRecord: WARCRecord | null, indexOffset: IndexerOffsetLength, filename: string): Record<string, any> | null;
serializeCDXJ(result: Record<string, any>): string;
serializeCDX11(result: Record<string, any>): string;
getField(field: string, record: WARCRecord): string | number | null | undefined;
}
declare class CDXAndRecordIndexer extends CDXIndexer {
constructor(opts?: Partial<CdxIndexCommandArgs>);
indexRecordPair(record: WARCRecord, reqRecord: WARCRecord | null, indexOffset: IndexerOffsetLength, filename: string): CDXAndRecord | null;
}
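
// Illustrative usage sketch, not part of the generated declarations: writing CDXJ
// index lines for a WARC file with the CDXIndexer declared above. writeAll() takes
// StreamResults ({ filename, reader }) plus a writable; process.stdout satisfies the
// NodeJS.WriteStream parameter. "warcio" as the import path is an assumption.
import { CDXIndexer } from "warcio";
import { createReadStream } from "node:fs";

async function indexWarc(path: string): Promise<void> {
  const indexer = new CDXIndexer();
  const files = [{ filename: path, reader: createReadStream(path) }];
  await indexer.writeAll(files, process.stdout);
}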

declare function getSurt(url: string): string;
declare function postToGetUrl(request: Request): boolean;
declare function appendRequestQuery(url: string, query: string, method: string): string;
declare function jsonToQueryParams(json: unknown, ignoreInvalid?: boolean): URLSearchParams;
declare function mfdToQueryParams(mfd: string | Uint8Array | undefined | null, contentType: string): URLSearchParams;
declare function jsonToQueryString(json?: string | Record<string, unknown> | undefined | null, ignoreInvalid?: boolean): string;
declare function mfdToQueryString(mfd: string | Uint8Array | undefined | null, contentType: string): string;
declare function concatChunks(chunks: Uint8Array[], size: number): Uint8Array;
declare function splitChunk(chunk: Uint8Array, inx: number): [Uint8Array, Uint8Array];
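
// Illustrative usage sketch, not part of the generated declarations: rewriting a POST
// request into a GET-style URL with postToGetUrl(), whose Request shape is declared
// near the top of this file. The request values are hypothetical, and the assumption
// here is that on success the function updates request.url in place.
import { postToGetUrl } from "warcio";

function describePost(): void {
  const request = {
    method: "POST",
    url: "https://example.com/api/search",
    headers: new Map([["content-type", "application/x-www-form-urlencoded"]]),
    postData: "q=warc&page=1",
  };
  if (postToGetUrl(request)) {
    console.log(request.url);
  }
}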

export { AsyncIterReader, type AsyncIterReaderOpts, BaseAsyncIterReader, BaseSerializerBuffer, CDXAndRecordIndexer, CDXIndexer, Indexer, type IndexerOffsetLength, LimitReader, NoConcatInflator, type Request, type Source, type SourceReadable, type SourceReader, StatusAndHeaders, StatusAndHeadersParser, type StreamResult, type StreamResults, WARCParser, type WARCParserOpts, WARCRecord, type WARCRecordOpts, WARCSerializer, type WARCSerializerOpts, type WARCType, WARC_1_0, WARC_1_1, appendRequestQuery, concatChunks, getSurt, jsonToQueryParams, jsonToQueryString, mfdToQueryParams, mfdToQueryString, postToGetUrl, splitChunk };