Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix SURT edge case encoded query arg #73

Merged
merged 2 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/lib/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,19 @@ export function getSurt(url: string) {
surt += urlObj.search;
for (const [key, value] of urlObj.searchParams.entries()) {
if (!value) {
// if no value set, by default the surt contains 'key='
// however, for compatibility, only want to add a trailing '='
// if original URL has it.
const keyEncoded = encodeURIComponent(key);
const rx = new RegExp(`(?<=[&?])${rxEscape(key)}=(?=&|$)`);
// if original URL does *not* have trailing '=', attempt to remove it below
if (!rx.exec(urlLower)) {
surt = surt.replace(rx, key);
// use URI encoded version to match the query arg if key is %-encoded
const rxEncoded =
key === keyEncoded
? rx
: new RegExp(`(?<=[&?])${rxEscape(keyEncoded)}=(?=&|$)`);
surt = surt.replace(rxEncoded, keyEncoded);
}
}
}
Expand Down
11 changes: 11 additions & 0 deletions test/testUtils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,15 @@ describe("utils", () => {
"com,example)/some/path?*&a=b&c=d&z",
);
});

test("surt with %-encoded query, trailing = param", () => {
expect(getSurt("https://www.example.com/some/path?a=b&c=d&e^=&z")).toBe(
"com,example)/some/path?a=b&c=d&e%5E=&z",
);
});
test("surt with %-encoded query, no trailing = param", () => {
expect(getSurt("https://www.example.com/some/path?a=b&c=d&e^&z")).toBe(
"com,example)/some/path?a=b&c=d&e%5E&z",
);
});
});
42 changes: 21 additions & 21 deletions test/testWARCParser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body",
])
)
]),
),
);
expect(result?.toString()).toBe(`\
HTTP/1.0 200 OK\r
Expand All @@ -76,8 +76,8 @@ Content-Type: Value\r\n\
Content-Length: 0\r\n\
Bad: multi\nline\r\n\
\r\n",
])
)
]),
),
);
expect(result?.toString()).toBe(`HTTP/1.0 204 Empty\r
Content-Type: Value\r
Expand All @@ -89,15 +89,15 @@ Bad: multi\r
test("StatusAndHeaders test 3", async () => {
const parser = new StatusAndHeadersParser();
const result = await parser.parse(
new AsyncIterReader(getReader(["HTTP/1.0 204 None\r\n\r\n"]))
new AsyncIterReader(getReader(["HTTP/1.0 204 None\r\n\r\n"])),
);
expect(result?.toString()).toBe("HTTP/1.0 204 None\r\n");
});

test("StatusAndHeaders test empty", async () => {
const parser = new StatusAndHeadersParser();
const result = await parser.parse(
new AsyncIterReader(getReader(["\r\n\r\n"]))
new AsyncIterReader(getReader(["\r\n\r\n"])),
);
expect(result).toBe(null);
});
Expand Down Expand Up @@ -178,7 +178,7 @@ text\r\n\
software: recorder test\r\n\
format: WARC File Format 1.0\r\n\
json-metadata: {"foo": "bar"}\r\n\
'
',
);

// eslint-disable-next-line @typescript-eslint/no-non-null-assertion -- checked in expect
Expand Down Expand Up @@ -252,15 +252,15 @@ Content-Length: 0\r\n\

expect(record.warcHeaders.protocol).toBe("WARC/1.0");
expect(record.warcHeader("WARC-Record-ID")).toBe(
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>"
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>",
);
expect(record.warcType).toBe("revisit");
expect(record.warcTargetURI).toBe("http://example.com/");
expect(record.warcDate).toBe("2000-01-01T00:00:00Z");
expect(record.warcRefersToTargetURI).toBe("http://example.com/foo");
expect(record.warcRefersToDate).toBe("1999-01-01T00:00:00Z");
expect(record.warcPayloadDigest).toBe(
"sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O"
"sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O",
);
expect(record.warcContentType).toBe("application/http; msgtype=response");
expect(record.warcContentLength).toBe(0);
Expand Down Expand Up @@ -307,15 +307,15 @@ Foo: Bar\r\n\
expect(record).not.toBeNull();
expect(record.warcHeaders.protocol).toBe("WARC/1.0");
expect(record.warcHeader("WARC-Record-ID")).toBe(
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>"
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>",
);
expect(record.warcType).toBe("revisit");
expect(record.warcTargetURI).toBe("http://example.com/");
expect(record.warcDate).toBe("2000-01-01T00:00:00Z");
expect(record.warcRefersToTargetURI).toBe("http://example.com/foo");
expect(record.warcRefersToDate).toBe("1999-01-01T00:00:00Z");
expect(record.warcPayloadDigest).toBe(
"sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O"
"sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O",
);
expect(record.warcContentType).toBe("application/http; msgtype=response");
expect(record.warcContentLength).toBe(54);
Expand Down Expand Up @@ -374,7 +374,7 @@ Foo: Bar\r\n\

expect(record.warcHeaders.protocol).toBe("WARC/1.0");
expect(record.warcHeader("WARC-Record-ID")).toBe(
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>"
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>",
);
expect(record.warcType).toBe("revisit");
expect(record.warcContentLength).toBe(82);
Expand Down Expand Up @@ -542,7 +542,7 @@ test("warc1.1 serialize records match", async () => {

test("chunked warc read", async () => {
const input = fs.createReadStream(
get_warc_path("data/example-iana.org-chunked.warc")
get_warc_path("data/example-iana.org-chunked.warc"),
);

const parser = new WARCParser(input);
Expand All @@ -558,7 +558,7 @@ test("chunked warc read", async () => {

// can't read raw data anymore
await expect(async () => await record.readFully(false)).rejects.toThrow(
"WARC Record decoding already started, but requesting raw payload"
"WARC Record decoding already started, but requesting raw payload",
);

const text = await record.contentText();
Expand All @@ -568,15 +568,15 @@ test("chunked warc read", async () => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- checking invalid type
const busyRecord = record as any as { reader: LimitReader };
await expect(async () => await busyRecord.reader.readFully()).rejects.toThrow(
"WARC Record decoding already started, but requesting raw payload"
"WARC Record decoding already started, but requesting raw payload",
);

expect(await record.readFully(true)).not.toBeNull();
});

test("no await catch errors", async () => {
const input = fs.createReadStream(
get_warc_path("data/example-iana.org-chunked.warc")
get_warc_path("data/example-iana.org-chunked.warc"),
);

const parser = new WARCParser(input);
Expand All @@ -597,10 +597,10 @@ test("no await catch errors", async () => {
const record1 = (await parser.parse())!;
expect(record1).not.toBeNull();
await expect(async () => await iter.next()).rejects.toThrow(
"Record already consumed.. Perhaps a promise was not awaited?"
"Record already consumed.. Perhaps a promise was not awaited?",
);
await expect(async () => await record0.readline()).rejects.toThrow(
"Record already consumed.. Perhaps a promise was not awaited?"
"Record already consumed.. Perhaps a promise was not awaited?",
);

let count = 0;
Expand Down Expand Up @@ -685,15 +685,15 @@ text\r\n\
["custom-header", "somevalue"],
["set-cookie", "greeting=hello"],
["set-cookie", "name=world"],
])
]),
);
} else {
expect(JSON.stringify(headerEntries)).toBe(
JSON.stringify([
["content-type", 'text/plain; charset="UTF-8"'],
["custom-header", "somevalue"],
["set-cookie", "greeting=hello, name=world"],
])
]),
);
}

Expand All @@ -718,7 +718,7 @@ text\r\n\
["content-type", 'text/plain; charset="UTF-8"'],
["custom-header", "somevalue"],
["unicode-header", "%F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F"],
])
]),
);

expect(decoder.decode(await record2.readFully())).toBe("more\ntext");
Expand Down
Loading