Skip to content

Commit

Permalink
Fix SURT edge case encoded query arg (#73)
Browse files Browse the repository at this point in the history
ensure getSurt() preserves trailing '=' presence / absence if no value:
- '&x^' kept as '&x%5E' while '&x^=' kept as '&x%5E=' 
- fixes #70
- update formatting in testWARCParser.test.ts
  • Loading branch information
ikreymer authored Aug 21, 2024
1 parent f0ab495 commit 53997e6
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 22 deletions.
12 changes: 11 additions & 1 deletion src/lib/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,19 @@ export function getSurt(url: string) {
surt += urlObj.search;
for (const [key, value] of urlObj.searchParams.entries()) {
if (!value) {
// if no value set, by default the surt contains 'key='
// however, for compatibility, only want to add a trailing '='
// if original URL has it.
const keyEncoded = encodeURIComponent(key);
const rx = new RegExp(`(?<=[&?])${rxEscape(key)}=(?=&|$)`);
// if original URL does *not* have trailing '=', attempt to remove it below
if (!rx.exec(urlLower)) {
surt = surt.replace(rx, key);
// use URI encoded version to match the query arg if key is %-encoded
const rxEncoded =
key === keyEncoded
? rx
: new RegExp(`(?<=[&?])${rxEscape(keyEncoded)}=(?=&|$)`);
surt = surt.replace(rxEncoded, keyEncoded);
}
}
}
Expand Down
11 changes: 11 additions & 0 deletions test/testUtils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,15 @@ describe("utils", () => {
"com,example)/some/path?*&a=b&c=d&z",
);
});

test("surt with %-encoded query, trailing = param", () => {
expect(getSurt("https://www.example.com/some/path?a=b&c=d&e^=&z")).toBe(
"com,example)/some/path?a=b&c=d&e%5E=&z",
);
});
test("surt with %-encoded query, no trailing = param", () => {
expect(getSurt("https://www.example.com/some/path?a=b&c=d&e^&z")).toBe(
"com,example)/some/path?a=b&c=d&e%5E&z",
);
});
});
42 changes: 21 additions & 21 deletions test/testWARCParser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ Multi-Line: Value1\r\n\
Also This\r\n\
\r\n\
Body",
])
)
]),
),
);
expect(result?.toString()).toBe(`\
HTTP/1.0 200 OK\r
Expand All @@ -76,8 +76,8 @@ Content-Type: Value\r\n\
Content-Length: 0\r\n\
Bad: multi\nline\r\n\
\r\n",
])
)
]),
),
);
expect(result?.toString()).toBe(`HTTP/1.0 204 Empty\r
Content-Type: Value\r
Expand All @@ -89,15 +89,15 @@ Bad: multi\r
test("StatusAndHeaders test 3", async () => {
const parser = new StatusAndHeadersParser();
const result = await parser.parse(
new AsyncIterReader(getReader(["HTTP/1.0 204 None\r\n\r\n"]))
new AsyncIterReader(getReader(["HTTP/1.0 204 None\r\n\r\n"])),
);
expect(result?.toString()).toBe("HTTP/1.0 204 None\r\n");
});

test("StatusAndHeaders test empty", async () => {
const parser = new StatusAndHeadersParser();
const result = await parser.parse(
new AsyncIterReader(getReader(["\r\n\r\n"]))
new AsyncIterReader(getReader(["\r\n\r\n"])),
);
expect(result).toBe(null);
});
Expand Down Expand Up @@ -178,7 +178,7 @@ text\r\n\
software: recorder test\r\n\
format: WARC File Format 1.0\r\n\
json-metadata: {"foo": "bar"}\r\n\
'
',
);

// eslint-disable-next-line @typescript-eslint/no-non-null-assertion -- checked in expect
Expand Down Expand Up @@ -252,15 +252,15 @@ Content-Length: 0\r\n\

expect(record.warcHeaders.protocol).toBe("WARC/1.0");
expect(record.warcHeader("WARC-Record-ID")).toBe(
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>"
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>",
);
expect(record.warcType).toBe("revisit");
expect(record.warcTargetURI).toBe("http://example.com/");
expect(record.warcDate).toBe("2000-01-01T00:00:00Z");
expect(record.warcRefersToTargetURI).toBe("http://example.com/foo");
expect(record.warcRefersToDate).toBe("1999-01-01T00:00:00Z");
expect(record.warcPayloadDigest).toBe(
"sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O"
"sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O",
);
expect(record.warcContentType).toBe("application/http; msgtype=response");
expect(record.warcContentLength).toBe(0);
Expand Down Expand Up @@ -307,15 +307,15 @@ Foo: Bar\r\n\
expect(record).not.toBeNull();
expect(record.warcHeaders.protocol).toBe("WARC/1.0");
expect(record.warcHeader("WARC-Record-ID")).toBe(
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>"
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>",
);
expect(record.warcType).toBe("revisit");
expect(record.warcTargetURI).toBe("http://example.com/");
expect(record.warcDate).toBe("2000-01-01T00:00:00Z");
expect(record.warcRefersToTargetURI).toBe("http://example.com/foo");
expect(record.warcRefersToDate).toBe("1999-01-01T00:00:00Z");
expect(record.warcPayloadDigest).toBe(
"sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O"
"sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O",
);
expect(record.warcContentType).toBe("application/http; msgtype=response");
expect(record.warcContentLength).toBe(54);
Expand Down Expand Up @@ -374,7 +374,7 @@ Foo: Bar\r\n\

expect(record.warcHeaders.protocol).toBe("WARC/1.0");
expect(record.warcHeader("WARC-Record-ID")).toBe(
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>"
"<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>",
);
expect(record.warcType).toBe("revisit");
expect(record.warcContentLength).toBe(82);
Expand Down Expand Up @@ -542,7 +542,7 @@ test("warc1.1 serialize records match", async () => {

test("chunked warc read", async () => {
const input = fs.createReadStream(
get_warc_path("data/example-iana.org-chunked.warc")
get_warc_path("data/example-iana.org-chunked.warc"),
);

const parser = new WARCParser(input);
Expand All @@ -558,7 +558,7 @@ test("chunked warc read", async () => {

// can't read raw data anymore
await expect(async () => await record.readFully(false)).rejects.toThrow(
"WARC Record decoding already started, but requesting raw payload"
"WARC Record decoding already started, but requesting raw payload",
);

const text = await record.contentText();
Expand All @@ -568,15 +568,15 @@ test("chunked warc read", async () => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- checking invalid type
const busyRecord = record as any as { reader: LimitReader };
await expect(async () => await busyRecord.reader.readFully()).rejects.toThrow(
"WARC Record decoding already started, but requesting raw payload"
"WARC Record decoding already started, but requesting raw payload",
);

expect(await record.readFully(true)).not.toBeNull();
});

test("no await catch errors", async () => {
const input = fs.createReadStream(
get_warc_path("data/example-iana.org-chunked.warc")
get_warc_path("data/example-iana.org-chunked.warc"),
);

const parser = new WARCParser(input);
Expand All @@ -597,10 +597,10 @@ test("no await catch errors", async () => {
const record1 = (await parser.parse())!;
expect(record1).not.toBeNull();
await expect(async () => await iter.next()).rejects.toThrow(
"Record already consumed.. Perhaps a promise was not awaited?"
"Record already consumed.. Perhaps a promise was not awaited?",
);
await expect(async () => await record0.readline()).rejects.toThrow(
"Record already consumed.. Perhaps a promise was not awaited?"
"Record already consumed.. Perhaps a promise was not awaited?",
);

let count = 0;
Expand Down Expand Up @@ -685,15 +685,15 @@ text\r\n\
["custom-header", "somevalue"],
["set-cookie", "greeting=hello"],
["set-cookie", "name=world"],
])
]),
);
} else {
expect(JSON.stringify(headerEntries)).toBe(
JSON.stringify([
["content-type", 'text/plain; charset="UTF-8"'],
["custom-header", "somevalue"],
["set-cookie", "greeting=hello, name=world"],
])
]),
);
}

Expand All @@ -718,7 +718,7 @@ text\r\n\
["content-type", 'text/plain; charset="UTF-8"'],
["custom-header", "somevalue"],
["unicode-header", "%F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F"],
])
]),
);

expect(decoder.decode(await record2.readFully())).toBe("more\ntext");
Expand Down

0 comments on commit 53997e6

Please sign in to comment.