Skip to content

Commit

Permalink
{feature} update multiline string handling cf. update in spec
Browse files Browse the repository at this point in the history
  • Loading branch information
bgotink committed Apr 2, 2024
1 parent d600707 commit f66084b
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 22 deletions.
14 changes: 4 additions & 10 deletions src/parser/parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@ import {format} from "../format.js";
import {storeLocation as _storeLocation} from "../locations.js";
import {Document, Entry, Identifier, Node, Tag, Value} from "../model.js";
import {
removeEscapedWhitespace,
removeLeadingWhitespace,
replaceEscapes,
postProcessRawStringValue,
postProcessStringValue,
} from "../string-utils.js";

import {
Expand Down Expand Up @@ -186,12 +185,7 @@ function _parseString(ctx) {
case T_QUOTED_STRING:
pop(ctx);
return [
replaceEscapes(
removeLeadingWhitespace(
removeEscapedWhitespace(token.text.slice(1, -1)),
token,
),
),
postProcessStringValue(token.text.slice(1, -1), token),
token.text,
token,
];
Expand All @@ -202,7 +196,7 @@ function _parseString(ctx) {
const quoteIndex = raw.indexOf('"');

return [
removeLeadingWhitespace(
postProcessRawStringValue(
raw.slice(quoteIndex + 1, -(quoteIndex + 1)),
token,
),
Expand Down
83 changes: 72 additions & 11 deletions src/string-utils.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import {InvalidKdlError, stringifyTokenOffset} from "./error.js";

const escapedWhitespace =
/(?<=(?:^|[^\\])(?:\\\\)*)\\[\x0A\x0C\x0D\x85\u2028\u2029\uFEFF\u0009\u000B\u0020\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]+/g;
const escape = /\\(?:[^u]|u\{([0-9a-fA-F]{1,5}|10[0-9a-fA-F]{4})\})/g;
// Matches one backslash escape sequence. The lookbehind requires the backslash
// to be preceded by an even number of backslashes, i.e. not itself escaped.
// After the backslash the pattern accepts, in order: end of input, a run of
// newline/whitespace characters (capture group 1), any single character other
// than "u", or a braced hexadecimal code point `u{...}` (capture group 2).
const escape =
/(?<=(?:^|[^\\])(?:\\\\)*)\\(?:$|([\x0A\x0C\x0D\x85\u2028\u2029\uFEFF\u0009\u000B\u0020\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]+)|[^u]|u\{([0-9a-fA-F]{1,5}|10[0-9a-fA-F]{4})\})/g;

const escapedValues = new Map([
["\\n", "\n"],
Expand All @@ -17,14 +16,18 @@ const escapedValues = new Map([

const reAllNewlines = /\x0D\x0A|[\x0A\x0C\x0D\x85\u2028\u2029]/g;

export const reEntirelyInlineWhitespace =
const reEntirelyInlineWhitespace =
/^[\uFEFF\u0009\u000B\u0020\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]*$/;

// Matches a trailing backslash that is not itself escaped (preceded by an even
// number of backslashes), optionally followed by inline whitespace up to the
// end of the tested string. Newline characters are not part of the class.
const reEndsWithEscapedWhitespace =
/(?<=(?:^|[^\\])(?:\\\\)*)\\[\uFEFF\u0009\u000B\u0020\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]*$/;

/**
* @param {string} value
* @param {import("./parser/tokenize.js").Token} token
* @returns {string}
*/
export function removeLeadingWhitespace(value, token) {
export function postProcessRawStringValue(value, token) {
const lines = value.split(reAllNewlines);

if (lines.length === 1) {
Expand Down Expand Up @@ -69,23 +72,81 @@ export function removeLeadingWhitespace(value, token) {

/**
* @param {string} value
* @param {import("./parser/tokenize.js").Token} token
* @returns {string}
*/
export function removeEscapedWhitespace(value) {
return value.replaceAll(escapedWhitespace, "");
/**
 * Turn the text between the quotes of a quoted string token into its value.
 *
 * The text is split on newlines. A single-line text only has its escapes
 * replaced. A text whose first line is non-empty is accepted only if every
 * line but the last ends in an escaped-whitespace backslash. Otherwise the
 * text is a multi-line string: the first (empty) line is dropped, the last
 * line defines the indentation to strip from every other line, and escapes
 * are replaced after the remaining lines are joined with "\n".
 *
 * @param {string} value text between the opening and closing quote
 * @param {import("./parser/tokenize.js").Token} token token used to report the error position
 * @returns {string}
 * @throws {InvalidKdlError} when the newline or indentation rules are violated
 */
export function postProcessStringValue(value, token) {
  const segments = value.split(reAllNewlines);
  const [firstSegment] = segments;

  // No newline at all: nothing to dedent.
  if (segments.length === 1) {
    return replaceEscapes(value);
  }

  if (firstSegment.length > 0) {
    // Text follows the opening quote, so this mustn't be a multi-line string:
    // each newline has to be swallowed by a whitespace escape.
    const everyNewlineEscaped = segments
      .slice(0, -1)
      .every((segment) => reEndsWithEscapedWhitespace.test(segment));

    if (everyNewlineEscaped) {
      return replaceEscapes(value);
    }

    throw new InvalidKdlError(
      `Multi-line strings must start with a newline at ${stringifyTokenOffset(token)}`,
    );
  }

  // Multi-line string: the closing line carries the indentation prefix and
  // everything between the first and last line is content.
  const indent = segments[segments.length - 1];
  const contentLines = segments.slice(1, -1);

  if (!reEntirelyInlineWhitespace.test(indent)) {
    throw new InvalidKdlError(
      `Multi-line strings must end with a line containing only whitespace at ${stringifyTokenOffset(token)}`,
    );
  }

  const dedented = [];
  for (const [position, segment] of contentLines.entries()) {
    // Whitespace-only lines collapse to empty lines regardless of indentation.
    if (reEntirelyInlineWhitespace.test(segment)) {
      dedented.push("");
      continue;
    }

    if (!segment.startsWith(indent)) {
      throw new InvalidKdlError(
        `Line ${position + 1} of multi-line string at ${stringifyTokenOffset(token)} doesn't start with the offset defined by the last line of the string`,
      );
    }

    dedented.push(segment.slice(indent.length));
  }

  return replaceEscapes(dedented.join("\n"));
}

/**
* @param {string} value
*/
export function replaceEscapes(value) {
return value.replaceAll(escape, (escape, unicode) => {
if (unicode) {
function replaceEscapes(value) {
return value.replaceAll(escape, (escape, escapedWhitespace, unicode) => {
if (escapedWhitespace) {
return "";
} else if (unicode) {
return String.fromCodePoint(parseInt(unicode, 16));
} else {
const replacement = escapedValues.get(escape);

if (replacement == null) {
throw new InvalidKdlError(`Invalid escape "${escape}"`);
throw new InvalidKdlError(
escape ?
`Invalid escape "\\${escape}"`
: "Invalid whitespace escape at the end of a string",
);
}

return replacement;
Expand Down
21 changes: 21 additions & 0 deletions test/string.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,25 @@ test("\\u escapes", () => {
);
});

// Checks that parse() throws for two multi-line quoted strings containing a
// line that ends in a backslash.
test("invalid multiline escaped whitespace", () => {
// Case 1: the backslash ends a line in the middle of the string ("foo \").
// NOTE(review): the exact indentation inside the template matters to the
// parser — confirm against the original test file.
assert.throws(() => {
parse(String.raw`
node "
foo \
bar
baz
"
`);
});

// Case 2: the backslash ends the last content line ("bar\"), directly before
// the line holding the closing quote.
assert.throws(() => {
parse(String.raw`
node "
foo
bar\
"
`);
});
});

test.run();
2 changes: 1 addition & 1 deletion test/upstream
Submodule upstream updated 1 files
+30 −3 SPEC.md

0 comments on commit f66084b

Please sign in to comment.