Skip to content

Commit

Permalink
Fix multiline string whitespace handling
Browse files: browse the repository at this point in the history
  • Loading branch information
tjol committed Dec 12, 2024
1 parent 647b784 commit ec64093
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 31 deletions.
55 changes: 32 additions & 23 deletions src/str.c
Original file line number Diff line number Diff line change
Expand Up @@ -461,34 +461,43 @@ kdl_owned_string _kdl_dedent_multiline_string(kdl_str const* s)
// Remove the whitespace from the beginning of all lines
buf_dedented = malloc(norm_lf.len);
char* out = buf_dedented;
char const* in = norm_lf.data; // skip initial LF
char const* in = norm_lf.data + 1; // skip initial LF
char const* end = norm_lf.data + norm_lf.len;
bool at_start = true;
// copy the rest of the string
if (norm_lf.len > 1) {
while (in < end) {
*out = *in;
if (*in == '\n') {
if (in + 1 < end && *(in + 1) == '\n') {
// double newline - ok
} else {
// check indent
if (memcmp(in + 1, indent.data, indent.len) == 0) {
// skip indent
in += indent.len;
} else {
goto dedent_err;
}
}
while (in < end) {
// find the next newline
char const* eol = in;
while (eol < end && *eol != '\n') ++eol;
if (eol == end) break;

// check if the line is all whitespace
kdl_str line = (kdl_str){.data = in, .len = (eol - in)};
uint32_t c;
bool is_ws = true;
while (_kdl_pop_codepoint(&line, &c) == KDL_UTF8_OK) {
if (!_kdl_is_whitespace(KDL_CHARACTER_SET_V2, c)) {
is_ws = false;
break;
}
if (!at_start) {
// Skip the initial newline => only advance the output pointer
// if we're somewhere other than the initial newline
++out;
}

if (is_ws) {
// push a single newline
*(out++) = '\n';
} else {
// check if the line starts with the indent
size_t line_len = eol - in;
if (line_len >= indent.len && memcmp(in, indent.data, indent.len) == 0) {
// copy the line after the indent to the output (incl newline)
size_t n = line_len - indent.len + 1;
memcpy(out, in + indent.len, n);
out += n;
} else {
goto dedent_err;
}
++in;
at_start = false;
}

in = eol + 1;
}

size_t len = out - buf_dedented;
Expand Down
15 changes: 7 additions & 8 deletions tests/kdlv2_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ static void test_tokenizer_multiline_strings(void)
kdl_destroy_tokenizer(tok);
}


static void test_tokenizer_identifiers(void)
{
kdl_token token;
Expand Down Expand Up @@ -240,11 +239,13 @@ static void test_multiline_strings(void)
}

kdl_str edge_cases[][2] = {
{kdl_str_from_cstr("\n\t"), kdl_str_from_cstr("") }, // empty
{kdl_str_from_cstr("\n"), kdl_str_from_cstr("") }, // empty
{kdl_str_from_cstr("\n\n hello\n "), kdl_str_from_cstr("\nhello")}, // double newline at start
{kdl_str_from_cstr("\n \\\n \n "), kdl_str_from_cstr("") }, // escaped newline within
{kdl_str_from_cstr("\n \n \n "), kdl_str_from_cstr("\n ") }, // whitespace only
{kdl_str_from_cstr("\n\t"), kdl_str_from_cstr("") }, // empty
{kdl_str_from_cstr("\n"), kdl_str_from_cstr("") }, // empty
{kdl_str_from_cstr("\n\n hello\n "), kdl_str_from_cstr("\nhello")}, // double newline at start
{kdl_str_from_cstr("\n \\\n \n "), kdl_str_from_cstr("") }, // escaped newline within
{kdl_str_from_cstr("\n \n \n "), kdl_str_from_cstr("\n") }, // empty line with extra ws
{kdl_str_from_cstr("\n \n\t\n "), kdl_str_from_cstr("\n") }, // empty line with odd whitespace
{kdl_str_from_cstr("\n \n \\s \n "), kdl_str_from_cstr("\n ") }, // whitespace only
};
int n_edge_cases = sizeof(edge_cases) / sizeof(edge_cases[0]);

Expand Down Expand Up @@ -444,7 +445,6 @@ static void test_parser_comment_in_property(void)
kdl_destroy_parser(parser);
}


static void test_parser_comment_in_type(void)
{
kdl_event_data* ev;
Expand Down Expand Up @@ -475,7 +475,6 @@ static void test_parser_comment_in_type(void)
kdl_destroy_parser(parser);
}


void TEST_MAIN(void)
{
run_test("Tokenizer: KDLv2 raw strings", &test_tokenizer_raw_strings);
Expand Down

0 comments on commit ec64093

Please sign in to comment.