Skip to content

Commit

Permalink
Add KDLv2 identifiers to tokenizer, but not to parser
Browse files Browse the repository at this point in the history
  • Loading branch information
tjol committed May 13, 2024
1 parent f805304 commit 38783e5
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 6 deletions.
2 changes: 2 additions & 0 deletions src/grammar.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ bool _kdl_is_whitespace(uint32_t c);
bool _kdl_is_newline(uint32_t c);
bool _kdl_is_id(uint32_t c);
bool _kdl_is_id_start(uint32_t c);
bool _kdl_is_v1_id(uint32_t c);
bool _kdl_is_v1_id_start(uint32_t c);
bool _kdl_is_end_of_word(uint32_t c);

#endif // KDL_INTERNAL_GRAMMAR_H_
36 changes: 32 additions & 4 deletions src/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

#include "bigint.h"
#include "compat.h"
#include "grammar.h"
#include "utf8.h"

#include <math.h>
#include <stdio.h>
Expand Down Expand Up @@ -119,6 +121,7 @@ static bool _parse_decimal_float(kdl_str number, kdl_value* val, kdl_owned_strin
static bool _parse_hex_number(kdl_str number, kdl_value* val, kdl_owned_string* s);
static bool _parse_octal_number(kdl_str number, kdl_value* val, kdl_owned_string* s);
static bool _parse_binary_number(kdl_str number, kdl_value* val, kdl_owned_string* s);
static bool _identifier_is_valid(kdl_str value);

kdl_event_data* kdl_parser_next_event(kdl_parser* self)
{
Expand Down Expand Up @@ -557,10 +560,13 @@ static bool _parse_value(kdl_token const* token, kdl_value* val, kdl_owned_strin
}
}
// this is a regular identifier
*s = kdl_clone_str(&token->value);
val->type = KDL_TYPE_STRING;
val->string = kdl_borrow_str(s);
return true;
if (_identifier_is_valid(token->value)) {
*s = kdl_clone_str(&token->value);
val->type = KDL_TYPE_STRING;
val->string = kdl_borrow_str(s);
return true;
}
_fallthrough_;
default:
return false;
}
Expand Down Expand Up @@ -962,3 +968,25 @@ static bool _parse_binary_number(kdl_str number, kdl_value* val, kdl_owned_strin
_kdl_ubigint_free(n);
return false;
}

// Report whether `value` is acceptable as a KDLv1 identifier.
//
// The tokenizer already lets KDLv2 identifiers through, while the parser only
// supports KDLv1 so far; this is the parser-side filter.
//
// `value` is received by value, so consuming code points from it here does not
// affect the caller's string.
//
// Returns true only if the first code point decodes successfully and is a
// valid KDLv1 identifier start, and every following code point up to the end
// of the string is a valid KDLv1 identifier character.
static bool _identifier_is_valid(kdl_str value)
{
    uint32_t codepoint = 0;

    // The leading code point must exist, decode cleanly, and be a legal start
    if (_kdl_pop_codepoint(&value, &codepoint) != KDL_UTF8_OK) {
        return false;
    }
    if (!_kdl_is_v1_id_start(codepoint)) {
        return false;
    }

    for (;;) {
        switch (_kdl_pop_codepoint(&value, &codepoint)) {
        case KDL_UTF8_EOF:
            // Ran out of input without meeting an illegal character
            return true;
        case KDL_UTF8_OK:
            if (_kdl_is_v1_id(codepoint)) {
                continue; // keep scanning
            }
            return false;
        default:
            // Any other status from the decoder counts as invalid
            return false;
        }
    }
}
6 changes: 4 additions & 2 deletions src/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,14 @@ bool _kdl_is_newline(uint32_t c)
// Report whether code point `c` may appear in a bare identifier as accepted
// by the tokenizer (KDLv2 rules).
//
// A code point qualifies when it is above U+0020, no greater than U+10FFFF,
// is none of the reserved punctuation characters listed below, and is neither
// whitespace nor a newline.
//
// '<', '>' and ',' are deliberately NOT excluded here: KDLv2 allows them in
// identifiers. The stricter KDLv1 rule is implemented by _kdl_is_v1_id(),
// which rejects those three on top of this check.
bool _kdl_is_id(uint32_t c)
{
    if (c <= 0x20 || c > 0x10FFFF) {
        return false;
    }

    switch (c) {
    case '\\':
    case '/':
    case '(':
    case ')':
    case '{':
    case '}':
    case ';':
    case '[':
    case ']':
    case '=':
    case '"':
        // Punctuation with syntactic meaning can never be part of an identifier
        return false;
    default:
        break;
    }

    return !_kdl_is_whitespace(c) && !_kdl_is_newline(c);
}

// Report whether code point `c` may appear in a KDLv1 identifier: it must pass
// _kdl_is_id() and additionally must not be '<', '>' or ','.
bool _kdl_is_v1_id(uint32_t c)
{
    if (!_kdl_is_id(c)) {
        return false;
    }

    switch (c) {
    case '<':
    case '>':
    case ',':
        return false;
    default:
        return true;
    }
}
// Report whether code point `c` may begin an identifier: it must pass
// _kdl_is_id() and must not be an ASCII digit ('0'..'9').
bool _kdl_is_id_start(uint32_t c)
{
    if (!_kdl_is_id(c)) {
        return false;
    }

    bool const is_ascii_digit = (c >= '0' && c <= '9');
    return !is_ascii_digit;
}
// Report whether code point `c` may begin a KDLv1 identifier: it must pass
// _kdl_is_v1_id() and must not be an ASCII digit ('0'..'9').
bool _kdl_is_v1_id_start(uint32_t c)
{
    if (!_kdl_is_v1_id(c)) {
        return false;
    }

    bool const is_ascii_digit = (c >= '0' && c <= '9');
    return !is_ascii_digit;
}

bool _kdl_is_end_of_word(uint32_t c)
{
Expand Down
35 changes: 35 additions & 0 deletions tests/kdlv2_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,42 @@ static void test_tokenizer_strings(void)
kdl_destroy_tokenizer(tok);
}

// The tokenizer must accept '<', ',' and '>' inside bare words: the input
// "a<b;a,b;a>b" is expected to yield three 3-byte WORD tokens separated by
// SEMICOLON tokens, followed by end of input.
static void test_tokenizer_identifiers(void)
{
    static char const* const expected_words[] = {"a<b", "a,b", "a>b"};
    size_t const word_count = sizeof(expected_words) / sizeof(expected_words[0]);

    kdl_token token;
    kdl_str doc = kdl_str_from_cstr("a<b;a,b;a>b");
    kdl_tokenizer* tok = kdl_create_string_tokenizer(doc);

    for (size_t i = 0; i < word_count; ++i) {
        if (i > 0) {
            // Every word after the first is preceded by a semicolon
            ASSERT(kdl_pop_token(tok, &token) == KDL_TOKENIZER_OK);
            ASSERT(token.type == KDL_TOKEN_SEMICOLON);
        }

        ASSERT(kdl_pop_token(tok, &token) == KDL_TOKENIZER_OK);
        ASSERT(token.type == KDL_TOKEN_WORD);
        ASSERT(token.value.len == 3);
        ASSERT(memcmp(token.value.data, expected_words[i], 3) == 0);
    }

    // Nothing may follow the last word
    ASSERT(kdl_pop_token(tok, &token) == KDL_TOKENIZER_EOF);

    kdl_destroy_tokenizer(tok);
}

// Test entry point: runs each KDLv2 tokenizer test case under its display name.
void TEST_MAIN(void)
{
    run_test("Tokenizer: KDLv2 strings", test_tokenizer_strings);
    run_test("Tokenizer: KDLv2 identifiers", test_tokenizer_identifiers);
}

0 comments on commit 38783e5

Please sign in to comment.