Skip to content

Commit

Permalink
Merge pull request #3137 from ruby/unterminated-heredoc
Browse files Browse the repository at this point in the history
Pop lex mode for heredocs in the lexer, not the parser
  • Loading branch information
kddnewton authored Oct 3, 2024
2 parents 34cbade + 5dd36b9 commit cc7918f
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 62 deletions.
34 changes: 22 additions & 12 deletions include/prism/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,23 @@ typedef enum {
PM_HEREDOC_INDENT_TILDE,
} pm_heredoc_indent_t;

/**
* All of the information that must be stored in order to lex a heredoc: where
* its identifier starts and how long it is, plus the quote and indentation
* styles that the heredoc was declared with.
*/
typedef struct {
/** A pointer to the start of the heredoc identifier. */
const uint8_t *ident_start;

/** The length of the heredoc identifier, in bytes. */
size_t ident_length;

/** The type of quote that the heredoc uses. */
pm_heredoc_quote_t quote;

/** The type of indentation that the heredoc uses. */
pm_heredoc_indent_t indent;
} pm_heredoc_lex_mode_t;

/**
* When lexing Ruby source, the lexer has a small amount of state to tell which
* kind of token it is currently lexing. For example, when we find the start of
Expand Down Expand Up @@ -210,17 +227,10 @@ typedef struct pm_lex_mode {
} string;

struct {
/** A pointer to the start of the heredoc identifier. */
const uint8_t *ident_start;

/** The length of the heredoc identifier. */
size_t ident_length;

/** The type of quote that the heredoc uses. */
pm_heredoc_quote_t quote;

/** The type of indentation that the heredoc uses. */
pm_heredoc_indent_t indent;
/**
* All of the data necessary to lex a heredoc.
*/
pm_heredoc_lex_mode_t base;

/**
* This is the pointer to the character where lexing should resume
Expand All @@ -233,7 +243,7 @@ typedef struct pm_lex_mode {
* line so that we know how much to dedent each line in the case of
* a tilde heredoc.
*/
size_t common_whitespace;
size_t *common_whitespace;

/** True if the previous token ended with a line continuation. */
bool line_continuation;
Expand Down
95 changes: 45 additions & 50 deletions src/prism.c
Original file line number Diff line number Diff line change
Expand Up @@ -544,10 +544,7 @@ pm_parser_warn_node(pm_parser_t *parser, const pm_node_t *node, pm_diagnostic_id
* token.
*/
static void
pm_parser_err_heredoc_term(pm_parser_t *parser, pm_lex_mode_t *lex_mode) {
const uint8_t *ident_start = lex_mode->as.heredoc.ident_start;
size_t ident_length = lex_mode->as.heredoc.ident_length;

pm_parser_err_heredoc_term(pm_parser_t *parser, const uint8_t *ident_start, size_t ident_length) {
PM_PARSER_ERR_FORMAT(
parser,
ident_start,
Expand Down Expand Up @@ -11153,12 +11150,14 @@ parser_lex(pm_parser_t *parser) {
lex_mode_push(parser, (pm_lex_mode_t) {
.mode = PM_LEX_HEREDOC,
.as.heredoc = {
.ident_start = ident_start,
.ident_length = ident_length,
.base = {
.ident_start = ident_start,
.ident_length = ident_length,
.quote = quote,
.indent = indent
},
.next_start = parser->current.end,
.quote = quote,
.indent = indent,
.common_whitespace = (size_t) -1,
.common_whitespace = NULL,
.line_continuation = false
}
});
Expand All @@ -11171,7 +11170,7 @@ parser_lex(pm_parser_t *parser) {
// this is not a valid heredoc declaration. In this case we
// will add an error, but we will still return a heredoc
// start.
if (!ident_error) pm_parser_err_heredoc_term(parser, parser->lex_modes.current);
if (!ident_error) pm_parser_err_heredoc_term(parser, ident_start, ident_length);
body_start = parser->end;
} else {
// Otherwise, we want to indicate that the body of the
Expand Down Expand Up @@ -12514,6 +12513,7 @@ parser_lex(pm_parser_t *parser) {
// Now let's grab the information about the identifier off of the
// current lex mode.
pm_lex_mode_t *lex_mode = parser->lex_modes.current;
pm_heredoc_lex_mode_t *heredoc_lex_mode = &lex_mode->as.heredoc.base;

bool line_continuation = lex_mode->as.heredoc.line_continuation;
lex_mode->as.heredoc.line_continuation = false;
Expand All @@ -12523,15 +12523,16 @@ parser_lex(pm_parser_t *parser) {
// terminator) but still continue parsing so that content after the
// declaration of the heredoc can be parsed.
if (parser->current.end >= parser->end) {
pm_parser_err_heredoc_term(parser, lex_mode);
pm_parser_err_heredoc_term(parser, heredoc_lex_mode->ident_start, heredoc_lex_mode->ident_length);
parser->next_start = lex_mode->as.heredoc.next_start;
parser->heredoc_end = parser->current.end;
lex_state_set(parser, PM_LEX_STATE_END);
lex_mode_pop(parser);
LEX(PM_TOKEN_HEREDOC_END);
}

const uint8_t *ident_start = lex_mode->as.heredoc.ident_start;
size_t ident_length = lex_mode->as.heredoc.ident_length;
const uint8_t *ident_start = heredoc_lex_mode->ident_start;
size_t ident_length = heredoc_lex_mode->ident_length;

// If we are immediately following a newline and we have hit the
// terminator, then we need to return the ending of the heredoc.
Expand All @@ -12556,10 +12557,7 @@ parser_lex(pm_parser_t *parser) {
const uint8_t *terminator_start = ident_end - ident_length;
const uint8_t *cursor = start;

if (
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE
) {
if (heredoc_lex_mode->indent == PM_HEREDOC_INDENT_DASH || heredoc_lex_mode->indent == PM_HEREDOC_INDENT_TILDE) {
while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
cursor++;
}
Expand All @@ -12582,17 +12580,19 @@ parser_lex(pm_parser_t *parser) {
}

lex_state_set(parser, PM_LEX_STATE_END);
lex_mode_pop(parser);
LEX(PM_TOKEN_HEREDOC_END);
}
}

size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, heredoc_lex_mode->indent);
if (
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
(lex_mode->as.heredoc.common_whitespace > whitespace) &&
heredoc_lex_mode->indent == PM_HEREDOC_INDENT_TILDE &&
lex_mode->as.heredoc.common_whitespace != NULL &&
(*lex_mode->as.heredoc.common_whitespace > whitespace) &&
peek_at(parser, start) != '\n'
) {
lex_mode->as.heredoc.common_whitespace = whitespace;
*lex_mode->as.heredoc.common_whitespace = whitespace;
}
}

Expand All @@ -12601,7 +12601,7 @@ parser_lex(pm_parser_t *parser) {
// strpbrk to find the first of these characters.
uint8_t breakpoints[] = "\r\n\\#";

pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote;
pm_heredoc_quote_t quote = heredoc_lex_mode->quote;
if (quote == PM_HEREDOC_QUOTE_SINGLE) {
breakpoints[3] = '\0';
}
Expand Down Expand Up @@ -12664,8 +12664,7 @@ parser_lex(pm_parser_t *parser) {
// leading whitespace if we have a - or ~ heredoc.
const uint8_t *cursor = start;

if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
if (heredoc_lex_mode->indent == PM_HEREDOC_INDENT_DASH || heredoc_lex_mode->indent == PM_HEREDOC_INDENT_TILDE) {
while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
cursor++;
}
Expand All @@ -12681,16 +12680,16 @@ parser_lex(pm_parser_t *parser) {
}
}

size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.base.indent);

// If we have hit a newline that is followed by a valid
// terminator, then we need to return the content of the
// heredoc here as string content. Then, the next time a
// token is lexed, it will match again and return the
// end of the heredoc.
if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
lex_mode->as.heredoc.common_whitespace = whitespace;
if (lex_mode->as.heredoc.base.indent == PM_HEREDOC_INDENT_TILDE) {
if ((lex_mode->as.heredoc.common_whitespace != NULL) && (*lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
*lex_mode->as.heredoc.common_whitespace = whitespace;
}

parser->current.end = breakpoint + 1;
Expand Down Expand Up @@ -12757,7 +12756,7 @@ parser_lex(pm_parser_t *parser) {
// If we are in a tilde here, we should
// break out of the loop and return the
// string content.
if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
if (heredoc_lex_mode->indent == PM_HEREDOC_INDENT_TILDE) {
const uint8_t *end = parser->current.end;
pm_newline_list_append(&parser->newline_list, end);

Expand Down Expand Up @@ -13165,13 +13164,11 @@ expect3(pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_to
* lex mode accordingly.
*/
static void
expect1_heredoc_term(pm_parser_t *parser, pm_lex_mode_t *lex_mode) {
expect1_heredoc_term(pm_parser_t *parser, const uint8_t *ident_start, size_t ident_length) {
if (match1(parser, PM_TOKEN_HEREDOC_END)) {
lex_mode_pop(parser);
parser_lex(parser);
} else {
pm_parser_err_heredoc_term(parser, lex_mode);
lex_mode_pop(parser);
pm_parser_err_heredoc_term(parser, ident_start, ident_length);
parser->previous.start = parser->previous.end;
parser->previous.type = PM_TOKEN_MISSING;
}
Expand Down Expand Up @@ -18442,10 +18439,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
case PM_TOKEN_HEREDOC_START: {
// Here we have found a heredoc. We'll parse it and add it to the
// list of strings.
pm_lex_mode_t *lex_mode = parser->lex_modes.current;
assert(lex_mode->mode == PM_LEX_HEREDOC);
pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote;
pm_heredoc_indent_t indent = lex_mode->as.heredoc.indent;
assert(parser->lex_modes.current->mode == PM_LEX_HEREDOC);
pm_heredoc_lex_mode_t lex_mode = parser->lex_modes.current->as.heredoc.base;

size_t common_whitespace = (size_t) -1;
parser->lex_modes.current->as.heredoc.common_whitespace = &common_whitespace;

parser_lex(parser);
pm_token_t opening = parser->previous;
Expand All @@ -18456,10 +18454,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
if (match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
// If we get here, then we have an empty heredoc. We'll create
// an empty content token and return an empty string node.
expect1_heredoc_term(parser, lex_mode);
expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length);
pm_token_t content = parse_strings_empty_content(parser->previous.start);

if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
if (lex_mode.quote == PM_HEREDOC_QUOTE_BACKTICK) {
node = (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_STRING_EMPTY);
} else {
node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_STRING_EMPTY);
Expand All @@ -18486,18 +18484,17 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
cast->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->current);
cast->base.location = cast->opening_loc;

if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
if (lex_mode.quote == PM_HEREDOC_QUOTE_BACKTICK) {
assert(sizeof(pm_string_node_t) == sizeof(pm_x_string_node_t));
cast->base.type = PM_X_STRING_NODE;
}

size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
if (lex_mode.indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
parse_heredoc_dedent_string(&cast->unescaped, common_whitespace);
}

node = (pm_node_t *) cast;
expect1_heredoc_term(parser, lex_mode);
expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length);
} else {
// If we get here, then we have multiple parts in the heredoc,
// so we'll need to create an interpolated string node to hold
Expand All @@ -18511,15 +18508,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
}
}

size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;

// Now that we have all of the parts, create the correct type of
// interpolated node.
if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
if (lex_mode.quote == PM_HEREDOC_QUOTE_BACKTICK) {
pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening);
cast->parts = parts;

expect1_heredoc_term(parser, lex_mode);
expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length);
pm_interpolated_xstring_node_closing_set(cast, &parser->previous);

cast->base.location = cast->opening_loc;
Expand All @@ -18528,7 +18523,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening);
pm_node_list_free(&parts);

expect1_heredoc_term(parser, lex_mode);
expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length);
pm_interpolated_string_node_closing_set(cast, &parser->previous);

cast->base.location = cast->opening_loc;
Expand All @@ -18537,9 +18532,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b

// If this is a heredoc that is indented with a ~, then we need
// to dedent each line by the common leading whitespace.
if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
if (lex_mode.indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
pm_node_list_t *nodes;
if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
if (lex_mode.quote == PM_HEREDOC_QUOTE_BACKTICK) {
nodes = &((pm_interpolated_x_string_node_t *) node)->parts;
} else {
nodes = &((pm_interpolated_string_node_t *) node)->parts;
Expand Down
9 changes: 9 additions & 0 deletions test/prism/errors/heredoc_unterminated.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
a=>{<<b
^ unterminated heredoc; can't find string "b" anywhere before EOF
^~~ unexpected heredoc beginning; expected a key in the hash pattern
^ unterminated heredoc; can't find string "b" anywhere before EOF
^~~ expected a label as the key in the hash pattern
^ expected a `}` to close the pattern expression
^ unexpected heredoc ending, expecting end-of-input
^ unexpected heredoc ending, ignoring it

0 comments on commit cc7918f

Please sign in to comment.