Skip to content

Commit

Permalink
Merge pull request #1339 from flavorjones/flavorjones-fix-percent-her…
Browse files Browse the repository at this point in the history
…edocs

fix %i %I %w %W when spanning a heredoc
  • Loading branch information
kddnewton authored Aug 28, 2023
2 parents 7dcef1c + 6b5911b commit 59e3e29
Show file tree
Hide file tree
Showing 6 changed files with 337 additions and 131 deletions.
65 changes: 47 additions & 18 deletions src/yarp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2816,10 +2816,11 @@ yp_interpolated_symbol_node_create(yp_parser_t *parser, const yp_token_t *openin

static inline void
yp_interpolated_symbol_node_append(yp_interpolated_symbol_node_t *node, yp_node_t *part) {
yp_node_list_append(&node->parts, part);
if (!node->base.location.start) {
if (node->parts.size == 0 && node->opening_loc.start == NULL) {
node->base.location.start = part->location.start;
}

yp_node_list_append(&node->parts, part);
node->base.location.end = part->location.end;
}

Expand Down Expand Up @@ -6941,9 +6942,19 @@ parser_lex(yp_parser_t *parser) {
yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);

// If the result is an escaped newline, then we need to
// track that newline.
yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
// If the result is an escaped newline ...
if (*(breakpoint + difference - 1) == '\n') {
if (parser->heredoc_end) {
// ... if we are on the same line as a heredoc, flush the heredoc and
// continue parsing after heredoc_end.
parser->current.end = breakpoint + difference;
parser_flush_heredoc_end(parser);
LEX(YP_TOKEN_STRING_CONTENT);
} else {
// ... else track the newline.
yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
}
}

breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
continue;
Expand Down Expand Up @@ -11933,14 +11944,9 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
yp_array_node_t *array = yp_array_node_create(parser, &parser->previous);

while (!match_any_type_p(parser, 2, YP_TOKEN_STRING_END, YP_TOKEN_EOF)) {
if (yp_array_node_size(array) == 0) {
accept(parser, YP_TOKEN_WORDS_SEP);
} else {
expect(parser, YP_TOKEN_WORDS_SEP, "Expected a separator for the symbols in a `%i` list.");
if (match_type_p(parser, YP_TOKEN_STRING_END)) break;
}

accept(parser, YP_TOKEN_WORDS_SEP);
if (match_type_p(parser, YP_TOKEN_STRING_END)) break;

expect(parser, YP_TOKEN_STRING_CONTENT, "Expected a symbol in a `%i` list.");

yp_token_t opening = not_provided(parser);
Expand Down Expand Up @@ -11995,6 +12001,19 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
// to the list of child nodes.
yp_node_t *part = parse_string_part(parser);
yp_interpolated_symbol_node_append((yp_interpolated_symbol_node_t *) current, part);
} else if (YP_NODE_TYPE_P(current, YP_NODE_SYMBOL_NODE)) {
// If we hit string content and the current node is a string node,
// then we need to convert the current node into an interpolated
// string and add the string content to the list of child nodes.
yp_token_t opening = not_provided(parser);
yp_token_t closing = not_provided(parser);
yp_interpolated_symbol_node_t *interpolated =
yp_interpolated_symbol_node_create(parser, &opening, NULL, &closing);
yp_interpolated_symbol_node_append(interpolated, current);

yp_node_t *part = parse_string_part(parser);
yp_interpolated_symbol_node_append(interpolated, part);
current = (yp_node_t *) interpolated;
} else {
assert(false && "unreachable");
}
Expand Down Expand Up @@ -12097,12 +12116,9 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
accept(parser, YP_TOKEN_WORDS_SEP);

while (!match_any_type_p(parser, 2, YP_TOKEN_STRING_END, YP_TOKEN_EOF)) {
if (yp_array_node_size(array) == 0) {
accept(parser, YP_TOKEN_WORDS_SEP);
} else {
expect(parser, YP_TOKEN_WORDS_SEP, "Expected a separator for the strings in a `%w` list.");
if (match_type_p(parser, YP_TOKEN_STRING_END)) break;
}
accept(parser, YP_TOKEN_WORDS_SEP);
if (match_type_p(parser, YP_TOKEN_STRING_END)) break;

expect(parser, YP_TOKEN_STRING_CONTENT, "Expected a string in a `%w` list.");

yp_token_t opening = not_provided(parser);
Expand Down Expand Up @@ -12152,6 +12168,19 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
// to the list of child nodes.
yp_node_t *part = parse_string_part(parser);
yp_interpolated_string_node_append((yp_interpolated_string_node_t *) current, part);
} else if (YP_NODE_TYPE_P(current, YP_NODE_STRING_NODE)) {
// If we hit string content and the current node is a string node,
// then we need to convert the current node into an interpolated
// string and add the string content to the list of child nodes.
yp_token_t opening = not_provided(parser);
yp_token_t closing = not_provided(parser);
yp_interpolated_string_node_t *interpolated =
yp_interpolated_string_node_create(parser, &opening, NULL, &closing);
yp_interpolated_string_node_append(interpolated, current);

yp_node_t *part = parse_string_part(parser);
yp_interpolated_string_node_append(interpolated, part);
current = (yp_node_t *) interpolated;
} else {
assert(false && "unreachable");
}
Expand Down
51 changes: 51 additions & 0 deletions test/yarp/fixtures/spanning_heredoc.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# test regex, string, and lists that span a heredoc thanks to an escaped newline

# ripper incorrectly creates a "b\nb" token instead of two separate string tokens
pp <<-A.gsub(/b\
a
A
b/, "")

# ripper incorrectly creates a "d\nd" token instead of two separate string tokens
pp <<-A, "d\
c
A
d"

# ripper gets this right
pp <<-A, %q[f\
e
A
f]

# ripper incorrectly creates a "h\nh" token instead of two separate string tokens
pp <<-A, %Q[h\
g
A
h]

# ripper can't parse this successfully, though ruby runs it correctly
pp <<-A, %w[j\
i
A
j]

# ripper can't parse this successfully, though ruby runs it correctly
# TODO: yarp does not include the "\n" in "l\nl" in the AST like ruby does
pp <<-A, %W[l\
k
A
l]

# ripper can't parse this successfully, though ruby runs it correctly
pp <<-A, %i[n\
m
A
n]

# ripper gets this one wrong in the same way that YARP does ...
# TODO: yarp does not include the "\n" in "p\np" in the AST like ruby does
pp <<-A, %I[p\
o
A
p]
13 changes: 0 additions & 13 deletions test/yarp/fixtures/wrapping_heredoc.txt

This file was deleted.

44 changes: 24 additions & 20 deletions test/yarp/parse_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_parse_lex_file

# To accurately compare against Ripper, we need to make sure that we're
# running on Ruby 3.2+.
check_ripper = RUBY_VERSION >= "3.2.0"
ripper_enabled = RUBY_VERSION >= "3.2.0"

# The FOCUS environment variable allows you to specify one particular fixture
# to test, instead of all of them.
Expand All @@ -63,14 +63,30 @@ def test_parse_lex_file
directory = File.dirname(snapshot)
FileUtils.mkdir_p(directory) unless File.directory?(directory)

ripper_should_parse = ripper_should_match = ripper_enabled

# This file has changed behavior in Ripper in Ruby 3.3, so we skip it if
# we're on an earlier version.
ripper_should_match = false if relative == "seattlerb/pct_w_heredoc_interp_nested.txt" && RUBY_VERSION < "3.3.0"

# It seems like there are some oddities with nested heredocs and ripper.
# Waiting for feedback on https://bugs.ruby-lang.org/issues/19838.
ripper_should_match = false if relative == "seattlerb/heredoc_nested.txt"

# Ripper seems to have a bug that the regex portions before and after the heredoc are combined
# into a single token. See https://bugs.ruby-lang.org/issues/19838.
#
# Additionally, Ripper cannot parse the %w[] fixture in this file, so set ripper_should_parse to false.
ripper_should_parse = false if relative == "spanning_heredoc.txt"

define_method "test_filepath_#{relative}" do
# First, read the source from the filepath. Use binmode to avoid converting CRLF on Windows,
# and explicitly set the external encoding to UTF-8 to override the binmode default.
source = File.read(filepath, binmode: true, external_encoding: Encoding::UTF_8)

# Make sure that it can be correctly parsed by Ripper. If it can't, then we have a fixture
# that is invalid Ruby.
refute_nil Ripper.sexp_raw(source) if check_ripper
refute_nil(Ripper.sexp_raw(source), "Ripper failed to parse") if ripper_should_parse

# Next, assert that there were no errors during parsing.
result = YARP.parse(source, relative)
Expand Down Expand Up @@ -118,25 +134,13 @@ def test_parse_lex_file

assert_equal expected_newlines, YARP.const_get(:Debug).newlines(source)

# This file has changed behavior in Ripper in Ruby 3.3, so we skip it if
# we're on an earlier version.
return if relative == "seattlerb/pct_w_heredoc_interp_nested.txt" && RUBY_VERSION < "3.3.0"

# It seems like there are some oddities with nested heredocs and ripper.
# Waiting for feedback on https://bugs.ruby-lang.org/issues/19838.
return if relative == "seattlerb/heredoc_nested.txt"

# Ripper seems to have a bug that the regex portions before and after the heredoc are combined
# into a single token.
return if relative == "wrapping_heredoc.txt"

# Finally, assert that we can lex the source and get the same tokens as
# Ripper.
lex_result = YARP.lex_compat(source)
assert_equal [], lex_result.errors
tokens = lex_result.value
if ripper_should_parse && ripper_should_match
# Finally, assert that we can lex the source and get the same tokens as
# Ripper.
lex_result = YARP.lex_compat(source)
assert_equal [], lex_result.errors
tokens = lex_result.value

if check_ripper
begin
YARP.lex_ripper(source).zip(tokens).each do |(ripper, yarp)|
assert_equal ripper, yarp
Expand Down
Loading

0 comments on commit 59e3e29

Please sign in to comment.