Merge pull request #1339 from flavorjones/flavorjones-fix-percent-her…

…edocs fix %i %I %w %W when spanning a heredoc
ruby · Aug 28, 2023 · 59e3e29 · 59e3e29
2 parents 7dcef1c + 6b5911b
commit 59e3e29
Show file tree

Hide file tree

Showing 6 changed files with 337 additions and 131 deletions.
diff --git a/src/yarp.c b/src/yarp.c
@@ -2816,10 +2816,11 @@ yp_interpolated_symbol_node_create(yp_parser_t *parser, const yp_token_t *openin
 
 static inline void
 yp_interpolated_symbol_node_append(yp_interpolated_symbol_node_t *node, yp_node_t *part) {
-    yp_node_list_append(&node->parts, part);
-    if (!node->base.location.start) {
+    if (node->parts.size == 0 && node->opening_loc.start == NULL) {
         node->base.location.start = part->location.start;
     }
+
+    yp_node_list_append(&node->parts, part);
     node->base.location.end = part->location.end;
 }
 
@@ -6941,9 +6942,19 @@ parser_lex(yp_parser_t *parser) {
                     yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
                     size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
 
-                    // If the result is an escaped newline, then we need to
-                    // track that newline.
-                    yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
+                    // If the result is an escaped newline ...
+                    if (*(breakpoint + difference - 1) == '\n') {
+                        if (parser->heredoc_end) {
+                            // ... if we are on the same line as a heredoc, flush the heredoc and
+                            // continue parsing after heredoc_end.
+                            parser->current.end = breakpoint + difference;
+                            parser_flush_heredoc_end(parser);
+                            LEX(YP_TOKEN_STRING_CONTENT);
+                        } else {
+                            // ... else track the newline.
+                            yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
+                        }
+                    }
 
                     breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
                     continue;
@@ -11933,14 +11944,9 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
             yp_array_node_t *array = yp_array_node_create(parser, &parser->previous);
 
             while (!match_any_type_p(parser, 2, YP_TOKEN_STRING_END, YP_TOKEN_EOF)) {
-                if (yp_array_node_size(array) == 0) {
-                    accept(parser, YP_TOKEN_WORDS_SEP);
-                } else {
-                    expect(parser, YP_TOKEN_WORDS_SEP, "Expected a separator for the symbols in a `%i` list.");
-                    if (match_type_p(parser, YP_TOKEN_STRING_END)) break;
-                }
-
+                accept(parser, YP_TOKEN_WORDS_SEP);
                 if (match_type_p(parser, YP_TOKEN_STRING_END)) break;
+
                 expect(parser, YP_TOKEN_STRING_CONTENT, "Expected a symbol in a `%i` list.");
 
                 yp_token_t opening = not_provided(parser);
@@ -11995,6 +12001,19 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
                             // to the list of child nodes.
                             yp_node_t *part = parse_string_part(parser);
                             yp_interpolated_symbol_node_append((yp_interpolated_symbol_node_t *) current, part);
+                        } else if (YP_NODE_TYPE_P(current, YP_NODE_SYMBOL_NODE)) {
+                            // If we hit string content and the current node is a string node,
+                            // then we need to convert the current node into an interpolated
+                            // string and add the string content to the list of child nodes.
+                            yp_token_t opening = not_provided(parser);
+                            yp_token_t closing = not_provided(parser);
+                            yp_interpolated_symbol_node_t *interpolated =
+                                yp_interpolated_symbol_node_create(parser, &opening, NULL, &closing);
+                            yp_interpolated_symbol_node_append(interpolated, current);
+
+                            yp_node_t *part = parse_string_part(parser);
+                            yp_interpolated_symbol_node_append(interpolated, part);
+                            current = (yp_node_t *) interpolated;
                         } else {
                             assert(false && "unreachable");
                         }
@@ -12097,12 +12116,9 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
             accept(parser, YP_TOKEN_WORDS_SEP);
 
             while (!match_any_type_p(parser, 2, YP_TOKEN_STRING_END, YP_TOKEN_EOF)) {
-                if (yp_array_node_size(array) == 0) {
-                    accept(parser, YP_TOKEN_WORDS_SEP);
-                } else {
-                    expect(parser, YP_TOKEN_WORDS_SEP, "Expected a separator for the strings in a `%w` list.");
-                    if (match_type_p(parser, YP_TOKEN_STRING_END)) break;
-                }
+                accept(parser, YP_TOKEN_WORDS_SEP);
+                if (match_type_p(parser, YP_TOKEN_STRING_END)) break;
+
                 expect(parser, YP_TOKEN_STRING_CONTENT, "Expected a string in a `%w` list.");
 
                 yp_token_t opening = not_provided(parser);
@@ -12152,6 +12168,19 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
                             // to the list of child nodes.
                             yp_node_t *part = parse_string_part(parser);
                             yp_interpolated_string_node_append((yp_interpolated_string_node_t *) current, part);
+                        } else if (YP_NODE_TYPE_P(current, YP_NODE_STRING_NODE)) {
+                            // If we hit string content and the current node is a string node,
+                            // then we need to convert the current node into an interpolated
+                            // string and add the string content to the list of child nodes.
+                            yp_token_t opening = not_provided(parser);
+                            yp_token_t closing = not_provided(parser);
+                            yp_interpolated_string_node_t *interpolated =
+                                yp_interpolated_string_node_create(parser, &opening, NULL, &closing);
+                            yp_interpolated_string_node_append(interpolated, current);
+
+                            yp_node_t *part = parse_string_part(parser);
+                            yp_interpolated_string_node_append(interpolated, part);
+                            current = (yp_node_t *) interpolated;
                         } else {
                             assert(false && "unreachable");
                         }

diff --git a/test/yarp/fixtures/spanning_heredoc.txt b/test/yarp/fixtures/spanning_heredoc.txt
@@ -0,0 +1,51 @@
+# test regex, string, and lists that span a heredoc thanks to an escaped newline
+
+# ripper incorrectly creates a "b\nb" token instead of two separate string tokens
+pp <<-A.gsub(/b\
+a
+A
+b/, "")
+
+# ripper incorrectly creates a "d\nd" token instead of two separate string tokens
+pp <<-A, "d\
+c
+A
+d"
+
+# ripper gets this right
+pp <<-A, %q[f\
+e
+A
+f]
+
+# ripper incorrectly creates a "h\nh" token instead of two separate string tokens
+pp <<-A, %Q[h\
+g
+A
+h]
+
+# ripper can't parse this successfully, though ruby runs it correctly
+pp <<-A, %w[j\
+i
+A
+j]
+
+# ripper can't parse this successfully, though ruby runs it correctly
+# TODO: yarp does not include the "\n" in "l\nl" in the AST like ruby does
+pp <<-A, %W[l\
+k
+A
+l]
+
+# ripper can't parse this successfully, though ruby runs it correctly
+pp <<-A, %i[n\
+m
+A
+n]
+
+# ripper gets this one wrong in the same way that YARP does ...
+# TODO: yarp does not include the "\n" in "p\np" in the AST like ruby does
+pp <<-A, %I[p\
+o
+A
+p]
diff --git a/test/yarp/fixtures/wrapping_heredoc.txt b/test/yarp/fixtures/wrapping_heredoc.txt
diff --git a/test/yarp/parse_test.rb b/test/yarp/parse_test.rb
@@ -46,7 +46,7 @@ def test_parse_lex_file
 
   # To accurately compare against Ripper, we need to make sure that we're
   # running on Ruby 3.2+.
-  check_ripper = RUBY_VERSION >= "3.2.0"
+  ripper_enabled = RUBY_VERSION >= "3.2.0"
 
   # The FOCUS environment variable allows you to specify one particular fixture
   # to test, instead of all of them.
@@ -63,14 +63,30 @@ def test_parse_lex_file
     directory = File.dirname(snapshot)
     FileUtils.mkdir_p(directory) unless File.directory?(directory)
 
+    ripper_should_parse = ripper_should_match = ripper_enabled
+
+    # This file has changed behavior in Ripper in Ruby 3.3, so we skip it if
+    # we're on an earlier version.
+    ripper_should_match = false if relative == "seattlerb/pct_w_heredoc_interp_nested.txt" && RUBY_VERSION < "3.3.0"
+
+    # It seems like there are some oddities with nested heredocs and ripper.
+    # Waiting for feedback on https://bugs.ruby-lang.org/issues/19838.
+    ripper_should_match = false if relative == "seattlerb/heredoc_nested.txt"
+
+    # Ripper seems to have a bug that the regex portions before and after the heredoc are combined
+    # into a single token. See https://bugs.ruby-lang.org/issues/19838.
+    #
+    # Additionally, Ripper cannot parse the %w[] fixture in this file, so set ripper_should_parse to false.
+    ripper_should_parse = false if relative == "spanning_heredoc.txt"
+
     define_method "test_filepath_#{relative}" do
       # First, read the source from the filepath. Use binmode to avoid converting CRLF on Windows,
       # and explicitly set the external encoding to UTF-8 to override the binmode default.
       source = File.read(filepath, binmode: true, external_encoding: Encoding::UTF_8)
 
       # Make sure that it can be correctly parsed by Ripper. If it can't, then we have a fixture
       # that is invalid Ruby.
-      refute_nil Ripper.sexp_raw(source) if check_ripper
+      refute_nil(Ripper.sexp_raw(source), "Ripper failed to parse") if ripper_should_parse
 
       # Next, assert that there were no errors during parsing.
       result = YARP.parse(source, relative)
@@ -118,25 +134,13 @@ def test_parse_lex_file
 
       assert_equal expected_newlines, YARP.const_get(:Debug).newlines(source)
 
-      # This file has changed behavior in Ripper in Ruby 3.3, so we skip it if
-      # we're on an earlier version.
-      return if relative == "seattlerb/pct_w_heredoc_interp_nested.txt" && RUBY_VERSION < "3.3.0"
-
-      # It seems like there are some oddities with nested heredocs and ripper.
-      # Waiting for feedback on https://bugs.ruby-lang.org/issues/19838.
-      return if relative == "seattlerb/heredoc_nested.txt"
-
-      # Ripper seems to have a bug that the regex portions before and after the heredoc are combined
-      # into a single token.
-      return if relative == "wrapping_heredoc.txt"
-
-      # Finally, assert that we can lex the source and get the same tokens as
-      # Ripper.
-      lex_result = YARP.lex_compat(source)
-      assert_equal [], lex_result.errors
-      tokens = lex_result.value
+      if ripper_should_parse && ripper_should_match
+        # Finally, assert that we can lex the source and get the same tokens as
+        # Ripper.
+        lex_result = YARP.lex_compat(source)
+        assert_equal [], lex_result.errors
+        tokens = lex_result.value
 
-      if check_ripper
         begin
           YARP.lex_ripper(source).zip(tokens).each do |(ripper, yarp)|
             assert_equal ripper, yarp