From e68262ba67214baebdb734b22ce9df66cc7cb6e1 Mon Sep 17 00:00:00 2001 From: Kun Li <122563761+kunli2@users.noreply.github.com> Date: Mon, 2 Oct 2023 16:51:13 -0700 Subject: [PATCH] Update json parser to support multiple-bytes-unicodes (#3596) * Update json parser to support multiple-bytes-unicodes --- .../json/internal/JsonParserVisitor.java | 51 ++++++++++++++++--- .../org/openrewrite/json/JsonParserTest.java | 14 +++++ 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/rewrite-json/src/main/java/org/openrewrite/json/internal/JsonParserVisitor.java b/rewrite-json/src/main/java/org/openrewrite/json/internal/JsonParserVisitor.java index 37f6c8717cf..033a07b3ced 100644 --- a/rewrite-json/src/main/java/org/openrewrite/json/internal/JsonParserVisitor.java +++ b/rewrite-json/src/main/java/org/openrewrite/json/internal/JsonParserVisitor.java @@ -49,12 +49,49 @@ public class JsonParserVisitor extends JSON5BaseVisitor { private int cursor = 0; - public JsonParserVisitor(Path path, @Nullable FileAttributes fileAttributes, EncodingDetectingInputStream source) { + // Whether the source has multi bytes (> 2 bytes) unicode characters + private final boolean hasMultiBytesUnicode; + // Antlr index to source index mapping + private final int[] indexes; + + public JsonParserVisitor(Path path, + @Nullable FileAttributes fileAttributes, + EncodingDetectingInputStream sourceInput + ) { this.path = path; this.fileAttributes = fileAttributes; - this.source = source.readFully(); - this.charset = source.getCharset(); - this.charsetBomMarked = source.isCharsetBomMarked(); + this.source = sourceInput.readFully(); + this.charset = sourceInput.getCharset(); + this.charsetBomMarked = sourceInput.isCharsetBomMarked(); + + boolean hasMultiBytesUnicode = false; + int[] pos = new int[source.length() + 1]; + int cursor = 0; + int i = 1; + pos[0] = 0; + + while (cursor < source.length()) { + int newCursor = source.offsetByCodePoints(cursor, 1); + if (newCursor > cursor + 1) { + hasMultiBytesUnicode = true; + } + pos[i++] = newCursor; + cursor = newCursor; + } + + this.hasMultiBytesUnicode = hasMultiBytesUnicode; + this.indexes = hasMultiBytesUnicode ? pos : null; + } + + /** + * Characters index to source index mapping, valid only when `hasMultiBytesUnicode` is true. + * Antlr index is based on characters index and reader is based on source index. + * If there are any >2 bytes unicode characters in source code, it will make the index mismatch. + * @param index index from Antlr + * @return corrected cursor index + */ + private int getCursorIndex(int index) { + return hasMultiBytesUnicode ? indexes[index] : index; } @Override @@ -264,7 +301,7 @@ private Space prefix(@Nullable TerminalNode terminalNode) { } private Space prefix(Token token) { - int start = token.getStartIndex(); + int start = getCursorIndex(token.getStartIndex()); if (start < cursor) { return Space.EMPTY; } @@ -281,7 +318,7 @@ private T convert(C ctx, BiFunction T convert(C ctx, BiFunction T convert(TerminalNode node, BiFunction conversion) { T t = conversion.apply(node, prefix(node)); - cursor = node.getSymbol().getStopIndex() + 1; + cursor = getCursorIndex(node.getSymbol().getStopIndex()) + 1; return t; } diff --git a/rewrite-json/src/test/java/org/openrewrite/json/JsonParserTest.java b/rewrite-json/src/test/java/org/openrewrite/json/JsonParserTest.java index 854a74974e7..5eb7c1703da 100644 --- a/rewrite-json/src/test/java/org/openrewrite/json/JsonParserTest.java +++ b/rewrite-json/src/test/java/org/openrewrite/json/JsonParserTest.java @@ -146,4 +146,18 @@ void empty() { json("") ); } + + @Issue("https://github.com/openrewrite/rewrite/issues/3582") + @Test + void multiBytesUnicode() { + rewriteRun( + json( + """ + { + "🤖" : "robot" + } + """ + ) + ); + } }