From 7c6edc1b024fc3e01ab46c39763f5121f498fe3a Mon Sep 17 00:00:00 2001 From: Arber Shabhasa Date: Wed, 3 Apr 2024 20:30:35 +0200 Subject: [PATCH] Avoid string trimming --- lib/saxy/parser/builder.ex | 28 +++++++++++++++------------- test/saxy_test.exs | 6 +++--- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/lib/saxy/parser/builder.ex b/lib/saxy/parser/builder.ex index 2ebb5fd..3b5c0b8 100644 --- a/lib/saxy/parser/builder.ex +++ b/lib/saxy/parser/builder.ex @@ -851,7 +851,7 @@ defmodule Saxy.Parser.Builder do open_tag_name(rest, more?, original, pos, state, Utils.compute_char_len(codepoint)) "/" <> rest -> - close_tag_name(rest, more?, original, pos + 1, state, 0) + close_tag_name(rest, more?, original, pos + 1, state, 0, 0) "![CDATA[" <> rest -> element_cdata(rest, more?, original, pos + 8, state, 0) @@ -1161,30 +1161,31 @@ defmodule Saxy.Parser.Builder do end end - defp close_tag_name(<>, more?, original, pos, state, 0) do + defp close_tag_name(<>, more?, original, pos, state, 0, 0) do lookahead buffer, @streaming do char <> rest when is_ascii_name_start_char(char) -> - close_tag_name(rest, more?, original, pos, state, 1) + close_tag_name(rest, more?, original, pos, state, 1, 1) token in unquote(utf8_binaries()) when more? -> - halt!(close_tag_name(token, more?, original, pos, state, 0)) + halt!(close_tag_name(token, more?, original, pos, state, 0, 0)) <> <> rest when is_utf8_name_start_char(codepoint) -> - close_tag_name(rest, more?, original, pos, state, Utils.compute_char_len(codepoint)) + len = Utils.compute_char_len(codepoint) + close_tag_name(rest, more?, original, pos, state, len, len) _ in [""] when more? -> - halt!(close_tag_name("", more?, original, pos, state, 0)) + halt!(close_tag_name("", more?, original, pos, state, 0, 0)) _ -> Utils.parse_error(original, pos, state, {:token, :end_tag}) end end - defp close_tag_name(<>, more?, original, pos, state, len) do + defp close_tag_name(<>, more?, original, pos, state, len, copy_to) do lookahead buffer, @streaming do ">" <> rest -> [open_tag | stack] = state.stack - ending_tag = binary_part(original, pos, len) |> String.trim() + ending_tag = binary_part(original, pos, copy_to) pos = pos + len + 1 if open_tag == ending_tag do @@ -1205,19 +1206,20 @@ defmodule Saxy.Parser.Builder do end char <> rest when is_ascii_name_char(char) -> - close_tag_name(rest, more?, original, pos, state, len + 1) + close_tag_name(rest, more?, original, pos, state, len + 1, copy_to + 1) char <> rest when is_whitespace(char) -> - close_tag_name(rest, more?, original, pos, state, len + 1) + close_tag_name(rest, more?, original, pos, state, len + 1, copy_to) token in unquote(utf8_binaries()) when more? -> - halt!(close_tag_name(token, more?, original, pos, state, len)) + halt!(close_tag_name(token, more?, original, pos, state, len, copy_to)) <> <> rest when is_utf8_name_char(codepoint) -> - close_tag_name(rest, more?, original, pos, state, len + Utils.compute_char_len(codepoint)) + char_len = Utils.compute_char_len(codepoint) + close_tag_name(rest, more?, original, pos, state, len + char_len, copy_to + char_len) _ in [""] when more? -> - halt!(close_tag_name("", more?, original, pos, state, len)) + halt!(close_tag_name("", more?, original, pos, state, len, copy_to)) _ -> Utils.parse_error(original, pos + len, state, {:token, :end_tag}) diff --git a/test/saxy_test.exs b/test/saxy_test.exs index b579494..593f449 100644 --- a/test/saxy_test.exs +++ b/test/saxy_test.exs @@ -74,14 +74,14 @@ defmodule SaxyTest do end test "parse_string/4 parses XML binary with closing tags containing whitespaces" do - data = "Some Data" + data = "Some data" - assert {:ok, state} = parse(data, StackHandler, [], expand_entity: :keep) + assert {:ok, state} = parse(data, StackHandler, []) assert state == [ end_document: {}, end_element: "foo", - characters: "Some Data", + characters: "Some data", start_element: {"foo", []}, start_document: [] ]