Skip to content

Commit

Permalink
Fix Issue 65: incorrect node return format for doctype when using htm…
Browse files Browse the repository at this point in the history
…l5ever (#66)

* Add a failing test for issue #65

* Fix #65.

* Run html5ever tests only if Elixir version >= 1.13

* Make html5ever dependency conditional on running Elixir >=1.13

* Run tests sequentially, to avoid conflicts with setting the app config.
  • Loading branch information
vkryukov authored Nov 14, 2024
1 parent e3c152b commit 06db3f1
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 2 deletions.
1 change: 1 addition & 0 deletions lib/readability/helper.ex
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ defmodule Readability.Helper do
@spec remove_tag(html_tree, fun) :: html_tree
def remove_tag(content, _) when is_binary(content), do: content
def remove_tag([], _), do: []
# Drop doctype nodes, which show up as `{:doctype, _, _, _}` tuples when the
# html5ever parser is used (issue #65), and keep processing the rest of the list.
def remove_tag([{:doctype, _, _, _} | t], fun), do: remove_tag(t, fun)

def remove_tag([h | t], fun) do
node = remove_tag(h, fun)
Expand Down
10 changes: 9 additions & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,14 @@ defmodule Readability.Mixfile do
# https://github.com/lpil/mix-test.watch/pull/140#issuecomment-1853912030
test_watch_runtime = match?(["test.watch" | _], System.argv())

# html5ever is only needed by the test suite and requires Elixir >= 1.13.
# On older versions (e.g. Elixir 1.10) contribute no dependency at all, so the
# rest of the test suite can still be fetched, compiled and run there.
html5ever_dep =
if Version.match?(System.version(), ">= 1.13.0") do
{:html5ever, "~> 0.16", only: :test}
else
[]
end

[
{:floki, "~> 0.24"},
{:httpoison, "~> 1.8 or ~> 2.0"},
Expand All @@ -42,7 +50,7 @@ defmodule Readability.Mixfile do
{:mock, "~> 0.3", only: :test},
{:excoveralls, "~> 0.18", only: :test},
{:mix_test_watch, "~> 1.0", only: [:dev, :test], runtime: test_watch_runtime}
]
] ++ List.wrap(html5ever_dep)
end

defp package do
Expand Down
3 changes: 3 additions & 0 deletions mix.lock
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
%{
"bunt": {:hex, :bunt, "0.2.1", "e2d4792f7bc0ced7583ab54922808919518d0e57ee162901a16a1b6664ef3b14", [:mix], [], "hexpm", "a330bfb4245239787b15005e66ae6845c9cd524a288f0d141c148b02603777a5"},
"castore": {:hex, :castore, "1.0.9", "5cc77474afadf02c7c017823f460a17daa7908e991b0cc917febc90e466a375c", [:mix], [], "hexpm", "5ea956504f1ba6f2b4eb707061d8e17870de2bee95fb59d512872c2ef06925e7"},
"certifi": {:hex, :certifi, "2.12.0", "2d1cca2ec95f59643862af91f001478c9863c2ac9cb6e2f89780bfd8de987329", [:rebar3], [], "hexpm", "ee68d85df22e554040cdb4be100f33873ac6051387baf6a8f6ce82272340ff1c"},
"credo": {:hex, :credo, "1.7.1", "6e26bbcc9e22eefbff7e43188e69924e78818e2fe6282487d0703652bc20fd62", [:mix], [{:bunt, "~> 0.2.1", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2.8", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "e9871c6095a4c0381c89b6aa98bc6260a8ba6addccf7f6a53da8849c748a58a2"},
"dialyxir": {:hex, :dialyxir, "1.4.2", "764a6e8e7a354f0ba95d58418178d486065ead1f69ad89782817c296d0d746a5", [:mix], [{:erlex, ">= 0.2.6", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "516603d8067b2fd585319e4b13d3674ad4f314a5902ba8130cd97dc902ce6bbd"},
Expand All @@ -10,6 +11,7 @@
"file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"},
"floki": {:hex, :floki, "0.35.2", "87f8c75ed8654b9635b311774308b2760b47e9a579dabf2e4d5f1e1d42c39e0b", [:mix], [], "hexpm", "6b05289a8e9eac475f644f09c2e4ba7e19201fd002b89c28c1293e7bd16773d9"},
"hackney": {:hex, :hackney, "1.20.1", "8d97aec62ddddd757d128bfd1df6c5861093419f8f7a4223823537bad5d064e2", [:rebar3], [{:certifi, "~> 2.12.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "fe9094e5f1a2a2c0a7d10918fee36bfec0ec2a979994cff8cfe8058cd9af38e3"},
"html5ever": {:hex, :html5ever, "0.16.1", "3dccc3349e0c3e5f5542bcc09253e6246d174391aca692bdecccd446a1c62132", [:mix], [{:rustler, ">= 0.0.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.6.0 or ~> 0.7.0", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "6eb06b7796eb100bc815dffd3f500de376a426a088a8405402305cdd8e7cc08a"},
"httpoison": {:hex, :httpoison, "2.2.1", "87b7ed6d95db0389f7df02779644171d7319d319178f6680438167d7b69b1f3d", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "51364e6d2f429d80e14fe4b5f8e39719cacd03eb3f9a9286e61e216feac2d2df"},
"idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
"jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"},
Expand All @@ -23,6 +25,7 @@
"mock": {:hex, :mock, "0.3.8", "7046a306b71db2488ef54395eeb74df0a7f335a7caca4a3d3875d1fc81c884dd", [:mix], [{:meck, "~> 0.9.2", [hex: :meck, repo: "hexpm", optional: false]}], "hexpm", "7fa82364c97617d79bb7d15571193fc0c4fe5afd0c932cef09426b3ee6fe2022"},
"nimble_parsec": {:hex, :nimble_parsec, "1.4.0", "51f9b613ea62cfa97b25ccc2c1b4216e81df970acd8e16e8d1bdc58fef21370d", [:mix], [], "hexpm", "9c565862810fb383e9838c1dd2d7d2c437b3d13b267414ba6af33e50d2d1cf28"},
"parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
"rustler_precompiled": {:hex, :rustler_precompiled, "0.7.3", "42cb9449785cd86c87453e39afdd27a0bdfa5c77a4ec5dc5ce45112e06b9f89b", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "cbc4b3777682e5f6f43ed39b0e0b4a42dccde8053aba91b4514e8f5ff9a5ac6d"},
"ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"},
"unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"},
}
19 changes: 18 additions & 1 deletion test/readability_test.exs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
defmodule ReadabilityTest do
use ExUnit.Case, async: true
use ExUnit.Case, async: false

test "readability for NY Times" do
html = TestHelper.read_fixture("nytimes.html")
Expand Down Expand Up @@ -82,4 +82,21 @@ defmodule ReadabilityTest do
assert pubmed_text =~
~r/with different mechanisms yielded potent antihypertensive efficacy with safety and decreased plasma BNP levels.$/
end

test "correctly processing DOCTYPE when using html5ever parser" do
# Regression test for issue #65 (doctype node format when Floki uses html5ever).
# Since html5ever requires Elixir 1.13 or later, this test is a no-op on earlier Elixir versions.
if Version.match?(System.version(), ">=1.13.0") do
# Remember the parser currently configured for Floki (falling back to Mochiweb
# when none is set) so it can be put back once the test is done.
original_parser = Application.get_env(:floki, :html_parser) || Floki.HTMLParser.Mochiweb
Application.put_env(:floki, :html_parser, Floki.HTMLParser.Html5ever)

try do
html = TestHelper.read_fixture("medium.html")
# No explicit assertion: the test passes as long as this pipeline does not raise.
html |> Readability.article() |> Readability.readable_html()
after
# Restore the previous parser even if the pipeline above raised, because the
# application environment is global and shared with the other tests.
Application.put_env(:floki, :html_parser, original_parser)
end
else
:ok
end
end
end

0 comments on commit 06db3f1

Please sign in to comment.