fix(markdown): convert markdown to html, then sanitize

Revert the scrubber from `html5` back to a custom scrubber that is a variation of the markdown scrubber from html_sanitize_ex.

In addition:

* add a test that covers: inputs and forms should NOT be allowed in markdown content
* add a test that covers: inputs, forms, and html5 tags should be allowed within code blocks in markdown content
hashrocket · Apr 11, 2023 · abf05fa · abf05fa
1 parent e8d14d3
commit abf05fa
Show file tree

Hide file tree

Showing 3 changed files with 106 additions and 16 deletions.
diff --git a/lib/tilex/blog/post_scrubber.ex b/lib/tilex/blog/post_scrubber.ex
@@ -0,0 +1,73 @@
+defmodule Tilex.Blog.PostScrubber do
+  @moduledoc """
+  Allow-list HTML scrubber used to sanitize HTML rendered from markdown.
+
+  Mostly a copy of the `markdown_html` scrubber shipped with HtmlSanitizeEx; the local
+  changes are grouped under the "Additions" comment near the bottom of this module.
+  For more info on customizing scrubbers, see the docs for HtmlSanitizeEx.
+  """
+
+  require HtmlSanitizeEx.Scrubber.Meta
+  alias HtmlSanitizeEx.Scrubber.Meta
+
+  @valid_schemes ["http", "https", "mailto"]
+
+  # Strips CDATA sections from the input before the traverser/scrubber runs.
+  Meta.remove_cdata_sections_before_scrub()
+
+  Meta.strip_comments()
+
+  Meta.allow_tag_with_uri_attributes("a", ["href"], @valid_schemes)
+  Meta.allow_tag_with_these_attributes("a", ["name", "title"])
+
+  Meta.allow_tag_with_this_attribute_values("a", "target", ["_blank"])
+
+  Meta.allow_tag_with_this_attribute_values("a", "rel", [
+    "noopener",
+    "noreferrer"
+  ])
+
+  Meta.allow_tag_with_these_attributes("b", [])
+  Meta.allow_tag_with_these_attributes("blockquote", [])
+  Meta.allow_tag_with_these_attributes("br", [])
+  Meta.allow_tag_with_these_attributes("code", ["class"])
+  Meta.allow_tag_with_these_attributes("del", [])
+  Meta.allow_tag_with_these_attributes("em", [])
+  Meta.allow_tag_with_these_attributes("h1", [])
+  Meta.allow_tag_with_these_attributes("h2", [])
+  Meta.allow_tag_with_these_attributes("h3", [])
+  Meta.allow_tag_with_these_attributes("h4", [])
+  Meta.allow_tag_with_these_attributes("h5", [])
+  Meta.allow_tag_with_these_attributes("h6", [])
+  Meta.allow_tag_with_these_attributes("hr", [])
+  Meta.allow_tag_with_these_attributes("i", [])
+
+  Meta.allow_tag_with_uri_attributes("img", ["src"], @valid_schemes)
+
+  Meta.allow_tag_with_these_attributes("img", [
+    "width",
+    "height",
+    "title",
+    "alt"
+  ])
+
+  Meta.allow_tag_with_these_attributes("li", [])
+  Meta.allow_tag_with_these_attributes("ol", [])
+  Meta.allow_tag_with_these_attributes("p", [])
+  Meta.allow_tag_with_these_attributes("pre", [])
+  Meta.allow_tag_with_these_attributes("span", [])
+  Meta.allow_tag_with_these_attributes("strong", [])
+  Meta.allow_tag_with_these_attributes("table", [])
+  Meta.allow_tag_with_these_attributes("tbody", [])
+  Meta.allow_tag_with_these_attributes("td", [])
+  Meta.allow_tag_with_these_attributes("th", [])
+  Meta.allow_tag_with_these_attributes("thead", [])
+  Meta.allow_tag_with_these_attributes("tr", [])
+  Meta.allow_tag_with_these_attributes("u", [])
+  Meta.allow_tag_with_these_attributes("ul", [])
+
+  # Additions (rules on top of the copied markdown_html scrubber)
+  Meta.allow_tag_with_these_attributes("div", [])
+
+  Meta.strip_everything_not_covered()
+end
diff --git a/lib/tilex/markdown.ex b/lib/tilex/markdown.ex
@@ -1,5 +1,7 @@
 defmodule Tilex.Markdown do
   alias Tilex.Cache
+  alias Tilex.Blog.PostScrubber
+  alias HtmlSanitizeEx.Scrubber
 
   @earmark_options %Earmark.Options{
     code_class_prefix: "language-",
@@ -18,24 +20,22 @@ defmodule Tilex.Markdown do
   # Renders markdown to sanitized HTML without going through the cache.
   # Sanitizing runs on the HTML that Earmark produced, not on the raw markdown,
   # and the result is trimmed of surrounding whitespace.
   def to_html_live(markdown) do
     markdown
     |> Earmark.as_html!(@earmark_options)
-    |> HtmlSanitizeEx.html5()
+    |> sanitize_markdown_html()
     |> String.trim()
   end
 
-  def to_html(markdown) do
-    Cache.cache(markdown, fn ->
-      to_html_live(markdown)
-    end)
-  end
   # Cached counterpart of to_html_live/1: the markdown text is handed to
   # Cache.cache/2 together with a function that performs the live render.
+  def to_html(markdown), do: Cache.cache(markdown, fn -> to_html_live(markdown) end)
 
   # Returns the plain-text content of a markdown document: render it with the
   # content-specific Earmark options, sanitize the HTML, parse it with Floki and
   # take the trimmed text of the fragment. If parsing does not return
   # {:ok, fragment}, the original markdown is returned unchanged.
   def to_content(markdown) do
     markdown
     |> Earmark.as_html!(@content_earmark_options)
-    |> HtmlSanitizeEx.html5()
+    |> sanitize_markdown_html()
     |> Floki.parse_fragment()
     |> case do
       {:ok, fragment} -> fragment |> Floki.text() |> String.trim()
       _error -> markdown
     end
   end
+
   # Scrubs rendered HTML with the PostScrubber rules (Tilex.Blog.PostScrubber).
+  defp sanitize_markdown_html(html), do: Scrubber.scrub(html, PostScrubber)
 end
diff --git a/test/lib/tilex/markdown_test.exs b/test/lib/tilex/markdown_test.exs
@@ -200,30 +200,47 @@ defmodule Lib.Tilex.MarkdownTest do
       input: "regular script <script>alert('attack')</script>",
       html: "<p>\nregular script &lt;script&gt;alert('attack')&lt;/script&gt;</p>",
       content: "regular script alert('attack')"
+    },
+    %{
+      snippet_description: "does not allow inputs or forms to be rendered as html",
+      input: "<form><input name='email' required /></form>",
+      html: "",
+      content: ""
+    },
+    %{
+      snippet_description:
+        "allows HTML5 elements (such as fieldset and legend) and forms as part of codeblocks",
+      input:
+        "```html\n<form name='login'>\n<fieldset>\n<legend>Email</legend>\n<input type='email' name='email' required />\n</fieldset>\n<div>\n<input type='submit' value='login' />\n<button type='button' id='validate'>\nvalidate\n</button>\n</div>\n</form>\n```",
+      html:
+        "<pre><code class=\"html language-html\">&lt;form name='login'&gt;\n&lt;fieldset&gt;\n&lt;legend&gt;Email&lt;/legend&gt;\n&lt;input type='email' name='email' required /&gt;\n&lt;/fieldset&gt;\n&lt;div&gt;\n&lt;input type='submit' value='login' /&gt;\n&lt;button type='button' id='validate'&gt;\nvalidate\n&lt;/button&gt;\n&lt;/div&gt;\n&lt;/form&gt;</code></pre>",
+      content: "Email\n\n\n\n\n\nvalidate"
     }
   ]
 
   describe "to_html/1" do
     # Two tests are generated for every entry in @to_html_data.
-    for %{input: input, html: html} <- @to_html_data do
-      @input input
-      @html html
+    for case <- @to_html_data do
+      @input Map.get(case, :input)
+      @html Map.get(case, :html)
       # Test names use the entry's :snippet_description when it has one and fall
       # back to the inspected input otherwise.
+      @test_context Map.get(case, :snippet_description, inspect(@input))
 
-      test "converts markdown '#{inspect(@input)}' into html live" do
+      test "converts markdown '#{@test_context}' into html live" do
         assert Markdown.to_html_live(@input) == String.trim(@html)
       end
 
       # The cached path must return exactly what the live path returns.
-      test "live and cached produce same value for '#{inspect(@input)}'" do
+      test "live and cached produce same value for '#{@test_context}'" do
         assert Markdown.to_html_live(@input) == Markdown.to_html(@input)
       end
     end
   end
 
   describe "to_content/1" do
-    for %{input: input, content: content} <- @to_html_data do
-      @input input
-      @content content
+    for case <- @to_html_data do
+      @input Map.get(case, :input)
+      @content Map.get(case, :content)
+      @test_context Map.get(case, :snippet_description, inspect(@input))
 
-      test "gests content out of markdown '#{inspect(@input)}'" do
+      test "gests content out of markdown '#{@test_context}'" do
         assert Markdown.to_content(@input) == String.trim(@content)
       end
     end