Skip to content

Commit

Permalink
fix(markdown): convert markdown to html, then sanitize
Browse files Browse the repository at this point in the history
revert the scrubber back from `html5` to a custom scrubber
that is a variation of the markdown scrubber from html_sanitize_ex

in addition:
* add a test verifying that inputs and forms are NOT allowed in
  markdown content
* add a test verifying that inputs, forms, and html5 tags are
  allowed within code blocks in markdown content
  • Loading branch information
avogel3 committed Apr 11, 2023
1 parent e8d14d3 commit abf05fa
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 16 deletions.
73 changes: 73 additions & 0 deletions lib/tilex/blog/post_scrubber.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
defmodule Tilex.Blog.PostScrubber do
@moduledoc """
Custom HtmlSanitizeEx scrubber defining which HTML tags and attributes are kept.

PostScrubber is mostly a copy of the HtmlSanitizeEx library's markdown_html scrubber.
The only difference is under the "Additions" comment below, where extra tags are allowed.
For more info on customizing scrubbers, see the docs for HtmlSanitizeEx.
"""

# Meta must be required because the rules below are declared with its macros.
require HtmlSanitizeEx.Scrubber.Meta
alias HtmlSanitizeEx.Scrubber.Meta

# URI schemes accepted for the URI-valued attributes declared below
# (`href` on <a> and `src` on <img>).
@valid_schemes ["http", "https", "mailto"]

# Removes any CDATA tags before the traverser/scrubber runs.
Meta.remove_cdata_sections_before_scrub()

# HTML comments are stripped from the output.
Meta.strip_comments()

# Links: `href` is limited to @valid_schemes; `name` and `title` are allowed.
Meta.allow_tag_with_uri_attributes("a", ["href"], @valid_schemes)
Meta.allow_tag_with_these_attributes("a", ["name", "title"])

# `target` is only accepted with the value "_blank".
Meta.allow_tag_with_this_attribute_values("a", "target", ["_blank"])

# `rel` is only accepted with the values listed here.
Meta.allow_tag_with_this_attribute_values("a", "rel", [
"noopener",
"noreferrer"
])

# Basic formatting and heading tags. An empty list means the tag is allowed
# without any whitelisted attributes; only <code> may carry `class`.
Meta.allow_tag_with_these_attributes("b", [])
Meta.allow_tag_with_these_attributes("blockquote", [])
Meta.allow_tag_with_these_attributes("br", [])
Meta.allow_tag_with_these_attributes("code", ["class"])
Meta.allow_tag_with_these_attributes("del", [])
Meta.allow_tag_with_these_attributes("em", [])
Meta.allow_tag_with_these_attributes("h1", [])
Meta.allow_tag_with_these_attributes("h2", [])
Meta.allow_tag_with_these_attributes("h3", [])
Meta.allow_tag_with_these_attributes("h4", [])
Meta.allow_tag_with_these_attributes("h5", [])
Meta.allow_tag_with_these_attributes("h6", [])
Meta.allow_tag_with_these_attributes("hr", [])
Meta.allow_tag_with_these_attributes("i", [])

# Images: `src` is limited to @valid_schemes; sizing and text attributes are allowed.
Meta.allow_tag_with_uri_attributes("img", ["src"], @valid_schemes)

Meta.allow_tag_with_these_attributes("img", [
"width",
"height",
"title",
"alt"
])

# Lists, paragraphs, preformatted text, inline spans and tables.
Meta.allow_tag_with_these_attributes("li", [])
Meta.allow_tag_with_these_attributes("ol", [])
Meta.allow_tag_with_these_attributes("p", [])
Meta.allow_tag_with_these_attributes("pre", [])
Meta.allow_tag_with_these_attributes("span", [])
Meta.allow_tag_with_these_attributes("strong", [])
Meta.allow_tag_with_these_attributes("table", [])
Meta.allow_tag_with_these_attributes("tbody", [])
Meta.allow_tag_with_these_attributes("td", [])
Meta.allow_tag_with_these_attributes("th", [])
Meta.allow_tag_with_these_attributes("thead", [])
Meta.allow_tag_with_these_attributes("tr", [])
Meta.allow_tag_with_these_attributes("u", [])
Meta.allow_tag_with_these_attributes("ul", [])

# Additions
# <div> is allowed here (without attributes) on top of the copied rule set.
Meta.allow_tag_with_these_attributes("div", [])

# NOTE(review): presumably a catch-all that strips anything not declared above,
# and therefore has to remain the last rule — confirm against the HtmlSanitizeEx docs.
Meta.strip_everything_not_covered()
end
14 changes: 7 additions & 7 deletions lib/tilex/markdown.ex
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
defmodule Tilex.Markdown do
alias Tilex.Cache
alias Tilex.Blog.PostScrubber
alias HtmlSanitizeEx.Scrubber

@earmark_options %Earmark.Options{
code_class_prefix: "language-",
Expand All @@ -18,24 +20,22 @@ defmodule Tilex.Markdown do
# Renders `markdown` to HTML with Earmark (using @earmark_options), sanitizes
# the result and trims surrounding whitespace. Earmark.as_html!/2 is the
# raising variant, so rendering errors are not handled here.
#
# NOTE(review): this hunk shows both `HtmlSanitizeEx.html5()` and
# `sanitize_markdown_html()`; it looks like a diff rendering without +/-
# markers in which the former is the removed line and the latter its
# replacement — confirm against the actual file.
def to_html_live(markdown) do
markdown
|> Earmark.as_html!(@earmark_options)
|> HtmlSanitizeEx.html5()
|> sanitize_markdown_html()
|> String.trim()
end

# Cached counterpart of to_html_live/1: hands `markdown` and a zero-arity
# function computing to_html_live(markdown) to Cache.cache/2.
# Presumably the markdown string serves as the cache key — verify in Tilex.Cache.
#
# NOTE(review): two equivalent definitions appear here (multi-line and
# one-line form); this looks like the before/after of a diff rendered without
# +/- markers — confirm which one the actual file contains.
def to_html(markdown) do
Cache.cache(markdown, fn ->
to_html_live(markdown)
end)
end
def to_html(markdown), do: Cache.cache(markdown, fn -> to_html_live(markdown) end)

# Extracts the plain-text content of `markdown`: renders it with Earmark
# (using @content_earmark_options), sanitizes the HTML, parses it with
# Floki.parse_fragment/1 and returns the trimmed Floki.text/1 of the fragment.
# If parsing returns anything other than {:ok, fragment}, the original
# markdown is returned unchanged.
#
# NOTE(review): this hunk shows both `HtmlSanitizeEx.html5()` and
# `sanitize_markdown_html()`; it looks like a diff rendering without +/-
# markers in which the former is the removed line and the latter its
# replacement — confirm against the actual file.
def to_content(markdown) do
markdown
|> Earmark.as_html!(@content_earmark_options)
|> HtmlSanitizeEx.html5()
|> sanitize_markdown_html()
|> Floki.parse_fragment()
|> case do
{:ok, fragment} -> fragment |> Floki.text() |> String.trim()
_error -> markdown
end
end

# Scrubs already-rendered HTML using the rule set declared in PostScrubber.
defp sanitize_markdown_html(html) do
  Scrubber.scrub(html, PostScrubber)
end
end
35 changes: 26 additions & 9 deletions test/lib/tilex/markdown_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -200,30 +200,47 @@ defmodule Lib.Tilex.MarkdownTest do
input: "regular script <script>alert('attack')</script>",
html: "<p>\nregular script &lt;script&gt;alert('attack')&lt;/script&gt;</p>",
content: "regular script alert('attack')"
},
%{
snippet_description: "does not allow inputs or forms to be rendered as html",
input: "<form><input name='email' required /></form>",
html: "",
content: ""
},
%{
snippet_description:
"allows HTML5 elements (such as fieldset and legend) and forms as part of codeblocks",
input:
"```html\n<form name='login'>\n<fieldset>\n<legend>Email</legend>\n<input type='email' name='email' required />\n</fieldset>\n<div>\n<input type='submit' value='login' />\n<button type='button' id='validate'>\nvalidate\n</button>\n</div>\n</form>\n```",
html:
"<pre><code class=\"html language-html\">&lt;form name='login'&gt;\n&lt;fieldset&gt;\n&lt;legend&gt;Email&lt;/legend&gt;\n&lt;input type='email' name='email' required /&gt;\n&lt;/fieldset&gt;\n&lt;div&gt;\n&lt;input type='submit' value='login' /&gt;\n&lt;button type='button' id='validate'&gt;\nvalidate\n&lt;/button&gt;\n&lt;/div&gt;\n&lt;/form&gt;</code></pre>",
content: "Email\n\n\n\n\n\nvalidate"
}
]

describe "to_html/1" do
for %{input: input, html: html} <- @to_html_data do
@input input
@html html
for case <- @to_html_data do
@input Map.get(case, :input)
@html Map.get(case, :html)
@test_context Map.get(case, :snippet_description, inspect(@input))

test "converts markdown '#{inspect(@input)}' into html live" do
test "converts markdown '#{@test_context}' into html live" do
assert Markdown.to_html_live(@input) == String.trim(@html)
end

test "live and cached produce same value for '#{inspect(@input)}'" do
test "live and cached produce same value for '#{@test_context}'" do
assert Markdown.to_html_live(@input) == Markdown.to_html(@input)
end
end
end

describe "to_content/1" do
for %{input: input, content: content} <- @to_html_data do
@input input
@content content
for case <- @to_html_data do
@input Map.get(case, :input)
@content Map.get(case, :content)
@test_context Map.get(case, :snippet_description, inspect(@input))

test "gests content out of markdown '#{inspect(@input)}'" do
test "gests content out of markdown '#{@test_context}'" do
assert Markdown.to_content(@input) == String.trim(@content)
end
end
Expand Down

0 comments on commit abf05fa

Please sign in to comment.