Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closing tags behavior #492

Merged
merged 5 commits into from
Nov 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 157 additions & 11 deletions src/main/java/org/owasp/validator/html/scan/ASHTMLSerializer.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,31 @@

import java.io.IOException;
import java.io.Writer;
import java.util.Locale;
import org.apache.xml.serialize.ElementState;
import org.apache.xml.serialize.HTMLdtd;
import org.apache.xml.serialize.OutputFormat;
import org.owasp.validator.html.InternalPolicy;
import org.owasp.validator.html.TagMatcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Attr;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

@SuppressWarnings("deprecation")
public class ASHTMLSerializer extends org.apache.xml.serialize.HTMLSerializer {

private static final Logger logger = LoggerFactory.getLogger(ASHTMLSerializer.class);
private boolean encodeAllPossibleEntities;
private final TagMatcher allowedEmptyTags;
private final TagMatcher requireClosingTags;

public ASHTMLSerializer(Writer w, OutputFormat format, InternalPolicy policy) {
super(w, format);
this.allowedEmptyTags = policy.getAllowedEmptyTags();
this.requireClosingTags = policy.getRequiresClosingTags();
this.encodeAllPossibleEntities = policy.isEntityEncodeIntlCharacters();
}

Expand All @@ -27,6 +37,128 @@ protected String getEntityRef(int charToPrint) {
return null;
}

/**
* Called to serialize a DOM element. Equivalent to calling {@link #startElement}, {@link
* #endElement} and serializing everything inbetween, but better optimized.
*/
@Override
protected void serializeElement(Element elem) throws IOException {
Attr attr;
NamedNodeMap attrMap;
int i;
Node child;
ElementState state;
boolean preserveSpace;
String name;
String value;
String tagName;

tagName = elem.getTagName();
state = getElementState();
if (isDocumentState()) {
// If this is the root element handle it differently.
// If the first root element in the document, serialize
// the document's DOCTYPE. Space preserving defaults
// to that of the output format.
if (!_started) startDocument(tagName);
} else {
// For any other element, if first in parent, then
// close parent's opening tag and use the parnet's
// space preserving.
if (state.empty) _printer.printText('>');
// Indent this element on a new line if the first
// content of the parent element or immediately
// following an element.
if (_indenting && !state.preserveSpace && (state.empty || state.afterElement))
_printer.breakLine();
}
preserveSpace = state.preserveSpace;

// Do not change the current element state yet.
// This only happens in endElement().

// XHTML: element names are lower case, DOM will be different
_printer.printText('<');
_printer.printText(tagName);
_printer.indent();

// Lookup the element's attribute, but only print specified
// attributes. (Unspecified attributes are derived from the DTD.
// For each attribute print it's name and value as one part,
// separated with a space so the element can be broken on
// multiple lines.
attrMap = elem.getAttributes();
if (attrMap != null) {
for (i = 0; i < attrMap.getLength(); ++i) {
attr = (Attr) attrMap.item(i);
name = attr.getName().toLowerCase(Locale.ENGLISH);
value = attr.getValue();
if (attr.getSpecified()) {
_printer.printSpace();
// HTML: Empty values print as attribute name, no value.
// HTML: URI attributes will print unescaped
if (value == null) {
value = "";
}
if (!_format.getPreserveEmptyAttributes() && value.length() == 0)
_printer.printText(name);
else if (HTMLdtd.isURI(tagName, name)) {
_printer.printText(name);
_printer.printText("=\"");
_printer.printText(escapeURI(value));
_printer.printText('"');
} else if (HTMLdtd.isBoolean(tagName, name)) _printer.printText(name);
else {
_printer.printText(name);
_printer.printText("=\"");
printEscaped(value);
_printer.printText('"');
}
}
}
}
if (HTMLdtd.isPreserveSpace(tagName)) preserveSpace = true;

// If element has children, or if element is not an empty tag,
// serialize an opening tag.
if (elem.hasChildNodes() || !HTMLdtd.isEmptyTag(tagName)) {
// Enter an element state, and serialize the children
// one by one. Finally, end the element.
state = enterElementState(null, null, tagName, preserveSpace);

// Prevents line breaks inside A/TD
if (tagName.equalsIgnoreCase("A") || tagName.equalsIgnoreCase("TD")) {
state.empty = false;
_printer.printText('>');
}

// Handle SCRIPT and STYLE specifically by changing the
// state of the current element to CDATA (XHTML) or
// unescaped (HTML).
if (tagName.equalsIgnoreCase("SCRIPT") || tagName.equalsIgnoreCase("STYLE")) {
// HTML: Print contents unescaped
state.unescaped = true;
}
child = elem.getFirstChild();
while (child != null) {
serializeNode(child);
child = child.getNextSibling();
}
endElementIO(null, null, tagName);
} else {
_printer.unindent();
// XHTML: Close empty tag with ' />' so it's XML and HTML compatible.
// HTML: Empty tags are defined as such in DTD no in document.
if (!elem.hasChildNodes() && isAllowedEmptyTag(tagName) && !requiresClosingTag(tagName))
_printer.printText("/>");
else _printer.printText('>');
// After element but parent element is no longer empty.
state.afterElement = true;
state.empty = false;
if (isDocumentState()) _printer.flush();
}
}

@Override
public void endElementIO(String namespaceURI, String localName, String rawName)
throws IOException {
Expand All @@ -38,17 +170,23 @@ public void endElementIO(String namespaceURI, String localName, String rawName)
_printer.unindent();
state = getElementState();

if (state.empty) _printer.printText('>');
// This element is not empty and that last content was another element, so print a line break
// before that last element and this element's closing tag. [keith] Provided this is not an
// anchor. HTML: some elements do not print closing tag (e.g. LI)
if (rawName == null || !HTMLdtd.isOnlyOpening(rawName) || HTMLdtd.isOptionalClosing(rawName)) {
if (_indenting && !state.preserveSpace && state.afterElement) _printer.breakLine();
// Must leave CData section first (Illegal in HTML, but still)
if (state.inCData) _printer.printText("]]>");
_printer.printText("</");
_printer.printText(state.rawName);
_printer.printText('>');
if (state.empty && isAllowedEmptyTag(rawName) && !requiresClosingTag(rawName)) { //
_printer.printText("/>");
} else {
if (state.empty) _printer.printText('>');
// This element is not empty and that last content was another element, so print a line break
// before that last element and this element's closing tag. [keith] Provided this is not an
// anchor. HTML: some elements do not print closing tag (e.g. LI)
if (rawName == null
|| !HTMLdtd.isOnlyOpening(rawName)
|| HTMLdtd.isOptionalClosing(rawName)) {
if (_indenting && !state.preserveSpace && state.afterElement) _printer.breakLine();
// Must leave CData section first (Illegal in HTML, but still)
if (state.inCData) _printer.printText("]]>");
_printer.printText("</");
_printer.printText(state.rawName);
_printer.printText('>');
}
}

// Leave the element state and update that of the parent (if we're not root) to not empty and
Expand Down Expand Up @@ -76,4 +214,12 @@ protected String escapeURI(String uri) {
}
return "";
}

private boolean requiresClosingTag(String tagName) {
return requireClosingTags.matches(tagName);
}

private boolean isAllowedEmptyTag(String tagName) {
return "head".equals(tagName) || allowedEmptyTags.matches(tagName);
}
}
51 changes: 37 additions & 14 deletions src/test/java/org/owasp/validator/html/test/AntiSamyTest.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2007-2023, Arshan Dabirsiaghi, Jason Li
* Copyright (c) 2007-2024, Arshan Dabirsiaghi, Jason Li
*
* All rights reserved.
*
Expand Down Expand Up @@ -48,7 +48,10 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.codec.binary.Base64;
Expand All @@ -63,6 +66,7 @@
import org.owasp.validator.html.model.Attribute;
import org.owasp.validator.html.model.Property;
import org.owasp.validator.html.model.Tag;
import org.owasp.validator.html.scan.Constants;

/**
* This class tests AntiSamy functionality and the basic policy file which should be immune to XSS
Expand Down Expand Up @@ -100,6 +104,7 @@ public class AntiSamyTest {

private AntiSamy as = new AntiSamy();
private TestPolicy policy = null;
private ResourceBundle messages = null;

@Before
public void setUp() throws Exception {
Expand All @@ -111,6 +116,13 @@ public void setUp() throws Exception {
// get Policy instance from a URL.
URL url = getClass().getResource("/antisamy.xml");
policy = TestPolicy.getInstance(url);
try {
messages = ResourceBundle.getBundle("AntiSamy", Locale.getDefault());
} catch (MissingResourceException mre) {
messages =
ResourceBundle.getBundle(
"AntiSamy", new Locale(Constants.DEFAULT_LOCALE_LANG, Constants.DEFAULT_LOCALE_LOC));
}
}

@Test
Expand Down Expand Up @@ -868,8 +880,8 @@ public void issue12() throws ScanException, PolicyException {
assertFalse(p.matcher(s1).matches());
assertFalse(p.matcher(s2).matches());

assertThat(s1, containsString("<hr>"));
assertThat(s2, containsString("<hr>"));
assertThat(s1, containsString("<hr/>"));
assertThat(s2, containsString("<hr/>"));
}

@Test
Expand Down Expand Up @@ -1431,7 +1443,7 @@ public void issue112() throws ScanException, PolicyException {

StringBuilder sb = new StringBuilder();
sb.append("<html><head><title>foobar</title></head><body>");
sb.append("<img src=\"http://foobar.com/pic.gif\"></body></html>");
sb.append("<img src=\"http://foobar.com/pic.gif\"/></body></html>");

html = sb.toString();

Expand Down Expand Up @@ -1572,12 +1584,12 @@ public void validateParamAsEmbed() throws ScanException, PolicyException {
String input =
"<object width=\"560\" height=\"340\"><param name=\"movie\" value=\"http://www.youtube.com/v/IyAyd4WnvhU&hl=en&fs=1&\"></param><param name=\"allowFullScreen\" value=\"true\"></param><param name=\"allowscriptaccess\" value=\"always\"></param><embed src=\"http://www.youtube.com/v/IyAyd4WnvhU&hl=en&fs=1&\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" allowfullscreen=\"true\" width=\"560\" height=\"340\"></embed></object>";
String expectedOutput =
"<object height=\"340\" width=\"560\"><param name=\"movie\" value=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\"><param name=\"allowFullScreen\" value=\"true\"><param name=\"allowscriptaccess\" value=\"always\"><embed allowfullscreen=\"true\" allowscriptaccess=\"always\" height=\"340\" src=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\" type=\"application/x-shockwave-flash\" width=\"560\"></embed></object>";
"<object height=\"340\" width=\"560\"><param name=\"movie\" value=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\"/><param name=\"allowFullScreen\" value=\"true\"/><param name=\"allowscriptaccess\" value=\"always\"/><embed allowfullscreen=\"true\" allowscriptaccess=\"always\" height=\"340\" src=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\" type=\"application/x-shockwave-flash\" width=\"560\"/></object>";
CleanResults cr = as.scan(input, revised, AntiSamy.DOM);
assertThat(cr.getCleanHTML(), containsString(expectedOutput));

String saxExpectedOutput =
"<object width=\"560\" height=\"340\"><param name=\"movie\" value=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\"><param name=\"allowFullScreen\" value=\"true\"><param name=\"allowscriptaccess\" value=\"always\"><embed src=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" allowfullscreen=\"true\" width=\"560\" height=\"340\"></embed></object>";
"<object width=\"560\" height=\"340\"><param name=\"movie\" value=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\"/><param name=\"allowFullScreen\" value=\"true\"/><param name=\"allowscriptaccess\" value=\"always\"/><embed src=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" allowfullscreen=\"true\" width=\"560\" height=\"340\"/></object>";
cr = as.scan(input, revised, AntiSamy.SAX);
assertThat(cr.getCleanHTML(), equalTo(saxExpectedOutput));

Expand All @@ -1586,9 +1598,9 @@ public void validateParamAsEmbed() throws ScanException, PolicyException {
input =
"<object width=\"560\" height=\"340\"><param name=\"movie\" value=\"http://supermaliciouscode.com/badstuff.swf\"></param><param name=\"allowFullScreen\" value=\"true\"></param><param name=\"allowscriptaccess\" value=\"always\"></param><embed src=\"http://www.youtube.com/v/IyAyd4WnvhU&hl=en&fs=1&\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" allowfullscreen=\"true\" width=\"560\" height=\"340\"></embed></object>";
expectedOutput =
"<object height=\"340\" width=\"560\"><param name=\"allowFullScreen\" value=\"true\"><param name=\"allowscriptaccess\" value=\"always\"><embed allowfullscreen=\"true\" allowscriptaccess=\"always\" height=\"340\" src=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\" type=\"application/x-shockwave-flash\" width=\"560\"></embed></object>";
"<object height=\"340\" width=\"560\"><param name=\"allowFullScreen\" value=\"true\"/><param name=\"allowscriptaccess\" value=\"always\"/><embed allowfullscreen=\"true\" allowscriptaccess=\"always\" height=\"340\" src=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\" type=\"application/x-shockwave-flash\" width=\"560\"/></object>";
saxExpectedOutput =
"<object width=\"560\" height=\"340\"><param name=\"allowFullScreen\" value=\"true\"><param name=\"allowscriptaccess\" value=\"always\"><embed src=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" allowfullscreen=\"true\" width=\"560\" height=\"340\"></embed></object>";
"<object width=\"560\" height=\"340\"><param name=\"allowFullScreen\" value=\"true\"/><param name=\"allowscriptaccess\" value=\"always\"/><embed src=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" allowfullscreen=\"true\" width=\"560\" height=\"340\"/></object>";
cr = as.scan(input, revised, AntiSamy.DOM);
assertThat(cr.getCleanHTML(), containsString(expectedOutput));

Expand All @@ -1600,9 +1612,9 @@ public void validateParamAsEmbed() throws ScanException, PolicyException {
input =
"<object width=\"560\" height=\"340\"><param name=\"movie\" value=\"http://www.youtube.com/v/IyAyd4WnvhU&hl=en&fs=1&\"></param><param name=\"allowFullScreen\" value=\"true\"></param><param name=\"allowscriptaccess\" value=\"always\"></param><embed src=\"http://hereswhereikeepbadcode.com/ohnoscary.swf\" type=\"application/x-shockwave-flash\" allowscriptaccess=\"always\" allowfullscreen=\"true\" width=\"560\" height=\"340\"></embed></object>";
expectedOutput =
"<object height=\"340\" width=\"560\"><param name=\"movie\" value=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\"><param name=\"allowFullScreen\" value=\"true\"><param name=\"allowscriptaccess\" value=\"always\"></object>";
"<object height=\"340\" width=\"560\"><param name=\"movie\" value=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\"/><param name=\"allowFullScreen\" value=\"true\"/><param name=\"allowscriptaccess\" value=\"always\"/></object>";
saxExpectedOutput =
"<object width=\"560\" height=\"340\"><param name=\"movie\" value=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\"><param name=\"allowFullScreen\" value=\"true\"><param name=\"allowscriptaccess\" value=\"always\"></object>";
"<object width=\"560\" height=\"340\"><param name=\"movie\" value=\"http://www.youtube.com/v/IyAyd4WnvhU&amp;hl=en&amp;fs=1&amp;\"/><param name=\"allowFullScreen\" value=\"true\"/><param name=\"allowscriptaccess\" value=\"always\"/></object>";

cr = as.scan(input, revised, AntiSamy.DOM);
assertThat(cr.getCleanHTML(), containsString(expectedOutput));
Expand Down Expand Up @@ -1799,10 +1811,10 @@ public void testXSSInAntiSamy151() throws ScanException, PolicyException {
CleanResults results_sax = as.scan(test, policy, AntiSamy.SAX);
CleanResults results_dom = as.scan(test, policy, AntiSamy.DOM);

assertEquals(results_sax.getCleanHTML(), results_dom.getCleanHTML());
assertEquals(
"whatever<img src=\"https://ssl.gstatic.com/codesite/ph/images/defaultlogo.png\">",
results_dom.getCleanHTML());
"whatever<img src=\"https://ssl.gstatic.com/codesite/ph/images/defaultlogo.png\"/>",
results_sax.getCleanHTML());
assertEquals(results_sax.getCleanHTML(), results_dom.getCleanHTML());
}

@Test
Expand Down Expand Up @@ -1845,7 +1857,7 @@ public void testStreamScan() throws ScanException {
Writer writer = new StringWriter();
as.scan(reader, writer, policy);
String cleanHtml = writer.toString().trim();
assertEquals("whatever" + testImgSrcURL + ">", cleanHtml);
assertEquals("whatever" + testImgSrcURL + "/>", cleanHtml);
}

@Test
Expand Down Expand Up @@ -2702,4 +2714,15 @@ public void testGithubIssue453() throws ScanException, PolicyException {
+ "<select name=\"Lang\"> "
+ "<option value=\"da\">Dansk</option> "));
}

@Test
public void testGithubIssue484() throws ScanException, PolicyException {
String s = "<p>this is para data</p><br/><p>this is para data 2</p>";
CleanResults crDom = as.scan(s, policy, AntiSamy.DOM);
CleanResults crSax = as.scan(s, policy, AntiSamy.SAX);
String domValue = crDom.getCleanHTML();
String saxValue = crSax.getCleanHTML();
assertEquals("<p>this is para data</p>\n" + "<br/>\n" + "<p>this is para data 2</p>", domValue);
assertEquals("<p>this is para data</p>\n" + "<br/>\n" + "<p>this is para data 2</p>", saxValue);
}
}
Loading