Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: CDATA nesting into CDATA is avoided as it is prohibited #231

Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/cc2olx/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"
4 changes: 3 additions & 1 deletion src/cc2olx/olx.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from cc2olx.iframe_link_parser import KalturaIframeLinkParser

from cc2olx.qti import QtiExport
from cc2olx.utils import element_builder, passport_file_parser
from cc2olx.utils import clean_from_cdata, element_builder, passport_file_parser

logger = logging.getLogger()

Expand Down Expand Up @@ -363,6 +363,7 @@ def _process_html(self, details):
html = self._process_static_links(details["html"])
if self.link_file:
html, video_olx = self._process_html_for_iframe(html)
html = clean_from_cdata(html)
txt = self.doc.createCDATASection(html)
child.appendChild(txt)
nodes.append(child)
Expand Down Expand Up @@ -434,6 +435,7 @@ def _create_discussion_node(self, details):
node.setAttribute("discussion_target", details["title"])
html_node = self.doc.createElement("html")
txt = "MISSING CONTENT" if details["text"] is None else details["text"]
txt = clean_from_cdata(txt)
txt = self.doc.createCDATASection(txt)
html_node.appendChild(txt)
return [html_node, node]
Expand Down
15 changes: 15 additions & 0 deletions src/cc2olx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import csv
import re

from cc2olx.constants import CDATA_PATTERN

logger = logging.getLogger()


Expand Down Expand Up @@ -108,3 +110,16 @@ def clean_file_name(filename: str):

cleaned_name = re.sub(special_characters, "_", filename)
return cleaned_name


def clean_from_cdata(xml_string: str) -> str:
"""
Deletes CDATA tag from XML string while keeping its content.

Args:
xml_string (str): original XML string to clean.

Returns:
str: cleaned XML string.
"""
return re.sub(CDATA_PATTERN, r"\g<content>", xml_string, flags=re.DOTALL)
63 changes: 63 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import shutil
import zipfile

import xml.dom.minidom
from pathlib import Path
from tempfile import NamedTemporaryFile
from xml.dom.minidom import parse
Expand All @@ -12,6 +13,7 @@

from cc2olx.cli import parse_args
from cc2olx.models import Cartridge
from cc2olx.olx import OlxExport
from cc2olx.settings import collect_settings


Expand Down Expand Up @@ -242,3 +244,64 @@ def transcript_file(fixtures_data_dir):
fixtures_data_dir / "video_files/01___Intro_to_Knowledge_Based_AI/0 - Introductions.en.srt"
)
return transcript_file_path


@pytest.fixture(scope="session")
def html_without_cdata(fixtures_data_dir: Path) -> str:
"""
HTML string that doesn't contain CDATA sections.

Args:
fixtures_data_dir (str): Path to the directory where fixture data is present.

Returns:
str: HTML string.
"""
html_without_cdata_path = fixtures_data_dir / "html_files/html-without-cdata.html"
return html_without_cdata_path.read_text()


@pytest.fixture(scope="session")
def cdata_containing_html(fixtures_data_dir: Path) -> str:
"""
HTML string that contains CDATA sections.

Args:
fixtures_data_dir (str): Path to the directory where fixture data is present.

Returns:
str: HTML string.
"""
html_without_cdata_path = fixtures_data_dir / "html_files/cdata-containing-html.html"
return html_without_cdata_path.read_text()


@pytest.fixture(scope="session")
def expected_cleaned_cdata_containing_html(fixtures_data_dir: Path) -> str:
"""
The string with expected HTML after cleaning from CDATA sections.

Args:
fixtures_data_dir (str): Path to the directory where fixture data is present.

Returns:
str: HTML string.
"""
html_without_cdata_path = fixtures_data_dir / "html_files/cleaned-cdata-containing-html.html"
return html_without_cdata_path.read_text()


@pytest.fixture
def bare_olx_exporter(cartridge: Cartridge) -> OlxExport:
"""
Provides bare OLX exporter.

Args:
cartridge (Cartridge): Cartridge class instance.

Returns:
OlxExport: OlxExport instance.
"""
olx_exporter = OlxExport(cartridge)
olx_exporter.doc = xml.dom.minidom.Document()
return olx_exporter
15 changes: 15 additions & 0 deletions tests/fixtures_data/html_files/cdata-containing-html.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>CDATA containing HTML document</title>
</head>
<body>
<script type="text/javascript">
<![CDATA[
var htmlContent = "<div>Hello, world!</div>";
alert(htmlContent);
]]>
</script>
</body>
</html>
15 changes: 15 additions & 0 deletions tests/fixtures_data/html_files/cleaned-cdata-containing-html.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>CDATA containing HTML document</title>
</head>
<body>
<script type="text/javascript">

var htmlContent = "<div>Hello, world!</div>";
alert(htmlContent);

</script>
</body>
</html>
13 changes: 13 additions & 0 deletions tests/fixtures_data/html_files/html-without-cdata.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>HTML document without CDATA</title>
</head>
<body>
<script type="text/javascript">
var htmlContent = "<div>Hello, world!</div>";
alert(htmlContent);
</script>
</body>
</html>
35 changes: 35 additions & 0 deletions tests/fixtures_data/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from cc2olx.utils import clean_from_cdata


class TestXMLCleaningFromCDATA:
"""
Tests XML string cleaning from CDATA sections.
"""

def test_cdata_containing_html_is_cleaned_successfully(
self,
cdata_containing_html: str,
expected_cleaned_cdata_containing_html: str,
) -> None:
"""
Test if CDATA tags are removed from HTML while their content is kept.

Args:
cdata_containing_html (str): HTML that contains CDATA tags.
expected_cleaned_cdata_containing_html (str): Expected HTML after
successful cleaning.
"""
actual_cleaned_cdata_containing_html = clean_from_cdata(cdata_containing_html)

assert actual_cleaned_cdata_containing_html == expected_cleaned_cdata_containing_html

def test_html_without_cdata_remains_the_same_after_cleaning(self, html_without_cdata: str) -> None:
"""
Test if HTML that doesn't contain CDATA tags remains the same.

Args:
html_without_cdata (str): HTML that doesn't contains CDATA tags.
"""
actual_cleaned_html_without_cdata = clean_from_cdata(html_without_cdata)

assert actual_cleaned_html_without_cdata == html_without_cdata
104 changes: 102 additions & 2 deletions tests/test_olx.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import json
from unittest.mock import Mock

import lxml
import xml.dom.minidom

from cc2olx import olx

from .utils import format_xml
import xml.dom.minidom
import lxml


def test_olx_export_xml(cartridge, link_map_csv, studio_course_xml):
Expand Down Expand Up @@ -36,6 +40,54 @@ def test_process_link():
)


class TestOlXExporeterHTMLProcessing:
"""
Test the OLX exporter for HTML parsing flow.
"""

def test_html_cleaning_from_cdata(
self,
mocker,
bare_olx_exporter,
cdata_containing_html,
expected_cleaned_cdata_containing_html,
):
"""
Test that CDATA cleaning function is called during HTML processing.

Args:
mocker (MockerFixture): MockerFixture instance.
bare_olx_exporter (OlxExport): bare OLX exporter.
cdata_containing_html (str): HTML that contains CDATA tags.
expected_cleaned_cdata_containing_html (str): Expected HTML after
successful cleaning.
"""
details = {"html": cdata_containing_html}

clean_from_cdata_mock = mocker.patch(
"cc2olx.olx.clean_from_cdata",
return_value=expected_cleaned_cdata_containing_html,
)

bare_olx_exporter._process_html(details)

clean_from_cdata_mock.assert_called_once()

def test_processed_html_content_is_wrapped_into_cdata(self, bare_olx_exporter, cdata_containing_html):
"""
Test that processed HTML content is wrapped into CDATA section.

Args:
bare_olx_exporter (OlxExport): bare OLX exporter.
cdata_containing_html (str): HTML that contains CDATA tags.
"""
details = {"html": cdata_containing_html}

result_html, *__ = bare_olx_exporter._process_html(details)

assert isinstance(result_html.childNodes[0], xml.dom.minidom.CDATASection)


class TestOlXExporeterIframeParser:
"""
Test the olx exporter for iframe link parsing flow
Expand Down Expand Up @@ -141,3 +193,51 @@ def test_policy_contains_advanced_module(self, cartridge, passports_csv, caplog)
assert ["Missing LTI Passport for learning_tools_interoperability. Using default."] == [
rec.message for rec in caplog.records
]


class TestDiscussionParsing:
"""
Test the OLX exporter for discussion parsing flow.
"""

def test_discussion_content_cleaning_from_cdata(
self,
mocker,
bare_olx_exporter,
cdata_containing_html,
expected_cleaned_cdata_containing_html,
):
"""
Test that CDATA cleaning function is called during discussion parsing.

Args:
mocker (MockerFixture): MockerFixture instance.
bare_olx_exporter (OlxExport): bare OLX exporter.
cdata_containing_html (str): HTML that contains CDATA tags.
expected_cleaned_cdata_containing_html (str): Expected HTML after
successful cleaning.
"""
details = {"dependencies": [], "title": Mock(), "text": cdata_containing_html}

clean_from_cdata_mock = mocker.patch(
"cc2olx.olx.clean_from_cdata",
return_value=expected_cleaned_cdata_containing_html,
)

bare_olx_exporter._create_discussion_node(details)

clean_from_cdata_mock.assert_called_once()

def test_discussion_decription_is_wrapped_into_cdata(self, bare_olx_exporter, cdata_containing_html):
"""
Test that processed HTML content is wrapped into CDATA section.

Args:
bare_olx_exporter (OlxExport): bare OLX exporter.
cdata_containing_html (str): HTML that contains CDATA tags.
"""
details = {"dependencies": [], "title": Mock(), "text": cdata_containing_html}

discussion_decription_html, __ = bare_olx_exporter._create_discussion_node(details)

assert isinstance(discussion_decription_html.childNodes[0], xml.dom.minidom.CDATASection)