openedx · ormsbee · Dec 11, 2024 · Sep 13, 2024 · Nov 9, 2024 · Nov 9, 2024
diff --git a/src/cc2olx/constants.py b/src/cc2olx/constants.py
@@ -0,0 +1 @@
+CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"
diff --git a/src/cc2olx/olx.py b/src/cc2olx/olx.py
@@ -8,7 +8,7 @@
 from cc2olx.iframe_link_parser import KalturaIframeLinkParser
 
 from cc2olx.qti import QtiExport
-from cc2olx.utils import element_builder, passport_file_parser
+from cc2olx.utils import clean_from_cdata, element_builder, passport_file_parser
 
 logger = logging.getLogger()
 
@@ -363,6 +363,7 @@ def _process_html(self, details):
         html = self._process_static_links(details["html"])
         if self.link_file:
             html, video_olx = self._process_html_for_iframe(html)
+        html = clean_from_cdata(html)
         txt = self.doc.createCDATASection(html)
         child.appendChild(txt)
         nodes.append(child)
@@ -434,6 +435,7 @@ def _create_discussion_node(self, details):
         node.setAttribute("discussion_target", details["title"])
         html_node = self.doc.createElement("html")
         txt = "MISSING CONTENT" if details["text"] is None else details["text"]
+        txt = clean_from_cdata(txt)
         txt = self.doc.createCDATASection(txt)
         html_node.appendChild(txt)
         return [html_node, node]

diff --git a/src/cc2olx/utils.py b/src/cc2olx/utils.py
@@ -5,6 +5,8 @@
 import csv
 import re
 
+from cc2olx.constants import CDATA_PATTERN
+
 logger = logging.getLogger()
 
 
@@ -108,3 +110,16 @@ def clean_file_name(filename: str):
 
     cleaned_name = re.sub(special_characters, "_", filename)
     return cleaned_name
+
+
+def clean_from_cdata(xml_string: str) -> str:
+    """
+    Unwraps every CDATA section in an XML string, keeping the section content.
+
+    Args:
+        xml_string (str): original XML string to clean.
+
+    Returns:
+        str: XML string where each CDATA section is replaced by its content.
+    """
+    return re.sub(CDATA_PATTERN, r"\g<content>", xml_string, flags=re.DOTALL)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -4,6 +4,7 @@
 import shutil
 import zipfile
 
+import xml.dom.minidom
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from xml.dom.minidom import parse
@@ -12,6 +13,7 @@
 
 from cc2olx.cli import parse_args
 from cc2olx.models import Cartridge
+from cc2olx.olx import OlxExport
 from cc2olx.settings import collect_settings
 
 
@@ -242,3 +244,64 @@ def transcript_file(fixtures_data_dir):
         fixtures_data_dir / "video_files/01___Intro_to_Knowledge_Based_AI/0 - Introductions.en.srt"
     )
     return transcript_file_path
+
+
+@pytest.fixture(scope="session")
+def html_without_cdata(fixtures_data_dir: Path) -> str:
+    """
+    HTML string that doesn't contain CDATA sections.
+
+    Args:
+        fixtures_data_dir (Path): Path to the directory where fixture data is present.
+
+    Returns:
+        str: HTML string.
+    """
+    html_without_cdata_path = fixtures_data_dir / "html_files/html-without-cdata.html"
+    return html_without_cdata_path.read_text(encoding="utf-8")
+
+
+@pytest.fixture(scope="session")
+def cdata_containing_html(fixtures_data_dir: Path) -> str:
+    """
+    HTML string that contains CDATA sections.
+
+    Args:
+        fixtures_data_dir (Path): Path to the directory where fixture data is present.
+
+    Returns:
+        str: HTML string.
+    """
+    cdata_containing_html_path = fixtures_data_dir / "html_files/cdata-containing-html.html"
+    return cdata_containing_html_path.read_text(encoding="utf-8")
+
+
+@pytest.fixture(scope="session")
+def expected_cleaned_cdata_containing_html(fixtures_data_dir: Path) -> str:
+    """
+    The string with expected HTML after cleaning from CDATA sections.
+
+    Args:
+        fixtures_data_dir (Path): Path to the directory where fixture data is present.
+
+    Returns:
+        str: HTML string.
+    """
+    expected_cleaned_html_path = fixtures_data_dir / "html_files/cleaned-cdata-containing-html.html"
+    return expected_cleaned_html_path.read_text(encoding="utf-8")
+
+
+@pytest.fixture
+def bare_olx_exporter(cartridge: Cartridge) -> OlxExport:
+    """
+    Provides an OLX exporter whose ``doc`` is a fresh, empty minidom document.
+
+    Args:
+        cartridge (Cartridge): Cartridge class instance.
+
+    Returns:
+        OlxExport: OlxExport instance.
+    """
+    olx_exporter = OlxExport(cartridge)
+    olx_exporter.doc = xml.dom.minidom.Document()
+    return olx_exporter
diff --git a/tests/fixtures_data/html_files/cdata-containing-html.html b/tests/fixtures_data/html_files/cdata-containing-html.html
@@ -0,0 +1,15 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+    <head>
+        <title>CDATA containing HTML document</title>
+    </head>
+    <body>
+        <script type="text/javascript">
+            <![CDATA[
+            var htmlContent = "<div>Hello, world!</div>";
+            alert(htmlContent);
+            ]]>
+        </script>
+    </body>
+</html>
diff --git a/tests/fixtures_data/html_files/cleaned-cdata-containing-html.html b/tests/fixtures_data/html_files/cleaned-cdata-containing-html.html
@@ -0,0 +1,15 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+    <head>
+        <title>CDATA containing HTML document</title>
+    </head>
+    <body>
+        <script type="text/javascript">
+
+            var htmlContent = "<div>Hello, world!</div>";
+            alert(htmlContent);
+
+        </script>
+    </body>
+</html>
diff --git a/tests/fixtures_data/html_files/html-without-cdata.html b/tests/fixtures_data/html_files/html-without-cdata.html
@@ -0,0 +1,13 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+    <head>
+        <title>HTML document without CDATA</title>
+    </head>
+    <body>
+        <script type="text/javascript">
+            var htmlContent = "<div>Hello, world!</div>";
+            alert(htmlContent);
+        </script>
+    </body>
+</html>
diff --git a/tests/test_olx.py b/tests/test_olx.py
@@ -1,8 +1,12 @@
 import json
+from unittest.mock import Mock
+
+import lxml
+import xml.dom.minidom
+
 from cc2olx import olx
+
 from .utils import format_xml
-import xml.dom.minidom
-import lxml
 
 
 def test_olx_export_xml(cartridge, link_map_csv, studio_course_xml):
@@ -36,6 +40,54 @@ def test_process_link():
     )
 
 
+class TestOlXExporeterHTMLProcessing:
+    """
+    Test the OLX exporter for HTML parsing flow.
+    """
+
+    def test_html_cleaning_from_cdata(
+        self,
+        mocker,
+        bare_olx_exporter,
+        cdata_containing_html,
+        expected_cleaned_cdata_containing_html,
+    ):
+        """
+        Test that the CDATA cleaning function is called once during HTML processing.
+
+        Args:
+            mocker (MockerFixture): MockerFixture instance.
+            bare_olx_exporter (OlxExport): bare OLX exporter.
+            cdata_containing_html (str): HTML that contains CDATA sections.
+            expected_cleaned_cdata_containing_html (str): Expected HTML after
+                successful cleaning.
+        """
+        details = {"html": cdata_containing_html}
+
+        clean_from_cdata_mock = mocker.patch(
+            "cc2olx.olx.clean_from_cdata",
+            return_value=expected_cleaned_cdata_containing_html,
+        )
+
+        bare_olx_exporter._process_html(details)
+
+        clean_from_cdata_mock.assert_called_once()
+
+    def test_processed_html_content_is_wrapped_into_cdata(self, bare_olx_exporter, cdata_containing_html):
+        """
+        Test that processed HTML content is wrapped into a CDATA section.
+
+        Args:
+            bare_olx_exporter (OlxExport): bare OLX exporter.
+            cdata_containing_html (str): HTML that contains CDATA sections.
+        """
+        details = {"html": cdata_containing_html}
+
+        result_html, *__ = bare_olx_exporter._process_html(details)
+
+        assert isinstance(result_html.childNodes[0], xml.dom.minidom.CDATASection)
+
+
 class TestOlXExporeterIframeParser:
     """
     Test the olx exporter for iframe link parsing flow
@@ -141,3 +193,51 @@ def test_policy_contains_advanced_module(self, cartridge, passports_csv, caplog)
         assert ["Missing LTI Passport for learning_tools_interoperability. Using default."] == [
             rec.message for rec in caplog.records
         ]
+
+
+class TestDiscussionParsing:
+    """
+    Test the OLX exporter for discussion parsing flow.
+    """
+
+    def test_discussion_content_cleaning_from_cdata(
+        self,
+        mocker,
+        bare_olx_exporter,
+        cdata_containing_html,
+        expected_cleaned_cdata_containing_html,
+    ):
+        """
+        Test that the CDATA cleaning function is called once during discussion parsing.
+
+        Args:
+            mocker (MockerFixture): MockerFixture instance.
+            bare_olx_exporter (OlxExport): bare OLX exporter.
+            cdata_containing_html (str): HTML that contains CDATA sections.
+            expected_cleaned_cdata_containing_html (str): Expected HTML after
+                successful cleaning.
+        """
+        details = {"dependencies": [], "title": Mock(), "text": cdata_containing_html}
+
+        clean_from_cdata_mock = mocker.patch(
+            "cc2olx.olx.clean_from_cdata",
+            return_value=expected_cleaned_cdata_containing_html,
+        )
+
+        bare_olx_exporter._create_discussion_node(details)
+
+        clean_from_cdata_mock.assert_called_once()
+
+    def test_discussion_description_is_wrapped_into_cdata(self, bare_olx_exporter, cdata_containing_html):
+        """
+        Test that the discussion description is wrapped into a CDATA section.
+
+        Args:
+            bare_olx_exporter (OlxExport): bare OLX exporter.
+            cdata_containing_html (str): HTML that contains CDATA sections.
+        """
+        details = {"dependencies": [], "title": Mock(), "text": cdata_containing_html}
+
+        discussion_description_html, __ = bare_olx_exporter._create_discussion_node(details)
+
+        assert isinstance(discussion_description_html.childNodes[0], xml.dom.minidom.CDATASection)
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -0,0 +1,35 @@
+from cc2olx.utils import clean_from_cdata
+
+
+class TestXMLCleaningFromCDATA:
+    """
+    Test XML string cleaning from CDATA sections.
+    """
+
+    def test_cdata_containing_html_is_cleaned_successfully(
+        self,
+        cdata_containing_html: str,
+        expected_cleaned_cdata_containing_html: str,
+    ) -> None:
+        """
+        Test if CDATA tags are removed from HTML while their content is kept.
+
+        Args:
+            cdata_containing_html (str): HTML that contains CDATA sections.
+            expected_cleaned_cdata_containing_html (str): Expected HTML after
+                successful cleaning.
+        """
+        actual_cleaned_cdata_containing_html = clean_from_cdata(cdata_containing_html)
+
+        assert actual_cleaned_cdata_containing_html == expected_cleaned_cdata_containing_html
+
+    def test_html_without_cdata_remains_the_same_after_cleaning(self, html_without_cdata: str) -> None:
+        """
+        Test if HTML that doesn't contain CDATA sections remains the same.
+
+        Args:
+            html_without_cdata (str): HTML that doesn't contain CDATA sections.
+        """
+        actual_cleaned_html_without_cdata = clean_from_cdata(html_without_cdata)
+
+        assert actual_cleaned_html_without_cdata == html_without_cdata
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"