diff --git a/src/cc2olx/constants.py b/src/cc2olx/constants.py new file mode 100644 index 00000000..1b956935 --- /dev/null +++ b/src/cc2olx/constants.py @@ -0,0 +1 @@ +CDATA_PATTERN = r".*?)\]\]>" diff --git a/src/cc2olx/olx.py b/src/cc2olx/olx.py index d178488f..f447a0f1 100644 --- a/src/cc2olx/olx.py +++ b/src/cc2olx/olx.py @@ -8,7 +8,7 @@ from cc2olx.iframe_link_parser import KalturaIframeLinkParser from cc2olx.qti import QtiExport -from cc2olx.utils import element_builder, passport_file_parser +from cc2olx.utils import clean_from_cdata, element_builder, passport_file_parser logger = logging.getLogger() @@ -363,6 +363,7 @@ def _process_html(self, details): html = self._process_static_links(details["html"]) if self.link_file: html, video_olx = self._process_html_for_iframe(html) + html = clean_from_cdata(html) txt = self.doc.createCDATASection(html) child.appendChild(txt) nodes.append(child) @@ -434,6 +435,7 @@ def _create_discussion_node(self, details): node.setAttribute("discussion_target", details["title"]) html_node = self.doc.createElement("html") txt = "MISSING CONTENT" if details["text"] is None else details["text"] + txt = clean_from_cdata(txt) txt = self.doc.createCDATASection(txt) html_node.appendChild(txt) return [html_node, node] diff --git a/src/cc2olx/utils.py b/src/cc2olx/utils.py index e5c4fbf4..40cf8c58 100644 --- a/src/cc2olx/utils.py +++ b/src/cc2olx/utils.py @@ -5,6 +5,8 @@ import csv import re +from cc2olx.constants import CDATA_PATTERN + logger = logging.getLogger() @@ -108,3 +110,16 @@ def clean_file_name(filename: str): cleaned_name = re.sub(special_characters, "_", filename) return cleaned_name + + +def clean_from_cdata(xml_string: str) -> str: + """ + Deletes CDATA tag from XML string while keeping its content. + + Args: + xml_string (str): original XML string to clean. + + Returns: + str: cleaned XML string. + """ + return re.sub(CDATA_PATTERN, r"\g", xml_string, flags=re.DOTALL) diff --git a/tests/conftest.py b/tests/conftest.py index 87799829..31b10605 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ import shutil import zipfile +import xml.dom.minidom from pathlib import Path from tempfile import NamedTemporaryFile from xml.dom.minidom import parse @@ -12,6 +13,7 @@ from cc2olx.cli import parse_args from cc2olx.models import Cartridge +from cc2olx.olx import OlxExport from cc2olx.settings import collect_settings @@ -242,3 +244,64 @@ def transcript_file(fixtures_data_dir): fixtures_data_dir / "video_files/01___Intro_to_Knowledge_Based_AI/0 - Introductions.en.srt" ) return transcript_file_path + + +@pytest.fixture(scope="session") +def html_without_cdata(fixtures_data_dir: Path) -> str: + """ + HTML string that doesn't contain CDATA sections. + + Args: + fixtures_data_dir (str): Path to the directory where fixture data is present. + + Returns: + str: HTML string. + """ + html_without_cdata_path = fixtures_data_dir / "html_files/html-without-cdata.html" + return html_without_cdata_path.read_text() + + +@pytest.fixture(scope="session") +def cdata_containing_html(fixtures_data_dir: Path) -> str: + """ + HTML string that contains CDATA sections. + + Args: + fixtures_data_dir (str): Path to the directory where fixture data is present. + + Returns: + str: HTML string. + """ + html_without_cdata_path = fixtures_data_dir / "html_files/cdata-containing-html.html" + return html_without_cdata_path.read_text() + + +@pytest.fixture(scope="session") +def expected_cleaned_cdata_containing_html(fixtures_data_dir: Path) -> str: + """ + The string with expected HTML after cleaning from CDATA sections. + + Args: + fixtures_data_dir (str): Path to the directory where fixture data is present. + + Returns: + str: HTML string. + """ + html_without_cdata_path = fixtures_data_dir / "html_files/cleaned-cdata-containing-html.html" + return html_without_cdata_path.read_text() + + +@pytest.fixture +def bare_olx_exporter(cartridge: Cartridge) -> OlxExport: + """ + Provides bare OLX exporter. + + Args: + cartridge (Cartridge): Cartridge class instance. + + Returns: + OlxExport: OlxExport instance. + """ + olx_exporter = OlxExport(cartridge) + olx_exporter.doc = xml.dom.minidom.Document() + return olx_exporter diff --git a/tests/fixtures_data/html_files/cdata-containing-html.html b/tests/fixtures_data/html_files/cdata-containing-html.html new file mode 100644 index 00000000..5c5e51d5 --- /dev/null +++ b/tests/fixtures_data/html_files/cdata-containing-html.html @@ -0,0 +1,15 @@ + + + + CDATA containing HTML document + + + + + diff --git a/tests/fixtures_data/html_files/cleaned-cdata-containing-html.html b/tests/fixtures_data/html_files/cleaned-cdata-containing-html.html new file mode 100644 index 00000000..d207cdf7 --- /dev/null +++ b/tests/fixtures_data/html_files/cleaned-cdata-containing-html.html @@ -0,0 +1,15 @@ + + + + CDATA containing HTML document + + + + + diff --git a/tests/fixtures_data/html_files/html-without-cdata.html b/tests/fixtures_data/html_files/html-without-cdata.html new file mode 100644 index 00000000..95ff7b80 --- /dev/null +++ b/tests/fixtures_data/html_files/html-without-cdata.html @@ -0,0 +1,13 @@ + + + + HTML document without CDATA + + + + + diff --git a/tests/test_olx.py b/tests/test_olx.py index 56d88d5a..a35d67c6 100644 --- a/tests/test_olx.py +++ b/tests/test_olx.py @@ -1,8 +1,12 @@ import json +from unittest.mock import Mock + +import lxml +import xml.dom.minidom + from cc2olx import olx + from .utils import format_xml -import xml.dom.minidom -import lxml def test_olx_export_xml(cartridge, link_map_csv, studio_course_xml): @@ -36,6 +40,54 @@ def test_process_link(): ) +class TestOlXExporeterHTMLProcessing: + """ + Test the OLX exporter for HTML parsing flow. + """ + + def test_html_cleaning_from_cdata( + self, + mocker, + bare_olx_exporter, + cdata_containing_html, + expected_cleaned_cdata_containing_html, + ): + """ + Test that CDATA cleaning function is called during HTML processing. + + Args: + mocker (MockerFixture): MockerFixture instance. + bare_olx_exporter (OlxExport): bare OLX exporter. + cdata_containing_html (str): HTML that contains CDATA tags. + expected_cleaned_cdata_containing_html (str): Expected HTML after + successful cleaning. + """ + details = {"html": cdata_containing_html} + + clean_from_cdata_mock = mocker.patch( + "cc2olx.olx.clean_from_cdata", + return_value=expected_cleaned_cdata_containing_html, + ) + + bare_olx_exporter._process_html(details) + + clean_from_cdata_mock.assert_called_once() + + def test_processed_html_content_is_wrapped_into_cdata(self, bare_olx_exporter, cdata_containing_html): + """ + Test that processed HTML content is wrapped into CDATA section. + + Args: + bare_olx_exporter (OlxExport): bare OLX exporter. + cdata_containing_html (str): HTML that contains CDATA tags. + """ + details = {"html": cdata_containing_html} + + result_html, *__ = bare_olx_exporter._process_html(details) + + assert isinstance(result_html.childNodes[0], xml.dom.minidom.CDATASection) + + class TestOlXExporeterIframeParser: """ Test the olx exporter for iframe link parsing flow @@ -141,3 +193,51 @@ def test_policy_contains_advanced_module(self, cartridge, passports_csv, caplog) assert ["Missing LTI Passport for learning_tools_interoperability. Using default."] == [ rec.message for rec in caplog.records ] + + +class TestDiscussionParsing: + """ + Test the OLX exporter for discussion parsing flow. + """ + + def test_discussion_content_cleaning_from_cdata( + self, + mocker, + bare_olx_exporter, + cdata_containing_html, + expected_cleaned_cdata_containing_html, + ): + """ + Test that CDATA cleaning function is called during discussion parsing. + + Args: + mocker (MockerFixture): MockerFixture instance. + bare_olx_exporter (OlxExport): bare OLX exporter. + cdata_containing_html (str): HTML that contains CDATA tags. + expected_cleaned_cdata_containing_html (str): Expected HTML after + successful cleaning. + """ + details = {"dependencies": [], "title": Mock(), "text": cdata_containing_html} + + clean_from_cdata_mock = mocker.patch( + "cc2olx.olx.clean_from_cdata", + return_value=expected_cleaned_cdata_containing_html, + ) + + bare_olx_exporter._create_discussion_node(details) + + clean_from_cdata_mock.assert_called_once() + + def test_discussion_decription_is_wrapped_into_cdata(self, bare_olx_exporter, cdata_containing_html): + """ + Test that processed HTML content is wrapped into CDATA section. + + Args: + bare_olx_exporter (OlxExport): bare OLX exporter. + cdata_containing_html (str): HTML that contains CDATA tags. + """ + details = {"dependencies": [], "title": Mock(), "text": cdata_containing_html} + + discussion_decription_html, __ = bare_olx_exporter._create_discussion_node(details) + + assert isinstance(discussion_decription_html.childNodes[0], xml.dom.minidom.CDATASection) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..82b41955 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,35 @@ +from cc2olx.utils import clean_from_cdata + + +class TestXMLCleaningFromCDATA: + """ + Test XML string cleaning from CDATA sections. + """ + + def test_cdata_containing_html_is_cleaned_successfully( + self, + cdata_containing_html: str, + expected_cleaned_cdata_containing_html: str, + ) -> None: + """ + Test if CDATA tags are removed from HTML while their content is kept. + + Args: + cdata_containing_html (str): HTML that contains CDATA tags. + expected_cleaned_cdata_containing_html (str): Expected HTML after + successful cleaning. + """ + actual_cleaned_cdata_containing_html = clean_from_cdata(cdata_containing_html) + + assert actual_cleaned_cdata_containing_html == expected_cleaned_cdata_containing_html + + def test_html_without_cdata_remains_the_same_after_cleaning(self, html_without_cdata: str) -> None: + """ + Test if HTML that doesn't contain CDATA tags remains the same. + + Args: + html_without_cdata (str): HTML that doesn't contains CDATA tags. + """ + actual_cleaned_html_without_cdata = clean_from_cdata(html_without_cdata) + + assert actual_cleaned_html_without_cdata == html_without_cdata