diff --git a/config_generation/db_to_xml.py b/config_generation/db_to_xml.py
index 87269dc0..844f9f9c 100644
--- a/config_generation/db_to_xml.py
+++ b/config_generation/db_to_xml.py
@@ -23,12 +23,31 @@ def _get_tree(self, xml_string) -> ET.ElementTree:
"""takes the path of an xml file and opens it as an ElementTree object"""
return ET.ElementTree(ET.fromstring(xml_string))
- def get_tag_value(self, tag_name: str) -> list:
+ def get_tag_value(self, tag_name: str, strict: bool = False) -> str | list[str]:
"""
- tag_name can be either the top level tag
- or you can get a child by saying 'parent/child'
+ Retrieves the value(s) of the specified XML tag. By default (strict=False) this returns a
+ list containing the text of every matching element; if 'strict' is True, it returns the text
+ of the single matching element instead.
+
+ Parameters:
+ - tag_name (str): Can be either the top level tag or a path specifying a child tag, e.g., 'parent/child'.
+ - strict (bool): If True, raises an error when more than one value is found, or if no values are found.
+
+ Returns:
+ - list[str]: The text of every element matching tag_name (the default, strict=False).
+ - str: The text of the single XML element matching the tag_name if strict is True and exactly one match exists.
+
+ Raises:
+ - ValueError: If 'strict' is True and either no values or more than one value is found.
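+
+ Example (illustrative; the XML snippet below is an assumption for demonstration,
+ not taken from a real config):
+ editor = XmlEditor("<Sinequa><RetryCount>1</RetryCount></Sinequa>")
+ editor.get_tag_value("RetryCount") # -> ["1"]
+ editor.get_tag_value("RetryCount", strict=True) # -> "1"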
"""
- return [element.text for element in self.xml_tree.findall(tag_name)]
+
+ elements = self.xml_tree.findall(tag_name)
+ if strict:
+ if len(elements) == 0:
+ raise ValueError(f"No elements found for the tag '{tag_name}'")
+ elif len(elements) > 1:
+ raise ValueError(f"Multiple elements found for the tag '{tag_name}': expected exactly one.")
+ return elements[0].text
+ else:
+ return [element.text for element in elements]
def _add_declaration(self, xml_string: str):
"""adds xml declaration to xml string"""
@@ -129,6 +148,50 @@ def convert_template_to_scraper(self, collection) -> None:
scraper_config = self.update_config_xml()
return scraper_config
+ def convert_template_to_plugin_indexer(self, scraper_editor: "XmlEditor") -> str:
+ """
+ assuming this class has been instantiated with the plugin_indexing_template.xml;
+ copies the crawler settings from the given scraper config editor into this template
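+
+ Example (a sketch mirroring Collection.create_plugin_config; 'scraper_xml' is a
+ placeholder for the scraper config's XML string):
+ scraper_editor = XmlEditor(scraper_xml)
+ plugin_editor = XmlEditor(open("config_generation/xmls/plugin_indexing_template.xml").read())
+ plugin_config = plugin_editor.convert_template_to_plugin_indexer(scraper_editor)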
+ """
+
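+ # crawler settings copied one-to-one from the scraper config into this template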
+ transfer_fields = [
+ "KeepHashFragmentInUrl",
+ "CorrectDomainCookies",
+ "IgnoreSessionCookies",
+ "DownloadImages",
+ "DownloadMedia",
+ "DownloadCss",
+ "DownloadFtp",
+ "DownloadFile",
+ "IndexJs",
+ "FollowJs",
+ "CrawlFlash",
+ "NormalizeSecureSchemesWhenTestingVisited",
+ "RetryCount",
+ "RetryPause",
+ "AddBaseHref",
+ "AddMetaContentType",
+ "NormalizeUrls",
+ ]
+
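+ # nested settings, addressed with 'parent/child' paths (see get_tag_value)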
+ double_transfer_fields = [
+ ("UrlAccess", "AllowXPathCookies"),
+ ("UrlAccess", "UseBrowserForWebRequests"),
+ ("UrlAccess", "UseHttpClientForWebRequests"),
+ ]
+
+ for field in transfer_fields:
+ self.update_or_add_element_value(field, scraper_editor.get_tag_value(field, strict=True))
+
+ for parent, child in double_transfer_fields:
+ self.update_or_add_element_value(
+ f"{parent}/{child}", scraper_editor.get_tag_value(f"{parent}/{child}", strict=True)
+ )
+
+ plugin_config = self.update_config_xml()
+ return plugin_config
+
def convert_template_to_indexer(self, collection) -> None:
"""
assuming this class has been instantiated with the indexer_template.xml
diff --git a/config_generation/xmls/plugin_indexing_template.xml b/config_generation/xmls/plugin_indexing_template.xml
index a9b5bd86..b7a9ce63 100644
--- a/config_generation/xmls/plugin_indexing_template.xml
+++ b/config_generation/xmls/plugin_indexing_template.xml
@@ -31,22 +31,8 @@
10
-1
-1
- true
- false
- false
- false
- false
- false
- true
- true
- false
- true
- true
true
true
- false
- 1
- 1000 ms
true
no
@@ -54,7 +40,6 @@
false
false
- false
false
false
@@ -224,8 +209,6 @@
false
- true
- false
diff --git a/sde_collections/models/collection.py b/sde_collections/models/collection.py
index 999203e1..a70ffd0f 100644
--- a/sde_collections/models/collection.py
+++ b/sde_collections/models/collection.py
@@ -207,7 +207,20 @@ def create_plugin_config(self, overwrite: bool = False):
if overwrite is True, it will overwrite the existing file
"""
- plugin_config = open("config_generation/xmls/plugin_indexing_template.xml").read()
+
+ # there needs to be a scraper config file before creating the plugin config
+ gh = GitHubHandler()
+ if not gh.check_file_exists(self._scraper_config_path):
+ raise ValueError(f"Scraper config does not exist for the collection {self.config_folder}")
+ scraper_content = gh._get_file_contents(self._scraper_config_path)
+ scraper_content = scraper_content.decoded_content.decode("utf-8")
+ scraper_editor = XmlEditor(scraper_content)
+
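+ # build the plugin indexer config by copying the crawler settings from the scraper config into the template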
+ with open("config_generation/xmls/plugin_indexing_template.xml") as f:
+ plugin_template = f.read()
+ plugin_editor = XmlEditor(plugin_template)
+ plugin_config = plugin_editor.convert_template_to_plugin_indexer(scraper_editor)
self._write_to_github(self._plugin_config_path, plugin_config, overwrite)
def create_indexer_config(self, overwrite: bool = False):