666 Convert to dynamic plugin generation #667

Merged · 3 commits · Apr 18, 2024
71 changes: 67 additions & 4 deletions config_generation/db_to_xml.py
@@ -23,12 +23,31 @@ def _get_tree(self, xml_string) -> ET.ElementTree:
        """takes the path of an xml file and opens it as an ElementTree object"""
        return ET.ElementTree(ET.fromstring(xml_string))

-    def get_tag_value(self, tag_name: str) -> list:
+    def get_tag_value(self, tag_name: str, strict: bool = False) -> str | list[str]:
        """
-        tag_name can be either the top level tag
-        or you can get a child by saying 'parent/child'
+        Retrieves the value of the specified XML tag. If 'strict' is True, exactly one
+        matching value is required, and that single value is returned.
+
+        Parameters:
+        - tag_name (str): Either a top-level tag or a path to a child tag, e.g. 'parent/child'.
+        - strict (bool): If True, raises a ValueError unless exactly one value is found.
+
+        Returns:
+        - str: The text of the single matching element, when strict is True.
+        - list[str]: The text of every matching element, when strict is False.
+
+        Raises:
+        - ValueError: If 'strict' is True and either no values or multiple values are found.
+        """
-        return [element.text for element in self.xml_tree.findall(tag_name)]
+        elements = self.xml_tree.findall(tag_name)
+        if strict:
+            if len(elements) == 0:
+                raise ValueError(f"No elements found for the tag '{tag_name}'")
+            elif len(elements) > 1:
+                raise ValueError(f"Multiple elements found for the tag '{tag_name}': expected exactly one.")
+            return elements[0].text
+        else:
+            return [element.text for element in elements]

    def _add_declaration(self, xml_string: str):
        """adds xml declaration to xml string"""
@@ -129,6 +148,50 @@ def convert_template_to_scraper(self, collection) -> None:
        scraper_config = self.update_config_xml()
        return scraper_config

+    def convert_template_to_plugin_indexer(self, scraper_editor) -> str:
+        """
+        assuming this class has been instantiated with the plugin_indexing_template.xml
+        """
+
+        transfer_fields = [
+            "KeepHashFragmentInUrl",
+            "CorrectDomainCookies",
+            "IgnoreSessionCookies",
+            "DownloadImages",
+            "DownloadMedia",
+            "DownloadCss",
+            "DownloadFtp",
+            "DownloadFile",
+            "IndexJs",
+            "FollowJs",
+            "CrawlFlash",
+            "NormalizeSecureSchemesWhenTestingVisited",
+            "RetryCount",
+            "RetryPause",
+            "AddBaseHref",
+            "AddMetaContentType",
+            "NormalizeUrls",
+        ]
+
+        double_transfer_fields = [
+            ("UrlAccess", "AllowXPathCookies"),
+            ("UrlAccess", "UseBrowserForWebRequests"),
+            ("UrlAccess", "UseHttpClientForWebRequests"),
+        ]
+
+        for field in transfer_fields:
+            print(field, scraper_editor.get_tag_value(field, strict=True))
+            self.update_or_add_element_value(field, scraper_editor.get_tag_value(field, strict=True))
+
+        for parent, child in double_transfer_fields:
+            print(parent, child, scraper_editor.get_tag_value(f"{parent}/{child}", strict=True))
+            self.update_or_add_element_value(
+                f"{parent}/{child}", scraper_editor.get_tag_value(f"{parent}/{child}", strict=True)
+            )
+
+        plugin_config = self.update_config_xml()
+        return plugin_config

    def convert_template_to_indexer(self, collection) -> None:
        """
        assuming this class has been instantiated with the indexer_template.xml
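The new convert_template_to_plugin_indexer method is what makes the plugin generation dynamic: instead of the plugin template carrying hard-coded crawler settings, each field is read from the collection's existing scraper config and written into the plugin template at generation time. A condensed sketch of that transfer pattern (the editor instances and the two-field list here are illustrative):

```python
# plugin_editor wraps plugin_indexing_template.xml;
# scraper_editor wraps the collection's existing scraper config
for field in ["RetryCount", "NormalizeUrls"]:
    value = scraper_editor.get_tag_value(field, strict=True)
    plugin_editor.update_or_add_element_value(field, value)

# nested tags are addressed with 'parent/child' paths
value = scraper_editor.get_tag_value("UrlAccess/UseBrowserForWebRequests", strict=True)
plugin_editor.update_or_add_element_value("UrlAccess/UseBrowserForWebRequests", value)
```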
17 changes: 0 additions & 17 deletions config_generation/xmls/plugin_indexing_template.xml
@@ -31,30 +31,15 @@
<MaxRedirection>10</MaxRedirection>
<CrawlMaxSize>-1</CrawlMaxSize>
<CrawlTimeout>-1</CrawlTimeout>
-<NormalizeUrls>true</NormalizeUrls>
-<CorrectDomainCookies>false</CorrectDomainCookies>
-<IgnoreSessionCookies>false</IgnoreSessionCookies>
-<DownloadImages>false</DownloadImages>
-<DownloadMedia>false</DownloadMedia>
-<DownloadCss>false</DownloadCss>
-<DownloadFtp>true</DownloadFtp>
-<DownloadFile>true</DownloadFile>
-<IndexJs>false</IndexJs>
-<FollowJs>true</FollowJs>
-<CrawlFlash>true</CrawlFlash>
<IndexEmptyPages>true</IndexEmptyPages>
<CrawlWebsphereSeedlist>true</CrawlWebsphereSeedlist>
-<KeepHashFragmentInUrl>false</KeepHashFragmentInUrl>
-<RetryCount>1</RetryCount>
-<RetryPause>1000 ms</RetryPause>
<HttpCodesToRetry></HttpCodesToRetry>
<UseIfModifiedSince>true</UseIfModifiedSince>
<UseIfNoneMatch>no</UseIfNoneMatch>
<AcceptWeakETag>false</AcceptWeakETag>
<ForcedEncoding></ForcedEncoding>
<UseCompression>false</UseCompression>
<UseUnsafeHeaderParsing>false</UseUnsafeHeaderParsing>
-<NormalizeSecureSchemesWhenTestingVisited>false</NormalizeSecureSchemesWhenTestingVisited>
<ExactDeduplication>false</ExactDeduplication>
<NearDeduplication>false</NearDeduplication>
<CrawlPauseDelay></CrawlPauseDelay>
@@ -224,8 +209,6 @@
<RealTimeInfoOnError>false</RealTimeInfoOnError>
<ConversionProxies></ConversionProxies>
<ConversionPlan></ConversionPlan>
-<AddBaseHref>true</AddBaseHref>
-<AddMetaContentType>false</AddMetaContentType>
<Throttle></Throttle>
<DocumentClass></DocumentClass>
<ConnectorLanguage></ConnectorLanguage>
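Note that the 17 tags deleted from this template are exactly the entries in transfer_fields above: their values are no longer fixed in the static template but are copied from each collection's scraper config when the plugin config is generated.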
15 changes: 14 additions & 1 deletion sde_collections/models/collection.py
@@ -207,7 +207,20 @@ def create_plugin_config(self, overwrite: bool = False):

        if overwrite is True, it will overwrite the existing file
        """
-        plugin_config = open("config_generation/xmls/plugin_indexing_template.xml").read()
+
+        # there needs to be a scraper config file before creating the plugin config
+        gh = GitHubHandler()
+        scraper_exists = gh.check_file_exists(self._scraper_config_path)
+        if not scraper_exists:
+            raise ValueError(f"Scraper does not exist for the collection {self.config_folder}")
+        else:
+            scraper_content = gh._get_file_contents(self._scraper_config_path)
+            scraper_content = scraper_content.decoded_content.decode("utf-8")
+            scraper_editor = XmlEditor(scraper_content)
+
+        plugin_template = open("config_generation/xmls/plugin_indexing_template.xml").read()
+        plugin_editor = XmlEditor(plugin_template)
+        plugin_config = plugin_editor.convert_template_to_plugin_indexer(scraper_editor)
        self._write_to_github(self._plugin_config_path, plugin_config, overwrite)

    def create_indexer_config(self, overwrite: bool = False):
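With these changes, create_plugin_config now fails fast when a collection has no scraper config in GitHub. A hedged sketch of the resulting call flow (the model lookup and collection name are hypothetical; Collection, GitHubHandler, and the ValueError come from the diff):

```python
from sde_collections.models.collection import Collection  # assumed import path

collection = Collection.objects.get(config_folder="example_collection")  # hypothetical collection

# raises ValueError if no scraper config exists yet for this collection;
# otherwise generates the plugin config from the scraper's settings and
# writes it to GitHub, overwriting any existing file
collection.create_plugin_config(overwrite=True)
```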