Refactor DocumentValidator to improve regex matching

Fixes some false positives when detecting a valid tool document.
galaxyproject · Nov 4, 2024 · 1ad7558
1 parent a13ab13
commit 1ad7558
Showing 1 changed file with 9 additions and 8 deletions.
diff --git a/server/galaxyls/services/validation.py b/server/galaxyls/services/validation.py
@@ -6,6 +6,9 @@
 from galaxyls.services.xml.types import DocumentType
 
 MAX_PEEK_CONTENT = 1000
+TAG_GROUP_NAME = "root_tag"
+TAG_REGEX = r"[\n\s]*?.*?[\n\s]*?<(?!\?)(?!\!)(?P<root_tag>[\w]*)"
+SUPPORTED_ROOT_TAGS = [e.name.lower() for e in DocumentType if e != DocumentType.UNKNOWN]
 
 
 class DocumentValidator:
@@ -17,17 +20,15 @@ def has_valid_root(cls, document: Document) -> bool:
         or is an empty document."""
         if DocumentValidator.is_empty_document(document):
             return True
-        root = DocumentValidator._get_document_root_tag(document)
-        if root is not None:
-            root_tag = root.upper()
-            supported = [e.name for e in DocumentType if e != DocumentType.UNKNOWN]
-            return root_tag == "" or root_tag in supported
+        root_tag = DocumentValidator.get_document_root_tag(document)
+        if root_tag is not None:
+            return root_tag == "" or root_tag in SUPPORTED_ROOT_TAGS
         return False
 
     @classmethod
     def is_tool_document(cls, document: Document) -> bool:
         """Checks if the document's root element is <tool>."""
-        root = DocumentValidator._get_document_root_tag(document)
+        root = DocumentValidator.get_document_root_tag(document)
         if root is not None:
             root_tag = root.upper()
             return root_tag == DocumentType.TOOL.name
@@ -39,11 +40,11 @@ def is_empty_document(cls, document: Document) -> bool:
         return not document.source or document.source.isspace()
 
     @classmethod
-    def _get_document_root_tag(cls, document: Document) -> Optional[str]:
+    def get_document_root_tag(cls, document: Document) -> Optional[str]:
         """Checks the first MAX_PEEK_CONTENT characters of the document for a root tag and
         returns the name of the tag if found."""
         content_peek = document.source[:MAX_PEEK_CONTENT]
-        match = re.match(TAG_REGEX, content_peek)
+        match = re.search(TAG_REGEX, content_peek)
         if match:
             group = match.group(TAG_GROUP_NAME)
             return group