microsoft · omri374 · Jan 9, 2024 · Oct 28, 2023 · Nov 3, 2023 · Nov 3, 2023
diff --git a/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py b/presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py
@@ -4,7 +4,12 @@
 from typing import List, Dict, Optional
 
 from presidio_anonymizer.core import EngineBase
-from presidio_anonymizer.entities import OperatorConfig, RecognizerResult, EngineResult
+from presidio_anonymizer.entities import (
+    OperatorConfig,
+    RecognizerResult,
+    EngineResult,
+    ConflictResolutionStrategy,
+)
 from presidio_anonymizer.operators import OperatorType
 
 DEFAULT = "replace"
@@ -28,6 +33,9 @@ def anonymize(
             text: str,
             analyzer_results: List[RecognizerResult],
             operators: Optional[Dict[str, OperatorConfig]] = None,
+            conflict_resolution: ConflictResolutionStrategy = (
+                ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED
+            )
     ) -> EngineResult:
         """Anonymize method to anonymize the given text.
 
@@ -37,6 +45,8 @@ def anonymize(
         :param operators: The configuration of the anonymizers we would like
         to use for each entity e.g.: {"PHONE_NUMBER":OperatorConfig("redact", {})}
         received from the analyzer
+        :param conflict_resolution: The configuration designed to handle conflicts
+        among entities
         :return: the anonymized text and a list of information about the
         anonymized entities.
 
@@ -76,7 +86,7 @@ def anonymize(
 
         """
         analyzer_results = self._remove_conflicts_and_get_text_manipulation_data(
-            analyzer_results
+            analyzer_results, conflict_resolution
         )
 
         merged_results = self._merge_entities_with_whitespace_between(
@@ -88,7 +98,9 @@ def anonymize(
         return self._operate(text, merged_results, operators, OperatorType.Anonymize)
 
     def _remove_conflicts_and_get_text_manipulation_data(
-            self, analyzer_results: List[RecognizerResult]
+            self,
+            analyzer_results: List[RecognizerResult],
+            conflict_resolution: ConflictResolutionStrategy
     ) -> List[RecognizerResult]:
         """
         Iterate the list and create a sorted unique results list from it.
@@ -142,6 +154,32 @@ def _remove_conflicts_and_get_text_manipulation_data(
                 self.logger.debug(
                     f"removing element {result} from results list due to conflict"
                 )
+
+        # REMOVE_INTERSECTIONS goes one step further for partially overlapping
+        # results: instead of dropping one of them, it moves the start or end
+        # of the lower-scoring result (the later one on a tie) so that the spans
+        # no longer intersect. Results left with start > end are filtered out.
+        if conflict_resolution == ConflictResolutionStrategy.REMOVE_INTERSECTIONS:
+            unique_text_metadata_elements.sort(key=lambda element: element.start)
+            elements_length = len(unique_text_metadata_elements)
+            index = 0
+            while index < elements_length - 1:
+                current_entity = unique_text_metadata_elements[index]
+                next_entity = unique_text_metadata_elements[index + 1]
+                if current_entity.end <= next_entity.start:
+                    index += 1
+                else:
+                    if current_entity.score >= next_entity.score:
+                        next_entity.start = current_entity.end
+                    else:
+                        current_entity.end = next_entity.start
+                    unique_text_metadata_elements.sort(
+                        key=lambda element: element.start
+                    )
+            unique_text_metadata_elements = [
+                element for element in unique_text_metadata_elements
+                if element.start <= element.end
+                ]
         return unique_text_metadata_elements
 
     def _merge_entities_with_whitespace_between(

diff --git a/presidio-anonymizer/presidio_anonymizer/entities/__init__.py b/presidio-anonymizer/presidio_anonymizer/entities/__init__.py
@@ -1,5 +1,6 @@
 """Handles all the entities objects (structs) of the anonymizer."""
 from .invalid_exception import InvalidParamException
+from .conflict_resolution_strategy import ConflictResolutionStrategy
 from .engine.pii_entity import PIIEntity
 from .engine.operator_config import OperatorConfig
 from .engine.recognizer_result import RecognizerResult
@@ -9,6 +10,7 @@
 
 __all__ = [
     "InvalidParamException",
+    "ConflictResolutionStrategy",
     "PIIEntity",
     "OperatorConfig",
     "OperatorResult",

diff --git a/presidio-anonymizer/presidio_anonymizer/entities/conflict_resolution_strategy.py b/presidio-anonymizer/presidio_anonymizer/entities/conflict_resolution_strategy.py
@@ -0,0 +1,18 @@
+"""An Enum class designed to manage all types of conflicts among entities or text."""
+from enum import Enum
+
+
+class ConflictResolutionStrategy(Enum):
+    """Conflict resolution strategy.
+
+    The strategy to use when there is a conflict between two entities.
+
+    MERGE_SIMILAR_OR_CONTAINED: This default strategy resolves conflicts
+    between similar or contained entities.
+    REMOVE_INTERSECTIONS: Resolves the default strategy's conflicts and, in
+    addition, adjusts the boundaries of partially overlapping entities so
+    that the remaining results no longer intersect.
+    """
+
+    MERGE_SIMILAR_OR_CONTAINED = "merge_similar_or_contained"
+    REMOVE_INTERSECTIONS = "remove_intersections"
diff --git a/presidio-anonymizer/tests/test_conflict_resolution_strategy.py b/presidio-anonymizer/tests/test_conflict_resolution_strategy.py
@@ -0,0 +1,165 @@
+import pytest
+
+from presidio_anonymizer import AnonymizerEngine
+from presidio_anonymizer.entities import (
+    RecognizerResult,
+    OperatorConfig,
+    ConflictResolutionStrategy,
+    EngineResult,
+    OperatorResult,
+)
+
+
+@pytest.mark.parametrize(
+    # fmt: off
+    "text, analyzer_result1, analyzer_result2, conflict_strategy, expected_result",
+    [
+        (  # partially overlapping spans: expected text has "3448" twice
+            ("Fake card number 4151 3217 6243 3448.com "
+             "that overlaps with nonexisting URL."),
+            RecognizerResult("CREDIT_CARD", 17, 36, 0.8),
+            RecognizerResult("URL", 32, 40, 0.8),
+            ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED,
+            ("Fake card number 4151 3217 6243 34483448.com "
+             "that overlaps with nonexisting URL.")
+        ),
+        (  # non-overlapping spans, strategy passed as None: text is unchanged
+            "Fake text with SSN 145-45-6789 and phone number 953-555-5555.",
+            RecognizerResult("SSN", 19, 30, 0.85),
+            RecognizerResult("PHONE_NUMBER", 48, 60, 0.95),
+            None,
+            "Fake text with SSN 145-45-6789 and phone number 953-555-5555."
+        ),
+    ]
+    # fmt: on
+)
+def test_when_merge_similar_or_contained_selected_then_default_conflict_handled(
+    text, analyzer_result1, analyzer_result2, conflict_strategy, expected_result
+):
+    engine = AnonymizerEngine()
+    operator_config = OperatorConfig("keep")  # expected texts keep the matched text
+    result = engine.anonymize(
+        text,
+        [analyzer_result1, analyzer_result2],
+        {"DEFAULT": operator_config},
+        conflict_resolution=conflict_strategy
+    ).text  # only the resulting text is compared in this test
+
+    assert result == expected_result
+
+
+@pytest.mark.parametrize(
+    # fmt: off
+    "text, analyzer_results, conflict_strategy, expected_result",
+    [
+        # CREDIT_CARD Entity has higher score, so adjustment will occur at URL entity
+        (
+            (
+                "Fake card number 4151 3217 6243 3448.com "
+                "that overlaps with nonexisting URL."
+            ),
+            [
+                RecognizerResult("CREDIT_CARD", 17, 36, 1),
+                RecognizerResult("URL", 32, 40, 0.5)
+            ],
+            ConflictResolutionStrategy.REMOVE_INTERSECTIONS,
+            EngineResult(
+                text=(
+                    "Fake card number 4151 3217 6243 3448.com "
+                    "that overlaps with nonexisting URL."
+                ),
+                items=[
+                    OperatorResult(17, 36, 'CREDIT_CARD',
+                                   '4151 3217 6243 3448', 'keep'),
+                    OperatorResult(36, 40, 'URL', '.com', 'keep')
+                ]
+            )
+        ),
+        # URL Entity has higher score, so adjustment will occur at CREDIT_CARD entity
+        (
+            (
+                "Fake card number 4151 3217 6243 3448.com "
+                "that overlaps with nonexisting URL."
+            ),
+            [
+                RecognizerResult("CREDIT_CARD", 17, 36, 0.8),
+                RecognizerResult("URL", 32, 40, 1)
+            ],
+            ConflictResolutionStrategy.REMOVE_INTERSECTIONS,
+            EngineResult(
+                text=(
+                    "Fake card number 4151 3217 6243 3448.com "
+                    "that overlaps with nonexisting URL."
+                ),
+                items=[
+                    OperatorResult(17, 32, 'CREDIT_CARD', '4151 3217 6243 ', 'keep'),
+                    OperatorResult(32, 40, 'URL', '3448.com', 'keep')
+                ]
+            )
+        ),
+        # Both entities have the same score, so the second entity is adjusted
+        (
+            (
+                "Fake card number 4151 3217 6243 3448.com "
+                "that overlaps with nonexisting URL."
+            ),
+            [
+                RecognizerResult("CREDIT_CARD", 17, 36, 0.8),
+                RecognizerResult("URL", 32, 40, 0.8)
+            ],
+            ConflictResolutionStrategy.REMOVE_INTERSECTIONS,
+            EngineResult(
+                text=(
+                    "Fake card number 4151 3217 6243 3448.com "
+                    "that overlaps with nonexisting URL."
+                ),
+                items=[
+                    OperatorResult(17, 36, 'CREDIT_CARD',
+                                   '4151 3217 6243 3448', 'keep'),
+                    OperatorResult(36, 40, 'URL', '.com', 'keep')
+                ]
+            )
+        ),
+        # More than two entities intersecting with each other
+        (
+            (
+                "Fake card number 4151 3217 6243 3448.com "
+                "that overlaps with nonexisting URL."
+            ),
+            [
+                RecognizerResult("CREDIT_CARD", 17, 36, 0.8),
+                RecognizerResult("URL", 28, 40, 0.8),
+                RecognizerResult("Ent1", 31, 42, 0.9),
+                RecognizerResult("Ent2", 25, 40, 0.8)
+            ],
+            ConflictResolutionStrategy.REMOVE_INTERSECTIONS,
+            EngineResult(
+                text=(
+                    "Fake card number 4151 3217 6243 3448.com "
+                    "that overlaps with nonexisting URL."
+                ),
+                items=[
+                    OperatorResult(31, 42, 'Ent1', ' 3448.com t', 'keep'),
+                    OperatorResult(17, 31, 'CREDIT_CARD', '4151 3217 6243', 'keep')
+                ]
+            )
+        )
+
+    ]
+    # fmt: on
+)
+def test_when_remove_intersections_conflict_selected_then_all_conflicts_handled(
+    text, analyzer_results, conflict_strategy, expected_result
+):
+    engine = AnonymizerEngine()
+    # Expected texts equal the input under "keep"; only the item spans differ.
+    operator_config = OperatorConfig("keep")
+    result = engine.anonymize(
+        text,
+        analyzer_results,
+        {"DEFAULT": operator_config},
+        conflict_resolution=conflict_strategy
+    )
+
+    assert result.text == expected_result.text
+    assert sorted(result.items) == sorted(expected_result.items)