Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved the logic of conflict handling in AnonymizerEngine #1196

Merged
merged 31 commits into from
Jan 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
a05610a
Improved the logic of _remove_conflicts_and_get_text_manipulation_dat…
VMD7 Oct 28, 2023
dd3fb7d
Resolved the linting errors
VMD7 Nov 3, 2023
0dadff6
Merge branch 'microsoft:main' into testing-anonymizer-engine
VMD7 Nov 3, 2023
28ab0fc
Merge branch 'main' into testing-anonymizer-engine
omri374 Nov 3, 2023
8b7d7ee
Added the test case for new logic of conflict handling
VMD7 Nov 4, 2023
a41d437
Modifying the test case named test_given_custom_anonymizer_then_we_ma…
VMD7 Nov 4, 2023
3b08b82
Merge branch 'main' into testing-anonymizer-engine
omri374 Dec 13, 2023
2d01950
Merge branch 'main' into testing-anonymizer-engine
omri374 Dec 21, 2023
9421c6e
Merge branch 'main' into testing-anonymizer-engine
omri374 Dec 26, 2023
a69729a
Revert "Modifying the test case named test_given_custom_anonymizer_th…
VMD7 Dec 27, 2023
5c59798
Merge branch 'testing-anonymizer-engine' of https://github.com/VMD7/p…
VMD7 Dec 27, 2023
45ba4e9
Merge branch 'microsoft:main' into testing-anonymizer-engine
VMD7 Dec 27, 2023
ed88bff
Merge branch 'testing-anonymizer-engine' of https://github.com/VMD7/p…
VMD7 Dec 27, 2023
7f1ef95
Created ConflictResolutionStrategy enum class
VMD7 Dec 27, 2023
6c58233
Added newly created ConflictResolutionStrategy enum class in entities…
VMD7 Dec 27, 2023
a729d92
Added the logic to handle the TEXT_AND_ENTITIES conflicts
VMD7 Dec 27, 2023
9936420
Added test cases covering various conflict resolution scenarios for t…
VMD7 Dec 27, 2023
4125feb
Merge branch 'main' into testing-anonymizer-engine
omri374 Dec 31, 2023
4a81c6a
Added one more test case where conflict_stragy=None
VMD7 Dec 31, 2023
e584729
Merge branch 'testing-anonymizer-engine' of https://github.com/VMD7/p…
VMD7 Dec 31, 2023
4622602
Utilizing score for entity span changes
VMD7 Jan 2, 2024
660a6f3
Merge branch 'microsoft:main' into testing-anonymizer-engine
VMD7 Jan 2, 2024
66ca4ff
Added all three conflict resolutions
VMD7 Jan 2, 2024
4e40570
Added the respective conflict resolutions logic
VMD7 Jan 2, 2024
2e0694e
Added all types of conflict resolution test cases
VMD7 Jan 2, 2024
2731375
Merge branch 'testing-anonymizer-engine' of https://github.com/VMD7/p…
VMD7 Jan 2, 2024
c9d539b
Removed the NONE option from conflict resoluiton strategy
VMD7 Jan 8, 2024
5c1e82e
Merge branch 'main' into testing-anonymizer-engine
omri374 Jan 8, 2024
09dc282
Update presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py
VMD7 Jan 9, 2024
489ebba
Handled linting error
VMD7 Jan 9, 2024
f8ca304
Removed extra space
VMD7 Jan 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions presidio-anonymizer/presidio_anonymizer/anonymizer_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
from typing import List, Dict, Optional

from presidio_anonymizer.core import EngineBase
from presidio_anonymizer.entities import OperatorConfig, RecognizerResult, EngineResult
from presidio_anonymizer.entities import (
OperatorConfig,
RecognizerResult,
EngineResult,
ConflictResolutionStrategy,
)
from presidio_anonymizer.operators import OperatorType

DEFAULT = "replace"
Expand All @@ -28,6 +33,9 @@ def anonymize(
text: str,
analyzer_results: List[RecognizerResult],
operators: Optional[Dict[str, OperatorConfig]] = None,
conflict_resolution: ConflictResolutionStrategy = (
ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED
)
) -> EngineResult:
"""Anonymize method to anonymize the given text.

Expand All @@ -37,6 +45,8 @@ def anonymize(
:param operators: The configuration of the anonymizers we would like
to use for each entity e.g.: {"PHONE_NUMBER":OperatorConfig("redact", {})}
received from the analyzer
:param conflict_resolution: The configuration designed to handle conflicts
among entities
:return: the anonymized text and a list of information about the
anonymized entities.

Expand Down Expand Up @@ -76,7 +86,7 @@ def anonymize(

"""
analyzer_results = self._remove_conflicts_and_get_text_manipulation_data(
analyzer_results
analyzer_results, conflict_resolution
)

merged_results = self._merge_entities_with_whitespace_between(
Expand All @@ -88,7 +98,9 @@ def anonymize(
return self._operate(text, merged_results, operators, OperatorType.Anonymize)

def _remove_conflicts_and_get_text_manipulation_data(
self, analyzer_results: List[RecognizerResult]
self,
analyzer_results: List[RecognizerResult],
conflict_resolution: ConflictResolutionStrategy
) -> List[RecognizerResult]:
"""
Iterate the list and create a sorted unique results list from it.
Expand Down Expand Up @@ -142,6 +154,32 @@ def _remove_conflicts_and_get_text_manipulation_data(
self.logger.debug(
f"removing element {result} from results list due to conflict"
)

# This further improves the quality of handling the conflict between the
# various overlapping entities. It does not drop the results; instead,
# it adjusts the start and end positions of overlapping results and removes
# all types of conflicts among entities as well as text.
if conflict_resolution == ConflictResolutionStrategy.REMOVE_INTERSECTIONS:
unique_text_metadata_elements.sort(key=lambda element: element.start)
elements_length = len(unique_text_metadata_elements)
index = 0
while index < elements_length - 1:
current_entity = unique_text_metadata_elements[index]
next_entity = unique_text_metadata_elements[index + 1]
if current_entity.end <= next_entity.start:
index += 1
else:
if current_entity.score >= next_entity.score:
next_entity.start = current_entity.end
else:
current_entity.end = next_entity.start
unique_text_metadata_elements.sort(
key=lambda element: element.start
)
unique_text_metadata_elements = [
element for element in unique_text_metadata_elements
if element.start <= element.end
]
return unique_text_metadata_elements

def _merge_entities_with_whitespace_between(
Expand Down
2 changes: 2 additions & 0 deletions presidio-anonymizer/presidio_anonymizer/entities/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Handles all the entities objects (structs) of the anonymizer."""
from .invalid_exception import InvalidParamException
from .conflict_resolution_strategy import ConflictResolutionStrategy
from .engine.pii_entity import PIIEntity
from .engine.operator_config import OperatorConfig
from .engine.recognizer_result import RecognizerResult
Expand All @@ -9,6 +10,7 @@

__all__ = [
"InvalidParamException",
"ConflictResolutionStrategy",
"PIIEntity",
"OperatorConfig",
"OperatorResult",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""An Enum class designed to manage all types of conflicts among entities or text."""
from enum import Enum


class ConflictResolutionStrategy(Enum):
    """Conflict resolution strategy.

    The strategy to use when there is a conflict between two entities.

    MERGE_SIMILAR_OR_CONTAINED: This default strategy resolves conflicts
    between similar or contained entities.
    REMOVE_INTERSECTIONS: Effectively resolves both intersection conflicts
    among entities and default strategy conflicts. Partially overlapping
    entities have their boundaries adjusted so that they no longer
    intersect; the lower-scored entity is the one that gets trimmed.
    """

    MERGE_SIMILAR_OR_CONTAINED = "merge_similar_or_contained"
    REMOVE_INTERSECTIONS = "remove_intersections"
165 changes: 165 additions & 0 deletions presidio-anonymizer/tests/test_conflict_resolution_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import pytest

from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import (
RecognizerResult,
OperatorConfig,
ConflictResolutionStrategy,
EngineResult,
OperatorResult,
)


@pytest.mark.parametrize(
    # fmt: off
    "text, analyzer_result1, analyzer_result2, conflict_strategy, expected_result",
    [
        # Partially overlapping spans under the default strategy: both spans
        # are kept as-is, so the shared "3448" shows up twice in the output.
        (
            ("Fake card number 4151 3217 6243 3448.com "
             "that overlaps with nonexisting URL."),
            RecognizerResult("CREDIT_CARD", 17, 36, 0.8),
            RecognizerResult("URL", 32, 40, 0.8),
            ConflictResolutionStrategy.MERGE_SIMILAR_OR_CONTAINED,
            ("Fake card number 4151 3217 6243 34483448.com "
             "that overlaps with nonexisting URL.")
        ),
        # Non-overlapping spans with no strategy supplied: text is unchanged.
        (
            "Fake text with SSN 145-45-6789 and phone number 953-555-5555.",
            RecognizerResult("SSN", 19, 30, 0.85),
            RecognizerResult("PHONE_NUMBER", 48, 60, 0.95),
            None,
            "Fake text with SSN 145-45-6789 and phone number 953-555-5555."
        ),
    ]
    # fmt: on
)
def test_when_merge_similar_or_contained_selected_then_default_conflict_handled(
    text, analyzer_result1, analyzer_result2, conflict_strategy, expected_result
):
    """Check the output text when intersections are not removed.

    Every entity is processed with the "keep" operator, so the resulting
    text only reflects how the engine handled the two recognizer results.
    """
    anonymizer = AnonymizerEngine()
    keep_all = {"DEFAULT": OperatorConfig("keep")}
    recognizer_results = [analyzer_result1, analyzer_result2]

    outcome = anonymizer.anonymize(
        text,
        recognizer_results,
        keep_all,
        conflict_resolution=conflict_strategy
    )

    assert outcome.text == expected_result


@pytest.mark.parametrize(
    # fmt: off
    "text, analyzer_results, conflict_strategy, expected_result",
    [
        # CREDIT_CARD entity has the higher score, so the URL entity is adjusted
        (
            (
                "Fake card number 4151 3217 6243 3448.com "
                "that overlaps with nonexisting URL."
            ),
            [
                RecognizerResult("CREDIT_CARD", 17, 36, 1),
                RecognizerResult("URL", 32, 40, 0.5)
            ],
            ConflictResolutionStrategy.REMOVE_INTERSECTIONS,
            EngineResult(
                text=(
                    "Fake card number 4151 3217 6243 3448.com "
                    "that overlaps with nonexisting URL."
                ),
                items=[
                    OperatorResult(17, 36, 'CREDIT_CARD',
                                   '4151 3217 6243 3448', 'keep'),
                    OperatorResult(36, 40, 'URL', '.com', 'keep')
                ]
            )
        ),
        # URL entity has the higher score, so the CREDIT_CARD entity is adjusted
        (
            (
                "Fake card number 4151 3217 6243 3448.com "
                "that overlaps with nonexisting URL."
            ),
            [
                RecognizerResult("CREDIT_CARD", 17, 36, 0.8),
                RecognizerResult("URL", 32, 40, 1)
            ],
            ConflictResolutionStrategy.REMOVE_INTERSECTIONS,
            EngineResult(
                text=(
                    "Fake card number 4151 3217 6243 3448.com "
                    "that overlaps with nonexisting URL."
                ),
                items=[
                    OperatorResult(17, 32, 'CREDIT_CARD', '4151 3217 6243 ', 'keep'),
                    OperatorResult(32, 40, 'URL', '3448.com', 'keep')
                ]
            )
        ),
        # Both entities have the same score, so the second entity is adjusted
        (
            (
                "Fake card number 4151 3217 6243 3448.com "
                "that overlaps with nonexisting URL."
            ),
            [
                RecognizerResult("CREDIT_CARD", 17, 36, 0.8),
                RecognizerResult("URL", 32, 40, 0.8)
            ],
            ConflictResolutionStrategy.REMOVE_INTERSECTIONS,
            EngineResult(
                text=(
                    "Fake card number 4151 3217 6243 3448.com "
                    "that overlaps with nonexisting URL."
                ),
                items=[
                    OperatorResult(17, 36, 'CREDIT_CARD',
                                   '4151 3217 6243 3448', 'keep'),
                    OperatorResult(36, 40, 'URL', '.com', 'keep')
                ]
            )
        ),
        # More than two entities intersecting with each other
        (
            (
                "Fake card number 4151 3217 6243 3448.com "
                "that overlaps with nonexisting URL."
            ),
            [
                RecognizerResult("CREDIT_CARD", 17, 36, 0.8),
                RecognizerResult("URL", 28, 40, 0.8),
                RecognizerResult("Ent1", 31, 42, 0.9),
                RecognizerResult("Ent2", 25, 40, 0.8)
            ],
            ConflictResolutionStrategy.REMOVE_INTERSECTIONS,
            EngineResult(
                text=(
                    "Fake card number 4151 3217 6243 3448.com "
                    "that overlaps with nonexisting URL."
                ),
                items=[
                    OperatorResult(31, 42, 'Ent1', ' 3448.com t', 'keep'),
                    OperatorResult(17, 31, 'CREDIT_CARD', '4151 3217 6243', 'keep')
                ]
            )
        )

    ]
    # fmt: on
)
def test_when_remove_intersections_conflict_selected_then_all_conflicts_handled(
    text, analyzer_results, conflict_strategy, expected_result
):
    """Check text and entity spans when intersections are removed.

    Every entity is processed with the "keep" operator, so the text stays
    unchanged and the assertions focus on the adjusted start/end offsets
    of the returned items.
    """
    engine = AnonymizerEngine()
    operator_config = OperatorConfig("keep")
    result = engine.anonymize(
        text,
        analyzer_results,
        {"DEFAULT": operator_config},
        conflict_resolution=conflict_strategy
    )

    assert result.text == expected_result.text
    # Item order is not part of the contract checked here, so compare sorted.
    assert sorted(result.items) == sorted(expected_result.items)