Skip to content

Commit

Permalink
Enable regex flags manipulation (#1193)
Browse files Browse the repository at this point in the history
  • Loading branch information
omri374 authored Oct 26, 2023
1 parent 4aaa05f commit b756c17
Show file tree
Hide file tree
Showing 10 changed files with 113 additions and 53 deletions.
18 changes: 18 additions & 0 deletions docs/analyzer/adding_recognizers.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,24 @@ results = analyzer.analyze(text=text,language="en")
print(results)
```

For pattern-based recognizers, it is possible to change the regex flags, either for
a single recognizer or for all of them.
For a single recognizer, pass the `global_regex_flags` parameter
to the `PatternRecognizer` constructor.
For all predefined pattern recognizers loaded by the registry, pass the `global_regex_flags` parameter to the `RecognizerRegistry` constructor:

<!--pytest-codeblocks:cont-->
```python
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

import regex as re

# Flags given to the registry are copied onto the predefined pattern
# recognizers it instantiates.
registry = RecognizerRegistry(global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
engine = AnalyzerEngine(registry=registry)
# `text` is the sample text defined in the previous code block.
results = engine.analyze(text=text, language="en")
print(results)
```


### Creating a new `EntityRecognizer` in code

To create a new recognizer via code:
Expand Down
2 changes: 2 additions & 0 deletions presidio-analyzer/presidio_analyzer/analysis_explanation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def __init__(
pattern: str = None,
validation_result: float = None,
textual_explanation: str = None,
regex_flags: int = None,
):

self.recognizer = recognizer
Expand All @@ -34,6 +35,7 @@ def __init__(
self.score_context_improvement = 0
self.supportive_context_word = ""
self.validation_result = validation_result
self.regex_flags = regex_flags

def __repr__(self):
"""Create string representation of the object."""
Expand Down
23 changes: 17 additions & 6 deletions presidio-analyzer/presidio_analyzer/pattern_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ class PatternRecognizer(LocalRecognizer):
:param context: list of context words
:param deny_list_score: confidence score for a term
identified using a deny-list
:param global_regex_flags: regex flags to be used in regex matching,
including deny-lists.
"""

def __init__(
Expand All @@ -37,9 +39,9 @@ def __init__(
deny_list: List[str] = None,
context: List[str] = None,
deny_list_score: float = 1.0,
global_regex_flags: Optional[int] = re.DOTALL | re.MULTILINE | re.IGNORECASE,
version: str = "0.0.1",
):

if not supported_entity:
raise ValueError("Pattern recognizer should be initialized with entity")

Expand All @@ -61,6 +63,7 @@ def __init__(
self.patterns = patterns
self.context = context
self.deny_list_score = deny_list_score
self.global_regex_flags = global_regex_flags

if deny_list:
deny_list_pattern = self._deny_list_to_regex(deny_list)
Expand All @@ -76,16 +79,16 @@ def analyze(
self,
text: str,
entities: List[str],
nlp_artifacts: NlpArtifacts = None,
regex_flags: int = None,
nlp_artifacts: Optional[NlpArtifacts] = None,
regex_flags: Optional[int] = None,
) -> List[RecognizerResult]:
"""
Analyzes text to detect PII using regular expressions or deny-lists.
:param text: Text to be analyzed
:param entities: Entities this recognizer can detect
:param nlp_artifacts: Output values from the NLP engine
:param regex_flags:
:param regex_flags: regex flags to be used in regex matching
:return:
"""
results = []
Expand Down Expand Up @@ -140,6 +143,7 @@ def build_regex_explanation(
pattern: str,
original_score: float,
validation_result: bool,
regex_flags: int,
) -> AnalysisExplanation:
"""
Construct an explanation for why this entity was detected.
Expand All @@ -149,6 +153,7 @@ def build_regex_explanation(
:param pattern: Regex pattern logic
:param original_score: Score given by the recognizer
:param validation_result: Whether validation was used and its result
:param regex_flags: Regex flags used in the regex matching
:return: Analysis explanation
"""
explanation = AnalysisExplanation(
Expand All @@ -157,6 +162,7 @@ def build_regex_explanation(
pattern_name=pattern_name,
pattern=pattern,
validation_result=validation_result,
regex_flags=regex_flags,
)
return explanation

Expand All @@ -172,7 +178,7 @@ def __analyze_patterns(
:param flags: regex flags
:return: A list of RecognizerResult
"""
flags = flags if flags else re.DOTALL | re.MULTILINE
flags = flags if flags else self.global_regex_flags
results = []
for pattern in self.patterns:
match_start_time = datetime.datetime.now()
Expand All @@ -197,7 +203,12 @@ def __analyze_patterns(

validation_result = self.validate_result(current_match)
description = self.build_regex_explanation(
self.name, pattern.name, pattern.regex, score, validation_result
self.name,
pattern.name,
pattern.regex,
score,
validation_result,
flags,
)
pattern_result = RecognizerResult(
entity_type=self.supported_entities[0],
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
from typing import Optional, List

from presidio_analyzer import Pattern, PatternRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts

import regex as re
from presidio_analyzer import Pattern, PatternRecognizer


class DateRecognizer(PatternRecognizer):
Expand Down Expand Up @@ -96,32 +93,3 @@ def __init__(
context=context,
supported_language=supported_language,
)

def analyze(
self,
text: str,
entities: List[str],
nlp_artifacts: NlpArtifacts = None,
regex_flags: int = None,
) -> List[RecognizerResult]:
"""
Analyzes text to detect PII using regular expressions or deny-lists.

Matching is always case-insensitive: re.IGNORECASE is added to the flags
before delegating to the parent class.

:param text: Text to be analyzed
:param entities: Entities this recognizer can detect
:param nlp_artifacts: Output values from the NLP engine
:param regex_flags: Regex flags to match with; re.IGNORECASE is OR-ed in.
When omitted (or falsy), re.DOTALL | re.MULTILINE | re.IGNORECASE is used
:return: The results returned by the parent class's analyze() call
"""
# Keep the caller's flags but force IGNORECASE; fall back to the full
# default set when no flags were supplied.
regex_flags = (
regex_flags | re.IGNORECASE
if regex_flags
else re.DOTALL | re.MULTILINE | re.IGNORECASE
) # noqa: E501

return super().analyze(
text=text,
entities=entities,
nlp_artifacts=nlp_artifacts,
regex_flags=regex_flags,
)
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,14 @@ def __init__(
self.replacement_pairs = replacement_pairs or [("-", ""), (" ", "")]
self.exact_match = exact_match
self.BOSEOS = bos_eos if exact_match else ()
self.flags = regex_flags
patterns = patterns if patterns else self.PATTERNS
context = context if context else self.CONTEXT
super().__init__(
supported_entity=supported_entity,
patterns=patterns,
context=context,
supported_language=supported_language,
global_regex_flags=regex_flags
)

def validate_result(self, pattern_text: str): # noqa D102
Expand Down Expand Up @@ -126,9 +126,10 @@ def __analyze_patterns(self, text: str, flags: int = None):
:param flags: regex flags
:return: A list of RecognizerResult
"""
flags = flags if flags else self.global_regex_flags
results = []
for pattern in self.patterns:
matches = re.finditer(pattern.regex, text, flags=self.flags)
matches = re.finditer(pattern.regex, text, flags=flags)

for match in matches:
for grp_num in reversed(range(1, len(match.groups()) + 1)):
Expand All @@ -148,7 +149,12 @@ def __analyze_patterns(self, text: str, flags: int = None):

validation_result = self.validate_result(current_match)
description = PatternRecognizer.build_regex_explanation(
self.name, pattern.name, pattern.regex, score, validation_result
self.name,
pattern.name,
pattern.regex,
score,
validation_result,
flags,
)
pattern_result = RecognizerResult(
entity_type=self.supported_entities[0],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import logging
from typing import Optional, List, Iterable, Union, Type, Dict

import regex as re

from pathlib import Path
from presidio_analyzer.nlp_engine.transformers_nlp_engine import (
TransformersNlpEngine,
Expand Down Expand Up @@ -53,13 +55,21 @@ class RecognizerRegistry:
:param recognizers: An optional list of recognizers,
that will be available instead of the predefined recognizers
:param global_regex_flags: regex flags to be used in regex matching,
including deny-lists
"""

def __init__(self, recognizers: Optional[Iterable[EntityRecognizer]] = None):
def __init__(
    self,
    recognizers: Optional[Iterable[EntityRecognizer]] = None,
    global_regex_flags: Optional[int] = re.DOTALL | re.MULTILINE | re.IGNORECASE,
):
    """
    Store the initial recognizers and the registry-wide regex flags.

    :param recognizers: An optional collection of recognizers to start with
    :param global_regex_flags: regex flags kept on the registry for use
    in regex matching
    """
    self.global_regex_flags = global_regex_flags
    # An omitted (or empty) collection falls back to a fresh empty list.
    self.recognizers = recognizers if recognizers else []

def load_predefined_recognizers(
self, languages: Optional[List[str]] = None, nlp_engine: NlpEngine = None
Expand Down Expand Up @@ -112,10 +122,18 @@ def load_predefined_recognizers(
],
}
for lang in languages:
lang_recognizers = [rc() for rc in recognizers_map.get(lang, [])]
lang_recognizers = [
self.__instantiate_recognizer(
recognizer_class=rc, supported_language=lang
)
for rc in recognizers_map.get(lang, [])
]
self.recognizers.extend(lang_recognizers)
all_recognizers = [
rc(supported_language=lang) for rc in recognizers_map.get("ALL", [])
self.__instantiate_recognizer(
recognizer_class=rc, supported_language=lang
)
for rc in recognizers_map.get("ALL", [])
]
self.recognizers.extend(all_recognizers)
if nlp_engine:
Expand Down Expand Up @@ -283,3 +301,18 @@ def add_recognizers_from_yaml(self, yml_path: Union[str, Path]) -> None:
except TypeError as yaml_error:
print(f"Failed to parse file {yml_path}")
raise yaml_error

def __instantiate_recognizer(
    self, recognizer_class: Type[EntityRecognizer], supported_language: str
):
    """
    Build a recognizer of the given class for the given language.

    Pattern-based recognizers additionally receive the registry's
    global regex flags.

    :param recognizer_class: Class object of the recognizer
    :param supported_language: Language this recognizer should support
    :return: The newly created recognizer instance
    """
    recognizer = recognizer_class(supported_language=supported_language)
    if not isinstance(recognizer, PatternRecognizer):
        return recognizer

    # Override whatever flags the recognizer was constructed with.
    recognizer.global_regex_flags = self.global_regex_flags
    return recognizer

This file was deleted.

5 changes: 2 additions & 3 deletions presidio-analyzer/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,16 @@
"presidio_analyzer",
"presidio_analyzer.predefined_recognizers",
"presidio_analyzer.nlp_engine",
"presidio_analyzer.recognizer_registry",
"presidio_analyzer.context_aware_enhancers",
],
trusted_host=["pypi.org"],
tests_require=["pytest", "flake8>=3.7.9"],
install_requires=[
"spacy>=3.4.4",
"spacy>=3.4.4, <4.0.0",
"regex",
"tldextract",
"pyyaml",
"phonenumbers>=8.12",
"phonenumbers>=8.12,<9.0.0",
],
extras_require={
"transformers": ["spacy_huggingface_pipelines"],
Expand Down
20 changes: 19 additions & 1 deletion presidio-analyzer/tests/test_pattern_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test_deny_list_score_change():

@pytest.mark.parametrize(
"text,flag,expected_len",
[("mrs. Kennedy", re.IGNORECASE, 1), ("mrs. Kennedy", None, 0)],
[("mrs. Kennedy", re.IGNORECASE, 1), ("mrs. Kennedy", re.DOTALL, 0)],
)
def test_deny_list_regex_flags(text, flag, expected_len):
deny_list = ["Mr.", "Mrs."]
Expand All @@ -201,3 +201,21 @@ def test_empty_deny_list_raises_value_error():
supported_language="en",
deny_list=[],
)


@pytest.mark.parametrize(
    "global_flag,expected_len",
    [(re.IGNORECASE | re.MULTILINE, 2), (re.MULTILINE, 0)],
)
def test_global_regex_flag_deny_list_returns_right_result(global_flag, expected_len):
    """Deny-list matching must honour the recognizer's global regex flags."""
    # Neither deny-list term matches the casing used in the text, so hits are
    # only expected when IGNORECASE is among the flags.
    titles_recognizer = PatternRecognizer(
        supported_entity="TITLE",
        name="TitlesRecognizer",
        deny_list=["MrS", "mR"],
        global_regex_flags=global_flag,
    )
    text = "Mrs. smith \n\nand Mr. Jones were sitting in the room."

    detected = titles_recognizer.analyze(text=text, entities=["TITLE"])

    assert len(detected) == expected_len
9 changes: 9 additions & 0 deletions presidio-analyzer/tests/test_recognizer_registry.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path

import pytest
import regex as re

from presidio_analyzer import (
RecognizerRegistry,
Expand Down Expand Up @@ -204,3 +205,11 @@ def test_recognizer_registry_exception_erroneous_yaml():
with pytest.raises(TypeError):
registry = RecognizerRegistry()
registry.add_recognizers_from_yaml(test_yaml)


def test_predefined_pattern_recognizers_have_the_right_regex_flags():
    """Predefined pattern recognizers must carry the registry's regex flags."""
    expected_flags = re.DOTALL
    registry = RecognizerRegistry(global_regex_flags=expected_flags)
    registry.load_predefined_recognizers()

    # Only pattern-based recognizers are checked for the flags attribute.
    pattern_recognizers = [
        recognizer
        for recognizer in registry.recognizers
        if isinstance(recognizer, PatternRecognizer)
    ]
    for recognizer in pattern_recognizers:
        assert recognizer.global_regex_flags == expected_flags

0 comments on commit b756c17

Please sign in to comment.