From a59d67be3af8add1fc5c38b4fbf8832cdf2eba94 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 22:34:23 +0300 Subject: [PATCH 01/67] integrating spacy-huggingface-pipeliens and refactoring NlpEngine logic --- docs/analyzer/customizing_nlp_models.md | 7 +- presidio-analyzer/Pipfile | 4 +- presidio-analyzer/conf/default.yaml | 23 ++ presidio-analyzer/conf/spacy.yaml | 17 ++ .../conf/spacy_multilingual.yaml | 16 ++ presidio-analyzer/conf/stanza.yaml | 15 ++ .../conf/stanza_multilingual.yaml | 15 ++ presidio-analyzer/conf/transformers.yaml | 32 ++- .../presidio_analyzer/analyzer_engine.py | 3 + .../presidio_analyzer/nlp_engine/__init__.py | 2 + .../nlp_engine/ner_model_configuration.py | 162 ++++++++++++ .../nlp_engine/nlp_artifacts.py | 16 +- .../nlp_engine/nlp_engine.py | 15 +- .../nlp_engine/nlp_engine_provider.py | 14 +- .../nlp_engine/spacy_nlp_engine.py | 137 ++++++++-- .../nlp_engine/stanza_nlp_engine.py | 21 +- .../nlp_engine/transformers_nlp_engine.py | 234 +++++++++--------- .../predefined_recognizers/__init__.py | 9 +- .../spacy_recognizer.py | 102 ++++---- .../transformers_recognizer.py | 78 ++---- .../recognizer_registry.py | 13 +- presidio-analyzer/setup.cfg | 2 +- presidio-analyzer/setup.py | 3 +- presidio-analyzer/tests/conf/default.yaml | 24 +- presidio-analyzer/tests/conftest.py | 30 ++- .../tests/mocks/nlp_engine_mock.py | 14 +- .../tests/test_analyzer_engine.py | 1 - .../tests/test_context_support.py | 5 - .../tests/test_nlp_engine_provider.py | 200 ++++++++++++--- .../tests/test_spacy_recognizer.py | 8 +- .../tests/test_stanza_recognizer.py | 3 +- 31 files changed, 902 insertions(+), 323 deletions(-) create mode 100644 presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py diff --git a/docs/analyzer/customizing_nlp_models.md b/docs/analyzer/customizing_nlp_models.md index 3e67934c4..16abd2aab 100644 --- a/docs/analyzer/customizing_nlp_models.md +++ b/docs/analyzer/customizing_nlp_models.md @@ -17,7 +17,10 @@ In addition, other types of NLP frameworks [can be integrated into Presidio](dev ## Configure Presidio to use the new model -Configuration can be done in two ways: +Configuration can be done in three ways: + +- **Via the `NlpEngineConfiguration` object: + - **Via code**: Create an `NlpEngine` using the `NlpEnginerProvider` class, and pass it to the `AnalyzerEngine` as input: @@ -69,7 +72,7 @@ Configuration can be done in two ways: ```python from presidio_analyzer import AnalyzerEngine, RecognizerRegistry - from presidio_analyzer.nlp_engine import NlpEngineProvider + from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpEngineConfiguration LANGUAGES_CONFIG_FILE = "./docs/analyzer/languages-config.yml" diff --git a/presidio-analyzer/Pipfile b/presidio-analyzer/Pipfile index d11e89c0a..0a4803e90 100644 --- a/presidio-analyzer/Pipfile +++ b/presidio-analyzer/Pipfile @@ -4,16 +4,18 @@ verify_ssl = true name = "pypi" [packages] -spacy = ">=3.4.4" +spacy = ">=3.4.4, <4.0.0" regex = "*" tldextract = "*" flask = ">=1.1" pyyaml = "*" phonenumbers = ">=8.12" typing-extensions = "*" +spacy-huggingface-pipelines = "*" [dev-packages] pytest = "*" +pytest-mock = "*" flake8= {version = ">=3.7.9"} pep8-naming = "*" flake8-docstrings = "*" diff --git a/presidio-analyzer/conf/default.yaml b/presidio-analyzer/conf/default.yaml index 92c163441..ee1504276 100644 --- a/presidio-analyzer/conf/default.yaml +++ b/presidio-analyzer/conf/default.yaml @@ -3,3 +3,26 @@ models: - lang_code: en model_name: en_core_web_lg +ner_model_configuration: +- 
model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + LOC: LOCATION + LOCATION: LOCATION + GPE: LOCATION + ORG: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME + NORP: NRP +- aggregation_strategy: simple # "simple", "first", "average", "max" +- stride: 16 # If stride >= 0, process long texts in + # overlapping windows of the model max + # length. The value is the length of the + # window overlap in transformer tokenizer + # tokens, NOT the length of the stride. +- alignment_mode: strict # "strict", "contract", "expand" +- labels_to_ignore: ["O"] +- low_confidence_score_multiplier: 0.4 +- low_score_entity_names: + - ORGANIZATION + - ORG \ No newline at end of file diff --git a/presidio-analyzer/conf/spacy.yaml b/presidio-analyzer/conf/spacy.yaml index 92c163441..2b95bd4ad 100644 --- a/presidio-analyzer/conf/spacy.yaml +++ b/presidio-analyzer/conf/spacy.yaml @@ -3,3 +3,20 @@ models: - lang_code: en model_name: en_core_web_lg +ner_model_configuration: + - model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + LOC: LOCATION + LOCATION: LOCATION + GPE: LOCATION + ORG: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME + NORP: NRP + + - low_confidence_score_multiplier: 0.4 + - low_score_entity_names: + - ORGANIZATION + - ORG + - default_score: 0.85 \ No newline at end of file diff --git a/presidio-analyzer/conf/spacy_multilingual.yaml b/presidio-analyzer/conf/spacy_multilingual.yaml index de4868f73..2cf1d99da 100644 --- a/presidio-analyzer/conf/spacy_multilingual.yaml +++ b/presidio-analyzer/conf/spacy_multilingual.yaml @@ -9,3 +9,19 @@ models: - lang_code: es model_name: es_core_news_md +ner_model_configuration: + - model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + LOC: LOCATION + LOCATION: LOCATION + GPE: LOCATION + ORG: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME + NORP: NRP + + - low_confidence_score_multiplier: 0.4 + - low_score_entity_names: + - ORGANIZATION + - ORG \ No newline at end of file diff --git a/presidio-analyzer/conf/stanza.yaml b/presidio-analyzer/conf/stanza.yaml index 7d8090e4a..a18bd18b5 100644 --- a/presidio-analyzer/conf/stanza.yaml +++ b/presidio-analyzer/conf/stanza.yaml @@ -4,3 +4,18 @@ models: lang_code: en model_name: en +ner_model_configuration: + - model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + LOC: LOCATION + LOCATION: LOCATION + GPE: LOCATION + ORG: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME + NORP: NRP + + - low_confidence_score_multiplier: 0.4 + - low_score_entity_names: + - diff --git a/presidio-analyzer/conf/stanza_multilingual.yaml b/presidio-analyzer/conf/stanza_multilingual.yaml index d0e02e39c..5ad2ca67d 100644 --- a/presidio-analyzer/conf/stanza_multilingual.yaml +++ b/presidio-analyzer/conf/stanza_multilingual.yaml @@ -7,3 +7,18 @@ models: lang_code: de model_name: de +ner_model_configuration: + - model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + LOC: LOCATION + LOCATION: LOCATION + GPE: LOCATION + ORG: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME + NORP: NRP + + - low_confidence_score_multiplier: 0.4 + - low_score_entity_names: + - diff --git a/presidio-analyzer/conf/transformers.yaml b/presidio-analyzer/conf/transformers.yaml index c9edcc0a6..644fdf25a 100644 --- a/presidio-analyzer/conf/transformers.yaml +++ b/presidio-analyzer/conf/transformers.yaml @@ -4,4 +4,34 @@ models: lang_code: en model_name: spacy: en_core_web_sm - transformers: elastic/distilbert-base-uncased-finetuned-conll03-english + transformers: StanfordAIMI/stanford-deidentifier-base + 
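+# A sketch of how a configuration file like this one is typically consumed
+# (the conf_file path is illustrative):
+#
+#   from presidio_analyzer.nlp_engine import NlpEngineProvider
+#
+#   provider = NlpEngineProvider(conf_file="presidio-analyzer/conf/transformers.yaml")
+#   nlp_engine = provider.create_engine()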
+ner_model_configuration: + labels_to_ignore: + - O + aggregation_strategy: simple # "simple", "first", "average", "max" + stride: 16 # If stride >= 0, process long texts in + # overlapping windows of the model max + # length. The value is the length of the + # window overlap in transformer tokenizer + # tokens, NOT the length of the stride. + alignment_mode: strict # "strict", "contract", "expand" + model_to_presidio_entity_mapping: + PER: PERSON + LOC: LOCATION + ORG: ORGANIZATION + AGE: AGE + ID: ID + EMAIL: EMAIL + PATIENT: PERSON + STAFF: PERSON + HOSP: ORGANIZATION + PATORG: ORGANIZATION + DATE: DATE_TIME + PHONE: PHONE_NUMBER + HCW: PERSON + HOSPITAL: ORGANIZATION + + low_confidence_score_multiplier: 0.4 + low_score_entity_names: + - ID \ No newline at end of file diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine.py b/presidio-analyzer/presidio_analyzer/analyzer_engine.py index 394afba5a..a67bfed8c 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_engine.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_engine.py @@ -67,6 +67,9 @@ def __init__( self.supported_languages = supported_languages self.nlp_engine = nlp_engine + if not self.nlp_engine.is_loaded(): + self.nlp_engine.load() + self.registry = registry # load all recognizers diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/__init__.py b/presidio-analyzer/presidio_analyzer/nlp_engine/__init__.py index 154be672e..6db10007b 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/__init__.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/__init__.py @@ -1,5 +1,6 @@ """NLP engine package. Performs text pre-processing.""" +from .ner_model_configuration import NerModelConfiguration from .nlp_artifacts import NlpArtifacts from .nlp_engine import NlpEngine from .spacy_nlp_engine import SpacyNlpEngine @@ -8,6 +9,7 @@ from .nlp_engine_provider import NlpEngineProvider __all__ = [ + "NerModelConfiguration", "NlpArtifacts", "NlpEngine", "SpacyNlpEngine", diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py new file mode 100644 index 000000000..f4fc61abb --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -0,0 +1,162 @@ +import json +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Optional, Union, Collection + +import yaml + +logger = logging.getLogger("presidio-analyzer") + +MODEL_TO_PRESIDIO_ENTITY_MAPPING = dict( + PER="PERSON", + PERSON="PERSON", + LOC="LOCATION", + LOCATION="LOCATION", + GPE="LOCATION", + ORG="ORGANIZATION", + DATE="DATE_TIME", + TIME="DATE_TIME", + NORP="NRP", + AGE="AGE", + ID="ID", + EMAIL="EMAIL", + PATIENT="PERSON", + STAFF="PERSON", + HOSP="ORGANIZATION", + PATORG="ORGANIZATION", + PHONE="PHONE_NUMBER", + HCW="PERSON", + HOSPITAL="ORGANIZATION", +) + +LOW_SCORE_ENTITY_NAMES = {"ORG", "ORGANIZATION"} +LABELS_TO_IGNORE = {"O"} + + +@dataclass +class NerModelConfiguration: + """NER model configuration. + + :param nlp_engine_name: Name of the NLP engine to use. + :param labels_to_ignore: List of labels to not return predictions for. 
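+        For example: ["O", "MISC"] (label names are illustrative).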
+ :param aggregation_strategy: + See https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.TokenClassificationPipeline.aggregation_strategy + :param stride: + See https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.TokenClassificationPipeline.stride + :param alignment_mode: See https://spacy.io/api/doc#char_span + :param default_score: Default confidence score if the model does not provide one. + :param model_to_presidio_entity_mapping: + Mapping between the NER model entities and Presidio entities. + :param low_score_entity_names: + Set of entity names that are likely to have low detection accuracy that should be adjusted. + :param low_confidence_score_multiplier: A multiplier for the score given for low_score_entity_names. + Multiplier to the score given for low_score_entity_names. + """ # noqa E501 + + nlp_engine_name: str + labels_to_ignore: Optional[Collection[str]] = None + aggregation_strategy: Optional[str] = "simple" + stride: Optional[int] = 14 + alignment_mode: Optional[str] = "strict" + default_score: Optional[float] = 0.85 + model_to_presidio_entity_mapping: Optional[Dict[str, str]] = None + low_score_entity_names: Optional[Collection] = None + low_confidence_score_multiplier: Optional[float] = 0.4 + + def __post_init__(self): + if self.model_to_presidio_entity_mapping is None: + self.model_to_presidio_entity_mapping = MODEL_TO_PRESIDIO_ENTITY_MAPPING + if self.low_score_entity_names is None: + self.low_score_entity_names = LOW_SCORE_ENTITY_NAMES + if self.labels_to_ignore is None: + self.labels_to_ignore = LABELS_TO_IGNORE + + @classmethod + def _validate_input(cls, nlp_engine_configuration: Dict): + if "nlp_engine_name" not in nlp_engine_configuration: + raise ValueError("nlp_engine_name is a required parameter") + if "labels_to_ignore" in nlp_engine_configuration: + if not isinstance(nlp_engine_configuration["labels_to_ignore"], list): + raise ValueError("labels_to_ignore must be a list") + if "aggregation_strategy" in nlp_engine_configuration: + if not isinstance(nlp_engine_configuration["aggregation_strategy"], str): + raise ValueError("aggregation_strategy must be a string") + if "alignment_mode" in nlp_engine_configuration: + if not isinstance(nlp_engine_configuration["alignment_mode"], str): + raise ValueError("alignment_mode must be a string") + if "stride" in nlp_engine_configuration: + if not isinstance(nlp_engine_configuration["stride"], int): + raise ValueError("stride must be an integer") + if "model_to_presidio_entity_mapping" in nlp_engine_configuration: + if not isinstance( + nlp_engine_configuration["model_to_presidio_entity_mapping"], dict + ): + raise ValueError("model_to_presidio_entity_mapping must be a dict") + if "low_score_entity_names" in nlp_engine_configuration: + if not isinstance(nlp_engine_configuration["low_score_entity_names"], list): + raise ValueError("low_score_entity_names must be a list") + if "low_confidence_score_multiplier" in nlp_engine_configuration: + if not isinstance( + nlp_engine_configuration["low_confidence_score_multiplier"], float + ): + raise ValueError("low_confidence_score_multiplier must be a float") + + @classmethod + def from_yaml(cls, yaml_file: Union[Path, str]) -> "NerModelConfiguration": + """Load NLP engine configuration from yaml file. 
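+
+        A usage sketch, assuming the yaml file holds only the fields of this
+        dataclass (including the required nlp_engine_name):
+
+            conf = NerModelConfiguration.from_yaml("ner_config.yaml")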
+ + :param yaml_file: Path to the yaml file.""" + + if not Path(yaml_file).exists(): + raise FileNotFoundError(f"configuration file {yaml_file} not found.") + + with open(yaml_file, "r") as f: + nlp_engine_configuration = yaml.safe_load(f) + + cls._validate_input(nlp_engine_configuration) + + return cls.from_dict(nlp_engine_configuration) + + @classmethod + def from_json(cls, json_file: Union[Path, str]) -> "NerModelConfiguration": + """Load NLP engine configuration from json file. + + :param json_file: Path to the json file.""" + + if not Path(json_file).exists(): + raise FileNotFoundError(f"configuration file {json_file} not found.") + + with open(json_file, "r") as f: + nlp_engine_configuration = json.load(f) + + cls._validate_input(nlp_engine_configuration) + + return cls.from_dict(nlp_engine_configuration) + + @classmethod + def from_dict(cls, config_dict: Dict) -> "NerModelConfiguration": + """Load NLP engine configuration from dict. + + :param config_dict: Dict with the configuration to load. + """ + return cls(**config_dict) + + def to_dict(self) -> Dict: + """Return the configuration as a dict.""" + return self.__dict__ + + @staticmethod + def get_full_conf_path( + default_conf_file: Union[Path, str] = "default.yaml" + ) -> Path: + """Return a Path to the default conf file. + + :param default_conf_file: Name of the default conf file.""" + return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file) + + def __str__(self) -> str: + return str(self.to_dict()) + + def __repr__(self) -> str: + return str(self) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py index d28624774..cc696950e 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py @@ -1,5 +1,5 @@ import json -from typing import List +from typing import List, Optional from spacy.tokens import Doc, Span @@ -18,15 +18,27 @@ def __init__( tokens: Doc, tokens_indices: List[int], lemmas: List[str], - nlp_engine, # noqa ANN001 + nlp_engine: "NlpEngine", # noqa F821 language: str, + scores: Optional[List[float]] = None ): + """ + :param entities: Identified entities + :param tokens: Tokenized text + :param tokens_indices: Indices of tokens + :param lemmas: List of lemmas in text + :param nlp_engine: NlpEngine object + :param language: Text language + :param scores: Entity confidence scores + """ + self.entities = entities self.tokens = tokens self.lemmas = lemmas self.tokens_indices = tokens_indices self.keywords = self.set_keywords(nlp_engine, lemmas, language) self.nlp_engine = nlp_engine + self.scores = scores if scores else [0.85] * len(entities) @staticmethod def set_keywords( diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py index 14262e267..410b17842 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Iterable, Iterator, Tuple +from typing import Iterable, Iterator, Tuple, Dict, List from presidio_analyzer.nlp_engine import NlpArtifacts @@ -12,6 +12,14 @@ class NlpEngine(ABC): on tokens. 
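+
+    A minimal usage sketch with a concrete implementation (the sample text
+    is illustrative; models fall back to the implementation's defaults):
+
+        engine = SpacyNlpEngine()
+        engine.load()
+        artifacts = engine.process_text("John lives in Seattle", language="en")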
""" + @abstractmethod + def load(self): + """Load the NLP model.""" + + @abstractmethod + def is_loaded(self) -> bool: + """Return True if the model is already loaded.""" + @abstractmethod def process_text(self, text: str, language: str) -> NlpArtifacts: """Execute the NLP pipeline on the given text and language.""" @@ -40,3 +48,8 @@ def is_punct(self, word: str, language: str) -> bool: (within the given language) """ + + @abstractmethod + def get_supported_entities(self) -> List[str]: + """Return the supported entities for this NLP engine.""" + pass \ No newline at end of file diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index 9873ce21a..f9fa72b65 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -8,7 +8,7 @@ StanzaNlpEngine, SpacyNlpEngine, NlpEngine, - TransformersNlpEngine, + TransformersNlpEngine, NerModelConfiguration, ) logger = logging.getLogger("presidio-analyzer") @@ -84,11 +84,11 @@ def create_engine(self) -> NlpEngine: ) try: nlp_engine_class = self.nlp_engines[nlp_engine_name] - nlp_engine_opts = { - m["lang_code"]: m["model_name"] - for m in self.nlp_configuration["models"] - } - engine = nlp_engine_class(nlp_engine_opts) + nlp_models = self.nlp_configuration["models"] + + ner_model_configuration = self.nlp_configuration.get("ner_model_params") + engine = nlp_engine_class(models=nlp_models, ner_model_configuration=ner_model_configuration) + engine.load() logger.info( f"Created NLP engine: {engine.engine_name}. " f"Loaded models: {list(engine.nlp.keys())}" @@ -121,4 +121,4 @@ def _get_full_conf_path( default_conf_file: Union[Path, str] = "default.yaml" ) -> Path: """Return a Path to the default conf file.""" - return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file) + return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file) \ No newline at end of file diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index 7bd3c4a99..7bc8afff5 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -3,9 +3,9 @@ import spacy from spacy.language import Language -from spacy.tokens import Doc +from spacy.tokens import Doc, Span, SpanGroup -from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine +from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine, NerModelConfiguration logger = logging.getLogger("presidio-analyzer") @@ -21,25 +21,75 @@ class SpacyNlpEngine(NlpEngine): engine_name = "spacy" is_available = bool(spacy) + DEFAULT_CONFIDENCE = 0.85 - def __init__(self, models: Optional[Dict[str, str]] = None): + def __init__( + self, + models: Optional[List[Dict[str, str]]] = None, + ner_model_configuration: Optional[NerModelConfiguration] = None, + ): """ Initialize a wrapper on spaCy functionality. :param models: Dictionary with the name of the spaCy model per language. - For example: models = {"en": "en_core_web_lg"} + For example: models = [{"lang_code": "en", "model_name": "en_core_web_lg"}] + :param ner_model_configuration: Parameters for the NER model. 
See conf/spacy.yaml for an example """ if not models: - models = {"en": "en_core_web_lg"} - logger.debug(f"Loading SpaCy models: {models.values()}") + models = [{"lang_code": "en", "model_name": "en_core_web_lg"}] + self.models = models + + if not ner_model_configuration: + ner_model_configuration = NerModelConfiguration(self.engine_name) + self.ner_model_configuration = ner_model_configuration + + self.nlp = None + + def load(self) -> None: + """Load the spaCy NLP model.""" + logger.debug(f"Loading SpaCy models: {self.models}") + + self.nlp = {} + # Download spaCy model if missing + for model in self.models: + self._validate_model_params(model) + self._download_spacy_model_if_needed(model["model_name"]) + self.nlp[model["lang_code"]] = spacy.load(model["model_name"]) + + @staticmethod + def _download_spacy_model_if_needed(model_name): + if not spacy.util.is_package(model_name): + logger.warning(f"Model {model_name} is not installed. Downloading...") + spacy.cli.download(model_name) + logger.info(f"Finished downloading model {model_name}") + + @staticmethod + def _validate_model_params(model: Dict): + if "lang_code" not in model: + raise ValueError("lang_code is missing from model configuration") + if "model_name" not in model: + raise ValueError("model_name is missing from model configuration") + if not isinstance(model["model_name"], str): + raise ValueError("model_name must be a string") + + def get_supported_entities(self) -> List[str]: + """Return the supported entities for this NLP engine.""" + if not self.ner_model_configuration.model_to_presidio_entity_mapping: + raise ValueError( + "model_to_presidio_entity_mapping is missing from model configuration" + ) + return list( + set(self.ner_model_configuration.model_to_presidio_entity_mapping.values()) + ) - self.nlp = { - lang_code: spacy.load(model_name, disable=["parser"]) - for lang_code, model_name in models.items() - } + def is_loaded(self) -> bool: + """Return True if the model is already loaded.""" + return self.nlp is not None def process_text(self, text: str, language: str) -> NlpArtifacts: """Execute the SpaCy NLP pipeline on the given text and language.""" + if not self.nlp: + raise ValueError("NLP engine is not loaded. Consider calling .load()") doc = self.nlp[language](text) return self._doc_to_nlp_artifact(doc, language) @@ -50,7 +100,17 @@ def process_batch( language: str, as_tuples: bool = False, ) -> Iterator[Optional[NlpArtifacts]]: - """Execute the NLP pipeline on a batch of texts using spacy pipe.""" + """Execute the NLP pipeline on a batch of texts using spacy pipe. + :param texts: A list of texts to process. + :param language: The language of the texts. + :param as_tuples: If set to True, inputs should be a sequence of + (text, context) tuples. Output will then be a sequence of + (doc, context) tuples. Defaults to False. + """ + + if not self.nlp: + raise ValueError("NLP engine is not loaded. Consider calling .load()") + texts = (str(text) for text in texts) docs = self.nlp[language].pipe(texts, as_tuples=as_tuples) for doc in docs: @@ -76,20 +136,67 @@ def get_nlp(self, language: str) -> Language: """ Return the language model loaded for a language. 
- :param language: Name of language - :return: Language model from spaCy + :param language: Language + :return: Model from spaCy """ return self.nlp[language] def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts: lemmas = [token.lemma_ for token in doc] tokens_indices = [token.idx for token in doc] - entities = doc.ents + + entities = self._get_entities(doc) + scores = entities.attrs["scores"] + + entities_as_spans = [ent for ent in entities] + return NlpArtifacts( - entities=entities, + entities=entities_as_spans, tokens=doc, tokens_indices=tokens_indices, lemmas=lemmas, nlp_engine=self, language=language, + scores=scores, ) + + def _get_entities(self, doc: Doc) -> SpanGroup: + """ + Get an updated list of entities based on the ner model configuration. + Remove entities that are in labels_to_ignore, + update entity names based on model_to_presidio_entity_mapping + :param doc: Output of a spaCy model + :return: SpanGroup holding on the entities and confidence scores + """ + output_spans = SpanGroup(doc, attrs={"scores": []}) + + mapping = self.ner_model_configuration.model_to_presidio_entity_mapping + for ent in doc.ents: + # Remove model labels in the ignore list + if ent.label_ in self.ner_model_configuration.labels_to_ignore: + continue + + # Update entity label based on mapping + if ent.label_ in mapping: + ent.label_ = mapping[ent.label_] + else: + logger.warning( + f"Entity {ent.label_} is not mapped to a Presidio entity, but keeping anyway" + ) + + # Remove presidio entities in the ignore list + if ent.label_ in self.ner_model_configuration.labels_to_ignore: + continue + + output_spans.append(ent) + + # Set default confidence (spaCy models don't have built in confidence scores) + score = self.DEFAULT_CONFIDENCE + + # Update score if entity is in low score entity names + if ent.label_ in self.ner_model_configuration.low_score_entity_names: + score *= self.ner_model_configuration.low_confidence_score_multiplier + + output_spans.attrs["scores"].append(score) + + return output_spans diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py index 75b597d59..cfe603950 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py @@ -1,4 +1,5 @@ import logging +from typing import Optional, Dict, List try: import stanza @@ -19,22 +20,20 @@ class StanzaNlpEngine(SpacyNlpEngine): on tokens. The StanzaNlpEngine uses spacy-stanza and stanza as its NLP module - :param models: Dictionary with the name of the stanza model per language. 
- For example: models = {"en": "en"} """ engine_name = "stanza" is_available = bool(stanza) - def __init__(self, models=None): # noqa ANN201 - if not models: - models = {"en": "en"} - logger.debug(f"Loading Stanza models: {models.values()}") + def load(self): + """Load the NLP model.""" - self.nlp = { - lang_code: spacy_stanza.load_pipeline( - model_name, + logger.debug(f"Loading Stanza models: {self.models}") + + self.nlp = {} + for model in self.models: + self._validate_model_params(model) + self.nlp[model["lang_code"]] = spacy_stanza.load_pipeline( + model["model_name"], processors="tokenize,pos,lemma,ner", ) - for lang_code, model_name in models.items() - } diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py index ee7cf029d..5032acc41 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py @@ -1,96 +1,45 @@ import logging -from typing import Optional, Dict +from typing import Optional, Dict, List import spacy -from spacy.language import Language -from spacy.tokens import Doc, Span - -from presidio_analyzer.nlp_engine import SpacyNlpEngine - +from spacy.tokens import Doc, Span, SpanGroup +from spacy.tokens.doc import SpanGroups try: - import torch + import spacy_huggingface_pipelines import transformers - from transformers import ( - AutoTokenizer, - AutoModelForTokenClassification, - pipeline, - ) except ImportError: - torch = None + spacy_huggingface_pipelines = None transformers = None -logger = logging.getLogger("presidio-analyzer") - - -@Language.factory( - "transformers", - default_config={"pretrained_model_name_or_path": "dslim/bert-base-NER"}, +from presidio_analyzer.nlp_engine import ( + SpacyNlpEngine, + NlpArtifacts, + NerModelConfiguration, ) -def create_transformer_component(nlp, name, pretrained_model_name_or_path: str): - """Spacy Language factory for creating custom component.""" - return TransformersComponent( - pretrained_model_name_or_path=pretrained_model_name_or_path - ) - - -class TransformersComponent: - """ - Custom component to use in spacy pipeline. - - Using HaggingFace transformers pretrained models for entity recognition. - :param pretrained_model_name_or_path: HaggingFace pretrained_model_name_or_path - """ - def __init__(self, pretrained_model_name_or_path: str) -> None: - Span.set_extension("confidence_score", default=1.0, force=True) - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path) - model = AutoModelForTokenClassification.from_pretrained( - pretrained_model_name_or_path - ) - self.nlp = pipeline( - "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple" - ) - - def __call__(self, doc: Doc) -> Doc: - """Write transformers results to doc entities.""" - - res = self.nlp(doc.text) - ents = [] - for d in res: - span = doc.char_span( - d["start"], d["end"], label=d["entity_group"], alignment_mode="expand" - ) - if span is not None: - span._.confidence_score = d["score"] - ents.append(span) - else: - logger.warning( - f"Transformers model returned {d} but no valid span was found." - ) - doc.ents = ents - return doc +logger = logging.getLogger("presidio-analyzer") class TransformersNlpEngine(SpacyNlpEngine): """ - SpacyTransformersNlpEngine is a transformers based NlpEngine. + TransformersNlpEngine is a transformers based NlpEngine. 
It comprises a spacy pipeline used for tokenization, lemmatization, pos, and a transformers component for NER. Both the underlying spacy pipeline and the transformers engine could be configured by the user. - - :param models: a dictionary containing the model names per language. + :param models: A dict holding the model's configuration. :example: - { - "en": { + [{"lang_code": "en", "model_name": { "spacy": "en_core_web_sm", "transformers": "dslim/bert-base-NER" - } - } + } + }] + :param ner_model_configuration: Parameters for the NER model. See conf/transformers.yaml for an example + Note that since the spaCy model is not used for NER, we recommend using a simple model, such as en_core_web_sm for English. @@ -98,57 +47,120 @@ class TransformersNlpEngine(SpacyNlpEngine): https://huggingface.co/models?pipeline_tag=token-classification It is further recommended to fine-tune these models to the specific scenario in hand. + """ engine_name = "transformers" - is_available = bool(spacy) and bool(transformers) + is_available = bool(spacy_huggingface_pipelines) - def __init__(self, models: Optional[Dict[str, Dict[str, str]]] = None): - # default models if not specified + def __init__( + self, + models: Optional[Dict] = None, + ner_model_configuration: Optional[NerModelConfiguration] = None, + ): if not models: - models = { - "en": {"spacy": "en_core_web_sm", "transformers": "dslim/bert-base-NER"} - } - # validate models type - elif type(models) is not dict: - logger.error(f"''models' argument must be dict, not {type(models)}") - raise KeyError(f"Expected 'models' argument to be dict, not {type(models)}") - # validate models[model_lang] type is dict for all model_lang - elif any( - [type(model_dict) is not dict for model_lang, model_dict in models.items()] - ): - # elif type(models["model_name"]) is not dict: - logger.error( - "'models.model_name' argument must be dict," - f"not {type(models['model_name'])}" - ) - raise KeyError( - "Expected 'models.model_name' argument to be dict," - f"not {type(models['model_name'])}" - ) - # chack that model_name dict includes the keys: "spacy" and "transformers" - elif any( - [ - any([key not in model_dict for key in ("spacy", "transformers")]) - for model_lang, model_dict in models.items() + models = [ + { + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_sm", + "transformers": "obi/deid_roberta_i2b2", + }, + } ] - ): - logger.error( - "'models.model_name' must contains 'spacy' and 'transformers' keys" - ) - raise KeyError( - "Expected keys ('spacy' and 'transformers') was not found in " - "models.model_name dict" - ) + super().__init__(models=models, ner_model_configuration=ner_model_configuration) + self.entity_key = "bert-base-ner" - logger.debug(f"Loading SpaCy and transformers models: {models.values()}") + def load(self) -> None: + """Load the spaCy and transformers models.""" + logger.debug(f"Loading SpaCy and transformers models: {self.models}") self.nlp = {} - for lang_code, model_name in models.items(): - nlp = spacy.load(model_name["spacy"], disable=["parser", "ner"]) + + for model in self.models: + self._validate_model_params(model) + spacy_model = model["model_name"]["spacy"] + transformers_model = model["model_name"]["transformers"] + self._download_spacy_model_if_needed(spacy_model) + + nlp = spacy.load(spacy_model, disable=["parser", "ner"]) nlp.add_pipe( - "transformers", - config={"pretrained_model_name_or_path": model_name["transformers"]}, - last=True, + "hf_token_pipe", + config={ + "model": transformers_model, + "annotate": 
"spans", + "stride": self.ner_model_configuration.stride, + "alignment_mode": self.ner_model_configuration.alignment_mode, + "aggregation_strategy": self.ner_model_configuration.aggregation_strategy, + "annotate_spans_key": self.entity_key, + }, + ) + self.nlp[model["lang_code"]] = nlp + + @staticmethod + def _validate_model_params(model: Dict): + if "lang_code" not in model: + raise ValueError("lang_code is missing from model configuration") + if "model_name" not in model: + raise ValueError("model_name is missing from model configuration") + if not isinstance(model["model_name"], dict): + raise ValueError("model_name must be a dictionary") + if "spacy" not in model["model_name"]: + raise ValueError("spacy model name is missing from model configuration") + if "transformers" not in model["model_name"]: + raise ValueError( + "transformers model name is missing from model configuration" ) - self.nlp[lang_code] = nlp + + def process_text(self, text: str, language: str) -> NlpArtifacts: + """Execute the SpaCy NLP pipeline on the given text and language.""" + if not self.nlp: + raise ValueError("NLP engine is not loaded. Consider calling .load()") + + doc = self.nlp[language](text) + return self._doc_to_nlp_artifact(doc, language) + + def _get_entities(self, doc: Doc) -> SpanGroup: + """ + Get an updated list of entities based on the ner model configuration. + Remove entities that are in labels_to_ignore, + update entity names based on model_to_presidio_entity_mapping + :param doc: Output of a spaCy model + :return: SpanGroup holding on the entities and confidence scores + """ + + current_ents = doc.spans[self.entity_key] + current_scores = doc.spans[self.entity_key].attrs["scores"] + + output_spans = SpanGroup(doc, attrs={"scores": []}) + + mapping = self.ner_model_configuration.model_to_presidio_entity_mapping + to_ignore = self.ner_model_configuration.labels_to_ignore + for i, ent in enumerate(current_ents): + # Remove model labels in the ignore list + if ent.label_ in to_ignore: + continue + + # Update entity label based on mapping + if ent.label_ in mapping: + ent.label_ = mapping[ent.label_] + else: + logger.warning( + f"Entity {ent.label_} is not mapped to a Presidio entity, but keeping anyway" + ) + + # Remove presidio entities in the ignore list + if ent.label_ in to_ignore: + continue + + output_spans.append(ent) + + score = current_scores[i] + # Update score if entity is in low score entity names + if ent.label_ in self.ner_model_configuration.low_score_entity_names: + score *= self.ner_model_configuration.low_confidence_score_multiplier + + # Update scores list + output_spans.attrs["scores"].append(score) + + return output_spans diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index 030e3d6cf..c976fcb0a 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -8,8 +8,14 @@ from .crypto_recognizer import CryptoRecognizer from .date_recognizer import DateRecognizer from .email_recognizer import EmailRecognizer +from .es_nif_recognizer import EsNifRecognizer from .iban_recognizer import IbanRecognizer from .ip_recognizer import IpRecognizer +from .it_driver_license_recognizer import ItDriverLicenseRecognizer +from .it_fiscal_code_recognizer import ItFiscalCodeRecognizer +from .it_identity_card_recognizer import ItIdentityCardRecognizer +from .it_passport_recognizer import 
ItPassportRecognizer +from .it_vat_code import ItVatCodeRecognizer from .medical_license_recognizer import MedicalLicenseRecognizer from .phone_recognizer import PhoneRecognizer from .sg_fin_recognizer import SgFinRecognizer @@ -34,6 +40,7 @@ from .it_passport_recognizer import ItPassportRecognizer from .in_pan_recognizer import InPanRecognizer + NLP_RECOGNIZERS = { "spacy": SpacyRecognizer, "stanza": StanzaRecognizer, @@ -72,5 +79,5 @@ "ItVatCodeRecognizer", "ItIdentityCardRecognizer", "ItPassportRecognizer", - "InPanRecognizer" + "InPanRecognizer", ] diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py index 2003ba7a4..09cf7683e 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py @@ -1,4 +1,5 @@ import logging +import warnings from typing import Optional, List, Tuple, Set from presidio_analyzer import ( @@ -11,32 +12,11 @@ class SpacyRecognizer(LocalRecognizer): - """ - Recognize PII entities using a spaCy NLP model. - - Since the spaCy pipeline is ran by the AnalyzerEngine, - this recognizer only extracts the entities from the NlpArtifacts - and replaces their types to align with Presidio's. - - :param supported_language: Language this recognizer supports - :param supported_entities: The entities this recognizer can detect - :param ner_strength: Default confidence for NER prediction - :param check_label_groups: Tuple containing Presidio entity names - and spaCy entity names, for verifying that the right entity - is translated into a Presidio entity. - """ - - ENTITIES = [ - "DATE_TIME", - "NRP", - "LOCATION", - "PERSON", - # "ORGANIZATION" - Less accurate with the 'en_core_web_lg' model, - # can be used with more assurance when using 'en_core_web_trf'. - ] + ENTITIES = ["DATE_TIME", "NRP", "LOCATION", "PERSON", "ORGANIZATION"] DEFAULT_EXPLANATION = "Identified as {} by Spacy's Named Entity Recognition" + # deprecated, use MODEL_TO_PRESIDIO_MAPPING in NerModelConfiguration instead CHECK_LABEL_GROUPS = [ ({"LOCATION"}, {"GPE", "LOC"}), ({"PERSON", "PER"}, {"PERSON", "PER"}), @@ -50,12 +30,34 @@ def __init__( supported_language: str = "en", supported_entities: Optional[List[str]] = None, ner_strength: float = 0.85, + default_explanation: Optional[str] = None, check_label_groups: Optional[List[Tuple[Set, Set]]] = None, context: Optional[List[str]] = None, ): + """ + Recognize PII entities using a spaCy NLP model. + + Since the spaCy pipeline is ran by the AnalyzerEngine, + this recognizer only extracts the entities from the NlpArtifacts + and replaces their types to align with Presidio's. 
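+
+        A usage sketch (assuming nlp_artifacts were produced beforehand by
+        the NlpEngine for the same text):
+
+            recognizer = SpacyRecognizer(supported_language="en")
+            results = recognizer.analyze(text, entities=["PERSON"],
+                                         nlp_artifacts=nlp_artifacts)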
+ + :param supported_language: Language this recognizer supports + :param supported_entities: The entities this recognizer can detect + :param ner_strength: Default confidence for NER prediction + :param check_label_groups: (DEPRECATED) Tuple containing Presidio entity names + :param default_explanation: Default explanation for the results when using return_decision_process=True + """ # noqa E501 + self.ner_strength = ner_strength - self.check_label_groups = ( - check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS + if check_label_groups: + warnings.warn( + "check_label_groups is deprecated and isn't used;entities are mapped in NerModelConfiguration", + DeprecationWarning, + 2, + ) + + self.default_explanation = ( + default_explanation if default_explanation else self.DEFAULT_EXPLANATION ) supported_entities = supported_entities if supported_entities else self.ENTITIES super().__init__( @@ -69,7 +71,7 @@ def load(self) -> None: # noqa D102 # preprocessed nlp artifacts pass - def build_spacy_explanation( + def build_explanation( self, original_score: float, explanation: str ) -> AnalysisExplanation: """ @@ -86,36 +88,36 @@ def build_spacy_explanation( ) return explanation - def analyze(self, text, entities, nlp_artifacts=None): # noqa D102 + def analyze(self, text:str, entities, nlp_artifacts=None): # noqa D102 results = [] if not nlp_artifacts: logger.warning("Skipping SpaCy, nlp artifacts not provided...") return results ner_entities = nlp_artifacts.entities + ner_scores = nlp_artifacts.scores - for entity in entities: - if entity not in self.supported_entities: + for ner_entity, ner_score in zip(ner_entities, ner_scores): + if ner_entity.label_ not in entities: + logger.debug(f"Skipping entity {ner_entity.label_} as it is not in the supported entities list") continue - for ent in ner_entities: - if not self.__check_label(entity, ent.label_, self.check_label_groups): - continue - textual_explanation = self.DEFAULT_EXPLANATION.format(ent.label_) - explanation = self.build_spacy_explanation( - self.ner_strength, textual_explanation - ) - spacy_result = RecognizerResult( - entity_type=entity, - start=ent.start_char, - end=ent.end_char, - score=self.ner_strength, - analysis_explanation=explanation, - recognition_metadata={ - RecognizerResult.RECOGNIZER_NAME_KEY: self.name, - RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id, - }, - ) - results.append(spacy_result) + + textual_explanation = self.DEFAULT_EXPLANATION.format( + ner_entity.label_ + ) + explanation = self.build_explanation(ner_score, textual_explanation) + spacy_result = RecognizerResult( + entity_type=ner_entity.label_, + start=ner_entity.start_char, + end=ner_entity.end_char, + score=ner_score, + analysis_explanation=explanation, + recognition_metadata={ + RecognizerResult.RECOGNIZER_NAME_KEY: self.name, + RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id, + }, + ) + results.append(spacy_result) return results @@ -123,6 +125,4 @@ def analyze(self, text, entities, nlp_artifacts=None): # noqa D102 def __check_label( entity: str, label: str, check_label_groups: Tuple[Set, Set] ) -> bool: - return any( - [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups] - ) + raise DeprecationWarning("__check_label is deprecated") diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py index 51360e39c..2134d5cfb 100644 --- 
a/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py @@ -1,5 +1,6 @@ -from typing import Tuple, Set import logging +from typing import List + from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer from presidio_analyzer import RecognizerResult @@ -8,65 +9,30 @@ class TransformersRecognizer(SpacyRecognizer): """ - Recognize entities using the transformers package. + Recognize entities using the spacy-huggingface-pipeline package. The recognizer doesn't run transformers models, but loads the output from the NlpArtifacts - See https://huggingface.co/docs/transformers/main/en/index - Uses the transformers package - (https://huggingface.co/docs/transformers/main/en/installation) to align - transformers interface with spaCy - """ - - def __init__(self, **kwargs): # noqa ANN003 + See: + - https://huggingface.co/docs/transformers/main/en/index for transformer models + - https://github.com/explosion/spacy-huggingface-pipelines on the spaCy wrapper to transformers + """ # noqa E501 + + ENTITIES = [ + "PERSON", + "LOCATION", + "ORGANIZATION", + "AGE", + "ID", + "EMAIL", + "DATE_TIME", + "PHONE_NUMBER", + ] + + LOW_SCORE_ENTITY_NAMES = {"ID"} + + def __init__(self, **kwargs): self.DEFAULT_EXPLANATION = self.DEFAULT_EXPLANATION.replace( "Spacy", "Transfromers" ) super().__init__(**kwargs) - - def analyze(self, text, entities, nlp_artifacts=None): # noqa D102 - results = [] - if not nlp_artifacts: - logger.warning("Skipping SpaCy, nlp artifacts not provided...") - return results - - ner_entities = nlp_artifacts.entities - - for entity in entities: - if entity not in self.supported_entities: - continue - for ent in ner_entities: - if not self.__check_label(entity, ent.label_, self.check_label_groups): - continue - if not ent.has_extension("confidence_score"): - raise ValueError( - "confidence score not available as a spaCy span extension " - "(ent._.confidence_score)" - ) - confidence_score = ent._.confidence_score - textual_explanation = self.DEFAULT_EXPLANATION.format(ent.label_) - explanation = self.build_spacy_explanation( - confidence_score, textual_explanation - ) - spacy_result = RecognizerResult( - entity_type=entity, - start=ent.start_char, - end=ent.end_char, - score=confidence_score, - analysis_explanation=explanation, - recognition_metadata={ - RecognizerResult.RECOGNIZER_NAME_KEY: self.name, - RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id, - }, - ) - results.append(spacy_result) - - return results - - @staticmethod - def __check_label( - entity: str, label: str, check_label_groups: Tuple[Set, Set] - ) -> bool: - return any( - [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups] - ) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 43ea8b244..d1bd52222 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -55,7 +55,6 @@ class RecognizerRegistry: """ def __init__(self, recognizers: Optional[Iterable[EntityRecognizer]] = None): - if recognizers: self.recognizers = recognizers else: @@ -75,6 +74,7 @@ def load_predefined_recognizers( languages = ["en"] nlp_recognizer = self._get_nlp_recognizer(nlp_engine) + recognizers_map = { "en": [ UsBankRecognizer, @@ -106,7 
+106,6 @@ def load_predefined_recognizers( IbanRecognizer, IpRecognizer, MedicalLicenseRecognizer, - nlp_recognizer, PhoneRecognizer, UrlRecognizer, ], @@ -118,11 +117,19 @@ def load_predefined_recognizers( rc(supported_language=lang) for rc in recognizers_map.get("ALL", []) ] self.recognizers.extend(all_recognizers) + if nlp_engine: + nlp_recognizer_inst = nlp_recognizer( + supported_language=lang, + supported_entities=nlp_engine.get_supported_entities() + ) + else: + nlp_recognizer_inst = nlp_recognizer(supported_language=lang) + self.recognizers.append(nlp_recognizer_inst) @staticmethod def _get_nlp_recognizer( nlp_engine: NlpEngine, - ) -> Union[Type[SpacyRecognizer], Type[StanzaRecognizer]]: + ) -> Type[SpacyRecognizer]: """Return the recognizer leveraging the selected NLP Engine.""" if not nlp_engine or type(nlp_engine) == SpacyNlpEngine: diff --git a/presidio-analyzer/setup.cfg b/presidio-analyzer/setup.cfg index 55dc798b4..6a510fa1c 100644 --- a/presidio-analyzer/setup.cfg +++ b/presidio-analyzer/setup.cfg @@ -7,4 +7,4 @@ exclude = dist, tests docstring-convention = numpy -extend-ignore = E203 D100 D202 ANN101 ANN102 ANN204 ANN203 \ No newline at end of file +extend-ignore = E203 D100 D202 ANN101 ANN102 ANN201 ANN204 ANN203 TC001 \ No newline at end of file diff --git a/presidio-analyzer/setup.py b/presidio-analyzer/setup.py index 5329a91d8..e616b55ef 100644 --- a/presidio-analyzer/setup.py +++ b/presidio-analyzer/setup.py @@ -39,7 +39,8 @@ "phonenumbers>=8.12", ], extras_require={ - 'transformers': ['torch', 'transformers'], + 'transformers': ['spacy_huggingface_pipelines'], + "stanza": ["stanza", "spacy_stanza"], }, include_package_data=True, license="MIT", diff --git a/presidio-analyzer/tests/conf/default.yaml b/presidio-analyzer/tests/conf/default.yaml index 68f0f0f75..ee1504276 100644 --- a/presidio-analyzer/tests/conf/default.yaml +++ b/presidio-analyzer/tests/conf/default.yaml @@ -3,4 +3,26 @@ models: - lang_code: en model_name: en_core_web_lg - +ner_model_configuration: +- model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + LOC: LOCATION + LOCATION: LOCATION + GPE: LOCATION + ORG: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME + NORP: NRP +- aggregation_strategy: simple # "simple", "first", "average", "max" +- stride: 16 # If stride >= 0, process long texts in + # overlapping windows of the model max + # length. The value is the length of the + # window overlap in transformer tokenizer + # tokens, NOT the length of the stride. 
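+              # e.g. with stride 16, consecutive windows share
+              # 16 tokenizer tokens.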
+- alignment_mode: strict # "strict", "contract", "expand" +- labels_to_ignore: ["O"] +- low_confidence_score_multiplier: 0.4 +- low_score_entity_names: + - ORGANIZATION + - ORG \ No newline at end of file diff --git a/presidio-analyzer/tests/conftest.py b/presidio-analyzer/tests/conftest.py index 04ee13828..5d5602787 100644 --- a/presidio-analyzer/tests/conftest.py +++ b/presidio-analyzer/tests/conftest.py @@ -51,18 +51,27 @@ def nlp_engines(request, nlp_engine_provider) -> Dict[str, NlpEngine]: nlp_engines = nlp_engine_provider.nlp_engines for name, engine_cls in nlp_engines.items(): if name == "spacy" and not request.config.getoption("--runfast"): - available_engines[f"{name}_en"] = engine_cls({"en": "en_core_web_lg"}) + available_engines[f"{name}_en"] = engine_cls( + models=[{"lang_code": "en", "model_name": "en_core_web_lg"}] + ) + elif name == "stanza" and not request.config.getoption("--runfast"): + available_engines[f"{name}_en"] = engine_cls( + models=[{"lang_code": "en", "model_name": "en"}] + ) elif name == "transformers" and not request.config.getoption("--runfast"): available_engines[f"{name}_en"] = engine_cls( - { - "en": { - "spacy": "en_core_web_lg", - "transformers": "dslim/bert-base-NER", - } - } + models=[{ + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_sm", + "transformers": "StanfordAIMI/stanford-deidentifier-base", + }, + }] ) else: - available_engines[f"{name}_en"] = engine_cls() + raise ValueError("Unsupported engine for tests") + # Load engine + available_engines[f"{name}_en"].load() return available_engines @@ -91,6 +100,11 @@ def max_score() -> float: return EntityRecognizer.MAX_SCORE +@pytest.fixture(scope="session") +def min_score() -> float: + return EntityRecognizer.MIN_SCORE + + @pytest.fixture(scope="module") def loaded_registry() -> RecognizerRegistry: return RecognizerRegistry() diff --git a/presidio-analyzer/tests/mocks/nlp_engine_mock.py b/presidio-analyzer/tests/mocks/nlp_engine_mock.py index 26dd7a654..5e8ab5568 100644 --- a/presidio-analyzer/tests/mocks/nlp_engine_mock.py +++ b/presidio-analyzer/tests/mocks/nlp_engine_mock.py @@ -1,4 +1,4 @@ -from typing import Iterable, Iterator, Tuple +from typing import Iterable, Iterator, Tuple, Dict, List from presidio_analyzer.nlp_engine import NlpEngine, NlpArtifacts @@ -12,6 +12,12 @@ def __init__(self, stopwords=None, punct_words=None, nlp_artifacts=None): else: self.nlp_artifacts = nlp_artifacts + def load(self): + pass + + def is_loaded(self) -> bool: + return True + def is_stopword(self, word, language): return word in self.stopwords @@ -27,3 +33,9 @@ def process_batch( texts = list(texts) for i in range(len(texts)): yield texts[i], self.nlp_artifacts + + def get_nlp_engine_configuration_as_dict(self) -> Dict: + return {} + + def get_supported_entities(self) -> List[str]: + pass diff --git a/presidio-analyzer/tests/test_analyzer_engine.py b/presidio-analyzer/tests/test_analyzer_engine.py index 3daa67fe3..746f2ae60 100644 --- a/presidio-analyzer/tests/test_analyzer_engine.py +++ b/presidio-analyzer/tests/test_analyzer_engine.py @@ -152,7 +152,6 @@ def test_when_analyze_added_pattern_recognizer_then_succeed(unit_test_guid): ) mock_recognizer_registry = RecognizerRegistryMock() - # Make sure the analyzer doesn't get this entity analyze_engine = AnalyzerEngine( registry=mock_recognizer_registry, diff --git a/presidio-analyzer/tests/test_context_support.py b/presidio-analyzer/tests/test_context_support.py index 124da5d21..3054b29ee 100644 --- a/presidio-analyzer/tests/test_context_support.py 
+++ b/presidio-analyzer/tests/test_context_support.py @@ -46,11 +46,6 @@ def recognizers_list(recognizers_map): return rec_list -@pytest.fixture(scope="module") -def nlp_engine(nlp_engines): - return nlp_engines["spacy_en"] - - @pytest.fixture(scope="module") def dataset(recognizers_map): """Loads up a group of sentences with relevant context words and creates diff --git a/presidio-analyzer/tests/test_nlp_engine_provider.py b/presidio-analyzer/tests/test_nlp_engine_provider.py index 0afe338d1..4fa92f109 100644 --- a/presidio-analyzer/tests/test_nlp_engine_provider.py +++ b/presidio-analyzer/tests/test_nlp_engine_provider.py @@ -1,16 +1,45 @@ +import json from pathlib import Path +from typing import Dict, List +from unittest.mock import patch import pytest import spacy +import yaml from presidio_analyzer.nlp_engine import ( SpacyNlpEngine, StanzaNlpEngine, NlpEngineProvider, + NerModelConfiguration, ) from presidio_analyzer.nlp_engine.transformers_nlp_engine import TransformersNlpEngine +@pytest.fixture(scope="session") +def nlp_configuration_dict() -> Dict: + nlp_configuration = { + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_lg", + "transformers": "StanfordAIMI/stanford-deidentifier-base", + }, + } + + return nlp_configuration + + +@pytest.fixture(scope="session") +def ner_model_configuration_dict() -> Dict: + ner_model_configuration = { + "nlp_engine_name": "transformers", + "aggregation_strategy": "simple", + "alignment_mode": "strict", + "low_score_entity_names": ["O"], + } + return ner_model_configuration + + def test_when_create_nlp_engine__then_return_default_configuration(): provider = NlpEngineProvider() engine = provider.create_engine() @@ -18,7 +47,12 @@ def test_when_create_nlp_engine__then_return_default_configuration(): assert engine.nlp is not None -def test_when_create_nlp_engine_then_simple_config_succeeds(mock_he_model): +def test_when_create_nlp_engine_then_simple_config_succeeds(mocker, mock_he_model): + mocker.patch( + "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed", + return_value=None, + ) + nlp_configuration = { "nlp_engine_name": "spacy", "models": [{"lang_code": "he", "model_name": "he_test"}], @@ -31,7 +65,14 @@ def test_when_create_nlp_engine_then_simple_config_succeeds(mock_he_model): assert isinstance(engine.nlp["he"], spacy.lang.he.Hebrew) -def test_when_create_nlp_engine_then_two_models_succeeds(mock_he_model, mock_bn_model): +def test_when_create_nlp_engine_then_two_models_succeeds( + mocker, mock_he_model, mock_bn_model +): + mocker.patch( + "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed", + return_value=None, + ) + nlp_configuration = { "nlp_engine_name": "spacy", "models": [ @@ -51,7 +92,11 @@ def test_when_create_nlp_engine_then_two_models_succeeds(mock_he_model, mock_bn_ assert isinstance(engine.nlp["bn"], spacy.lang.bn.Bengali) -def test_when_create_nlp_engine_from_wrong_conf_then_fail(): +def test_when_create_nlp_engine_from_wrong_conf_then_fail(mocker): + mocker.patch( + "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed", + return_value=None, + ) with pytest.raises(OSError): nlp_configuration = { "nlp_engine_name": "spacy", @@ -61,7 +106,11 @@ def test_when_create_nlp_engine_from_wrong_conf_then_fail(): provider.create_engine() -def test_when_unsupported_nlp_engine_then_fail(): +def test_when_unsupported_nlp_engine_then_fail(mocker): + mocker.patch( + "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed", + 
return_value=None, + ) with pytest.raises(ValueError) as e: unsupported_engine_name = "not exists" nlp_configuration = { @@ -70,11 +119,17 @@ def test_when_unsupported_nlp_engine_then_fail(): } provider = NlpEngineProvider(nlp_configuration=nlp_configuration) provider.create_engine() - assert (f"NLP engine '{unsupported_engine_name}' is not available. " - "Make sure you have all required packages installed") == e.value.args[0] + assert ( + f"NLP engine '{unsupported_engine_name}' is not available. " + "Make sure you have all required packages installed" + ) == e.value.args[0] -def test_when_read_test_nlp_conf_file_then_returns_spacy_nlp_engine(): +def test_when_read_test_nlp_conf_file_then_returns_spacy_nlp_engine(mocker): + mocker.patch( + "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed", + return_value=None, + ) test_conf_file = Path(Path(__file__).parent, "conf", "test.yaml") provider = NlpEngineProvider(conf_file=test_conf_file) nlp_engine = provider.create_engine() @@ -93,7 +148,12 @@ def test_when_read_test_nlp_conf_file_then_returns_stanza_nlp_engine(): assert nlp_engine.nlp is not None -def test_when_both_conf_and_config_then_fail(): +def test_when_both_conf_and_config_then_fail(mocker): + mocker.patch( + "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed", + return_value=None, + ) + nlp_configuration = { "nlp_engine_name": "spacy", "models": [{"lang_code": "he", "model_name": "he_test"}], @@ -105,15 +165,19 @@ def test_when_both_conf_and_config_then_fail(): @pytest.mark.skip_engine("transformers_en") -def test_when_create_transformers_nlp_engine_then_succeeds(): - nlp_configuration={ +def test_when_create_transformers_nlp_engine_then_succeeds(mocker): + mocker.patch( + "presidio_analyzer.nlp_engine.TransformersNlpEngine._download_spacy_model_if_needed", + return_value=None, + ) + nlp_configuration = { "nlp_engine_name": "transformers", "models": [ { "lang_code": "en", "model_name": { - "spacy": "en_core_web_sm", - "transformers": "dslim/bert-base-NER", + "spacy": "en_core_web_lg", + "transformers": "StanfordAIMI/stanford-deidentifier-base", }, } ], @@ -124,8 +188,15 @@ def test_when_create_transformers_nlp_engine_then_succeeds(): assert isinstance(engine.nlp["en"], spacy.lang.en.English) -def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_not_dict_then_fail(): - nlp_configuration={ +@pytest.mark.skip_engine("transformers_en") +def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_not_dict_then_fail( + mocker, +): + mocker.patch( + "presidio_analyzer.nlp_engine.SpacyNlpEngine._download_spacy_model_if_needed", + return_value=None, + ) + nlp_configuration = { "nlp_engine_name": "transformers", "models": [ { @@ -138,35 +209,82 @@ def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_not NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() -def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_keys_not_include_spacy_then_fail(): - nlp_configuration={ - "nlp_engine_name": "transformers", - "models": [ - { - "lang_code": "en", - "model_name": { # keys should contain transformers and spacy - "not_spacy": "en_core_web_sm", - "transformers": "dslim/bert-base-NER", - }, - } - ], - } +def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_keys_not_include_spacy_then_fail( + nlp_configuration_dict, +): + nlp_configuration = nlp_configuration_dict.copy() + del nlp_configuration["model_name"]["spacy"] + 
nlp_configuration["model_name"]["not_spacy"] = "ERROR" with pytest.raises(ValueError): NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() -def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_keys_not_include_transformers_then_fail(): - nlp_configuration={ - "nlp_engine_name": "transformers", - "models": [ - { - "lang_code": "en", - "model_name": { # keys should contain transformers and spacy - "spacy": "en_core_web_sm", - "not_transformers": "dslim/bert-base-NER", - }, - } - ], - } +def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_keys_not_include_transformers_then_fail( + nlp_configuration_dict, +): + nlp_configuration = nlp_configuration_dict.copy() + del nlp_configuration["model_name"]["transformers"] + nlp_configuration["model_name"]["not_transformers"] = "ERROR" with pytest.raises(ValueError): - NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() \ No newline at end of file + NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() + + +def test_ner_model_configuration_from_json( + ner_model_configuration_dict, tmp_path_factory +): + fn = tmp_path_factory.mktemp("data") / "nlp_configuration.json" + fn.write_text(json.dumps(ner_model_configuration_dict), "UTF-8") + + ner_model_configuration = NerModelConfiguration.from_json(fn.absolute()) + assert ner_model_configuration.nlp_engine_name == "transformers" + assert ( + ner_model_configuration.low_score_entity_names + == ner_model_configuration_dict["low_score_entity_names"] + ) + assert ( + ner_model_configuration.aggregation_strategy + == ner_model_configuration_dict["aggregation_strategy"] + ) + assert ( + ner_model_configuration.alignment_mode + == ner_model_configuration_dict["alignment_mode"] + ) + + +def test_nlp_model_configuration_from_yaml( + ner_model_configuration_dict, tmp_path_factory +): + fn = tmp_path_factory.mktemp("data") / "nlp_configuration.yaml" + fn.write_text(yaml.safe_dump(ner_model_configuration_dict), "UTF-8") + + ner_model_configuration = NerModelConfiguration.from_yaml(fn.absolute()) + assert ner_model_configuration.nlp_engine_name == "transformers" + assert ( + ner_model_configuration.low_score_entity_names + == ner_model_configuration_dict["low_score_entity_names"] + ) + assert ( + ner_model_configuration.aggregation_strategy + == ner_model_configuration_dict["aggregation_strategy"] + ) + assert ( + ner_model_configuration.alignment_mode + == ner_model_configuration_dict["alignment_mode"] + ) + + +def test_nlp_model_configuration_from_yaml_missing_field( + ner_model_configuration_dict, tmp_path_factory +): + fn = tmp_path_factory.mktemp("data") / "nlp_configuration.yaml" + del ner_model_configuration_dict["nlp_engine_name"] + fn.write_text(yaml.safe_dump(ner_model_configuration_dict), "UTF-8") + + with pytest.raises(ValueError): + NerModelConfiguration.from_yaml(fn.absolute()) + + +def test_nlp_engine_provider_init_through_nlp_engine_configuration(): + engine = NlpEngineProvider().create_engine() + assert isinstance(engine, SpacyNlpEngine) + assert engine.engine_name == "spacy" diff --git a/presidio-analyzer/tests/test_spacy_recognizer.py b/presidio-analyzer/tests/test_spacy_recognizer.py index 3fd5b31ba..68ce78fab 100644 --- a/presidio-analyzer/tests/test_spacy_recognizer.py +++ b/presidio-analyzer/tests/test_spacy_recognizer.py @@ -1,8 +1,8 @@ import pytest +from presidio_analyzer.nlp_engine import SpacyNlpEngine from tests import assert_result_within_score_range - @pytest.fixture(scope="module") def 
entities(): return ["PERSON", "DATE_TIME"] @@ -77,3 +77,9 @@ def test_when_person_in_text_then_person_full_name_complex_found( covered_text += text[sl] assert len(text) - len(covered_text) < 5 + + +def test_nlp_not_loaded_value_error(): + spacy_nlp = SpacyNlpEngine() + with pytest.raises(ValueError): + spacy_nlp.process_text("This should fail as the NLP model isn't loaded", language="en") diff --git a/presidio-analyzer/tests/test_stanza_recognizer.py b/presidio-analyzer/tests/test_stanza_recognizer.py index bd9e4b57b..fcab9755e 100644 --- a/presidio-analyzer/tests/test_stanza_recognizer.py +++ b/presidio-analyzer/tests/test_stanza_recognizer.py @@ -21,6 +21,7 @@ def nlp_recognizer(nlp_recognizers): def prepare_and_analyze(nlp, recognizer, text, ents): + nlp.load() nlp_artifacts = nlp.process_text(text, "en") results = recognizer.analyze(text, ents, nlp_artifacts) return results @@ -50,7 +51,7 @@ def prepare_and_analyze(nlp, recognizer, text, ents): # fmt: on ], ) -def test_when_using_stanze_then_all_stanza_result_correct( +def test_when_using_stanza_then_all_stanza_result_correct( text, expected_len, expected_positions, From cf44222015d842d11b3c85c655b4687145a454d2 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 22:40:48 +0300 Subject: [PATCH 02/67] Update languages-config.yml --- docs/analyzer/languages-config.yml | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/docs/analyzer/languages-config.yml b/docs/analyzer/languages-config.yml index 16c0e383d..bd2ef780e 100644 --- a/docs/analyzer/languages-config.yml +++ b/docs/analyzer/languages-config.yml @@ -3,6 +3,25 @@ models: - lang_code: en model_name: en_core_web_lg + - + lang_code: de + model_name: de_core_news_md - lang_code: es - model_name: es_core_news_md \ No newline at end of file + model_name: es_core_news_md +ner_model_configuration: + - model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + LOC: LOCATION + LOCATION: LOCATION + GPE: LOCATION + ORG: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME + NORP: NRP + + - low_confidence_score_multiplier: 0.4 + - low_score_entity_names: + - ORGANIZATION + - ORG From 7db2320caf44291fd0f55f8a813341524723f4b7 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 22:41:20 +0300 Subject: [PATCH 03/67] Update customizing_nlp_models.md --- docs/analyzer/customizing_nlp_models.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/analyzer/customizing_nlp_models.md b/docs/analyzer/customizing_nlp_models.md index 16abd2aab..9b3898a60 100644 --- a/docs/analyzer/customizing_nlp_models.md +++ b/docs/analyzer/customizing_nlp_models.md @@ -17,7 +17,7 @@ In addition, other types of NLP frameworks [can be integrated into Presidio](dev ## Configure Presidio to use the new model -Configuration can be done in three ways: +Configuration can be done in two ways: - **Via the `NlpEngineConfiguration` object: @@ -72,7 +72,7 @@ Configuration can be done in three ways: ```python from presidio_analyzer import AnalyzerEngine, RecognizerRegistry - from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpEngineConfiguration + from presidio_analyzer.nlp_engine import NlpEngineProvider LANGUAGES_CONFIG_FILE = "./docs/analyzer/languages-config.yml" From ede9f9aa767f963040e4a01e0b443715d2df4eb0 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 22:43:56 +0300 Subject: [PATCH 04/67] Update customizing_nlp_models.md --- docs/analyzer/customizing_nlp_models.md | 3 --- 1 file changed, 3 deletions(-) 
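The `ner_model_configuration` block introduced in the YAML above is consumed by the new `NerModelConfiguration` class. Below is a minimal sketch of loading such a configuration from file, mirroring the `from_yaml`/`from_json` tests earlier in this patch series; the file name and field values are illustrative only, not part of any shipped conf:

```python
# Sketch only: field names follow the NerModelConfiguration dataclass and the
# tests above; "ner_configuration.yaml" is a hypothetical file name.
import yaml

from presidio_analyzer.nlp_engine import NerModelConfiguration

conf = {
    "nlp_engine_name": "transformers",  # required; from_yaml/from_json validate it
    "aggregation_strategy": "simple",   # one of "simple", "first", "average", "max"
    "alignment_mode": "strict",         # one of "strict", "contract", "expand"
    "low_score_entity_names": ["ORGANIZATION", "ORG"],
}

with open("ner_configuration.yaml", "w") as f:
    yaml.safe_dump(conf, f)

ner_model_configuration = NerModelConfiguration.from_yaml("ner_configuration.yaml")
print(ner_model_configuration.aggregation_strategy)  # -> "simple"
```

Fields omitted from the file fall back to the defaults set in `__post_init__`, with a logged warning for the mapping-related ones.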
diff --git a/docs/analyzer/customizing_nlp_models.md b/docs/analyzer/customizing_nlp_models.md index 9b3898a60..3e67934c4 100644 --- a/docs/analyzer/customizing_nlp_models.md +++ b/docs/analyzer/customizing_nlp_models.md @@ -19,9 +19,6 @@ In addition, other types of NLP frameworks [can be integrated into Presidio](dev Configuration can be done in two ways: -- **Via the `NlpEngineConfiguration` object: - - - **Via code**: Create an `NlpEngine` using the `NlpEnginerProvider` class, and pass it to the `AnalyzerEngine` as input: ```python From d3edce804d84bd925895c9c32f9300f0b65926f2 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 22:44:35 +0300 Subject: [PATCH 05/67] Update default.yaml --- presidio-analyzer/conf/default.yaml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/presidio-analyzer/conf/default.yaml b/presidio-analyzer/conf/default.yaml index ee1504276..bcdf850bb 100644 --- a/presidio-analyzer/conf/default.yaml +++ b/presidio-analyzer/conf/default.yaml @@ -14,15 +14,7 @@ ner_model_configuration: DATE: DATE_TIME TIME: DATE_TIME NORP: NRP -- aggregation_strategy: simple # "simple", "first", "average", "max" -- stride: 16 # If stride >= 0, process long texts in - # overlapping windows of the model max - # length. The value is the length of the - # window overlap in transformer tokenizer - # tokens, NOT the length of the stride. -- alignment_mode: strict # "strict", "contract", "expand" -- labels_to_ignore: ["O"] - low_confidence_score_multiplier: 0.4 - low_score_entity_names: - ORGANIZATION - - ORG \ No newline at end of file + - ORG From 8508c005fbfef8520a0c819058fccb7a4d872b05 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 22:45:08 +0300 Subject: [PATCH 06/67] Update languages-config.yml --- docs/analyzer/languages-config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/analyzer/languages-config.yml b/docs/analyzer/languages-config.yml index bd2ef780e..fbd71caaa 100644 --- a/docs/analyzer/languages-config.yml +++ b/docs/analyzer/languages-config.yml @@ -25,3 +25,4 @@ ner_model_configuration: - low_score_entity_names: - ORGANIZATION - ORG + - default_score: 0.85 From 800843dab8aa26198a8884f9559f8bea1b112730 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 22:45:20 +0300 Subject: [PATCH 07/67] Update default.yaml --- presidio-analyzer/conf/default.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/presidio-analyzer/conf/default.yaml b/presidio-analyzer/conf/default.yaml index bcdf850bb..4d6000e7a 100644 --- a/presidio-analyzer/conf/default.yaml +++ b/presidio-analyzer/conf/default.yaml @@ -18,3 +18,4 @@ ner_model_configuration: - low_score_entity_names: - ORGANIZATION - ORG +- default_score: 0.85 From f066c31ceed985ab453f648110be140e77c92153 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 22:46:52 +0300 Subject: [PATCH 08/67] Update spacy_multilingual.yaml --- presidio-analyzer/conf/spacy_multilingual.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/presidio-analyzer/conf/spacy_multilingual.yaml b/presidio-analyzer/conf/spacy_multilingual.yaml index 2cf1d99da..34a989549 100644 --- a/presidio-analyzer/conf/spacy_multilingual.yaml +++ b/presidio-analyzer/conf/spacy_multilingual.yaml @@ -24,4 +24,5 @@ ner_model_configuration: - low_confidence_score_multiplier: 0.4 - low_score_entity_names: - ORGANIZATION - - ORG \ No newline at end of file + - ORG + - default_score: 0.85 From 03a7ed8cf574c44d0a9d3a6a092a7ed78b1ddd54 Mon Sep 17 00:00:00 2001 From: 
Omri Mendels Date: Mon, 28 Aug 2023 22:47:04 +0300 Subject: [PATCH 09/67] Update stanza.yaml --- presidio-analyzer/conf/stanza.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/presidio-analyzer/conf/stanza.yaml b/presidio-analyzer/conf/stanza.yaml index a18bd18b5..77d501530 100644 --- a/presidio-analyzer/conf/stanza.yaml +++ b/presidio-analyzer/conf/stanza.yaml @@ -19,3 +19,4 @@ ner_model_configuration: - low_confidence_score_multiplier: 0.4 - low_score_entity_names: - + - default_score: 0.85 From 971c7ec84539bdaa3c4eae9814dc0591c577256a Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 22:47:21 +0300 Subject: [PATCH 10/67] Update stanza_multilingual.yaml --- presidio-analyzer/conf/stanza_multilingual.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/presidio-analyzer/conf/stanza_multilingual.yaml b/presidio-analyzer/conf/stanza_multilingual.yaml index 5ad2ca67d..459869c6b 100644 --- a/presidio-analyzer/conf/stanza_multilingual.yaml +++ b/presidio-analyzer/conf/stanza_multilingual.yaml @@ -22,3 +22,4 @@ ner_model_configuration: - low_confidence_score_multiplier: 0.4 - low_score_entity_names: - + - default_score: 0.85 From 1263f215d744e3663809880254b43062ab6b3cb1 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 22:49:39 +0300 Subject: [PATCH 11/67] default_score from config + more logging --- .../presidio_analyzer/nlp_engine/ner_model_configuration.py | 3 +++ .../presidio_analyzer/nlp_engine/spacy_nlp_engine.py | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index f4fc61abb..55a6d52aa 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -66,10 +66,13 @@ class NerModelConfiguration: def __post_init__(self): if self.model_to_presidio_entity_mapping is None: + logger.warning(f"model_to_presidio_entity_mapping is missing from configuration, using default") self.model_to_presidio_entity_mapping = MODEL_TO_PRESIDIO_ENTITY_MAPPING if self.low_score_entity_names is None: + logger.warning(f"low_score_entity_names is missing from configuration, using default") self.low_score_entity_names = LOW_SCORE_ENTITY_NAMES if self.labels_to_ignore is None: + logger.warning(f"labels_to_ignore is missing from configuration, using default") self.labels_to_ignore = LABELS_TO_IGNORE @classmethod diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index 7bc8afff5..954a87334 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -21,7 +21,6 @@ class SpacyNlpEngine(NlpEngine): engine_name = "spacy" is_available = bool(spacy) - DEFAULT_CONFIDENCE = 0.85 def __init__( self, @@ -191,7 +190,7 @@ def _get_entities(self, doc: Doc) -> SpanGroup: output_spans.append(ent) # Set default confidence (spaCy models don't have built in confidence scores) - score = self.DEFAULT_CONFIDENCE + score = self.ner_model_configuration.default_score # Update score if entity is in low score entity names if ent.label_ in self.ner_model_configuration.low_score_entity_names: From b353c5c496d377703eacce24f4da850ced8cf8b7 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 23:10:52 +0300 Subject: [PATCH 12/67] 
flake8 updates --- .../presidio_analyzer/analyzer_engine.py | 2 +- .../context_aware_enhancer.py | 1 - .../nlp_engine/ner_model_configuration.py | 29 +++++++++++++------ .../nlp_engine/nlp_artifacts.py | 27 +++++++++-------- .../nlp_engine/nlp_engine.py | 6 ++-- .../nlp_engine/nlp_engine_provider.py | 9 +++--- .../nlp_engine/spacy_nlp_engine.py | 24 ++++++++++----- .../nlp_engine/stanza_nlp_engine.py | 1 - .../nlp_engine/transformers_nlp_engine.py | 19 +++++++----- .../predefined_recognizers/__init__.py | 6 ---- .../spacy_recognizer.py | 28 +++++++++++------- .../transformers_recognizer.py | 6 ++-- .../recognizer_registry.py | 8 +++-- presidio-analyzer/setup.cfg | 2 +- 14 files changed, 94 insertions(+), 74 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine.py b/presidio-analyzer/presidio_analyzer/analyzer_engine.py index a67bfed8c..c84e8be5b 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_engine.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_engine.py @@ -336,7 +336,7 @@ def _remove_allow_list( @staticmethod def __add_recognizer_id_if_not_exists( results: List[RecognizerResult], recognizer: EntityRecognizer - ): + ) -> None: """Ensure recognition metadata with recognizer id existence. Ensure recognizer result list contains recognizer id inside recognition diff --git a/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py b/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py index 4a6eb5d5b..329ca5caf 100644 --- a/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py +++ b/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py @@ -33,7 +33,6 @@ def __init__( context_prefix_count: int, context_suffix_count: int, ): - self.context_similarity_factor = context_similarity_factor self.min_score_with_context_similarity = min_score_with_context_similarity self.context_prefix_count = context_prefix_count diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index 55a6d52aa..64622c1a0 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -65,18 +65,26 @@ class NerModelConfiguration: low_confidence_score_multiplier: Optional[float] = 0.4 def __post_init__(self): + """Validate the configuration and set defaults.""" if self.model_to_presidio_entity_mapping is None: - logger.warning(f"model_to_presidio_entity_mapping is missing from configuration, using default") + logger.warning( + "model_to_presidio_entity_mapping is missing from configuration, " + "using default" + ) self.model_to_presidio_entity_mapping = MODEL_TO_PRESIDIO_ENTITY_MAPPING if self.low_score_entity_names is None: - logger.warning(f"low_score_entity_names is missing from configuration, using default") + logger.warning( + "low_score_entity_names is missing from configuration, " "using default" + ) self.low_score_entity_names = LOW_SCORE_ENTITY_NAMES if self.labels_to_ignore is None: - logger.warning(f"labels_to_ignore is missing from configuration, using default") + logger.warning( + "labels_to_ignore is missing from configuration, " "using default" + ) self.labels_to_ignore = LABELS_TO_IGNORE @classmethod - def _validate_input(cls, nlp_engine_configuration: Dict): + def _validate_input(cls, nlp_engine_configuration: Dict) -> None: if 
"nlp_engine_name" not in nlp_engine_configuration: raise ValueError("nlp_engine_name is a required parameter") if "labels_to_ignore" in nlp_engine_configuration: @@ -109,7 +117,8 @@ def _validate_input(cls, nlp_engine_configuration: Dict): def from_yaml(cls, yaml_file: Union[Path, str]) -> "NerModelConfiguration": """Load NLP engine configuration from yaml file. - :param yaml_file: Path to the yaml file.""" + :param yaml_file: Path to the yaml file. + """ if not Path(yaml_file).exists(): raise FileNotFoundError(f"configuration file {yaml_file} not found.") @@ -125,7 +134,8 @@ def from_yaml(cls, yaml_file: Union[Path, str]) -> "NerModelConfiguration": def from_json(cls, json_file: Union[Path, str]) -> "NerModelConfiguration": """Load NLP engine configuration from json file. - :param json_file: Path to the json file.""" + :param json_file: Path to the json file. + """ if not Path(json_file).exists(): raise FileNotFoundError(f"configuration file {json_file} not found.") @@ -155,11 +165,12 @@ def get_full_conf_path( ) -> Path: """Return a Path to the default conf file. - :param default_conf_file: Name of the default conf file.""" + :param default_conf_file: Name of the default conf file. + """ return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file) - def __str__(self) -> str: + def __str__(self) -> str: # noqa D105 return str(self.to_dict()) - def __repr__(self) -> str: + def __repr__(self) -> str: # noqa D105 return str(self) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py index cc696950e..9e2a9b114 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py @@ -1,7 +1,8 @@ import json -from typing import List, Optional +from typing import List, Optional, TYPE_CHECKING -from spacy.tokens import Doc, Span +if TYPE_CHECKING: + from spacy.tokens import Doc, Span class NlpArtifacts: @@ -10,6 +11,14 @@ class NlpArtifacts: processing over a given text, it holds attributes such as entities, tokens and lemmas which can be used by any recognizer + + :param entities: Identified entities + :param tokens: Tokenized text + :param tokens_indices: Indices of tokens + :param lemmas: List of lemmas in text + :param nlp_engine: NlpEngine object + :param language: Text language + :param scores: Entity confidence scores """ def __init__( @@ -18,20 +27,10 @@ def __init__( tokens: Doc, tokens_indices: List[int], lemmas: List[str], - nlp_engine: "NlpEngine", # noqa F821 + nlp_engine: "NlpEngine", # noqa F821 language: str, - scores: Optional[List[float]] = None + scores: Optional[List[float]] = None, ): - """ - :param entities: Identified entities - :param tokens: Tokenized text - :param tokens_indices: Indices of tokens - :param lemmas: List of lemmas in text - :param nlp_engine: NlpEngine object - :param language: Text language - :param scores: Entity confidence scores - """ - self.entities = entities self.tokens = tokens self.lemmas = lemmas diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py index 410b17842..808a49ddc 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Iterable, Iterator, Tuple, Dict, List +from typing import Iterable, Iterator, Tuple, List from 
presidio_analyzer.nlp_engine import NlpArtifacts @@ -26,7 +26,7 @@ def process_text(self, text: str, language: str) -> NlpArtifacts: @abstractmethod def process_batch( - self, texts: Iterable[str], language: str, **kwargs + self, texts: Iterable[str], language: str, **kwargs # noqa ANN003 ) -> Iterator[Tuple[str, NlpArtifacts]]: """Execute the NLP pipeline on a batch of texts. @@ -52,4 +52,4 @@ def is_punct(self, word: str, language: str) -> bool: @abstractmethod def get_supported_entities(self) -> List[str]: """Return the supported entities for this NLP engine.""" - pass \ No newline at end of file + pass diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index f9fa72b65..e67e35030 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -8,7 +8,7 @@ StanzaNlpEngine, SpacyNlpEngine, NlpEngine, - TransformersNlpEngine, NerModelConfiguration, + TransformersNlpEngine, ) logger = logging.getLogger("presidio-analyzer") @@ -37,7 +37,6 @@ def __init__( conf_file: Optional[Union[Path, str]] = None, nlp_configuration: Optional[Dict] = None, ): - if not nlp_engines: nlp_engines = (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine) @@ -87,7 +86,9 @@ def create_engine(self) -> NlpEngine: nlp_models = self.nlp_configuration["models"] ner_model_configuration = self.nlp_configuration.get("ner_model_params") - engine = nlp_engine_class(models=nlp_models, ner_model_configuration=ner_model_configuration) + engine = nlp_engine_class( + models=nlp_models, ner_model_configuration=ner_model_configuration + ) engine.load() logger.info( f"Created NLP engine: {engine.engine_name}. " @@ -121,4 +122,4 @@ def _get_full_conf_path( default_conf_file: Union[Path, str] = "default.yaml" ) -> Path: """Return a Path to the default conf file.""" - return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file) \ No newline at end of file + return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index 954a87334..9d0cccebf 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -1,9 +1,11 @@ import logging -from typing import Optional, Dict, Iterator, Tuple, Union, List +from typing import Optional, Dict, Iterator, Tuple, Union, List, TYPE_CHECKING import spacy -from spacy.language import Language -from spacy.tokens import Doc, Span, SpanGroup + +if TYPE_CHECKING: + from spacy.language import Language +from spacy.tokens import Doc, SpanGroup from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine, NerModelConfiguration @@ -32,7 +34,8 @@ def __init__( :param models: Dictionary with the name of the spaCy model per language. For example: models = [{"lang_code": "en", "model_name": "en_core_web_lg"}] - :param ner_model_configuration: Parameters for the NER model. See conf/spacy.yaml for an example + :param ner_model_configuration: Parameters for the NER model. 
+ See conf/spacy.yaml for an example """ if not models: models = [{"lang_code": "en", "model_name": "en_core_web_lg"}] @@ -56,14 +59,14 @@ def load(self) -> None: self.nlp[model["lang_code"]] = spacy.load(model["model_name"]) @staticmethod - def _download_spacy_model_if_needed(model_name): + def _download_spacy_model_if_needed(model_name: str) -> None: if not spacy.util.is_package(model_name): logger.warning(f"Model {model_name} is not installed. Downloading...") spacy.cli.download(model_name) logger.info(f"Finished downloading model {model_name}") @staticmethod - def _validate_model_params(model: Dict): + def _validate_model_params(model: Dict) -> None: if "lang_code" not in model: raise ValueError("lang_code is missing from model configuration") if "model_name" not in model: @@ -100,6 +103,7 @@ def process_batch( as_tuples: bool = False, ) -> Iterator[Optional[NlpArtifacts]]: """Execute the NLP pipeline on a batch of texts using spacy pipe. + :param texts: A list of texts to process. :param language: The language of the texts. :param as_tuples: If set to True, inputs should be a sequence of @@ -162,8 +166,10 @@ def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts: def _get_entities(self, doc: Doc) -> SpanGroup: """ Get an updated list of entities based on the ner model configuration. + Remove entities that are in labels_to_ignore, update entity names based on model_to_presidio_entity_mapping + :param doc: Output of a spaCy model :return: SpanGroup holding on the entities and confidence scores """ @@ -180,7 +186,8 @@ def _get_entities(self, doc: Doc) -> SpanGroup: ent.label_ = mapping[ent.label_] else: logger.warning( - f"Entity {ent.label_} is not mapped to a Presidio entity, but keeping anyway" + f"Entity {ent.label_} is not mapped to a Presidio entity, " + f"but keeping anyway" ) # Remove presidio entities in the ignore list @@ -189,7 +196,8 @@ def _get_entities(self, doc: Doc) -> SpanGroup: output_spans.append(ent) - # Set default confidence (spaCy models don't have built in confidence scores) + # Set default confidence + # (spaCy models don't have built in confidence scores) score = self.ner_model_configuration.default_score # Update score if entity is in low score entity names diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py index cfe603950..994848499 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py @@ -1,5 +1,4 @@ import logging -from typing import Optional, Dict, List try: import stanza diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py index 5032acc41..a0f664447 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py @@ -1,9 +1,8 @@ import logging -from typing import Optional, Dict, List +from typing import Optional, Dict import spacy -from spacy.tokens import Doc, Span, SpanGroup -from spacy.tokens.doc import SpanGroups +from spacy.tokens import Doc, SpanGroup try: import spacy_huggingface_pipelines @@ -38,7 +37,8 @@ class TransformersNlpEngine(SpacyNlpEngine): "transformers": "dslim/bert-base-NER" } }] - :param ner_model_configuration: Parameters for the NER model. 
See conf/transformers.yaml for an example + :param ner_model_configuration: Parameters for the NER model. + See conf/transformers.yaml for an example Note that since the spaCy model is not used for NER, @@ -91,14 +91,14 @@ def load(self) -> None: "annotate": "spans", "stride": self.ner_model_configuration.stride, "alignment_mode": self.ner_model_configuration.alignment_mode, - "aggregation_strategy": self.ner_model_configuration.aggregation_strategy, + "aggregation_strategy": self.ner_model_configuration.aggregation_strategy, # noqa E501 "annotate_spans_key": self.entity_key, }, ) self.nlp[model["lang_code"]] = nlp @staticmethod - def _validate_model_params(model: Dict): + def _validate_model_params(model: Dict) -> None: if "lang_code" not in model: raise ValueError("lang_code is missing from model configuration") if "model_name" not in model: @@ -123,8 +123,10 @@ def process_text(self, text: str, language: str) -> NlpArtifacts: def _get_entities(self, doc: Doc) -> SpanGroup: """ Get an updated list of entities based on the ner model configuration. + Remove entities that are in labels_to_ignore, - update entity names based on model_to_presidio_entity_mapping + update entity names based on model_to_presidio_entity_mapping. + :param doc: Output of a spaCy model :return: SpanGroup holding on the entities and confidence scores """ @@ -146,7 +148,8 @@ ent.label_ = mapping[ent.label_] else: logger.warning( - f"Entity {ent.label_} is not mapped to a Presidio entity, but keeping anyway" + f"Entity {ent.label_} is not mapped to a Presidio entity, " + f"but keeping anyway" ) # Remove presidio entities in the ignore list diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index c976fcb0a..61c6386d6 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -28,16 +28,10 @@ from .us_itin_recognizer import UsItinRecognizer from .us_passport_recognizer import UsPassportRecognizer from .us_ssn_recognizer import UsSsnRecognizer -from .es_nif_recognizer import EsNifRecognizer from .au_abn_recognizer import AuAbnRecognizer from .au_acn_recognizer import AuAcnRecognizer from .au_tfn_recognizer import AuTfnRecognizer from .au_medicare_recognizer import AuMedicareRecognizer -from .it_driver_license_recognizer import ItDriverLicenseRecognizer -from .it_fiscal_code_recognizer import ItFiscalCodeRecognizer -from .it_vat_code import ItVatCodeRecognizer -from .it_identity_card_recognizer import ItIdentityCardRecognizer -from .it_passport_recognizer import ItPassportRecognizer from .in_pan_recognizer import InPanRecognizer diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py index 09cf7683e..d8e4b5725 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py @@ -12,6 +12,15 @@ class SpacyRecognizer(LocalRecognizer): + """ + Recognize PII entities using a spaCy NLP model. + + Since the spaCy pipeline is run by the AnalyzerEngine/SpacyNlpEngine, + this recognizer only extracts the entities from the NlpArtifacts + and returns them.
+ + """ + ENTITIES = ["DATE_TIME", "NRP", "LOCATION", "PERSON", "ORGANIZATION"] DEFAULT_EXPLANATION = "Identified as {} by Spacy's Named Entity Recognition" @@ -35,11 +44,6 @@ def __init__( context: Optional[List[str]] = None, ): """ - Recognize PII entities using a spaCy NLP model. - - Since the spaCy pipeline is ran by the AnalyzerEngine, - this recognizer only extracts the entities from the NlpArtifacts - and replaces their types to align with Presidio's. :param supported_language: Language this recognizer supports :param supported_entities: The entities this recognizer can detect @@ -51,7 +55,8 @@ def __init__( self.ner_strength = ner_strength if check_label_groups: warnings.warn( - "check_label_groups is deprecated and isn't used;entities are mapped in NerModelConfiguration", + "check_label_groups is deprecated and isn't used;" + "entities are mapped in NerModelConfiguration", DeprecationWarning, 2, ) @@ -88,7 +93,7 @@ def build_explanation( ) return explanation - def analyze(self, text:str, entities, nlp_artifacts=None): # noqa D102 + def analyze(self, text: str, entities, nlp_artifacts=None): # noqa D102 results = [] if not nlp_artifacts: logger.warning("Skipping SpaCy, nlp artifacts not provided...") @@ -99,12 +104,13 @@ def analyze(self, text:str, entities, nlp_artifacts=None): # noqa D102 for ner_entity, ner_score in zip(ner_entities, ner_scores): if ner_entity.label_ not in entities: - logger.debug(f"Skipping entity {ner_entity.label_} as it is not in the supported entities list") + logger.debug( + f"Skipping entity {ner_entity.label_} " + f"as it is not in the supported entities list" + ) continue - textual_explanation = self.DEFAULT_EXPLANATION.format( - ner_entity.label_ - ) + textual_explanation = self.DEFAULT_EXPLANATION.format(ner_entity.label_) explanation = self.build_explanation(ner_score, textual_explanation) spacy_result = RecognizerResult( entity_type=ner_entity.label_, diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py index 2134d5cfb..e78193d88 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py @@ -1,8 +1,6 @@ import logging -from typing import List from presidio_analyzer.predefined_recognizers.spacy_recognizer import SpacyRecognizer -from presidio_analyzer import RecognizerResult logger = logging.getLogger("presidio-analyzer") @@ -16,7 +14,7 @@ class TransformersRecognizer(SpacyRecognizer): See: - https://huggingface.co/docs/transformers/main/en/index for transformer models - https://github.com/explosion/spacy-huggingface-pipelines on the spaCy wrapper to transformers - """ # noqa E501 + """ # noqa E501 ENTITIES = [ "PERSON", @@ -31,7 +29,7 @@ class TransformersRecognizer(SpacyRecognizer): LOW_SCORE_ENTITY_NAMES = {"ID"} - def __init__(self, **kwargs): + def __init__(self, **kwargs): # noqa ANN003 self.DEFAULT_EXPLANATION = self.DEFAULT_EXPLANATION.replace( "Spacy", "Transfromers" ) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 5c31a3aaa..1c4a88d45 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -1,7 +1,9 @@ import copy import 
logging -from pathlib import Path -from typing import Optional, List, Iterable, Union, Type, Dict +from typing import Optional, List, Iterable, Union, Type, Dict, TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path from presidio_analyzer.nlp_engine.transformers_nlp_engine import ( TransformersNlpEngine, ) @@ -120,7 +122,7 @@ def load_predefined_recognizers( if nlp_engine: nlp_recognizer_inst = nlp_recognizer( supported_language=lang, - supported_entities=nlp_engine.get_supported_entities() + supported_entities=nlp_engine.get_supported_entities(), ) else: nlp_recognizer_inst = nlp_recognizer(supported_language=lang) diff --git a/presidio-analyzer/setup.cfg b/presidio-analyzer/setup.cfg index 6a510fa1c..3026b4093 100644 --- a/presidio-analyzer/setup.cfg +++ b/presidio-analyzer/setup.cfg @@ -7,4 +7,4 @@ exclude = dist, tests docstring-convention = numpy -extend-ignore = E203 D100 D202 ANN101 ANN102 ANN201 ANN204 ANN203 TC001 \ No newline at end of file +extend-ignore = E203 D100 D202 ANN101 ANN102 ANN204 ANN203 TC001 \ No newline at end of file From a740c2d52d98243fe63495b342ecf2aad9dd6fb1 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 23:12:05 +0300 Subject: [PATCH 13/67] added en_core_web_sm for transformers pipelines --- .pipelines/templates/build-analyzer.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pipelines/templates/build-analyzer.yml b/.pipelines/templates/build-analyzer.yml index 69bf63def..6f23caa6f 100644 --- a/.pipelines/templates/build-analyzer.yml +++ b/.pipelines/templates/build-analyzer.yml @@ -18,6 +18,7 @@ steps: set -eux # fail on error pipenv install --deploy --dev pipenv run python -m spacy download en_core_web_lg + pipenv run python -m spacy download en_core_web_sm - template: ./build-python.yml parameters: From b6170b62e5b6cd045a8a2e3c9f9aab11c69797a5 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 28 Aug 2023 23:21:58 +0300 Subject: [PATCH 14/67] removed type_checking option --- .../presidio_analyzer/nlp_engine/nlp_artifacts.py | 5 ++--- .../presidio_analyzer/nlp_engine/spacy_nlp_engine.py | 5 ++--- .../recognizer_registry/recognizer_registry.py | 5 ++--- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py index 9e2a9b114..052bafd2b 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py @@ -1,8 +1,7 @@ import json -from typing import List, Optional, TYPE_CHECKING +from typing import List, Optional -if TYPE_CHECKING: - from spacy.tokens import Doc, Span +from spacy.tokens import Doc, Span class NlpArtifacts: diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index 9d0cccebf..f525280b8 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -1,10 +1,9 @@ import logging -from typing import Optional, Dict, Iterator, Tuple, Union, List, TYPE_CHECKING +from typing import Optional, Dict, Iterator, Tuple, Union, List import spacy -if TYPE_CHECKING: - from spacy.language import Language +from spacy.language import Language from spacy.tokens import Doc, SpanGroup from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine, NerModelConfiguration diff --git 
a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 1c4a88d45..5acd5e283 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -1,9 +1,8 @@ import copy import logging -from typing import Optional, List, Iterable, Union, Type, Dict, TYPE_CHECKING +from typing import Optional, List, Iterable, Union, Type, Dict -if TYPE_CHECKING: - from pathlib import Path +from pathlib import Path from presidio_analyzer.nlp_engine.transformers_nlp_engine import ( TransformersNlpEngine, ) From f0a99245c1bf654c06745cfd8db9cd26d82c37df Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 12:24:12 +0300 Subject: [PATCH 15/67] add transformers_recognizer test --- .../tests/test_transformers_recognizer.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 presidio-analyzer/tests/test_transformers_recognizer.py diff --git a/presidio-analyzer/tests/test_transformers_recognizer.py b/presidio-analyzer/tests/test_transformers_recognizer.py new file mode 100644 index 000000000..680e1e608 --- /dev/null +++ b/presidio-analyzer/tests/test_transformers_recognizer.py @@ -0,0 +1,95 @@ +import pytest + +from tests import assert_result_within_score_range + + +@pytest.fixture(scope="module") +def entities(): + return ["PERSON", "DATE_TIME"] + + +@pytest.mark.skip_engine("transformers_en") +@pytest.fixture(scope="module") +def nlp_engine(nlp_engines): + return nlp_engines.get("transformers_en", None) + + +@pytest.mark.skip_engine("transformers_en") +@pytest.fixture(scope="module") +def nlp_recognizer(nlp_recognizers): + return nlp_recognizers.get("transformers", None) + + + +def prepare_and_analyze(nlp, recognizer, text, entities): + nlp.load() + nlp_artifacts = nlp.process_text(text, "en") + results = recognizer.analyze(text, entities, nlp_artifacts) + return results + + +@pytest.mark.skip_engine("transformers_en") +@pytest.mark.parametrize( + "text, expected_len, expected_positions, entity_num", + [ + # fmt: off + # Test PERSON entity + ("my name is Dan", 1, ((11, 14),), 0), + ("Dan Tailor", 1, ((0, 10),), 0), + ("John Oliver is a comedian.", 1, ((0, 11),), 0), + ("Richard Milhous Nixon", 1, ((0, 21),), 0), + ("Richard M. Nixon", 1, ((0, 16),), 0), + ("Dan May has a bank account.", 1, ((0, 7),), 0), + ("his name is Mr. May", 1, ((12, 19),), 0), + ("They call me Mr. 
May", 1, ((13, 20),), 0), + # Test DATE_TIME Entity + ("year 1972", 1, ((0, 9),), 1), + ("I bought my car in 1972.", 1, ((19, 23),), 1), + ("I bought my car in May.", 1, ((19, 22),), 1), + ("May 1st", 1, ((0, 7),), 1), + ("May 1st, 1977", 1, ((0, 13),), 1), + ("I bought my car on May 1st, 1977", 1, ((19, 32),), 1), + # fmt: on + ], +) +def test_when_using_transformers_then_all_transformers_result_correct( + text, + expected_len, + expected_positions, + entity_num, + nlp_engine, + nlp_recognizer, + entities, + min_score, + max_score, +): + results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) + assert len(results) == expected_len + entity_to_check = entities[entity_num] + for res, (st_pos, fn_pos) in zip(results, expected_positions): + assert_result_within_score_range( + result=res, + expected_entity_type=entity_to_check, + expected_start=st_pos, + expected_end=fn_pos, + expected_score_min=min_score, + expected_score_max=max_score, + ) + + +@pytest.mark.skip_engine("transformers_en") +def test_when_person_in_text_then_person_full_name_complex_found( + nlp_engine, nlp_recognizer, entities +): + text = "Richard (Rick) C. Henderson" + results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) + + assert len(results) > 0 + + # check that most of the text is covered + covered_text = "" + for result in results: + sl = slice(result.start, result.end) + covered_text += text[sl] + + assert len(text) - len(covered_text) < 5 From 28472abfb7fd8a63f7737ab0b90b34f04643d421 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 12:24:34 +0300 Subject: [PATCH 16/67] formatting --- presidio-analyzer/tests/test_transformers_recognizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/presidio-analyzer/tests/test_transformers_recognizer.py b/presidio-analyzer/tests/test_transformers_recognizer.py index 680e1e608..03303cd9b 100644 --- a/presidio-analyzer/tests/test_transformers_recognizer.py +++ b/presidio-analyzer/tests/test_transformers_recognizer.py @@ -20,7 +20,6 @@ def nlp_recognizer(nlp_recognizers): return nlp_recognizers.get("transformers", None) - def prepare_and_analyze(nlp, recognizer, text, entities): nlp.load() nlp_artifacts = nlp.process_text(text, "en") From 39103e208ef7341cc79152e7d664fbfca1be9754 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 13:09:03 +0300 Subject: [PATCH 17/67] updated docstring --- .../presidio_analyzer/nlp_engine/spacy_nlp_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index f525280b8..917616333 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -170,7 +170,7 @@ def _get_entities(self, doc: Doc) -> SpanGroup: update entity names based on model_to_presidio_entity_mapping :param doc: Output of a spaCy model - :return: SpanGroup holding on the entities and confidence scores + :return: SpanGroup holding the entities and confidence scores """ output_spans = SpanGroup(doc, attrs={"scores": []}) From 1aca1534d0811bd08c08b4772e467a42394f660d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 13:10:09 +0300 Subject: [PATCH 18/67] revert formatting --- .../context_aware_enhancers/context_aware_enhancer.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py b/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py index 329ca5caf..1e0f8e350 100644 --- a/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py +++ b/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py @@ -33,6 +33,7 @@ def __init__( context_prefix_count: int, context_suffix_count: int, ): + self.context_similarity_factor = context_similarity_factor self.min_score_with_context_similarity = min_score_with_context_similarity self.context_prefix_count = context_prefix_count From db7c45f0b2360f11123f1212f3ed0153b4e6b914 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 13:10:53 +0300 Subject: [PATCH 19/67] revert formatting --- .../context_aware_enhancers/context_aware_enhancer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py b/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py index 1e0f8e350..4a6eb5d5b 100644 --- a/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py +++ b/presidio-analyzer/presidio_analyzer/context_aware_enhancers/context_aware_enhancer.py @@ -33,7 +33,7 @@ def __init__( context_prefix_count: int, context_suffix_count: int, ): - + self.context_similarity_factor = context_similarity_factor self.min_score_with_context_similarity = min_score_with_context_similarity self.context_prefix_count = context_prefix_count From e8a814f0fca504f55e52db0051e0741164cb1f63 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 13:15:58 +0300 Subject: [PATCH 20/67] ignore type checking errors (TC001 TC002 TC003) --- presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py | 2 +- .../presidio_analyzer/nlp_engine/stanza_nlp_engine.py | 2 +- .../recognizer_registry/recognizer_registry.py | 4 ++-- presidio-analyzer/setup.cfg | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py index 808a49ddc..72230ca30 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine.py @@ -13,7 +13,7 @@ class NlpEngine(ABC): """ @abstractmethod - def load(self): + def load(self) -> None: """Load the NLP model.""" @abstractmethod diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py index 994848499..1033f75d6 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py @@ -24,7 +24,7 @@ class StanzaNlpEngine(SpacyNlpEngine): engine_name = "stanza" is_available = bool(stanza) - def load(self): + def load(self) -> None: """Load the NLP model.""" logger.debug(f"Loading Stanza models: {self.models}") diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 5acd5e283..2813f1615 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -239,7 +239,7 @@ def remove_recognizer(self, 
recognizer_name: str) -> None: ) self.recognizers = new_recognizers - def add_pattern_recognizer_from_dict(self, recognizer_dict: Dict): + def add_pattern_recognizer_from_dict(self, recognizer_dict: Dict) -> None: """ Load a pattern recognizer from a Dict into the recognizer registry. @@ -254,7 +254,7 @@ def add_pattern_recognizer_from_dict(self, recognizer_dict: Dict): recognizer = PatternRecognizer.from_dict(recognizer_dict) self.add_recognizer(recognizer) - def add_recognizers_from_yaml(self, yml_path: Union[str, Path]): + def add_recognizers_from_yaml(self, yml_path: Union[str, Path]) -> None: r""" Read YAML file and load recognizers into the recognizer registry. diff --git a/presidio-analyzer/setup.cfg b/presidio-analyzer/setup.cfg index 3026b4093..732559f8e 100644 --- a/presidio-analyzer/setup.cfg +++ b/presidio-analyzer/setup.cfg @@ -7,4 +7,4 @@ exclude = dist, tests docstring-convention = numpy -extend-ignore = E203 D100 D202 ANN101 ANN102 ANN204 ANN203 TC001 \ No newline at end of file +extend-ignore = E203 D100 D202 ANN101 ANN102 ANN204 ANN203 TC \ No newline at end of file From f2cd47902fbdfb4c55bef6ce945176318f8f16cc Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 14:18:43 +0300 Subject: [PATCH 21/67] small updates to docs --- docs/getting_started.md | 52 +++++++++++++++++++++++++++++++++++++++-- docs/installation.md | 48 +++++++++++++++++++++++-------------- 2 files changed, 81 insertions(+), 19 deletions(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index 49def7a26..c05c1acdc 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -1,10 +1,10 @@ # Getting started with Presidio -## Simple flow +## Simple flow: Text Using Presidio's modules as Python packages to get started -=== "Anonymize PII in text" +=== "Anonymize PII in text (Default spaCy model)" 1. Install Presidio @@ -41,6 +41,54 @@ Using Presidio's modules as Python packages to get started print(anonymized_text) ``` +=== "Anonymize PII in text (transformers)" + + 1. Install Presidio + + ```sh + pip install presidio-analyzer + pip install presidio-anonymizer + python -m spacy download en_core_web_sm + ``` + + 2. Analyze + Anonymize + + ```py + from presidio_analyzer import AnalyzerEngine + from presidio_analyzer.nlp_engine import TransformersNlpEngine + from presidio_anonymizer import AnonymizerEngine + + text="My name is Don and my phone number is 212-555-5555" + + # Define which transformers model to use + model_config = [{"lang_code": "en", "model_name": { + "spacy": "en_core_web_sm", # use a small spaCy model for lemmas, tokens etc. + "transformers": "dslim/bert-base-NER" + } + }] + + nlp_engine = TransformersNlpEngine(models=model_config) + + # Set up the engine, loads the NLP module (spaCy model by default) + # and other PII recognizers + analyzer = AnalyzerEngine(nlp_engine=nlp_engine) + + # Call analyzer to get results + results = analyzer.analyze(text=text, + language='en') + print(results) + + # Analyzer results are passed to the AnonymizerEngine for anonymization + + anonymizer = AnonymizerEngine() + + anonymized_text = anonymizer.anonymize(text=text,analyzer_results=results) + + print(anonymized_text) + ``` + +## Simple flow: Images + === "Anonymize PII in images" 1. Install presidio-image-redactor diff --git a/docs/installation.md b/docs/installation.md index dcaf66b83..a8e635bd3 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -10,32 +10,44 @@ Presidio suite using `pip` (as Python packages) or using `Docker` (As containeri ## Using pip !!! 
note "Note" - Consider installing the Presidio python packages - on a virtual environment like [venv](https://docs.python.org/3/tutorial/venv.html) - or [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -### Supported Python Versions + Consider installing the Presidio python packages + on a virtual environment like [venv](https://docs.python.org/3/tutorial/venv.html) + or [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). -Presidio is supported for the following python versions: +### Supported Python Versions -* 3.7 -* 3.8 -* 3.9 -* 3.10 -* 3.11 +Presidio is supported for the following python versions: 3.7, 3.8, 3.9, 3.10, 3.11. -### PII anonymization on text +### PII de-identification in text For PII anonymization on text, install the `presidio-analyzer` and `presidio-anonymizer` packages: ```sh pip install presidio_analyzer pip install presidio_anonymizer - -# Presidio analyzer requires a spaCy language model. -python -m spacy download en_core_web_lg ``` +In addition, Presidio requires at least one NLP engine (spaCy, transformers or stanza): + +=== "spaCy (default)" + + ``` + python -m spacy download en_core_web_lg + ``` + +=== "Transformers" + + ``` + pip install presidio_analyzer[transformers] + ``` + +=== "Stanza" + + ``` + pip install presidio_analyzer[stanza] + ``` + For a more detailed installation of each package, refer to the specific documentation: * [presidio-analyzer](analyzer/index.md). @@ -61,9 +73,10 @@ Presidio can expose REST endpoints for each service using Flask and Docker. To download the Presidio Docker containers, run the following command: !!! note "Note" - This requires Docker to be installed. [Download Docker](https://docs.docker.com/get-docker/). -### For PII anonymization in text + This requires Docker to be installed. [Download Docker](https://docs.docker.com/get-docker/). + +### For PII de-identification in text For PII detection and anonymization in text, the `presidio-analyzer` and `presidio-anonymizer` modules are required. @@ -113,7 +126,8 @@ git clone git@github.com:microsoft/presidio.git Then, build the containers locally. !!! note "Note" - Presidio uses [docker-compose](https://docs.docker.com/compose/) to manage the different Presidio containers. + + Presidio uses [docker-compose](https://docs.docker.com/compose/) to manage the different Presidio containers. 
From the root folder of the repo: From 5dc59300d38a41fc256dc2bd4bf60b9b84d06e85 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 14:19:28 +0300 Subject: [PATCH 22/67] update to mkdocs to support tabs in v8 --- mkdocs.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 5215fce26..8c077ee1e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -8,6 +8,7 @@ edit_uri: "" nav: - Home: index.md + - Installation: installation.md - Quickstart: getting_started.md - Step by step tutorial: - Home: tutorial/index.md @@ -26,7 +27,6 @@ nav: - Encryption/Decryption: tutorial/12_encryption.md - Allow-lists: tutorial/13_allow_list.md - Docs: - - Installation: installation.md - Handling text: - Home: text_anonymization.md - Presidio Analyzer: @@ -72,6 +72,7 @@ theme: favicon: assets/ms_icon.png features: - navigation.instant + - content.tabs.link # - navigation.tabs # - navigation.tabs.sticky plugins: @@ -102,4 +103,5 @@ markdown_extensions: - pymdownx.highlight - pymdownx.superfences - pymdownx.pathconverter - - pymdownx.tabbed + - pymdownx.tabbed: + alternate_style: true From 5ce545f7438f42c301f803337f482d0c41575e5a Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 14:21:56 +0300 Subject: [PATCH 23/67] added trasnformers extra --- docs/getting_started.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index c05c1acdc..fd9ca4294 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -46,7 +46,7 @@ Using Presidio's modules as Python packages to get started 1. Install Presidio ```sh - pip install presidio-analyzer + pip install presidio-analyzer[transformers] pip install presidio-anonymizer python -m spacy download en_core_web_sm ``` From fcd8ef64060bc5ac25cc68aeba3cdb213ca80374 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 14:26:22 +0300 Subject: [PATCH 24/67] fixed extras --- docs/getting_started.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index fd9ca4294..300bfdc09 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -46,7 +46,7 @@ Using Presidio's modules as Python packages to get started 1. 
Install Presidio ```sh - pip install presidio-analyzer[transformers] + pip install "presidio-analyzer[transformers]" pip install presidio-anonymizer python -m spacy download en_core_web_sm ``` From b5d56f6033d2fbb18c534f09717279886e57b8c9 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 14:26:53 +0300 Subject: [PATCH 25/67] updated extras --- docs/installation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index a8e635bd3..73249ee99 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -39,13 +39,13 @@ In addition, Presidio requires at least one NLP engine (spaCy, transformers or s === "Transformers" ``` - pip install presidio_analyzer[transformers] + pip install "presidio_analyzer[transformers]" ``` === "Stanza" ``` - pip install presidio_analyzer[stanza] + pip install "presidio_analyzer[stanza]" ``` For a more detailed installation of each package, refer to the specific documentation: From 736405272de4539c6b189a4d497f8ab5e9853da4 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 14:27:19 +0300 Subject: [PATCH 26/67] Update installation.md From 7aa5df0792428757722475e53aa471ad8a3f3a33 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 14:28:48 +0300 Subject: [PATCH 27/67] Update getting_started.md --- docs/getting_started.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index 300bfdc09..f3358cdb3 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -20,7 +20,7 @@ Using Presidio's modules as Python packages to get started from presidio_analyzer import AnalyzerEngine from presidio_anonymizer import AnonymizerEngine - text="My phone number is 212-555-5555" + text = "My phone number is 212-555-5555" # Set up the engine, loads the NLP module (spaCy model by default) # and other PII recognizers @@ -58,7 +58,7 @@ Using Presidio's modules as Python packages to get started from presidio_analyzer.nlp_engine import TransformersNlpEngine from presidio_anonymizer import AnonymizerEngine - text="My name is Don and my phone number is 212-555-5555" + text = "My name is Don and my phone number is 212-555-5555" # Define which transformers model to use model_config = [{"lang_code": "en", "model_name": { From 16106854e4d85e3b9b61a80fdf55d00d156088a9 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 14:33:34 +0300 Subject: [PATCH 28/67] added comment on lazy downloading --- docs/getting_started.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/getting_started.md b/docs/getting_started.md index f3358cdb3..9fd0c9858 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -87,6 +87,8 @@ Using Presidio's modules as Python packages to get started print(anonymized_text) ``` + + The transformers model and the spaCy model will be downloaded on the first call to the `AnalyzerEngine`.
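For deployments where that lazy download is undesirable, here is a hedged sketch of pre-fetching both models ahead of the first call; the model names mirror the snippet above, and using `huggingface_hub.snapshot_download` for the transformers weights is an assumption, not something this patch introduces:

```python
# Sketch: warm the model caches before constructing the AnalyzerEngine.
# Assumes huggingface_hub is available (it is a dependency of transformers).
import spacy
from huggingface_hub import snapshot_download

spacy.cli.download("en_core_web_sm")              # spaCy model for tokens/lemmas
snapshot_download(repo_id="dslim/bert-base-NER")  # transformers NER weights
```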
+ ## Simple flow: Images === "Anonymize PII in images" From f2d58430de2d1ab2c99b0c22edd96f13550109d4 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 29 Aug 2023 14:36:49 +0300 Subject: [PATCH 29/67] Update getting_started.md --- docs/getting_started.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index 9fd0c9858..04c7ec267 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -74,15 +74,14 @@ Using Presidio's modules as Python packages to get started analyzer = AnalyzerEngine(nlp_engine=nlp_engine) # Call analyzer to get results - results = analyzer.analyze(text=text, - language='en') + results = analyzer.analyze(text=text, language='en') print(results) # Analyzer results are passed to the AnonymizerEngine for anonymization anonymizer = AnonymizerEngine() - anonymized_text = anonymizer.anonymize(text=text,analyzer_results=results) + anonymized_text = anonymizer.anonymize(text=text, analyzer_results=results) print(anonymized_text) ``` From cf85101c7daf31e74f493ca87320e908f5f3eea9 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 31 Aug 2023 10:15:03 +0300 Subject: [PATCH 30/67] revert conf to reduce PR size --- presidio-analyzer/conf/default.yaml | 16 ------------- presidio-analyzer/conf/spacy.yaml | 17 ------------- .../conf/spacy_multilingual.yaml | 17 ------------- presidio-analyzer/conf/stanza.yaml | 16 ------------- .../conf/stanza_multilingual.yaml | 16 ------------- presidio-analyzer/tests/conf/default.yaml | 24 +------------------ 6 files changed, 1 insertion(+), 105 deletions(-) diff --git a/presidio-analyzer/conf/default.yaml b/presidio-analyzer/conf/default.yaml index 4d6000e7a..92c163441 100644 --- a/presidio-analyzer/conf/default.yaml +++ b/presidio-analyzer/conf/default.yaml @@ -3,19 +3,3 @@ models: - lang_code: en model_name: en_core_web_lg -ner_model_configuration: -- model_to_presidio_entity_mapping: - PER: PERSON - PERSON: PERSON - LOC: LOCATION - LOCATION: LOCATION - GPE: LOCATION - ORG: ORGANIZATION - DATE: DATE_TIME - TIME: DATE_TIME - NORP: NRP -- low_confidence_score_multiplier: 0.4 -- low_score_entity_names: - - ORGANIZATION - - ORG -- default_score: 0.85 diff --git a/presidio-analyzer/conf/spacy.yaml b/presidio-analyzer/conf/spacy.yaml index 2b95bd4ad..92c163441 100644 --- a/presidio-analyzer/conf/spacy.yaml +++ b/presidio-analyzer/conf/spacy.yaml @@ -3,20 +3,3 @@ models: - lang_code: en model_name: en_core_web_lg -ner_model_configuration: - - model_to_presidio_entity_mapping: - PER: PERSON - PERSON: PERSON - LOC: LOCATION - LOCATION: LOCATION - GPE: LOCATION - ORG: ORGANIZATION - DATE: DATE_TIME - TIME: DATE_TIME - NORP: NRP - - - low_confidence_score_multiplier: 0.4 - - low_score_entity_names: - - ORGANIZATION - - ORG - - default_score: 0.85 \ No newline at end of file diff --git a/presidio-analyzer/conf/spacy_multilingual.yaml b/presidio-analyzer/conf/spacy_multilingual.yaml index 34a989549..de4868f73 100644 --- a/presidio-analyzer/conf/spacy_multilingual.yaml +++ b/presidio-analyzer/conf/spacy_multilingual.yaml @@ -9,20 +9,3 @@ models: - lang_code: es model_name: es_core_news_md -ner_model_configuration: - - model_to_presidio_entity_mapping: - PER: PERSON - PERSON: PERSON - LOC: LOCATION - LOCATION: LOCATION - GPE: LOCATION - ORG: ORGANIZATION - DATE: DATE_TIME - TIME: DATE_TIME - NORP: NRP - - - low_confidence_score_multiplier: 0.4 - - low_score_entity_names: - - ORGANIZATION - - ORG - - default_score: 0.85 diff --git a/presidio-analyzer/conf/stanza.yaml 
b/presidio-analyzer/conf/stanza.yaml index 77d501530..7d8090e4a 100644 --- a/presidio-analyzer/conf/stanza.yaml +++ b/presidio-analyzer/conf/stanza.yaml @@ -4,19 +4,3 @@ models: lang_code: en model_name: en -ner_model_configuration: - - model_to_presidio_entity_mapping: - PER: PERSON - PERSON: PERSON - LOC: LOCATION - LOCATION: LOCATION - GPE: LOCATION - ORG: ORGANIZATION - DATE: DATE_TIME - TIME: DATE_TIME - NORP: NRP - - - low_confidence_score_multiplier: 0.4 - - low_score_entity_names: - - - - default_score: 0.85 diff --git a/presidio-analyzer/conf/stanza_multilingual.yaml b/presidio-analyzer/conf/stanza_multilingual.yaml index 459869c6b..d0e02e39c 100644 --- a/presidio-analyzer/conf/stanza_multilingual.yaml +++ b/presidio-analyzer/conf/stanza_multilingual.yaml @@ -7,19 +7,3 @@ models: lang_code: de model_name: de -ner_model_configuration: - - model_to_presidio_entity_mapping: - PER: PERSON - PERSON: PERSON - LOC: LOCATION - LOCATION: LOCATION - GPE: LOCATION - ORG: ORGANIZATION - DATE: DATE_TIME - TIME: DATE_TIME - NORP: NRP - - - low_confidence_score_multiplier: 0.4 - - low_score_entity_names: - - - - default_score: 0.85 diff --git a/presidio-analyzer/tests/conf/default.yaml b/presidio-analyzer/tests/conf/default.yaml index ee1504276..68f0f0f75 100644 --- a/presidio-analyzer/tests/conf/default.yaml +++ b/presidio-analyzer/tests/conf/default.yaml @@ -3,26 +3,4 @@ models: - lang_code: en model_name: en_core_web_lg -ner_model_configuration: -- model_to_presidio_entity_mapping: - PER: PERSON - PERSON: PERSON - LOC: LOCATION - LOCATION: LOCATION - GPE: LOCATION - ORG: ORGANIZATION - DATE: DATE_TIME - TIME: DATE_TIME - NORP: NRP -- aggregation_strategy: simple # "simple", "first", "average", "max" -- stride: 16 # If stride >= 0, process long texts in - # overlapping windows of the model max - # length. The value is the length of the - # window overlap in transformer tokenizer - # tokens, NOT the length of the stride. 
-- alignment_mode: strict # "strict", "contract", "expand"
-- labels_to_ignore: ["O"]
-- low_confidence_score_multiplier: 0.4
-- low_score_entity_names:
-  - ORGANIZATION
-  - ORG
\ No newline at end of file
+

From 5a4bb2964e454747d9a20e4bd41ed2d4d8f16d3f Mon Sep 17 00:00:00 2001
From: Omri Mendels
Date: Thu, 31 Aug 2023 11:10:17 +0300
Subject: [PATCH 31/67] Simplified logic between spacy and transformers nlp
 engines

---
 presidio-analyzer/Pipfile                     |  2 +
 .../nlp_engine/nlp_artifacts.py               |  2 +
 .../nlp_engine/spacy_nlp_engine.py            | 63 +++++++++++++------
 .../nlp_engine/transformers_nlp_engine.py     | 57 +++++------------
 4 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/presidio-analyzer/Pipfile b/presidio-analyzer/Pipfile
index 8307b1657..470f0e633 100644
--- a/presidio-analyzer/Pipfile
+++ b/presidio-analyzer/Pipfile
@@ -12,6 +12,8 @@ pyyaml = "*"
 phonenumbers = ">=8.12,<9.0.0"
 typing-extensions = "*"
 spacy-huggingface-pipelines = "*"
+stanza = "*"
+spacy-stanza = "*"

 [dev-packages]
 pytest = "*"

diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py
index 052bafd2b..015dbc210 100644
--- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py
+++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py
@@ -80,5 +80,7 @@ def to_json(self) -> str:
             return_dict["tokens"] = [token.text for token in self.tokens]
         if "entities" in return_dict:
             return_dict["entities"] = [entity.text for entity in self.entities]
+        if "scores" in return_dict:
+            return_dict["scores"] = [float(score) for score in self.scores]

         return json.dumps(return_dict)

diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
index 917616333..fdab4397a 100644
--- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
+++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
@@ -4,7 +4,7 @@

 import spacy
 from spacy.language import Language
-from spacy.tokens import Doc, SpanGroup
+from spacy.tokens import Doc, SpanGroup, Span

 from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine, NerModelConfiguration

@@ -148,12 +148,12 @@ def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
         tokens_indices = [token.idx for token in doc]

         entities = self._get_entities(doc)
-        scores = entities.attrs["scores"]
+        scores = self._get_scores_for_entities(doc)

-        entities_as_spans = [ent for ent in entities]
+        entities, scores = self._get_updated_entities(entities, scores)

         return NlpArtifacts(
-            entities=entities_as_spans,
+            entities=entities,
             tokens=doc,
             tokens_indices=tokens_indices,
             lemmas=lemmas,
@@ -162,22 +162,53 @@
             scores=scores,
         )

-    def _get_entities(self, doc: Doc) -> SpanGroup:
+    def _get_entities(self, doc: Doc) -> List[Span]:
+        """
+        Extract entities out of a spaCy pipeline, depending on the type of pipeline.
+
+        For normal spaCy, this would be doc.ents
+        :param doc: the output spaCy doc.
+        :return: List of entities
+        """
+
+        return doc.ents
+
+    def _get_scores_for_entities(self, doc: Doc) -> List[float]:
+        """Extract scores for entities from the doc.
+
+        Since spaCy does not provide confidence scores for entities by default,
+        we use the default score from the ner model configuration.
+ :param doc: SpaCy doc + """ + + entities = doc.ents + scores = [self.ner_model_configuration.default_score] * len(entities) + return scores + + def _get_updated_entities( + self, entities: List[Span], scores: List[float] + ) -> Tuple[List[Span], List[float]]: """ Get an updated list of entities based on the ner model configuration. Remove entities that are in labels_to_ignore, update entity names based on model_to_presidio_entity_mapping - :param doc: Output of a spaCy model - :return: SpanGroup holding the entities and confidence scores + :param entities: Entities that were extracted from a spaCy pipeline + :param scores: Original confidence scores for the entities extracted + :return: Tuple holding the entities and confidence scores """ - output_spans = SpanGroup(doc, attrs={"scores": []}) + if len(entities) != len(scores): + raise ValueError("Entities and scores must be the same length") + + new_entities = [] + new_scores = [] mapping = self.ner_model_configuration.model_to_presidio_entity_mapping - for ent in doc.ents: + to_ignore = self.ner_model_configuration.labels_to_ignore + for ent, score in zip(entities, scores): # Remove model labels in the ignore list - if ent.label_ in self.ner_model_configuration.labels_to_ignore: + if ent.label_ in to_ignore: continue # Update entity label based on mapping @@ -190,19 +221,15 @@ def _get_entities(self, doc: Doc) -> SpanGroup: ) # Remove presidio entities in the ignore list - if ent.label_ in self.ner_model_configuration.labels_to_ignore: + if ent.label_ in to_ignore: continue - output_spans.append(ent) - - # Set default confidence - # (spaCy models don't have built in confidence scores) - score = self.ner_model_configuration.default_score + new_entities.append(ent) # Update score if entity is in low score entity names if ent.label_ in self.ner_model_configuration.low_score_entity_names: score *= self.ner_model_configuration.low_confidence_score_multiplier - output_spans.attrs["scores"].append(score) + new_scores.append(score) - return output_spans + return new_entities, new_scores diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py index a0f664447..8cd720cd7 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py @@ -1,8 +1,8 @@ import logging -from typing import Optional, Dict +from typing import Optional, Dict, List import spacy -from spacy.tokens import Doc, SpanGroup +from spacy.tokens import Doc, SpanGroup, Span try: import spacy_huggingface_pipelines @@ -120,50 +120,23 @@ def process_text(self, text: str, language: str) -> NlpArtifacts: doc = self.nlp[language](text) return self._doc_to_nlp_artifact(doc, language) - def _get_entities(self, doc: Doc) -> SpanGroup: + def _get_entities(self, doc: Doc) -> List[Span]: """ - Get an updated list of entities based on the ner model configuration. + Extract entities out of a spaCy pipeline, depending on the type of pipeline. - Remove entities that are in labels_to_ignore, - update entity names based on model_to_presidio_entity_mapping. - - :param doc: Output of a spaCy model - :return: SpanGroup holding on the entities and confidence scores + For spacy-huggingface-pipeline, this would be doc.spans[key] + :param doc: the output spaCy doc. 
+ :return: List of entities """ - current_ents = doc.spans[self.entity_key] - current_scores = doc.spans[self.entity_key].attrs["scores"] - - output_spans = SpanGroup(doc, attrs={"scores": []}) - - mapping = self.ner_model_configuration.model_to_presidio_entity_mapping - to_ignore = self.ner_model_configuration.labels_to_ignore - for i, ent in enumerate(current_ents): - # Remove model labels in the ignore list - if ent.label_ in to_ignore: - continue - - # Update entity label based on mapping - if ent.label_ in mapping: - ent.label_ = mapping[ent.label_] - else: - logger.warning( - f"Entity {ent.label_} is not mapped to a Presidio entity, " - f"but keeping anyway" - ) + return doc.spans[self.entity_key] - # Remove presidio entities in the ignore list - if ent.label_ in to_ignore: - continue + def _get_scores_for_entities(self, doc: Doc) -> List[float]: + """Extract scores for entities from the doc. - output_spans.append(ent) - - score = current_scores[i] - # Update score if entity is in low score entity names - if ent.label_ in self.ner_model_configuration.low_score_entity_names: - score *= self.ner_model_configuration.low_confidence_score_multiplier - - # Update scores list - output_spans.attrs["scores"].append(score) + While spaCy does not provide confidence scores, + the spacy-huggingface-pipeline flow adds confidence scores as SpanGroup attributes. + :param doc: SpaCy doc + """ - return output_spans + return doc.spans[self.entity_key].attrs["scores"] From 7e05bf0f4e9104a9f7fa259d12e608a55d96a643 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 31 Aug 2023 11:59:46 +0300 Subject: [PATCH 32/67] flake8 --- presidio-analyzer/Pipfile | 1 + .../presidio_analyzer/nlp_engine/spacy_nlp_engine.py | 4 ++-- .../presidio_analyzer/nlp_engine/transformers_nlp_engine.py | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/presidio-analyzer/Pipfile b/presidio-analyzer/Pipfile index 470f0e633..bebb2aa2b 100644 --- a/presidio-analyzer/Pipfile +++ b/presidio-analyzer/Pipfile @@ -21,4 +21,5 @@ pytest-mock = "*" flake8= {version = ">=3.7.9"} pep8-naming = "*" flake8-docstrings = "*" +flake8-annotations = "*" pre_commit = "*" diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index fdab4397a..66850592d 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -4,7 +4,7 @@ import spacy from spacy.language import Language -from spacy.tokens import Doc, SpanGroup, Span +from spacy.tokens import Doc, Span from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine, NerModelConfiguration @@ -162,7 +162,7 @@ def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts: scores=scores, ) - def _get_entities(self, doc:Doc) -> List[Span]: + def _get_entities(self, doc: Doc) -> List[Span]: """ Extract entities out of a spaCy pipeline, depending on the type of pipeline. 
diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py index 8cd720cd7..8f06e6a41 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py @@ -2,7 +2,7 @@ from typing import Optional, Dict, List import spacy -from spacy.tokens import Doc, SpanGroup, Span +from spacy.tokens import Doc, Span try: import spacy_huggingface_pipelines @@ -135,7 +135,8 @@ def _get_scores_for_entities(self, doc: Doc) -> List[float]: """Extract scores for entities from the doc. While spaCy does not provide confidence scores, - the spacy-huggingface-pipeline flow adds confidence scores as SpanGroup attributes. + the spacy-huggingface-pipeline flow adds confidence scores + as SpanGroup attributes. :param doc: SpaCy doc """ From 865fae0a4c6e2013cc7686f537383d13aae9af38 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 31 Aug 2023 12:50:57 +0300 Subject: [PATCH 33/67] Update Pipfile --- presidio-analyzer/Pipfile | 1 - 1 file changed, 1 deletion(-) diff --git a/presidio-analyzer/Pipfile b/presidio-analyzer/Pipfile index bebb2aa2b..470f0e633 100644 --- a/presidio-analyzer/Pipfile +++ b/presidio-analyzer/Pipfile @@ -21,5 +21,4 @@ pytest-mock = "*" flake8= {version = ">=3.7.9"} pep8-naming = "*" flake8-docstrings = "*" -flake8-annotations = "*" pre_commit = "*" From 67bf43f2458a5c80b8e5bc2364e5924b2f2ec1b7 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 31 Aug 2023 12:58:15 +0300 Subject: [PATCH 34/67] fixed wrong key name --- .../presidio_analyzer/nlp_engine/nlp_engine_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index e67e35030..2fc85651e 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -85,7 +85,7 @@ def create_engine(self) -> NlpEngine: nlp_engine_class = self.nlp_engines[nlp_engine_name] nlp_models = self.nlp_configuration["models"] - ner_model_configuration = self.nlp_configuration.get("ner_model_params") + ner_model_configuration = self.nlp_configuration.get("ner_model_configuration") engine = nlp_engine_class( models=nlp_models, ner_model_configuration=ner_model_configuration ) From 372484720588292313caa373530c33211be33b18 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 31 Aug 2023 13:40:01 +0300 Subject: [PATCH 35/67] line width --- .../presidio_analyzer/nlp_engine/nlp_engine_provider.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index 2fc85651e..6dfc3a87d 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -85,7 +85,9 @@ def create_engine(self) -> NlpEngine: nlp_engine_class = self.nlp_engines[nlp_engine_name] nlp_models = self.nlp_configuration["models"] - ner_model_configuration = self.nlp_configuration.get("ner_model_configuration") + ner_model_configuration = self.nlp_configuration.get( + "ner_model_configuration" + ) engine = nlp_engine_class( models=nlp_models, ner_model_configuration=ner_model_configuration ) From 
53e01965186d0b50124388876bfe8462a8aabd2d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 18 Sep 2023 12:34:55 +0300 Subject: [PATCH 36/67] Updates to tests and docs --- .../nlp_engine/ner_model_configuration.py | 107 +++++------------- .../nlp_engine/stanza_nlp_engine.py | 1 + .../nlp_engine/transformers_nlp_engine.py | 9 -- .../tests/conf/test_transformers.yaml | 36 ++++++ presidio-analyzer/tests/conftest.py | 84 +++++--------- .../tests/mocks/nlp_engine_mock.py | 2 +- .../tests/test_analyzer_engine.py | 41 +++---- .../tests/test_batch_analyzer_engine.py | 2 +- .../tests/test_context_support.py | 18 +-- .../tests/test_ner_model_configuration.py | 75 ++++++++++++ .../tests/test_nlp_engine_provider.py | 99 +++++----------- .../tests/test_phone_recognizer.py | 8 +- .../tests/test_spacy_nlp_engine.py | 32 +++++- .../tests/test_spacy_recognizer.py | 21 ++-- .../tests/test_stanza_recognizer.py | 16 +-- .../tests/test_transformers_nlp_engine.py | 48 ++++++++ .../tests/test_transformers_recognizer.py | 15 ++- 17 files changed, 338 insertions(+), 276 deletions(-) create mode 100644 presidio-analyzer/tests/conf/test_transformers.yaml create mode 100644 presidio-analyzer/tests/test_ner_model_configuration.py create mode 100644 presidio-analyzer/tests/test_transformers_nlp_engine.py diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index 64622c1a0..67b181270 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -1,10 +1,6 @@ -import json import logging from dataclasses import dataclass -from pathlib import Path -from typing import Dict, Optional, Union, Collection - -import yaml +from typing import Dict, Optional, Collection, Type logger = logging.getLogger("presidio-analyzer") @@ -38,7 +34,6 @@ class NerModelConfiguration: """NER model configuration. - :param nlp_engine_name: Name of the NLP engine to use. :param labels_to_ignore: List of labels to not return predictions for. :param aggregation_strategy: See https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.TokenClassificationPipeline.aggregation_strategy @@ -54,7 +49,6 @@ class NerModelConfiguration: Multiplier to the score given for low_score_entity_names. 
""" # noqa E501 - nlp_engine_name: str labels_to_ignore: Optional[Collection[str]] = None aggregation_strategy: Optional[str] = "simple" stride: Optional[int] = 14 @@ -84,91 +78,44 @@ def __post_init__(self): self.labels_to_ignore = LABELS_TO_IGNORE @classmethod - def _validate_input(cls, nlp_engine_configuration: Dict) -> None: - if "nlp_engine_name" not in nlp_engine_configuration: - raise ValueError("nlp_engine_name is a required parameter") - if "labels_to_ignore" in nlp_engine_configuration: - if not isinstance(nlp_engine_configuration["labels_to_ignore"], list): - raise ValueError("labels_to_ignore must be a list") - if "aggregation_strategy" in nlp_engine_configuration: - if not isinstance(nlp_engine_configuration["aggregation_strategy"], str): - raise ValueError("aggregation_strategy must be a string") - if "alignment_mode" in nlp_engine_configuration: - if not isinstance(nlp_engine_configuration["alignment_mode"], str): - raise ValueError("alignment_mode must be a string") - if "stride" in nlp_engine_configuration: - if not isinstance(nlp_engine_configuration["stride"], int): - raise ValueError("stride must be an integer") - if "model_to_presidio_entity_mapping" in nlp_engine_configuration: - if not isinstance( - nlp_engine_configuration["model_to_presidio_entity_mapping"], dict - ): - raise ValueError("model_to_presidio_entity_mapping must be a dict") - if "low_score_entity_names" in nlp_engine_configuration: - if not isinstance(nlp_engine_configuration["low_score_entity_names"], list): - raise ValueError("low_score_entity_names must be a list") - if "low_confidence_score_multiplier" in nlp_engine_configuration: - if not isinstance( - nlp_engine_configuration["low_confidence_score_multiplier"], float - ): - raise ValueError("low_confidence_score_multiplier must be a float") - - @classmethod - def from_yaml(cls, yaml_file: Union[Path, str]) -> "NerModelConfiguration": - """Load NLP engine configuration from yaml file. - - :param yaml_file: Path to the yaml file. - """ - - if not Path(yaml_file).exists(): - raise FileNotFoundError(f"configuration file {yaml_file} not found.") - - with open(yaml_file, "r") as f: - nlp_engine_configuration = yaml.safe_load(f) - - cls._validate_input(nlp_engine_configuration) + def _validate_input(cls, ner_model_configuration_dict: Dict) -> None: + key_to_type = { + "labels_to_ignore": list, + "aggregation_strategy": str, + "alignment_mode": str, + "model_to_presidio_entity_mapping": dict, + "low_confidence_score_multiplier": float, + "low_score_entity_names": list, + "stride": int, + } + + for key, field_type in key_to_type.items(): + cls.__validate_type( + config_dict=ner_model_configuration_dict, key=key, field_type=field_type + ) - return cls.from_dict(nlp_engine_configuration) + @staticmethod + def __validate_type(config_dict: Dict, key: str, field_type: Type) -> None: + if key in config_dict: + if not isinstance(config_dict[key], field_type): + raise ValueError(f"{key} must be of type {field_type}") + else: + raise ValueError(f"NER configuration is missing '{key}'") @classmethod - def from_json(cls, json_file: Union[Path, str]) -> "NerModelConfiguration": - """Load NLP engine configuration from json file. + def from_dict(cls, nlp_engine_configuration: Dict) -> "NerModelConfiguration": + """Load NLP engine configuration from dict. - :param json_file: Path to the json file. + :param nlp_engine_configuration: Dict with the configuration to load. 
""" - - if not Path(json_file).exists(): - raise FileNotFoundError(f"configuration file {json_file} not found.") - - with open(json_file, "r") as f: - nlp_engine_configuration = json.load(f) - cls._validate_input(nlp_engine_configuration) - return cls.from_dict(nlp_engine_configuration) - - @classmethod - def from_dict(cls, config_dict: Dict) -> "NerModelConfiguration": - """Load NLP engine configuration from dict. - - :param config_dict: Dict with the configuration to load. - """ - return cls(**config_dict) + return cls(**nlp_engine_configuration) def to_dict(self) -> Dict: """Return the configuration as a dict.""" return self.__dict__ - @staticmethod - def get_full_conf_path( - default_conf_file: Union[Path, str] = "default.yaml" - ) -> Path: - """Return a Path to the default conf file. - - :param default_conf_file: Name of the default conf file. - """ - return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file) - def __str__(self) -> str: # noqa D105 return str(self.to_dict()) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py index 1033f75d6..89977275b 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py @@ -35,4 +35,5 @@ def load(self) -> None: self.nlp[model["lang_code"]] = spacy_stanza.load_pipeline( model["model_name"], processors="tokenize,pos,lemma,ner", + download_method=None, ) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py index 8f06e6a41..46b491d83 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py @@ -13,7 +13,6 @@ from presidio_analyzer.nlp_engine import ( SpacyNlpEngine, - NlpArtifacts, NerModelConfiguration, ) @@ -112,14 +111,6 @@ def _validate_model_params(model: Dict) -> None: "transformers model name is missing from model configuration" ) - def process_text(self, text: str, language: str) -> NlpArtifacts: - """Execute the SpaCy NLP pipeline on the given text and language.""" - if not self.nlp: - raise ValueError("NLP engine is not loaded. Consider calling .load()") - - doc = self.nlp[language](text) - return self._doc_to_nlp_artifact(doc, language) - def _get_entities(self, doc: Doc) -> List[Span]: """ Extract entities out of a spaCy pipeline, depending on the type of pipeline. diff --git a/presidio-analyzer/tests/conf/test_transformers.yaml b/presidio-analyzer/tests/conf/test_transformers.yaml new file mode 100644 index 000000000..5fca7be77 --- /dev/null +++ b/presidio-analyzer/tests/conf/test_transformers.yaml @@ -0,0 +1,36 @@ +nlp_engine_name: transformers +models: + - + lang_code: en + model_name: + spacy: en_core_web_lg + transformers: StanfordAIMI/stanford-deidentifier-base +ner_model_configuration: + labels_to_ignore: + - O + aggregation_strategy: simple # "simple", "first", "average", "max" + stride: 16 # If stride >= 0, process long texts in + # overlapping windows of the model max + # length. The value is the length of the + # window overlap in transformer tokenizer + # tokens, NOT the length of the stride. 
diff --git a/presidio-analyzer/tests/conftest.py b/presidio-analyzer/tests/conftest.py
index 5d5602787..b6505bba8 100644
--- a/presidio-analyzer/tests/conftest.py
+++ b/presidio-analyzer/tests/conftest.py
@@ -3,7 +3,6 @@
 from typing import Dict

 import pytest
-import spacy

 from presidio_analyzer import (
     EntityRecognizer,
@@ -14,29 +13,15 @@
 from presidio_analyzer import RecognizerRegistry
 from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpEngine
 from presidio_analyzer.predefined_recognizers import NLP_RECOGNIZERS
-from tests.mocks import RecognizerRegistryMock
+from tests.mocks import RecognizerRegistryMock, NlpEngineMock


-def pytest_addoption(parser):
-    parser.addoption(
-        "--runfast", action="store_true", default=False, help="run fast tests"
-    )
-

 def pytest_configure(config):
-    config.addinivalue_line("markers", "slow: mark test as slow to run")
     config.addinivalue_line(
         "markers", "skip_engine(nlp_engine): skip test for given nlp engine"
     )
-
-
-def pytest_collection_modifyitems(config, items):
-    if config.getoption("--runfast"):
-        # --runfast given in cli: skip slow tests
-        skip_slow = pytest.mark.skip(reason="remove --runfast option to run")
-        for item in items:
-            if "slow" in item.keywords:
-                item.add_marker(skip_slow)
+    config.addinivalue_line("markers", "integration: mark test as an integration test")


 @pytest.fixture(scope="session")
 def nlp_engines(request, nlp_engine_provider) -> Dict[str, NlpEngine]:
     nlp_engines = nlp_engine_provider.nlp_engines

     for name, engine_cls in nlp_engines.items():
-        if name == "spacy" and not request.config.getoption("--runfast"):
+        if name == "spacy":
             available_engines[f"{name}_en"] = engine_cls(
                 models=[{"lang_code": "en", "model_name": "en_core_web_lg"}]
             )
-        elif name == "stanza" and not request.config.getoption("--runfast"):
+        elif name == "stanza":
             available_engines[f"{name}_en"] = engine_cls(
                 models=[{"lang_code": "en", "model_name": "en"}]
             )
-        elif name == "transformers" and not request.config.getoption("--runfast"):
+        elif name == "transformers":
             available_engines[f"{name}_en"] = engine_cls(
-                models=[{
-                    "lang_code": "en",
-                    "model_name": {
-                        "spacy": "en_core_web_sm",
-                        "transformers": "StanfordAIMI/stanford-deidentifier-base",
-                    },
-                }]
+                models=[
+                    {
+                        "lang_code": "en",
+                        "model_name": {
+                            "spacy": "en_core_web_sm",
+                            "transformers": "StanfordAIMI/stanford-deidentifier-base",
+                        },
+                    }
+                ]
             )
         else:
             raise ValueError("Unsupported engine for tests")
-        # Load engine
-        available_engines[f"{name}_en"].load()

     return available_engines


 def skip_by_engine(request, nlp_engines):
         pytest.skip(f"skipped on this engine: {marker_arg}")


+@pytest.mark.skip_engine("spacy_en")
+@pytest.fixture(scope="session")
+def spacy_nlp_engine(nlp_engines):
+    nlp_engine = nlp_engines.get("spacy_en", None)
+    if nlp_engine:
+        nlp_engine.load()
+    return nlp_engine
+
+
 @pytest.fixture(scope="session")
 def nlp_recognizers() -> Dict[str, EntityRecognizer]:
     return {name: rec_cls() for name, rec_cls in NLP_RECOGNIZERS.items()}
@@ -110,41 +104,19 @@
 def loaded_registry() -> RecognizerRegistry:
     return
RecognizerRegistry() -@pytest.fixture(scope="module") -def nlp_engine(nlp_engines) -> NlpEngine: - return nlp_engines["spacy_en"] - - @pytest.fixture(scope="module") def mock_registry() -> RecognizerRegistryMock: return RecognizerRegistryMock() @pytest.fixture(scope="module") -def analyzer_engine_simple(mock_registry, nlp_engine) -> AnalyzerEngine: - return AnalyzerEngine(registry=mock_registry, nlp_engine=nlp_engine) - - -@pytest.fixture(scope="session") -def mock_he_model(): - """ - Create an empty Hebrew spaCy pipeline and save it to disk. - - So that it could be loaded using spacy.load() - """ - he = spacy.blank("he") - he.to_disk("he_test") +def mock_nlp_engine() -> NlpEngineMock: + return NlpEngineMock() -@pytest.fixture(scope="session") -def mock_bn_model(): - """ - Create an empty Bengali spaCy pipeline and save it to disk. - - So that it could be loaded using spacy.load() - """ - bn = spacy.blank("bn") - bn.to_disk("bn_test") +@pytest.fixture(scope="module") +def analyzer_engine_simple(mock_registry, mock_nlp_engine) -> AnalyzerEngine: + return AnalyzerEngine(registry=mock_registry, nlp_engine=mock_nlp_engine) @pytest.fixture(scope="session") diff --git a/presidio-analyzer/tests/mocks/nlp_engine_mock.py b/presidio-analyzer/tests/mocks/nlp_engine_mock.py index 5e8ab5568..a2a591968 100644 --- a/presidio-analyzer/tests/mocks/nlp_engine_mock.py +++ b/presidio-analyzer/tests/mocks/nlp_engine_mock.py @@ -8,7 +8,7 @@ def __init__(self, stopwords=None, punct_words=None, nlp_artifacts=None): self.stopwords = stopwords if stopwords else [] self.punct_words = punct_words if punct_words else [] if nlp_artifacts is None: - self.nlp_artifacts = NlpArtifacts([], [], [], [], None, "en") + self.nlp_artifacts = NlpArtifacts([], [], [], [], None, "en", []) else: self.nlp_artifacts = nlp_artifacts diff --git a/presidio-analyzer/tests/test_analyzer_engine.py b/presidio-analyzer/tests/test_analyzer_engine.py index 746f2ae60..68c28d512 100644 --- a/presidio-analyzer/tests/test_analyzer_engine.py +++ b/presidio-analyzer/tests/test_analyzer_engine.py @@ -49,6 +49,7 @@ def nlp_engine(nlp_engines): return nlp_engines["spacy_en"] +@pytest.mark.integration def test_simple(): dic = { "text": "John Smith drivers license is AC432223", @@ -79,14 +80,14 @@ def test_when_analyze_with_predefined_recognizers_then_return_results( def test_when_analyze_with_multiple_predefined_recognizers_then_succeed( - loaded_registry, unit_test_guid, nlp_engine, max_score + loaded_registry, unit_test_guid, spacy_nlp_engine, max_score ): text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090" language = "en" entities = ["CREDIT_CARD", "PHONE_NUMBER"] analyzer_engine_with_spacy = AnalyzerEngine( - registry=loaded_registry, nlp_engine=nlp_engine + registry=loaded_registry, nlp_engine=spacy_nlp_engine ) results = analyzer_engine_with_spacy.analyze( correlation_id=unit_test_guid, @@ -134,8 +135,8 @@ def test_when_analyze_with_unsupported_language_then_fail( ) -def test_when_analyze_two_entities_embedded_then_return_results(nlp_engine): - analyzer = AnalyzerEngine(nlp_engine=nlp_engine) +def test_when_analyze_two_entities_embedded_then_return_results(spacy_nlp_engine): + analyzer = AnalyzerEngine(nlp_engine=spacy_nlp_engine) # Name with driver license in it text = "My name is John 1234567 Doe" @@ -318,10 +319,10 @@ def test_when_entities_is_none_then_return_all_fields(loaded_registry): def test_when_entities_is_none_all_recognizers_loaded_then_return_all_fields( - nlp_engine, + spacy_nlp_engine, ): analyze_engine = 
AnalyzerEngine( - registry=RecognizerRegistry(), nlp_engine=nlp_engine + registry=RecognizerRegistry(), nlp_engine=spacy_nlp_engine ) threshold = 0 text = "My name is Sharon and I live in Seattle." "Domain: microsoft.com " @@ -337,7 +338,7 @@ def test_when_entities_is_none_all_recognizers_loaded_then_return_all_fields( def test_when_analyze_then_apptracer_has_value( - loaded_registry, unit_test_guid, nlp_engine + loaded_registry, unit_test_guid, spacy_nlp_engine ): text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932, my phone is 425 8829090" # noqa E501 language = "en" @@ -347,7 +348,7 @@ def test_when_analyze_then_apptracer_has_value( loaded_registry, app_tracer=app_tracer_mock, log_decision_process=True, - nlp_engine=nlp_engine, + nlp_engine=spacy_nlp_engine, ) results = analyzer_engine_with_spacy.analyze( correlation_id=unit_test_guid, @@ -470,7 +471,7 @@ def test_when_get_supported_fields_then_return_all_languages( def test_when_get_supported_fields_specific_language_then_return_single_result( - loaded_registry, unit_test_guid, nlp_engine + loaded_registry, unit_test_guid, spacy_nlp_engine ): pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8) pattern_recognizer = PatternRecognizer( @@ -480,7 +481,7 @@ def test_when_get_supported_fields_specific_language_then_return_single_result( supported_language="ru", ) - analyzer = AnalyzerEngine(registry=loaded_registry, nlp_engine=nlp_engine) + analyzer = AnalyzerEngine(registry=loaded_registry, nlp_engine=spacy_nlp_engine) analyzer.registry.add_recognizer(pattern_recognizer) entities = analyzer.get_supported_entities(language="ru") @@ -507,7 +508,7 @@ def test_when_get_recognizers_then_returns_supported_language(): assert len(response) == 1 -def test_when_add_recognizer_then_also_outputs_others(nlp_engine): +def test_when_add_recognizer_then_also_outputs_others(spacy_nlp_engine): pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8) pattern_recognizer = PatternRecognizer( "ROCKET", @@ -521,7 +522,7 @@ def test_when_add_recognizer_then_also_outputs_others(nlp_engine): assert len(registry.recognizers) > 1 - analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine) + analyzer = AnalyzerEngine(registry=registry, nlp_engine=spacy_nlp_engine) text = "Michael Jones has a rocket" @@ -652,9 +653,9 @@ def test_entities_filter_for_ad_hoc_removes_recognizer(loaded_analyzer_engine): assert "MR" not in [resp.entity_type for resp in responses2] -def test_ad_hoc_with_context_support_higher_confidence(nlp_engine, zip_code_recognizer): +def test_ad_hoc_with_context_support_higher_confidence(spacy_nlp_engine, zip_code_recognizer): text = "Mr. 
John Smith's zip code is 10023" - analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine) + analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine) responses1 = analyzer_engine.analyze( text=text, language="en", ad_hoc_recognizers=[zip_code_recognizer] @@ -686,7 +687,7 @@ def test_ad_hoc_when_no_other_recognizers_are_requested_returns_only_ad_hoc_resu assert "ZIP" in [resp.entity_type for resp in responses] -def test_when_recognizer_doesnt_return_recognizer_name_no_exception(nlp_engine): +def test_when_recognizer_doesnt_return_recognizer_name_no_exception(spacy_nlp_engine): class MockRecognizer1(EntityRecognizer, ABC): def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): return [RecognizerResult("TEST1", 10, 30, 0.5)] @@ -703,7 +704,7 @@ def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): registry.add_recognizer(mock_recognizer1) registry.add_recognizer(mock_recognizer2) - analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry) + analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine, registry=registry) results = analyzer_engine.analyze("ABC", language="en") assert len(results) == 2 @@ -735,7 +736,7 @@ def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): ) -def test_when_recognizer_overrides_enhance_score_then_it_get_boosted_once(nlp_engine): +def test_when_recognizer_overrides_enhance_score_then_it_get_boosted_once(spacy_nlp_engine): class MockRecognizer(EntityRecognizer, ABC): def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): return [ @@ -768,7 +769,7 @@ def enhance_using_context( registry = RecognizerRegistry() registry.add_recognizer(mock_recognizer) - analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry) + analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine, registry=registry) recognizer_results = analyzer_engine.analyze("ABC", language="en") assert len(recognizer_results) == 2 @@ -809,7 +810,7 @@ def enhance_using_context( ] -def test_when_multiple_nameless_recognizers_context_is_correct(nlp_engine): +def test_when_multiple_nameless_recognizers_context_is_correct(spacy_nlp_engine): rocket_recognizer = PatternRecognizer( supported_entity="ROCKET", context=["cool"], @@ -825,7 +826,7 @@ def test_when_multiple_nameless_recognizers_context_is_correct(nlp_engine): registry.add_recognizer(rocket_recognizer) registry.add_recognizer(rocket_recognizer2) - analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry) + analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine, registry=registry) recognizer_results = analyzer_engine.analyze( "I have a cool rocket and a fast missile.", language="en" ) diff --git a/presidio-analyzer/tests/test_batch_analyzer_engine.py b/presidio-analyzer/tests/test_batch_analyzer_engine.py index 51cd34ec0..e454cee35 100644 --- a/presidio-analyzer/tests/test_batch_analyzer_engine.py +++ b/presidio-analyzer/tests/test_batch_analyzer_engine.py @@ -158,7 +158,7 @@ def test_analyze_dict_on_nested_dict(batch_analyzer_engine_simple): key="key_a1", value=nested_dict["key_a"]["key_a1"], recognizer_results=[ - RecognizerResult("PHONE_NUMBER", start=19, end=31, score=0.75) + RecognizerResult("PHONE_NUMBER", start=19, end=31, score=0.4) ], ) ], diff --git a/presidio-analyzer/tests/test_context_support.py b/presidio-analyzer/tests/test_context_support.py index 3054b29ee..a7b4d00be 100644 --- a/presidio-analyzer/tests/test_context_support.py +++ 
b/presidio-analyzer/tests/test_context_support.py @@ -90,7 +90,7 @@ def us_license_recognizer(): def test_when_text_with_aditional_context_lemma_based_context_enhancer_then_analysis_explanation_include_correct_supportive_context_word( # noqa: E501 - nlp_engine, lemma_context, us_license_recognizer + spacy_nlp_engine, lemma_context, us_license_recognizer ): """This test checks that LemmaContextAwareEnhancer uses supportive context word from analyze input as if it was in the text itself. @@ -100,7 +100,7 @@ def test_when_text_with_aditional_context_lemma_based_context_enhancer_then_anal return that word as supportive_context_word instead of other recognizer context word """ text = "John Smith license is AC432223" - nlp_artifacts = nlp_engine.process_text(text, "en") + nlp_artifacts = spacy_nlp_engine.process_text(text, "en") recognizer_results = us_license_recognizer.analyze(text, nlp_artifacts) results_without_additional_context = lemma_context.enhance_using_context( text, recognizer_results, nlp_artifacts, [us_license_recognizer] @@ -125,8 +125,8 @@ def test_when_text_with_aditional_context_lemma_based_context_enhancer_then_anal ) -def test_when_text_with_only_aditional_context_lemma_based_context_enhancer_then_analysis_explanation_include_correct_supportive_context_word( # noqa: E501 - nlp_engine, lemma_context, us_license_recognizer +def test_when_text_with_only_additional_context_lemma_based_context_enhancer_then_analysis_explanation_include_correct_supportive_context_word( # noqa: E501 + spacy_nlp_engine, lemma_context, us_license_recognizer ): """This test checks that LemmaContextAwareEnhancer uses supportive context word from analyze input as if it was in the text itself but no other words apear @@ -138,7 +138,7 @@ def test_when_text_with_only_aditional_context_lemma_based_context_enhancer_then return that word as supportive_context_word and raise the score. """ text = "John Smith D.R is AC432223" - nlp_artifacts = nlp_engine.process_text(text, "en") + nlp_artifacts = spacy_nlp_engine.process_text(text, "en") recognizer_results = us_license_recognizer.analyze(text, nlp_artifacts) results_without_additional_context = lemma_context.enhance_using_context( text, recognizer_results, nlp_artifacts, [us_license_recognizer] @@ -166,11 +166,11 @@ def test_when_text_with_only_aditional_context_lemma_based_context_enhancer_then def test_when_text_with_context_then_improves_score( - dataset, nlp_engine, mock_nlp_artifacts, lemma_context, recognizers_list + dataset, spacy_nlp_engine, mock_nlp_artifacts, lemma_context, recognizers_list ): for item in dataset: text, recognizer, entities = item - nlp_artifacts = nlp_engine.process_text(text, "en") + nlp_artifacts = spacy_nlp_engine.process_text(text, "en") results_without_context = recognizer.analyze(text, entities, mock_nlp_artifacts) results_with_context = recognizer.analyze(text, entities, nlp_artifacts) @@ -189,7 +189,7 @@ def test_when_text_with_context_then_improves_score( assert res_wo.score <= res_w.score -def test_when_context_custom_recognizer_then_succeed(nlp_engine, mock_nlp_artifacts): +def test_when_context_custom_recognizer_then_succeed(spacy_nlp_engine, mock_nlp_artifacts): """This test checks that a custom recognizer is also enhanced by context. 
However this test also verifies a specific case in which the pattern also @@ -206,7 +206,7 @@ def test_when_context_custom_recognizer_then_succeed(nlp_engine, mock_nlp_artifa text = "hi, this is a cool ROCKET" recognizer = rocket_recognizer entities = ["ROCKET"] - nlp_artifacts = nlp_engine.process_text(text, "en") + nlp_artifacts = spacy_nlp_engine.process_text(text, "en") results_without_context = recognizer.analyze(text, entities, mock_nlp_artifacts) results_with_context = recognizer.analyze(text, entities, nlp_artifacts) assert len(results_without_context) == len(results_with_context) diff --git a/presidio-analyzer/tests/test_ner_model_configuration.py b/presidio-analyzer/tests/test_ner_model_configuration.py new file mode 100644 index 000000000..51d83774c --- /dev/null +++ b/presidio-analyzer/tests/test_ner_model_configuration.py @@ -0,0 +1,75 @@ +from pathlib import Path + +import pytest +import yaml + +from presidio_analyzer.nlp_engine import NerModelConfiguration + + +@pytest.fixture(scope="module") +def ner_model_configuration_dict(): + this_path = Path(__file__).parent.absolute() + conf_file = Path(this_path, "conf/test_transformers.yaml") + with open(conf_file) as f: + configuration_dict = yaml.safe_load(f) + + return configuration_dict["ner_model_configuration"] + + +@pytest.mark.parametrize( + "key, original_value, expected_value", + [ + ("labels_to_ignore", [], []), + ("labels_to_ignore", ["A", "B"], ["A", "B"]), + ("aggregation_strategy", "X", "X"), + ("alignment_mode", "Y", "Y"), + ("stride", 51, 51), + ("model_to_presidio_entity_mapping", {"A": "B"}, {"A": "B"}), + ("low_score_entity_names", ["A", "C"], ["A", "C"]), + ("low_confidence_score_multiplier", 12.0, 12.0), + ], +) +def test_from_dict_happy_path( + ner_model_configuration_dict, key, original_value, expected_value +): + ner_model_configuration_dict[key] = original_value + + result = NerModelConfiguration.from_dict(ner_model_configuration_dict) + assert result.to_dict()[key] == expected_value + + +@pytest.mark.parametrize( + "key, value", + [ + ("stride", []), + ("stride", "X"), + ("stride", None), + ("alignment_mode", 5), + ("alignment_mode", None), + ("low_confidence_score_multiplier", "X"), + ], +) +def test_from_dict_wrong_types(ner_model_configuration_dict, key, value): + new_config = ner_model_configuration_dict.copy() + new_config[key] = value + with pytest.raises(ValueError): + NerModelConfiguration.from_dict(new_config) + + +@pytest.mark.parametrize( + "key", + [ + ("labels_to_ignore"), + ("aggregation_strategy"), + ("alignment_mode"), + ("model_to_presidio_entity_mapping"), + ("low_confidence_score_multiplier"), + ("low_score_entity_names"), + ("stride"), + ], +) +def test_from_dict_missing_fields(ner_model_configuration_dict, key): + new_config = ner_model_configuration_dict.copy() + del new_config[key] + with pytest.raises(ValueError): + NerModelConfiguration.from_dict(new_config) diff --git a/presidio-analyzer/tests/test_nlp_engine_provider.py b/presidio-analyzer/tests/test_nlp_engine_provider.py index 4fa92f109..bf0b3d3dd 100644 --- a/presidio-analyzer/tests/test_nlp_engine_provider.py +++ b/presidio-analyzer/tests/test_nlp_engine_provider.py @@ -1,30 +1,48 @@ -import json from pathlib import Path -from typing import Dict, List -from unittest.mock import patch +from typing import Dict import pytest import spacy -import yaml from presidio_analyzer.nlp_engine import ( SpacyNlpEngine, StanzaNlpEngine, NlpEngineProvider, - NerModelConfiguration, ) from presidio_analyzer.nlp_engine.transformers_nlp_engine 
import TransformersNlpEngine +@pytest.fixture(scope="module") +def mock_he_model(): + """ + Create an empty Hebrew spaCy pipeline and save it to disk. + + So that it could be loaded using spacy.load() + """ + he = spacy.blank("he") + he.to_disk("he_test") + + +@pytest.fixture(scope="module") +def mock_bn_model(): + """ + Create an empty Bengali spaCy pipeline and save it to disk. + + So that it could be loaded using spacy.load() + """ + bn = spacy.blank("bn") + bn.to_disk("bn_test") + + @pytest.fixture(scope="session") def nlp_configuration_dict() -> Dict: nlp_configuration = { - "lang_code": "en", - "model_name": { - "spacy": "en_core_web_lg", - "transformers": "StanfordAIMI/stanford-deidentifier-base", - }, - } + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_lg", + "transformers": "StanfordAIMI/stanford-deidentifier-base", + }, + } return nlp_configuration @@ -210,7 +228,7 @@ def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_not def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_keys_not_include_spacy_then_fail( - nlp_configuration_dict, + nlp_configuration_dict, ): nlp_configuration = nlp_configuration_dict.copy() del nlp_configuration["model_name"]["spacy"] @@ -220,7 +238,7 @@ def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_key def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_keys_not_include_transformers_then_fail( - nlp_configuration_dict, + nlp_configuration_dict, ): nlp_configuration = nlp_configuration_dict.copy() del nlp_configuration["model_name"]["transformers"] @@ -229,61 +247,6 @@ def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_key NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() -def test_ner_model_configuration_from_json( - ner_model_configuration_dict, tmp_path_factory -): - fn = tmp_path_factory.mktemp("data") / "nlp_configuration.json" - fn.write_text(json.dumps(ner_model_configuration_dict), "UTF-8") - - ner_model_configuration = NerModelConfiguration.from_json(fn.absolute()) - assert ner_model_configuration.nlp_engine_name == "transformers" - assert ( - ner_model_configuration.low_score_entity_names - == ner_model_configuration_dict["low_score_entity_names"] - ) - assert ( - ner_model_configuration.aggregation_strategy - == ner_model_configuration_dict["aggregation_strategy"] - ) - assert ( - ner_model_configuration.alignment_mode - == ner_model_configuration_dict["alignment_mode"] - ) - - -def test_nlp_model_configuration_from_yaml( - ner_model_configuration_dict, tmp_path_factory -): - fn = tmp_path_factory.mktemp("data") / "nlp_configuration.yaml" - fn.write_text(yaml.safe_dump(ner_model_configuration_dict), "UTF-8") - - ner_model_configuration = NerModelConfiguration.from_yaml(fn.absolute()) - assert ner_model_configuration.nlp_engine_name == "transformers" - assert ( - ner_model_configuration.low_score_entity_names - == ner_model_configuration_dict["low_score_entity_names"] - ) - assert ( - ner_model_configuration.aggregation_strategy - == ner_model_configuration_dict["aggregation_strategy"] - ) - assert ( - ner_model_configuration.alignment_mode - == ner_model_configuration_dict["alignment_mode"] - ) - - -def test_nlp_model_configuration_from_yaml_missing_field( - ner_model_configuration_dict, tmp_path_factory -): - fn = tmp_path_factory.mktemp("data") / "nlp_configuration.yaml" - del ner_model_configuration_dict["nlp_engine_name"] - 
fn.write_text(yaml.safe_dump(ner_model_configuration_dict), "UTF-8")
-
-    with pytest.raises(ValueError):
-        NerModelConfiguration.from_yaml(fn.absolute())
-
-
 def test_nlp_engine_provider_init_through_nlp_engine_configuration():
     engine = NlpEngineProvider().create_engine()
     assert isinstance(engine, SpacyNlpEngine)

diff --git a/presidio-analyzer/tests/test_phone_recognizer.py b/presidio-analyzer/tests/test_phone_recognizer.py
index 55f40be43..f10024f4f 100644
--- a/presidio-analyzer/tests/test_phone_recognizer.py
+++ b/presidio-analyzer/tests/test_phone_recognizer.py
@@ -9,10 +9,6 @@ def recognizer():
     return PhoneRecognizer()


-@pytest.fixture(scope="module")
-def nlp_engine(nlp_engines):
-    return nlp_engines["spacy_en"]
-

 @pytest.mark.parametrize(
     "text, expected_len, entities, expected_positions, score",
@@ -32,7 +28,7 @@
     ],
 )
 def test_when_all_phones_then_succeed(
-    nlp_engine,
+    spacy_nlp_engine,
     text,
     expected_len,
     entities,
     expected_positions,
     score,
     recognizer,
 ):
-    nlp_artifacts = nlp_engine.process_text(text, "en")
+    nlp_artifacts = spacy_nlp_engine.process_text(text, "en")
     results = recognizer.analyze(text, entities, nlp_artifacts=nlp_artifacts)
     assert len(results) == expected_len
     for i, (res, (st_pos, fn_pos)) in enumerate(zip(results, expected_positions)):

diff --git a/presidio-analyzer/tests/test_spacy_nlp_engine.py b/presidio-analyzer/tests/test_spacy_nlp_engine.py
index 313405da5..d09fdbe87 100644
--- a/presidio-analyzer/tests/test_spacy_nlp_engine.py
+++ b/presidio-analyzer/tests/test_spacy_nlp_engine.py
@@ -1,18 +1,22 @@
 from typing import Iterator

+import pytest

-def test_simple_process_text(nlp_engine):
+from presidio_analyzer.nlp_engine import SpacyNlpEngine

-    nlp_artifacts = nlp_engine.process_text("simple text", language="en")
+
+def test_simple_process_text(spacy_nlp_engine):
+
+    nlp_artifacts = spacy_nlp_engine.process_text("simple text", language="en")
     assert len(nlp_artifacts.tokens) == 2
     assert not nlp_artifacts.entities
     assert nlp_artifacts.lemmas[0] == "simple"
     assert nlp_artifacts.lemmas[1] == "text"


-def test_process_batch_strings(nlp_engine):
+def test_process_batch_strings(spacy_nlp_engine):

-    nlp_artifacts_batch = nlp_engine.process_batch(
+    nlp_artifacts_batch = spacy_nlp_engine.process_batch(
         ["simple text", "simple text"], language="en"
     )
     assert isinstance(nlp_artifacts_batch, Iterator)
@@ -21,3 +25,23 @@
     for text, nlp_artifacts in nlp_artifacts_batch:
         assert text == "simple text"
         assert len(nlp_artifacts.tokens) == 2
+
+
+def test_nlp_not_loaded_value_error():
+    unloaded_spacy_nlp = SpacyNlpEngine()
+    with pytest.raises(ValueError):
+        unloaded_spacy_nlp.process_text("This should fail as the NLP model isn't loaded", language="en")
+
+
+def test_validate_model_params_missing_fields():
+    model = {
+        "lang_code": "en",
+        "model_name": "en_core_web_lg"
+    }
+
+    for key in model.keys():
+        new_model = model.copy()
+        del new_model[key]
+
+        with pytest.raises(ValueError):
+            SpacyNlpEngine._validate_model_params(new_model)
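As a worked illustration of the validation exercised above (this mirrors the test rather than adding new API):

```python
# Minimal sketch: the model-dict shape SpacyNlpEngine expects.
# Removing either required key makes _validate_model_params raise ValueError.
from presidio_analyzer.nlp_engine import SpacyNlpEngine

model = {"lang_code": "en", "model_name": "en_core_web_lg"}
SpacyNlpEngine._validate_model_params(model)  # passes: both keys present

del model["model_name"]
try:
    SpacyNlpEngine._validate_model_params(model)
except ValueError as err:
    print(err)  # reports the missing field
```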
diff --git a/presidio-analyzer/tests/test_spacy_recognizer.py b/presidio-analyzer/tests/test_spacy_recognizer.py
index 68ce78fab..23b3b45d8 100644
--- a/presidio-analyzer/tests/test_spacy_recognizer.py
+++ b/presidio-analyzer/tests/test_spacy_recognizer.py
@@ -1,8 +1,9 @@
 import pytest

-from presidio_analyzer.nlp_engine import SpacyNlpEngine
+from presidio_analyzer.predefined_recognizers import SpacyRecognizer
 from tests import assert_result_within_score_range

+
 @pytest.fixture(scope="module")
 def entities():
     return ["PERSON", "DATE_TIME"]
@@ -18,7 +19,7 @@ def prepare_and_analyze(nlp, recognizer, text, ents):
     results = recognizer.analyze(text, ents, nlp_artifacts)
     return results

-
+@pytest.mark.integration
 @pytest.mark.parametrize(
     "text, expected_len, expected_positions, entity_num",
     [
@@ -47,13 +48,13 @@
 def test_when_using_spacy_then_all_spacy_result_found(
     text,
     expected_len,
     expected_positions,
     entity_num,
-    nlp_engine,
+    spacy_nlp_engine,
     nlp_recognizer,
     entities,
     ner_strength,
     max_score,
 ):
-    results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities)
+    results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities)
     assert len(results) == expected_len
     entity_to_check = entities[entity_num]
     for res, (st_pos, fn_pos) in zip(results, expected_positions):
@@ -63,10 +64,10 @@


 def test_when_person_in_text_then_person_full_name_complex_found(
-    nlp_engine, nlp_recognizer, entities
+    spacy_nlp_engine, nlp_recognizer, entities
 ):
     text = "William Bill Alexander"
-    results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities)
+    results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities)
     assert len(results) > 0

@@ -79,7 +80,7 @@
     assert len(text) - len(covered_text) < 5


-def test_nlp_not_loaded_value_error():
-    spacy_nlp = SpacyNlpEngine()
-    with pytest.raises(ValueError):
-        spacy_nlp.process_text("This should fail as the NLP model isn't loaded", language="en")
+def test_analyze_no_nlp_artifacts():
+    spacy_recognizer = SpacyRecognizer()
+    res = spacy_recognizer.analyze(text="text", nlp_artifacts=None, entities=["PERSON"])
+    assert len(res) == 0

diff --git a/presidio-analyzer/tests/test_stanza_recognizer.py b/presidio-analyzer/tests/test_stanza_recognizer.py
index fcab9755e..94f53f4ad 100644
--- a/presidio-analyzer/tests/test_stanza_recognizer.py
+++ b/presidio-analyzer/tests/test_stanza_recognizer.py
@@ -10,9 +10,11 @@ def entities():

 @pytest.mark.skip_engine("stanza_en")
 @pytest.fixture(scope="module")
-def nlp_engine(nlp_engines):
-    return nlp_engines.get("stanza_en", None)
-
+def spacy_nlp_engine(nlp_engines):
+    nlp_engine = nlp_engines.get("stanza_en", None)
+    if nlp_engine:
+        nlp_engine.load()
+    return nlp_engine

 @pytest.mark.skip_engine("stanza_en")
 @pytest.fixture(scope="module")
@@ -56,13 +58,13 @@
 def test_when_using_stanza_then_all_stanza_result_correct(
     text,
     expected_len,
     expected_positions,
     entity_num,
-    nlp_engine,
+    spacy_nlp_engine,
     nlp_recognizer,
     entities,
     ner_strength,
     max_score,
 ):
-    results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities)
+    results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities)
     assert len(results) == expected_len
     entity_to_check = entities[entity_num]
     for res, (st_pos, fn_pos) in zip(results, expected_positions):
@@ -73,10 +75,10 @@

 @pytest.mark.skip_engine("stanza_en")
 def test_when_person_in_text_then_person_full_name_complex_found(
-    nlp_engine, nlp_recognizer, entities
+    spacy_nlp_engine, nlp_recognizer, entities
 ):
     text = "Richard (Rick) C. Henderson"
-    results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities)
+    results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities)
     assert len(results) > 0
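A note on the `integration` marker used in these test patches: because `conftest.py` now registers it (see patch 36), the slower, model-dependent tests can be deselected with pytest's standard `-m` filter:

```sh
# Run the analyzer tests without the slow integration tests:
pytest presidio-analyzer/tests -m "not integration"
```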
Henderson" - results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) + results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities) assert len(results) > 0 diff --git a/presidio-analyzer/tests/test_transformers_nlp_engine.py b/presidio-analyzer/tests/test_transformers_nlp_engine.py new file mode 100644 index 000000000..c11db92b0 --- /dev/null +++ b/presidio-analyzer/tests/test_transformers_nlp_engine.py @@ -0,0 +1,48 @@ +import pytest + +from presidio_analyzer.nlp_engine import TransformersNlpEngine + + +def test_default_models(): + engine = TransformersNlpEngine() + assert len(engine.models) > 0 + assert engine.models[0]["lang_code"] == "en" + assert isinstance(engine.models[0]["model_name"], dict) + + +def test_validate_model_params_happy_path(): + model = { + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_sm", + "transformers": "obi/deid_roberta_i2b2", + }, + } + + TransformersNlpEngine._validate_model_params(model) + +@pytest.mark.parametrize( + "key", + [ + ("lang_code"), + ("model_name"), + ("model_name.spacy"), + ("model_name.transformers") + ], +) +def test_validate_model_params_missing_fields(key): + model = { + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_sm", + "transformers": "obi/deid_roberta_i2b2", + }, + } + keys = key.split(".") + if len(keys) == 1: + del model[keys[0]] + else: + del model[keys[0]][keys[1]] + + with pytest.raises(ValueError): + TransformersNlpEngine._validate_model_params(model) diff --git a/presidio-analyzer/tests/test_transformers_recognizer.py b/presidio-analyzer/tests/test_transformers_recognizer.py index 03303cd9b..9ebc1fa57 100644 --- a/presidio-analyzer/tests/test_transformers_recognizer.py +++ b/presidio-analyzer/tests/test_transformers_recognizer.py @@ -10,14 +10,17 @@ def entities(): @pytest.mark.skip_engine("transformers_en") @pytest.fixture(scope="module") -def nlp_engine(nlp_engines): - return nlp_engines.get("transformers_en", None) +def nlp_recognizer(nlp_recognizers): + return nlp_recognizers.get("transformers", None) @pytest.mark.skip_engine("transformers_en") @pytest.fixture(scope="module") -def nlp_recognizer(nlp_recognizers): - return nlp_recognizers.get("transformers", None) +def nlp_engine(nlp_engines): + nlp_engine = nlp_engines.get("transformers_en", None) + if nlp_engine: + nlp_engine.load() + return nlp_engine def prepare_and_analyze(nlp, recognizer, text, entities): @@ -27,6 +30,7 @@ def prepare_and_analyze(nlp, recognizer, text, entities): return results +@pytest.mark.itegration @pytest.mark.skip_engine("transformers_en") @pytest.mark.parametrize( "text, expected_len, expected_positions, entity_num", @@ -76,9 +80,10 @@ def test_when_using_transformers_then_all_transformers_result_correct( ) +@pytest.mark.itegration @pytest.mark.skip_engine("transformers_en") def test_when_person_in_text_then_person_full_name_complex_found( - nlp_engine, nlp_recognizer, entities + nlp_engine, nlp_recognizer, entities ): text = "Richard (Rick) C. 
Henderson" results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) From b194c92db5f82c58df9bdd7f706f656ba3b00a09 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 18 Sep 2023 13:41:55 +0300 Subject: [PATCH 37/67] updates to tests and docs --- docs/analyzer/customizing_nlp_models.md | 43 ++++++++--- docs/analyzer/developing_recognizers.md | 4 +- docs/analyzer/index.md | 37 +-------- docs/analyzer/nlp_engines/spacy_stanza.md | 1 - docs/analyzer/nlp_engines/transformers.md | 77 +++++++++++++++---- docs/anonymizer/index.md | 35 +-------- docs/api/analyzer_python.md | 4 +- docs/api/anonymizer_python.md | 2 +- docs/api/image_redactor_python.md | 13 +--- docs/getting_started.md | 7 +- docs/index.md | 6 +- docs/installation.md | 36 ++++----- .../python/transformers_recognizer/index.md | 22 ++++-- docs/text_anonymization.md | 5 +- docs/tutorial/04_external_services.md | 2 +- mkdocs.yml | 58 +++++++------- presidio-analyzer/tests/conftest.py | 35 +++------ .../tests/test_analyzer_engine.py | 10 +++ .../tests/test_spacy_recognizer.py | 1 + 19 files changed, 201 insertions(+), 197 deletions(-) diff --git a/docs/analyzer/customizing_nlp_models.md b/docs/analyzer/customizing_nlp_models.md index 3e67934c4..5969e86b1 100644 --- a/docs/analyzer/customizing_nlp_models.md +++ b/docs/analyzer/customizing_nlp_models.md @@ -1,11 +1,11 @@ -# Customizing the NLP models in Presidio Analyzer - -Presidio uses NLP engines for two main tasks: NER based PII identification, -and feature extraction for custom rule based logic (such as leveraging context words for improved detection). -While Presidio comes with an open-source model (the `en_core_web_lg` model from spaCy), -it can be customized by leveraging other NLP models, either public or proprietary. -These models can be trained or downloaded from existing NLP frameworks like [spaCy](https://spacy.io/usage/models), -[Stanza](https://github.com/stanfordnlp/stanza) and +# Customizing the NLP engine in Presidio Analyzer + +Presidio uses NLP engines for two main tasks: NER based PII identification, +and feature extraction for downstream rule based logic (such as leveraging context words for improved detection). +While Presidio comes with an open-source model (the `en_core_web_lg` model from spaCy), +additional NLP models and frameworks could be plugged in, either public or proprietary. +These models can be trained or downloaded from existing NLP frameworks like [spaCy](https://spacy.io/usage/models), +[Stanza](https://github.com/stanfordnlp/stanza) and [transformers](https://github.com/huggingface/transformers). In addition, other types of NLP frameworks [can be integrated into Presidio](developing_recognizers.md#machine-learning-ml-based-or-rule-based). @@ -63,9 +63,30 @@ Configuration can be done in two ways: - lang_code: es model_name: es_core_news_md + ner_model_configuration: + labels_to_ignore: + - O + model_to_presidio_entity_mapping: + PER: PERSON + LOC: LOCATION + ORG: ORGANIZATION + AGE: AGE + ID: ID + DATE: DATE_TIME + low_confidence_score_multiplier: 0.4 + low_score_entity_names: + - ID + - ORG ``` - The default conf file is read during the default initialization of the `AnalyzerEngine`. Alternatively, the path to a custom configuration file can be passed to the `NlpEngineProvider`: + The `ner_model_configuration` section contains the following parameters: + + - `labels_to_ignore`: A list of labels to ignore. For example, `O` (no entity) or entities you are not interested in returning. 
+ - `model_to_presidio_entity_mapping`: A mapping between the NER model's labels and the Presidio entity types.
+ - `low_confidence_score_multiplier`: A multiplier to apply to the score of entities with low confidence.
+ - `low_score_entity_names`: A list of entity types to apply the low confidence score multiplier to.
+
+ The [default conf file](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/conf/default.yaml) is read during the default initialization of the `AnalyzerEngine`. Alternatively, the path to a custom configuration file can be passed to the `NlpEngineProvider`:
 
    ```python
    from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
@@ -97,12 +118,14 @@ Configuration can be done in two ways:
     c. pass requests in each of these languages.
 
 !!! note "Note"
-    Presidio can currently use one NLP model per language.
+    Presidio can currently use one NER model per language via the `NlpEngine`. If multiple are required,
+    consider wrapping NER models as additional recognizers ([see sample here](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py)).
 
 ## Leverage frameworks other than spaCy, Stanza and transformers for ML based PII detection
 
 In addition to the built-in spaCy/Stanza/transformers capabilities, it is possible to create new recognizers which serve as interfaces to other models. For more information:
+
 - [Remote recognizer documentation](adding_recognizers.md#creating-a-remote-recognizer) and [samples](../samples/python/integrating_with_external_services.ipynb).
 - [Flair recognizer example](../samples/python/flair_recognizer.py)
diff --git a/docs/analyzer/developing_recognizers.md b/docs/analyzer/developing_recognizers.md
index 546c0ce35..98e1631b4 100644
--- a/docs/analyzer/developing_recognizers.md
+++ b/docs/analyzer/developing_recognizers.md
@@ -49,9 +49,9 @@ See some examples here:
 
 Many PII entities are undetectable using naive approaches like deny-lists or regular expressions. In these cases, we would wish to utilize a Machine Learning model capable of identifying entities in free text, or a rule-based recognizer. There are four options for adding ML and rule based recognizers:
 
-#### Utilize SpaCy or Stanza
+#### Utilize SpaCy, Stanza or Transformers
 
-Presidio currently uses [spaCy](https://spacy.io/) as a framework for text analysis and Named Entity Recognition (NER), and [stanza](https://stanfordnlp.github.io/stanza/) as an alternative. To avoid introducing new tools, it is recommended to first try to use `spaCy` or `stanza` over other tools if possible.
+Presidio currently uses [spaCy](https://spacy.io/) as a framework for text analysis and Named Entity Recognition (NER), and [stanza](https://stanfordnlp.github.io/stanza/) and [huggingface transformers](https://huggingface.co/docs/transformers/index) as an alternative. To avoid introducing new tools, it is recommended to first try to use `spaCy`, `stanza` or `transformers` over other tools if possible.
 
 `spaCy` provides decent results compared to state-of-the-art NER models, but with much better computational performance. `spaCy` and `stanza` models could be trained from scratch, used in combination with pre-trained embeddings, or retrained to detect new entities. When integrating such a model into Presidio, a class inheriting from the [`EntityRecognizer`](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/entity_recognizer.py) should be created.
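 For illustration, a minimal sketch of such a wrapper might look like the following. The `external_model` object and its `predict` method are hypothetical placeholders for whichever framework is being wrapped; only the `EntityRecognizer` interface (`load`, `analyze`) comes from Presidio:

 ```python
 from typing import List, Optional

 from presidio_analyzer import EntityRecognizer, RecognizerResult
 from presidio_analyzer.nlp_engine import NlpArtifacts


 class ExternalModelRecognizer(EntityRecognizer):
     """Wrap an external NER model as a Presidio recognizer (illustrative sketch)."""

     def __init__(self, external_model, supported_entities: List[str]):
         # Assumption: external_model.predict(text) yields (label, start, end, score) tuples
         self.external_model = external_model
         super().__init__(supported_entities=supported_entities)

     def load(self) -> None:
         # The model is assumed to already be loaded when passed to __init__
         pass

     def analyze(
         self,
         text: str,
         entities: List[str],
         nlp_artifacts: Optional[NlpArtifacts] = None,
     ) -> List[RecognizerResult]:
         # Translate the external model's predictions into Presidio results,
         # keeping only the entity types requested by the caller
         return [
             RecognizerResult(label, start, end, score)
             for label, start, end, score in self.external_model.predict(text)
             if label in entities
         ]
 ```

 Such a recognizer can then be added to the `RecognizerRegistry` like any other recognizer.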
diff --git a/docs/analyzer/index.md b/docs/analyzer/index.md index 3a98f8cb2..6412834ad 100644 --- a/docs/analyzer/index.md +++ b/docs/analyzer/index.md @@ -14,42 +14,7 @@ Named Entity Recognition and other types of logic to detect PII in unstructured ## Installation -=== "Using pip" - - !!! note "Note" - Consider installing the Presidio python packages on a virtual environment like venv or conda. - - To get started with Presidio-analyzer, - download the package and the `en_core_web_lg` spaCy model: - - ```sh - pip install presidio-analyzer - python -m spacy download en_core_web_lg - ``` - -=== "Using Docker" - - !!! note "Note" - This requires Docker to be installed. [Download Docker](https://docs.docker.com/get-docker/). - - ```sh - # Download image from Dockerhub - docker pull mcr.microsoft.com/presidio-analyzer - - # Run the container with the default port - docker run -d -p 5002:3000 mcr.microsoft.com/presidio-analyzer:latest - ``` - -=== "From source" - - First, clone the Presidio repo. [See here for instructions](../installation.md#install-from-source). - - Then, build the presidio-analyzer container: - - ```sh - cd presidio-analyzer - docker build . -t presidio/presidio-analyzer - ``` +see [Installing Presidio](../installation.md). ## Getting started diff --git a/docs/analyzer/nlp_engines/spacy_stanza.md b/docs/analyzer/nlp_engines/spacy_stanza.md index c7e6e9fc8..435e5752a 100644 --- a/docs/analyzer/nlp_engines/spacy_stanza.md +++ b/docs/analyzer/nlp_engines/spacy_stanza.md @@ -30,7 +30,6 @@ For the available models, follow these links: [spaCy](https://spacy.io/usage/mod !!! tip "Tip" For Person, Location and Organization detection, it could be useful to try out the transformers based models (e.g. `en_core_web_trf`) which uses a more modern deep-learning architecture, but is generally slower than the default `en_core_web_lg` model. - ### Configure Presidio to use the pre-trained model Once created, see [the NLP configuration documentation](../customizing_nlp_models.md#Configure-Presidio-to-use-the-new-model) for more information. diff --git a/docs/analyzer/nlp_engines/transformers.md b/docs/analyzer/nlp_engines/transformers.md index 89a8a9f37..a36f0f94b 100644 --- a/docs/analyzer/nlp_engines/transformers.md +++ b/docs/analyzer/nlp_engines/transformers.md @@ -4,19 +4,17 @@ Presidio's `TransformersNlpEngine` consists of a spaCy pipeline which encapsulat ![image](../../assets/spacy-transformers-ner.png) -Presidio leverages other types of information from spaCy such as tokens, lemmas and part-of-speech. +Presidio leverages other types of information from spaCy such as tokens, lemmas and part-of-speech. Therefore the pipeline returns both the NER model results as well as results from other pipeline components. -!!! warning "Warning" - spaCy and transformers use a different tokenization approach. Therefore, it could be that there is no alignment between the spans identified by a transformers model and the spans created by spaCy. In this cases, there could be cases where the output of the transformers model is different from the output of Presidio's `TransformersNlpEngine` - ## Adding a new model As the underlying transformers model, you can choose from either a public pretrained model or a custom model. 
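Whichever model is chosen, the engine can also be instantiated directly in code rather than via a configuration file. A minimal sketch (`obi/deid_roberta_i2b2` serves only as an example model name here):

```python
from presidio_analyzer.nlp_engine import TransformersNlpEngine

# Pair a spaCy pipeline (tokens, lemmas, POS) with a transformers NER model
transformers_nlp_engine = TransformersNlpEngine(
    models=[
        {
            "lang_code": "en",
            "model_name": {
                "spacy": "en_core_web_sm",
                "transformers": "obi/deid_roberta_i2b2",
            },
        }
    ]
)
```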
### Using a public pre-trained transformers model -#### Downloading a pre-trained model +### Downloading a pre-trained model + To download the desired NER model from HuggingFace: ```python @@ -34,28 +32,72 @@ AutoModelForTokenClassification.from_pretrained(transformers_model) ``` Then, also download a spaCy pipeline/model: + ```sh python -m spacy download en_core_web_sm ``` #### Creating a configuration file -Once the models are downloaded, the easiest option would be to create a YAML configuration file. -Note that this file needs to contain both a `spaCy` pipeline name and a transformers model name: + +Once the models are downloaded, one option to configure them is to create a YAML configuration file. +Note that the configuration needs to contain both a `spaCy` pipeline name and a transformers model name. +In addition, different configurations for parsing the results of the transformers model can be added. + +Example configuration (in YAML): ```yaml nlp_engine_name: transformers models: -- -lang_code: en -model_name: - spacy: - transformers: + - + lang_code: en + model_name: + spacy: en_core_web_sm + transformers: StanfordAIMI/stanford-deidentifier-base + +ner_model_configuration: + labels_to_ignore: + - O + aggregation_strategy: simple # "simple", "first", "average", "max" + stride: 16 + alignment_mode: strict # "strict", "contract", "expand" + model_to_presidio_entity_mapping: + PER: PERSON + LOC: LOCATION + ORG: ORGANIZATION + AGE: AGE + ID: ID + EMAIL: EMAIL + PATIENT: PERSON + STAFF: PERSON + HOSP: ORGANIZATION + PATORG: ORGANIZATION + DATE: DATE_TIME + PHONE: PHONE_NUMBER + HCW: PERSON + HOSPITAL: ORGANIZATION + + low_confidence_score_multiplier: 0.4 + low_score_entity_names: + - ID ``` - + Where: -- `` is a name of a spaCy model/pipeline, which would wrap the transformers NER model. For example, `en_core_web_sm`. -- The `` is the full path for a huggingface model. Models can be found on [HuggingFace Models Hub](https://huggingface.co/models?pipeline_tag=token-classification). For example, `obi/deid_roberta_i2b2` +- `model_name.spacy` is a name of a spaCy model/pipeline, which would wrap the transformers NER model. For example, `en_core_web_sm`. +- The `model_name.transformers` is the full path for a huggingface model. Models can be found on [HuggingFace Models Hub](https://huggingface.co/models?pipeline_tag=token-classification). For example, `obi/deid_roberta_i2b2` + +The `ner_model_configuration` section contains the following parameters: + +- `labels_to_ignore`: A list of labels to ignore. For example, `O` (no entity) or entities you are not interested in returning. +- `aggregation_strategy`: The strategy to use when aggregating the results of the transformers model. +- `stride`: The value is the length of the window overlap in transformer tokenizer tokens. +- `alignment_mode`: The strategy to use when aligning the results of the transformers model to the original text. +- `model_to_presidio_entity_mapping`: A mapping between the transformers model labels and the Presidio entity types. +- `low_confidence_score_multiplier`: A multiplier to apply to the score of entities with low confidence. +- `low_score_entity_names`: A list of entity types to apply the low confidence score multiplier to. + +See more information on parameters on the [spacy-huggingface-pipelines Github repo](https://github.com/explosion/spacy-huggingface-pipelines#token-classification). 
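As a usage sketch, assuming the configuration above is saved to `./transformers.yaml` (an illustrative path), it can be loaded through the `NlpEngineProvider` and passed to the analyzer:

```python
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider

# Create the transformers-based NLP engine from the configuration file
provider = NlpEngineProvider(conf_file="./transformers.yaml")
transformers_nlp_engine = provider.create_engine()

# Pass the engine to the analyzer and run a sample request
analyzer = AnalyzerEngine(
    nlp_engine=transformers_nlp_engine, supported_languages=["en"]
)
results = analyzer.analyze(text="My name is David Johnson", language="en")
print(results)
```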
+ Once created, see [the NLP configuration documentation](../customizing_nlp_models.md#Configure-Presidio-to-use-the-new-model) for more information. ### Training your own model @@ -66,3 +108,8 @@ Once created, see [the NLP configuration documentation](../customizing_nlp_model For more information on model training and evaluation for Presidio, see the [Presidio-Research Github repository](https://github.com/microsoft/presidio-research). To train your own model, see this tutorial: [Train your own transformers model](https://huggingface.co/docs/transformers/training). + +### Using a transformers model as an `EntityRecognizer` + +In addition to the approach described in this document, one can decide to integrate a transformers model as a recognizer. +We allow these two options, as a user might want to have multiple NER models running in parallel. In this case, one can create multiple `EntityRecognizer` instances, each serving a different model, instead of one model used in an `NlpEngine`. [See this sample](../../samples/python/transformers_recognizer/index.md) for more info on integrating a transformers model as a Presidio recognizer and not as a Presidio `NLPEngine`. diff --git a/docs/anonymizer/index.md b/docs/anonymizer/index.md index 78a014508..b0c272a34 100644 --- a/docs/anonymizer/index.md +++ b/docs/anonymizer/index.md @@ -17,40 +17,7 @@ with some other value by applying a certain operator (e.g. replace, mask, redact ## Installation -=== "Using pip" - - !!! note "Note" - Consider installing the Presidio python packages on a virtual environment like venv or conda. - - To install Presidio Anonymizer, run: - - ```sh - pip install presidio-anonymizer - ``` - -=== "Using Docker" - - !!! note "Note" - This requires Docker to be installed. [Download Docker](https://docs.docker.com/get-docker/). - - ```sh - # Download image from Dockerhub - docker pull mcr.microsoft.com/presidio-anonymizer - - # Run the container with the default port - docker run -d -p 5001:3000 mcr.microsoft.com/presidio-anonymizer:latest - ``` - -=== "From source" - - First, clone the Presidio repo. [See here for instructions](../installation.md#install-from-source). - - Then, build the presidio-anonymizer container: - - ```sh - cd presidio-anonymizer - docker build . -t presidio/presidio-anonymizer - ``` +see [Installing Presidio](../installation.md). 
## Getting started diff --git a/docs/api/analyzer_python.md b/docs/api/analyzer_python.md index 4267add11..9e0665a22 100644 --- a/docs/api/analyzer_python.md +++ b/docs/api/analyzer_python.md @@ -2,5 +2,5 @@ ::: presidio_analyzer handler: python - selection: - docstring_style: sphinx \ No newline at end of file + options: + docstring_style: sphinx diff --git a/docs/api/anonymizer_python.md b/docs/api/anonymizer_python.md index bf0b42832..f59ee1255 100644 --- a/docs/api/anonymizer_python.md +++ b/docs/api/anonymizer_python.md @@ -2,5 +2,5 @@ ::: presidio_anonymizer handler: python - selection: + options: docstring_style: sphinx diff --git a/docs/api/image_redactor_python.md b/docs/api/image_redactor_python.md index 2eb5290b6..33aa583ad 100644 --- a/docs/api/image_redactor_python.md +++ b/docs/api/image_redactor_python.md @@ -1,15 +1,6 @@ # Presidio Image Redactor API Reference -## ImageRedactorEngine class - -::: presidio_image_redactor.ImageRedactorEngine - handler: python - selection: - docstring_style: sphinx - -## ImageAnalyzerEngine class - -::: presidio_image_redactor.ImageAnalyzerEngine +::: presidio_image_redactor handler: python - selection: + options: docstring_style: sphinx diff --git a/docs/getting_started.md b/docs/getting_started.md index 04c7ec267..71b36baab 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -2,7 +2,7 @@ ## Simple flow: Text -Using Presidio's modules as Python packages to get started +Using Presidio's modules as Python packages to get started: === "Anonymize PII in text (Default spaCy model)" @@ -84,9 +84,10 @@ Using Presidio's modules as Python packages to get started anonymized_text = anonymizer.anonymize(text=text, analyzer_results=results) print(anonymized_text) + ``` - - The transformers model and the spacy model would be downloaded on the first call to the `AnalyzerEngine`. + !!! tip "Tip: Downloading models" + If not available, the transformers model and the spacy model would be downloaded on the first call to the `AnalyzerEngine`. To pre-download, see [this doc](./analyzer/nlp_engines/transformers.md#downloading-a-pre-trained-model). ## Simple flow: Images diff --git a/docs/index.md b/docs/index.md index 50a098a4a..3c7c1ae1c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,12 +46,12 @@ bitcoin wallets, US phone numbers, financial data and more. ## Running Presidio -1. [Running Presidio via code](samples/python/index.md) +1. [Samples for running Presidio via code](samples/index.md) 2. [Running Presidio as an HTTP service](samples/docker/index.md) 3. [Setting up a development environment](development.md) 4. [Perform PII identification using presidio-analyzer](analyzer/index.md) -5. [Perform PII anonymization using presidio-anonymizer](anonymizer/index.md) -6. [Perform PII identification and anonymization in images using presidio-image-redactor](image-redactor/index.md) +5. [Perform PII de-identification using presidio-anonymizer](anonymizer/index.md) +6. [Perform PII identification and redaction in images using presidio-image-redactor](image-redactor/index.md) 7. [Example deployments](samples/deployments/index.md) --- diff --git a/docs/installation.md b/docs/installation.md index 73249ee99..d1efafaa3 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -2,9 +2,7 @@ ## Description -This document describes how to download and install the Presidio services locally. 
-As Presidio is comprised of several packages/services,
-this document describes the installation of the entire
+This document describes the installation of the entire
 Presidio suite using `pip` (as Python packages) or using `Docker` (As containerized services).
 
 ## Using pip
 
@@ -12,7 +10,7 @@ Presidio suite using `pip` (as Python packages) or using `Docker` (As containeri
 
 !!! note "Note"
 
     Consider installing the Presidio python packages
-    on a virtual environment like [venv](https://docs.python.org/3/tutorial/venv.html)
+    in a virtual environment like [venv](https://docs.python.org/3/tutorial/venv.html)
     or [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html).
 
 ### Supported Python Versions
 
 Presidio is supported for the following python versions: 3.7, 3.8, 3.9, 3.10, 3.11.
 
 ### PII de-identification in text
 
-For PII anonymization on text, install the `presidio-analyzer` and `presidio-anonymizer` packages:
-
-```sh
-pip install presidio_analyzer
-pip install presidio_anonymizer
-```
-
-In addition, Presidio requires at least one NLP engine (spaCy, transformers or stanza):
+For PII anonymization on text, install the `presidio-analyzer` and `presidio-anonymizer` packages
+with at least one NLP engine (`spaCy`, `transformers` or `stanza`):
 
 === "spaCy (default)"
 
     ```
+    pip install presidio_analyzer
+    pip install presidio_anonymizer
     python -m spacy download en_core_web_lg
     ```
 
 === "Transformers"
 
     ```
     pip install "presidio_analyzer[transformers]"
+    pip install presidio_anonymizer
+    python -m spacy download en_core_web_sm
     ```
 
+    !!! note "Note"
+
+        When using a transformers NLP engine, Presidio would still use spaCy for other capabilities,
+        therefore a small spaCy model (such as en_core_web_sm) is required.
+        Transformers models would be loaded lazily. To pre-load them, see: [Downloading a pre-trained model](./analyzer/nlp_engines/transformers.md#downloading-a-pre-trained-model)
+
 === "Stanza"
 
     ```
     pip install "presidio_analyzer[stanza]"
+    pip install presidio_anonymizer
     ```
 
-For a more detailed installation of each package, refer to the specific documentation:
-
-* [presidio-analyzer](analyzer/index.md).
-* [presidio-anonymizer](anonymizer/index.md).
+    !!! note "Note"
+
+        Stanza models would be loaded lazily. To pre-load them, see: [Downloading a pre-trained model](./analyzer/nlp_engines/spacy_stanza.md#download-the-pre-trained-model).
 
 ### PII redaction in images
 
@@ -65,8 +67,6 @@ pip install presidio_image_redactor
 python -m spacy download en_core_web_lg
 ```
 
-[Click here](image-redactor/index.md) for more information on the presidio-image-redactor package.
-
 ## Using Docker
 
 Presidio can expose REST endpoints for each service using Flask and Docker.
diff --git a/docs/samples/python/transformers_recognizer/index.md b/docs/samples/python/transformers_recognizer/index.md
index bd9e46679..7c31b446d 100644
--- a/docs/samples/python/transformers_recognizer/index.md
+++ b/docs/samples/python/transformers_recognizer/index.md
@@ -1,24 +1,30 @@
-# Run Presidio With Transformers Models
+# Add a Transformers model based EntityRecognizer
+
+!!! note "Note"
+
+    This example demonstrates how to create a **Presidio Recognizer**.
+    To integrate a transformers model as a **Presidio NLP Engine**, see [this documentation](../../../analyzer/nlp_engines/transformers.md).
+
+    We allow these two options, as a user might want to have multiple NER models running in parallel. In this case, one can create multiple `EntityRecognizer` instances, each serving a different model. If you only plan to use one NER model, consider creating a [`TransformersNlpEngine`](../../../analyzer/nlp_engines/transformers.md) instead of the [`TransformersRecognizer`](https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/transformer_recognizer.py) described in this document.
 
-This example demonstrates how to extract PII entities using transformers models.
 When initializing the `TransformersRecognizer`, choose from the following options:
-1. A string referencing an uploaded model to HuggingFace. Use this url to access all TokenClassification models - https://huggingface.co/models?pipeline_tag=token-classification&sort=downloads
+
+1. A string referencing an uploaded model to HuggingFace. See the different available options for models [here](https://huggingface.co/models?pipeline_tag=token-classification&sort=downloads).
 2. Initialize your own `TokenClassificationPipeline` instance using your custom transformers model and use it for inference.
 3. Provide the path to your own local custom trained model.
 
 !!! note "Note"
-For each combination of model & dataset, it is recommended to create a configuration object which includes setting necessary parameters for getting the correct results. Please reference this [configuration.py](https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/configuration.py) file for examples.
 
-
-
-
-### Example Code
+## Example Code
 
 This example code uses a `TransformersRecognizer` for NER, and removes the default `SpacyRecognizer`.
 In order to be able to use spaCy features such as lemmas, we introduce the small (and faster) `en_core_web_sm` model.
 
+[link to full TransformersRecognizer code](https://github.com/microsoft/presidio/blob/main/docs/samples/python/transformers_recognizer/transformer_recognizer.py)
+
 ```python
 from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
 from presidio_analyzer.nlp_engine import NlpEngineProvider
diff --git a/docs/text_anonymization.md b/docs/text_anonymization.md
index a73a48cf3..1f989b2b0 100644
--- a/docs/text_anonymization.md
+++ b/docs/text_anonymization.md
@@ -2,8 +2,8 @@
 
 Presidio features two main modules for anonymizing PII in text:
 
-- [Presidio analyzer](analyzer/index.md): Identification PII in text
-- [Presidio anonymizer](anonymizer/index.md): Anonymize detected PII entities using different operators
+- [Presidio analyzer](analyzer/index.md): Identification of PII in text
+- [Presidio anonymizer](anonymizer/index.md): De-identify detected PII entities using different operators
 
 In most cases, we would run the Presidio analyzer to detect where PII entities exist, and then
 the Presidio anonymizer to remove those using specific operators (such as redact, replace, hash or encrypt)
@@ -14,4 +14,3 @@ This figure presents the overall flow in high level:
 
 - The [Presidio Analyzer](analyzer/index.md) holds multiple recognizers, each one capable of detecting specific PII entities. These recognizers leverage regular expressions, deny lists, checksum, rule based logic, Named Entity Recognition ML models and context from surrounding words.
- The [Presidio Anonymizer](anonymizer/index.md) holds multiple operators, each one can be used to anonymize the PII entity in a different way. Additionally, it can be used to de-anonymize an already anonymized entity (For example, decrypt an encrypted entity) - diff --git a/docs/tutorial/04_external_services.md b/docs/tutorial/04_external_services.md index 92990e0c4..47d231c08 100644 --- a/docs/tutorial/04_external_services.md +++ b/docs/tutorial/04_external_services.md @@ -13,5 +13,5 @@ In a similar way to example 3, we can write logic to call external services for ## Calling a model in a different framework -- [This example](../samples/python/flair_recognizer.py) shows a Presidio wrapper for a Flair model. +- [This example](https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py) shows a Presidio wrapper for a Flair model. - Using a similar approach, we could create wrappers for HuggingFace models, Conditional Random Fields or any other framework. diff --git a/mkdocs.yml b/mkdocs.yml index 8c077ee1e..f5774ccfb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -26,32 +26,38 @@ nav: - Custom anonymization: tutorial/11_custom_anonymization.md - Encryption/Decryption: tutorial/12_encryption.md - Allow-lists: tutorial/13_allow_list.md - - Docs: - - Handling text: - - Home: text_anonymization.md - - Presidio Analyzer: - - Home: analyzer/index.md - - Developing PII recognizers: - - Tutorial : analyzer/adding_recognizers.md - - Best practices in developing recognizers : analyzer/developing_recognizers.md - - Multi-language support: analyzer/languages.md - - Customizing the NLP model: analyzer/customizing_nlp_models.md - - Tracing the decision process: analyzer/decision_process.md - - Presidio Anonymizer: - - Home: anonymizer/index.md - - Developing PII operators: anonymizer/adding_operators.md - - Handling images: image-redactor/index.md - - Supported entities: supported_entities.md - - Development and design: - - Design: design.md - - Setting up a development environment: development.md - - Build and release process: build_release.md - - Changes from V1 to V2: presidio_V2.md - - Python API reference: - - Presidio Analyzer Python API: api/analyzer_python.md - - Presidio Anonymizer Python API: api/anonymizer_python.md - - Presidio Image Redactor Python API: api/image_redactor_python.md - - REST API reference: https://microsoft.github.io/presidio/api-docs/api-docs.html" target="_blank + + - Handling text: + - Home: text_anonymization.md + - Presidio Analyzer: + - Home: analyzer/index.md + - Developing PII recognizers: + - Tutorial : analyzer/adding_recognizers.md + - Best practices in developing recognizers : analyzer/developing_recognizers.md + - Multi-language support: analyzer/languages.md + - Customizing the NLP model: + - Home: analyzer/customizing_nlp_models.md + - Spacy/Stanza: analyzer/nlp_engines/spacy_stanza.md + - Transformers: analyzer/nlp_engines/transformers.md + - Tracing the decision process: analyzer/decision_process.md + - Presidio Anonymizer: + - Home: anonymizer/index.md + - Developing PII operators: anonymizer/adding_operators.md + - Handling images: + - Home: image-redactor/index.md + - Evaluating DICOM redaction: image-redactor/evaluating_dicom_redaction.md + - Supported entities: supported_entities.md + - Development and design: + - Design: design.md + - Setting up a development environment: development.md + - Build and release process: build_release.md + - Changes from V1 to V2: presidio_V2.md + - Python API reference: + - Home: api.md + - 
Presidio Analyzer Python API: api/analyzer_python.md + - Presidio Anonymizer Python API: api/anonymizer_python.md + - Presidio Image Redactor Python API: api/image_redactor_python.md + - REST API reference: https://microsoft.github.io/presidio/api-docs/api-docs.html" target="_blank - Samples: samples/index.md - Community: community.md - FAQ: faq.md diff --git a/presidio-analyzer/tests/conftest.py b/presidio-analyzer/tests/conftest.py index b6505bba8..948f5f20a 100644 --- a/presidio-analyzer/tests/conftest.py +++ b/presidio-analyzer/tests/conftest.py @@ -21,7 +21,6 @@ def pytest_configure(config): config.addinivalue_line( "markers", "skip_engine(nlp_engine): skip test for given nlp engine" ) - config.addinivalue_line("markers", "integration: mark test as an integration test") @pytest.fixture(scope="session") @@ -129,28 +128,18 @@ def zip_code_recognizer(): return zip_recognizer -@pytest.fixture(scope="session") -def zip_code_deny_list_recognizer(): - regex = r"(\b\d{5}(?:\-\d{4})?\b)" - zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) - zip_recognizer = PatternRecognizer( - supported_entity="ZIP", deny_list=["999"], patterns=[zipcode_pattern] - ) - return zip_recognizer - def pytest_sessionfinish(): """Remove files created during mock spaCy models creation.""" - he_test_model_path = Path(Path(__file__).parent.parent, "he_test") - if he_test_model_path.exists(): - try: - shutil.rmtree(he_test_model_path) - except OSError as e: - print("Failed to remove file: %s - %s." % (e.filename, e.strerror)) - - bn_test_model_path = Path(Path(__file__).parent.parent, "bn_test") - if bn_test_model_path.exists(): - try: - shutil.rmtree(bn_test_model_path) - except OSError as e: - print("Failed to remove file: %s - %s." % (e.filename, e.strerror)) + + mock_models = ("he_test", "bn_test") + + for mock_model in mock_models: + test_model_path1 = Path(Path(__file__).parent, mock_model) + test_model_path2 = Path(Path(__file__).parent.parent, mock_model) + for path in (test_model_path1, test_model_path2): + if path.exists(): + try: + shutil.rmtree(path) + except OSError as e: + print("Failed to remove file: %s - %s." 
% (e.filename, e.strerror))
diff --git a/presidio-analyzer/tests/test_analyzer_engine.py b/presidio-analyzer/tests/test_analyzer_engine.py
index 68c28d512..9494a1c66 100644
--- a/presidio-analyzer/tests/test_analyzer_engine.py
+++ b/presidio-analyzer/tests/test_analyzer_engine.py
@@ -39,6 +39,16 @@ def loaded_analyzer_engine(loaded_registry, app_tracer):
     return analyzer_engine
 
 
+@pytest.fixture(scope="module")
+def zip_code_deny_list_recognizer():
+    regex = r"(\b\d{5}(?:\-\d{4})?\b)"
+    zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01)
+    zip_recognizer = PatternRecognizer(
+        supported_entity="ZIP", deny_list=["999"], patterns=[zipcode_pattern]
+    )
+    return zip_recognizer
+
+
 @pytest.fixture(scope="module")
 def unit_test_guid():
     return "00000000-0000-0000-0000-000000000000"
diff --git a/presidio-analyzer/tests/test_spacy_recognizer.py b/presidio-analyzer/tests/test_spacy_recognizer.py
index 23b3b45d8..f58b52fab 100644
--- a/presidio-analyzer/tests/test_spacy_recognizer.py
+++ b/presidio-analyzer/tests/test_spacy_recognizer.py
@@ -19,6 +19,7 @@ def prepare_and_analyze(nlp, recognizer, text, ents):
     results = recognizer.analyze(text, ents, nlp_artifacts)
     return results
 
+
 @pytest.mark.integration
 @pytest.mark.parametrize(
     "text, expected_len, expected_positions, entity_num",

From 0604028a11cb151ff3ae40d13ce2e4901602b51d Mon Sep 17 00:00:00 2001
From: Omri Mendels
Date: Mon, 18 Sep 2023 13:48:55 +0300
Subject: [PATCH 38/67] revert tests to separate PRs

---
 docs/faq.md                                   |  40 +++---
 .../batch_analyzer_engine.py                  |  18 +--
 .../tests/conf/test_transformers.yaml         |  36 ------
 presidio-analyzer/tests/conftest.py           | 117 ++++++++++++------
 .../tests/mocks/nlp_engine_mock.py            |   2 +-
 .../tests/test_analyzer_engine.py             |  51 +++-----
 .../tests/test_batch_analyzer_engine.py       |   2 +-
 .../tests/test_context_support.py             |  18 +--
 .../tests/test_ner_model_configuration.py     |  75 -----------
 .../tests/test_nlp_engine_provider.py         |  99 ++++++++++-----
 .../tests/test_phone_recognizer.py            |   8 +-
 .../tests/test_spacy_nlp_engine.py            |  32 +----
 .../tests/test_spacy_recognizer.py            |  20 ++-
 .../tests/test_stanza_recognizer.py           |  16 ++-
 .../tests/test_transformers_nlp_engine.py     |  48 -------
 .../tests/test_transformers_recognizer.py     |  15 +--
 16 files changed, 238 insertions(+), 359 deletions(-)
 delete mode 100644 presidio-analyzer/tests/conf/test_transformers.yaml
 delete mode 100644 presidio-analyzer/tests/test_ner_model_configuration.py
 delete mode 100644 presidio-analyzer/tests/test_transformers_nlp_engine.py

diff --git a/docs/faq.md b/docs/faq.md
index 37e3afafe..d5c894cb3 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -1,26 +1,27 @@
 # Frequently Asked Questions (FAQ)
 
 - [General](#general)
-  - [What is Presidio?](#what-is-presidio)
-  - [Why did Microsoft create Presidio?](#why-did-microsoft-create-presidio)
-  - [Is Microsoft Presidio an official Microsoft product?](#is-microsoft-presidio-an-official-microsoft-product)
-  - [What is the difference between Presidio and different PII detection services like Azure Text Analytics and Amazon
Comprehend?](#what-is-the-difference-between-presidio-and-different-pii-detection-services-like-azure-text-analytics-and-amazon-comprehend) - [Using Presidio](#using-presidio) - - [How can I start using Presidio?](#how-can-i-start-using-presidio) - - [What are the main building blocks in Presidio?](#what-are-the-main-building-blocks-in-presidio) + - [How can I start using Presidio?](#how-can-i-start-using-presidio) + - [What are the main building blocks in Presidio?](#what-are-the-main-building-blocks-in-presidio) - [Customizing Presidio](#customizing-presidio) - - [How can Presidio be customized to my needs?](#how-can-presidio-be-customized-to-my-needs) - - [What NLP frameworks does Presidio support?](#what-nlp-frameworks-does-presidio-support) - - [Can Presidio be used for Pseudonymization?](#can-presidio-be-used-for-pseudonymization) - - [Does Presidio work on structured/tabular data?](#does-presidio-work-on-structuredtabular-data) + - [How can Presidio be customized to my needs?](#how-can-presidio-be-customized-to-my-needs) + - [What NLP frameworks does Presidio support?](#what-nlp-frameworks-does-presidio-support) + - [Can Presidio be used for Pseudonymization?](#can-presidio-be-used-for-pseudonymization) + - [Does Presidio work on structured/tabular data?](#does-presidio-work-on-structuredtabular-data) - [Improving detection accuracy](#improving-detection-accuracy) - - [What can I do if Presidio does not detect some of the PII entities in my data (False Negatives)?](#what-can-i-do-if-presidio-does-not-detect-some-of-the-pii-entities-in-my-data-false-negatives) - - [What can I do if Presidio falsely detects text as PII entities (False Positives)?](#what-can-i-do-if-presidio-falsely-detects-text-as-pii-entities-false-positives) - - [How can I evaluate the performance of my Presidio instance?](#how-can-i-evaluate-the-performance-of-my-presidio-instance) + - [What can I do if Presidio does not detect some of the PII entities in my data (False Negatives)?](#what-can-i-do-if-presidio-does-not-detect-some-of-the-pii-entities-in-my-data-false-negatives) + - [What can I do if Presidio falsely detects text as PII entities (False Positives)?](#what-can-i-do-if-presidio-falsely-detects-text-as-pii-entities-false-positives) + - [How can I evaluate the performance of my Presidio instance?](#how-can-i-evaluate-the-performance-of-my-presidio-instance) - [Deployment](#deployment) - - [How can I deploy Presidio into my environment?](#how-can-i-deploy-presidio-into-my-environment) + - [How can I deploy Presidio into my environment?](#how-can-i-deploy-presidio-into-my-environment) - [Contributing](#contributing) - - [How can I contribute to Presidio?](#how-can-i-contribute-to-presidio) + - [How can I contribute to Presidio?](#how-can-i-contribute-to-presidio) + - [How can I report security vulnerabilities?](#how-can-i-report-security-vulnerabilities) ## General @@ -94,11 +95,11 @@ For more information, see the [docs](https://microsoft.github.io/presidio/analyz ### Can Presidio be used for Pseudonymization? -Pseudonymization is a de-identification technique in which the real data is replaced with fake data. Since there are various ways and approaches for this, we provide a simple [sample](https://microsoft.github.io/presidio/samples/python/example_custom_lambda_anonymizer/) which can be extended for more sophisticated usage. If you have a question or a request on this topic, please open an issue on the repo. 
+Pseudonymization is a de-identification technique in which the real data is replaced with fake data in a reversible way. Since there are various ways and approaches for this, we provide a simple [sample](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_custom_lambda_anonymizer.py) which can be extended for more sophisticated usage. If you have a question or a request on this topic, please open an issue on the repo.
 
 ### Does Presidio work on structured/tabular data?
 
-This is an area we are actively looking into. We have an [example implementation](https://microsoft.github.io/presidio/samples/python/batch_processing/) of using Presidio on structured/semi-structured data. Also see the different discussions on this topic on the [Discussions](https://github.com/microsoft/presidio/discussions) section. If you have a question, suggestion, or a contribution in this area, please reach out by opening an issue, starting a discussion or reaching us directly at presidio@microsoft.com
+This is an area we are actively looking into. We have an [example implementation](https://microsoft.github.io/presidio/samples/python/batch_processing/) of using Presidio on structured/semi-structured data. Also see the different discussions on this topic on the [Discussions](https://github.com/microsoft/presidio/discussions) section. If you have a question, suggestion, or a contribution in this area, please reach out by opening an issue, starting a discussion or reaching us directly at <presidio@microsoft.com>
 
 ## Improving detection accuracy
 
@@ -133,7 +134,8 @@ The main Presidio modules (analyzer, anonymizer, image-redactor) can be used bot
 
 ### How can I contribute to Presidio?
 
-First, review the [contribution guidelines](https://github.com/microsoft/presidio/blob/main/CONTRIBUTING.md), and feel free to reach out by opening an issue, posting a discussion or emailing us at presidio@microsoft.com
+First, review the [contribution guidelines](https://github.com/microsoft/presidio/blob/main/CONTRIBUTING.md), and feel free to reach out by opening an issue, posting a discussion or emailing us at <presidio@microsoft.com>
+
+### How can I report security vulnerabilities?
 
-### How can I report security vulnerabilities?
 Please see the [security information](https://github.com/microsoft/presidio/blob/main/SECURITY.md).
diff --git a/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py b/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py
index 4a428595d..daf323f12 100644
--- a/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py
+++ b/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py
@@ -8,18 +8,18 @@
 
 class BatchAnalyzerEngine:
-    """
-    Batch analysis of documents (tables, lists, dicts).
-
-    Wrapper class to run Presidio Analyzer Engine on multiple values,
-    either lists/iterators of strings, or dictionaries.
-
-    :param: analyzer_engine: AnalyzerEngine instance to use
-    for handling the values in those collections.
-    """
 
     def __init__(self, analyzer_engine: Optional[AnalyzerEngine] = None):
+        """
+        Batch analysis of documents (tables, lists, dicts).
+
+        Wrapper class to run Presidio Analyzer Engine on multiple values,
+        either lists/iterators of strings, or dictionaries.
+
+        :param: analyzer_engine: AnalyzerEngine instance to use
+        for handling the values in those collections.
+ """ + self.analyzer_engine = analyzer_engine if not analyzer_engine: self.analyzer_engine = AnalyzerEngine() diff --git a/presidio-analyzer/tests/conf/test_transformers.yaml b/presidio-analyzer/tests/conf/test_transformers.yaml deleted file mode 100644 index 5fca7be77..000000000 --- a/presidio-analyzer/tests/conf/test_transformers.yaml +++ /dev/null @@ -1,36 +0,0 @@ -nlp_engine_name: transformers -models: - - - lang_code: en - model_name: - spacy: en_core_web_lg - transformers: StanfordAIMI/stanford-deidentifier-base -ner_model_configuration: - labels_to_ignore: - - O - aggregation_strategy: simple # "simple", "first", "average", "max" - stride: 16 # If stride >= 0, process long texts in - # overlapping windows of the model max - # length. The value is the length of the - # window overlap in transformer tokenizer - # tokens, NOT the length of the stride. - alignment_mode: strict # "strict", "contract", "expand" - model_to_presidio_entity_mapping: - PER: PERSON - LOC: LOCATION - ORG: ORGANIZATION - AGE: AGE - ID: ID - EMAIL: EMAIL - PATIENT: PERSON - STAFF: PERSON - HOSP: ORGANIZATION - PATORG: ORGANIZATION - DATE: DATE_TIME - PHONE: PHONE_NUMBER - HCW: PERSON - HOSPITAL: ORGANIZATION - - low_confidence_score_multiplier: 0.4 - low_score_entity_names: - - ID \ No newline at end of file diff --git a/presidio-analyzer/tests/conftest.py b/presidio-analyzer/tests/conftest.py index 948f5f20a..5d5602787 100644 --- a/presidio-analyzer/tests/conftest.py +++ b/presidio-analyzer/tests/conftest.py @@ -3,6 +3,7 @@ from typing import Dict import pytest +import spacy from presidio_analyzer import ( EntityRecognizer, @@ -13,16 +14,31 @@ from presidio_analyzer import RecognizerRegistry from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpEngine from presidio_analyzer.predefined_recognizers import NLP_RECOGNIZERS -from tests.mocks import RecognizerRegistryMock, NlpEngineMock +from tests.mocks import RecognizerRegistryMock +def pytest_addoption(parser): + parser.addoption( + "--runfast", action="store_true", default=False, help="run fast tests" + ) + def pytest_configure(config): + config.addinivalue_line("markers", "slow: mark test as slow to run") config.addinivalue_line( "markers", "skip_engine(nlp_engine): skip test for given nlp engine" ) +def pytest_collection_modifyitems(config, items): + if config.getoption("--runfast"): + # --runfast given in cli: skip slow tests + skip_slow = pytest.mark.skip(reason="remove --runfast option to run") + for item in items: + if "slow" in item.keywords: + item.add_marker(skip_slow) + + @pytest.fixture(scope="session") def nlp_engine_provider() -> NlpEngineProvider: return NlpEngineProvider() @@ -34,28 +50,28 @@ def nlp_engines(request, nlp_engine_provider) -> Dict[str, NlpEngine]: nlp_engines = nlp_engine_provider.nlp_engines for name, engine_cls in nlp_engines.items(): - if name == "spacy": + if name == "spacy" and not request.config.getoption("--runfast"): available_engines[f"{name}_en"] = engine_cls( models=[{"lang_code": "en", "model_name": "en_core_web_lg"}] ) - elif name == "stanza": + elif name == "stanza" and not request.config.getoption("--runfast"): available_engines[f"{name}_en"] = engine_cls( models=[{"lang_code": "en", "model_name": "en"}] ) - elif name == "transformers": + elif name == "transformers" and not request.config.getoption("--runfast"): available_engines[f"{name}_en"] = engine_cls( - models=[ - { - "lang_code": "en", - "model_name": { - "spacy": "en_core_web_sm", - "transformers": "StanfordAIMI/stanford-deidentifier-base", - }, - } - 
] + models=[{ + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_sm", + "transformers": "StanfordAIMI/stanford-deidentifier-base", + }, + }] ) else: raise ValueError("Unsupported engine for tests") + # Load engine + available_engines[f"{name}_en"].load() return available_engines @@ -69,15 +85,6 @@ def skip_by_engine(request, nlp_engines): pytest.skip(f"skipped on this engine: {marker_arg}") -@pytest.mark.skip_engine("spacy_en") -@pytest.fixture(scope="session") -def spacy_nlp_engine(nlp_engines): - nlp_engine = nlp_engines.get("spacy_en", None) - if nlp_engine: - nlp_engine.load() - return nlp_engine - - @pytest.fixture(scope="session") def nlp_recognizers() -> Dict[str, EntityRecognizer]: return {name: rec_cls() for name, rec_cls in NLP_RECOGNIZERS.items()} @@ -103,19 +110,41 @@ def loaded_registry() -> RecognizerRegistry: return RecognizerRegistry() +@pytest.fixture(scope="module") +def nlp_engine(nlp_engines) -> NlpEngine: + return nlp_engines["spacy_en"] + + @pytest.fixture(scope="module") def mock_registry() -> RecognizerRegistryMock: return RecognizerRegistryMock() @pytest.fixture(scope="module") -def mock_nlp_engine() -> NlpEngineMock: - return NlpEngineMock() +def analyzer_engine_simple(mock_registry, nlp_engine) -> AnalyzerEngine: + return AnalyzerEngine(registry=mock_registry, nlp_engine=nlp_engine) -@pytest.fixture(scope="module") -def analyzer_engine_simple(mock_registry, mock_nlp_engine) -> AnalyzerEngine: - return AnalyzerEngine(registry=mock_registry, nlp_engine=mock_nlp_engine) +@pytest.fixture(scope="session") +def mock_he_model(): + """ + Create an empty Hebrew spaCy pipeline and save it to disk. + + So that it could be loaded using spacy.load() + """ + he = spacy.blank("he") + he.to_disk("he_test") + + +@pytest.fixture(scope="session") +def mock_bn_model(): + """ + Create an empty Bengali spaCy pipeline and save it to disk. + + So that it could be loaded using spacy.load() + """ + bn = spacy.blank("bn") + bn.to_disk("bn_test") @pytest.fixture(scope="session") @@ -128,18 +157,28 @@ def zip_code_recognizer(): return zip_recognizer +@pytest.fixture(scope="session") +def zip_code_deny_list_recognizer(): + regex = r"(\b\d{5}(?:\-\d{4})?\b)" + zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) + zip_recognizer = PatternRecognizer( + supported_entity="ZIP", deny_list=["999"], patterns=[zipcode_pattern] + ) + return zip_recognizer + def pytest_sessionfinish(): """Remove files created during mock spaCy models creation.""" - - mock_models = ("he_test", "bn_test") - - for mock_model in mock_models: - test_model_path1 = Path(Path(__file__).parent, mock_model) - test_model_path2 = Path(Path(__file__).parent.parent, mock_model) - for path in (test_model_path1, test_model_path2): - if path.exists(): - try: - shutil.rmtree(path) - except OSError as e: - print("Failed to remove file: %s - %s." % (e.filename, e.strerror)) + he_test_model_path = Path(Path(__file__).parent.parent, "he_test") + if he_test_model_path.exists(): + try: + shutil.rmtree(he_test_model_path) + except OSError as e: + print("Failed to remove file: %s - %s." % (e.filename, e.strerror)) + + bn_test_model_path = Path(Path(__file__).parent.parent, "bn_test") + if bn_test_model_path.exists(): + try: + shutil.rmtree(bn_test_model_path) + except OSError as e: + print("Failed to remove file: %s - %s." 
% (e.filename, e.strerror)) diff --git a/presidio-analyzer/tests/mocks/nlp_engine_mock.py b/presidio-analyzer/tests/mocks/nlp_engine_mock.py index a2a591968..5e8ab5568 100644 --- a/presidio-analyzer/tests/mocks/nlp_engine_mock.py +++ b/presidio-analyzer/tests/mocks/nlp_engine_mock.py @@ -8,7 +8,7 @@ def __init__(self, stopwords=None, punct_words=None, nlp_artifacts=None): self.stopwords = stopwords if stopwords else [] self.punct_words = punct_words if punct_words else [] if nlp_artifacts is None: - self.nlp_artifacts = NlpArtifacts([], [], [], [], None, "en", []) + self.nlp_artifacts = NlpArtifacts([], [], [], [], None, "en") else: self.nlp_artifacts = nlp_artifacts diff --git a/presidio-analyzer/tests/test_analyzer_engine.py b/presidio-analyzer/tests/test_analyzer_engine.py index 9494a1c66..746f2ae60 100644 --- a/presidio-analyzer/tests/test_analyzer_engine.py +++ b/presidio-analyzer/tests/test_analyzer_engine.py @@ -39,16 +39,6 @@ def loaded_analyzer_engine(loaded_registry, app_tracer): return analyzer_engine -@pytest.fixture(scope="module") -def zip_code_deny_list_recognizer(): - regex = r"(\b\d{5}(?:\-\d{4})?\b)" - zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) - zip_recognizer = PatternRecognizer( - supported_entity="ZIP", deny_list=["999"], patterns=[zipcode_pattern] - ) - return zip_recognizer - - @pytest.fixture(scope="module") def unit_test_guid(): return "00000000-0000-0000-0000-000000000000" @@ -59,7 +49,6 @@ def nlp_engine(nlp_engines): return nlp_engines["spacy_en"] -@pytest.mark.integration def test_simple(): dic = { "text": "John Smith drivers license is AC432223", @@ -90,14 +79,14 @@ def test_when_analyze_with_predefined_recognizers_then_return_results( def test_when_analyze_with_multiple_predefined_recognizers_then_succeed( - loaded_registry, unit_test_guid, spacy_nlp_engine, max_score + loaded_registry, unit_test_guid, nlp_engine, max_score ): text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090" language = "en" entities = ["CREDIT_CARD", "PHONE_NUMBER"] analyzer_engine_with_spacy = AnalyzerEngine( - registry=loaded_registry, nlp_engine=spacy_nlp_engine + registry=loaded_registry, nlp_engine=nlp_engine ) results = analyzer_engine_with_spacy.analyze( correlation_id=unit_test_guid, @@ -145,8 +134,8 @@ def test_when_analyze_with_unsupported_language_then_fail( ) -def test_when_analyze_two_entities_embedded_then_return_results(spacy_nlp_engine): - analyzer = AnalyzerEngine(nlp_engine=spacy_nlp_engine) +def test_when_analyze_two_entities_embedded_then_return_results(nlp_engine): + analyzer = AnalyzerEngine(nlp_engine=nlp_engine) # Name with driver license in it text = "My name is John 1234567 Doe" @@ -329,10 +318,10 @@ def test_when_entities_is_none_then_return_all_fields(loaded_registry): def test_when_entities_is_none_all_recognizers_loaded_then_return_all_fields( - spacy_nlp_engine, + nlp_engine, ): analyze_engine = AnalyzerEngine( - registry=RecognizerRegistry(), nlp_engine=spacy_nlp_engine + registry=RecognizerRegistry(), nlp_engine=nlp_engine ) threshold = 0 text = "My name is Sharon and I live in Seattle." 
"Domain: microsoft.com " @@ -348,7 +337,7 @@ def test_when_entities_is_none_all_recognizers_loaded_then_return_all_fields( def test_when_analyze_then_apptracer_has_value( - loaded_registry, unit_test_guid, spacy_nlp_engine + loaded_registry, unit_test_guid, nlp_engine ): text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932, my phone is 425 8829090" # noqa E501 language = "en" @@ -358,7 +347,7 @@ def test_when_analyze_then_apptracer_has_value( loaded_registry, app_tracer=app_tracer_mock, log_decision_process=True, - nlp_engine=spacy_nlp_engine, + nlp_engine=nlp_engine, ) results = analyzer_engine_with_spacy.analyze( correlation_id=unit_test_guid, @@ -481,7 +470,7 @@ def test_when_get_supported_fields_then_return_all_languages( def test_when_get_supported_fields_specific_language_then_return_single_result( - loaded_registry, unit_test_guid, spacy_nlp_engine + loaded_registry, unit_test_guid, nlp_engine ): pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8) pattern_recognizer = PatternRecognizer( @@ -491,7 +480,7 @@ def test_when_get_supported_fields_specific_language_then_return_single_result( supported_language="ru", ) - analyzer = AnalyzerEngine(registry=loaded_registry, nlp_engine=spacy_nlp_engine) + analyzer = AnalyzerEngine(registry=loaded_registry, nlp_engine=nlp_engine) analyzer.registry.add_recognizer(pattern_recognizer) entities = analyzer.get_supported_entities(language="ru") @@ -518,7 +507,7 @@ def test_when_get_recognizers_then_returns_supported_language(): assert len(response) == 1 -def test_when_add_recognizer_then_also_outputs_others(spacy_nlp_engine): +def test_when_add_recognizer_then_also_outputs_others(nlp_engine): pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8) pattern_recognizer = PatternRecognizer( "ROCKET", @@ -532,7 +521,7 @@ def test_when_add_recognizer_then_also_outputs_others(spacy_nlp_engine): assert len(registry.recognizers) > 1 - analyzer = AnalyzerEngine(registry=registry, nlp_engine=spacy_nlp_engine) + analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine) text = "Michael Jones has a rocket" @@ -663,9 +652,9 @@ def test_entities_filter_for_ad_hoc_removes_recognizer(loaded_analyzer_engine): assert "MR" not in [resp.entity_type for resp in responses2] -def test_ad_hoc_with_context_support_higher_confidence(spacy_nlp_engine, zip_code_recognizer): +def test_ad_hoc_with_context_support_higher_confidence(nlp_engine, zip_code_recognizer): text = "Mr. 
John Smith's zip code is 10023" - analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine) + analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine) responses1 = analyzer_engine.analyze( text=text, language="en", ad_hoc_recognizers=[zip_code_recognizer] @@ -697,7 +686,7 @@ def test_ad_hoc_when_no_other_recognizers_are_requested_returns_only_ad_hoc_resu assert "ZIP" in [resp.entity_type for resp in responses] -def test_when_recognizer_doesnt_return_recognizer_name_no_exception(spacy_nlp_engine): +def test_when_recognizer_doesnt_return_recognizer_name_no_exception(nlp_engine): class MockRecognizer1(EntityRecognizer, ABC): def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): return [RecognizerResult("TEST1", 10, 30, 0.5)] @@ -714,7 +703,7 @@ def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): registry.add_recognizer(mock_recognizer1) registry.add_recognizer(mock_recognizer2) - analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine, registry=registry) + analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry) results = analyzer_engine.analyze("ABC", language="en") assert len(results) == 2 @@ -746,7 +735,7 @@ def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): ) -def test_when_recognizer_overrides_enhance_score_then_it_get_boosted_once(spacy_nlp_engine): +def test_when_recognizer_overrides_enhance_score_then_it_get_boosted_once(nlp_engine): class MockRecognizer(EntityRecognizer, ABC): def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): return [ @@ -779,7 +768,7 @@ def enhance_using_context( registry = RecognizerRegistry() registry.add_recognizer(mock_recognizer) - analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine, registry=registry) + analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry) recognizer_results = analyzer_engine.analyze("ABC", language="en") assert len(recognizer_results) == 2 @@ -820,7 +809,7 @@ def enhance_using_context( ] -def test_when_multiple_nameless_recognizers_context_is_correct(spacy_nlp_engine): +def test_when_multiple_nameless_recognizers_context_is_correct(nlp_engine): rocket_recognizer = PatternRecognizer( supported_entity="ROCKET", context=["cool"], @@ -836,7 +825,7 @@ def test_when_multiple_nameless_recognizers_context_is_correct(spacy_nlp_engine) registry.add_recognizer(rocket_recognizer) registry.add_recognizer(rocket_recognizer2) - analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine, registry=registry) + analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry) recognizer_results = analyzer_engine.analyze( "I have a cool rocket and a fast missile.", language="en" ) diff --git a/presidio-analyzer/tests/test_batch_analyzer_engine.py b/presidio-analyzer/tests/test_batch_analyzer_engine.py index e454cee35..51cd34ec0 100644 --- a/presidio-analyzer/tests/test_batch_analyzer_engine.py +++ b/presidio-analyzer/tests/test_batch_analyzer_engine.py @@ -158,7 +158,7 @@ def test_analyze_dict_on_nested_dict(batch_analyzer_engine_simple): key="key_a1", value=nested_dict["key_a"]["key_a1"], recognizer_results=[ - RecognizerResult("PHONE_NUMBER", start=19, end=31, score=0.4) + RecognizerResult("PHONE_NUMBER", start=19, end=31, score=0.75) ], ) ], diff --git a/presidio-analyzer/tests/test_context_support.py b/presidio-analyzer/tests/test_context_support.py index a7b4d00be..3054b29ee 100644 --- a/presidio-analyzer/tests/test_context_support.py +++ 
b/presidio-analyzer/tests/test_context_support.py @@ -90,7 +90,7 @@ def us_license_recognizer(): def test_when_text_with_aditional_context_lemma_based_context_enhancer_then_analysis_explanation_include_correct_supportive_context_word( # noqa: E501 - spacy_nlp_engine, lemma_context, us_license_recognizer + nlp_engine, lemma_context, us_license_recognizer ): """This test checks that LemmaContextAwareEnhancer uses supportive context word from analyze input as if it was in the text itself. @@ -100,7 +100,7 @@ def test_when_text_with_aditional_context_lemma_based_context_enhancer_then_anal return that word as supportive_context_word instead of other recognizer context word """ text = "John Smith license is AC432223" - nlp_artifacts = spacy_nlp_engine.process_text(text, "en") + nlp_artifacts = nlp_engine.process_text(text, "en") recognizer_results = us_license_recognizer.analyze(text, nlp_artifacts) results_without_additional_context = lemma_context.enhance_using_context( text, recognizer_results, nlp_artifacts, [us_license_recognizer] @@ -125,8 +125,8 @@ def test_when_text_with_aditional_context_lemma_based_context_enhancer_then_anal ) -def test_when_text_with_only_additional_context_lemma_based_context_enhancer_then_analysis_explanation_include_correct_supportive_context_word( # noqa: E501 - spacy_nlp_engine, lemma_context, us_license_recognizer +def test_when_text_with_only_aditional_context_lemma_based_context_enhancer_then_analysis_explanation_include_correct_supportive_context_word( # noqa: E501 + nlp_engine, lemma_context, us_license_recognizer ): """This test checks that LemmaContextAwareEnhancer uses supportive context word from analyze input as if it was in the text itself but no other words apear @@ -138,7 +138,7 @@ def test_when_text_with_only_additional_context_lemma_based_context_enhancer_the return that word as supportive_context_word and raise the score. """ text = "John Smith D.R is AC432223" - nlp_artifacts = spacy_nlp_engine.process_text(text, "en") + nlp_artifacts = nlp_engine.process_text(text, "en") recognizer_results = us_license_recognizer.analyze(text, nlp_artifacts) results_without_additional_context = lemma_context.enhance_using_context( text, recognizer_results, nlp_artifacts, [us_license_recognizer] @@ -166,11 +166,11 @@ def test_when_text_with_only_additional_context_lemma_based_context_enhancer_the def test_when_text_with_context_then_improves_score( - dataset, spacy_nlp_engine, mock_nlp_artifacts, lemma_context, recognizers_list + dataset, nlp_engine, mock_nlp_artifacts, lemma_context, recognizers_list ): for item in dataset: text, recognizer, entities = item - nlp_artifacts = spacy_nlp_engine.process_text(text, "en") + nlp_artifacts = nlp_engine.process_text(text, "en") results_without_context = recognizer.analyze(text, entities, mock_nlp_artifacts) results_with_context = recognizer.analyze(text, entities, nlp_artifacts) @@ -189,7 +189,7 @@ def test_when_text_with_context_then_improves_score( assert res_wo.score <= res_w.score -def test_when_context_custom_recognizer_then_succeed(spacy_nlp_engine, mock_nlp_artifacts): +def test_when_context_custom_recognizer_then_succeed(nlp_engine, mock_nlp_artifacts): """This test checks that a custom recognizer is also enhanced by context. 
However this test also verifies a specific case in which the pattern also @@ -206,7 +206,7 @@ def test_when_context_custom_recognizer_then_succeed(spacy_nlp_engine, mock_nlp_ text = "hi, this is a cool ROCKET" recognizer = rocket_recognizer entities = ["ROCKET"] - nlp_artifacts = spacy_nlp_engine.process_text(text, "en") + nlp_artifacts = nlp_engine.process_text(text, "en") results_without_context = recognizer.analyze(text, entities, mock_nlp_artifacts) results_with_context = recognizer.analyze(text, entities, nlp_artifacts) assert len(results_without_context) == len(results_with_context) diff --git a/presidio-analyzer/tests/test_ner_model_configuration.py b/presidio-analyzer/tests/test_ner_model_configuration.py deleted file mode 100644 index 51d83774c..000000000 --- a/presidio-analyzer/tests/test_ner_model_configuration.py +++ /dev/null @@ -1,75 +0,0 @@ -from pathlib import Path - -import pytest -import yaml - -from presidio_analyzer.nlp_engine import NerModelConfiguration - - -@pytest.fixture(scope="module") -def ner_model_configuration_dict(): - this_path = Path(__file__).parent.absolute() - conf_file = Path(this_path, "conf/test_transformers.yaml") - with open(conf_file) as f: - configuration_dict = yaml.safe_load(f) - - return configuration_dict["ner_model_configuration"] - - -@pytest.mark.parametrize( - "key, original_value, expected_value", - [ - ("labels_to_ignore", [], []), - ("labels_to_ignore", ["A", "B"], ["A", "B"]), - ("aggregation_strategy", "X", "X"), - ("alignment_mode", "Y", "Y"), - ("stride", 51, 51), - ("model_to_presidio_entity_mapping", {"A": "B"}, {"A": "B"}), - ("low_score_entity_names", ["A", "C"], ["A", "C"]), - ("low_confidence_score_multiplier", 12.0, 12.0), - ], -) -def test_from_dict_happy_path( - ner_model_configuration_dict, key, original_value, expected_value -): - ner_model_configuration_dict[key] = original_value - - result = NerModelConfiguration.from_dict(ner_model_configuration_dict) - assert result.to_dict()[key] == expected_value - - -@pytest.mark.parametrize( - "key, value", - [ - ("stride", []), - ("stride", "X"), - ("stride", None), - ("alignment_mode", 5), - ("alignment_mode", None), - ("low_confidence_score_multiplier", "X"), - ], -) -def test_from_dict_wrong_types(ner_model_configuration_dict, key, value): - new_config = ner_model_configuration_dict.copy() - new_config[key] = value - with pytest.raises(ValueError): - NerModelConfiguration.from_dict(new_config) - - -@pytest.mark.parametrize( - "key", - [ - ("labels_to_ignore"), - ("aggregation_strategy"), - ("alignment_mode"), - ("model_to_presidio_entity_mapping"), - ("low_confidence_score_multiplier"), - ("low_score_entity_names"), - ("stride"), - ], -) -def test_from_dict_missing_fields(ner_model_configuration_dict, key): - new_config = ner_model_configuration_dict.copy() - del new_config[key] - with pytest.raises(ValueError): - NerModelConfiguration.from_dict(new_config) diff --git a/presidio-analyzer/tests/test_nlp_engine_provider.py b/presidio-analyzer/tests/test_nlp_engine_provider.py index bf0b3d3dd..4fa92f109 100644 --- a/presidio-analyzer/tests/test_nlp_engine_provider.py +++ b/presidio-analyzer/tests/test_nlp_engine_provider.py @@ -1,48 +1,30 @@ +import json from pathlib import Path -from typing import Dict +from typing import Dict, List +from unittest.mock import patch import pytest import spacy +import yaml from presidio_analyzer.nlp_engine import ( SpacyNlpEngine, StanzaNlpEngine, NlpEngineProvider, + NerModelConfiguration, ) from 
presidio_analyzer.nlp_engine.transformers_nlp_engine import TransformersNlpEngine -@pytest.fixture(scope="module") -def mock_he_model(): - """ - Create an empty Hebrew spaCy pipeline and save it to disk. - - So that it could be loaded using spacy.load() - """ - he = spacy.blank("he") - he.to_disk("he_test") - - -@pytest.fixture(scope="module") -def mock_bn_model(): - """ - Create an empty Bengali spaCy pipeline and save it to disk. - - So that it could be loaded using spacy.load() - """ - bn = spacy.blank("bn") - bn.to_disk("bn_test") - - @pytest.fixture(scope="session") def nlp_configuration_dict() -> Dict: nlp_configuration = { - "lang_code": "en", - "model_name": { - "spacy": "en_core_web_lg", - "transformers": "StanfordAIMI/stanford-deidentifier-base", - }, - } + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_lg", + "transformers": "StanfordAIMI/stanford-deidentifier-base", + }, + } return nlp_configuration @@ -228,7 +210,7 @@ def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_not def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_keys_not_include_spacy_then_fail( - nlp_configuration_dict, + nlp_configuration_dict, ): nlp_configuration = nlp_configuration_dict.copy() del nlp_configuration["model_name"]["spacy"] @@ -238,7 +220,7 @@ def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_key def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_keys_not_include_transformers_then_fail( - nlp_configuration_dict, + nlp_configuration_dict, ): nlp_configuration = nlp_configuration_dict.copy() del nlp_configuration["model_name"]["transformers"] @@ -247,6 +229,61 @@ def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_key NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() +def test_ner_model_configuration_from_json( + ner_model_configuration_dict, tmp_path_factory +): + fn = tmp_path_factory.mktemp("data") / "nlp_configuration.json" + fn.write_text(json.dumps(ner_model_configuration_dict), "UTF-8") + + ner_model_configuration = NerModelConfiguration.from_json(fn.absolute()) + assert ner_model_configuration.nlp_engine_name == "transformers" + assert ( + ner_model_configuration.low_score_entity_names + == ner_model_configuration_dict["low_score_entity_names"] + ) + assert ( + ner_model_configuration.aggregation_strategy + == ner_model_configuration_dict["aggregation_strategy"] + ) + assert ( + ner_model_configuration.alignment_mode + == ner_model_configuration_dict["alignment_mode"] + ) + + +def test_nlp_model_configuration_from_yaml( + ner_model_configuration_dict, tmp_path_factory +): + fn = tmp_path_factory.mktemp("data") / "nlp_configuration.yaml" + fn.write_text(yaml.safe_dump(ner_model_configuration_dict), "UTF-8") + + ner_model_configuration = NerModelConfiguration.from_yaml(fn.absolute()) + assert ner_model_configuration.nlp_engine_name == "transformers" + assert ( + ner_model_configuration.low_score_entity_names + == ner_model_configuration_dict["low_score_entity_names"] + ) + assert ( + ner_model_configuration.aggregation_strategy + == ner_model_configuration_dict["aggregation_strategy"] + ) + assert ( + ner_model_configuration.alignment_mode + == ner_model_configuration_dict["alignment_mode"] + ) + + +def test_nlp_model_configuration_from_yaml_missing_field( + ner_model_configuration_dict, tmp_path_factory +): + fn = tmp_path_factory.mktemp("data") / "nlp_configuration.yaml" + del ner_model_configuration_dict["nlp_engine_name"] + 
fn.write_text(yaml.safe_dump(ner_model_configuration_dict), "UTF-8") + + with pytest.raises(ValueError): + NerModelConfiguration.from_yaml(fn.absolute()) + + def test_nlp_engine_provider_init_through_nlp_engine_configuration(): engine = NlpEngineProvider().create_engine() assert isinstance(engine, SpacyNlpEngine) diff --git a/presidio-analyzer/tests/test_phone_recognizer.py b/presidio-analyzer/tests/test_phone_recognizer.py index f10024f4f..55f40be43 100644 --- a/presidio-analyzer/tests/test_phone_recognizer.py +++ b/presidio-analyzer/tests/test_phone_recognizer.py @@ -9,6 +9,10 @@ def recognizer(): return PhoneRecognizer() +@pytest.fixture(scope="module") +def nlp_engine(nlp_engines): + return nlp_engines["spacy_en"] + @pytest.mark.parametrize( "text, expected_len, entities, expected_positions, score", @@ -28,7 +32,7 @@ def recognizer(): ], ) def test_when_all_phones_then_succeed( - spacy_nlp_engine, + nlp_engine, text, expected_len, entities, @@ -36,7 +40,7 @@ def test_when_all_phones_then_succeed( score, recognizer, ): - nlp_artifacts = spacy_nlp_engine.process_text(text, "en") + nlp_artifacts = nlp_engine.process_text(text, "en") results = recognizer.analyze(text, entities, nlp_artifacts=nlp_artifacts) assert len(results) == expected_len for i, (res, (st_pos, fn_pos)) in enumerate(zip(results, expected_positions)): diff --git a/presidio-analyzer/tests/test_spacy_nlp_engine.py b/presidio-analyzer/tests/test_spacy_nlp_engine.py index d09fdbe87..313405da5 100644 --- a/presidio-analyzer/tests/test_spacy_nlp_engine.py +++ b/presidio-analyzer/tests/test_spacy_nlp_engine.py @@ -1,22 +1,18 @@ from typing import Iterator -import pytest -from presidio_analyzer.nlp_engine import SpacyNlpEngine +def test_simple_process_text(nlp_engine): - -def test_simple_process_text(spacy_nlp_engine): - - nlp_artifacts = spacy_nlp_engine.process_text("simple text", language="en") + nlp_artifacts = nlp_engine.process_text("simple text", language="en") assert len(nlp_artifacts.tokens) == 2 assert not nlp_artifacts.entities assert nlp_artifacts.lemmas[0] == "simple" assert nlp_artifacts.lemmas[1] == "text" -def test_process_batch_strings(spacy_nlp_engine): +def test_process_batch_strings(nlp_engine): - nlp_artifacts_batch = spacy_nlp_engine.process_batch( + nlp_artifacts_batch = nlp_engine.process_batch( ["simple text", "simple text"], language="en" ) assert isinstance(nlp_artifacts_batch, Iterator) @@ -25,23 +21,3 @@ def test_process_batch_strings(spacy_nlp_engine): for text, nlp_artifacts in nlp_artifacts_batch: assert text == "simple text" assert len(nlp_artifacts.tokens) == 2 - - -def test_nlp_not_loaded_value_error(): - unloaded_spacy_nlp = SpacyNlpEngine() - with pytest.raises(ValueError): - unloaded_spacy_nlp.process_text("This should fail as the NLP model isn't loaded", language="en") - - -def test_validate_model_params_missing_fields(): - model = { - "lang_code": "en", - "model_name": "en_core_web_;g" - } - - for key in model.keys(): - new_model = model.copy() - del new_model[key] - - with pytest.raises(ValueError): - SpacyNlpEngine._validate_model_params(new_model) diff --git a/presidio-analyzer/tests/test_spacy_recognizer.py b/presidio-analyzer/tests/test_spacy_recognizer.py index f58b52fab..68ce78fab 100644 --- a/presidio-analyzer/tests/test_spacy_recognizer.py +++ b/presidio-analyzer/tests/test_spacy_recognizer.py @@ -1,9 +1,8 @@ import pytest -from presidio_analyzer.predefined_recognizers import SpacyRecognizer +from presidio_analyzer.nlp_engine import SpacyNlpEngine from tests import 
assert_result_within_score_range - @pytest.fixture(scope="module") def entities(): return ["PERSON", "DATE_TIME"] @@ -20,7 +19,6 @@ def prepare_and_analyze(nlp, recognizer, text, ents): return results -@pytest.mark.itegration @pytest.mark.parametrize( "text, expected_len, expected_positions, entity_num", [ @@ -49,13 +47,13 @@ def test_when_using_spacy_then_all_spacy_result_found( expected_len, expected_positions, entity_num, - spacy_nlp_engine, + nlp_engine, nlp_recognizer, entities, ner_strength, max_score, ): - results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities) + results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) assert len(results) == expected_len entity_to_check = entities[entity_num] for res, (st_pos, fn_pos) in zip(results, expected_positions): @@ -65,10 +63,10 @@ def test_when_using_spacy_then_all_spacy_result_found( def test_when_person_in_text_then_person_full_name_complex_found( - spacy_nlp_engine, nlp_recognizer, entities + nlp_engine, nlp_recognizer, entities ): text = "William Bill Alexander" - results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities) + results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) assert len(results) > 0 @@ -81,7 +79,7 @@ def test_when_person_in_text_then_person_full_name_complex_found( assert len(text) - len(covered_text) < 5 -def test_analyze_no_nlp_artifacts(): - spacy_recognizer = SpacyRecognizer() - res = spacy_recognizer.analyze(text="text", nlp_artifacts=None, entities=["PERSON"]) - assert len(res) == 0 +def test_nlp_not_loaded_value_error(): + spacy_nlp = SpacyNlpEngine() + with pytest.raises(ValueError): + spacy_nlp.process_text("This should fail as the NLP model isn't loaded", language="en") diff --git a/presidio-analyzer/tests/test_stanza_recognizer.py b/presidio-analyzer/tests/test_stanza_recognizer.py index 94f53f4ad..fcab9755e 100644 --- a/presidio-analyzer/tests/test_stanza_recognizer.py +++ b/presidio-analyzer/tests/test_stanza_recognizer.py @@ -10,11 +10,9 @@ def entities(): @pytest.mark.skip_engine("stanza_en") @pytest.fixture(scope="module") -def spacy_nlp_engine(nlp_engines): - nlp_engine = nlp_engines.get("stanza_en", None) - if nlp_engine: - nlp_engine.load() - return nlp_engine +def nlp_engine(nlp_engines): + return nlp_engines.get("stanza_en", None) + @pytest.mark.skip_engine("stanza_en") @pytest.fixture(scope="module") @@ -58,13 +56,13 @@ def test_when_using_stanza_then_all_stanza_result_correct( expected_len, expected_positions, entity_num, - spacy_nlp_engine, + nlp_engine, nlp_recognizer, entities, ner_strength, max_score, ): - results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities) + results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) assert len(results) == expected_len entity_to_check = entities[entity_num] for res, (st_pos, fn_pos) in zip(results, expected_positions): @@ -75,10 +73,10 @@ def test_when_using_stanza_then_all_stanza_result_correct( @pytest.mark.skip_engine("stanza_en") def test_when_person_in_text_then_person_full_name_complex_found( - spacy_nlp_engine, nlp_recognizer, entities + nlp_engine, nlp_recognizer, entities ): text = "Richard (Rick) C. 
Henderson" - results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities) + results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) assert len(results) > 0 diff --git a/presidio-analyzer/tests/test_transformers_nlp_engine.py b/presidio-analyzer/tests/test_transformers_nlp_engine.py deleted file mode 100644 index c11db92b0..000000000 --- a/presidio-analyzer/tests/test_transformers_nlp_engine.py +++ /dev/null @@ -1,48 +0,0 @@ -import pytest - -from presidio_analyzer.nlp_engine import TransformersNlpEngine - - -def test_default_models(): - engine = TransformersNlpEngine() - assert len(engine.models) > 0 - assert engine.models[0]["lang_code"] == "en" - assert isinstance(engine.models[0]["model_name"], dict) - - -def test_validate_model_params_happy_path(): - model = { - "lang_code": "en", - "model_name": { - "spacy": "en_core_web_sm", - "transformers": "obi/deid_roberta_i2b2", - }, - } - - TransformersNlpEngine._validate_model_params(model) - -@pytest.mark.parametrize( - "key", - [ - ("lang_code"), - ("model_name"), - ("model_name.spacy"), - ("model_name.transformers") - ], -) -def test_validate_model_params_missing_fields(key): - model = { - "lang_code": "en", - "model_name": { - "spacy": "en_core_web_sm", - "transformers": "obi/deid_roberta_i2b2", - }, - } - keys = key.split(".") - if len(keys) == 1: - del model[keys[0]] - else: - del model[keys[0]][keys[1]] - - with pytest.raises(ValueError): - TransformersNlpEngine._validate_model_params(model) diff --git a/presidio-analyzer/tests/test_transformers_recognizer.py b/presidio-analyzer/tests/test_transformers_recognizer.py index 9ebc1fa57..03303cd9b 100644 --- a/presidio-analyzer/tests/test_transformers_recognizer.py +++ b/presidio-analyzer/tests/test_transformers_recognizer.py @@ -10,17 +10,14 @@ def entities(): @pytest.mark.skip_engine("transformers_en") @pytest.fixture(scope="module") -def nlp_recognizer(nlp_recognizers): - return nlp_recognizers.get("transformers", None) +def nlp_engine(nlp_engines): + return nlp_engines.get("transformers_en", None) @pytest.mark.skip_engine("transformers_en") @pytest.fixture(scope="module") -def nlp_engine(nlp_engines): - nlp_engine = nlp_engines.get("transformers_en", None) - if nlp_engine: - nlp_engine.load() - return nlp_engine +def nlp_recognizer(nlp_recognizers): + return nlp_recognizers.get("transformers", None) def prepare_and_analyze(nlp, recognizer, text, entities): @@ -30,7 +27,6 @@ def prepare_and_analyze(nlp, recognizer, text, entities): return results -@pytest.mark.itegration @pytest.mark.skip_engine("transformers_en") @pytest.mark.parametrize( "text, expected_len, expected_positions, entity_num", @@ -80,10 +76,9 @@ def test_when_using_transformers_then_all_transformers_result_correct( ) -@pytest.mark.itegration @pytest.mark.skip_engine("transformers_en") def test_when_person_in_text_then_person_full_name_complex_found( - nlp_engine, nlp_recognizer, entities + nlp_engine, nlp_recognizer, entities ): text = "Richard (Rick) C. 
Henderson" results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) From 35b99e0a0105275f75de9d202948b57697b1a0b1 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 18 Sep 2023 13:52:02 +0300 Subject: [PATCH 39/67] revert code to separate PRs --- .../batch_analyzer_engine.py | 18 +-- .../nlp_engine/ner_model_configuration.py | 107 +++++++++++++----- .../nlp_engine/stanza_nlp_engine.py | 1 - .../nlp_engine/transformers_nlp_engine.py | 9 ++ 4 files changed, 98 insertions(+), 37 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py b/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py index daf323f12..4a428595d 100644 --- a/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py +++ b/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py @@ -8,18 +8,18 @@ class BatchAnalyzerEngine: + """ + Batch analysis of documents (tables, lists, dicts). - def __init__(self, analyzer_engine: Optional[AnalyzerEngine] = None): - """ - Batch analysis of documents (tables, lists, dicts). + Wrapper class to run Presidio Analyzer Engine on multiple values, + either lists/iterators of strings, or dictionaries. - Wrapper class to run Presidio Analyzer Engine on multiple values, - either lists/iterators of strings, or dictionaries. + :param: analyzer_engine: AnalyzerEngine instance to use + for handling the values in those collections. + """ + + def __init__(self, analyzer_engine: Optional[AnalyzerEngine] = None): - :param: analyzer_engine: AnalyzerEngine instance to use - for handling the values in those collections. - """ - self.analyzer_engine = analyzer_engine if not analyzer_engine: self.analyzer_engine = AnalyzerEngine() diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index 67b181270..64622c1a0 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -1,6 +1,10 @@ +import json import logging from dataclasses import dataclass -from typing import Dict, Optional, Collection, Type +from pathlib import Path +from typing import Dict, Optional, Union, Collection + +import yaml logger = logging.getLogger("presidio-analyzer") @@ -34,6 +38,7 @@ class NerModelConfiguration: """NER model configuration. + :param nlp_engine_name: Name of the NLP engine to use. :param labels_to_ignore: List of labels to not return predictions for. :param aggregation_strategy: See https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.TokenClassificationPipeline.aggregation_strategy @@ -49,6 +54,7 @@ class NerModelConfiguration: Multiplier to the score given for low_score_entity_names. 
""" # noqa E501 + nlp_engine_name: str labels_to_ignore: Optional[Collection[str]] = None aggregation_strategy: Optional[str] = "simple" stride: Optional[int] = 14 @@ -78,44 +84,91 @@ def __post_init__(self): self.labels_to_ignore = LABELS_TO_IGNORE @classmethod - def _validate_input(cls, ner_model_configuration_dict: Dict) -> None: - key_to_type = { - "labels_to_ignore": list, - "aggregation_strategy": str, - "alignment_mode": str, - "model_to_presidio_entity_mapping": dict, - "low_confidence_score_multiplier": float, - "low_score_entity_names": list, - "stride": int, - } - - for key, field_type in key_to_type.items(): - cls.__validate_type( - config_dict=ner_model_configuration_dict, key=key, field_type=field_type - ) + def _validate_input(cls, nlp_engine_configuration: Dict) -> None: + if "nlp_engine_name" not in nlp_engine_configuration: + raise ValueError("nlp_engine_name is a required parameter") + if "labels_to_ignore" in nlp_engine_configuration: + if not isinstance(nlp_engine_configuration["labels_to_ignore"], list): + raise ValueError("labels_to_ignore must be a list") + if "aggregation_strategy" in nlp_engine_configuration: + if not isinstance(nlp_engine_configuration["aggregation_strategy"], str): + raise ValueError("aggregation_strategy must be a string") + if "alignment_mode" in nlp_engine_configuration: + if not isinstance(nlp_engine_configuration["alignment_mode"], str): + raise ValueError("alignment_mode must be a string") + if "stride" in nlp_engine_configuration: + if not isinstance(nlp_engine_configuration["stride"], int): + raise ValueError("stride must be an integer") + if "model_to_presidio_entity_mapping" in nlp_engine_configuration: + if not isinstance( + nlp_engine_configuration["model_to_presidio_entity_mapping"], dict + ): + raise ValueError("model_to_presidio_entity_mapping must be a dict") + if "low_score_entity_names" in nlp_engine_configuration: + if not isinstance(nlp_engine_configuration["low_score_entity_names"], list): + raise ValueError("low_score_entity_names must be a list") + if "low_confidence_score_multiplier" in nlp_engine_configuration: + if not isinstance( + nlp_engine_configuration["low_confidence_score_multiplier"], float + ): + raise ValueError("low_confidence_score_multiplier must be a float") - @staticmethod - def __validate_type(config_dict: Dict, key: str, field_type: Type) -> None: - if key in config_dict: - if not isinstance(config_dict[key], field_type): - raise ValueError(f"{key} must be of type {field_type}") - else: - raise ValueError(f"NER configuration is missing '{key}'") + @classmethod + def from_yaml(cls, yaml_file: Union[Path, str]) -> "NerModelConfiguration": + """Load NLP engine configuration from yaml file. + + :param yaml_file: Path to the yaml file. + """ + + if not Path(yaml_file).exists(): + raise FileNotFoundError(f"configuration file {yaml_file} not found.") + + with open(yaml_file, "r") as f: + nlp_engine_configuration = yaml.safe_load(f) + + cls._validate_input(nlp_engine_configuration) + + return cls.from_dict(nlp_engine_configuration) @classmethod - def from_dict(cls, nlp_engine_configuration: Dict) -> "NerModelConfiguration": - """Load NLP engine configuration from dict. + def from_json(cls, json_file: Union[Path, str]) -> "NerModelConfiguration": + """Load NLP engine configuration from json file. - :param nlp_engine_configuration: Dict with the configuration to load. + :param json_file: Path to the json file. 
""" + + if not Path(json_file).exists(): + raise FileNotFoundError(f"configuration file {json_file} not found.") + + with open(json_file, "r") as f: + nlp_engine_configuration = json.load(f) + cls._validate_input(nlp_engine_configuration) - return cls(**nlp_engine_configuration) + return cls.from_dict(nlp_engine_configuration) + + @classmethod + def from_dict(cls, config_dict: Dict) -> "NerModelConfiguration": + """Load NLP engine configuration from dict. + + :param config_dict: Dict with the configuration to load. + """ + return cls(**config_dict) def to_dict(self) -> Dict: """Return the configuration as a dict.""" return self.__dict__ + @staticmethod + def get_full_conf_path( + default_conf_file: Union[Path, str] = "default.yaml" + ) -> Path: + """Return a Path to the default conf file. + + :param default_conf_file: Name of the default conf file. + """ + return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file) + def __str__(self) -> str: # noqa D105 return str(self.to_dict()) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py index 89977275b..1033f75d6 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py @@ -35,5 +35,4 @@ def load(self) -> None: self.nlp[model["lang_code"]] = spacy_stanza.load_pipeline( model["model_name"], processors="tokenize,pos,lemma,ner", - download_method=None, ) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py index 46b491d83..8f06e6a41 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py @@ -13,6 +13,7 @@ from presidio_analyzer.nlp_engine import ( SpacyNlpEngine, + NlpArtifacts, NerModelConfiguration, ) @@ -111,6 +112,14 @@ def _validate_model_params(model: Dict) -> None: "transformers model name is missing from model configuration" ) + def process_text(self, text: str, language: str) -> NlpArtifacts: + """Execute the SpaCy NLP pipeline on the given text and language.""" + if not self.nlp: + raise ValueError("NLP engine is not loaded. Consider calling .load()") + + doc = self.nlp[language](text) + return self._doc_to_nlp_artifact(doc, language) + def _get_entities(self, doc: Doc) -> List[Span]: """ Extract entities out of a spaCy pipeline, depending on the type of pipeline. 
From 7dbd2be152c7478915503a1de45f170a009fda8d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 18 Sep 2023 03:55:27 -0700 Subject: [PATCH 40/67] Updates to NlpEngine - tests (#1176) --- mkdocs.yml | 58 +++++---- .../nlp_engine/ner_model_configuration.py | 107 ++++------------ .../nlp_engine/stanza_nlp_engine.py | 1 + .../nlp_engine/transformers_nlp_engine.py | 9 -- .../tests/conf/test_transformers.yaml | 36 ++++++ presidio-analyzer/tests/conftest.py | 117 ++++++------------ .../tests/mocks/nlp_engine_mock.py | 2 +- .../tests/test_analyzer_engine.py | 51 +++++--- .../tests/test_batch_analyzer_engine.py | 2 +- .../tests/test_context_support.py | 18 +-- .../tests/test_ner_model_configuration.py | 75 +++++++++++ .../tests/test_nlp_engine_provider.py | 99 +++++---------- .../tests/test_phone_recognizer.py | 8 +- .../tests/test_spacy_nlp_engine.py | 32 ++++- .../tests/test_spacy_recognizer.py | 20 +-- .../tests/test_stanza_recognizer.py | 16 +-- .../tests/test_transformers_nlp_engine.py | 48 +++++++ .../tests/test_transformers_recognizer.py | 15 ++- 18 files changed, 391 insertions(+), 323 deletions(-) create mode 100644 presidio-analyzer/tests/conf/test_transformers.yaml create mode 100644 presidio-analyzer/tests/test_ner_model_configuration.py create mode 100644 presidio-analyzer/tests/test_transformers_nlp_engine.py diff --git a/mkdocs.yml b/mkdocs.yml index 8c077ee1e..f5774ccfb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -26,32 +26,38 @@ nav: - Custom anonymization: tutorial/11_custom_anonymization.md - Encryption/Decryption: tutorial/12_encryption.md - Allow-lists: tutorial/13_allow_list.md - - Docs: - - Handling text: - - Home: text_anonymization.md - - Presidio Analyzer: - - Home: analyzer/index.md - - Developing PII recognizers: - - Tutorial : analyzer/adding_recognizers.md - - Best practices in developing recognizers : analyzer/developing_recognizers.md - - Multi-language support: analyzer/languages.md - - Customizing the NLP model: analyzer/customizing_nlp_models.md - - Tracing the decision process: analyzer/decision_process.md - - Presidio Anonymizer: - - Home: anonymizer/index.md - - Developing PII operators: anonymizer/adding_operators.md - - Handling images: image-redactor/index.md - - Supported entities: supported_entities.md - - Development and design: - - Design: design.md - - Setting up a development environment: development.md - - Build and release process: build_release.md - - Changes from V1 to V2: presidio_V2.md - - Python API reference: - - Presidio Analyzer Python API: api/analyzer_python.md - - Presidio Anonymizer Python API: api/anonymizer_python.md - - Presidio Image Redactor Python API: api/image_redactor_python.md - - REST API reference: https://microsoft.github.io/presidio/api-docs/api-docs.html" target="_blank + + - Handling text: + - Home: text_anonymization.md + - Presidio Analyzer: + - Home: analyzer/index.md + - Developing PII recognizers: + - Tutorial : analyzer/adding_recognizers.md + - Best practices in developing recognizers : analyzer/developing_recognizers.md + - Multi-language support: analyzer/languages.md + - Customizing the NLP model: + - Home: analyzer/customizing_nlp_models.md + - Spacy/Stanza: analyzer/nlp_engines/spacy_stanza.md + - Transformers: analyzer/nlp_engines/transformers.md + - Tracing the decision process: analyzer/decision_process.md + - Presidio Anonymizer: + - Home: anonymizer/index.md + - Developing PII operators: anonymizer/adding_operators.md + - Handling images: + - Home: image-redactor/index.md + - Evaluating DICOM 
redaction: image-redactor/evaluating_dicom_redaction.md + - Supported entities: supported_entities.md + - Development and design: + - Design: design.md + - Setting up a development environment: development.md + - Build and release process: build_release.md + - Changes from V1 to V2: presidio_V2.md + - Python API reference: + - Home: api.md + - Presidio Analyzer Python API: api/analyzer_python.md + - Presidio Anonymizer Python API: api/anonymizer_python.md + - Presidio Image Redactor Python API: api/image_redactor_python.md + - REST API reference: https://microsoft.github.io/presidio/api-docs/api-docs.html" target="_blank - Samples: samples/index.md - Community: community.md - FAQ: faq.md diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index 64622c1a0..67b181270 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -1,10 +1,6 @@ -import json import logging from dataclasses import dataclass -from pathlib import Path -from typing import Dict, Optional, Union, Collection - -import yaml +from typing import Dict, Optional, Collection, Type logger = logging.getLogger("presidio-analyzer") @@ -38,7 +34,6 @@ class NerModelConfiguration: """NER model configuration. - :param nlp_engine_name: Name of the NLP engine to use. :param labels_to_ignore: List of labels to not return predictions for. :param aggregation_strategy: See https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.TokenClassificationPipeline.aggregation_strategy @@ -54,7 +49,6 @@ class NerModelConfiguration: Multiplier to the score given for low_score_entity_names. 
""" # noqa E501 - nlp_engine_name: str labels_to_ignore: Optional[Collection[str]] = None aggregation_strategy: Optional[str] = "simple" stride: Optional[int] = 14 @@ -84,91 +78,44 @@ def __post_init__(self): self.labels_to_ignore = LABELS_TO_IGNORE @classmethod - def _validate_input(cls, nlp_engine_configuration: Dict) -> None: - if "nlp_engine_name" not in nlp_engine_configuration: - raise ValueError("nlp_engine_name is a required parameter") - if "labels_to_ignore" in nlp_engine_configuration: - if not isinstance(nlp_engine_configuration["labels_to_ignore"], list): - raise ValueError("labels_to_ignore must be a list") - if "aggregation_strategy" in nlp_engine_configuration: - if not isinstance(nlp_engine_configuration["aggregation_strategy"], str): - raise ValueError("aggregation_strategy must be a string") - if "alignment_mode" in nlp_engine_configuration: - if not isinstance(nlp_engine_configuration["alignment_mode"], str): - raise ValueError("alignment_mode must be a string") - if "stride" in nlp_engine_configuration: - if not isinstance(nlp_engine_configuration["stride"], int): - raise ValueError("stride must be an integer") - if "model_to_presidio_entity_mapping" in nlp_engine_configuration: - if not isinstance( - nlp_engine_configuration["model_to_presidio_entity_mapping"], dict - ): - raise ValueError("model_to_presidio_entity_mapping must be a dict") - if "low_score_entity_names" in nlp_engine_configuration: - if not isinstance(nlp_engine_configuration["low_score_entity_names"], list): - raise ValueError("low_score_entity_names must be a list") - if "low_confidence_score_multiplier" in nlp_engine_configuration: - if not isinstance( - nlp_engine_configuration["low_confidence_score_multiplier"], float - ): - raise ValueError("low_confidence_score_multiplier must be a float") - - @classmethod - def from_yaml(cls, yaml_file: Union[Path, str]) -> "NerModelConfiguration": - """Load NLP engine configuration from yaml file. - - :param yaml_file: Path to the yaml file. - """ - - if not Path(yaml_file).exists(): - raise FileNotFoundError(f"configuration file {yaml_file} not found.") - - with open(yaml_file, "r") as f: - nlp_engine_configuration = yaml.safe_load(f) - - cls._validate_input(nlp_engine_configuration) + def _validate_input(cls, ner_model_configuration_dict: Dict) -> None: + key_to_type = { + "labels_to_ignore": list, + "aggregation_strategy": str, + "alignment_mode": str, + "model_to_presidio_entity_mapping": dict, + "low_confidence_score_multiplier": float, + "low_score_entity_names": list, + "stride": int, + } + + for key, field_type in key_to_type.items(): + cls.__validate_type( + config_dict=ner_model_configuration_dict, key=key, field_type=field_type + ) - return cls.from_dict(nlp_engine_configuration) + @staticmethod + def __validate_type(config_dict: Dict, key: str, field_type: Type) -> None: + if key in config_dict: + if not isinstance(config_dict[key], field_type): + raise ValueError(f"{key} must be of type {field_type}") + else: + raise ValueError(f"NER configuration is missing '{key}'") @classmethod - def from_json(cls, json_file: Union[Path, str]) -> "NerModelConfiguration": - """Load NLP engine configuration from json file. + def from_dict(cls, nlp_engine_configuration: Dict) -> "NerModelConfiguration": + """Load NLP engine configuration from dict. - :param json_file: Path to the json file. + :param nlp_engine_configuration: Dict with the configuration to load. 
""" - - if not Path(json_file).exists(): - raise FileNotFoundError(f"configuration file {json_file} not found.") - - with open(json_file, "r") as f: - nlp_engine_configuration = json.load(f) - cls._validate_input(nlp_engine_configuration) - return cls.from_dict(nlp_engine_configuration) - - @classmethod - def from_dict(cls, config_dict: Dict) -> "NerModelConfiguration": - """Load NLP engine configuration from dict. - - :param config_dict: Dict with the configuration to load. - """ - return cls(**config_dict) + return cls(**nlp_engine_configuration) def to_dict(self) -> Dict: """Return the configuration as a dict.""" return self.__dict__ - @staticmethod - def get_full_conf_path( - default_conf_file: Union[Path, str] = "default.yaml" - ) -> Path: - """Return a Path to the default conf file. - - :param default_conf_file: Name of the default conf file. - """ - return Path(Path(__file__).parent.parent.parent, "conf", default_conf_file) - def __str__(self) -> str: # noqa D105 return str(self.to_dict()) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py index 1033f75d6..89977275b 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py @@ -35,4 +35,5 @@ def load(self) -> None: self.nlp[model["lang_code"]] = spacy_stanza.load_pipeline( model["model_name"], processors="tokenize,pos,lemma,ner", + download_method=None, ) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py index 8f06e6a41..46b491d83 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py @@ -13,7 +13,6 @@ from presidio_analyzer.nlp_engine import ( SpacyNlpEngine, - NlpArtifacts, NerModelConfiguration, ) @@ -112,14 +111,6 @@ def _validate_model_params(model: Dict) -> None: "transformers model name is missing from model configuration" ) - def process_text(self, text: str, language: str) -> NlpArtifacts: - """Execute the SpaCy NLP pipeline on the given text and language.""" - if not self.nlp: - raise ValueError("NLP engine is not loaded. Consider calling .load()") - - doc = self.nlp[language](text) - return self._doc_to_nlp_artifact(doc, language) - def _get_entities(self, doc: Doc) -> List[Span]: """ Extract entities out of a spaCy pipeline, depending on the type of pipeline. diff --git a/presidio-analyzer/tests/conf/test_transformers.yaml b/presidio-analyzer/tests/conf/test_transformers.yaml new file mode 100644 index 000000000..5fca7be77 --- /dev/null +++ b/presidio-analyzer/tests/conf/test_transformers.yaml @@ -0,0 +1,36 @@ +nlp_engine_name: transformers +models: + - + lang_code: en + model_name: + spacy: en_core_web_lg + transformers: StanfordAIMI/stanford-deidentifier-base +ner_model_configuration: + labels_to_ignore: + - O + aggregation_strategy: simple # "simple", "first", "average", "max" + stride: 16 # If stride >= 0, process long texts in + # overlapping windows of the model max + # length. The value is the length of the + # window overlap in transformer tokenizer + # tokens, NOT the length of the stride. 
+ alignment_mode: strict # "strict", "contract", "expand" + model_to_presidio_entity_mapping: + PER: PERSON + LOC: LOCATION + ORG: ORGANIZATION + AGE: AGE + ID: ID + EMAIL: EMAIL + PATIENT: PERSON + STAFF: PERSON + HOSP: ORGANIZATION + PATORG: ORGANIZATION + DATE: DATE_TIME + PHONE: PHONE_NUMBER + HCW: PERSON + HOSPITAL: ORGANIZATION + + low_confidence_score_multiplier: 0.4 + low_score_entity_names: + - ID \ No newline at end of file diff --git a/presidio-analyzer/tests/conftest.py b/presidio-analyzer/tests/conftest.py index 5d5602787..948f5f20a 100644 --- a/presidio-analyzer/tests/conftest.py +++ b/presidio-analyzer/tests/conftest.py @@ -3,7 +3,6 @@ from typing import Dict import pytest -import spacy from presidio_analyzer import ( EntityRecognizer, @@ -14,31 +13,16 @@ from presidio_analyzer import RecognizerRegistry from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpEngine from presidio_analyzer.predefined_recognizers import NLP_RECOGNIZERS -from tests.mocks import RecognizerRegistryMock +from tests.mocks import RecognizerRegistryMock, NlpEngineMock -def pytest_addoption(parser): - parser.addoption( - "--runfast", action="store_true", default=False, help="run fast tests" - ) - def pytest_configure(config): - config.addinivalue_line("markers", "slow: mark test as slow to run") config.addinivalue_line( "markers", "skip_engine(nlp_engine): skip test for given nlp engine" ) -def pytest_collection_modifyitems(config, items): - if config.getoption("--runfast"): - # --runfast given in cli: skip slow tests - skip_slow = pytest.mark.skip(reason="remove --runfast option to run") - for item in items: - if "slow" in item.keywords: - item.add_marker(skip_slow) - - @pytest.fixture(scope="session") def nlp_engine_provider() -> NlpEngineProvider: return NlpEngineProvider() @@ -50,28 +34,28 @@ def nlp_engines(request, nlp_engine_provider) -> Dict[str, NlpEngine]: nlp_engines = nlp_engine_provider.nlp_engines for name, engine_cls in nlp_engines.items(): - if name == "spacy" and not request.config.getoption("--runfast"): + if name == "spacy": available_engines[f"{name}_en"] = engine_cls( models=[{"lang_code": "en", "model_name": "en_core_web_lg"}] ) - elif name == "stanza" and not request.config.getoption("--runfast"): + elif name == "stanza": available_engines[f"{name}_en"] = engine_cls( models=[{"lang_code": "en", "model_name": "en"}] ) - elif name == "transformers" and not request.config.getoption("--runfast"): + elif name == "transformers": available_engines[f"{name}_en"] = engine_cls( - models=[{ - "lang_code": "en", - "model_name": { - "spacy": "en_core_web_sm", - "transformers": "StanfordAIMI/stanford-deidentifier-base", - }, - }] + models=[ + { + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_sm", + "transformers": "StanfordAIMI/stanford-deidentifier-base", + }, + } + ] ) else: raise ValueError("Unsupported engine for tests") - # Load engine - available_engines[f"{name}_en"].load() return available_engines @@ -85,6 +69,15 @@ def skip_by_engine(request, nlp_engines): pytest.skip(f"skipped on this engine: {marker_arg}") +@pytest.mark.skip_engine("spacy_en") +@pytest.fixture(scope="session") +def spacy_nlp_engine(nlp_engines): + nlp_engine = nlp_engines.get("spacy_en", None) + if nlp_engine: + nlp_engine.load() + return nlp_engine + + @pytest.fixture(scope="session") def nlp_recognizers() -> Dict[str, EntityRecognizer]: return {name: rec_cls() for name, rec_cls in NLP_RECOGNIZERS.items()} @@ -110,41 +103,19 @@ def loaded_registry() -> RecognizerRegistry: return 
RecognizerRegistry() -@pytest.fixture(scope="module") -def nlp_engine(nlp_engines) -> NlpEngine: - return nlp_engines["spacy_en"] - - @pytest.fixture(scope="module") def mock_registry() -> RecognizerRegistryMock: return RecognizerRegistryMock() @pytest.fixture(scope="module") -def analyzer_engine_simple(mock_registry, nlp_engine) -> AnalyzerEngine: - return AnalyzerEngine(registry=mock_registry, nlp_engine=nlp_engine) - - -@pytest.fixture(scope="session") -def mock_he_model(): - """ - Create an empty Hebrew spaCy pipeline and save it to disk. +def mock_nlp_engine() -> NlpEngineMock: + return NlpEngineMock() - So that it could be loaded using spacy.load() - """ - he = spacy.blank("he") - he.to_disk("he_test") - -@pytest.fixture(scope="session") -def mock_bn_model(): - """ - Create an empty Bengali spaCy pipeline and save it to disk. - - So that it could be loaded using spacy.load() - """ - bn = spacy.blank("bn") - bn.to_disk("bn_test") +@pytest.fixture(scope="module") +def analyzer_engine_simple(mock_registry, mock_nlp_engine) -> AnalyzerEngine: + return AnalyzerEngine(registry=mock_registry, nlp_engine=mock_nlp_engine) @pytest.fixture(scope="session") @@ -157,28 +128,18 @@ def zip_code_recognizer(): return zip_recognizer -@pytest.fixture(scope="session") -def zip_code_deny_list_recognizer(): - regex = r"(\b\d{5}(?:\-\d{4})?\b)" - zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) - zip_recognizer = PatternRecognizer( - supported_entity="ZIP", deny_list=["999"], patterns=[zipcode_pattern] - ) - return zip_recognizer - def pytest_sessionfinish(): """Remove files created during mock spaCy models creation.""" - he_test_model_path = Path(Path(__file__).parent.parent, "he_test") - if he_test_model_path.exists(): - try: - shutil.rmtree(he_test_model_path) - except OSError as e: - print("Failed to remove file: %s - %s." % (e.filename, e.strerror)) - - bn_test_model_path = Path(Path(__file__).parent.parent, "bn_test") - if bn_test_model_path.exists(): - try: - shutil.rmtree(bn_test_model_path) - except OSError as e: - print("Failed to remove file: %s - %s." % (e.filename, e.strerror)) + + mock_models = ("he_test", "bn_test") + + for mock_model in mock_models: + test_model_path1 = Path(Path(__file__).parent, mock_model) + test_model_path2 = Path(Path(__file__).parent.parent, mock_model) + for path in (test_model_path1, test_model_path2): + if path.exists(): + try: + shutil.rmtree(path) + except OSError as e: + print("Failed to remove file: %s - %s." 
% (e.filename, e.strerror)) diff --git a/presidio-analyzer/tests/mocks/nlp_engine_mock.py b/presidio-analyzer/tests/mocks/nlp_engine_mock.py index 5e8ab5568..a2a591968 100644 --- a/presidio-analyzer/tests/mocks/nlp_engine_mock.py +++ b/presidio-analyzer/tests/mocks/nlp_engine_mock.py @@ -8,7 +8,7 @@ def __init__(self, stopwords=None, punct_words=None, nlp_artifacts=None): self.stopwords = stopwords if stopwords else [] self.punct_words = punct_words if punct_words else [] if nlp_artifacts is None: - self.nlp_artifacts = NlpArtifacts([], [], [], [], None, "en") + self.nlp_artifacts = NlpArtifacts([], [], [], [], None, "en", []) else: self.nlp_artifacts = nlp_artifacts diff --git a/presidio-analyzer/tests/test_analyzer_engine.py b/presidio-analyzer/tests/test_analyzer_engine.py index 746f2ae60..9494a1c66 100644 --- a/presidio-analyzer/tests/test_analyzer_engine.py +++ b/presidio-analyzer/tests/test_analyzer_engine.py @@ -39,6 +39,16 @@ def loaded_analyzer_engine(loaded_registry, app_tracer): return analyzer_engine +@pytest.fixture(scope="module") +def zip_code_deny_list_recognizer(): + regex = r"(\b\d{5}(?:\-\d{4})?\b)" + zipcode_pattern = Pattern(name="zip code (weak)", regex=regex, score=0.01) + zip_recognizer = PatternRecognizer( + supported_entity="ZIP", deny_list=["999"], patterns=[zipcode_pattern] + ) + return zip_recognizer + + @pytest.fixture(scope="module") def unit_test_guid(): return "00000000-0000-0000-0000-000000000000" @@ -49,6 +59,7 @@ def nlp_engine(nlp_engines): return nlp_engines["spacy_en"] +@pytest.mark.integration def test_simple(): dic = { "text": "John Smith drivers license is AC432223", @@ -79,14 +90,14 @@ def test_when_analyze_with_predefined_recognizers_then_return_results( def test_when_analyze_with_multiple_predefined_recognizers_then_succeed( - loaded_registry, unit_test_guid, nlp_engine, max_score + loaded_registry, unit_test_guid, spacy_nlp_engine, max_score ): text = " Credit card: 4095-2609-9393-4932, my phone is 425 8829090" language = "en" entities = ["CREDIT_CARD", "PHONE_NUMBER"] analyzer_engine_with_spacy = AnalyzerEngine( - registry=loaded_registry, nlp_engine=nlp_engine + registry=loaded_registry, nlp_engine=spacy_nlp_engine ) results = analyzer_engine_with_spacy.analyze( correlation_id=unit_test_guid, @@ -134,8 +145,8 @@ def test_when_analyze_with_unsupported_language_then_fail( ) -def test_when_analyze_two_entities_embedded_then_return_results(nlp_engine): - analyzer = AnalyzerEngine(nlp_engine=nlp_engine) +def test_when_analyze_two_entities_embedded_then_return_results(spacy_nlp_engine): + analyzer = AnalyzerEngine(nlp_engine=spacy_nlp_engine) # Name with driver license in it text = "My name is John 1234567 Doe" @@ -318,10 +329,10 @@ def test_when_entities_is_none_then_return_all_fields(loaded_registry): def test_when_entities_is_none_all_recognizers_loaded_then_return_all_fields( - nlp_engine, + spacy_nlp_engine, ): analyze_engine = AnalyzerEngine( - registry=RecognizerRegistry(), nlp_engine=nlp_engine + registry=RecognizerRegistry(), nlp_engine=spacy_nlp_engine ) threshold = 0 text = "My name is Sharon and I live in Seattle." 
"Domain: microsoft.com " @@ -337,7 +348,7 @@ def test_when_entities_is_none_all_recognizers_loaded_then_return_all_fields( def test_when_analyze_then_apptracer_has_value( - loaded_registry, unit_test_guid, nlp_engine + loaded_registry, unit_test_guid, spacy_nlp_engine ): text = "My name is Bart Simpson, and Credit card: 4095-2609-9393-4932, my phone is 425 8829090" # noqa E501 language = "en" @@ -347,7 +358,7 @@ def test_when_analyze_then_apptracer_has_value( loaded_registry, app_tracer=app_tracer_mock, log_decision_process=True, - nlp_engine=nlp_engine, + nlp_engine=spacy_nlp_engine, ) results = analyzer_engine_with_spacy.analyze( correlation_id=unit_test_guid, @@ -470,7 +481,7 @@ def test_when_get_supported_fields_then_return_all_languages( def test_when_get_supported_fields_specific_language_then_return_single_result( - loaded_registry, unit_test_guid, nlp_engine + loaded_registry, unit_test_guid, spacy_nlp_engine ): pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8) pattern_recognizer = PatternRecognizer( @@ -480,7 +491,7 @@ def test_when_get_supported_fields_specific_language_then_return_single_result( supported_language="ru", ) - analyzer = AnalyzerEngine(registry=loaded_registry, nlp_engine=nlp_engine) + analyzer = AnalyzerEngine(registry=loaded_registry, nlp_engine=spacy_nlp_engine) analyzer.registry.add_recognizer(pattern_recognizer) entities = analyzer.get_supported_entities(language="ru") @@ -507,7 +518,7 @@ def test_when_get_recognizers_then_returns_supported_language(): assert len(response) == 1 -def test_when_add_recognizer_then_also_outputs_others(nlp_engine): +def test_when_add_recognizer_then_also_outputs_others(spacy_nlp_engine): pattern = Pattern("rocket pattern", r"\W*(rocket)\W*", 0.8) pattern_recognizer = PatternRecognizer( "ROCKET", @@ -521,7 +532,7 @@ def test_when_add_recognizer_then_also_outputs_others(nlp_engine): assert len(registry.recognizers) > 1 - analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine) + analyzer = AnalyzerEngine(registry=registry, nlp_engine=spacy_nlp_engine) text = "Michael Jones has a rocket" @@ -652,9 +663,9 @@ def test_entities_filter_for_ad_hoc_removes_recognizer(loaded_analyzer_engine): assert "MR" not in [resp.entity_type for resp in responses2] -def test_ad_hoc_with_context_support_higher_confidence(nlp_engine, zip_code_recognizer): +def test_ad_hoc_with_context_support_higher_confidence(spacy_nlp_engine, zip_code_recognizer): text = "Mr. 
John Smith's zip code is 10023" - analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine) + analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine) responses1 = analyzer_engine.analyze( text=text, language="en", ad_hoc_recognizers=[zip_code_recognizer] @@ -686,7 +697,7 @@ def test_ad_hoc_when_no_other_recognizers_are_requested_returns_only_ad_hoc_resu assert "ZIP" in [resp.entity_type for resp in responses] -def test_when_recognizer_doesnt_return_recognizer_name_no_exception(nlp_engine): +def test_when_recognizer_doesnt_return_recognizer_name_no_exception(spacy_nlp_engine): class MockRecognizer1(EntityRecognizer, ABC): def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): return [RecognizerResult("TEST1", 10, 30, 0.5)] @@ -703,7 +714,7 @@ def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): registry.add_recognizer(mock_recognizer1) registry.add_recognizer(mock_recognizer2) - analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry) + analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine, registry=registry) results = analyzer_engine.analyze("ABC", language="en") assert len(results) == 2 @@ -735,7 +746,7 @@ def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): ) -def test_when_recognizer_overrides_enhance_score_then_it_get_boosted_once(nlp_engine): +def test_when_recognizer_overrides_enhance_score_then_it_get_boosted_once(spacy_nlp_engine): class MockRecognizer(EntityRecognizer, ABC): def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): return [ @@ -768,7 +779,7 @@ def enhance_using_context( registry = RecognizerRegistry() registry.add_recognizer(mock_recognizer) - analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry) + analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine, registry=registry) recognizer_results = analyzer_engine.analyze("ABC", language="en") assert len(recognizer_results) == 2 @@ -809,7 +820,7 @@ def enhance_using_context( ] -def test_when_multiple_nameless_recognizers_context_is_correct(nlp_engine): +def test_when_multiple_nameless_recognizers_context_is_correct(spacy_nlp_engine): rocket_recognizer = PatternRecognizer( supported_entity="ROCKET", context=["cool"], @@ -825,7 +836,7 @@ def test_when_multiple_nameless_recognizers_context_is_correct(nlp_engine): registry.add_recognizer(rocket_recognizer) registry.add_recognizer(rocket_recognizer2) - analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry) + analyzer_engine = AnalyzerEngine(nlp_engine=spacy_nlp_engine, registry=registry) recognizer_results = analyzer_engine.analyze( "I have a cool rocket and a fast missile.", language="en" ) diff --git a/presidio-analyzer/tests/test_batch_analyzer_engine.py b/presidio-analyzer/tests/test_batch_analyzer_engine.py index 51cd34ec0..e454cee35 100644 --- a/presidio-analyzer/tests/test_batch_analyzer_engine.py +++ b/presidio-analyzer/tests/test_batch_analyzer_engine.py @@ -158,7 +158,7 @@ def test_analyze_dict_on_nested_dict(batch_analyzer_engine_simple): key="key_a1", value=nested_dict["key_a"]["key_a1"], recognizer_results=[ - RecognizerResult("PHONE_NUMBER", start=19, end=31, score=0.75) + RecognizerResult("PHONE_NUMBER", start=19, end=31, score=0.4) ], ) ], diff --git a/presidio-analyzer/tests/test_context_support.py b/presidio-analyzer/tests/test_context_support.py index 3054b29ee..a7b4d00be 100644 --- a/presidio-analyzer/tests/test_context_support.py +++ 
b/presidio-analyzer/tests/test_context_support.py @@ -90,7 +90,7 @@ def us_license_recognizer(): def test_when_text_with_aditional_context_lemma_based_context_enhancer_then_analysis_explanation_include_correct_supportive_context_word( # noqa: E501 - nlp_engine, lemma_context, us_license_recognizer + spacy_nlp_engine, lemma_context, us_license_recognizer ): """This test checks that LemmaContextAwareEnhancer uses supportive context word from analyze input as if it was in the text itself. @@ -100,7 +100,7 @@ def test_when_text_with_aditional_context_lemma_based_context_enhancer_then_anal return that word as supportive_context_word instead of other recognizer context word """ text = "John Smith license is AC432223" - nlp_artifacts = nlp_engine.process_text(text, "en") + nlp_artifacts = spacy_nlp_engine.process_text(text, "en") recognizer_results = us_license_recognizer.analyze(text, nlp_artifacts) results_without_additional_context = lemma_context.enhance_using_context( text, recognizer_results, nlp_artifacts, [us_license_recognizer] @@ -125,8 +125,8 @@ def test_when_text_with_aditional_context_lemma_based_context_enhancer_then_anal ) -def test_when_text_with_only_aditional_context_lemma_based_context_enhancer_then_analysis_explanation_include_correct_supportive_context_word( # noqa: E501 - nlp_engine, lemma_context, us_license_recognizer +def test_when_text_with_only_additional_context_lemma_based_context_enhancer_then_analysis_explanation_include_correct_supportive_context_word( # noqa: E501 + spacy_nlp_engine, lemma_context, us_license_recognizer ): """This test checks that LemmaContextAwareEnhancer uses supportive context word from analyze input as if it was in the text itself but no other words apear @@ -138,7 +138,7 @@ def test_when_text_with_only_aditional_context_lemma_based_context_enhancer_then return that word as supportive_context_word and raise the score. """ text = "John Smith D.R is AC432223" - nlp_artifacts = nlp_engine.process_text(text, "en") + nlp_artifacts = spacy_nlp_engine.process_text(text, "en") recognizer_results = us_license_recognizer.analyze(text, nlp_artifacts) results_without_additional_context = lemma_context.enhance_using_context( text, recognizer_results, nlp_artifacts, [us_license_recognizer] @@ -166,11 +166,11 @@ def test_when_text_with_only_aditional_context_lemma_based_context_enhancer_then def test_when_text_with_context_then_improves_score( - dataset, nlp_engine, mock_nlp_artifacts, lemma_context, recognizers_list + dataset, spacy_nlp_engine, mock_nlp_artifacts, lemma_context, recognizers_list ): for item in dataset: text, recognizer, entities = item - nlp_artifacts = nlp_engine.process_text(text, "en") + nlp_artifacts = spacy_nlp_engine.process_text(text, "en") results_without_context = recognizer.analyze(text, entities, mock_nlp_artifacts) results_with_context = recognizer.analyze(text, entities, nlp_artifacts) @@ -189,7 +189,7 @@ def test_when_text_with_context_then_improves_score( assert res_wo.score <= res_w.score -def test_when_context_custom_recognizer_then_succeed(nlp_engine, mock_nlp_artifacts): +def test_when_context_custom_recognizer_then_succeed(spacy_nlp_engine, mock_nlp_artifacts): """This test checks that a custom recognizer is also enhanced by context. 
However this test also verifies a specific case in which the pattern also @@ -206,7 +206,7 @@ def test_when_context_custom_recognizer_then_succeed(nlp_engine, mock_nlp_artifa text = "hi, this is a cool ROCKET" recognizer = rocket_recognizer entities = ["ROCKET"] - nlp_artifacts = nlp_engine.process_text(text, "en") + nlp_artifacts = spacy_nlp_engine.process_text(text, "en") results_without_context = recognizer.analyze(text, entities, mock_nlp_artifacts) results_with_context = recognizer.analyze(text, entities, nlp_artifacts) assert len(results_without_context) == len(results_with_context) diff --git a/presidio-analyzer/tests/test_ner_model_configuration.py b/presidio-analyzer/tests/test_ner_model_configuration.py new file mode 100644 index 000000000..51d83774c --- /dev/null +++ b/presidio-analyzer/tests/test_ner_model_configuration.py @@ -0,0 +1,75 @@ +from pathlib import Path + +import pytest +import yaml + +from presidio_analyzer.nlp_engine import NerModelConfiguration + + +@pytest.fixture(scope="module") +def ner_model_configuration_dict(): + this_path = Path(__file__).parent.absolute() + conf_file = Path(this_path, "conf/test_transformers.yaml") + with open(conf_file) as f: + configuration_dict = yaml.safe_load(f) + + return configuration_dict["ner_model_configuration"] + + +@pytest.mark.parametrize( + "key, original_value, expected_value", + [ + ("labels_to_ignore", [], []), + ("labels_to_ignore", ["A", "B"], ["A", "B"]), + ("aggregation_strategy", "X", "X"), + ("alignment_mode", "Y", "Y"), + ("stride", 51, 51), + ("model_to_presidio_entity_mapping", {"A": "B"}, {"A": "B"}), + ("low_score_entity_names", ["A", "C"], ["A", "C"]), + ("low_confidence_score_multiplier", 12.0, 12.0), + ], +) +def test_from_dict_happy_path( + ner_model_configuration_dict, key, original_value, expected_value +): + ner_model_configuration_dict[key] = original_value + + result = NerModelConfiguration.from_dict(ner_model_configuration_dict) + assert result.to_dict()[key] == expected_value + + +@pytest.mark.parametrize( + "key, value", + [ + ("stride", []), + ("stride", "X"), + ("stride", None), + ("alignment_mode", 5), + ("alignment_mode", None), + ("low_confidence_score_multiplier", "X"), + ], +) +def test_from_dict_wrong_types(ner_model_configuration_dict, key, value): + new_config = ner_model_configuration_dict.copy() + new_config[key] = value + with pytest.raises(ValueError): + NerModelConfiguration.from_dict(new_config) + + +@pytest.mark.parametrize( + "key", + [ + ("labels_to_ignore"), + ("aggregation_strategy"), + ("alignment_mode"), + ("model_to_presidio_entity_mapping"), + ("low_confidence_score_multiplier"), + ("low_score_entity_names"), + ("stride"), + ], +) +def test_from_dict_missing_fields(ner_model_configuration_dict, key): + new_config = ner_model_configuration_dict.copy() + del new_config[key] + with pytest.raises(ValueError): + NerModelConfiguration.from_dict(new_config) diff --git a/presidio-analyzer/tests/test_nlp_engine_provider.py b/presidio-analyzer/tests/test_nlp_engine_provider.py index 4fa92f109..bf0b3d3dd 100644 --- a/presidio-analyzer/tests/test_nlp_engine_provider.py +++ b/presidio-analyzer/tests/test_nlp_engine_provider.py @@ -1,30 +1,48 @@ -import json from pathlib import Path -from typing import Dict, List -from unittest.mock import patch +from typing import Dict import pytest import spacy -import yaml from presidio_analyzer.nlp_engine import ( SpacyNlpEngine, StanzaNlpEngine, NlpEngineProvider, - NerModelConfiguration, ) from presidio_analyzer.nlp_engine.transformers_nlp_engine 
import TransformersNlpEngine +@pytest.fixture(scope="module") +def mock_he_model(): + """ + Create an empty Hebrew spaCy pipeline and save it to disk. + + So that it could be loaded using spacy.load() + """ + he = spacy.blank("he") + he.to_disk("he_test") + + +@pytest.fixture(scope="module") +def mock_bn_model(): + """ + Create an empty Bengali spaCy pipeline and save it to disk. + + So that it could be loaded using spacy.load() + """ + bn = spacy.blank("bn") + bn.to_disk("bn_test") + + @pytest.fixture(scope="session") def nlp_configuration_dict() -> Dict: nlp_configuration = { - "lang_code": "en", - "model_name": { - "spacy": "en_core_web_lg", - "transformers": "StanfordAIMI/stanford-deidentifier-base", - }, - } + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_lg", + "transformers": "StanfordAIMI/stanford-deidentifier-base", + }, + } return nlp_configuration @@ -210,7 +228,7 @@ def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_not def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_keys_not_include_spacy_then_fail( - nlp_configuration_dict, + nlp_configuration_dict, ): nlp_configuration = nlp_configuration_dict.copy() del nlp_configuration["model_name"]["spacy"] @@ -220,7 +238,7 @@ def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_key def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_keys_not_include_transformers_then_fail( - nlp_configuration_dict, + nlp_configuration_dict, ): nlp_configuration = nlp_configuration_dict.copy() del nlp_configuration["model_name"]["transformers"] @@ -229,61 +247,6 @@ def test_when_create_transformers_nlp_engine_from_wrong_conf_with_model_name_key NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine() -def test_ner_model_configuration_from_json( - ner_model_configuration_dict, tmp_path_factory -): - fn = tmp_path_factory.mktemp("data") / "nlp_configuration.json" - fn.write_text(json.dumps(ner_model_configuration_dict), "UTF-8") - - ner_model_configuration = NerModelConfiguration.from_json(fn.absolute()) - assert ner_model_configuration.nlp_engine_name == "transformers" - assert ( - ner_model_configuration.low_score_entity_names - == ner_model_configuration_dict["low_score_entity_names"] - ) - assert ( - ner_model_configuration.aggregation_strategy - == ner_model_configuration_dict["aggregation_strategy"] - ) - assert ( - ner_model_configuration.alignment_mode - == ner_model_configuration_dict["alignment_mode"] - ) - - -def test_nlp_model_configuration_from_yaml( - ner_model_configuration_dict, tmp_path_factory -): - fn = tmp_path_factory.mktemp("data") / "nlp_configuration.yaml" - fn.write_text(yaml.safe_dump(ner_model_configuration_dict), "UTF-8") - - ner_model_configuration = NerModelConfiguration.from_yaml(fn.absolute()) - assert ner_model_configuration.nlp_engine_name == "transformers" - assert ( - ner_model_configuration.low_score_entity_names - == ner_model_configuration_dict["low_score_entity_names"] - ) - assert ( - ner_model_configuration.aggregation_strategy - == ner_model_configuration_dict["aggregation_strategy"] - ) - assert ( - ner_model_configuration.alignment_mode - == ner_model_configuration_dict["alignment_mode"] - ) - - -def test_nlp_model_configuration_from_yaml_missing_field( - ner_model_configuration_dict, tmp_path_factory -): - fn = tmp_path_factory.mktemp("data") / "nlp_configuration.yaml" - del ner_model_configuration_dict["nlp_engine_name"] - 
fn.write_text(yaml.safe_dump(ner_model_configuration_dict), "UTF-8") - - with pytest.raises(ValueError): - NerModelConfiguration.from_yaml(fn.absolute()) - - def test_nlp_engine_provider_init_through_nlp_engine_configuration(): engine = NlpEngineProvider().create_engine() assert isinstance(engine, SpacyNlpEngine) diff --git a/presidio-analyzer/tests/test_phone_recognizer.py b/presidio-analyzer/tests/test_phone_recognizer.py index 55f40be43..f10024f4f 100644 --- a/presidio-analyzer/tests/test_phone_recognizer.py +++ b/presidio-analyzer/tests/test_phone_recognizer.py @@ -9,10 +9,6 @@ def recognizer(): return PhoneRecognizer() -@pytest.fixture(scope="module") -def nlp_engine(nlp_engines): - return nlp_engines["spacy_en"] - @pytest.mark.parametrize( "text, expected_len, entities, expected_positions, score", @@ -32,7 +28,7 @@ def nlp_engine(nlp_engines): ], ) def test_when_all_phones_then_succeed( - nlp_engine, + spacy_nlp_engine, text, expected_len, entities, @@ -40,7 +36,7 @@ def test_when_all_phones_then_succeed( score, recognizer, ): - nlp_artifacts = nlp_engine.process_text(text, "en") + nlp_artifacts = spacy_nlp_engine.process_text(text, "en") results = recognizer.analyze(text, entities, nlp_artifacts=nlp_artifacts) assert len(results) == expected_len for i, (res, (st_pos, fn_pos)) in enumerate(zip(results, expected_positions)): diff --git a/presidio-analyzer/tests/test_spacy_nlp_engine.py b/presidio-analyzer/tests/test_spacy_nlp_engine.py index 313405da5..d09fdbe87 100644 --- a/presidio-analyzer/tests/test_spacy_nlp_engine.py +++ b/presidio-analyzer/tests/test_spacy_nlp_engine.py @@ -1,18 +1,22 @@ from typing import Iterator +import pytest -def test_simple_process_text(nlp_engine): +from presidio_analyzer.nlp_engine import SpacyNlpEngine - nlp_artifacts = nlp_engine.process_text("simple text", language="en") + +def test_simple_process_text(spacy_nlp_engine): + + nlp_artifacts = spacy_nlp_engine.process_text("simple text", language="en") assert len(nlp_artifacts.tokens) == 2 assert not nlp_artifacts.entities assert nlp_artifacts.lemmas[0] == "simple" assert nlp_artifacts.lemmas[1] == "text" -def test_process_batch_strings(nlp_engine): +def test_process_batch_strings(spacy_nlp_engine): - nlp_artifacts_batch = nlp_engine.process_batch( + nlp_artifacts_batch = spacy_nlp_engine.process_batch( ["simple text", "simple text"], language="en" ) assert isinstance(nlp_artifacts_batch, Iterator) @@ -21,3 +25,23 @@ def test_process_batch_strings(nlp_engine): for text, nlp_artifacts in nlp_artifacts_batch: assert text == "simple text" assert len(nlp_artifacts.tokens) == 2 + + +def test_nlp_not_loaded_value_error(): + unloaded_spacy_nlp = SpacyNlpEngine() + with pytest.raises(ValueError): + unloaded_spacy_nlp.process_text("This should fail as the NLP model isn't loaded", language="en") + + +def test_validate_model_params_missing_fields(): + model = { + "lang_code": "en", + "model_name": "en_core_web_;g" + } + + for key in model.keys(): + new_model = model.copy() + del new_model[key] + + with pytest.raises(ValueError): + SpacyNlpEngine._validate_model_params(new_model) diff --git a/presidio-analyzer/tests/test_spacy_recognizer.py b/presidio-analyzer/tests/test_spacy_recognizer.py index 68ce78fab..f58b52fab 100644 --- a/presidio-analyzer/tests/test_spacy_recognizer.py +++ b/presidio-analyzer/tests/test_spacy_recognizer.py @@ -1,8 +1,9 @@ import pytest -from presidio_analyzer.nlp_engine import SpacyNlpEngine +from presidio_analyzer.predefined_recognizers import SpacyRecognizer from tests import 
assert_result_within_score_range + @pytest.fixture(scope="module") def entities(): return ["PERSON", "DATE_TIME"] @@ -19,6 +20,7 @@ def prepare_and_analyze(nlp, recognizer, text, ents): return results +@pytest.mark.itegration @pytest.mark.parametrize( "text, expected_len, expected_positions, entity_num", [ @@ -47,13 +49,13 @@ def test_when_using_spacy_then_all_spacy_result_found( expected_len, expected_positions, entity_num, - nlp_engine, + spacy_nlp_engine, nlp_recognizer, entities, ner_strength, max_score, ): - results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) + results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities) assert len(results) == expected_len entity_to_check = entities[entity_num] for res, (st_pos, fn_pos) in zip(results, expected_positions): @@ -63,10 +65,10 @@ def test_when_using_spacy_then_all_spacy_result_found( def test_when_person_in_text_then_person_full_name_complex_found( - nlp_engine, nlp_recognizer, entities + spacy_nlp_engine, nlp_recognizer, entities ): text = "William Bill Alexander" - results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) + results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities) assert len(results) > 0 @@ -79,7 +81,7 @@ def test_when_person_in_text_then_person_full_name_complex_found( assert len(text) - len(covered_text) < 5 -def test_nlp_not_loaded_value_error(): - spacy_nlp = SpacyNlpEngine() - with pytest.raises(ValueError): - spacy_nlp.process_text("This should fail as the NLP model isn't loaded", language="en") +def test_analyze_no_nlp_artifacts(): + spacy_recognizer = SpacyRecognizer() + res = spacy_recognizer.analyze(text="text", nlp_artifacts=None, entities=["PERSON"]) + assert len(res) == 0 diff --git a/presidio-analyzer/tests/test_stanza_recognizer.py b/presidio-analyzer/tests/test_stanza_recognizer.py index fcab9755e..94f53f4ad 100644 --- a/presidio-analyzer/tests/test_stanza_recognizer.py +++ b/presidio-analyzer/tests/test_stanza_recognizer.py @@ -10,9 +10,11 @@ def entities(): @pytest.mark.skip_engine("stanza_en") @pytest.fixture(scope="module") -def nlp_engine(nlp_engines): - return nlp_engines.get("stanza_en", None) - +def spacy_nlp_engine(nlp_engines): + nlp_engine = nlp_engines.get("stanza_en", None) + if nlp_engine: + nlp_engine.load() + return nlp_engine @pytest.mark.skip_engine("stanza_en") @pytest.fixture(scope="module") @@ -56,13 +58,13 @@ def test_when_using_stanza_then_all_stanza_result_correct( expected_len, expected_positions, entity_num, - nlp_engine, + spacy_nlp_engine, nlp_recognizer, entities, ner_strength, max_score, ): - results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) + results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities) assert len(results) == expected_len entity_to_check = entities[entity_num] for res, (st_pos, fn_pos) in zip(results, expected_positions): @@ -73,10 +75,10 @@ def test_when_using_stanza_then_all_stanza_result_correct( @pytest.mark.skip_engine("stanza_en") def test_when_person_in_text_then_person_full_name_complex_found( - nlp_engine, nlp_recognizer, entities + spacy_nlp_engine, nlp_recognizer, entities ): text = "Richard (Rick) C. 
Henderson" - results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) + results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities) assert len(results) > 0 diff --git a/presidio-analyzer/tests/test_transformers_nlp_engine.py b/presidio-analyzer/tests/test_transformers_nlp_engine.py new file mode 100644 index 000000000..c11db92b0 --- /dev/null +++ b/presidio-analyzer/tests/test_transformers_nlp_engine.py @@ -0,0 +1,48 @@ +import pytest + +from presidio_analyzer.nlp_engine import TransformersNlpEngine + + +def test_default_models(): + engine = TransformersNlpEngine() + assert len(engine.models) > 0 + assert engine.models[0]["lang_code"] == "en" + assert isinstance(engine.models[0]["model_name"], dict) + + +def test_validate_model_params_happy_path(): + model = { + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_sm", + "transformers": "obi/deid_roberta_i2b2", + }, + } + + TransformersNlpEngine._validate_model_params(model) + +@pytest.mark.parametrize( + "key", + [ + ("lang_code"), + ("model_name"), + ("model_name.spacy"), + ("model_name.transformers") + ], +) +def test_validate_model_params_missing_fields(key): + model = { + "lang_code": "en", + "model_name": { + "spacy": "en_core_web_sm", + "transformers": "obi/deid_roberta_i2b2", + }, + } + keys = key.split(".") + if len(keys) == 1: + del model[keys[0]] + else: + del model[keys[0]][keys[1]] + + with pytest.raises(ValueError): + TransformersNlpEngine._validate_model_params(model) diff --git a/presidio-analyzer/tests/test_transformers_recognizer.py b/presidio-analyzer/tests/test_transformers_recognizer.py index 03303cd9b..9ebc1fa57 100644 --- a/presidio-analyzer/tests/test_transformers_recognizer.py +++ b/presidio-analyzer/tests/test_transformers_recognizer.py @@ -10,14 +10,17 @@ def entities(): @pytest.mark.skip_engine("transformers_en") @pytest.fixture(scope="module") -def nlp_engine(nlp_engines): - return nlp_engines.get("transformers_en", None) +def nlp_recognizer(nlp_recognizers): + return nlp_recognizers.get("transformers", None) @pytest.mark.skip_engine("transformers_en") @pytest.fixture(scope="module") -def nlp_recognizer(nlp_recognizers): - return nlp_recognizers.get("transformers", None) +def nlp_engine(nlp_engines): + nlp_engine = nlp_engines.get("transformers_en", None) + if nlp_engine: + nlp_engine.load() + return nlp_engine def prepare_and_analyze(nlp, recognizer, text, entities): @@ -27,6 +30,7 @@ def prepare_and_analyze(nlp, recognizer, text, entities): return results +@pytest.mark.itegration @pytest.mark.skip_engine("transformers_en") @pytest.mark.parametrize( "text, expected_len, expected_positions, entity_num", @@ -76,9 +80,10 @@ def test_when_using_transformers_then_all_transformers_result_correct( ) +@pytest.mark.itegration @pytest.mark.skip_engine("transformers_en") def test_when_person_in_text_then_person_full_name_complex_found( - nlp_engine, nlp_recognizer, entities + nlp_engine, nlp_recognizer, entities ): text = "Richard (Rick) C. 
Henderson" results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) From 5f9cab6d80c26a4958c68d50c47565e487286302 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 18 Sep 2023 15:39:49 +0300 Subject: [PATCH 41/67] updates to Stanza NLP engine + tests --- presidio-analyzer/conf/default.yaml | 13 ++++++++++++ presidio-analyzer/conf/spacy.yaml | 13 ++++++++++++ .../conf/spacy_multilingual.yaml | 13 ++++++++++++ presidio-analyzer/conf/stanza.yaml | 12 +++++++++++ .../conf/stanza_multilingual.yaml | 12 +++++++++++ presidio-analyzer/conf/transformers.yaml | 5 +++++ .../nlp_engine/stanza_nlp_engine.py | 21 +++++++++++++++++-- presidio-analyzer/setup.py | 2 +- presidio-analyzer/tests/conftest.py | 2 -- .../tests/mocks/app_tracer_mock.py | 1 - .../tests/test_spacy_nlp_engine.py | 11 ++++------ .../tests/test_stanza_recognizer.py | 5 +++-- .../tests/test_transformers_nlp_engine.py | 8 ++----- .../tests/test_transformers_recognizer.py | 2 +- 14 files changed, 98 insertions(+), 22 deletions(-) diff --git a/presidio-analyzer/conf/default.yaml b/presidio-analyzer/conf/default.yaml index 92c163441..681eeb1d7 100644 --- a/presidio-analyzer/conf/default.yaml +++ b/presidio-analyzer/conf/default.yaml @@ -3,3 +3,16 @@ models: - lang_code: en model_name: en_core_web_lg + +ner_model_configuration: + model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + NORP: NRP + FAC: FACILITY + LOC: LOCATION + LOCATION: LOCATION + ORG: ORGANIZATION + ORGANIZATION: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME diff --git a/presidio-analyzer/conf/spacy.yaml b/presidio-analyzer/conf/spacy.yaml index 92c163441..681eeb1d7 100644 --- a/presidio-analyzer/conf/spacy.yaml +++ b/presidio-analyzer/conf/spacy.yaml @@ -3,3 +3,16 @@ models: - lang_code: en model_name: en_core_web_lg + +ner_model_configuration: + model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + NORP: NRP + FAC: FACILITY + LOC: LOCATION + LOCATION: LOCATION + ORG: ORGANIZATION + ORGANIZATION: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME diff --git a/presidio-analyzer/conf/spacy_multilingual.yaml b/presidio-analyzer/conf/spacy_multilingual.yaml index de4868f73..a78c86a88 100644 --- a/presidio-analyzer/conf/spacy_multilingual.yaml +++ b/presidio-analyzer/conf/spacy_multilingual.yaml @@ -9,3 +9,16 @@ models: - lang_code: es model_name: es_core_news_md + +ner_model_configuration: + model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + NORP: NRP + FAC: FACILITY + LOC: LOCATION + LOCATION: LOCATION + ORG: ORGANIZATION + ORGANIZATION: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME diff --git a/presidio-analyzer/conf/stanza.yaml b/presidio-analyzer/conf/stanza.yaml index 7d8090e4a..cfdd9e646 100644 --- a/presidio-analyzer/conf/stanza.yaml +++ b/presidio-analyzer/conf/stanza.yaml @@ -4,3 +4,15 @@ models: lang_code: en model_name: en +ner_model_configuration: + model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + NORP: NRP + FAC: FACILITY + LOC: LOCATION + LOCATION: LOCATION + ORG: ORGANIZATION + ORGANIZATION: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME diff --git a/presidio-analyzer/conf/stanza_multilingual.yaml b/presidio-analyzer/conf/stanza_multilingual.yaml index d0e02e39c..00947e2ad 100644 --- a/presidio-analyzer/conf/stanza_multilingual.yaml +++ b/presidio-analyzer/conf/stanza_multilingual.yaml @@ -7,3 +7,15 @@ models: lang_code: de model_name: de +ner_model_configuration: + model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + NORP: NRP + FAC: FACILITY + LOC: LOCATION + 
LOCATION: LOCATION +    ORG: ORGANIZATION +    ORGANIZATION: ORGANIZATION +    DATE: DATE_TIME +    TIME: DATE_TIME diff --git a/presidio-analyzer/conf/transformers.yaml b/presidio-analyzer/conf/transformers.yaml index 644fdf25a..e7843fa42 100644 --- a/presidio-analyzer/conf/transformers.yaml +++ b/presidio-analyzer/conf/transformers.yaml @@ -18,8 +18,12 @@ ner_model_configuration:   alignment_mode: strict # "strict", "contract", "expand"   model_to_presidio_entity_mapping:     PER: PERSON +    PERSON: PERSON     LOC: LOCATION +    LOCATION: LOCATION     ORG: ORGANIZATION +    ORGANIZATION: ORGANIZATION +    NORP: NRP     AGE: AGE     ID: ID     EMAIL: EMAIL @@ -31,6 +35,7 @@ ner_model_configuration:     PHONE: PHONE_NUMBER     HCW: PERSON     HOSPITAL: ORGANIZATION +    FACILITY: LOCATION    low_confidence_score_multiplier: 0.4   low_score_entity_names: diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py index 89977275b..b9dcd7ed3 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py @@ -1,4 +1,5 @@ import logging +from typing import Optional, Dict, List  try:     import stanza @@ -6,7 +7,7 @@ except ImportError:     stanza = None  -from presidio_analyzer.nlp_engine import SpacyNlpEngine +from presidio_analyzer.nlp_engine import SpacyNlpEngine, NerModelConfiguration  logger = logging.getLogger("presidio-analyzer") @@ -19,11 +20,25 @@ class StanzaNlpEngine(SpacyNlpEngine):     on tokens.      The StanzaNlpEngine uses spacy-stanza and stanza as its NLP module  +    :param models: A list of dictionaries with the stanza model name per language. +    For example: models = [{"lang_code": "en", "model_name": "en"}] +    :param download_if_missing: Whether to download the stanza model when it is not installed locally. +    :param ner_model_configuration: Parameters for the NER model.
+ See conf/stanza.yaml for an example + """ engine_name = "stanza" is_available = bool(stanza) + def __init__( + self, + models: Optional[List[Dict[str, str]]] = None, + ner_model_configuration: Optional[NerModelConfiguration] = None, + download_if_missing: bool = True, + ): + super().__init__(models, ner_model_configuration) + self.download_if_missing = download_if_missing + def load(self) -> None: """Load the NLP model.""" @@ -35,5 +50,7 @@ def load(self) -> None: self.nlp[model["lang_code"]] = spacy_stanza.load_pipeline( model["model_name"], processors="tokenize,pos,lemma,ner", - download_method=None, + download_method="DOWNLOAD_RESOURCES" + if self.download_if_missing + else None, ) diff --git a/presidio-analyzer/setup.py b/presidio-analyzer/setup.py index e616b55ef..16220305c 100644 --- a/presidio-analyzer/setup.py +++ b/presidio-analyzer/setup.py @@ -39,7 +39,7 @@ "phonenumbers>=8.12", ], extras_require={ - 'transformers': ['spacy_huggingface_pipelines'], + "transformers": ["spacy_huggingface_pipelines"], "stanza": ["stanza", "spacy_stanza"], }, include_package_data=True, diff --git a/presidio-analyzer/tests/conftest.py b/presidio-analyzer/tests/conftest.py index 948f5f20a..0945a0de0 100644 --- a/presidio-analyzer/tests/conftest.py +++ b/presidio-analyzer/tests/conftest.py @@ -16,7 +16,6 @@ from tests.mocks import RecognizerRegistryMock, NlpEngineMock - def pytest_configure(config): config.addinivalue_line( "markers", "skip_engine(nlp_engine): skip test for given nlp engine" @@ -128,7 +127,6 @@ def zip_code_recognizer(): return zip_recognizer - def pytest_sessionfinish(): """Remove files created during mock spaCy models creation.""" diff --git a/presidio-analyzer/tests/mocks/app_tracer_mock.py b/presidio-analyzer/tests/mocks/app_tracer_mock.py index 3f54d9057..36ff76a52 100644 --- a/presidio-analyzer/tests/mocks/app_tracer_mock.py +++ b/presidio-analyzer/tests/mocks/app_tracer_mock.py @@ -5,7 +5,6 @@ class AppTracerMock(AppTracer): def __init__(self, enable_decision_process=True): - logger = logging.getLogger("DecisionProcessMock") if not logger.handlers: ch = logging.StreamHandler() diff --git a/presidio-analyzer/tests/test_spacy_nlp_engine.py b/presidio-analyzer/tests/test_spacy_nlp_engine.py index d09fdbe87..033f22da4 100644 --- a/presidio-analyzer/tests/test_spacy_nlp_engine.py +++ b/presidio-analyzer/tests/test_spacy_nlp_engine.py @@ -6,7 +6,6 @@ def test_simple_process_text(spacy_nlp_engine): - nlp_artifacts = spacy_nlp_engine.process_text("simple text", language="en") assert len(nlp_artifacts.tokens) == 2 assert not nlp_artifacts.entities @@ -15,7 +14,6 @@ def test_simple_process_text(spacy_nlp_engine): def test_process_batch_strings(spacy_nlp_engine): - nlp_artifacts_batch = spacy_nlp_engine.process_batch( ["simple text", "simple text"], language="en" ) @@ -30,14 +28,13 @@ def test_process_batch_strings(spacy_nlp_engine): def test_nlp_not_loaded_value_error(): unloaded_spacy_nlp = SpacyNlpEngine() with pytest.raises(ValueError): - unloaded_spacy_nlp.process_text("This should fail as the NLP model isn't loaded", language="en") + unloaded_spacy_nlp.process_text( + "This should fail as the NLP model isn't loaded", language="en" + ) def test_validate_model_params_missing_fields(): - model = { - "lang_code": "en", - "model_name": "en_core_web_;g" - } + model = {"lang_code": "en", "model_name": "en_core_web_lg"} for key in model.keys(): new_model = model.copy() diff --git a/presidio-analyzer/tests/test_stanza_recognizer.py b/presidio-analyzer/tests/test_stanza_recognizer.py index 
94f53f4ad..d55877e49 100644 --- a/presidio-analyzer/tests/test_stanza_recognizer.py +++ b/presidio-analyzer/tests/test_stanza_recognizer.py @@ -16,6 +16,7 @@ def spacy_nlp_engine(nlp_engines): nlp_engine.load() return nlp_engine + @pytest.mark.skip_engine("stanza_en") @pytest.fixture(scope="module") def nlp_recognizer(nlp_recognizers): @@ -58,7 +59,7 @@ def test_when_using_stanza_then_all_stanza_result_correct( expected_len, expected_positions, entity_num, - spacy_nlp_engine, + spacy_nlp_engine, nlp_recognizer, entities, ner_strength, @@ -75,7 +76,7 @@ def test_when_using_stanza_then_all_stanza_result_correct( @pytest.mark.skip_engine("stanza_en") def test_when_person_in_text_then_person_full_name_complex_found( - spacy_nlp_engine, nlp_recognizer, entities + spacy_nlp_engine, nlp_recognizer, entities ): text = "Richard (Rick) C. Henderson" results = prepare_and_analyze(spacy_nlp_engine, nlp_recognizer, text, entities) diff --git a/presidio-analyzer/tests/test_transformers_nlp_engine.py b/presidio-analyzer/tests/test_transformers_nlp_engine.py index c11db92b0..cd0274e0d 100644 --- a/presidio-analyzer/tests/test_transformers_nlp_engine.py +++ b/presidio-analyzer/tests/test_transformers_nlp_engine.py @@ -21,14 +21,10 @@ def test_validate_model_params_happy_path(): TransformersNlpEngine._validate_model_params(model) + @pytest.mark.parametrize( "key", - [ - ("lang_code"), - ("model_name"), - ("model_name.spacy"), - ("model_name.transformers") - ], + [("lang_code"), ("model_name"), ("model_name.spacy"), ("model_name.transformers")], ) def test_validate_model_params_missing_fields(key): model = { diff --git a/presidio-analyzer/tests/test_transformers_recognizer.py b/presidio-analyzer/tests/test_transformers_recognizer.py index 9ebc1fa57..959988093 100644 --- a/presidio-analyzer/tests/test_transformers_recognizer.py +++ b/presidio-analyzer/tests/test_transformers_recognizer.py @@ -83,7 +83,7 @@ def test_when_using_transformers_then_all_transformers_result_correct( @pytest.mark.itegration @pytest.mark.skip_engine("transformers_en") def test_when_person_in_text_then_person_full_name_complex_found( - nlp_engine, nlp_recognizer, entities + nlp_engine, nlp_recognizer, entities ): text = "Richard (Rick) C. 
Henderson" results = prepare_and_analyze(nlp_engine, nlp_recognizer, text, entities) From 1c556c98bdd2339185a9a04bad16fff7adada7d8 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 19 Sep 2023 13:56:31 +0300 Subject: [PATCH 42/67] tests fix --- .../nlp_engine/ner_model_configuration.py | 2 -- .../nlp_engine/nlp_engine_provider.py | 9 +++++++++ .../tests/test_analyzer_engine.py | 1 - .../tests/test_ner_model_configuration.py | 18 ------------------ .../tests/test_spacy_recognizer.py | 1 - .../tests/test_transformers_recognizer.py | 3 +-- 6 files changed, 10 insertions(+), 24 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index 67b181270..a1e311eee 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -99,8 +99,6 @@ def __validate_type(config_dict: Dict, key: str, field_type: Type) -> None: if key in config_dict: if not isinstance(config_dict[key], field_type): raise ValueError(f"{key} must be of type {field_type}") - else: - raise ValueError(f"NER configuration is missing '{key}'") @classmethod def from_dict(cls, nlp_engine_configuration: Dict) -> "NerModelConfiguration": diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index 6dfc3a87d..4037bf1a0 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -9,6 +9,7 @@ SpacyNlpEngine, NlpEngine, TransformersNlpEngine, + NerModelConfiguration, ) logger = logging.getLogger("presidio-analyzer") @@ -88,6 +89,9 @@ def create_engine(self) -> NlpEngine: ner_model_configuration = self.nlp_configuration.get( "ner_model_configuration" ) + if ner_model_configuration: + ner_model_configuration = NerModelConfiguration.from_dict(ner_model_configuration) + engine = nlp_engine_class( models=nlp_models, ner_model_configuration=ner_model_configuration ) @@ -117,6 +121,11 @@ def _read_nlp_conf(conf_file: Union[Path, str]) -> dict: else: nlp_configuration = yaml.safe_load(open(conf_file)) + if "ner_model_configuration" not in nlp_configuration: + logger.warning( + "configuration file is missing 'ner_model_configuration'. 
Using default" + ) + return nlp_configuration @staticmethod diff --git a/presidio-analyzer/tests/test_analyzer_engine.py b/presidio-analyzer/tests/test_analyzer_engine.py index 9494a1c66..659c4cb53 100644 --- a/presidio-analyzer/tests/test_analyzer_engine.py +++ b/presidio-analyzer/tests/test_analyzer_engine.py @@ -59,7 +59,6 @@ def nlp_engine(nlp_engines): return nlp_engines["spacy_en"] -@pytest.mark.integration def test_simple(): dic = { "text": "John Smith drivers license is AC432223", diff --git a/presidio-analyzer/tests/test_ner_model_configuration.py b/presidio-analyzer/tests/test_ner_model_configuration.py index 51d83774c..09c1e95cc 100644 --- a/presidio-analyzer/tests/test_ner_model_configuration.py +++ b/presidio-analyzer/tests/test_ner_model_configuration.py @@ -55,21 +55,3 @@ def test_from_dict_wrong_types(ner_model_configuration_dict, key, value): with pytest.raises(ValueError): NerModelConfiguration.from_dict(new_config) - -@pytest.mark.parametrize( - "key", - [ - ("labels_to_ignore"), - ("aggregation_strategy"), - ("alignment_mode"), - ("model_to_presidio_entity_mapping"), - ("low_confidence_score_multiplier"), - ("low_score_entity_names"), - ("stride"), - ], -) -def test_from_dict_missing_fields(ner_model_configuration_dict, key): - new_config = ner_model_configuration_dict.copy() - del new_config[key] - with pytest.raises(ValueError): - NerModelConfiguration.from_dict(new_config) diff --git a/presidio-analyzer/tests/test_spacy_recognizer.py b/presidio-analyzer/tests/test_spacy_recognizer.py index f58b52fab..92ed0947c 100644 --- a/presidio-analyzer/tests/test_spacy_recognizer.py +++ b/presidio-analyzer/tests/test_spacy_recognizer.py @@ -20,7 +20,6 @@ def prepare_and_analyze(nlp, recognizer, text, ents): return results -@pytest.mark.itegration @pytest.mark.parametrize( "text, expected_len, expected_positions, entity_num", [ diff --git a/presidio-analyzer/tests/test_transformers_recognizer.py b/presidio-analyzer/tests/test_transformers_recognizer.py index 959988093..2779bb7eb 100644 --- a/presidio-analyzer/tests/test_transformers_recognizer.py +++ b/presidio-analyzer/tests/test_transformers_recognizer.py @@ -30,7 +30,6 @@ def prepare_and_analyze(nlp, recognizer, text, entities): return results -@pytest.mark.itegration @pytest.mark.skip_engine("transformers_en") @pytest.mark.parametrize( "text, expected_len, expected_positions, entity_num", @@ -80,7 +79,7 @@ def test_when_using_transformers_then_all_transformers_result_correct( ) -@pytest.mark.itegration + @pytest.mark.skip_engine("transformers_en") def test_when_person_in_text_then_person_full_name_complex_found( nlp_engine, nlp_recognizer, entities From b58a799be7ff588b2d699052387e315c0f4c12ee Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 19 Sep 2023 14:11:33 +0300 Subject: [PATCH 43/67] linting --- .../presidio_analyzer/nlp_engine/nlp_engine_provider.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index 4037bf1a0..2f2d4ae8f 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -90,7 +90,9 @@ def create_engine(self) -> NlpEngine: "ner_model_configuration" ) if ner_model_configuration: - ner_model_configuration = NerModelConfiguration.from_dict(ner_model_configuration) + ner_model_configuration = NerModelConfiguration.from_dict( + 
ner_model_configuration + ) engine = nlp_engine_class( models=nlp_models, ner_model_configuration=ner_model_configuration From 3fb24947f76a72f1f3d9fc768a689ff98f42c277 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 19 Sep 2023 17:23:01 +0300 Subject: [PATCH 44/67] added GPE to mapping --- presidio-analyzer/conf/default.yaml | 1 + presidio-analyzer/conf/spacy.yaml | 1 + presidio-analyzer/conf/spacy_multilingual.yaml | 1 + presidio-analyzer/conf/stanza.yaml | 1 + presidio-analyzer/conf/stanza_multilingual.yaml | 1 + presidio-analyzer/conf/transformers.yaml | 2 ++ presidio-analyzer/tests/conf/default.yaml | 13 +++++++++++++ presidio-analyzer/tests/conf/test.yaml | 14 ++++++++++++++ 8 files changed, 34 insertions(+) diff --git a/presidio-analyzer/conf/default.yaml b/presidio-analyzer/conf/default.yaml index 681eeb1d7..471f43c5a 100644 --- a/presidio-analyzer/conf/default.yaml +++ b/presidio-analyzer/conf/default.yaml @@ -11,6 +11,7 @@ ner_model_configuration: NORP: NRP FAC: FACILITY LOC: LOCATION + GPE: LOCATION LOCATION: LOCATION ORG: ORGANIZATION ORGANIZATION: ORGANIZATION diff --git a/presidio-analyzer/conf/spacy.yaml b/presidio-analyzer/conf/spacy.yaml index 681eeb1d7..adbddac4c 100644 --- a/presidio-analyzer/conf/spacy.yaml +++ b/presidio-analyzer/conf/spacy.yaml @@ -12,6 +12,7 @@ ner_model_configuration: FAC: FACILITY LOC: LOCATION LOCATION: LOCATION + GPE: LOCATION ORG: ORGANIZATION ORGANIZATION: ORGANIZATION DATE: DATE_TIME diff --git a/presidio-analyzer/conf/spacy_multilingual.yaml b/presidio-analyzer/conf/spacy_multilingual.yaml index a78c86a88..552ad8e75 100644 --- a/presidio-analyzer/conf/spacy_multilingual.yaml +++ b/presidio-analyzer/conf/spacy_multilingual.yaml @@ -18,6 +18,7 @@ ner_model_configuration: FAC: FACILITY LOC: LOCATION LOCATION: LOCATION + GPE: LOCATION ORG: ORGANIZATION ORGANIZATION: ORGANIZATION DATE: DATE_TIME diff --git a/presidio-analyzer/conf/stanza.yaml b/presidio-analyzer/conf/stanza.yaml index cfdd9e646..618bbbbfb 100644 --- a/presidio-analyzer/conf/stanza.yaml +++ b/presidio-analyzer/conf/stanza.yaml @@ -12,6 +12,7 @@ ner_model_configuration: FAC: FACILITY LOC: LOCATION LOCATION: LOCATION + GPE: LOCATION ORG: ORGANIZATION ORGANIZATION: ORGANIZATION DATE: DATE_TIME diff --git a/presidio-analyzer/conf/stanza_multilingual.yaml b/presidio-analyzer/conf/stanza_multilingual.yaml index 00947e2ad..7afaceeec 100644 --- a/presidio-analyzer/conf/stanza_multilingual.yaml +++ b/presidio-analyzer/conf/stanza_multilingual.yaml @@ -15,6 +15,7 @@ ner_model_configuration: FAC: FACILITY LOC: LOCATION LOCATION: LOCATION + GPE: LOCATION ORG: ORGANIZATION ORGANIZATION: ORGANIZATION DATE: DATE_TIME diff --git a/presidio-analyzer/conf/transformers.yaml b/presidio-analyzer/conf/transformers.yaml index e7843fa42..4ed00e454 100644 --- a/presidio-analyzer/conf/transformers.yaml +++ b/presidio-analyzer/conf/transformers.yaml @@ -21,6 +21,7 @@ ner_model_configuration: PERSON: PERSON LOC: LOCATION LOCATION: LOCATION + GPE: LOCATION ORG: ORGANIZATION ORGANIZATION: ORGANIZATION NORP: NRP @@ -32,6 +33,7 @@ ner_model_configuration: HOSP: ORGANIZATION PATORG: ORGANIZATION DATE: DATE_TIME + TIME: DATE_TIME PHONE: PHONE_NUMBER HCW: PERSON HOSPITAL: ORGANIZATION diff --git a/presidio-analyzer/tests/conf/default.yaml b/presidio-analyzer/tests/conf/default.yaml index 68f0f0f75..471f43c5a 100644 --- a/presidio-analyzer/tests/conf/default.yaml +++ b/presidio-analyzer/tests/conf/default.yaml @@ -4,3 +4,16 @@ models: lang_code: en model_name: en_core_web_lg +ner_model_configuration: + 
model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + NORP: NRP + FAC: FACILITY + LOC: LOCATION + GPE: LOCATION + LOCATION: LOCATION + ORG: ORGANIZATION + ORGANIZATION: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME diff --git a/presidio-analyzer/tests/conf/test.yaml b/presidio-analyzer/tests/conf/test.yaml index e95873ab1..e22e15e88 100644 --- a/presidio-analyzer/tests/conf/test.yaml +++ b/presidio-analyzer/tests/conf/test.yaml @@ -6,3 +6,17 @@ models: - lang_code: bn model_name: bn_test + +ner_model_configuration: + model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + NORP: NRP + FAC: FACILITY + LOC: LOCATION + GPE: LOCATION + LOCATION: LOCATION + ORG: ORGANIZATION + ORGANIZATION: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME From 0200d1a4d05b8b66cf2d68ab603ebe8b2b283381 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 20 Sep 2023 11:05:18 +0300 Subject: [PATCH 45/67] reverted installation.md --- docs/installation.md | 48 ++++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 73249ee99..dcaf66b83 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -10,43 +10,31 @@ Presidio suite using `pip` (as Python packages) or using `Docker` (As containeri ## Using pip !!! note "Note" - - Consider installing the Presidio python packages - on a virtual environment like [venv](https://docs.python.org/3/tutorial/venv.html) - or [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). + Consider installing the Presidio python packages + on a virtual environment like [venv](https://docs.python.org/3/tutorial/venv.html) + or [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). ### Supported Python Versions -Presidio is supported for the following python versions: 3.7, 3.8, 3.9, 3.10, 3.11. +Presidio is supported for the following python versions: + +* 3.7 +* 3.8 +* 3.9 +* 3.10 +* 3.11 -### PII de-identification in text +### PII anonymization on text For PII anonymization on text, install the `presidio-analyzer` and `presidio-anonymizer` packages: ```sh pip install presidio_analyzer pip install presidio_anonymizer -``` - -In addition, Presidio requires at least one NLP engine (spaCy, transformers or stanza): - -=== "spaCy (default)" - - ``` - python -m spacy download en_core_web_lg - ``` - -=== "Transformers" - ``` - pip install "presidio_analyzer[transformers]" - ``` - -=== "Stanza" - - ``` - pip install "presidio_analyzer[stanza]" - ``` +# Presidio analyzer requires a spaCy language model. +python -m spacy download en_core_web_lg +``` For a more detailed installation of each package, refer to the specific documentation: @@ -73,10 +61,9 @@ Presidio can expose REST endpoints for each service using Flask and Docker. To download the Presidio Docker containers, run the following command: !!! note "Note" + This requires Docker to be installed. [Download Docker](https://docs.docker.com/get-docker/). - This requires Docker to be installed. [Download Docker](https://docs.docker.com/get-docker/). - -### For PII de-identification in text +### For PII anonymization in text For PII detection and anonymization in text, the `presidio-analyzer` and `presidio-anonymizer` modules are required. @@ -126,8 +113,7 @@ git clone git@github.com:microsoft/presidio.git Then, build the containers locally. !!! 
note "Note" - - Presidio uses [docker-compose](https://docs.docker.com/compose/) to manage the different Presidio containers. + Presidio uses [docker-compose](https://docs.docker.com/compose/) to manage the different Presidio containers. From the root folder of the repo: From 49b056246c390a2a86cc9cd5acba0ab75b39c30d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 20 Sep 2023 11:06:19 +0300 Subject: [PATCH 46/67] reverted getting_started.md --- docs/getting_started.md | 55 +++-------------------------------------- 1 file changed, 3 insertions(+), 52 deletions(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index 04c7ec267..49def7a26 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -1,10 +1,10 @@ # Getting started with Presidio -## Simple flow: Text +## Simple flow Using Presidio's modules as Python packages to get started -=== "Anonymize PII in text (Default spaCy model)" +=== "Anonymize PII in text" 1. Install Presidio @@ -20,7 +20,7 @@ Using Presidio's modules as Python packages to get started from presidio_analyzer import AnalyzerEngine from presidio_anonymizer import AnonymizerEngine - text = "My phone number is 212-555-5555" + text="My phone number is 212-555-5555" # Set up the engine, loads the NLP module (spaCy model by default) # and other PII recognizers @@ -41,55 +41,6 @@ Using Presidio's modules as Python packages to get started print(anonymized_text) ``` -=== "Anonymize PII in text (transformers)" - - 1. Install Presidio - - ```sh - pip install "presidio-analyzer[transformers]" - pip install presidio-anonymizer - python -m spacy download en_core_web_sm - ``` - - 2. Analyze + Anonymize - - ```py - from presidio_analyzer import AnalyzerEngine - from presidio_analyzer.nlp_engine import TransformersNlpEngine - from presidio_anonymizer import AnonymizerEngine - - text = "My name is Don and my phone number is 212-555-5555" - - # Define which transformers model to use - model_config = [{"lang_code": "en", "model_name": { - "spacy": "en_core_web_sm", # use a small spaCy model for lemmas, tokens etc. - "transformers": "dslim/bert-base-NER" - } - }] - - nlp_engine = TransformersNlpEngine(models=model_config) - - # Set up the engine, loads the NLP module (spaCy model by default) - # and other PII recognizers - analyzer = AnalyzerEngine(nlp_engine=nlp_engine) - - # Call analyzer to get results - results = analyzer.analyze(text=text, language='en') - print(results) - - # Analyzer results are passed to the AnonymizerEngine for anonymization - - anonymizer = AnonymizerEngine() - - anonymized_text = anonymizer.anonymize(text=text, analyzer_results=results) - - print(anonymized_text) - ``` - - The transformers model and the spacy model would be downloaded on the first call to the `AnalyzerEngine`. - -## Simple flow: Images - === "Anonymize PII in images" 1. 
Install presidio-image-redactor From 023cc8fce5d9fb7729bd4274f9c2937d3b009693 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 20 Sep 2023 11:07:35 +0300 Subject: [PATCH 47/67] Update spacy.yaml --- presidio-analyzer/conf/spacy.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/presidio-analyzer/conf/spacy.yaml b/presidio-analyzer/conf/spacy.yaml index adbddac4c..342f46d63 100644 --- a/presidio-analyzer/conf/spacy.yaml +++ b/presidio-analyzer/conf/spacy.yaml @@ -17,3 +17,8 @@ ner_model_configuration:     ORGANIZATION: ORGANIZATION     DATE: DATE_TIME     TIME: DATE_TIME + +  low_confidence_score_multiplier: 0.4 +  low_score_entity_names: +  - ORG +  - ORGANIZATION From f81e3c39f15ad252a3ec5f41099293fb16b02832 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 20 Sep 2023 11:07:51 +0300 Subject: [PATCH 48/67] Update spacy_multilingual.yaml --- presidio-analyzer/conf/spacy_multilingual.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/presidio-analyzer/conf/spacy_multilingual.yaml b/presidio-analyzer/conf/spacy_multilingual.yaml index 552ad8e75..5b442593d 100644 --- a/presidio-analyzer/conf/spacy_multilingual.yaml +++ b/presidio-analyzer/conf/spacy_multilingual.yaml @@ -23,3 +23,8 @@ ner_model_configuration:     ORGANIZATION: ORGANIZATION     DATE: DATE_TIME     TIME: DATE_TIME + +  low_confidence_score_multiplier: 0.4 +  low_score_entity_names: +  - ORG +  - ORGANIZATION From 2a81f873cb657dc44f63a35aa291d3683a0308ba Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 21 Sep 2023 10:25:44 +0300 Subject: [PATCH 49/67] changed alignment_mode to expand --- presidio-analyzer/conf/transformers.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presidio-analyzer/conf/transformers.yaml b/presidio-analyzer/conf/transformers.yaml index 4ed00e454..6dff1ef71 100644 --- a/presidio-analyzer/conf/transformers.yaml +++ b/presidio-analyzer/conf/transformers.yaml @@ -15,7 +15,7 @@ ner_model_configuration:                   # length. The value is the length of the                   # window overlap in transformer tokenizer                   # tokens, NOT the length of the stride.
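  # alignment_mode controls how a predicted character span is snapped to
  # spaCy token boundaries: "strict" drops spans that do not align exactly,
  # "contract" shrinks a span to the tokens fully inside it, and "expand"
  # widens it to cover any partially overlapping token.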
- alignment_mode: strict # "strict", "contract", "expand" + alignment_mode: expand # "strict", "contract", "expand" model_to_presidio_entity_mapping: PER: PERSON PERSON: PERSON @@ -41,4 +41,4 @@ ner_model_configuration: low_confidence_score_multiplier: 0.4 low_score_entity_names: - - ID \ No newline at end of file + - ID From 72e7c2b57ca6b67e3a2faaf6e4e97bfbf1cd5516 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 21 Sep 2023 10:26:05 +0300 Subject: [PATCH 50/67] Update ner_model_configuration.py --- .../presidio_analyzer/nlp_engine/ner_model_configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index a1e311eee..b4e388302 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -52,7 +52,7 @@ class NerModelConfiguration: labels_to_ignore: Optional[Collection[str]] = None aggregation_strategy: Optional[str] = "simple" stride: Optional[int] = 14 - alignment_mode: Optional[str] = "strict" + alignment_mode: Optional[str] = "expand" default_score: Optional[float] = 0.85 model_to_presidio_entity_mapping: Optional[Dict[str, str]] = None low_score_entity_names: Optional[Collection] = None From a338b99f06229dc8f2f588342b9c71e08d732ce1 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 26 Sep 2023 23:13:45 +0300 Subject: [PATCH 51/67] minor changes after more testing --- presidio-analyzer/conf/default.yaml | 16 ++++++++++++++++ presidio-analyzer/conf/spacy.yaml | 11 +++++++++++ .../nlp_engine/nlp_engine_provider.py | 2 +- .../nlp_engine/spacy_nlp_engine.py | 2 +- .../predefined_recognizers/stanza_recognizer.py | 2 ++ .../transformers_recognizer.py | 2 ++ 6 files changed, 33 insertions(+), 2 deletions(-) diff --git a/presidio-analyzer/conf/default.yaml b/presidio-analyzer/conf/default.yaml index 471f43c5a..60791c517 100644 --- a/presidio-analyzer/conf/default.yaml +++ b/presidio-analyzer/conf/default.yaml @@ -17,3 +17,19 @@ ner_model_configuration: ORGANIZATION: ORGANIZATION DATE: DATE_TIME TIME: DATE_TIME + + low_confidence_score_multiplier: 0.4 + low_score_entity_names: + - ORG + - ORGANIZATION + labels_to_ignore: + - CARDINAL + - EVENT + - LANGUAGE + - LAW + - MONEY + - ORDINAL + - PERCENT + - PRODUCT + - QUANTITY + - WORK_OF_ART diff --git a/presidio-analyzer/conf/spacy.yaml b/presidio-analyzer/conf/spacy.yaml index 342f46d63..93fef1533 100644 --- a/presidio-analyzer/conf/spacy.yaml +++ b/presidio-analyzer/conf/spacy.yaml @@ -22,3 +22,14 @@ ner_model_configuration: low_score_entity_names: - ORG - ORGANIZATION + labels_to_ignore: + - CARDINAL + - EVENT + - LANGUAGE + - LAW + - MONEY + - ORDINAL + - PERCENT + - PRODUCT + - QUANTITY + - WORK_OF_ART diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index 2f2d4ae8f..b0701be7a 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -59,7 +59,7 @@ def __init__( if conf_file: self.nlp_configuration = self._read_nlp_conf(conf_file) - if not conf_file and not nlp_configuration: + if conf_file is None and nlp_configuration is None: conf_file = self._get_full_conf_path() logger.debug(f"Reading default conf file from {conf_file}") self.nlp_configuration = 
self._read_nlp_conf(conf_file) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index 66850592d..957bf0b00 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -217,7 +217,7 @@ def _get_updated_entities( else: logger.warning( f"Entity {ent.label_} is not mapped to a Presidio entity, " - f"but keeping anyway" + f"but keeping anyway. Add to `NerModelConfiguration.labels_to_ignore` to remove." ) # Remove presidio entities in the ignore list diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/stanza_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/stanza_recognizer.py index 1152cbeca..9479935b5 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/stanza_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/stanza_recognizer.py @@ -13,3 +13,5 @@ class StanzaRecognizer(SpacyRecognizer): def __init__(self, **kwargs): # noqa ANN003 self.DEFAULT_EXPLANATION = self.DEFAULT_EXPLANATION.replace("Spacy", "Stanza") super().__init__(**kwargs) + self.name = "StanzaRecognizer" + diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py index e78193d88..0a7255df9 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py @@ -34,3 +34,5 @@ def __init__(self, **kwargs): # noqa ANN003 "Spacy", "Transfromers" ) super().__init__(**kwargs) + self.name = "TransformersRecognizer" + From 52cce16e06fa8bc13947d454cba0ea208ca10c2a Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 26 Sep 2023 23:30:06 +0300 Subject: [PATCH 52/67] revert recognizer name change, no need. 
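Context for the revert: `EntityRecognizer.__init__` already falls back to the class name when no explicit name is passed, so the `self.name = "..."` assignments added earlier in this series were redundant; `build_explanation` now simply reports `self.name`. A minimal sketch (the class here is illustrative and relies only on that base-class default):

```python
from presidio_analyzer import EntityRecognizer


class DemoRecognizer(EntityRecognizer):
    """Hypothetical recognizer used only to demonstrate the name default."""

    def load(self):
        pass

    def analyze(self, text, entities, nlp_artifacts):
        return []


# With no explicit name, EntityRecognizer falls back to the class name,
# which is also what build_explanation now reports via self.name.
print(DemoRecognizer(supported_entities=["PERSON"]).name)  # "DemoRecognizer"
```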
--- .../predefined_recognizers/spacy_recognizer.py | 2 +- .../predefined_recognizers/stanza_recognizer.py | 2 -- .../predefined_recognizers/transformers_recognizer.py | 4 +--- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py index d8e4b5725..322242004 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py @@ -87,7 +87,7 @@ def build_explanation( :return: """ explanation = AnalysisExplanation( - recognizer=self.__class__.__name__, + recognizer=self.name, original_score=original_score, textual_explanation=explanation, ) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/stanza_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/stanza_recognizer.py index 9479935b5..1152cbeca 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/stanza_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/stanza_recognizer.py @@ -13,5 +13,3 @@ class StanzaRecognizer(SpacyRecognizer): def __init__(self, **kwargs): # noqa ANN003 self.DEFAULT_EXPLANATION = self.DEFAULT_EXPLANATION.replace("Spacy", "Stanza") super().__init__(**kwargs) - self.name = "StanzaRecognizer" - diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py index 0a7255df9..65791fc8b 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py @@ -31,8 +31,6 @@ class TransformersRecognizer(SpacyRecognizer): def __init__(self, **kwargs): # noqa ANN003 self.DEFAULT_EXPLANATION = self.DEFAULT_EXPLANATION.replace( - "Spacy", "Transfromers" + "Spacy", "Transformers" ) super().__init__(**kwargs) - self.name = "TransformersRecognizer" - From 8238cbd688c30a6288a8b4a4a983ae551f630c9e Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 26 Sep 2023 23:37:00 +0300 Subject: [PATCH 53/67] removed unnecessary field --- .../predefined_recognizers/transformers_recognizer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py index 65791fc8b..2f1109325 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/transformers_recognizer.py @@ -27,8 +27,6 @@ class TransformersRecognizer(SpacyRecognizer): "PHONE_NUMBER", ] - LOW_SCORE_ENTITY_NAMES = {"ID"} - def __init__(self, **kwargs): # noqa ANN003 self.DEFAULT_EXPLANATION = self.DEFAULT_EXPLANATION.replace( "Spacy", "Transformers" From d60315acbb01451d4c85c5ed60f41cc3117b102c Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 27 Sep 2023 00:50:01 +0300 Subject: [PATCH 54/67] updates to docs for new NLP engine and in general --- docs/analyzer/developing_recognizers.md | 36 ++++++------- docs/analyzer/languages.md | 3 +- docs/analyzer/nlp_engines/spacy_stanza.md | 16 ++++++ docs/analyzer/nlp_engines/transformers.md | 45 ++++++++++++++++- docs/faq.md | 2 +- docs/getting_started.md | 2 +- docs/installation.md 
| 2 +- .../image_redaction_allow_list_approach.ipynb | 2 +- docs/tutorial/05_languages.md | 4 +- docs/tutorial/index.md | 2 +- mkdocs.yml | 50 +++++++++++-------- 11 files changed, 112 insertions(+), 52 deletions(-) diff --git a/docs/analyzer/developing_recognizers.md b/docs/analyzer/developing_recognizers.md index 98e1631b4..3772867ce 100644 --- a/docs/analyzer/developing_recognizers.md +++ b/docs/analyzer/developing_recognizers.md @@ -7,7 +7,8 @@ Recognizers define the logic for detection, as well as the confidence a predicti ### Accuracy -Each recognizer, regardless of its complexity, could have false positives and false negatives. When adding new recognizers, we try to balance the effect of each recognizer on the entire system. A recognizer with many false positives would affect the system's usability, while a recognizer with many false negatives might require more work before it can be integrated. For reproducibility purposes, it is be best to note how the recognizer's accuracy was tested, and on which datasets. +Each recognizer, regardless of its complexity, could have false positives and false negatives. When adding new recognizers, we try to balance the effect of each recognizer on the entire system. +A recognizer with many false positives would affect the system's usability, while a recognizer with many false negatives might require more work before it can be integrated. For reproducibility purposes, it is best to note how the recognizer's accuracy was tested, and on which datasets. For tools and documentation on evaluating and analyzing recognizers, refer to the [presidio-research Github repository](https://github.com/microsoft/presidio-research). !!! note "Note" @@ -22,7 +23,8 @@ Make sure your recognizer doesn't take too long to process text. Anything above ### Environment -When adding new recognizers that have 3rd party dependencies, make sure that the new dependencies don't interfere with Presidio's dependencies. In the case of a conflict, one can create an isolated model environment (outside the main presidio-analyzer process) and implement a [`RemoteRecognizer`](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/remote_recognizer.py) on the presidio-analyzer side to interact with the model's endpoint. In addition, make sure the license on the 3rd party dependency allows you to use it for any purpose. +When adding new recognizers that have 3rd party dependencies, make sure that the new dependencies don't interfere with Presidio's dependencies. +In the case of a conflict, one can create an isolated model environment (outside the main presidio-analyzer process) and implement a [`RemoteRecognizer`](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/remote_recognizer.py) on the presidio-analyzer side to interact with the model's endpoint. ## Recognizer Types Generally speaking, there are three types of recognizers: A deny list is a list of words that should be removed during text analysis. For example, it can include a list of titles (`["Mr.", "Mrs.", "Ms.", "Dr."]` to detect a "Title" entity.) -See [this documentation](index.md#how-to-add-a-new-recognizer) on adding a new recognizer. The [`PatternRecognizer`](/presidio-analyzer/presidio_analyzer/pattern_recognizer.py) class has built-in support for a deny-list input. +See [this documentation](index.md#how-to-add-a-new-recognizer) on adding a new recognizer.
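To make the deny-list approach above concrete, here is a minimal sketch using `PatternRecognizer`'s deny-list support (the `TITLE` entity name and the sample text are illustrative, not part of this patch; assumes the default spaCy model is installed):

```python
from presidio_analyzer import AnalyzerEngine, PatternRecognizer

# A deny-list recognizer: any exact match from the list is tagged as TITLE
titles_recognizer = PatternRecognizer(
    supported_entity="TITLE",
    deny_list=["Mr.", "Mrs.", "Ms.", "Dr."],
)

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(titles_recognizer)

# "Dr." and "Mr." are expected to come back as TITLE results
print(analyzer.analyze(text="Dr. Jane Smith met Mr. Jones", language="en"))
```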
The [`PatternRecognizer`](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/pattern_recognizer.py) class has built-in support for a deny-list input. ### Pattern Based @@ -47,36 +49,26 @@ See some examples here: ### Machine Learning (ML) Based or Rule-Based Many PII entities are undetectable using naive approaches like deny-lists or regular expressions. -In these cases, we would wish to utilize a Machine Learning model capable of identifying entities in free text, or a rule-based recognizer. There are four options for adding ML and rule based recognizers: +In these cases, we would wish to utilize a Machine Learning model capable of identifying entities in free text, or a rule-based recognizer. -#### Utilize SpaCy, Stanza or Transformers +#### ML: Utilize SpaCy, Stanza or Transformers Presidio currently uses [spaCy](https://spacy.io/) as a framework for text analysis and Named Entity Recognition (NER), and [stanza](https://stanfordnlp.github.io/stanza/) and [huggingface transformers](https://huggingface.co/docs/transformers/index) as an alternative. To avoid introducing new tools, it is recommended to first try to use `spaCy`, `stanza` or `transformers` over other tools if possible. `spaCy` provides decent results compared to state-of-the-art NER models, but with much better computational performance. -`spaCy` and `stanza` models could be trained from scratch, used in combination with pre-trained embeddings, or retrained to detect new entities. -When integrating such a model into Presidio, a class inheriting from the [`EntityRecognizer`](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/entity_recognizer.py) should be created. +`spaCy`, `stanza` and `transformers` models could be trained from scratch, used in combination with pre-trained embeddings, or be fine-tuned. -#### Utilize Scikit-learn or Similar - -`Scikit-learn` models tend to be fast, but usually have lower accuracy than deep learning methods. However, for well defined problems with well defined features, they can provide very good results. -When integrating such a model into Presidio, a class inheriting from the [`EntityRecognizer`](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/entity_recognizer.py) should be created. +In addition to those, it is also possible to use other ML models. In that case, a new `EntityRecognizer` should be created. +See an example using [Flair here](https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py). #### Apply Custom Logic -In some cases, rule-based logic provides the best way of detecting entities. -The Presidio `EntityRecognizer` API allows you to use `spaCy`/`stanza` extracted features like lemmas, part of speech, dependencies and more to create your logic. When integrating such logic into Presidio, a class inheriting from the [`EntityRecognizer`](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/entity_recognizer.py) should be created. -#### Deep Learning Based Methods - -Deep learning methods offer excellent detection rates for NER. -They are however more complex to train, deploy and tend to be slower than traditional approaches. -When creating a DL based method for PII detection, there are two main alternatives for integrating it with Presidio: - -1. Create an external endpoint (either local or remote) which is isolated from the `presidio-analyzer` process.
On the `presidio-analyzer` side, one would extend the [`RemoteRecognizer`](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/remote_recognizer.py) class and implement the network interface between `presidio-analyzer` and the endpoint of the model's container. -2. Integrate the model as an additional [`EntityRecognizer`](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/entity_recognizer.py) within the `presidio-analyzer` flow. +In some cases, rule-based logic provides reasonable ways for detecting entities. +The Presidio `EntityRecognizer` API allows you to use `spaCy` extracted features like lemmas, part of speech, dependencies and more to create your logic. +When integrating such logic into Presidio, a class inheriting from the [`EntityRecognizer`](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/entity_recognizer.py) should be created. !!! attention "Considerations for selecting one option over another" + - Accuracy. - Ease of integration. - Runtime considerations (For example if the new model requires a GPU). - 3rd party dependencies of the new model vs. the existing `presidio-analyzer` package. diff --git a/docs/analyzer/languages.md b/docs/analyzer/languages.md index aee03bcec..7d51dcd17 100644 --- a/docs/analyzer/languages.md +++ b/docs/analyzer/languages.md @@ -64,6 +64,7 @@ analyzer = AnalyzerEngine( analyzer.analyze(text="My name is David", language="en") ``` +The `LANGUAGES_CONFIG_FILE` used above points to [languages-config.yml](https://github.com/microsoft/presidio/blob/main/docs/analyzer/languages-config.yml) ### Automatically install NLP models into the Docker container @@ -73,4 +74,4 @@ update the [conf/default.yaml](https://github.com/microsoft/presidio/blob/main/p the `docker build` phase and the models defined in it are installed automatically. For `transformers` based models, the configuration [can be found here](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/conf/transformers.yaml). -In addition, make sure the Docker file contains the relevant packages for `transformers`, which are not loaded automatically with Presidio. +A Dockerfile supporting transformers models [can be found here](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/Dockerfile.transformers). diff --git a/docs/analyzer/nlp_engines/spacy_stanza.md b/docs/analyzer/nlp_engines/spacy_stanza.md index 435e5752a..d0372570f 100644 --- a/docs/analyzer/nlp_engines/spacy_stanza.md +++ b/docs/analyzer/nlp_engines/spacy_stanza.md @@ -34,6 +34,22 @@ For the available models, follow these links: [spaCy](https://spacy.io/usage/mod Once created, see [the NLP configuration documentation](../customizing_nlp_models.md#Configure-Presidio-to-use-the-new-model) for more information. +## How NER results flow within Presidio +This diagram describes the flow of NER results within Presidio, and the relationship between the `SpacyNlpEngine` component and the `SpacyRecognizer` component: +```mermaid +sequenceDiagram + AnalyzerEngine->>SpacyNlpEngine: Call engine.process_text(text) <br>
to get model results + SpacyNlpEngine->>spaCy: Call spaCy pipeline + spaCy->>SpacyNlpEngine: return entities and other attributes + Note over SpacyNlpEngine: Map entity names to Presidio's, <br>update scores, <br>remove unwanted entities <br>based on NerModelConfiguration + SpacyNlpEngine->>AnalyzerEngine: Pass NlpArtifacts <br>
(Entities, lemmas, tokens, scores etc.) + Note over AnalyzerEngine: Call all recognizers + AnalyzerEngine->>SpacyRecognizer: Pass NlpArtifacts + Note over SpacyRecognizer: Extract PII entities out of NlpArtifacts + SpacyRecognizer->>AnalyzerEngine: Return List[RecognizerResult] + +``` + ## Training your own model !!! note "Note" diff --git a/docs/analyzer/nlp_engines/transformers.md b/docs/analyzer/nlp_engines/transformers.md index a36f0f94b..bee44ea89 100644 --- a/docs/analyzer/nlp_engines/transformers.md +++ b/docs/analyzer/nlp_engines/transformers.md @@ -7,13 +7,31 @@ Presidio's `TransformersNlpEngine` consists of a spaCy pipeline which encapsulat Presidio leverages other types of information from spaCy such as tokens, lemmas and part-of-speech. Therefore the pipeline returns both the NER model results as well as results from other pipeline components. +## How NER results flow within Presidio +This diagram describes the flow of NER results within Presidio, and the relationship between the `TransformersNlpEngine` component and the `TransformersRecognizer` component: +```mermaid +sequenceDiagram + AnalyzerEngine->>TransformersNlpEngine: Call engine.process_text(text)
<br>to get model results + TransformersNlpEngine->>spaCy: Call spaCy pipeline + spaCy->>transformers: call NER model + transformers->>spaCy: get entities + spaCy->>TransformersNlpEngine: return transformers entities <br>+ spaCy attributes + Note over TransformersNlpEngine: Map entity names to Presidio's, <br>update scores, <br>remove unwanted entities <br>based on NerModelConfiguration + TransformersNlpEngine->>AnalyzerEngine: Pass NlpArtifacts <br>
(Entities, lemmas, tokens, scores etc.) + Note over AnalyzerEngine: Call all recognizers + AnalyzerEngine->>TransformersRecognizer: Pass NlpArtifacts + Note over TransformersRecognizer: Extract PII entities out of NlpArtifacts + TransformersRecognizer->>AnalyzerEngine: Return List[RecognizerResult] + +``` + ## Adding a new model As the underlying transformers model, you can choose from either a public pretrained model or a custom model. ### Using a public pre-trained transformers model -### Downloading a pre-trained model +#### Downloading a pre-trained model To download the desired NER model from HuggingFace: @@ -100,6 +118,31 @@ See more information on parameters on the [spacy-huggingface-pipelines Github re Once created, see [the NLP configuration documentation](../customizing_nlp_models.md#Configure-Presidio-to-use-the-new-model) for more information. +#### Calling the new model + +Once the configuration file is created, it can be used to create a new `TransformersNlpEngine`: + +```python + from presidio_analyzer import AnalyzerEngine, RecognizerRegistry + from presidio_analyzer.nlp_engine import NlpEngineProvider + + # Create configuration containing engine name and models + conf_file = PATH_TO_CONF_FILE + + # Create NLP engine based on configuration + provider = NlpEngineProvider(conf_file=conf_file) + nlp_engine = provider.create_engine() + + # Pass the created NLP engine and supported_languages to the AnalyzerEngine + analyzer = AnalyzerEngine( + nlp_engine=nlp_engine, + supported_languages=["en"] + ) + + results_english = analyzer.analyze(text="My name is Morris", language="en") + print(results_english) +``` + ### Training your own model !!! note "Note" diff --git a/docs/faq.md b/docs/faq.md index d5c894cb3..113230ad4 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -45,7 +45,7 @@ By developing Presidio, our goals are: ### Is Microsoft Presidio an official Microsoft product? -The authors and maintainers of Presidio come from the [Commercial Software Engineering]([https://microsoft/github.io/code-with-engineering-playbook/cse](https://microsoft.github.io/code-with-engineering-playbook/CSE/)) team. We work with customers on various engineering problems, and have found the proper handling of private and sensitive data a recurring challenge across many customers and industries. +The authors and maintainers of Presidio come from the [Industry Solutions Engineering](https://microsoft.github.io/code-with-engineering-playbook) team. We work with customers on various engineering problems, and have found the proper handling of private and sensitive data a recurring challenge across many customers and industries. !!! note "Note" Microsoft Presidio is not an official Microsoft product. Usage terms are defined in the [repository's license](https://github.com/microsoft/presidio/blob/main/LICENSE). diff --git a/docs/getting_started.md b/docs/getting_started.md index 71b36baab..b1e3c6e34 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -4,7 +4,7 @@ Using Presidio's modules as Python packages to get started: -=== "Anonymize PII in text (Default spaCy model)" +===+ "Anonymize PII in text (Default spaCy model)" 1. Install Presidio diff --git a/docs/installation.md b/docs/installation.md index d1efafaa3..cde574992 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -22,7 +22,7 @@ Presidio is supported for the following python versions: 3.7, 3.8, 3.9, 3.10, 3. 
For PII anonymization on text, install the `presidio-analyzer` and `presidio-anonymizer` packages with at least one NLP engine (`spaCy`, `transformers` or `stanza`): -=== "spaCy (default)" +===+ "spaCy (default)" ``` pip install presidio_analyzer diff --git a/docs/samples/python/image_redaction_allow_list_approach.ipynb b/docs/samples/python/image_redaction_allow_list_approach.ipynb index fc7b38166..91ed0f2f6 100644 --- a/docs/samples/python/image_redaction_allow_list_approach.ipynb +++ b/docs/samples/python/image_redaction_allow_list_approach.ipynb @@ -146,7 +146,7 @@ "metadata": {}, "source": [ "### 1.2 DICOM medical image\n", - "For more information on DICOM image redaction, please see [example_dicom_image_redactor.ipynb](./example_dicom_image_redactor.ipynb) and the [Image redactor module documentation](../../../image-redactor/index.md)." + "For more information on DICOM image redaction, please see [example_dicom_image_redactor.ipynb](./example_dicom_image_redactor.ipynb) and the [Image redactor module documentation](../../image-redactor/index.md)." ] }, { diff --git a/docs/tutorial/05_languages.md b/docs/tutorial/05_languages.md index 453a4b2cb..4cc2fd06a 100644 --- a/docs/tutorial/05_languages.md +++ b/docs/tutorial/05_languages.md @@ -47,10 +47,10 @@ print("Results from English request:") print(results_english) ``` -[See this documentation](https://microsoft.github.io/presidio/analyzer/languages/) for more details on how to configure Presidio support additional NLP models and languages. +[See this documentation](https://microsoft.github.io/presidio/analyzer/languages/) for more details on setting up additional NLP models and languages. ## Using external models/frameworks -Some languages are not supported by spaCy/Stanza, or have very limited support in those. In this case, other frameworks could be leveraged. (see [example 4](04_external_services.md) for more information). +Some languages are not supported by spaCy/Stanza/huggingface, or have very limited support in those. In this case, other frameworks could be leveraged. (see [example 4](04_external_services.md) for more information). Since Presidio requires a spaCy model to be passed, we propose to use a simple spaCy pipeline such as `en_core_web_sm` as the NLP engine's model, and a recognizer calling an external framework/service as the Named Entity Recognition (NER) model. 
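As a brief illustration of the multi-language setup discussed in the hunk above, a minimal sketch of configuring one NLP model per language in code (the model names are examples; each model must be installed first, e.g. via `python -m spacy download es_core_news_md`):

```python
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider

# One spaCy model per supported language (both assumed to be installed)
configuration = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "en", "model_name": "en_core_web_lg"},
        {"lang_code": "es", "model_name": "es_core_news_md"},
    ],
}

nlp_engine = NlpEngineProvider(nlp_configuration=configuration).create_engine()
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en", "es"])

print(analyzer.analyze(text="My name is David", language="en"))
print(analyzer.analyze(text="Me llamo David", language="es"))
```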
diff --git a/docs/tutorial/index.md b/docs/tutorial/index.md index 6d2ea9045..63af3e4d0 100644 --- a/docs/tutorial/index.md +++ b/docs/tutorial/index.md @@ -16,7 +16,7 @@ This tutorial covers different customization use cases to: - [Supporting new models and languages](05_languages.md) - [Calling an external service for PII detection](04_external_services.md) - [Using context words](06_context.md) -- [Tracing the decision process](07_decision_process) +- [Tracing the decision process](07_decision_process.md) - [Loading recognizers from file](08_no_code.md) - [Ad-Hoc recognizers](09_ad_hoc.md) - [Simple anonymization](10_simple_anonymization.md) diff --git a/mkdocs.yml b/mkdocs.yml index f5774ccfb..14df6386a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,25 +10,26 @@ nav: - Home: index.md - Installation: installation.md - Quickstart: getting_started.md - - Step by step tutorial: - - Home: tutorial/index.md - - Getting started: tutorial/00_getting_started.md - - Deny-list recognizers: tutorial/01_deny_list.md - - Regex recognizers: tutorial/02_regex.md - - Rule-based recognizers: tutorial/03_rule_based.md - - Additional models/languages: tutorial/05_languages.md - - External services: tutorial/04_external_services.md - - Context enhancement: tutorial/06_context.md - - Decision process: tutorial/07_decision_process.md - - No-code recognizers: tutorial/08_no_code.md - - Ad-hoc recognizers: tutorial/09_ad_hoc.md - - Simple anonymization: tutorial/10_simple_anonymization.md - - Custom anonymization: tutorial/11_custom_anonymization.md - - Encryption/Decryption: tutorial/12_encryption.md - - Allow-lists: tutorial/13_allow_list.md + - Handling text: - Home: text_anonymization.md + - Step by step tutorial: + - Home: tutorial/index.md + - Getting started: tutorial/00_getting_started.md + - Deny-list recognizers: tutorial/01_deny_list.md + - Regex recognizers: tutorial/02_regex.md + - Rule-based recognizers: tutorial/03_rule_based.md + - Additional models/languages: tutorial/05_languages.md + - External services: tutorial/04_external_services.md + - Context enhancement: tutorial/06_context.md + - Decision process: tutorial/07_decision_process.md + - No-code recognizers: tutorial/08_no_code.md + - Ad-hoc recognizers: tutorial/09_ad_hoc.md + - Simple anonymization: tutorial/10_simple_anonymization.md + - Custom anonymization: tutorial/11_custom_anonymization.md + - Encryption/Decryption: tutorial/12_encryption.md + - Allow-lists: tutorial/13_allow_list.md - Presidio Analyzer: - Home: analyzer/index.md - Developing PII recognizers: @@ -46,7 +47,6 @@ nav: - Handling images: - Home: image-redactor/index.md - Evaluating DICOM redaction: image-redactor/evaluating_dicom_redaction.md - - Supported entities: supported_entities.md - Development and design: - Design: design.md - Setting up a development environment: development.md @@ -58,10 +58,12 @@ nav: - Presidio Anonymizer Python API: api/anonymizer_python.md - Presidio Image Redactor Python API: api/image_redactor_python.md - REST API reference: https://microsoft.github.io/presidio/api-docs/api-docs.html" target="_blank - - Samples: samples/index.md - - Community: community.md - - FAQ: faq.md - - Demo: https://huggingface.co/spaces/presidio/presidio_demo" target="_blank + - General: + - Supported entities: supported_entities.md + - Samples: samples/index.md + - Community: community.md + - FAQ: faq.md + - Demo: https://huggingface.co/spaces/presidio/presidio_demo" target="_blank theme: name: material custom_dir: overrides @@ -79,6 +81,7 @@ theme: features:
- navigation.instant - content.tabs.link + # - navigation.sections # - navigation.tabs # - navigation.tabs.sticky plugins: @@ -111,3 +114,8 @@ markdown_extensions: - pymdownx.pathconverter - pymdownx.tabbed: alternate_style: true + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format From 79434a21db03490bb2ba6151e1c3174caad35ad5 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 27 Sep 2023 00:51:47 +0300 Subject: [PATCH 55/67] newline --- .../presidio_analyzer/nlp_engine/spacy_nlp_engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index 957bf0b00..23ba066b7 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -217,7 +217,8 @@ def _get_updated_entities( else: logger.warning( f"Entity {ent.label_} is not mapped to a Presidio entity, " - f"but keeping anyway. Add to `NerModelConfiguration.labels_to_ignore` to remove." + f"but keeping anyway. " + f"Add to `NerModelConfiguration.labels_to_ignore` to remove." ) # Remove presidio entities in the ignore list From e6df67e8dae97ebf1224edfcaad191a18ee02e5d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 09:52:28 +0300 Subject: [PATCH 56/67] Create spelling.yml --- .github/spelling.yml | 166 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 .github/spelling.yml diff --git a/.github/spelling.yml b/.github/spelling.yml new file mode 100644 index 000000000..38dece487 --- /dev/null +++ b/.github/spelling.yml @@ -0,0 +1,166 @@ +name: Check Spelling + +# Comment management is handled through a secondary job, for details see: +# https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Restricted-Permissions +# +# `jobs.comment-push` runs when a push is made to a repository and the `jobs.spelling` job needs to make a comment +# (in odd cases, it might actually run just to collapse a comment, but that's fairly rare) +# it needs `contents: write` in order to add a comment. +# +# `jobs.comment-pr` runs when a pull_request is made to a repository and the `jobs.spelling` job needs to make a comment +# or collapse a comment (in the case where it had previously made a comment and now no longer needs to show a comment) +# it needs `pull-requests: write` in order to manipulate those comments. + +# Updating pull request branches is managed via comment handling. +# For details, see: https://github.com/check-spelling/check-spelling/wiki/Feature:-Update-expect-list +# +# These elements work together to make it happen: +# +# `on.issue_comment` +# This event listens to comments by users asking to update the metadata. +# +# `jobs.update` +# This job runs in response to an issue_comment and will push a new commit +# to update the spelling metadata. +# +# `with.experimental_apply_changes_via_bot` +# Tells the action to support and generate messages that enable it +# to make a commit to update the spelling metadata. +# +# `with.ssh_key` +# In order to trigger workflows when the commit is made, you can provide a +# secret (typically, a write-enabled github deploy key). 
+# +# For background, see: https://github.com/check-spelling/check-spelling/wiki/Feature:-Update-with-deploy-key + +# Sarif reporting +# +# Access to Sarif reports is generally restricted (by GitHub) to members of the repository. +# +# Requires enabling `security-events: write` +# and configuring the action with `use_sarif: 1` +# +# For information on the feature, see: https://github.com/check-spelling/check-spelling/wiki/Feature:-Sarif-output + +# Minimal workflow structure: +# +# on: +# push: +# ... +# pull_request_target: +# ... +# jobs: +# # you only want the spelling job, all others should be omitted +# spelling: +# # remove `security-events: write` and `use_sarif: 1` +# # remove `experimental_apply_changes_via_bot: 1` +# ... otherwise adjust the `with:` as you wish + +on: + push: + branches: + - "**" + tags-ignore: + - "**" + pull_request_target: + branches: + - "**" + types: + - 'opened' + - 'reopened' + - 'synchronize' + issue_comment: + types: + - 'created' + +jobs: + spelling: + name: Check Spelling + permissions: + contents: read + pull-requests: read + actions: read + security-events: write + outputs: + followup: ${{ steps.spelling.outputs.followup }} + runs-on: ubuntu-latest + if: ${{ contains(github.event_name, 'pull_request') || github.event_name == 'push' }} + concurrency: + group: spelling-${{ github.event.pull_request.number || github.ref }} + # note: If you use only_check_changed_files, you do not want cancel-in-progress + cancel-in-progress: true + steps: + - name: check-spelling + id: spelling + uses: check-spelling/check-spelling@main + with: + suppress_push_for_open_pull_request: ${{ github.actor != 'dependabot[bot]' && 1 }} + checkout: true + check_file_names: 1 + spell_check_this: check-spelling/spell-check-this@prerelease + post_comment: 0 + use_magic_file: 1 + report-timing: 1 + warnings: bad-regex,binary-file,deprecated-feature,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration,no-files-to-check + experimental_apply_changes_via_bot: 1 + use_sarif: ${{ (!github.event.pull_request || (github.event.pull_request.head.repo.full_name == github.repository)) && 1 }} + extra_dictionary_limit: 20 + extra_dictionaries: + cspell:software-terms/dict/softwareTerms.txt + + comment-push: + name: Report (Push) + # If your workflow isn't running on push, you can remove this job + runs-on: ubuntu-latest + needs: spelling + permissions: + contents: write + if: (success() || failure()) && needs.spelling.outputs.followup && github.event_name == 'push' + steps: + - name: comment + uses: check-spelling/check-spelling@main + with: + checkout: true + spell_check_this: check-spelling/spell-check-this@prerelease + task: ${{ needs.spelling.outputs.followup }} + + comment-pr: + name: Report (PR) + # If you workflow isn't running on pull_request*, you can remove this job + runs-on: ubuntu-latest + needs: spelling + permissions: + contents: read + pull-requests: write + if: (success() || failure()) && needs.spelling.outputs.followup && contains(github.event_name, 'pull_request') + steps: + - name: comment + uses: check-spelling/check-spelling@main + with: + checkout: true + spell_check_this: check-spelling/spell-check-this@prerelease + task: ${{ needs.spelling.outputs.followup }} + experimental_apply_changes_via_bot: 1 + + update: + name: Update PR + permissions: + contents: write + pull-requests: write + actions: read + runs-on: ubuntu-latest + if: ${{ + 
github.event_name == 'issue_comment' && + github.event.issue.pull_request && + contains(github.event.comment.body, '@check-spelling-bot apply') + }} + concurrency: + group: spelling-update-${{ github.event.issue.number }} + cancel-in-progress: false + steps: + - name: apply spelling updates + uses: check-spelling/check-spelling@main + with: + experimental_apply_changes_via_bot: 1 + checkout: true + ssh_key: "${{ secrets.CHECK_SPELLING }}" From e3c31e7fbd595be4ccb7275a7db0642ca5f619da Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 15:11:48 +0300 Subject: [PATCH 57/67] Delete .github/spelling.yml --- .github/spelling.yml | 166 ------------------------------------------- 1 file changed, 166 deletions(-) delete mode 100644 .github/spelling.yml diff --git a/.github/spelling.yml b/.github/spelling.yml deleted file mode 100644 index 38dece487..000000000 --- a/.github/spelling.yml +++ /dev/null @@ -1,166 +0,0 @@ -name: Check Spelling - -# Comment management is handled through a secondary job, for details see: -# https://github.com/check-spelling/check-spelling/wiki/Feature%3A-Restricted-Permissions -# -# `jobs.comment-push` runs when a push is made to a repository and the `jobs.spelling` job needs to make a comment -# (in odd cases, it might actually run just to collapse a comment, but that's fairly rare) -# it needs `contents: write` in order to add a comment. -# -# `jobs.comment-pr` runs when a pull_request is made to a repository and the `jobs.spelling` job needs to make a comment -# or collapse a comment (in the case where it had previously made a comment and now no longer needs to show a comment) -# it needs `pull-requests: write` in order to manipulate those comments. - -# Updating pull request branches is managed via comment handling. -# For details, see: https://github.com/check-spelling/check-spelling/wiki/Feature:-Update-expect-list -# -# These elements work together to make it happen: -# -# `on.issue_comment` -# This event listens to comments by users asking to update the metadata. -# -# `jobs.update` -# This job runs in response to an issue_comment and will push a new commit -# to update the spelling metadata. -# -# `with.experimental_apply_changes_via_bot` -# Tells the action to support and generate messages that enable it -# to make a commit to update the spelling metadata. -# -# `with.ssh_key` -# In order to trigger workflows when the commit is made, you can provide a -# secret (typically, a write-enabled github deploy key). -# -# For background, see: https://github.com/check-spelling/check-spelling/wiki/Feature:-Update-with-deploy-key - -# Sarif reporting -# -# Access to Sarif reports is generally restricted (by GitHub) to members of the repository. -# -# Requires enabling `security-events: write` -# and configuring the action with `use_sarif: 1` -# -# For information on the feature, see: https://github.com/check-spelling/check-spelling/wiki/Feature:-Sarif-output - -# Minimal workflow structure: -# -# on: -# push: -# ... -# pull_request_target: -# ... -# jobs: -# # you only want the spelling job, all others should be omitted -# spelling: -# # remove `security-events: write` and `use_sarif: 1` -# # remove `experimental_apply_changes_via_bot: 1` -# ... 
otherwise adjust the `with:` as you wish - -on: - push: - branches: - - "**" - tags-ignore: - - "**" - pull_request_target: - branches: - - "**" - types: - - 'opened' - - 'reopened' - - 'synchronize' - issue_comment: - types: - - 'created' - -jobs: - spelling: - name: Check Spelling - permissions: - contents: read - pull-requests: read - actions: read - security-events: write - outputs: - followup: ${{ steps.spelling.outputs.followup }} - runs-on: ubuntu-latest - if: ${{ contains(github.event_name, 'pull_request') || github.event_name == 'push' }} - concurrency: - group: spelling-${{ github.event.pull_request.number || github.ref }} - # note: If you use only_check_changed_files, you do not want cancel-in-progress - cancel-in-progress: true - steps: - - name: check-spelling - id: spelling - uses: check-spelling/check-spelling@main - with: - suppress_push_for_open_pull_request: ${{ github.actor != 'dependabot[bot]' && 1 }} - checkout: true - check_file_names: 1 - spell_check_this: check-spelling/spell-check-this@prerelease - post_comment: 0 - use_magic_file: 1 - report-timing: 1 - warnings: bad-regex,binary-file,deprecated-feature,large-file,limited-references,no-newline-at-eof,noisy-file,non-alpha-in-dictionary,token-is-substring,unexpected-line-ending,whitespace-in-dictionary,minified-file,unsupported-configuration,no-files-to-check - experimental_apply_changes_via_bot: 1 - use_sarif: ${{ (!github.event.pull_request || (github.event.pull_request.head.repo.full_name == github.repository)) && 1 }} - extra_dictionary_limit: 20 - extra_dictionaries: - cspell:software-terms/dict/softwareTerms.txt - - comment-push: - name: Report (Push) - # If your workflow isn't running on push, you can remove this job - runs-on: ubuntu-latest - needs: spelling - permissions: - contents: write - if: (success() || failure()) && needs.spelling.outputs.followup && github.event_name == 'push' - steps: - - name: comment - uses: check-spelling/check-spelling@main - with: - checkout: true - spell_check_this: check-spelling/spell-check-this@prerelease - task: ${{ needs.spelling.outputs.followup }} - - comment-pr: - name: Report (PR) - # If you workflow isn't running on pull_request*, you can remove this job - runs-on: ubuntu-latest - needs: spelling - permissions: - contents: read - pull-requests: write - if: (success() || failure()) && needs.spelling.outputs.followup && contains(github.event_name, 'pull_request') - steps: - - name: comment - uses: check-spelling/check-spelling@main - with: - checkout: true - spell_check_this: check-spelling/spell-check-this@prerelease - task: ${{ needs.spelling.outputs.followup }} - experimental_apply_changes_via_bot: 1 - - update: - name: Update PR - permissions: - contents: write - pull-requests: write - actions: read - runs-on: ubuntu-latest - if: ${{ - github.event_name == 'issue_comment' && - github.event.issue.pull_request && - contains(github.event.comment.body, '@check-spelling-bot apply') - }} - concurrency: - group: spelling-update-${{ github.event.issue.number }} - cancel-in-progress: false - steps: - - name: apply spelling updates - uses: check-spelling/check-spelling@main - with: - experimental_apply_changes_via_bot: 1 - checkout: true - ssh_key: "${{ secrets.CHECK_SPELLING }}" From 8feadb80a26ed3d039cbaecf8a5253990773cd90 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 15:12:43 +0300 Subject: [PATCH 58/67] Updates to docs --- docs/samples/index.md | 55 +- .../python/Anonymizing known values.ipynb | 4 +- docs/samples/python/batch_processing.ipynb | 1049 
+++++++++-------- .../customizing_presidio_analyzer.ipynb | 77 +- docs/samples/python/encrypt_decrypt.ipynb | 505 ++++---- .../python/example_dicom_image_redactor.ipynb | 22 +- docs/samples/python/index.md | 21 - docs/samples/python/presidio_notebook.ipynb | 394 ++++--- mkdocs.yml | 26 +- .../batch_analyzer_engine.py | 17 +- 10 files changed, 1114 insertions(+), 1056 deletions(-) delete mode 100644 docs/samples/python/index.md diff --git a/docs/samples/index.md b/docs/samples/index.md index c46e5fdfa..3d7f462e1 100644 --- a/docs/samples/index.md +++ b/docs/samples/index.md @@ -1,29 +1,30 @@ # Samples -| Topic | Type | Sample | -| :---------- |:--------------------------------------| :---------------------------------------------------------------------------------------------------------------------------------------------- | -| Usage | Python Notebook | [Presidio Basic Usage Notebook](python/presidio_notebook.ipynb) | -| Usage | Python Notebook | [Customizing Presidio Analyzer](python/customizing_presidio_analyzer.ipynb) | -| Usage | Python Notebook | [Analyzing structured / semi-structured data in batch](python/batch_processing.ipynb)| -| Usage | Python Notebook | [Encrypting and Decrypting identified entities](python/encrypt_decrypt.ipynb)| -| Usage | Python Notebook | [Getting the identified entity value using a custom Operator](python/getting_entity_values.ipynb)| -| Usage | Python Notebook | [Anonymizing known values](https://github.com/microsoft/presidio/blob/main/docs/samples/python/Anonymizing%20known%20values.ipynb) -| Usage | Python Notebook | [Redacting text PII from DICOM images](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_dicom_image_redactor.ipynb) -| Usage | Python Notebook | [Using an allow list with image redaction](https://github.com/microsoft/presidio/blob/main/docs/samples/python/image_redaction_allow_list_approach.ipynb) -| Usage | Python Notebook | [Annotating PII in a PDF](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_pdf_annotation.ipynb) -| Usage | Python Notebook | [Integrating with external services](https://github.com/microsoft/presidio/blob/main/docs/samples/python/integrating_with_external_services.ipynb) | -| Usage | Python | [Remote Recognizer](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py) | -| Usage | Python | [Text Analytics as a Remote Recognizer](https://github.com/microsoft/presidio/blob/main/docs/samples/python/text_analytics/index.md) | -| Usage | Python | [Analyze and Anonymize CSV file](https://github.com/microsoft/presidio/blob/main/docs/samples/python/process_csv_file.py) | -| Usage | Python | [Using Flair as an external PII model](https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py)| -| Usage | Python | [Using Transformers as an external PII model](python/transformers_recognizer/index.md)| -| Usage | Python | [Passing a lambda as a Presidio anonymizer using Faker](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_custom_lambda_anonymizer.py)| -| Usage | REST API (postman) | [Presidio as a REST endpoint](docker/index.md)| -| Deployment | App Service | [Presidio with App Service](deployments/app-service/index.md)| -| Deployment | Kubernetes | [Presidio with Kubernetes](deployments/k8s/index.md)| -| Deployment | Spark/Azure Databricks | [Presidio with Spark](deployments/spark/index.md)| -| Deployment | Azure Data Factory with App Service | [ETL for small 
dataset](deployments/data-factory/presidio-data-factory.md#option-1-presidio-as-an-http-rest-endpoint) | -| Deployment | Azure Data Factory with Databricks | [ETL for large datasets](deployments/data-factory/presidio-data-factory.md#option-2-presidio-on-azure-databricks) | -| ADF Pipeline | Azure Data Factory | [Add Presidio as an HTTP service to your Azure Data Factory](deployments/data-factory/presidio-data-factory-template-gallery-http.md) | -| ADF Pipeline | Azure Data Factory | [Add Presidio on Databricks to your Azure Data Factory](deployments/data-factory/presidio-data-factory-template-gallery-databricks.md) | -| Demo | Streamlit | [Create a simple demo app using Streamlit](python/streamlit/index.md) +| Topic | Data Type | Resource | Sample | +| :---------- |:--------------------------------------| :---------------------------------| :---------------------------------------------------------------------------------------------------------------------------------------------- | +| Usage | Text | Python Notebook | [Presidio Basic Usage Notebook](https://github.com/microsoft/presidio/blob/main/docs/samples/python/presidio_notebook.ipynb) | +| Usage | Text | Python Notebook | [Customizing Presidio Analyzer](https://github.com/microsoft/presidio/blob/main/docs/samples/python/customizing_presidio_analyzer.ipynb) | +| Usage | Semi-structured | Python Notebook | [Analyzing structured / semi-structured data in batch](https://github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb)| +| Usage | Text | Python Notebook | [Encrypting and Decrypting identified entities](https://github.com/microsoft/presidio/blob/main/docs/samples/python/encrypt_decrypt.ipynb)| +| Usage | Text | Python Notebook | [Getting the identified entity value using a custom Operator](https://github.com/microsoft/presidio/blob/main/docs/samples/python/getting_entity_values.ipynb)| +| Usage | Text | Python Notebook | [Anonymizing known values](https://github.com/microsoft/presidio/blob/main/docs/samples/python/Anonymizing%20known%20values.ipynb) +| Usage | Images | Python Notebook | [Redacting Text PII from DICOM images](python/example_dicom_image_redactor.ipynb) +| Usage | Images | Python Notebook | [Using an allow list with image redaction](https://github.com/microsoft/presidio/blob/main/docs/samples/python/image_redaction_allow_list_approach.ipynb) +| Usage | PDF | Python Notebook | [Annotating PII in a PDF](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_pdf_annotation.ipynb) +| Usage | Images | Python Notebook | [Plot custom bounding boxes](https://github.com/microsoft/presidio/blob/main/docs/samples/python/plot_custom_bboxes.ipynb) +| Usage | Text | Python Notebook | [Integrating with external services](https://github.com/microsoft/presidio/blob/main/docs/samples/python/integrating_with_external_services.ipynb) | +| Usage | Text | Python file | [Remote Recognizer](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py) | +| Usage | Text | Python file | [Azure AI Language as a Remote Recognizer](python/text_analytics/index.md) | +| Usage | CSV | Python file | [Analyze and Anonymize CSV file](https://github.com/microsoft/presidio/blob/main/docs/samples/python/process_csv_file.py) | +| Usage | Text | Python file | [Using Flair as an external PII model](https://github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py)| +| Usage | Text | Python file | [Using Transformers as an external PII
model](python/transformers_recognizer/index.md)| +| Usage | Text | Python file | [Passing a lambda as a Presidio anonymizer using Faker](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_custom_lambda_anonymizer.py)| +| Usage | | REST API (postman) | [Presidio as a REST endpoint](docker/index.md)| +| Deployment | | App Service | [Presidio with App Service](deployments/app-service/index.md)| +| Deployment | | Kubernetes | [Presidio with Kubernetes](deployments/k8s/index.md)| +| Deployment | | Spark/Azure Databricks | [Presidio with Spark](deployments/spark/index.md)| +| Deployment | | Azure Data Factory with App Service | [ETL for small dataset](deployments/data-factory/presidio-data-factory.md#option-1-presidio-as-an-http-rest-endpoint) | +| Deployment | | Azure Data Factory with Databricks | [ETL for large datasets](deployments/data-factory/presidio-data-factory.md#option-2-presidio-on-azure-databricks) | +| ADF Pipeline | | Azure Data Factory | [Add Presidio as an HTTP service to your Azure Data Factory](deployments/data-factory/presidio-data-factory-template-gallery-http.md) | +| ADF Pipeline | | Azure Data Factory | [Add Presidio on Databricks to your Azure Data Factory](deployments/data-factory/presidio-data-factory-template-gallery-databricks.md) | +| Demo | | Streamlit app | [Create a simple demo app using Streamlit](python/streamlit/index.md) diff --git a/docs/samples/python/Anonymizing known values.ipynb b/docs/samples/python/Anonymizing known values.ipynb index 353efae3d..6f6d09200 100644 --- a/docs/samples/python/Anonymizing known values.ipynb +++ b/docs/samples/python/Anonymizing known values.ipynb @@ -10,7 +10,9 @@ "outputs": [], "source": [ "# download presidio\n", - "!pip install presidio_analyzer presidio_anonymizer" + "!pip install presidio_analyzer presidio_anonymizer\n", + "\n", + "!python -m spacy download en_core_web_lg" ] }, { diff --git a/docs/samples/python/batch_processing.ipynb b/docs/samples/python/batch_processing.ipynb index ad96a1a02..f82612470 100644 --- a/docs/samples/python/batch_processing.ipynb +++ b/docs/samples/python/batch_processing.ipynb @@ -1,522 +1,531 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "bcddce7b", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# download presidio\n", - "!pip install presidio_analyzer presidio_anonymizer", - "!python -m spacy download en_core_web_lg" - ] - }, - { - "cell_type": "markdown", - "id": "3345f1c4", - "metadata": {}, - "source": [ - "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb)" - ] - }, - { - "cell_type": "markdown", - "id": "gothic-trademark", - "metadata": {}, - "source": [ - "# Run Presidio on structured / semi-structured data\n", - "\n", - "This sample shows how Presidio could be potentially extended to handle the anonymization of a table or data frame.\n", - "It introduces methods for the analysis and anonymization of both lists and dicts. 
\n", - "\n", - "Note: this sample input here is a Pandas DataFrame and a JSON file, but it can be used in other scenarios such as querying SQL data or using Spark DataFrames.\n" - ] - }, - { - "cell_type": "markdown", - "id": "roman-allergy", - "metadata": {}, - "source": [ - "### Set up imports" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "extensive-greensboro", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import List, Optional, Dict, Union, Iterator, Iterable\n", - "import collections\n", - "from dataclasses import dataclass\n", - "import pprint\n", - "\n", - "import pandas as pd\n", - "\n", - "from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult\n", - "from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine\n", - "from presidio_anonymizer.entities import EngineResult\n" - ] - }, - { - "cell_type": "markdown", - "id": "fiscal-affair", - "metadata": {}, - "source": [ - "## Example using sample tabular data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "bright-maple", - "metadata": {}, - "outputs": [], - "source": [ - "columns = [\"name phrase\", \"phone number phrase\", \"integer\", \"boolean\" ]\n", - "sample_data = [\n", - " ('Charlie likes this', 'Please call 212-555-1234 after 2pm', 1, True),\n", - " ('You should talk to Mike', 'his number is 978-428-7111', 2, False),\n", - " ('Mary had a little startup', 'Phone number: 202-342-1234', 3, False)\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "russian-proceeding", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name phrasephone number phraseintegerboolean
0Charlie likes thisPlease call 212-555-1234 after 2pm1True
1You should talk to Mikehis number is 978-428-71112False
2Mary had a little startupPhone number: 202-342-12343False
\n", - "
" - ], - "text/plain": [ - " name phrase phone number phrase integer \n", - "0 Charlie likes this Please call 212-555-1234 after 2pm 1 \\\n", - "1 You should talk to Mike his number is 978-428-7111 2 \n", - "2 Mary had a little startup Phone number: 202-342-1234 3 \n", - "\n", - " boolean \n", - "0 True \n", - "1 False \n", - "2 False " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create Pandas DataFrame\n", - "df = pd.DataFrame(sample_data,columns=columns)\n", - "\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "brazilian-punch", - "metadata": {}, - "outputs": [], - "source": [ - "# DataFrame to dict\n", - "df_dict = df.to_dict(orient=\"list\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "fixed-commerce", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'boolean': [True, False, False],\n", - " 'integer': [1, 2, 3],\n", - " 'name phrase': ['Charlie likes this',\n", - " 'You should talk to Mike',\n", - " 'Mary had a little startup'],\n", - " 'phone number phrase': ['Please call 212-555-1234 after 2pm',\n", - " 'his number is 978-428-7111',\n", - " 'Phone number: 202-342-1234']}\n" - ] - } - ], - "source": [ - "pprint.pprint(df_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "verified-spirituality", - "metadata": {}, - "outputs": [], - "source": [ - "analyzer = AnalyzerEngine()\n", - "batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)\n", - "batch_anonymizer = BatchAnonymizerEngine()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "narrative-freeze", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[DictAnalyzerResult(key='name phrase', value=['Charlie likes this', 'You should talk to Mike', 'Mary had a little startup'], recognizer_results=[[type: PERSON, start: 0, end: 7, score: 0.85], [type: PERSON, start: 19, end: 23, score: 0.85], [type: PERSON, start: 0, end: 4, score: 0.85]]),\n", - " DictAnalyzerResult(key='phone number phrase', value=['Please call 212-555-1234 after 2pm', 'his number is 978-428-7111', 'Phone number: 202-342-1234'], recognizer_results=[[type: DATE_TIME, start: 31, end: 34, score: 0.85, type: PHONE_NUMBER, start: 12, end: 24, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75]]),\n", - " DictAnalyzerResult(key='integer', value=[1, 2, 3], recognizer_results=[[], [], []]),\n", - " DictAnalyzerResult(key='boolean', value=[True, False, False], recognizer_results=[[], [], []])]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analyzer_results = batch_analyzer.analyze_dict(df_dict, language=\"en\")\n", - "analyzer_results = list(analyzer_results)\n", - "analyzer_results" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "rural-month", - "metadata": {}, - "outputs": [], - "source": [ - "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "acute-mauritius", - "metadata": {}, - "outputs": [], - "source": [ - "scrubbed_df = pd.DataFrame(anonymizer_results)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "irish-phoenix", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name phrasephone number phraseintegerboolean
0<PERSON> likes thisPlease call <PHONE_NUMBER> after <DATE_TIME>1True
1You should talk to <PERSON>his number is <PHONE_NUMBER>2False
2<PERSON> had a little startupPhone number: <PHONE_NUMBER>3False
\n", - "
" - ], - "text/plain": [ - " name phrase \n", - "0 likes this \\\n", - "1 You should talk to \n", - "2 had a little startup \n", - "\n", - " phone number phrase integer boolean \n", - "0 Please call after 1 True \n", - "1 his number is 2 False \n", - "2 Phone number: 3 False " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scrubbed_df" - ] - }, - { - "cell_type": "markdown", - "id": "1cb4b006", - "metadata": {}, - "source": [ - "## Example using JSON" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "1063019b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", - " 'key_b': {'www.abc.com'},\n", - " 'key_c': 3,\n", - " 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}\n" - ] - } - ], - "source": [ - "nested_dict = {\n", - " \"key_a\": {\"key_a1\": \"My phone number is 212-121-1424\"},\n", - " \"key_b\": {\"www.abc.com\"},\n", - " \"key_c\": 3,\n", - " \"names\": [\"James Bond\", \"Clark Kent\", \"Hakeem Olajuwon\", \"No name here!\"]\n", - "}\n", - "\n", - "pprint.pprint(nested_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "e3c09b4b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'key_a': {'key_a1': 'My phone number is '},\n", - " 'key_b': [''],\n", - " 'key_c': 3,\n", - " 'names': ['', '', '', 'No name here!']}\n" - ] - } - ], - "source": [ - "# Analyze dict\n", - "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\")\n", - "\n", - "# Anonymize dict\n", - "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", - "pprint.pprint(anonymizer_results)" - ] - }, - { - "cell_type": "markdown", - "id": "e593eb11", - "metadata": {}, - "source": [ - "### Ignoring specific keys" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "84b2ef95", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", - " 'key_b': [''],\n", - " 'key_c': 3,\n", - " 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}\n" - ] - } - ], - "source": [ - "keys_to_skip=[\"key_a1\", \"names\"]\n", - "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\", keys_to_skip=keys_to_skip)\n", - "\n", - "# Anonymize dict\n", - "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", - "pprint.pprint(anonymizer_results)" - ] - }, - { - "cell_type": "markdown", - "id": "bd0cde2a", - "metadata": {}, - "source": [ - "### Ignoring nested keys" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "93ed8769", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", - " 'key_b': [''],\n", - " 'key_c': 3,\n", - " 'names': ['', '', '', 'No name here!']}\n" - ] - } - ], - "source": [ - "keys_to_skip = [\"key_a.key_a1\"]\n", - "\n", - "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\", keys_to_skip=keys_to_skip)\n", - "\n", - "# Anonymize dict\n", - "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", - "pprint.pprint(anonymizer_results)" - ] - }, - { - "cell_type": 
"markdown", - "id": "aa0ab530", - "metadata": {}, - "source": [ - "#### **Note!**\n", - "\n", - "JSON files with objects within lists, e.g.:\n", - "```\n", - "{\n", - " \"key\": [\n", - " {\n", - " \"key2\": \"Peter Parker\"\n", - " },\n", - " {\n", - " \"key3\": \"555-1234\"\n", - " }\n", - " ]\n", - "}\n", - "```\n", - "\n", - "Are not yet supported. Consider breaking the JSON to parts if needed." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "bcddce7b", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# download presidio\n", + "!pip install presidio_analyzer presidio_anonymizer\n", + "\n", + "!python -m spacy download en_core_web_lg" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3345f1c4", + "metadata": {}, + "source": [ + "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "gothic-trademark", + "metadata": {}, + "source": [ + "# Run Presidio on structured / semi-structured data\n", + "\n", + "This sample shows how Presidio could be potentially extended to handle the anonymization of a table or data frame.\n", + "It introduces methods for the analysis and anonymization of both lists and dicts. 
\n", + "\n", + "Note: this sample input here is a Pandas DataFrame and a JSON file, but it can be used in other scenarios such as querying SQL data or using Spark DataFrames.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "roman-allergy", + "metadata": {}, + "source": [ + "### Set up imports" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "extensive-greensboro", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List, Optional, Dict, Union, Iterator, Iterable\n", + "import collections\n", + "from dataclasses import dataclass\n", + "import pprint\n", + "\n", + "import pandas as pd\n", + "\n", + "from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult\n", + "from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine\n", + "from presidio_anonymizer.entities import EngineResult\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fiscal-affair", + "metadata": {}, + "source": [ + "## Example using sample tabular data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bright-maple", + "metadata": {}, + "outputs": [], + "source": [ + "columns = [\"name phrase\", \"phone number phrase\", \"integer\", \"boolean\" ]\n", + "sample_data = [\n", + " ('Charlie likes this', 'Please call 212-555-1234 after 2pm', 1, True),\n", + " ('You should talk to Mike', 'his number is 978-428-7111', 2, False),\n", + " ('Mary had a little startup', 'Phone number: 202-342-1234', 3, False)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "russian-proceeding", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
name phrasephone number phraseintegerboolean
0Charlie likes thisPlease call 212-555-1234 after 2pm1True
1You should talk to Mikehis number is 978-428-71112False
2Mary had a little startupPhone number: 202-342-12343False
\n", + "
" + ], + "text/plain": [ + " name phrase phone number phrase integer \n", + "0 Charlie likes this Please call 212-555-1234 after 2pm 1 \\\n", + "1 You should talk to Mike his number is 978-428-7111 2 \n", + "2 Mary had a little startup Phone number: 202-342-1234 3 \n", + "\n", + " boolean \n", + "0 True \n", + "1 False \n", + "2 False " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create Pandas DataFrame\n", + "df = pd.DataFrame(sample_data,columns=columns)\n", + "\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "brazilian-punch", + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame to dict\n", + "df_dict = df.to_dict(orient=\"list\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fixed-commerce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'boolean': [True, False, False],\n", + " 'integer': [1, 2, 3],\n", + " 'name phrase': ['Charlie likes this',\n", + " 'You should talk to Mike',\n", + " 'Mary had a little startup'],\n", + " 'phone number phrase': ['Please call 212-555-1234 after 2pm',\n", + " 'his number is 978-428-7111',\n", + " 'Phone number: 202-342-1234']}\n" + ] + } + ], + "source": [ + "pprint.pprint(df_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "verified-spirituality", + "metadata": {}, + "outputs": [], + "source": [ + "analyzer = AnalyzerEngine()\n", + "batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)\n", + "batch_anonymizer = BatchAnonymizerEngine()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "narrative-freeze", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[DictAnalyzerResult(key='name phrase', value=['Charlie likes this', 'You should talk to Mike', 'Mary had a little startup'], recognizer_results=[[type: PERSON, start: 0, end: 7, score: 0.85], [type: PERSON, start: 19, end: 23, score: 0.85], [type: PERSON, start: 0, end: 4, score: 0.85]]),\n", + " DictAnalyzerResult(key='phone number phrase', value=['Please call 212-555-1234 after 2pm', 'his number is 978-428-7111', 'Phone number: 202-342-1234'], recognizer_results=[[type: DATE_TIME, start: 31, end: 34, score: 0.85, type: PHONE_NUMBER, start: 12, end: 24, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75]]),\n", + " DictAnalyzerResult(key='integer', value=[1, 2, 3], recognizer_results=[[], [], []]),\n", + " DictAnalyzerResult(key='boolean', value=[True, False, False], recognizer_results=[[], [], []])]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyzer_results = batch_analyzer.analyze_dict(df_dict, language=\"en\")\n", + "analyzer_results = list(analyzer_results)\n", + "analyzer_results" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "rural-month", + "metadata": {}, + "outputs": [], + "source": [ + "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "acute-mauritius", + "metadata": {}, + "outputs": [], + "source": [ + "scrubbed_df = pd.DataFrame(anonymizer_results)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "irish-phoenix", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
name phrasephone number phraseintegerboolean
0<PERSON> likes thisPlease call <PHONE_NUMBER> after <DATE_TIME>1True
1You should talk to <PERSON>his number is <PHONE_NUMBER>2False
2<PERSON> had a little startupPhone number: <PHONE_NUMBER>3False
\n", + "
" + ], + "text/plain": [ + " name phrase \n", + "0 likes this \\\n", + "1 You should talk to \n", + "2 had a little startup \n", + "\n", + " phone number phrase integer boolean \n", + "0 Please call after 1 True \n", + "1 his number is 2 False \n", + "2 Phone number: 3 False " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scrubbed_df" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1cb4b006", + "metadata": {}, + "source": [ + "## Example using JSON" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1063019b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", + " 'key_b': {'www.abc.com'},\n", + " 'key_c': 3,\n", + " 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}\n" + ] + } + ], + "source": [ + "nested_dict = {\n", + " \"key_a\": {\"key_a1\": \"My phone number is 212-121-1424\"},\n", + " \"key_b\": {\"www.abc.com\"},\n", + " \"key_c\": 3,\n", + " \"names\": [\"James Bond\", \"Clark Kent\", \"Hakeem Olajuwon\", \"No name here!\"]\n", + "}\n", + "\n", + "pprint.pprint(nested_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e3c09b4b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'key_a': {'key_a1': 'My phone number is '},\n", + " 'key_b': [''],\n", + " 'key_c': 3,\n", + " 'names': ['', '', '', 'No name here!']}\n" + ] + } + ], + "source": [ + "# Analyze dict\n", + "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\")\n", + "\n", + "# Anonymize dict\n", + "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", + "pprint.pprint(anonymizer_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e593eb11", + "metadata": {}, + "source": [ + "### Ignoring specific keys" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "84b2ef95", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", + " 'key_b': [''],\n", + " 'key_c': 3,\n", + " 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}\n" + ] + } + ], + "source": [ + "keys_to_skip=[\"key_a1\", \"names\"]\n", + "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\", keys_to_skip=keys_to_skip)\n", + "\n", + "# Anonymize dict\n", + "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", + "pprint.pprint(anonymizer_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bd0cde2a", + "metadata": {}, + "source": [ + "### Ignoring nested keys" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "93ed8769", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", + " 'key_b': [''],\n", + " 'key_c': 3,\n", + " 'names': ['', '', '', 'No name here!']}\n" + ] + } + ], + "source": [ + "keys_to_skip = [\"key_a.key_a1\"]\n", + "\n", + "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\", keys_to_skip=keys_to_skip)\n", + "\n", + "# Anonymize dict\n", + "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", + 
"pprint.pprint(anonymizer_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "aa0ab530", + "metadata": {}, + "source": [ + "#### **Note!**\n", + "\n", + "JSON files with objects within lists, e.g.:\n", + "```\n", + "{\n", + " \"key\": [\n", + " {\n", + " \"key2\": \"Peter Parker\"\n", + " },\n", + " {\n", + " \"key3\": \"555-1234\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "Are not yet supported. Consider breaking the JSON to parts if needed." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/docs/samples/python/customizing_presidio_analyzer.ipynb b/docs/samples/python/customizing_presidio_analyzer.ipynb index 86c841d67..669fc0a7b 100644 --- a/docs/samples/python/customizing_presidio_analyzer.ipynb +++ b/docs/samples/python/customizing_presidio_analyzer.ipynb @@ -1,25 +1,29 @@ { "cells": [ -{ -"cell_type": "markdown", -"metadata": {}, -"source": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/customizing_presidio_analyzer.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/customizing_presidio_analyzer.ipynb)" -] -}, + ] + }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Customizing the PII analysis process in Microsoft Presidio\n", "\n", "This notebooks covers different customization use cases to:\n", + "\n", "1. Adapt Presidio to detect new types of PII entities\n", - "2. Adapt Presidio to detect PII entities in a new language\n", - "3. Embed new types of detection modules into Presidio, to improve the coverage of the service." + "1. Adapt Presidio to detect PII entities in a new language\n", + "1. Embed new types of detection modules into Presidio, to improve the coverage of the service." ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -27,22 +31,23 @@ "First, let's install presidio using `pip`. 
For detailed documentation, see the [installation docs](https://microsoft.github.io/presidio/installation).\n", "\n", "Install from PyPI:" - ]}, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# download presidio\n", - "!pip install presidio_analyzer presidio_anonymizer", - "!python -m spacy download en_core_web_lg" - ] - + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# download presidio\n", + "!pip install presidio_analyzer presidio_anonymizer\n", + "!python -m spacy download en_core_web_lg" + ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -70,6 +75,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -89,6 +95,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -105,6 +112,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -132,6 +140,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -150,6 +159,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -187,6 +197,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -215,6 +226,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -238,6 +250,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -267,6 +280,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -274,6 +288,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -311,6 +326,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -362,6 +378,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -395,6 +412,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -402,6 +420,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -416,6 +435,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -428,6 +448,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -489,6 +510,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -496,6 +518,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -539,6 +562,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -558,6 +582,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -602,6 +627,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -652,6 +678,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -659,6 +686,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -700,6 +728,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -753,6 +782,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -760,6 +790,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -767,6 +798,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -799,6 +831,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", 
"metadata": {}, "source": [ diff --git a/docs/samples/python/encrypt_decrypt.ipynb b/docs/samples/python/encrypt_decrypt.ipynb index 8e9deff74..003fc1524 100644 --- a/docs/samples/python/encrypt_decrypt.ipynb +++ b/docs/samples/python/encrypt_decrypt.ipynb @@ -1,255 +1,262 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "bcddce7b", - "metadata": { - "scrolled": true + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "bcddce7b", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# download presidio\n", + "!pip install presidio_analyzer presidio_anonymizer\n", + "\n", + "!python -m spacy download en_core_web_lg" + ] }, - "outputs": [], - "source": [ - "# download presidio\n", - "!pip install presidio_analyzer presidio_anonymizer", - "!python -m spacy download en_core_web_lg" - ] + { + "attachments": {}, + "cell_type": "markdown", + "id": "3345f1c4", + "metadata": {}, + "source": [ + "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/encrypt_decrypt.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/encrypt_decrypt.ipynb)" + ] }, { - "cell_type": "markdown", - "id": "3345f1c4", - "metadata": {}, - "source": [ - "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/encrypt_decrypt.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/encrypt_decrypt.ipynb)" - ] + "attachments": {}, + "cell_type": "markdown", + "id": "gothic-trademark", + "metadata": {}, + "source": [ + "# Encrypting and Decrypting identified entities\n", + "\n", + "This sample shows how to use Presidio Anonymizer built-in functionality, to encrypt and decrypt identified entities.\n", + "The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.\n" + ] }, - { - "cell_type": "markdown", - "id": "gothic-trademark", - "metadata": {}, - "source": [ - "# Encrypting and Decrypting identified entities\n", - "\n", - "This sample shows how to use Presidio Anonymizer built-in functionality, to encrypt and decrypt identified entities.\n", - "The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.\n" - ] - }, - { - "cell_type": "markdown", - "id": "roman-allergy", - "metadata": {}, - "source": [ - "### Set up imports" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "extensive-greensboro", - "metadata": {}, - "outputs": [], - "source": [ - "from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine\n", - "from presidio_anonymizer.entities import RecognizerResult, OperatorResult, OperatorConfig\n", - "from presidio_anonymizer.operators import Decrypt" - ] - }, - { - "cell_type": "markdown", - "id": "091be4b6", - "metadata": {}, - "source": [ - "### Define a cryptographic key (for both encryption and decryption)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "50bc451e", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "crypto_key = \"WmZq4t7w!z%C&F)J\"" - ] - }, - { - "cell_type": "markdown", - "id": "metropolitan-atlantic", - "metadata": {}, - "source": [ - "### Presidio Anonymizer: Encrypt" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "medium-ridge", - "metadata": {}, - "outputs": [ - { - "data": { 
- "text/plain": [ - "text: My name is M4lla0kBCzu6SwCONL6Y+ZqsPqhBp1Lhdc3t0FKnUwM=.\n", - "items:\n", - "[\n", - " {'start': 11, 'end': 55, 'entity_type': 'PERSON', 'text': 'M4lla0kBCzu6SwCONL6Y+ZqsPqhBp1Lhdc3t0FKnUwM=', 'operator': 'encrypt'}\n", - "]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "engine = AnonymizerEngine()\n", - "\n", - "# Invoke the anonymize function with the text,\n", - "# analyzer results (potentially coming from presidio-analyzer)\n", - "# and an 'encrypt' operator to get an encrypted anonymization output:\n", - "anonymize_result = engine.anonymize(\n", - " text=\"My name is James Bond\",\n", - " analyzer_results=[\n", - " RecognizerResult(entity_type=\"PERSON\", start=11, end=21, score=0.8),\n", - " ],\n", - " operators={\"PERSON\": OperatorConfig(\"encrypt\", {\"key\": crypto_key})},\n", - ")\n", - "\n", - "anonymize_result" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2f8be6b5", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# Fetch the anonymized text from the result.\n", - "anonymized_text = anonymize_result.text\n", - "\n", - "# Fetch the anonynized entities from the result.\n", - "anonymized_entities = anonymize_result.items" - ] - }, - { - "cell_type": "markdown", - "id": "obvious-fifty", - "metadata": {}, - "source": [ - "### Presidio Anonymizer: Decrypt" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "outstanding-celebration", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "text: My name is James Bond.\n", - "items:\n", - "[\n", - " {'start': 11, 'end': 21, 'entity_type': 'PERSON', 'text': 'James Bond', 'operator': 'decrypt'}\n", - "]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Initialize the engine:\n", - "engine = DeanonymizeEngine()\n", - "\n", - "# Invoke the deanonymize function with the text, anonymizer results\n", - "# and a 'decrypt' operator to get the original text as output.\n", - "deanonymized_result = engine.deanonymize(\n", - " text=anonymized_text,\n", - " entities=anonymized_entities,\n", - " operators={\"DEFAULT\": OperatorConfig(\"decrypt\", {\"key\": crypto_key})},\n", - ")\n", - "\n", - "deanonymized_result" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "9ff6810b", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false + { + "attachments": {}, + "cell_type": "markdown", + "id": "roman-allergy", + "metadata": {}, + "source": [ + "### Set up imports" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "extensive-greensboro", + "metadata": {}, + "outputs": [], + "source": [ + "from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine\n", + "from presidio_anonymizer.entities import RecognizerResult, OperatorResult, OperatorConfig\n", + "from presidio_anonymizer.operators import Decrypt" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "091be4b6", + "metadata": {}, + "source": [ + "### Define a cryptographic key (for both encryption and decryption)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "50bc451e", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "crypto_key = 
\"WmZq4t7w!z%C&F)J\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "metropolitan-atlantic", + "metadata": {}, + "source": [ + "### Presidio Anonymizer: Encrypt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "medium-ridge", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "text: My name is M4lla0kBCzu6SwCONL6Y+ZqsPqhBp1Lhdc3t0FKnUwM=.\n", + "items:\n", + "[\n", + " {'start': 11, 'end': 55, 'entity_type': 'PERSON', 'text': 'M4lla0kBCzu6SwCONL6Y+ZqsPqhBp1Lhdc3t0FKnUwM=', 'operator': 'encrypt'}\n", + "]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "engine = AnonymizerEngine()\n", + "\n", + "# Invoke the anonymize function with the text,\n", + "# analyzer results (potentially coming from presidio-analyzer)\n", + "# and an 'encrypt' operator to get an encrypted anonymization output:\n", + "anonymize_result = engine.anonymize(\n", + " text=\"My name is James Bond\",\n", + " analyzer_results=[\n", + " RecognizerResult(entity_type=\"PERSON\", start=11, end=21, score=0.8),\n", + " ],\n", + " operators={\"PERSON\": OperatorConfig(\"encrypt\", {\"key\": crypto_key})},\n", + ")\n", + "\n", + "anonymize_result" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2f8be6b5", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Fetch the anonymized text from the result.\n", + "anonymized_text = anonymize_result.text\n", + "\n", + "# Fetch the anonynized entities from the result.\n", + "anonymized_entities = anonymize_result.items" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "obvious-fifty", + "metadata": {}, + "source": [ + "### Presidio Anonymizer: Decrypt" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "outstanding-celebration", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "text: My name is James Bond.\n", + "items:\n", + "[\n", + " {'start': 11, 'end': 21, 'entity_type': 'PERSON', 'text': 'James Bond', 'operator': 'decrypt'}\n", + "]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Initialize the engine:\n", + "engine = DeanonymizeEngine()\n", + "\n", + "# Invoke the deanonymize function with the text, anonymizer results\n", + "# and a 'decrypt' operator to get the original text as output.\n", + "deanonymized_result = engine.deanonymize(\n", + " text=anonymized_text,\n", + " entities=anonymized_entities,\n", + " operators={\"DEFAULT\": OperatorConfig(\"decrypt\", {\"key\": crypto_key})},\n", + ")\n", + "\n", + "deanonymized_result" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9ff6810b", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'James Bond'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Alternatively, call the Decrypt operator directly:\n", + "\n", + "# Fetch the encrypted entitiy value from the previous stage\n", + "encrypted_entity_value = anonymize_result.items[0].text\n", + "\n", + "# Restore the original entity value\n", + "Decrypt().operate(text=encrypted_entity_value, params={\"key\": crypto_key})" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'James Bond'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Alternatively, call the Decrypt operator directly:\n", - "\n", - "# Fetch the encrypted entitiy value from the previous stage\n", - "encrypted_entity_value = anonymize_result.items[0].text\n", - "\n", - "# Restore the original entity value\n", - "Decrypt().operate(text=encrypted_entity_value, params={\"key\": crypto_key})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/samples/python/example_dicom_image_redactor.ipynb b/docs/samples/python/example_dicom_image_redactor.ipynb index d057173ef..255223e8b 100644 --- a/docs/samples/python/example_dicom_image_redactor.ipynb +++ b/docs/samples/python/example_dicom_image_redactor.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "3345f1c4", "metadata": {}, @@ -9,12 +10,14 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4c3fbe1a", "metadata": {}, "source": [ "# De-identifying sensitive burnt-in text in DICOM images\n", "This notebook covers how to:\n", + "\n", "1. Redact text Personal Health Information (PHI) present as pixels in DICOM images\n", "2. Visually compare original DICOM images with their redacted versions\n", "\n", @@ -22,6 +25,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "39472f68", "metadata": {}, @@ -44,13 +48,15 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "bfd19f8f", "metadata": {}, "source": [ "## Dataset\n", "Sample DICOM files are available for use in this notebook in `./sample_data`. Copies of the original DICOM data were saved into the folder with permission from the dataset owners. Please see the original dataset information below:\n", - "> Rutherford, M., Mun, S.K., Levine, B., Bennett, W.C., Smith, K., Farmer, P., Jarosz, J., Wagner, U., Farahani, K., Prior, F. (2021). A DICOM dataset for evaluation of medical image de-identification (Pseudo-PHI-DICOM-Data) [Data set]. The Cancer Imaging Archive. DOI: https://doi.org/10.7937/s17z-r072" + "\n", + "> Rutherford, M., Mun, S.K., Levine, B., Bennett, W.C., Smith, K., Farmer, P., Jarosz, J., Wagner, U., Farahani, K., Prior, F. (2021). A DICOM dataset for evaluation of medical image de-identification (Pseudo-PHI-DICOM-Data) [Data set]. The Cancer Imaging Archive. 
DOI: https://doi.org/10.7937/s17z-r072\n" ] }, { @@ -68,6 +74,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "9c629f28", "metadata": {}, @@ -102,6 +109,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "4c977b77", "metadata": {}, @@ -122,6 +130,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "edaebbc2", "metadata": {}, @@ -130,6 +139,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2645dab7", "metadata": {}, @@ -160,6 +170,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "85ccd52e", "metadata": {}, @@ -190,6 +201,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f1ca21eb", "metadata": {}, @@ -220,6 +232,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "17bfb9ff", "metadata": {}, @@ -251,6 +264,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "fa9f4209", "metadata": {}, @@ -291,6 +305,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "b02b8345", "metadata": {}, @@ -318,6 +333,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "9c11ce22", "metadata": {}, @@ -353,6 +369,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "624dca7d", "metadata": {}, @@ -362,6 +379,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f4cf6a33", "metadata": {}, @@ -386,6 +404,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "d733f22f", "metadata": {}, @@ -451,6 +470,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6223ad64", "metadata": {}, diff --git a/docs/samples/python/index.md b/docs/samples/python/index.md deleted file mode 100644 index 7b55550f4..000000000 --- a/docs/samples/python/index.md +++ /dev/null @@ -1,21 +0,0 @@ -# Using Presidio in a Python script - -## Description - -Presidio service can be used as python packages inside python scripts - -## Table of contents - -1. [Simple analysis and anonymization](presidio_notebook.ipynb) -2. [Developing new PII recognizers](customizing_presidio_analyzer.ipynb) -3. [Remote Recognizer](https://github.com/microsoft/presidio/blob/main/docs/samples/python/example_remote_recognizer.py) -4. [Azure Text Analytics Integration](text_analytics/index.md) -5. [Anonymizing known values](Anonymizing%20known%20values.ipynb) -6. [Redacting text PII from DICOM images](example_dicom_image_redactor.ipynb) -7. [Annotating PII in a PDF](example_pdf_annotation.ipynb) -8. [Custom Anonymizer with lambda expression](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/example_custom_lambda_anonymizer.py) -9. [Running Presidio on structured / semi-structured data in batch](batch_processing.ipynb) -10. [Getting the detected text value using a custom operator](getting_entity_values.ipynb) -11. [Creating a simple demo website](streamlit/index.md) -12. [Using Flair as an external PII model](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/flair_recognizer.py) -13. 
[Using Transformers as an external PII model](transformers_recognizer/index.md) diff --git a/docs/samples/python/presidio_notebook.ipynb b/docs/samples/python/presidio_notebook.ipynb index 5e7cae687..d18f4bfe5 100644 --- a/docs/samples/python/presidio_notebook.ipynb +++ b/docs/samples/python/presidio_notebook.ipynb @@ -1,200 +1,206 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# download presidio\n", + "!pip install presidio_analyzer presidio_anonymizer\n", + "\n", + "!python -m spacy download en_core_web_lg" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/presidio_notebook.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/presidio_notebook.ipynb)" + ] }, - "outputs": [], - "source": [ - "# download presidio\n", - "!pip install presidio_analyzer presidio_anonymizer", - "!python -m spacy download en_core_web_lg" - ] + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from presidio_analyzer import AnalyzerEngine, PatternRecognizer\n", + "from presidio_anonymizer import AnonymizerEngine\n", + "from presidio_anonymizer.entities import OperatorConfig\n", + "import json\n", + "from pprint import pprint" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analyze Text for PII Entities\n", + "\n", + "Using Presidio Analyzer, analyze a text to identify PII entities. \n", + "The Presidio analyzer is using pre-defined entity recognizers, and offers the option to create custom recognizers.\n", + "\n", + "The following code sample will:\n", + "\n", + "- Set up the Analyzer engine: load the NLP module (spaCy model by default) and other PII recognizers\n", + "- Call analyzer to get analyzed results for \"PHONE_NUMBER\" entity type\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text_to_anonymize = \"His name is Mr. Jones and his phone number is 212-555-5555\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analyzer = AnalyzerEngine()\n", + "analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=[\"PHONE_NUMBER\"], language='en')\n", + "\n", + "print(analyzer_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Custom PII Entity Recognizers\n", + "\n", + "Presidio Analyzer comes with a pre-defined set of entity recognizers. It also allows adding new recognizers without changing the analyzer base code, **by creating custom recognizers**. 
\n", + "In the following example, we will create two new recognizers of type `PatternRecognizer` to identify titles and pronouns in the analyzed text.\n", + "A `PatternRecognizer` is a PII entity recognizer which uses regular expressions or deny-lists.\n", + "\n", + "The following code sample will:\n", + "- Create custom recognizers\n", + "- Add the new custom recognizers to the analyzer\n", + "- Call analyzer to get results from the new recognizers" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/presidio_notebook.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/presidio_notebook.ipynb)" - ] + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "titles_recognizer = PatternRecognizer(supported_entity=\"TITLE\",\n", + " deny_list=[\"Mr.\",\"Mrs.\",\"Miss\"])\n", + "\n", + "pronoun_recognizer = PatternRecognizer(supported_entity=\"PRONOUN\",\n", + " deny_list=[\"he\", \"He\", \"his\", \"His\", \"she\", \"She\", \"hers\", \"Hers\"])\n", + "\n", + "analyzer.registry.add_recognizer(titles_recognizer)\n", + "analyzer.registry.add_recognizer(pronoun_recognizer)\n", + "\n", + "analyzer_results = analyzer.analyze(text=text_to_anonymize,\n", + " entities=[\"TITLE\", \"PRONOUN\"],\n", + " language=\"en\")\n", + "print(analyzer_results)\n" + ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from presidio_analyzer import AnalyzerEngine, PatternRecognizer\n", - "from presidio_anonymizer import AnonymizerEngine\n", - "from presidio_anonymizer.entities import OperatorConfig\n", - "import json\n", - "from pprint import pprint" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Analyze Text for PII Entities\n", - "\n", - "Using Presidio Analyzer, analyze a text to identify PII entities. \n", - "The Presidio analyzer is using pre-defined entity recognizers, and offers the option to create custom recognizers.\n", - "\n", - "The following code sample will:\n", - "\n", - "- Set up the Analyzer engine: load the NLP module (spaCy model by default) and other PII recognizers\n", - "- Call analyzer to get analyzed results for \"PHONE_NUMBER\" entity type\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "text_to_anonymize = \"His name is Mr. Jones and his phone number is 212-555-5555\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analyzer = AnalyzerEngine()\n", - "analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=[\"PHONE_NUMBER\"], language='en')\n", - "\n", - "print(analyzer_results)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Custom PII Entity Recognizers\n", - "\n", - "Presidio Analyzer comes with a pre-defined set of entity recognizers. It also allows adding new recognizers without changing the analyzer base code, **by creating custom recognizers**. 
\n", - "In the following example, we will create two new recognizers of type `PatternRecognizer` to identify titles and pronouns in the analyzed text.\n", - "A `PatternRecognizer` is a PII entity recognizer which uses regular expressions or deny-lists.\n", - "\n", - "The following code sample will:\n", - "- Create custom recognizers\n", - "- Add the new custom recognizers to the analyzer\n", - "- Call analyzer to get results from the new recognizers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "titles_recognizer = PatternRecognizer(supported_entity=\"TITLE\",\n", - " deny_list=[\"Mr.\",\"Mrs.\",\"Miss\"])\n", - "\n", - "pronoun_recognizer = PatternRecognizer(supported_entity=\"PRONOUN\",\n", - " deny_list=[\"he\", \"He\", \"his\", \"His\", \"she\", \"She\", \"hers\", \"Hers\"])\n", - "\n", - "analyzer.registry.add_recognizer(titles_recognizer)\n", - "analyzer.registry.add_recognizer(pronoun_recognizer)\n", - "\n", - "analyzer_results = analyzer.analyze(text=text_to_anonymize,\n", - " entities=[\"TITLE\", \"PRONOUN\"],\n", - " language=\"en\")\n", - "print(analyzer_results)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Call Presidio Analyzer and get analyzed results with all the configured recognizers - default and new custom recognizers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')\n", - "\n", - "analyzer_results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Anonymize Text with Identified PII Entities\n", - "\n", - "
Presidio Anonymizer iterates over the Presidio Analyzer result and provides anonymization capabilities for the identified text.\n",
    -    "The anonymizer provides five anonymizer types - replace, redact, mask, hash, and encrypt. The default is **replace**.\n",
    -    "\n",
    -    "The following code sample will:\n",
    -    "\n",
    -    "1. Set up the anonymizer engine\n",
    -    "2. Create an anonymizer request - text to anonymize, list of anonymizers to apply and the results from the analyzer request\n",
    -    "3. Anonymize the text\n",
    -    "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "anonymizer = AnonymizerEngine()\n", - "\n", - "anonymized_results = anonymizer.anonymize(\n", - " text=text_to_anonymize,\n", - " analyzer_results=analyzer_results, \n", - " operators={\"DEFAULT\": OperatorConfig(\"replace\", {\"new_value\": \"\"}), \n", - " \"PHONE_NUMBER\": OperatorConfig(\"mask\", {\"type\": \"mask\", \"masking_char\" : \"*\", \"chars_to_mask\" : 12, \"from_end\" : True}),\n", - " \"TITLE\": OperatorConfig(\"redact\", {})}\n", - ")\n", - "\n", - "print(f\"text: {anonymized_results.text}\")\n", - "print(\"detailed response:\")\n", - "\n", - "pprint(json.loads(anonymized_results.to_json()))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "presidio", - "language": "python", - "name": "presidio" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - }, - "metadata": { - "interpreter": { - "hash": "1baa965d5efe3ac65b79dfc60c0d706280b1da80fedb7760faf2759126c4f253" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Call Presidio Analyzer and get analyzed results with all the configured recognizers - default and new custom recognizers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')\n", + "\n", + "analyzer_results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Anonymize Text with Identified PII Entities\n", + "\n", + "
Presidio Anonymizer iterates over the Presidio Analyzer result and provides anonymization capabilities for the identified text.\n",
    +    "The anonymizer provides five anonymizer types - replace, redact, mask, hash, and encrypt. The default is **replace**.\n",
    +    "\n",
    +    "The following code sample will:\n",
    +    "\n",
    +    "1. Set up the anonymizer engine\n",
    +    "2. Create an anonymizer request - text to anonymize, list of anonymizers to apply and the results from the analyzer request\n",
    +    "3. Anonymize the text\n",
    +    "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "anonymizer = AnonymizerEngine()\n", + "\n", + "anonymized_results = anonymizer.anonymize(\n", + " text=text_to_anonymize,\n", + " analyzer_results=analyzer_results, \n", + " operators={\"DEFAULT\": OperatorConfig(\"replace\", {\"new_value\": \"\"}), \n", + " \"PHONE_NUMBER\": OperatorConfig(\"mask\", {\"type\": \"mask\", \"masking_char\" : \"*\", \"chars_to_mask\" : 12, \"from_end\" : True}),\n", + " \"TITLE\": OperatorConfig(\"redact\", {})}\n", + ")\n", + "\n", + "print(f\"text: {anonymized_results.text}\")\n", + "print(\"detailed response:\")\n", + "\n", + "pprint(json.loads(anonymized_results.to_json()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "presidio", + "language": "python", + "name": "presidio" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + }, + "metadata": { + "interpreter": { + "hash": "1baa965d5efe3ac65b79dfc60c0d706280b1da80fedb7760faf2759126c4f253" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/mkdocs.yml b/mkdocs.yml index 945d62810..3fe12f9d0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -46,21 +46,21 @@ nav: - Handling images: - Home: image-redactor/index.md - Evaluating DICOM redaction: image-redactor/evaluating_dicom_redaction.md - - Development and design: - - Design: design.md - - Setting up a development environment: development.md - - Build and release process: build_release.md - - Changes from V1 to V2: presidio_V2.md - - Python API reference: - - Home: api.md - - Presidio Analyzer Python API: api/analyzer_python.md - - Presidio Anonymizer Python API: api/anonymizer_python.md - - Presidio Image Redactor Python API: api/image_redactor_python.md - - REST API reference: https://microsoft.github.io/presidio/api-docs/api-docs.html" target="_blank - + - Samples: samples/index.md - General: - Supported entities: supported_entities.md - - Samples: samples/index.md + - Development and design: + - Design: design.md + - Setting up a development environment: development.md + - Build and release process: build_release.md + - Changes from V1 to V2: presidio_V2.md + - Python API reference: + - Home: api.md + - Presidio Analyzer Python API: api/analyzer_python.md + - Presidio Anonymizer Python API: api/anonymizer_python.md + - Presidio Image Redactor Python API: api/image_redactor_python.md + - REST API reference: https://microsoft.github.io/presidio/api-docs/api-docs.html" target="_blank + - Community: community.md - FAQ: faq.md - Demo: https://huggingface.co/spaces/presidio/presidio_demo" target="_blank diff --git a/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py b/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py index 4a428595d..ceb2e7eef 100644 --- a/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py +++ b/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py @@ -8,17 +8,18 @@ class BatchAnalyzerEngine: - """ - Batch analysis of documents (tables, lists, dicts). - Wrapper class to run Presidio Analyzer Engine on multiple values, - either lists/iterators of strings, or dictionaries. 
- - :param: analyzer_engine: AnalyzerEngine instance to use - for handling the values in those collections. - """ def __init__(self, analyzer_engine: Optional[AnalyzerEngine] = None): + """ + Batch analysis of documents (tables, lists, dicts). + + Wrapper class to run Presidio Analyzer Engine on multiple values, + either lists/iterators of strings, or dictionaries. + + :param: analyzer_engine: AnalyzerEngine instance to use + for handling the values in those collections. + """ self.analyzer_engine = analyzer_engine if not analyzer_engine: From b634221ff581563faa4772ce499fc9f243437e80 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 15:15:43 +0300 Subject: [PATCH 59/67] removed "attachments" --- docs/samples/python/batch_processing.ipynb | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/docs/samples/python/batch_processing.ipynb b/docs/samples/python/batch_processing.ipynb index f82612470..9cb29a14b 100644 --- a/docs/samples/python/batch_processing.ipynb +++ b/docs/samples/python/batch_processing.ipynb @@ -15,8 +15,7 @@ "!python -m spacy download en_core_web_lg" ] }, - { - "attachments": {}, + { "cell_type": "markdown", "id": "3345f1c4", "metadata": {}, @@ -25,7 +24,7 @@ ] }, { - "attachments": {}, + "cell_type": "markdown", "id": "gothic-trademark", "metadata": {}, @@ -39,7 +38,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "roman-allergy", "metadata": {}, @@ -67,7 +65,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "fiscal-affair", "metadata": {}, @@ -348,7 +345,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "1cb4b006", "metadata": {}, @@ -411,7 +407,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "e593eb11", "metadata": {}, @@ -446,7 +441,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "bd0cde2a", "metadata": {}, @@ -482,7 +476,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "aa0ab530", "metadata": {}, From 1d800116ffd1e5c1d3b9ae9b6be0cdb89c7a2e3a Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 15:34:02 +0300 Subject: [PATCH 60/67] Update batch_processing.ipynb Reverted notebook change --- docs/samples/python/batch_processing.ipynb | 1043 ++++++++++---------- 1 file changed, 521 insertions(+), 522 deletions(-) diff --git a/docs/samples/python/batch_processing.ipynb b/docs/samples/python/batch_processing.ipynb index 9cb29a14b..d27a72c47 100644 --- a/docs/samples/python/batch_processing.ipynb +++ b/docs/samples/python/batch_processing.ipynb @@ -1,524 +1,523 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "bcddce7b", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# download presidio\n", - "!pip install presidio_analyzer presidio_anonymizer\n", - "\n", - "!python -m spacy download en_core_web_lg" - ] - }, - { - "cell_type": "markdown", - "id": "3345f1c4", - "metadata": {}, - "source": [ - "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb)" - ] - }, - { - - "cell_type": "markdown", - "id": "gothic-trademark", - "metadata": {}, - "source": [ - "# Run Presidio on structured / semi-structured data\n", - "\n", - "This sample shows how Presidio could be potentially extended to handle the anonymization of a table or data frame.\n", - "It introduces methods for the analysis and anonymization of both lists and dicts. 
\n", - "\n", - "Note: this sample input here is a Pandas DataFrame and a JSON file, but it can be used in other scenarios such as querying SQL data or using Spark DataFrames.\n" - ] - }, - { - "cell_type": "markdown", - "id": "roman-allergy", - "metadata": {}, - "source": [ - "### Set up imports" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "extensive-greensboro", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import List, Optional, Dict, Union, Iterator, Iterable\n", - "import collections\n", - "from dataclasses import dataclass\n", - "import pprint\n", - "\n", - "import pandas as pd\n", - "\n", - "from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult\n", - "from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine\n", - "from presidio_anonymizer.entities import EngineResult\n" - ] - }, - { - "cell_type": "markdown", - "id": "fiscal-affair", - "metadata": {}, - "source": [ - "## Example using sample tabular data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "bright-maple", - "metadata": {}, - "outputs": [], - "source": [ - "columns = [\"name phrase\", \"phone number phrase\", \"integer\", \"boolean\" ]\n", - "sample_data = [\n", - " ('Charlie likes this', 'Please call 212-555-1234 after 2pm', 1, True),\n", - " ('You should talk to Mike', 'his number is 978-428-7111', 2, False),\n", - " ('Mary had a little startup', 'Phone number: 202-342-1234', 3, False)\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "russian-proceeding", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name phrasephone number phraseintegerboolean
0Charlie likes thisPlease call 212-555-1234 after 2pm1True
1You should talk to Mikehis number is 978-428-71112False
2Mary had a little startupPhone number: 202-342-12343False
\n", - "
" - ], - "text/plain": [ - " name phrase phone number phrase integer \n", - "0 Charlie likes this Please call 212-555-1234 after 2pm 1 \\\n", - "1 You should talk to Mike his number is 978-428-7111 2 \n", - "2 Mary had a little startup Phone number: 202-342-1234 3 \n", - "\n", - " boolean \n", - "0 True \n", - "1 False \n", - "2 False " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create Pandas DataFrame\n", - "df = pd.DataFrame(sample_data,columns=columns)\n", - "\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "brazilian-punch", - "metadata": {}, - "outputs": [], - "source": [ - "# DataFrame to dict\n", - "df_dict = df.to_dict(orient=\"list\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "fixed-commerce", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'boolean': [True, False, False],\n", - " 'integer': [1, 2, 3],\n", - " 'name phrase': ['Charlie likes this',\n", - " 'You should talk to Mike',\n", - " 'Mary had a little startup'],\n", - " 'phone number phrase': ['Please call 212-555-1234 after 2pm',\n", - " 'his number is 978-428-7111',\n", - " 'Phone number: 202-342-1234']}\n" - ] - } - ], - "source": [ - "pprint.pprint(df_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "verified-spirituality", - "metadata": {}, - "outputs": [], - "source": [ - "analyzer = AnalyzerEngine()\n", - "batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)\n", - "batch_anonymizer = BatchAnonymizerEngine()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "narrative-freeze", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[DictAnalyzerResult(key='name phrase', value=['Charlie likes this', 'You should talk to Mike', 'Mary had a little startup'], recognizer_results=[[type: PERSON, start: 0, end: 7, score: 0.85], [type: PERSON, start: 19, end: 23, score: 0.85], [type: PERSON, start: 0, end: 4, score: 0.85]]),\n", - " DictAnalyzerResult(key='phone number phrase', value=['Please call 212-555-1234 after 2pm', 'his number is 978-428-7111', 'Phone number: 202-342-1234'], recognizer_results=[[type: DATE_TIME, start: 31, end: 34, score: 0.85, type: PHONE_NUMBER, start: 12, end: 24, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75]]),\n", - " DictAnalyzerResult(key='integer', value=[1, 2, 3], recognizer_results=[[], [], []]),\n", - " DictAnalyzerResult(key='boolean', value=[True, False, False], recognizer_results=[[], [], []])]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "analyzer_results = batch_analyzer.analyze_dict(df_dict, language=\"en\")\n", - "analyzer_results = list(analyzer_results)\n", - "analyzer_results" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "rural-month", - "metadata": {}, - "outputs": [], - "source": [ - "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "acute-mauritius", - "metadata": {}, - "outputs": [], - "source": [ - "scrubbed_df = pd.DataFrame(anonymizer_results)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "irish-phoenix", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
name phrasephone number phraseintegerboolean
0<PERSON> likes thisPlease call <PHONE_NUMBER> after <DATE_TIME>1True
1You should talk to <PERSON>his number is <PHONE_NUMBER>2False
2<PERSON> had a little startupPhone number: <PHONE_NUMBER>3False
\n", - "
" - ], - "text/plain": [ - " name phrase \n", - "0 likes this \\\n", - "1 You should talk to \n", - "2 had a little startup \n", - "\n", - " phone number phrase integer boolean \n", - "0 Please call after 1 True \n", - "1 his number is 2 False \n", - "2 Phone number: 3 False " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scrubbed_df" - ] - }, - { - "cell_type": "markdown", - "id": "1cb4b006", - "metadata": {}, - "source": [ - "## Example using JSON" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "1063019b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", - " 'key_b': {'www.abc.com'},\n", - " 'key_c': 3,\n", - " 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}\n" - ] - } - ], - "source": [ - "nested_dict = {\n", - " \"key_a\": {\"key_a1\": \"My phone number is 212-121-1424\"},\n", - " \"key_b\": {\"www.abc.com\"},\n", - " \"key_c\": 3,\n", - " \"names\": [\"James Bond\", \"Clark Kent\", \"Hakeem Olajuwon\", \"No name here!\"]\n", - "}\n", - "\n", - "pprint.pprint(nested_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "e3c09b4b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'key_a': {'key_a1': 'My phone number is '},\n", - " 'key_b': [''],\n", - " 'key_c': 3,\n", - " 'names': ['', '', '', 'No name here!']}\n" - ] - } - ], - "source": [ - "# Analyze dict\n", - "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\")\n", - "\n", - "# Anonymize dict\n", - "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", - "pprint.pprint(anonymizer_results)" - ] - }, - { - "cell_type": "markdown", - "id": "e593eb11", - "metadata": {}, - "source": [ - "### Ignoring specific keys" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "84b2ef95", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", - " 'key_b': [''],\n", - " 'key_c': 3,\n", - " 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}\n" - ] - } - ], - "source": [ - "keys_to_skip=[\"key_a1\", \"names\"]\n", - "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\", keys_to_skip=keys_to_skip)\n", - "\n", - "# Anonymize dict\n", - "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", - "pprint.pprint(anonymizer_results)" - ] - }, - { - "cell_type": "markdown", - "id": "bd0cde2a", - "metadata": {}, - "source": [ - "### Ignoring nested keys" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "93ed8769", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", - " 'key_b': [''],\n", - " 'key_c': 3,\n", - " 'names': ['', '', '', 'No name here!']}\n" - ] - } - ], - "source": [ - "keys_to_skip = [\"key_a.key_a1\"]\n", - "\n", - "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\", keys_to_skip=keys_to_skip)\n", - "\n", - "# Anonymize dict\n", - "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", - "pprint.pprint(anonymizer_results)" - ] - }, - { - "cell_type": 
"markdown", - "id": "aa0ab530", - "metadata": {}, - "source": [ - "#### **Note!**\n", - "\n", - "JSON files with objects within lists, e.g.:\n", - "```\n", - "{\n", - " \"key\": [\n", - " {\n", - " \"key2\": \"Peter Parker\"\n", - " },\n", - " {\n", - " \"key3\": \"555-1234\"\n", - " }\n", - " ]\n", - "}\n", - "```\n", - "\n", - "Are not yet supported. Consider breaking the JSON to parts if needed." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "bcddce7b", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# download presidio\n", + "!pip install presidio_analyzer presidio_anonymizer", + "", + "!python -m spacy download en_core_web_lg" + ] + }, + { + "cell_type": "markdown", + "id": "3345f1c4", + "metadata": {}, + "source": [ + "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb)" + ] + }, + { + "cell_type": "markdown", + "id": "gothic-trademark", + "metadata": {}, + "source": [ + "# Run Presidio on structured / semi-structured data\n", + "\n", + "This sample shows how Presidio could be potentially extended to handle the anonymization of a table or data frame.\n", + "It introduces methods for the analysis and anonymization of both lists and dicts. \n", + "\n", + "Note: this sample input here is a Pandas DataFrame and a JSON file, but it can be used in other scenarios such as querying SQL data or using Spark DataFrames.\n" + ] + }, + { + "cell_type": "markdown", + "id": "roman-allergy", + "metadata": {}, + "source": [ + "### Set up imports" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "extensive-greensboro", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List, Optional, Dict, Union, Iterator, Iterable\n", + "import collections\n", + "from dataclasses import dataclass\n", + "import pprint\n", + "\n", + "import pandas as pd\n", + "\n", + "from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult\n", + "from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine\n", + "from presidio_anonymizer.entities import EngineResult\n" + ] + }, + { + "cell_type": "markdown", + "id": "fiscal-affair", + "metadata": {}, + "source": [ + "## Example using sample tabular data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bright-maple", + "metadata": {}, + "outputs": [], + "source": [ + "columns = [\"name phrase\", \"phone number phrase\", \"integer\", \"boolean\" ]\n", + "sample_data = [\n", + " ('Charlie likes this', 'Please call 212-555-1234 after 2pm', 1, True),\n", + " ('You should talk to Mike', 'his number is 978-428-7111', 2, False),\n", + " ('Mary had a little startup', 'Phone number: 202-342-1234', 3, False)\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "russian-proceeding", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\"><th></th><th>name phrase</th><th>phone number phrase</th><th>integer</th><th>boolean</th></tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr><th>0</th><td>Charlie likes this</td><td>Please call 212-555-1234 after 2pm</td><td>1</td><td>True</td></tr>\n",
+       "    <tr><th>1</th><td>You should talk to Mike</td><td>his number is 978-428-7111</td><td>2</td><td>False</td></tr>\n",
+       "    <tr><th>2</th><td>Mary had a little startup</td><td>Phone number: 202-342-1234</td><td>3</td><td>False</td></tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>
" + ], + "text/plain": [ + " name phrase phone number phrase integer \n", + "0 Charlie likes this Please call 212-555-1234 after 2pm 1 \\\n", + "1 You should talk to Mike his number is 978-428-7111 2 \n", + "2 Mary had a little startup Phone number: 202-342-1234 3 \n", + "\n", + " boolean \n", + "0 True \n", + "1 False \n", + "2 False " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create Pandas DataFrame\n", + "df = pd.DataFrame(sample_data,columns=columns)\n", + "\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "brazilian-punch", + "metadata": {}, + "outputs": [], + "source": [ + "# DataFrame to dict\n", + "df_dict = df.to_dict(orient=\"list\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fixed-commerce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'boolean': [True, False, False],\n", + " 'integer': [1, 2, 3],\n", + " 'name phrase': ['Charlie likes this',\n", + " 'You should talk to Mike',\n", + " 'Mary had a little startup'],\n", + " 'phone number phrase': ['Please call 212-555-1234 after 2pm',\n", + " 'his number is 978-428-7111',\n", + " 'Phone number: 202-342-1234']}\n" + ] + } + ], + "source": [ + "pprint.pprint(df_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "verified-spirituality", + "metadata": {}, + "outputs": [], + "source": [ + "analyzer = AnalyzerEngine()\n", + "batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)\n", + "batch_anonymizer = BatchAnonymizerEngine()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "narrative-freeze", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[DictAnalyzerResult(key='name phrase', value=['Charlie likes this', 'You should talk to Mike', 'Mary had a little startup'], recognizer_results=[[type: PERSON, start: 0, end: 7, score: 0.85], [type: PERSON, start: 19, end: 23, score: 0.85], [type: PERSON, start: 0, end: 4, score: 0.85]]),\n", + " DictAnalyzerResult(key='phone number phrase', value=['Please call 212-555-1234 after 2pm', 'his number is 978-428-7111', 'Phone number: 202-342-1234'], recognizer_results=[[type: DATE_TIME, start: 31, end: 34, score: 0.85, type: PHONE_NUMBER, start: 12, end: 24, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75]]),\n", + " DictAnalyzerResult(key='integer', value=[1, 2, 3], recognizer_results=[[], [], []]),\n", + " DictAnalyzerResult(key='boolean', value=[True, False, False], recognizer_results=[[], [], []])]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyzer_results = batch_analyzer.analyze_dict(df_dict, language=\"en\")\n", + "analyzer_results = list(analyzer_results)\n", + "analyzer_results" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "rural-month", + "metadata": {}, + "outputs": [], + "source": [ + "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "acute-mauritius", + "metadata": {}, + "outputs": [], + "source": [ + "scrubbed_df = pd.DataFrame(anonymizer_results)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "irish-phoenix", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\"><th></th><th>name phrase</th><th>phone number phrase</th><th>integer</th><th>boolean</th></tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr><th>0</th><td>&lt;PERSON&gt; likes this</td><td>Please call &lt;PHONE_NUMBER&gt; after &lt;DATE_TIME&gt;</td><td>1</td><td>True</td></tr>\n",
+       "    <tr><th>1</th><td>You should talk to &lt;PERSON&gt;</td><td>his number is &lt;PHONE_NUMBER&gt;</td><td>2</td><td>False</td></tr>\n",
+       "    <tr><th>2</th><td>&lt;PERSON&gt; had a little startup</td><td>Phone number: &lt;PHONE_NUMBER&gt;</td><td>3</td><td>False</td></tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>
" + ], + "text/plain": [ + " name phrase \n", + "0 likes this \\\n", + "1 You should talk to \n", + "2 had a little startup \n", + "\n", + " phone number phrase integer boolean \n", + "0 Please call after 1 True \n", + "1 his number is 2 False \n", + "2 Phone number: 3 False " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scrubbed_df" + ] + }, + { + "cell_type": "markdown", + "id": "1cb4b006", + "metadata": {}, + "source": [ + "## Example using JSON" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1063019b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", + " 'key_b': {'www.abc.com'},\n", + " 'key_c': 3,\n", + " 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}\n" + ] + } + ], + "source": [ + "nested_dict = {\n", + " \"key_a\": {\"key_a1\": \"My phone number is 212-121-1424\"},\n", + " \"key_b\": {\"www.abc.com\"},\n", + " \"key_c\": 3,\n", + " \"names\": [\"James Bond\", \"Clark Kent\", \"Hakeem Olajuwon\", \"No name here!\"]\n", + "}\n", + "\n", + "pprint.pprint(nested_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e3c09b4b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'key_a': {'key_a1': 'My phone number is '},\n", + " 'key_b': [''],\n", + " 'key_c': 3,\n", + " 'names': ['', '', '', 'No name here!']}\n" + ] + } + ], + "source": [ + "# Analyze dict\n", + "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\")\n", + "\n", + "# Anonymize dict\n", + "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", + "pprint.pprint(anonymizer_results)" + ] + }, + { + "cell_type": "markdown", + "id": "e593eb11", + "metadata": {}, + "source": [ + "### Ignoring specific keys" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "84b2ef95", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", + " 'key_b': [''],\n", + " 'key_c': 3,\n", + " 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}\n" + ] + } + ], + "source": [ + "keys_to_skip=[\"key_a1\", \"names\"]\n", + "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\", keys_to_skip=keys_to_skip)\n", + "\n", + "# Anonymize dict\n", + "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", + "pprint.pprint(anonymizer_results)" + ] + }, + { + "cell_type": "markdown", + "id": "bd0cde2a", + "metadata": {}, + "source": [ + "### Ignoring nested keys" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "93ed8769", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n", + " 'key_b': [''],\n", + " 'key_c': 3,\n", + " 'names': ['', '', '', 'No name here!']}\n" + ] + } + ], + "source": [ + "keys_to_skip = [\"key_a.key_a1\"]\n", + "\n", + "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\", keys_to_skip=keys_to_skip)\n", + "\n", + "# Anonymize dict\n", + "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n", + "pprint.pprint(anonymizer_results)" + ] + }, + { + "cell_type": 
"markdown", + "id": "aa0ab530", + "metadata": {}, + "source": [ + "#### **Note!**\n", + "\n", + "JSON files with objects within lists, e.g.:\n", + "```\n", + "{\n", + " \"key\": [\n", + " {\n", + " \"key2\": \"Peter Parker\"\n", + " },\n", + " {\n", + " \"key3\": \"555-1234\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "Are not yet supported. Consider breaking the JSON to parts if needed." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } From 4cc77feee91c0b43d777623c21f147f132ba3c80 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 15:34:47 +0300 Subject: [PATCH 61/67] Update batch_processing.ipynb added line between pip and spacy --- docs/samples/python/batch_processing.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/samples/python/batch_processing.ipynb b/docs/samples/python/batch_processing.ipynb index d27a72c47..ab836f031 100644 --- a/docs/samples/python/batch_processing.ipynb +++ b/docs/samples/python/batch_processing.ipynb @@ -10,8 +10,7 @@ "outputs": [], "source": [ "# download presidio\n", - "!pip install presidio_analyzer presidio_anonymizer", - "", + "!pip install presidio_analyzer presidio_anonymizer\n", "!python -m spacy download en_core_web_lg" ] }, From 5ab5c8d79c78476a1bc3b6c5194c69b2af1cf1c4 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 15:37:03 +0300 Subject: [PATCH 62/67] Added line between pip and spacy From 0cb1d9e5d6d7158ad5bfb5baf2072e5ffe7fc6a0 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 15:38:43 +0300 Subject: [PATCH 63/67] fixed markdown in notebook --- .../customizing_presidio_analyzer.ipynb | 76 ++++++------------- 1 file changed, 22 insertions(+), 54 deletions(-) diff --git a/docs/samples/python/customizing_presidio_analyzer.ipynb b/docs/samples/python/customizing_presidio_analyzer.ipynb index 669fc0a7b..09173d7b1 100644 --- a/docs/samples/python/customizing_presidio_analyzer.ipynb +++ b/docs/samples/python/customizing_presidio_analyzer.ipynb @@ -1,15 +1,13 @@ { "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ +{ +"cell_type": "markdown", +"metadata": {}, +"source": [ "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/customizing_presidio_analyzer.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/customizing_presidio_analyzer.ipynb)" - ] - }, +] +}, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -18,12 +16,11 @@ "This notebooks covers different customization use cases to:\n", "\n", "1. Adapt Presidio to detect new types of PII entities\n", - "1. Adapt Presidio to detect PII entities in a new language\n", - "1. Embed new types of detection modules into Presidio, to improve the coverage of the service." + "2. Adapt Presidio to detect PII entities in a new language\n", + "3. Embed new types of detection modules into Presidio, to improve the coverage of the service." ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -31,23 +28,22 @@ "First, let's install presidio using `pip`. 
For detailed documentation, see the [installation docs](https://microsoft.github.io/presidio/installation).\n", "\n", "Install from PyPI:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# download presidio\n", - "!pip install presidio_analyzer presidio_anonymizer\n", - "!python -m spacy download en_core_web_lg" - ] + ]}, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# download presidio\n", + "!pip install presidio_analyzer presidio_anonymizer\n", + "!python -m spacy download en_core_web_lg" + ] + }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -75,7 +71,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -95,7 +90,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -112,7 +106,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -140,7 +133,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -159,7 +151,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -197,7 +188,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -226,7 +216,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -250,7 +239,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -280,7 +268,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -288,7 +275,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -326,7 +312,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -378,7 +363,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -412,7 +396,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -420,7 +403,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -435,7 +417,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -448,7 +429,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -510,7 +490,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -518,7 +497,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -562,7 +540,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -582,7 +559,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -627,7 +603,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -678,7 +653,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -686,7 +660,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -728,7 +701,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -782,7 +754,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -790,7 +761,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -798,7 +768,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -831,7 +800,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", 
"metadata": {}, "source": [ From b8955e3903bdb57fc8be8713c5cd3321189c68a1 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 15:39:30 +0300 Subject: [PATCH 64/67] added line between pip and spacy --- docs/samples/python/encrypt_decrypt.ipynb | 503 +++++++++++----------- 1 file changed, 248 insertions(+), 255 deletions(-) diff --git a/docs/samples/python/encrypt_decrypt.ipynb b/docs/samples/python/encrypt_decrypt.ipynb index 003fc1524..a6a09b655 100644 --- a/docs/samples/python/encrypt_decrypt.ipynb +++ b/docs/samples/python/encrypt_decrypt.ipynb @@ -1,262 +1,255 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "bcddce7b", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# download presidio\n", - "!pip install presidio_analyzer presidio_anonymizer\n", - "\n", - "!python -m spacy download en_core_web_lg" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3345f1c4", - "metadata": {}, - "source": [ - "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/encrypt_decrypt.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/encrypt_decrypt.ipynb)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "gothic-trademark", - "metadata": {}, - "source": [ - "# Encrypting and Decrypting identified entities\n", - "\n", - "This sample shows how to use Presidio Anonymizer built-in functionality, to encrypt and decrypt identified entities.\n", - "The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "roman-allergy", - "metadata": {}, - "source": [ - "### Set up imports" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "extensive-greensboro", - "metadata": {}, - "outputs": [], - "source": [ - "from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine\n", - "from presidio_anonymizer.entities import RecognizerResult, OperatorResult, OperatorConfig\n", - "from presidio_anonymizer.operators import Decrypt" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "091be4b6", - "metadata": {}, - "source": [ - "### Define a cryptographic key (for both encryption and decryption)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "50bc451e", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "crypto_key = \"WmZq4t7w!z%C&F)J\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "metropolitan-atlantic", - "metadata": {}, - "source": [ - "### Presidio Anonymizer: Encrypt" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "medium-ridge", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "text: My name is M4lla0kBCzu6SwCONL6Y+ZqsPqhBp1Lhdc3t0FKnUwM=.\n", - "items:\n", - "[\n", - " {'start': 11, 'end': 55, 'entity_type': 'PERSON', 'text': 'M4lla0kBCzu6SwCONL6Y+ZqsPqhBp1Lhdc3t0FKnUwM=', 'operator': 'encrypt'}\n", - "]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "engine = AnonymizerEngine()\n", - "\n", - "# Invoke the anonymize function with the text,\n", - "# analyzer results (potentially coming from presidio-analyzer)\n", - "# and an 'encrypt' operator to get an encrypted anonymization output:\n", - "anonymize_result = 
engine.anonymize(\n", - " text=\"My name is James Bond\",\n", - " analyzer_results=[\n", - " RecognizerResult(entity_type=\"PERSON\", start=11, end=21, score=0.8),\n", - " ],\n", - " operators={\"PERSON\": OperatorConfig(\"encrypt\", {\"key\": crypto_key})},\n", - ")\n", - "\n", - "anonymize_result" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2f8be6b5", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# Fetch the anonymized text from the result.\n", - "anonymized_text = anonymize_result.text\n", - "\n", - "# Fetch the anonynized entities from the result.\n", - "anonymized_entities = anonymize_result.items" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "obvious-fifty", - "metadata": {}, - "source": [ - "### Presidio Anonymizer: Decrypt" - ] + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "bcddce7b", + "metadata": { + "scrolled": true }, - { - "cell_type": "code", - "execution_count": 8, - "id": "outstanding-celebration", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "text: My name is James Bond.\n", - "items:\n", - "[\n", - " {'start': 11, 'end': 21, 'entity_type': 'PERSON', 'text': 'James Bond', 'operator': 'decrypt'}\n", - "]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Initialize the engine:\n", - "engine = DeanonymizeEngine()\n", - "\n", - "# Invoke the deanonymize function with the text, anonymizer results\n", - "# and a 'decrypt' operator to get the original text as output.\n", - "deanonymized_result = engine.deanonymize(\n", - " text=anonymized_text,\n", - " entities=anonymized_entities,\n", - " operators={\"DEFAULT\": OperatorConfig(\"decrypt\", {\"key\": crypto_key})},\n", - ")\n", - "\n", - "deanonymized_result" - ] + "outputs": [], + "source": [ + "# download presidio\n", + "!pip install presidio_analyzer presidio_anonymizer\n", + "!python -m spacy download en_core_web_lg" + ] }, { - "cell_type": "code", - "execution_count": 9, - "id": "9ff6810b", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'James Bond'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Alternatively, call the Decrypt operator directly:\n", - "\n", - "# Fetch the encrypted entitiy value from the previous stage\n", - "encrypted_entity_value = anonymize_result.items[0].text\n", - "\n", - "# Restore the original entity value\n", - "Decrypt().operate(text=encrypted_entity_value, params={\"key\": crypto_key})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" + "cell_type": "markdown", + "id": "3345f1c4", + "metadata": {}, + "source": [ + "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/encrypt_decrypt.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/encrypt_decrypt.ipynb)" + ] }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - } + { + "cell_type": 
"markdown", + "id": "gothic-trademark", + "metadata": {}, + "source": [ + "# Encrypting and Decrypting identified entities\n", + "\n", + "This sample shows how to use Presidio Anonymizer built-in functionality, to encrypt and decrypt identified entities.\n", + "The encryption is using AES cypher in CBC mode and requires a cryptographic key as an input for both the encryption and the decryption.\n" + ] + }, + { + "cell_type": "markdown", + "id": "roman-allergy", + "metadata": {}, + "source": [ + "### Set up imports" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "extensive-greensboro", + "metadata": {}, + "outputs": [], + "source": [ + "from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine\n", + "from presidio_anonymizer.entities import RecognizerResult, OperatorResult, OperatorConfig\n", + "from presidio_anonymizer.operators import Decrypt" + ] + }, + { + "cell_type": "markdown", + "id": "091be4b6", + "metadata": {}, + "source": [ + "### Define a cryptographic key (for both encryption and decryption)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "50bc451e", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "crypto_key = \"WmZq4t7w!z%C&F)J\"" + ] + }, + { + "cell_type": "markdown", + "id": "metropolitan-atlantic", + "metadata": {}, + "source": [ + "### Presidio Anonymizer: Encrypt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "medium-ridge", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "text: My name is M4lla0kBCzu6SwCONL6Y+ZqsPqhBp1Lhdc3t0FKnUwM=.\n", + "items:\n", + "[\n", + " {'start': 11, 'end': 55, 'entity_type': 'PERSON', 'text': 'M4lla0kBCzu6SwCONL6Y+ZqsPqhBp1Lhdc3t0FKnUwM=', 'operator': 'encrypt'}\n", + "]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "engine = AnonymizerEngine()\n", + "\n", + "# Invoke the anonymize function with the text,\n", + "# analyzer results (potentially coming from presidio-analyzer)\n", + "# and an 'encrypt' operator to get an encrypted anonymization output:\n", + "anonymize_result = engine.anonymize(\n", + " text=\"My name is James Bond\",\n", + " analyzer_results=[\n", + " RecognizerResult(entity_type=\"PERSON\", start=11, end=21, score=0.8),\n", + " ],\n", + " operators={\"PERSON\": OperatorConfig(\"encrypt\", {\"key\": crypto_key})},\n", + ")\n", + "\n", + "anonymize_result" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2f8be6b5", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Fetch the anonymized text from the result.\n", + "anonymized_text = anonymize_result.text\n", + "\n", + "# Fetch the anonynized entities from the result.\n", + "anonymized_entities = anonymize_result.items" + ] + }, + { + "cell_type": "markdown", + "id": "obvious-fifty", + "metadata": {}, + "source": [ + "### Presidio Anonymizer: Decrypt" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "outstanding-celebration", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "text: My name is James Bond.\n", + "items:\n", + "[\n", + " {'start': 11, 'end': 21, 'entity_type': 'PERSON', 'text': 'James Bond', 'operator': 'decrypt'}\n", + "]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "# Initialize the engine:\n", + "engine = DeanonymizeEngine()\n", + "\n", + "# Invoke the deanonymize function with the text, anonymizer results\n", + "# and a 'decrypt' operator to get the original text as output.\n", + "deanonymized_result = engine.deanonymize(\n", + " text=anonymized_text,\n", + " entities=anonymized_entities,\n", + " operators={\"DEFAULT\": OperatorConfig(\"decrypt\", {\"key\": crypto_key})},\n", + ")\n", + "\n", + "deanonymized_result" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9ff6810b", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false }, - "nbformat": 4, - "nbformat_minor": 5 + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'James Bond'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Alternatively, call the Decrypt operator directly:\n", + "\n", + "# Fetch the encrypted entitiy value from the previous stage\n", + "encrypted_entity_value = anonymize_result.items[0].text\n", + "\n", + "# Restore the original entity value\n", + "Decrypt().operate(text=encrypted_entity_value, params={\"key\": crypto_key})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } From c371f04c0d30b20fe3cfb505c3add72bccd2e16c Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 15:41:03 +0300 Subject: [PATCH 65/67] revert notebook change --- .../python/example_dicom_image_redactor.ipynb | 22 +------------------ 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/docs/samples/python/example_dicom_image_redactor.ipynb b/docs/samples/python/example_dicom_image_redactor.ipynb index 255223e8b..d057173ef 100644 --- a/docs/samples/python/example_dicom_image_redactor.ipynb +++ b/docs/samples/python/example_dicom_image_redactor.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "3345f1c4", "metadata": {}, @@ -10,14 +9,12 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "4c3fbe1a", "metadata": {}, "source": [ "# De-identifying sensitive burnt-in text in DICOM images\n", "This notebook covers how to:\n", - "\n", "1. Redact text Personal Health Information (PHI) present as pixels in DICOM images\n", "2. Visually compare original DICOM images with their redacted versions\n", "\n", @@ -25,7 +22,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "39472f68", "metadata": {}, @@ -48,15 +44,13 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "bfd19f8f", "metadata": {}, "source": [ "## Dataset\n", "Sample DICOM files are available for use in this notebook in `./sample_data`. Copies of the original DICOM data were saved into the folder with permission from the dataset owners. Please see the original dataset information below:\n", - "\n", - "> Rutherford, M., Mun, S.K., Levine, B., Bennett, W.C., Smith, K., Farmer, P., Jarosz, J., Wagner, U., Farahani, K., Prior, F. (2021). A DICOM dataset for evaluation of medical image de-identification (Pseudo-PHI-DICOM-Data) [Data set]. The Cancer Imaging Archive. 
DOI: https://doi.org/10.7937/s17z-r072\n" + "> Rutherford, M., Mun, S.K., Levine, B., Bennett, W.C., Smith, K., Farmer, P., Jarosz, J., Wagner, U., Farahani, K., Prior, F. (2021). A DICOM dataset for evaluation of medical image de-identification (Pseudo-PHI-DICOM-Data) [Data set]. The Cancer Imaging Archive. DOI: https://doi.org/10.7937/s17z-r072" ] }, { @@ -74,7 +68,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "9c629f28", "metadata": {}, @@ -109,7 +102,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "4c977b77", "metadata": {}, @@ -130,7 +122,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "edaebbc2", "metadata": {}, @@ -139,7 +130,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2645dab7", "metadata": {}, @@ -170,7 +160,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "85ccd52e", "metadata": {}, @@ -201,7 +190,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "f1ca21eb", "metadata": {}, @@ -232,7 +220,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "17bfb9ff", "metadata": {}, @@ -264,7 +251,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "fa9f4209", "metadata": {}, @@ -305,7 +291,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "b02b8345", "metadata": {}, @@ -333,7 +318,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "9c11ce22", "metadata": {}, @@ -369,7 +353,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "624dca7d", "metadata": {}, @@ -379,7 +362,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "f4cf6a33", "metadata": {}, @@ -404,7 +386,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "d733f22f", "metadata": {}, @@ -470,7 +451,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "6223ad64", "metadata": {}, From cdf09add823ffeca06e81a179a044e0097364efe Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 15:42:57 +0300 Subject: [PATCH 66/67] added a line between pip and spacy --- docs/samples/python/presidio_notebook.ipynb | 394 ++++++++++---------- 1 file changed, 194 insertions(+), 200 deletions(-) diff --git a/docs/samples/python/presidio_notebook.ipynb b/docs/samples/python/presidio_notebook.ipynb index d18f4bfe5..23747e786 100644 --- a/docs/samples/python/presidio_notebook.ipynb +++ b/docs/samples/python/presidio_notebook.ipynb @@ -1,206 +1,200 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# download presidio\n", - "!pip install presidio_analyzer presidio_anonymizer\n", - "\n", - "!python -m spacy download en_core_web_lg" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/presidio_notebook.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/presidio_notebook.ipynb)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from presidio_analyzer import AnalyzerEngine, PatternRecognizer\n", - "from presidio_anonymizer import AnonymizerEngine\n", - "from presidio_anonymizer.entities import OperatorConfig\n", - "import json\n", - "from pprint import pprint" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Analyze Text for PII Entities\n", - "\n", - "Using Presidio Analyzer, analyze a text to 
identify PII entities. \n", - "The Presidio analyzer is using pre-defined entity recognizers, and offers the option to create custom recognizers.\n", - "\n", - "The following code sample will:\n", - "\n", - "- Set up the Analyzer engine: load the NLP module (spaCy model by default) and other PII recognizers\n", - "- Call analyzer to get analyzed results for \"PHONE_NUMBER\" entity type\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "text_to_anonymize = \"His name is Mr. Jones and his phone number is 212-555-5555\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analyzer = AnalyzerEngine()\n", - "analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=[\"PHONE_NUMBER\"], language='en')\n", - "\n", - "print(analyzer_results)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Custom PII Entity Recognizers\n", - "\n", - "Presidio Analyzer comes with a pre-defined set of entity recognizers. It also allows adding new recognizers without changing the analyzer base code, **by creating custom recognizers**. \n", - "In the following example, we will create two new recognizers of type `PatternRecognizer` to identify titles and pronouns in the analyzed text.\n", - "A `PatternRecognizer` is a PII entity recognizer which uses regular expressions or deny-lists.\n", - "\n", - "The following code sample will:\n", - "- Create custom recognizers\n", - "- Add the new custom recognizers to the analyzer\n", - "- Call analyzer to get results from the new recognizers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "titles_recognizer = PatternRecognizer(supported_entity=\"TITLE\",\n", - " deny_list=[\"Mr.\",\"Mrs.\",\"Miss\"])\n", - "\n", - "pronoun_recognizer = PatternRecognizer(supported_entity=\"PRONOUN\",\n", - " deny_list=[\"he\", \"He\", \"his\", \"His\", \"she\", \"She\", \"hers\", \"Hers\"])\n", - "\n", - "analyzer.registry.add_recognizer(titles_recognizer)\n", - "analyzer.registry.add_recognizer(pronoun_recognizer)\n", - "\n", - "analyzer_results = analyzer.analyze(text=text_to_anonymize,\n", - " entities=[\"TITLE\", \"PRONOUN\"],\n", - " language=\"en\")\n", - "print(analyzer_results)\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Call Presidio Analyzer and get analyzed results with all the configured recognizers - default and new custom recognizers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')\n", - "\n", - "analyzer_results" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Anonymize Text with Identified PII Entities\n", - "\n", - "
Presidio Anonymizer iterates over the Presidio Analyzer result, and provides anonymization capabilities for the identified text.\n",
-    "The anonymizer provides 5 types of anonymizers - replace, redact, mask, hash and encrypt. The default is **replace**.\n",
-    "\n",
-    "The following code sample will:\n",
-    "\n",
-    "  1. Set up the anonymizer engine\n",
-    "  2. Create an anonymizer request - text to anonymize, list of anonymizers to apply and the results from the analyzer request\n",
-    "  3. Anonymize the text\n",
-    "
" - ] + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "anonymizer = AnonymizerEngine()\n", - "\n", - "anonymized_results = anonymizer.anonymize(\n", - " text=text_to_anonymize,\n", - " analyzer_results=analyzer_results, \n", - " operators={\"DEFAULT\": OperatorConfig(\"replace\", {\"new_value\": \"\"}), \n", - " \"PHONE_NUMBER\": OperatorConfig(\"mask\", {\"type\": \"mask\", \"masking_char\" : \"*\", \"chars_to_mask\" : 12, \"from_end\" : True}),\n", - " \"TITLE\": OperatorConfig(\"redact\", {})}\n", - ")\n", - "\n", - "print(f\"text: {anonymized_results.text}\")\n", - "print(\"detailed response:\")\n", - "\n", - "pprint(json.loads(anonymized_results.to_json()))" - ] + "outputs": [], + "source": [ + "# download presidio\n", + "!pip install presidio_analyzer presidio_anonymizer\n", + "!python -m spacy download en_core_web_lg" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "presidio", - "language": "python", - "name": "presidio" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/presidio_notebook.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/presidio_notebook.ipynb)" + ] }, - "metadata": { - "interpreter": { - "hash": "1baa965d5efe3ac65b79dfc60c0d706280b1da80fedb7760faf2759126c4f253" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from presidio_analyzer import AnalyzerEngine, PatternRecognizer\n", + "from presidio_anonymizer import AnonymizerEngine\n", + "from presidio_anonymizer.entities import OperatorConfig\n", + "import json\n", + "from pprint import pprint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analyze Text for PII Entities\n", + "\n", + "Using Presidio Analyzer, analyze a text to identify PII entities. \n", + "The Presidio analyzer is using pre-defined entity recognizers, and offers the option to create custom recognizers.\n", + "\n", + "The following code sample will:\n", + "\n", + "- Set up the Analyzer engine: load the NLP module (spaCy model by default) and other PII recognizers\n", + "- Call analyzer to get analyzed results for \"PHONE_NUMBER\" entity type\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text_to_anonymize = \"His name is Mr. Jones and his phone number is 212-555-5555\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analyzer = AnalyzerEngine()\n", + "analyzer_results = analyzer.analyze(text=text_to_anonymize, entities=[\"PHONE_NUMBER\"], language='en')\n", + "\n", + "print(analyzer_results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Custom PII Entity Recognizers\n", + "\n", + "Presidio Analyzer comes with a pre-defined set of entity recognizers. 
It also allows adding new recognizers without changing the analyzer base code, **by creating custom recognizers**. \n", + "In the following example, we will create two new recognizers of type `PatternRecognizer` to identify titles and pronouns in the analyzed text.\n", + "A `PatternRecognizer` is a PII entity recognizer which uses regular expressions or deny-lists.\n", + "\n", + "The following code sample will:\n", + "- Create custom recognizers\n", + "- Add the new custom recognizers to the analyzer\n", + "- Call analyzer to get results from the new recognizers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "titles_recognizer = PatternRecognizer(supported_entity=\"TITLE\",\n", + " deny_list=[\"Mr.\",\"Mrs.\",\"Miss\"])\n", + "\n", + "pronoun_recognizer = PatternRecognizer(supported_entity=\"PRONOUN\",\n", + " deny_list=[\"he\", \"He\", \"his\", \"His\", \"she\", \"She\", \"hers\", \"Hers\"])\n", + "\n", + "analyzer.registry.add_recognizer(titles_recognizer)\n", + "analyzer.registry.add_recognizer(pronoun_recognizer)\n", + "\n", + "analyzer_results = analyzer.analyze(text=text_to_anonymize,\n", + " entities=[\"TITLE\", \"PRONOUN\"],\n", + " language=\"en\")\n", + "print(analyzer_results)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Call Presidio Analyzer and get analyzed results with all the configured recognizers - default and new custom recognizers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')\n", + "\n", + "analyzer_results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Anonymize Text with Identified PII Entities\n", + "\n", + "
Presidio Anonymizer iterates over the Presidio Analyzer result, and provides anonymization capabilities for the identified text.\n",
+    "The anonymizer provides 5 types of anonymizers - replace, redact, mask, hash and encrypt. The default is **replace**.\n",
+    "\n",
+    "The following code sample will:\n",
+    "\n",
+    "  1. Set up the anonymizer engine\n",
+    "  2. Create an anonymizer request - text to anonymize, list of anonymizers to apply and the results from the analyzer request\n",
+    "  3. Anonymize the text\n",
+    "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "anonymizer = AnonymizerEngine()\n", + "\n", + "anonymized_results = anonymizer.anonymize(\n", + " text=text_to_anonymize,\n", + " analyzer_results=analyzer_results, \n", + " operators={\"DEFAULT\": OperatorConfig(\"replace\", {\"new_value\": \"\"}), \n", + " \"PHONE_NUMBER\": OperatorConfig(\"mask\", {\"type\": \"mask\", \"masking_char\" : \"*\", \"chars_to_mask\" : 12, \"from_end\" : True}),\n", + " \"TITLE\": OperatorConfig(\"redact\", {})}\n", + ")\n", + "\n", + "print(f\"text: {anonymized_results.text}\")\n", + "print(\"detailed response:\")\n", + "\n", + "pprint(json.loads(anonymized_results.to_json()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "presidio", + "language": "python", + "name": "presidio" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + }, + "metadata": { + "interpreter": { + "hash": "1baa965d5efe3ac65b79dfc60c0d706280b1da80fedb7760faf2759126c4f253" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 } From 98e631e49ba77b860df355c104bb7af851288d1b Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 22 Oct 2023 16:40:01 +0300 Subject: [PATCH 67/67] revert docstring change --- .../presidio_analyzer/batch_analyzer_engine.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py b/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py index ceb2e7eef..4a428595d 100644 --- a/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py +++ b/presidio-analyzer/presidio_analyzer/batch_analyzer_engine.py @@ -8,18 +8,17 @@ class BatchAnalyzerEngine: + """ + Batch analysis of documents (tables, lists, dicts). + Wrapper class to run Presidio Analyzer Engine on multiple values, + either lists/iterators of strings, or dictionaries. - def __init__(self, analyzer_engine: Optional[AnalyzerEngine] = None): - """ - Batch analysis of documents (tables, lists, dicts). - - Wrapper class to run Presidio Analyzer Engine on multiple values, - either lists/iterators of strings, or dictionaries. + :param: analyzer_engine: AnalyzerEngine instance to use + for handling the values in those collections. + """ - :param: analyzer_engine: AnalyzerEngine instance to use - for handling the values in those collections. - """ + def __init__(self, analyzer_engine: Optional[AnalyzerEngine] = None): self.analyzer_engine = analyzer_engine if not analyzer_engine: