From 698a5cdc53ee75e85b5786ff32e816aeb4dbb10b Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 8 Nov 2023 23:01:40 +0200 Subject: [PATCH] bugfix with wrong input to NerModelConfiguration (#1208) * bugfix with wrong input to NerModelConfiguration * Update VERSION * Update CHANGELOG.md * Put org in ignore as it has many FPs * aligned the conf with the defaults in code * changed pipenv install to pip install --- .pipelines/templates/build-python.yml | 2 +- CHANGELOG.md | 20 +++++++++++--- VERSION | 2 +- .../nlp_engine/ner_model_configuration.py | 16 ++++++++++- .../nlp_engine/spacy_nlp_engine.py | 2 +- presidio-analyzer/setup.py | 2 +- .../tests/test_spacy_nlp_engine.py | 27 ++++++++++++++++++- 7 files changed, 61 insertions(+), 10 deletions(-) diff --git a/.pipelines/templates/build-python.yml b/.pipelines/templates/build-python.yml index 734bbed61..03397075d 100644 --- a/.pipelines/templates/build-python.yml +++ b/.pipelines/templates/build-python.yml @@ -20,7 +20,7 @@ steps: script: | set -eux # fail on error # Install pytest and run tests - pipenv install --dev pytest-azurepipelines + pipenv run pip install pytest pytest-azurepipelines pipenv run pytest -vv - task: Bash@3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 713be5cbe..199e098bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,10 +2,20 @@ All notable changes to this project will be documented in this file. -## [2.2.4] - Nov. 2nd 2024 +## [2.2.351] - Nov. 6th 2024 ### Changed #### Analyzer -* Hotfix for the default.yaml file which is not parsed correctly (#1202) +* Hotfix for NerModelConfiguration not created correctly (#1208) + +## [2.2.350] - Nov. 2nd 2024 +### Changed +#### Analyzer +* Hotfix: default.yaml is not parsed correctly (#1202) + +## [2.2.35] - Nov. 2nd 2024 +### Changed +#### Analyzer +* Put org in ignore as it has many FPs (#1200) ## [2.2.34] - Oct. 30th 2024 @@ -291,8 +301,10 @@ Upgrade Analyzer spacy version to 3.0.5 #### Deanonymize: New endpoint for deanonymizing encrypted entities by the anonymizer. -[unreleased]: https://github.com/microsoft/presidio/compare/2.2.4...HEAD -[2.2.4]: https://github.com/microsoft/presidio/compare/2.2.34...2.2.4 +[unreleased]: https://github.com/microsoft/presidio/compare/2.2.351...HEAD +[2.2.351]: https://github.com/microsoft/presidio/compare/2.2.350...2.2.351 +[2.2.350]: https://github.com/microsoft/presidio/compare/2.2.35...2.2.350 +[2.2.35]: https://github.com/microsoft/presidio/compare/2.2.34...2.2.35 [2.2.34]: https://github.com/microsoft/presidio/compare/2.2.33...2.2.34 [2.2.33]: https://github.com/microsoft/presidio/compare/2.2.32...2.2.33 [2.2.32]: https://github.com/microsoft/presidio/compare/2.2.31...2.2.32 diff --git a/VERSION b/VERSION index 65b5c491a..4c036fa59 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.350 +2.2.351 diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index 4641ef4b5..a88545090 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -27,7 +27,21 @@ ) LOW_SCORE_ENTITY_NAMES = {} -LABELS_TO_IGNORE = {"O", "ORG", "ORGANIZATION"} +LABELS_TO_IGNORE = { + "O", + "ORG", + "ORGANIZATION", + "CARDINAL", + "EVENT", + "LANGUAGE", + "LAW", + "MONEY", + "ORDINAL", + "PERCENT", + "PRODUCT", + "QUANTITY", + "WORK_OF_ART", +} @dataclass diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py index 23ba066b7..68ccbb558 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py @@ -41,7 +41,7 @@ def __init__( self.models = models if not ner_model_configuration: - ner_model_configuration = NerModelConfiguration(self.engine_name) + ner_model_configuration = NerModelConfiguration() self.ner_model_configuration = ner_model_configuration self.nlp = None diff --git a/presidio-analyzer/setup.py b/presidio-analyzer/setup.py index 67ac4cfbd..918a3dfc0 100644 --- a/presidio-analyzer/setup.py +++ b/presidio-analyzer/setup.py @@ -57,4 +57,4 @@ ], long_description=long_description, long_description_content_type="text/markdown", -) \ No newline at end of file +) diff --git a/presidio-analyzer/tests/test_spacy_nlp_engine.py b/presidio-analyzer/tests/test_spacy_nlp_engine.py index 033f22da4..f9028e54b 100644 --- a/presidio-analyzer/tests/test_spacy_nlp_engine.py +++ b/presidio-analyzer/tests/test_spacy_nlp_engine.py @@ -1,8 +1,16 @@ +import json from typing import Iterator import pytest -from presidio_analyzer.nlp_engine import SpacyNlpEngine +from presidio_analyzer.nlp_engine import SpacyNlpEngine, NerModelConfiguration + + +class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return json.JSONEncoder.default(self, obj) def test_simple_process_text(spacy_nlp_engine): @@ -42,3 +50,20 @@ def test_validate_model_params_missing_fields(): with pytest.raises(ValueError): SpacyNlpEngine._validate_model_params(new_model) + + +def test_default_configuration_correct(): + spacy_nlp_engine = SpacyNlpEngine() + expected_ner_config = NerModelConfiguration() + + actual_config_json = json.dumps( + spacy_nlp_engine.ner_model_configuration.to_dict(), + sort_keys=True, + cls=SetEncoder, + ) + + expected_config_json = json.dumps( + expected_ner_config.to_dict(), sort_keys=True, cls=SetEncoder + ) + + assert actual_config_json == expected_config_json