Simplified logic between spacy and transformers nlp engines

microsoft · Aug 31, 2023 · 5a4bb29 · 5a4bb29
1 parent cf85101
commit 5a4bb29
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 60 deletions.
diff --git a/presidio-analyzer/Pipfile b/presidio-analyzer/Pipfile
@@ -12,6 +12,8 @@ pyyaml = "*"
 phonenumbers = ">=8.12,<9.0.0"
 typing-extensions = "*"
 spacy-huggingface-pipelines = "*"
+stanza = "*"
+spacy-stanza = "*"
 
 [dev-packages]
 pytest = "*"

diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_artifacts.py
@@ -80,5 +80,7 @@ def to_json(self) -> str:
             return_dict["tokens"] = [token.text for token in self.tokens]
         if "entities" in return_dict:
             return_dict["entities"] = [entity.text for entity in self.entities]
+        if "scores" in return_dict:
+            return_dict["scores"] = [float(score) for score in self.scores]
 
         return json.dumps(return_dict)
diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
@@ -4,7 +4,7 @@
 import spacy
 
 from spacy.language import Language
-from spacy.tokens import Doc, SpanGroup
+from spacy.tokens import Doc, SpanGroup, Span
 
 from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine, NerModelConfiguration
 
@@ -148,12 +148,12 @@ def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
         tokens_indices = [token.idx for token in doc]
 
         entities = self._get_entities(doc)
-        scores = entities.attrs["scores"]
+        scores = self._get_scores_for_entities(doc)
 
-        entities_as_spans = [ent for ent in entities]
+        entities, scores = self._get_updated_entities(entities, scores)
 
         return NlpArtifacts(
-            entities=entities_as_spans,
+            entities=entities,
             tokens=doc,
             tokens_indices=tokens_indices,
             lemmas=lemmas,
@@ -162,22 +162,53 @@ def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
             scores=scores,
         )
 
-    def _get_entities(self, doc: Doc) -> SpanGroup:
+    def _get_entities(self, doc:Doc) -> List[Span]:
+        """
+        Extract entities out of a spaCy pipeline, depending on the type of pipeline.
+
+        For normal spaCy, this would be doc.ents
+        :param doc: the output spaCy doc.
+        :return: List of entities
+        """
+
+        return doc.ents
+
+    def _get_scores_for_entities(self, doc: Doc) -> List[float]:
+        """Extract scores for entities from the doc.
+
+        Since spaCy does not provide confidence scores for entities by default,
+        we use the default score from the ner model configuration.
+        :param doc: SpaCy doc
+        """
+
+        entities = doc.ents
+        scores = [self.ner_model_configuration.default_score] * len(entities)
+        return scores
+
+    def _get_updated_entities(
+        self, entities: List[Span], scores: List[float]
+    ) -> Tuple[List[Span], List[float]]:
         """
         Get an updated list of entities based on the ner model configuration.
 
         Remove entities that are in labels_to_ignore,
         update entity names based on model_to_presidio_entity_mapping
 
-        :param doc: Output of a spaCy model
-        :return: SpanGroup holding the entities and confidence scores
+        :param entities: Entities that were extracted from a spaCy pipeline
+        :param scores: Original confidence scores for the entities extracted
+        :return: Tuple holding the entities and confidence scores
         """
-        output_spans = SpanGroup(doc, attrs={"scores": []})
+        if len(entities) != len(scores):
+            raise ValueError("Entities and scores must be the same length")
+
+        new_entities = []
+        new_scores = []
 
         mapping = self.ner_model_configuration.model_to_presidio_entity_mapping
-        for ent in doc.ents:
+        to_ignore = self.ner_model_configuration.labels_to_ignore
+        for ent, score in zip(entities, scores):
             # Remove model labels in the ignore list
-            if ent.label_ in self.ner_model_configuration.labels_to_ignore:
+            if ent.label_ in to_ignore:
                 continue
 
             # Update entity label based on mapping
@@ -190,19 +221,15 @@ def _get_entities(self, doc: Doc) -> SpanGroup:
                 )
 
             # Remove presidio entities in the ignore list
-            if ent.label_ in self.ner_model_configuration.labels_to_ignore:
+            if ent.label_ in to_ignore:
                 continue
 
-            output_spans.append(ent)
-
-            # Set default confidence
-            # (spaCy models don't have built in confidence scores)
-            score = self.ner_model_configuration.default_score
+            new_entities.append(ent)
 
             # Update score if entity is in low score entity names
             if ent.label_ in self.ner_model_configuration.low_score_entity_names:
                 score *= self.ner_model_configuration.low_confidence_score_multiplier
 
-            output_spans.attrs["scores"].append(score)
+            new_scores.append(score)
 
-        return output_spans
+        return new_entities, new_scores
diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/transformers_nlp_engine.py
@@ -1,8 +1,8 @@
 import logging
-from typing import Optional, Dict
+from typing import Optional, Dict, List
 
 import spacy
-from spacy.tokens import Doc, SpanGroup
+from spacy.tokens import Doc, SpanGroup, Span
 
 try:
     import spacy_huggingface_pipelines
@@ -120,50 +120,23 @@ def process_text(self, text: str, language: str) -> NlpArtifacts:
         doc = self.nlp[language](text)
         return self._doc_to_nlp_artifact(doc, language)
 
-    def _get_entities(self, doc: Doc) -> SpanGroup:
+    def _get_entities(self, doc: Doc) -> List[Span]:
         """
-        Get an updated list of entities based on the ner model configuration.
+        Extract entities out of a spaCy pipeline, depending on the type of pipeline.
 
-        Remove entities that are in labels_to_ignore,
-        update entity names based on model_to_presidio_entity_mapping.
-
-        :param doc: Output of a spaCy model
-        :return: SpanGroup holding on the entities and confidence scores
+        For spacy-huggingface-pipeline, this would be doc.spans[key]
+        :param doc: the output spaCy doc.
+        :return: List of entities
         """
 
-        current_ents = doc.spans[self.entity_key]
-        current_scores = doc.spans[self.entity_key].attrs["scores"]
-
-        output_spans = SpanGroup(doc, attrs={"scores": []})
-
-        mapping = self.ner_model_configuration.model_to_presidio_entity_mapping
-        to_ignore = self.ner_model_configuration.labels_to_ignore
-        for i, ent in enumerate(current_ents):
-            # Remove model labels in the ignore list
-            if ent.label_ in to_ignore:
-                continue
-
-            # Update entity label based on mapping
-            if ent.label_ in mapping:
-                ent.label_ = mapping[ent.label_]
-            else:
-                logger.warning(
-                    f"Entity {ent.label_} is not mapped to a Presidio entity, "
-                    f"but keeping anyway"
-                )
+        return doc.spans[self.entity_key]
 
-            # Remove presidio entities in the ignore list
-            if ent.label_ in to_ignore:
-                continue
+    def _get_scores_for_entities(self, doc: Doc) -> List[float]:
+        """Extract scores for entities from the doc.
 
-            output_spans.append(ent)
-
-            score = current_scores[i]
-            # Update score if entity is in low score entity names
-            if ent.label_ in self.ner_model_configuration.low_score_entity_names:
-                score *= self.ner_model_configuration.low_confidence_score_multiplier
-
-            # Update scores list
-            output_spans.attrs["scores"].append(score)
+        While spaCy does not provide confidence scores,
+        the spacy-huggingface-pipeline flow adds confidence scores as SpanGroup attributes.
+        :param doc: SpaCy doc
+        """
 
-        return output_spans
+        return doc.spans[self.entity_key].attrs["scores"]