Skip to content

Commit

Permalink
Simplified logic between spacy and transformers nlp engines
Browse files Browse the repository at this point in the history
  • Loading branch information
omri374 committed Aug 31, 2023
1 parent cf85101 commit 5a4bb29
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 60 deletions.
2 changes: 2 additions & 0 deletions presidio-analyzer/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ pyyaml = "*"
phonenumbers = ">=8.12,<9.0.0"
typing-extensions = "*"
spacy-huggingface-pipelines = "*"
stanza = "*"
spacy-stanza = "*"

[dev-packages]
pytest = "*"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,7 @@ def to_json(self) -> str:
return_dict["tokens"] = [token.text for token in self.tokens]
if "entities" in return_dict:
return_dict["entities"] = [entity.text for entity in self.entities]
if "scores" in return_dict:
return_dict["scores"] = [float(score) for score in self.scores]

return json.dumps(return_dict)
63 changes: 45 additions & 18 deletions presidio-analyzer/presidio_analyzer/nlp_engine/spacy_nlp_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import spacy

from spacy.language import Language
from spacy.tokens import Doc, SpanGroup
from spacy.tokens import Doc, SpanGroup, Span

from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngine, NerModelConfiguration

Expand Down Expand Up @@ -148,12 +148,12 @@ def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
tokens_indices = [token.idx for token in doc]

entities = self._get_entities(doc)
scores = entities.attrs["scores"]
scores = self._get_scores_for_entities(doc)

entities_as_spans = [ent for ent in entities]
entities, scores = self._get_updated_entities(entities, scores)

return NlpArtifacts(
entities=entities_as_spans,
entities=entities,
tokens=doc,
tokens_indices=tokens_indices,
lemmas=lemmas,
Expand All @@ -162,22 +162,53 @@ def _doc_to_nlp_artifact(self, doc: Doc, language: str) -> NlpArtifacts:
scores=scores,
)

def _get_entities(self, doc: Doc) -> SpanGroup:
def _get_entities(self, doc: Doc) -> List[Span]:
    """
    Extract entities out of a spaCy pipeline, depending on the type of pipeline.

    For normal spaCy, this would be doc.ents. The entities are returned
    exactly as the pipeline produced them; no filtering or label mapping
    is applied in this method.

    :param doc: the output spaCy doc.
    :return: List of entities
    """
    # NOTE(review): spaCy documents doc.ents as a tuple of Span objects,
    # while the annotation says List[Span] - confirm callers only need a
    # sized, iterable sequence.
    return doc.ents

def _get_scores_for_entities(self, doc: Doc) -> List[float]:
"""Extract scores for entities from the doc.
Since spaCy does not provide confidence scores for entities by default,
we use the default score from the ner model configuration.
:param doc: SpaCy doc
"""

entities = doc.ents
scores = [self.ner_model_configuration.default_score] * len(entities)
return scores

def _get_updated_entities(
self, entities: List[Span], scores: List[float]
) -> Tuple[List[Span], List[float]]:
"""
Get an updated list of entities based on the ner model configuration.
Remove entities that are in labels_to_ignore,
update entity names based on model_to_presidio_entity_mapping
:param doc: Output of a spaCy model
:return: SpanGroup holding the entities and confidence scores
:param entities: Entities that were extracted from a spaCy pipeline
:param scores: Original confidence scores for the entities extracted
:return: Tuple holding the entities and confidence scores
"""
output_spans = SpanGroup(doc, attrs={"scores": []})
if len(entities) != len(scores):
raise ValueError("Entities and scores must be the same length")

new_entities = []
new_scores = []

mapping = self.ner_model_configuration.model_to_presidio_entity_mapping
for ent in doc.ents:
to_ignore = self.ner_model_configuration.labels_to_ignore
for ent, score in zip(entities, scores):
# Remove model labels in the ignore list
if ent.label_ in self.ner_model_configuration.labels_to_ignore:
if ent.label_ in to_ignore:
continue

# Update entity label based on mapping
Expand All @@ -190,19 +221,15 @@ def _get_entities(self, doc: Doc) -> SpanGroup:
)

# Remove presidio entities in the ignore list
if ent.label_ in self.ner_model_configuration.labels_to_ignore:
if ent.label_ in to_ignore:
continue

output_spans.append(ent)

# Set default confidence
# (spaCy models don't have built in confidence scores)
score = self.ner_model_configuration.default_score
new_entities.append(ent)

# Update score if entity is in low score entity names
if ent.label_ in self.ner_model_configuration.low_score_entity_names:
score *= self.ner_model_configuration.low_confidence_score_multiplier

output_spans.attrs["scores"].append(score)
new_scores.append(score)

return output_spans
return new_entities, new_scores
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import logging
from typing import Optional, Dict
from typing import Optional, Dict, List

import spacy
from spacy.tokens import Doc, SpanGroup
from spacy.tokens import Doc, SpanGroup, Span

try:
import spacy_huggingface_pipelines
Expand Down Expand Up @@ -120,50 +120,23 @@ def process_text(self, text: str, language: str) -> NlpArtifacts:
doc = self.nlp[language](text)
return self._doc_to_nlp_artifact(doc, language)

def _get_entities(self, doc: Doc) -> SpanGroup:
def _get_entities(self, doc: Doc) -> List[Span]:
"""
Get an updated list of entities based on the ner model configuration.
Extract entities out of a spaCy pipeline, depending on the type of pipeline.
Remove entities that are in labels_to_ignore,
update entity names based on model_to_presidio_entity_mapping.
:param doc: Output of a spaCy model
:return: SpanGroup holding on the entities and confidence scores
For spacy-huggingface-pipeline, this would be doc.spans[key]
:param doc: the output spaCy doc.
:return: List of entities
"""

current_ents = doc.spans[self.entity_key]
current_scores = doc.spans[self.entity_key].attrs["scores"]

output_spans = SpanGroup(doc, attrs={"scores": []})

mapping = self.ner_model_configuration.model_to_presidio_entity_mapping
to_ignore = self.ner_model_configuration.labels_to_ignore
for i, ent in enumerate(current_ents):
# Remove model labels in the ignore list
if ent.label_ in to_ignore:
continue

# Update entity label based on mapping
if ent.label_ in mapping:
ent.label_ = mapping[ent.label_]
else:
logger.warning(
f"Entity {ent.label_} is not mapped to a Presidio entity, "
f"but keeping anyway"
)
return doc.spans[self.entity_key]

# Remove presidio entities in the ignore list
if ent.label_ in to_ignore:
continue
def _get_scores_for_entities(self, doc: Doc) -> List[float]:
"""Extract scores for entities from the doc.
output_spans.append(ent)

score = current_scores[i]
# Update score if entity is in low score entity names
if ent.label_ in self.ner_model_configuration.low_score_entity_names:
score *= self.ner_model_configuration.low_confidence_score_multiplier

# Update scores list
output_spans.attrs["scores"].append(score)
While spaCy does not provide confidence scores,
the spacy-huggingface-pipeline flow adds confidence scores as SpanGroup attributes.
:param doc: SpaCy doc
"""

return output_spans
return doc.spans[self.entity_key].attrs["scores"]

0 comments on commit 5a4bb29

Please sign in to comment.