# webapp.py: Streamlit demo of named-entity recognition on Luxembourg court decisions,
# as a preview of automated anonymisation.
import string
from typing import Dict

import spacy
import streamlit as st
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.tokenization import SpacyTokenizer
from flair.visual.ner_html import render_ner_html
from spacy.lang.fr import French
from spacy.tokenizer import Tokenizer
from streamlit import cache


# Embed the custom tokenizer function here so the webapp stays a self-contained file.
def get_tokenizer(model: French) -> Tokenizer:
    # Punctuation that must always split tokens, even when glued to a word.
    split_char = r"[ ,\\.()-/\\|:;\"+=!?_+#“’'‘]"
    # Extra infix pattern added on top of the model's default infixes.
    extended_infix = [r'[:\\(\\)-\./#"“’\'—‘]'] + model.Defaults.infixes
    infix_re = spacy.util.compile_infix_regex(extended_infix)
    prefix_re = spacy.util.compile_prefix_regex(tuple(list(model.Defaults.prefixes) + [split_char]))
    suffix_re = spacy.util.compile_suffix_regex(tuple(list(model.Defaults.suffixes) + [split_char]))
    return Tokenizer(
        model.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=None,
    )
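

# A minimal usage sketch (illustrative only, not executed by the app): the extended
# prefix/suffix/infix rules force splits on punctuation glued to words, which is
# frequent in court decisions. Something like
#
#   nlp = spacy.blank(name="fr")
#   nlp.tokenizer = get_tokenizer(model=nlp)
#   [t.text for t in nlp("12,rue de la Gare(Luxembourg)")]
#
# would be expected to split roughly into: 12 | , | rue | de | la | Gare | ( | Luxembourg | )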


@cache(allow_output_mutation=True, max_entries=1)
def get_model():
    return SequenceTagger.load("resources/flair_ner/luxano_segment_0_flair/best-model.pt")


@cache(allow_output_mutation=True, max_entries=1)
def get_french_tokenizer():
    nlp = spacy.blank(name="fr")
    nlp.tokenizer = get_tokenizer(model=nlp)
    return SpacyTokenizer(nlp)
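

# Note on st.cache: max_entries=1 keeps a single cached instance per process, and
# allow_output_mutation=True skips re-hashing the returned object on reruns; both
# matter because the Flair tagger is large and slow to load.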


# Entity types that are only highlighted, never rewritten.
to_skip = ["ETAT"]
# Entity types whose whole span is collapsed to "(...)" instead of a pseudonym.
replace_parenthesis = ["ADDRESS", "DATE"]

st.beta_set_page_config(
    page_title="Anonymisation", page_icon="🔍", layout="centered", initial_sidebar_state="collapsed",
)
"""
# Reconnaissance des entités nommées en vue d'une anonymisation automatisée
Modèle [Flair](https://github.com/flairNLP/flair) entraîné sur 342 décisions Luxembourgeoises annotées manuellement.
Ressources:
* [code du projet anonymisation](https://github.com/ELS-RD/anonymisation)
* [code du projet récupération tagtog / transformations](https://github.com/ELS-RD/lux-ano)
"""
st.image(image="http://svowebmaster.free.fr/images_site_svo/armoiries/armoiries_LUXEMBOURG.gif", width=150)

user_input = st.text_area("paste a court decision below", "", max_chars=200000, height=300)
replace_names = st.checkbox(label="Replace named entities with pseudonyms", value=False)

if user_input:
    "## Result"  # Streamlit "magic": a bare string literal is rendered as markdown
    paragraphs = list()
    tagger = get_model()
    tokenizer = get_french_tokenizer()
    # Tag each non-empty paragraph as its own Flair Sentence.
    for paragraph in user_input.split("\n"):
        if paragraph.strip() == "":
            continue
        sentence = Sentence(paragraph, use_tokenizer=tokenizer)
        tagger.predict(sentence)
        paragraphs.append(sentence)

    inside_parenthesis = False
    if replace_names:
        # Pseudonyms: A..Z then AA..ZZ, assigned in order of first appearance.
        pseudo = list(string.ascii_uppercase) + [a + b for a in string.ascii_uppercase for b in string.ascii_uppercase]
        replacement_dict: Dict[str, str] = dict()
        for sentence in paragraphs:
            for word in sentence:
                tag = word.get_tag("ner").value
                if any(i in tag for i in replace_parenthesis):
                    if not inside_parenthesis:
                        # first token of an ADDRESS/DATE span: keep a single "(...)"
                        word.text = "(...)"
                    else:
                        # following tokens of the same span are emptied
                        word.text = ""
                    inside_parenthesis = True
                elif tag != "O" and not any(i in tag for i in to_skip):
                    inside_parenthesis = False
                    # the same surface form (case-insensitive) always maps to the same pseudonym
                    if word.text.lower() not in replacement_dict:
                        replacement_dict[word.text.lower()] = pseudo[len(replacement_dict)]
                    word.text = replacement_dict[word.text.lower()]
                else:
                    inside_parenthesis = False
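
    # Illustrative walk-through (hypothetical tags, for exposition only): given tokens
    # tagged "Jean"/PERS, "Dupont"/PERS, "habite"/O, "12"/ADDRESS, "rue"/ADDRESS,
    # "de"/ADDRESS, "la"/ADDRESS, "Gare"/ADDRESS, the loop above would rewrite the text
    # to "A B habite (...)": each new PERS surface form takes the next free pseudonym,
    # while the whole ADDRESS span collapses to a single "(...)".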

    colors = {
        "ETABLISSEMENT": "#35c2b2",
        "ADDRESS": "#FFAE62",
        "ORGANIZATION": "#FFB990",
        "SITE": "#ff8800",
        "HOPITAL": "#edddcb",
        "MEDIA": "#e966c4",
        "MAIL": "#1688cb",
        "ETAT": "#00c5ed",
        "RESIDENCE": "#94bce1",
        "PERSONNE_DE_JUSTICE": "#89B2C4",
        "GROUPE": "#9cae64",
        "DATE": "#F9E17D",
        "NUMEROS": "#F8485E",
        "PERS": "#FA7268",
        "FONDS": "#C3FF1F",
    }
    st.write(render_ner_html(sentences=paragraphs, colors=colors, wrap_page=False), unsafe_allow_html=True)
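
# The app is launched with the standard Streamlit CLI:
#   streamlit run webapp.py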