diff --git a/README.md b/README.md index aa9d237..1159c0e 100644 --- a/README.md +++ b/README.md @@ -117,12 +117,14 @@ doc = nlp("This is a text") visualize_parser(doc) ``` -| Argument | Type | Description | -| --------------- | ------------- | -------------------------------------------- | -| `doc` | `Doc` | The spaCy `Doc` object to visualize. | -| _keyword-only_ | | | -| `title` | Optional[str] | Title of the visualizer block. | -| `sidebar_title` | Optional[str] | Title of the config settings in the sidebar. | +| Argument | Type | Description | +| ------------------ | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The spaCy `Doc` object to visualize. | +| _keyword-only_ | | | +| `title` | Optional[str] | Title of the visualizer block. | +| `key` | Optional[str] | Key used for the streamlit component for selecting labels. | +| `manual` | bool | Flag signifying whether the doc argument is a Doc object or a List of Dicts containing parse information. | +| `displacy_options` | Optional[Dict] | Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered. See: https://spacy.io/api/top-level#options-dep | #### function `visualize_ner` @@ -138,16 +140,48 @@ doc = nlp("Sundar Pichai is the CEO of Google.") visualize_ner(doc, labels=nlp.get_pipe("ner").labels) ``` -| Argument | Type | Description | -| --------------- | ------------- | ----------------------------------------------------------------------------- | -| `doc` | `Doc` | The spaCy `Doc` object to visualize. | -| _keyword-only_ | | | -| `labels` | Sequence[str] | The labels to show in the labels dropdown. | -| `attrs` | List[str] | The span attributes to show in entity table. | -| `show_table` | bool | Whether to show a table of entities and their attributes. Defaults to `True`. 
| -| `title` | Optional[str] | Title of the visualizer block. | -| `sidebar_title` | Optional[str] | Title of the config settings in the sidebar. | -| `colors` | Dict[str,str] | A dictionary mapping labels to display colors ({"LABEL": "COLOR"}) | +| Argument | Type | Description | +| ------------------ | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The spaCy `Doc` object to visualize. | +| _keyword-only_ | | | +| `labels` | Sequence[str] | The labels to show in the labels dropdown. | +| `attrs` | List[str] | The span attributes to show in entity table. | +| `show_table` | bool | Whether to show a table of entities and their attributes. Defaults to `True`. | +| `title` | Optional[str] | Title of the visualizer block. | +| `colors` | Dict[str,str] | Dictionary of colors for the entity spans to visualize, with keys as labels and corresponding colors as the values. This argument will be deprecated soon. In the future, the colors need to be passed in the `displacy_options` arg with the key "colors". | +| `key` | Optional[str] | Key used for the streamlit component for selecting labels. | +| `manual` | bool | Flag signifying whether the doc argument is a Doc object or a List of Dicts containing entity span information. | +| `displacy_options` | Optional[Dict] | Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered. See https://spacy.io/api/top-level#displacy_options-ent. | + + +#### function `visualize_spans` + +Visualize spans in a `Doc` using spaCy's +[`displacy` visualizer](https://spacy.io/usage/visualizers). 
+ +```python +import spacy +from spacy_streamlit import visualize_spans + +nlp = spacy.load("en_core_web_sm") +doc = nlp("Sundar Pichai is the CEO of Google.") +span = doc[4:7] # CEO of Google +span.label_ = "CEO" +doc.spans["job_role"] = [span] +visualize_spans(doc, spans_key="job_role", displacy_options={"colors": {"CEO": "#09a3d5"}}) +``` + +| Argument | Type | Description | +| ------------------ | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `doc` | `Doc` | The spaCy `Doc` object to visualize. | +| _keyword-only_ | | | +| `spans_key` | str | Which spans key to render spans from. Default is "sc". | +| `attrs` | List[str] | The attributes on the entity Span to be labeled. Attributes are displayed only when the `show_table` argument is True. | +| `show_table` | bool | Whether to show a table of spans and their attributes. Defaults to `True`. | +| `title` | Optional[str] | Title of the visualizer block. | +| `manual` | bool | Flag signifying whether the doc argument is a Doc object or a Dict containing span information. | +| `displacy_options` | Optional[Dict] | Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered. See https://spacy.io/api/top-level#displacy_options-span. 
| + #### function `visualize_textcat` diff --git a/examples/05_visualize-spans.py b/examples/05_visualize-spans.py new file mode 100644 index 0000000..d732f9d --- /dev/null +++ b/examples/05_visualize-spans.py @@ -0,0 +1,17 @@ +""" +Example of using `visualize_spans` with a non-default spans_key +""" +import spacy_streamlit +import streamlit as st + +import spacy +from spacy_streamlit import visualize_spans + +nlp = spacy.load("en_core_web_sm") +doc = nlp("Sundar Pichai is the CEO of Google.") +span = doc[4:7] # CEO of Google +span.label_ = "CEO" +doc.spans["job_role"] = [span] +visualize_spans( + doc, spans_key="job_role", displacy_options={"colors": {"CEO": "#09a3d5"}} +) diff --git a/spacy_streamlit/__init__.py b/spacy_streamlit/__init__.py index a1c0fd2..663e635 100644 --- a/spacy_streamlit/__init__.py +++ b/spacy_streamlit/__init__.py @@ -1,3 +1,3 @@ -from .visualizer import visualize, visualize_parser, visualize_ner +from .visualizer import visualize, visualize_parser, visualize_ner, visualize_spans from .visualizer import visualize_textcat, visualize_similarity, visualize_tokens from .util import load_model, process_text diff --git a/spacy_streamlit/visualizer.py b/spacy_streamlit/visualizer.py index 1dab782..b22c689 100644 --- a/spacy_streamlit/visualizer.py +++ b/spacy_streamlit/visualizer.py @@ -7,12 +7,16 @@ from .util import load_model, process_text, get_svg, get_html, LOGO +SPACY_VERSION = tuple(int(part) for part in spacy.__version__.split(".")[:3] if part.isdigit())  # numeric parts only, so pre-release/dev builds such as "3.4.0.dev0" still import # fmt: off NER_ATTRS = ["text", "label_", "start", "end", "start_char", "end_char"] TOKEN_ATTRS = ["idx", "text", "lemma_", "pos_", "tag_", "dep_", "head", "morph", "ent_type_", "ent_iob_", "shape_", "is_alpha", "is_ascii", "is_digit", "is_punct", "like_num", "is_sent_start"] +# Currently these attrs are the same, but they might differ in the future. 
+SPAN_ATTRS = NER_ATTRS + # fmt: on FOOTER = """♥ Built with [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit)""" @@ -130,30 +134,58 @@ def visualize( def visualize_parser( - doc: spacy.tokens.Doc, + doc: Union[spacy.tokens.Doc, List[Dict[str, str]]], *, title: Optional[str] = "Dependency Parse & Part-of-speech tags", key: Optional[str] = None, + manual: bool = False, + displacy_options: Optional[Dict] = None, ) -> None: - """Visualizer for dependency parses.""" + """Visualizer for dependency parses. + + doc (Doc, List): The document to visualize. + key (str): Key used for the streamlit component for selecting labels. + title (str): The title displayed at the top of the parser visualization. + manual (bool): Flag signifying whether the doc argument is a Doc object or a List of Dicts containing parse information. + displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered. + See: https://spacy.io/api/top-level#options-dep + """ + if displacy_options is None: + displacy_options = dict() if title: st.header(title) - cols = st.columns(4) - split_sents = cols[0].checkbox( - "Split sentences", value=True, key=f"{key}_parser_split_sents" - ) - options = { - "collapse_punct": cols[1].checkbox( - "Collapse punct", value=True, key=f"{key}_parser_collapse_punct" - ), - "collapse_phrases": cols[2].checkbox( - "Collapse phrases", key=f"{key}_parser_collapse_phrases" - ), - "compact": cols[3].checkbox("Compact mode", key=f"{key}_parser_compact"), - } + if manual: + # In manual mode, collapse_phrases and collapse_punct are passed as options to + # displacy.parse_deps(doc) and the resulting data is retokenized to be correct, + # so we already have these options configured at the time we use this data. 
+ cols = st.columns(1) + split_sents = False + options = { + "compact": cols[0].checkbox("Compact mode", key=f"{key}_parser_compact"), + } + else: + cols = st.columns(4) + split_sents = cols[0].checkbox( + "Split sentences", value=True, key=f"{key}_parser_split_sents" + ) + options = { + "collapse_punct": cols[1].checkbox( + "Collapse punct", value=True, key=f"{key}_parser_collapse_punct" + ), + "collapse_phrases": cols[2].checkbox( + "Collapse phrases", key=f"{key}_parser_collapse_phrases" + ), + "compact": cols[3].checkbox("Compact mode", key=f"{key}_parser_compact"), + } docs = [span.as_doc() for span in doc.sents] if split_sents else [doc] + # add selected options to options provided by user + # `options` from `displacy_options` are overwritten by user provided + # options from the checkboxes + displacy_options = {**displacy_options, **options} for sent in docs: - html = displacy.render(sent, options=options, style="dep") + html = displacy.render( + sent, options=displacy_options, style="dep", manual=manual + ) # Double newlines seem to mess with the rendering html = html.replace("\n\n", "\n") if split_sents and len(docs) > 1: @@ -170,7 +202,7 @@ def visualize_ner( title: Optional[str] = "Named Entities", colors: Dict[str, str] = {}, key: Optional[str] = None, - manual: Optional[bool] = False, + manual: bool = False, displacy_options: Optional[Dict] = None, ): """ @@ -180,6 +212,7 @@ def visualize_ner( labels (list): The entity labels to visualize. attrs (list): The attributes on the entity Span to be labeled. Attributes are displayed only when the show_table argument is True. + show_table (bool): Flag signifying whether to show a table with accompanying entity attributes. title (str): The title displayed at the top of the NER visualization. colors (Dict): Dictionary of colors for the entity spans to visualize, with keys as labels and corresponding colors as the values. This argument will be deprecated soon. 
In future the colors arg need to be passed in the displacy_options arg @@ -188,6 +221,7 @@ def visualize_ner( manual (bool): Flag signifying whether the doc argument is a Doc object or a List of Dicts containing entity span information. displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered. + See https://spacy.io/api/top-level#displacy_options-ent. """ if not displacy_options: displacy_options = dict() @@ -240,6 +274,67 @@ def visualize_ner( st.dataframe(df) +def visualize_spans( + doc: Union[spacy.tokens.Doc, Dict[str, str]], + *, + spans_key: str = "sc", + attrs: List[str] = SPAN_ATTRS, + show_table: bool = True, + title: Optional[str] = "Spans", + manual: bool = False, + displacy_options: Optional[Dict] = None, +): + """ + Visualizer for spans. + + doc (Doc, Dict): The document to visualize. + spans_key (str): Which spans key to render spans from. Default is "sc". + attrs (list): The attributes on the entity Span to be labeled. Attributes are displayed only when the show_table + argument is True. + show_table (bool): Flag signifying whether to show a table with accompanying span attributes. The table is skipped when manual is True. + title (str): The title displayed at the top of the Spans visualization. + manual (bool): Flag signifying whether the doc argument is a Doc object or a Dict containing span information. + displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered. + See https://spacy.io/api/top-level#displacy_options-span + """ + if SPACY_VERSION < (3, 3, 0): + raise ValueError( + f"'visualize_spans' requires spacy>=3.3.0. You have spacy=={spacy.__version__}" + ) + # Copy the options so the caller's dict is never mutated. 
+ displacy_options = dict(displacy_options or {}) + displacy_options["spans_key"] = spans_key + + if title: + st.header(title) + + if manual: + if show_table: + st.warning( + "When the parameter 'manual' is set to True, the parameter 'show_table' must be set to False." 
+ ) + if not isinstance(doc, dict): + st.warning( + "When the parameter 'manual' is set to True, the parameter 'doc' must be of type 'Dict', not 'spacy.tokens.Doc'." + ) + html = displacy.render( + doc, + style="span", + options=displacy_options, + manual=manual, + ) + st.write(f"{get_html(html)}", unsafe_allow_html=True) + + if show_table and not manual:  # a manual doc is a plain dict and has no `.spans` + data = [ + [str(getattr(span, attr)) for attr in attrs] + for span in doc.spans[spans_key] + ] + if data: + df = pd.DataFrame(data, columns=attrs) + st.dataframe(df) + + def visualize_textcat( doc: spacy.tokens.Doc, *, title: Optional[str] = "Text Classification" ) -> None: