-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Konstantin Slavnov <[email protected]>
- Loading branch information
Showing
12 changed files
with
1,037 additions
and
288 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
""" | ||
Annotation tool. | ||
Inspired by https://uima.apache.org/d/uimafit-current/api/ | ||
""" | ||
|
||
# TODO(zurk) move annotation module and tests to lookout-sdk-ml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,283 @@ | ||
from typing import Dict, Iterator, Optional, Tuple, Union, Type # noqa F401 | ||
|
||
from lookout.core.analyzer import UnicodeFile | ||
from sortedcontainers import SortedDict | ||
|
||
from lookout.style.format.annotations.annotations import Annotation, check_offset, check_span, \ | ||
LanguageAnnotation, PathAnnotation, UASTAnnotation | ||
|
||
|
||
class NoIntersection(Exception):
    """
    Signal that no annotation intersects the requested interval.

    Raised by `AnnotationManager.find_overlapping_span()` either when there is no annotation
    layer of the requested type or when no annotation of that type overlaps the given span.
    See the documentation of `AnnotationManager.find_overlapping_span()` for more information.
    """
|
||
|
||
class AnnotationsSpan(dict):
    """
    Collection of annotations that belong to one specific span (or range).

    It behaves like a regular dictionary and additionally remembers the span it describes.
    """

    def __init__(self, start, stop, *args, **kwargs):
        """
        Initialize a new instance of `AnnotationsSpan`.

        :param start: Start of the span.
        :param stop: End of the span. Stop point itself is excluded.
        :param args: Remaining positional arguments, forwarded to `dict.__init__()`.
        :param kwargs: Remaining keyword arguments, forwarded to `dict.__init__()`.
        """
        # The span is validated before any dictionary content is stored.
        check_span(start, stop)
        super().__init__(*args, **kwargs)
        self._start, self._stop = start, stop
        self._span = (start, stop)

    @property
    def start(self):
        """Return the start offset of the span."""
        return self._start

    @property
    def stop(self):
        """Return the stop offset of the span. The stop point itself is excluded."""
        return self._stop

    @property
    def span(self):
        """Return the span as a `(start, stop)` tuple."""
        return self._span
|
||
|
||
class AnnotationManager:
    """
    Manager of `Annotation`-s for a text, e.g. source code.

    All the methods to work with annotated data should be placed in this class.
    Candidates can be found here:
    https://uima.apache.org/d/uimafit-current/api/org/apache/uima/fit/util/JCasUtil.html
    """

    def __init__(self, sequence: str):
        """
        Initialize a new `AnnotationManager` instance.

        :param sequence: Sequential data to annotate. It is expected to be string but can be
                         any type with __getitem__() defined for int and slice input arguments.
        """
        self._sequence = sequence

        # Dictionary to store annotations for the whole file (aka `global` annotations)
        self._global_annotations = {}  # type: Dict[Type[Annotation], Annotation]

        # Next dictionaries are the main core of this class. The most common use-case we have in
        # style-analyzer is iterating through Token annotations in the sorted order. That is why
        # ordered dict is used.
        # `self._type_to_annotations` is the main storage of the Annotations. It is a mapping
        # from the annotation type to all annotations of this type which are stored in the
        # dictionary that is sorted by spans.
        # `self._span_to_annotations` dict is an optimization to quickly lookup all
        # `Annotation`-s that belong to the same [start, stop) span.
        self._span_to_annotations = SortedDict()  # type: SortedDict[(int, int), Dict[Type[Annotation], Annotation]]  # noqa E501
        self._type_to_annotations = {}  # type: Dict[Type[Annotation], SortedDict[(int, int), Annotation]]  # noqa E501

    sequence = property(lambda self: self._sequence)

    def __len__(self):
        """Return the size of the underlying sequence."""
        return len(self._sequence)

    def __getitem__(self, item: Union[int, slice, Tuple[int, int]]) -> str:
        """
        Get the underlying sequence item or slice for the specified range.

        :param item: index, slice or (start, stop) tuple.
        :raise KeyError: A slice with a step is given.
        :return: The corresponding part of the sequence.
        """
        if isinstance(item, tuple):
            # A (start, stop) tuple is treated as the equivalent slice.
            item = slice(*item)
        if isinstance(item, slice) and item.step is not None:
            raise KeyError("slice.step is not supported.")
        return self._sequence[item]

    def count(self, annotation_type: Type[Annotation]):
        """
        Count the number of annotations of a specific type.

        Only annotations stored per span are counted; annotations registered as global (the
        ones that cover the whole sequence) are kept in a separate storage.

        :param annotation_type: Annotation type to count.
        :raise KeyError: No non-global annotation of this type has been added.
        :return: Number of stored annotations of the given type.
        """
        return len(self._type_to_annotations[annotation_type])

    def add(self, *annotations: Annotation) -> None:
        """
        Add multiple annotations.

        :param annotations: Annotations to add, processed one by one in the given order.
        """
        for annotation in annotations:
            self._add(annotation)

    def _add(self, annotation: Annotation) -> None:
        """
        Add an annotation. Annotations of the same type may not overlap.

        An annotation that spans the whole sequence is stored as a global one. Any other
        annotation is indexed both by its span and by its type; an existing annotation with the
        same type and span is replaced.

        :param annotation: Annotation to add.
        :raise ValueError: A global annotation of the same type already exists.
        """
        annotation_type = type(annotation)
        if annotation.start == 0 and annotation.stop == len(self):
            if annotation_type in self._global_annotations:
                raise ValueError("Global annotation %s already exists" % annotation)
            self._global_annotations[annotation_type] = annotation
        else:
            # TODO(zurk): Add a check that there is no overlapping annotations of one type.
            if annotation.span not in self._span_to_annotations:
                self._span_to_annotations[annotation.span] = {}
            if annotation_type not in self._type_to_annotations:
                self._type_to_annotations[annotation_type] = SortedDict()
            # Both indexes must always be updated together to stay consistent.
            self._span_to_annotations[annotation.span][annotation_type] = annotation
            self._type_to_annotations[annotation_type][annotation.span] = annotation

    def get(self, annotation_type: Type[Annotation], span: Optional[Tuple[int, int]] = None,
            ) -> Annotation:
        """
        Return an annotation for the given span and type.

        Looking for an exact match only.

        :param annotation_type: Annotation type to get.
        :param span: Annotation span (range) to get. If span is not specified it returns an
                     annotation that covers all content (aka global annotation).
        :raise KeyError: There is no annotation of this type for the requested span.
        :return: Requested `Annotation`.
        """
        if span is None:
            return self._global_annotations[annotation_type]
        check_span(*span)
        return self._type_to_annotations[annotation_type][span]

    def iter_annotations(self, annotation_type: Type[Annotation],
                         *additional: Type[Annotation],
                         start_offset: Optional[int] = None) -> Iterator[AnnotationsSpan]:
        """
        Iterate through annotations with specified type.

        Iteration goes through `annotation_type`. It is additionally annotated with annotations
        specified in `additional`. `additional` can't be empty. If you need to iterate through
        one annotation only use `AnnotationManager.iter_annotation()`.

        :param annotation_type: Type of annotation to iterate through.
        :param additional: Additional annotations that should be added to the main one.
        :param start_offset: Start to iterate from the specific offset.
                             Can be used as a key argument only.
        :raise ValueError: `additional` is empty.
        :return: Iterator through annotations of requested types.
        """
        if not additional:
            raise ValueError("At least one additional annotation should be specified. "
                             "If you need to iterate through only one annotation use "
                             "`iter_annotation()`.")
        types = set(additional) | {annotation_type}
        for annotation in self.iter_annotation(annotation_type, start_offset=start_offset):
            # Annotations with exactly the same span are taken as is.
            same_span_annotations = self._span_to_annotations[annotation.span]
            same_span_types = set(same_span_annotations)
            annotations = {}
            # Requested types without an exact span match are searched by overlap.
            for missing_type in types - same_span_types:
                try:
                    annotations[missing_type] = self.find_overlapping_span(
                        missing_type, *annotation.span)
                except NoIntersection:
                    # A requested type that has nothing for this span is simply left out.
                    pass
            annotations.update({kind: same_span_annotations[kind]
                                for kind in types & same_span_types})
            yield AnnotationsSpan(*annotation.span, annotations)

    def iter_annotation(self, annotation_type: Type[Annotation], *,
                        start_offset: Optional[int] = None) -> Iterator[Annotation]:
        """
        Iterate through a specific type of annotation.

        If you need to iterate through several annotations use
        `AnnotationManager.iter_annotations()` instead.

        :param annotation_type: Type of annotation to iterate through.
        :param start_offset: Start to iterate from the specific offset.
                             Can be used as a key argument only.
        :raise KeyError: No non-global annotation of this type has been added.
        :return: Iterator through annotations of requested type.
        """
        if start_offset is not None:
            check_offset(start_offset, "start_offset")
        annotation_layer = self._type_to_annotations[annotation_type]
        search_from = 0
        if start_offset is not None:
            # Skip every annotation whose span sorts before (start_offset, start_offset).
            search_from = annotation_layer.bisect_left((start_offset, start_offset))
        # Slicing the values view copies the annotations, so the iteration works on a
        # snapshot of the layer.
        yield from annotation_layer.values()[search_from:]

    def find_overlapping_span(self, annotation_type: Type[Annotation],
                              start: int, stop: int) -> Annotation:
        """
        Find an annotation of the given type that intersects the interval [start, stop).

        :param annotation_type: Annotation type to look for.
        :param start: Start of the search interval.
        :param stop: End of the search interval. Stop point itself is excluded.
        :raise NoIntersection: There is no such annotation that overlaps with the given
                               interval.
        :return: `Annotation` of the requested type.
        """
        try:
            annotation_layer = self._type_to_annotations[annotation_type]
        except KeyError:
            # The KeyError is an implementation detail, so it is dropped from the traceback.
            raise NoIntersection(
                "There is no annotation layer %s" % annotation_type) from None
        check_span(start, stop)
        # The annotation right before the insertion point may start earlier and still reach
        # into the interval, so the search window begins one position to the left.
        search_start = max(0, annotation_layer.bisect_left((start, start)) - 1)
        search_stop = annotation_layer.bisect_right((stop, stop))
        for span in annotation_layer.islice(search_start, search_stop):
            if self._check_spans_overlap(start, stop, *span):
                # assuming that there is only one such annotation
                return annotation_layer[span]
        raise NoIntersection("There is no annotation %s from %d to %d" % (annotation_type, start,
                                                                          stop))

    @classmethod
    def _check_spans_overlap(cls, start1: int, stop1: int, start2: int, stop2: int) -> bool:
        """
        Check if two spans have at least 1 common point in their overlap.

        Span 1 is [start1, stop1). `stop1` itself is excluded.
        Span 2 is [start2, stop2). `stop2` itself is excluded.

        Everywhere in next examples x < y < z.
        Corner cases explained:
        1. [x, y) and [y, z) have no overlap because y is excluded from the 1st interval.
        2. 0-intervals:
            2.1. [y, y) and [y, y) are overlapping because it is the same interval.
            2.2. [y, y) and [y, z) have no overlap.
            2.3. [x, y) and [y, y) have no overlap.
            2.4. [x, z) and [y, y) are overlapping because [x, z) fully covers y point.

        Despite the fact that overlapping rules are defined for 0-intervals it is not
        recommended to rely on them. If you want to get an additional annotation of the
        0-interval annotation link one annotation to another. See `TokenAnnotation` as an
        example.

        :param start1: Start offset of the first span.
        :param stop1: Stop offset of the first span.
        :param start2: Start offset of the second span.
        :param stop2: Stop offset of the second span.
        :return: True if spans are overlapping else False.
        """
        first_is_empty = start1 == stop1
        second_is_empty = start2 == stop2
        if first_is_empty and second_is_empty:
            return start1 == start2
        if first_is_empty:
            # An empty span overlaps only when it lies strictly inside the other span.
            return start2 < start1 < stop2
        if second_is_empty:
            return start1 < start2 < stop1
        # Two non-empty half-open spans overlap iff each one starts before the other ends.
        return start1 < stop2 and start2 < stop1

    @classmethod
    def from_file(cls, file: UnicodeFile) -> "AnnotationManager":
        """
        Create `AnnotationManager` instance from `UnicodeFile`.

        :param file: `file.content` will be used as data to be annotated with
                     `file.path`, `file.language` and `file.uast`.
        :return: new AnnotationManager instance.
        """
        raw_data = file.content
        # `cls` instead of a hard-coded class name keeps this constructor usable by subclasses.
        annotated_data = cls(raw_data)
        size = len(raw_data)
        annotated_data.add(PathAnnotation(0, size, file.path))
        annotated_data.add(UASTAnnotation(0, size, file.uast))
        annotated_data.add(LanguageAnnotation(0, size, file.language))
        return annotated_data
Oops, something went wrong.