Add annotations

Signed-off-by: Konstantin Slavnov <[email protected]>
src-d · Apr 23, 2019 · e7ceb27 · e7ceb27
1 parent f49438c
commit e7ceb27
Show file tree

Hide file tree

Showing 12 changed files with 1,037 additions and 288 deletions.
diff --git a/lookout/style/format/annotations/__init__.py b/lookout/style/format/annotations/__init__.py
@@ -0,0 +1,7 @@
+"""
+Annotation tool.
+
+Inspired by https://uima.apache.org/d/uimafit-current/api/
+"""
+
+# TODO(zurk) move annotation module and tests to lookout-sdk-ml
diff --git a/lookout/style/format/annotations/annotated_data.py b/lookout/style/format/annotations/annotated_data.py
@@ -0,0 +1,283 @@
+from typing import Dict, Iterator, Optional, Tuple, Union, Type  # noqa F401
+
+from lookout.core.analyzer import UnicodeFile
+from sortedcontainers import SortedDict
+
+from lookout.style.format.annotations.annotations import Annotation, check_offset, check_span, \
+    LanguageAnnotation, PathAnnotation, UASTAnnotation
+
+
+class NoIntersection(Exception):
+    """
+    Raised by `AnnotationManager.find_overlapping_span()` if no annotation of the requested \
+    type overlaps the provided interval.
+
+    See documentation about `AnnotationManager.find_overlapping_span()` for more information.
+    """
+
+
+class AnnotationsSpan(dict):
+    """
+    Collection of annotations that belong to one span (or range).
+
+    Behaves like a regular `dict` and additionally remembers the span it was created for.
+    """
+
+    def __init__(self, start, stop, *args, **kwargs):
+        """
+        Initialize a new instance of `AnnotationsSpan`.
+
+        :param start: Start of the span.
+        :param stop: End of the span. Stop point itself is excluded.
+        :param args: Remaining positional arguments, forwarded to `dict.__init__()`.
+        :param kwargs: Remaining keyword arguments, forwarded to `dict.__init__()`.
+        """
+        # Validate the span before the dictionary content is populated.
+        check_span(start, stop)
+        super().__init__(*args, **kwargs)
+        self._start, self._stop = start, stop
+        self._span = (start, stop)
+
+    @property
+    def start(self):
+        """Return the start offset of the span."""
+        return self._start
+
+    @property
+    def stop(self):
+        """Return the stop offset of the span. The stop point itself is excluded."""
+        return self._stop
+
+    @property
+    def span(self):
+        """Return the span as a `(start, stop)` tuple."""
+        return self._span
+
+
+class AnnotationManager:
+    """
+    Manager of `Annotation`-s for a text, e.g. source code.
+
+    Annotations that span the whole sequence, i.e. [0, len(sequence)), are called `global` and
+    are stored separately from the rest. They are only reachable through
+    `AnnotationManager.get()` without a span: counting, iteration and overlap search work on
+    the non-global annotations only.
+
+    All the methods to work with annotated data should be placed in this class.
+    Candidates can be found here:
+    https://uima.apache.org/d/uimafit-current/api/org/apache/uima/fit/util/JCasUtil.html
+    """
+
+    def __init__(self, sequence: str):
+        """
+        Initialize a new `AnnotationManager` instance.
+
+        :param sequence: Sequential data to annotate. It is expected to be string but can be any \
+                         type with __len__() defined and __getitem__() defined for int and \
+                         slice input arguments.
+        """
+        self._sequence = sequence
+
+        # Dictionary to store annotations for the whole file (aka `global` annotations)
+        self._global_annotations = {}  # type: Dict[Type[Annotation], Annotation]
+
+        # Next dictionaries are the main core of this class. The most common use-case we have in
+        # style-analyzer is iterating through Token annotations in the sorted order. That is why
+        # ordered dict is used.
+        # `self._type_to_annotations` is the main storage of the Annotations. It is a mapping
+        # from the annotation type to all annotations of this type which are stored in the
+        # dictionary that is sorted by spans.
+        # `self._span_to_annotations` dict is an optimization to quickly lookup all
+        # `Annotation`-s that belong to the same [start, stop) span.
+        self._span_to_annotations = SortedDict()  # type: SortedDict[(int, int), Dict[Type[Annotation], Annotation]]  # noqa E501
+        self._type_to_annotations = {}  # type: Dict[Type[Annotation], SortedDict[(int, int), Annotation]]  # noqa E501
+
+    sequence = property(lambda self: self._sequence)
+
+    def __len__(self):
+        """Return the size of the underlying sequence."""
+        return len(self._sequence)
+
+    def __getitem__(self, item: Union[int, slice, Tuple[int, int]]) -> str:
+        """
+        Get the underlying sequence item or slice for the specified range.
+
+        :param item: index, slice or (start, stop) tuple.
+        :raise KeyError: A slice with a step is requested.
+        :return: The corresponding part of the sequence.
+        """
+        if isinstance(item, tuple):
+            # A (start, stop) tuple is a shortcut for the corresponding slice.
+            item = slice(*item)
+        if isinstance(item, slice) and item.step is not None:
+            raise KeyError("slice.step is not supported.")
+        return self._sequence[item]
+
+    def count(self, annotation_type: Type[Annotation]):
+        """
+        Count the number of non-global annotations of a specific type.
+
+        :param annotation_type: Annotation type to count.
+        :raise KeyError: No non-global annotation of this type has been added.
+        :return: Number of stored annotations of the requested type.
+        """
+        return len(self._type_to_annotations[annotation_type])
+
+    def add(self, *annotations: Annotation) -> None:
+        """
+        Add multiple annotations.
+
+        :param annotations: Annotations to add, processed in the given order.
+        """
+        for annotation in annotations:
+            self._add(annotation)
+
+    def _add(self, annotation: Annotation) -> None:
+        """
+        Add an annotation. Annotations of the same type may not overlap.
+
+        An annotation that covers the whole sequence is stored as a global one. Any other \
+        annotation is indexed both by its type and by its span; adding an annotation with the \
+        same type and span as an existing one replaces the existing one.
+
+        :param annotation: Annotation to add.
+        :raise ValueError: A global annotation of the same type already exists.
+        """
+        annotation_type = type(annotation)
+        if annotation.start == 0 and annotation.stop == len(self):
+            if annotation_type in self._global_annotations:
+                raise ValueError("Global annotation %s already exists" % annotation)
+            self._global_annotations[annotation_type] = annotation
+        else:
+            # TODO(zurk): Add a check that there is no overlapping annotations of one type.
+            if annotation.span not in self._span_to_annotations:
+                self._span_to_annotations[annotation.span] = {}
+            if annotation_type not in self._type_to_annotations:
+                self._type_to_annotations[annotation_type] = SortedDict()
+            self._span_to_annotations[annotation.span][annotation_type] = annotation
+            self._type_to_annotations[annotation_type][annotation.span] = annotation
+
+    def get(self, annotation_type: Type[Annotation], span: Optional[Tuple[int, int]] = None,
+            ) -> Annotation:
+        """
+        Return an annotation for the given span and type.
+
+        Looking for an exact match only.
+
+        :param annotation_type: Annotation type to get.
+        :param span: Annotation span (range) to get. If span is not specified it returns an \
+                     annotation that cover all content (aka global annotation).
+        :raise KeyError: There is no annotation of the requested type for the requested span.
+        :return: Requested `Annotation`.
+        """
+        if span is None:
+            return self._global_annotations[annotation_type]
+        else:
+            check_span(*span)
+            return self._type_to_annotations[annotation_type][span]
+
+    def iter_annotations(self, annotation_type: Type[Annotation],
+                         *additional: Type[Annotation],
+                         start_offset: Optional[int] = None) -> Iterator[AnnotationsSpan]:
+        """
+        Iterate through annotations with specified type.
+
+        Iteration goes through `annotation_type`. It is additionally annotated with annotations
+        specified in `additional`. `additional` can't be empty. If you need to iterate through \
+        one annotation only use `AnnotationManager.iter_annotation()`.
+
+        An additional annotation is taken from the same span if it exists there, otherwise the \
+        one that overlaps the span is used. Additional types without any overlapping \
+        annotation are omitted from the yielded item. Global annotations are not considered.
+
+        :param annotation_type: Type of annotation to iterate through.
+        :param additional: Additional annotations that should be added to the main one.
+        :param start_offset: Start to iterate from the specific offset. \
+                             Can be used as a key argument only.
+        :raise ValueError: `additional` is empty.
+        :return: Iterator through annotations of requested types.
+        """
+        if not additional:
+            raise ValueError("At least one additional annotation should be specified. "
+                             "If you need to iterate through only one annotation use "
+                             "`iter_annotation()`.")
+        types = set(additional) | {annotation_type}
+        for annotation in self.iter_annotation(annotation_type, start_offset=start_offset):
+            # Annotations with the same span
+            same_span_annotations = self._span_to_annotations[annotation.span]
+            same_span_annotations_type = set(same_span_annotations.keys())
+            common_types = types & same_span_annotations_type
+            missing_types = types - same_span_annotations_type
+            annotations = dict()
+            for missing_type in missing_types:
+                try:
+                    annotations[missing_type] = self.find_overlapping_span(missing_type,
+                                                                           *annotation.span)
+                except NoIntersection:
+                    # Nothing of this type overlaps the span: leave the type out of the result.
+                    pass
+            annotations.update({common_type: same_span_annotations[common_type]
+                                for common_type in common_types})
+            yield AnnotationsSpan(*annotation.span, annotations)
+
+    def iter_annotation(self, annotation_type: Type[Annotation], *,
+                        start_offset: Optional[int] = None) -> Iterator[Annotation]:
+        """
+        Iterate through a specific type of annotation.
+
+        If you need to iterate through several annotations use \
+        `AnnotationManager.iter_annotations()` instead.
+
+        :param annotation_type: Type of annotation to iterate through.
+        :param start_offset: Start to iterate from the specific offset. \
+                             Can be used as a key argument only.
+        :raise KeyError: No non-global annotation of this type has been added.
+        :return: Iterator through annotations of requested type.
+        """
+        search_from = 0
+        if start_offset is not None:
+            check_offset(start_offset, "start_offset")
+            # Index of the first annotation whose span is not less than
+            # (start_offset, start_offset) in the sorted order.
+            search_from = self._type_to_annotations[annotation_type].bisect_left(
+                (start_offset, start_offset))
+        for value in self._type_to_annotations[annotation_type].values()[search_from:]:
+            yield value
+
+    def find_overlapping_span(self, annotation_type: Type[Annotation],
+                              start: int, stop: int) -> Annotation:
+        """
+        Find an annotation of the given type that intersects the interval [start, stop).
+
+        Only non-global annotations are searched.
+
+        :param annotation_type: Annotation type to look for.
+        :param start: Start of the search interval.
+        :param stop: End of the search interval. Stop point itself is excluded.
+        :raise NoIntersection: There is no such annotation that overlaps with the given interval.
+        :return: `Annotation` of the requested type.
+        """
+        try:
+            annotation_layer = self._type_to_annotations[annotation_type]
+        except KeyError:
+            # The KeyError is an implementation detail, do not expose it as the error context.
+            raise NoIntersection("There is no annotation layer %s" % annotation_type) from None
+        check_span(start, stop)
+        # Step one position back: the annotation right before `start` in the sorted order may
+        # begin earlier and still extend into the interval.
+        search_start = max(0, annotation_layer.bisect_left((start, start)) - 1)
+        search_stop = annotation_layer.bisect_right((stop, stop))
+        for span in annotation_layer.islice(search_start, search_stop):
+            if self._check_spans_overlap(start, stop, *span):
+                # assuming that there is only one such annotation
+                return annotation_layer[span]
+        raise NoIntersection("There is no annotation %s from %d to %d" % (annotation_type, start,
+                                                                          stop))
+
+    @classmethod
+    def _check_spans_overlap(cls, start1: int, stop1: int, start2: int, stop2: int) -> bool:
+        """
+        Check if two spans have at least 1 common point in their overlap.
+
+        Span 1 is [start1, stop1). `stop1` itself is excluded.
+        Span 2 is [start2, stop2). `stop2` itself is excluded.
+
+        Everywhere in next examples x < y < z.
+        Corner cases explained:
+        1. [x, y) and [y, z) have no overlap because y is excluded from the 1st interval.
+        2. 0-intervals:
+            2.1. [y, y) and [y, y) are overlapping because it is the same interval.
+            2.2. [y, y) and [y, z) have no overlap.
+            2.3. [x, y) and [y, y) have no overlap.
+            2.4. [x, z) and [y, y) are overlapping because [x, z) fully covers y point.
+
+        Despite the fact that overlapping rules are defined for 0-intervals it is not recommended \
+        to rely on them. If you want to get an additional annotation of the 0-interval annotation \
+        link one annotation to another. See `TokenAnnotation` as an example.
+
+        :param start1: Start offset of the first span.
+        :param stop1: Stop offset of the first span.
+        :param start2: Start offset of the second span.
+        :param stop2: Stop offset of the second span.
+        :return: True if spans are overlapping else False.
+        """
+        if start1 == stop1:
+            if start2 == stop2:
+                # Two 0-intervals overlap only if they are the same point.
+                return start1 == start2
+            else:
+                # A 0-interval overlaps a regular one only if it lies strictly inside it.
+                return start2 < start1 < stop2
+        else:
+            if start2 == stop2:
+                return start1 < start2 < stop1
+            else:
+                return (start1 <= start2 < stop1 or
+                        start1 < stop2 < stop1 or
+                        start2 <= start1 < stop2)
+
+    @classmethod
+    def from_file(cls, file: UnicodeFile) -> "AnnotationManager":
+        """
+        Create `AnnotationManager` instance from `UnicodeFile`.
+
+        :param file: `file.content` will be used as data to be annotated with \
+                     `file.path`, `file.language` and `file.uast`.
+        :return: new instance of the class this method is called on.
+        """
+        raw_data = file.content
+        # Instantiate `cls` rather than `AnnotationManager` so that subclasses get an instance
+        # of their own type.
+        annotated_data = cls(raw_data)
+        annotated_data.add(PathAnnotation(0, len(raw_data), file.path))
+        annotated_data.add(UASTAnnotation(0, len(raw_data), file.uast))
+        annotated_data.add(LanguageAnnotation(0, len(raw_data), file.language))
+        return annotated_data