From 45eb4229dba59f31d3a739fdd034794f11a157c2 Mon Sep 17 00:00:00 2001
From: Tomasz Grzegorzek <tomaszgrzegorzek@outlook.com>
Date: Sun, 17 May 2020 23:35:20 +0200
Subject: [PATCH 1/2] Implement text classification dataset with fasttext
 conversion

---
 doccano_transformer/datasets.py | 58 ++++++++++++++++++++++++++++-----
 doccano_transformer/examples.py | 36 +++++++++++++++++---
 doccano_transformer/utils.py    | 13 ++++++--
 3 files changed, 92 insertions(+), 15 deletions(-)

diff --git a/doccano_transformer/datasets.py b/doccano_transformer/datasets.py
index e4410ac..6ce9a14 100644
--- a/doccano_transformer/datasets.py
+++ b/doccano_transformer/datasets.py
@@ -2,17 +2,19 @@
 import json
 from typing import Any, Callable, Iterable, Iterator, List, Optional, TextIO
 
-from doccano_transformer.examples import Example, NERExample
+from doccano_transformer.examples import (Example, NERExample,
+                                          TextClassificationExample)
+from doccano_transformer.utils import read_labels
 
 
 class Dataset:
     def __init__(
-        self,
-        filepath: str,
-        encoding: Optional[str] = 'utf-8',
-        transformation_func: Optional[Callable[[TextIO], Iterable[Any]]] = None
+            self,
+            filepath: str,
+            encoding: Optional[str] = 'utf-8',
+            transformation_func: Optional[
+                Callable[[TextIO], Iterable[Any]]] = None
     ) -> None:
-
         self.filepath = filepath
         self.encoding = encoding
         self.transformation_func = transformation_func or (lambda x: x)
@@ -29,7 +31,7 @@ def from_jsonl(
 
     @classmethod
     def from_csv(
-        cls, filepath: str, encoding: Optional[str] = 'utf-8'
+            cls, filepath: str, encoding: Optional[str] = 'utf-8'
     ) -> 'Dataset':
         return cls(filepath, encoding, csv.DictReader)
 
@@ -48,13 +50,51 @@ class NERDataset(TaskDataset):
     example_class = NERExample
 
     def to_conll2003(
-        self, tokenizer: Callable[[str], List[str]]
+            self, tokenizer: Callable[[str], List[str]]
     ) -> Iterator[str]:
         for example in self:
             yield from example.to_conll2003(tokenizer)
 
     def to_spacy(
-        self, tokenizer: Callable[[str], List[str]]
+            self, tokenizer: Callable[[str], List[str]]
     ) -> Iterator[dict]:
         for example in self:
             yield from example.to_spacy(tokenizer)
+
+
+class TextClassificationDataset(TaskDataset):
+    example_class = TextClassificationExample
+
+    def __init__(self, filepath: str, labels_filepath: str, encoding: Optional[
+        str] = 'utf-8',
+                 transformation_func: Optional[
+                     Callable[[TextIO], Iterable[Any]]] = None) -> None:
+        """
+        Dataset for converting text classification annotations
+        Args:
+            filepath: path to exported annotations
+            labels_filepath: path to exported label metadata
+            encoding: encoding of the annotation file
+            transformation_func: additional transformation function
+        """
+        super().__init__(filepath, encoding, transformation_func)
+        self.labels_filepath = labels_filepath
+        self.labels = read_labels(self.labels_filepath)
+
+    @classmethod
+    def from_jsonl(
+            cls, filepath: str, encoding: Optional[str] = 'utf-8', **kwargs
+    ) -> 'Dataset':
+        labels_filepath = kwargs.get('labels_filepath')
+        return cls(filepath, labels_filepath, encoding, lambda f: map(
+            json.loads, f))
+
+    def __iter__(self) -> Iterator[Example]:
+        for raw in super(TextClassificationDataset, self).__iter__():
+            example = self.example_class(raw, self.labels)
+            example.is_valid(raise_exception=True)
+            yield example
+
+    def to_fasttext(self) -> Iterator[str]:
+        for example in self:
+            yield example.to_fasttext()
diff --git a/doccano_transformer/examples.py b/doccano_transformer/examples.py
index cdf7abc..e898573 100644
--- a/doccano_transformer/examples.py
+++ b/doccano_transformer/examples.py
@@ -1,5 +1,6 @@
+import os
 from collections import defaultdict
-from typing import Callable, Iterator, List, Optional
+from typing import Callable, Dict, Iterator, List, Optional
 
 from spacy.gold import biluo_tags_from_offsets
 
@@ -58,7 +59,7 @@ def is_valid(self, raise_exception: Optional[bool] = True) -> bool:
         return True
 
     def to_conll2003(
-        self, tokenizer: Callable[[str], List[str]]
+            self, tokenizer: Callable[[str], List[str]]
     ) -> Iterator[dict]:
         all_tokens, all_token_offsets = self.get_tokens_and_token_offsets(
             tokenizer)
@@ -79,7 +80,7 @@ def to_conll2003(
             yield {'user': user, 'data': ''.join(lines)}
 
     def to_spacy(
-        self, tokenizer: Callable[[str], List[str]]
+            self, tokenizer: Callable[[str], List[str]]
     ) -> Iterator[dict]:
         all_tokens, all_token_offsets = self.get_tokens_and_token_offsets(
             tokenizer)
@@ -101,7 +102,7 @@ def to_spacy(
                 tags = biluo_tags_from_offsets(tokens, label)
                 tokens_for_spacy = []
                 for i, (token, tag, offset) in enumerate(
-                    zip(tokens, tags, offsets)
+                        zip(tokens, tags, offsets)
                 ):
                     tokens_for_spacy.append(
                         {'id': i, 'orth': str(token), 'ner': tag}
@@ -109,3 +110,30 @@ def to_spacy(
                 sentences.append({'tokens': tokens_for_spacy})
             data['sentences'] = sentences
             yield {'user': user, 'data': {'id': self.id, 'paragraphs': [data]}}
+
+
+class TextClassificationExample(Example):
+
+    def __init__(self, raw, labels: Dict) -> None:
+        """
+        Example class for text classification projects
+        Args:
+            raw: example in a for of dict
+            labels: mapping of labels from id to text
+        """
+        self.raw = raw
+        self.labels = labels
+        self.annotations = self.raw['annotations']
+
+    def is_valid(self, raise_exception: Optional[bool] = True) -> None:
+        return True
+
+    def _append_label_text(self, label_id: int) -> str:
+        return f'__label__{self.labels[label_id]} '
+
+    def _create_label_tags(self):
+        return ''.join(self._append_label_text(annotation['label'])
+                       for annotation in self.annotations)
+
+    def to_fasttext(self):
+        return self._create_label_tags() + self.raw['text'] + os.linesep
diff --git a/doccano_transformer/utils.py b/doccano_transformer/utils.py
index 3d5d150..fd11f7e 100644
--- a/doccano_transformer/utils.py
+++ b/doccano_transformer/utils.py
@@ -1,4 +1,5 @@
-from typing import TYPE_CHECKING, List, Optional, Tuple
+import json
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 if TYPE_CHECKING:
     from doccano_transformer.datasets import Dataset
@@ -99,7 +100,7 @@ def __str__(self):
 
 
 def convert_tokens_and_offsets_to_spacy_tokens(
-    tokens: List[str], offsets: List[int]
+        tokens: List[str], offsets: List[int]
 ) -> List[Token]:
     """Convert tokens and offsets to the list of SpaCy compatible object.
 
@@ -120,3 +121,11 @@ def convert_tokens_and_offsets_to_spacy_tokens(
     for i, (token, offset) in enumerate(zip(tokens, offsets)):
         spacy_tokens.append(Token(token, offset, i))
     return spacy_tokens
+
+
+def read_labels(labels_filepath: str) -> Dict:
+    labels_doccano = json.load(open(labels_filepath, mode='r'))
+    labels_mapping = {}
+    for label in labels_doccano:
+        labels_mapping[label['id']] = label['text']
+    return labels_mapping

From 3b6bbd7ae489062ee46d2fae8b2e42edfb3e6326 Mon Sep 17 00:00:00 2001
From: Tomasz Grzegorzek <tomaszgrzegorzek@outlook.com>
Date: Sun, 17 May 2020 23:38:45 +0200
Subject: [PATCH 2/2] Refactor TextClassification Dataset/Example docstring

---
 doccano_transformer/datasets.py | 4 ++--
 doccano_transformer/examples.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/doccano_transformer/datasets.py b/doccano_transformer/datasets.py
index 6ce9a14..c93a85f 100644
--- a/doccano_transformer/datasets.py
+++ b/doccano_transformer/datasets.py
@@ -69,8 +69,8 @@ def __init__(self, filepath: str, labels_filepath: str, encoding: Optional[
         str] = 'utf-8',
                  transformation_func: Optional[
                      Callable[[TextIO], Iterable[Any]]] = None) -> None:
-        """
-        Dataset for converting text classification annotations
+        """Dataset for converting text classification annotations
+
         Args:
             filepath: path to exported annotations
             labels_filepath: path to exported label metadata
diff --git a/doccano_transformer/examples.py b/doccano_transformer/examples.py
index e898573..87ca2f7 100644
--- a/doccano_transformer/examples.py
+++ b/doccano_transformer/examples.py
@@ -115,8 +115,8 @@ def to_spacy(
 class TextClassificationExample(Example):
 
     def __init__(self, raw, labels: Dict) -> None:
-        """
-        Example class for text classification projects
+        """Example class for text classification projects
+
         Args:
             raw: example in a for of dict
             labels: mapping of labels from id to text