class TextClassificationDataset(TaskDataset):
    """Dataset for converting doccano text classification annotations.

    Unlike the other task datasets, a text classification export stores
    label ids in the annotations and the id -> text mapping in a separate
    labels file, so this dataset needs two input paths.
    """

    example_class = TextClassificationExample

    def __init__(
        self,
        filepath: str,
        labels_filepath: str,
        encoding: Optional[str] = 'utf-8',
        transformation_func: Optional[
            Callable[[TextIO], Iterable[Any]]] = None
    ) -> None:
        """Dataset for converting text classification annotations.

        Args:
            filepath: path to the exported annotations.
            labels_filepath: path to the exported label metadata.
            encoding: encoding of the annotation file.
            transformation_func: additional transformation function.

        Raises:
            ValueError: if ``labels_filepath`` is not provided.
        """
        # Fail fast with a clear message instead of the opaque TypeError
        # that `open(None)` would raise inside read_labels.
        if labels_filepath is None:
            raise ValueError(
                'labels_filepath is required for text classification '
                'datasets'
            )
        super().__init__(filepath, encoding, transformation_func)
        self.labels_filepath = labels_filepath
        # id -> text mapping used to resolve annotation label ids.
        self.labels = read_labels(self.labels_filepath)

    @classmethod
    def from_jsonl(
        cls, filepath: str, encoding: Optional[str] = 'utf-8', **kwargs
    ) -> 'Dataset':
        """Create a dataset from a JSONL export.

        Args:
            filepath: path to the exported annotations (JSONL).
            encoding: encoding of the annotation file.
            **kwargs: must include ``labels_filepath``, the path to the
                exported label metadata.
        """
        labels_filepath = kwargs.get('labels_filepath')
        return cls(
            filepath, labels_filepath, encoding,
            lambda f: map(json.loads, f)
        )

    def __iter__(self) -> Iterator[Example]:
        """Yield validated examples, pairing each raw record with labels."""
        # Python-3 zero-argument super(), consistent with the rest of
        # the codebase.
        for raw in super().__iter__():
            example = self.example_class(raw, self.labels)
            example.is_valid(raise_exception=True)
            yield example

    def to_fasttext(self) -> Iterator[str]:
        """Yield each example rendered in fastText supervised format."""
        for example in self:
            yield example.to_fasttext()
class TextClassificationExample(Example):
    """Example class for text classification projects."""

    def __init__(self, raw: dict, labels: Dict) -> None:
        """Wrap one annotated record together with the label mapping.

        Args:
            raw: a single example in the form of a dict; must contain
                'annotations' (list of dicts with a 'label' id) and
                'text' keys.
            labels: mapping of labels from id to text.
        """
        self.raw = raw
        self.labels = labels
        self.annotations = self.raw['annotations']

    def is_valid(self, raise_exception: Optional[bool] = True) -> bool:
        """Text classification examples are always considered valid.

        Returns True (annotated -> bool to match the base-class contract;
        the original declared -> None while returning True).
        """
        return True

    def _label_text(self, label_id: int) -> str:
        # fastText's supervised format expects each label prefixed with
        # '__label__', followed by a separating space.
        return f'__label__{self.labels[label_id]} '

    def _create_label_tags(self) -> str:
        # Concatenate one '__label__<text> ' tag per annotation.
        return ''.join(
            self._label_text(annotation['label'])
            for annotation in self.annotations
        )

    def to_fasttext(self) -> str:
        """Render this example as one fastText supervised-format line."""
        # NOTE(review): os.linesep is '\r\n' on Windows; if the result is
        # written to a file opened in text mode the newline gets
        # translated again -- confirm '\n' is not what was intended.
        return self._create_label_tags() + self.raw['text'] + os.linesep
def read_labels(labels_filepath: str) -> Dict:
    """Read doccano label metadata and return an id -> text mapping.

    Args:
        labels_filepath: path to the exported label JSON file (a list of
            objects each carrying at least 'id' and 'text' keys).

    Returns:
        Mapping from label id to label text.
    """
    # 'with' closes the handle (the original `json.load(open(...))`
    # leaked it); encoding pinned to utf-8 for consistency with the
    # datasets' default encoding.
    with open(labels_filepath, mode='r', encoding='utf-8') as f:
        labels_doccano = json.load(f)
    return {label['id']: label['text'] for label in labels_doccano}