diff --git a/README.md b/README.md
index a8345a9c4..bab729dd7 100644
--- a/README.md
+++ b/README.md
@@ -234,7 +234,7 @@ $ pip install fuse-med-ml[all,examples]
   * [**Mortality prediction for ICU patients**](./fuse_examples/multimodality/ehr_transformer) - Example of EHR transformer applied to the data of Intensive Care Units patients for in-hospital mortality prediction. The dataset is from [PhysioNet Computing in Cardiology Challenge (2012)](https://physionet.org/content/challenge-2012/1.0.0/)
 * Pre-training
   * [**Medical Imaging Pre-training and Downstream Task Validation**](./fuse_examples/imaging/oai_example) - pre-training a model on 3D MRI medical imaging and then using it for classification and segmentation downstream tasks.
-
+
 ## Walkthrough template
 * [**Walkthrough Template**](./fuse/dl/templates/walkthrough_template.py) - includes several TODO notes, marking the minimal scope of code required to get your pipeline up and running. The template also includes useful explanations and tips.
diff --git a/fuse/data/tokenizers/modular_tokenizer/op.py b/fuse/data/tokenizers/modular_tokenizer/op.py
index 0f2b85697..9ccf6650a 100644
--- a/fuse/data/tokenizers/modular_tokenizer/op.py
+++ b/fuse/data/tokenizers/modular_tokenizer/op.py
@@ -116,7 +116,7 @@ def get_max_token_id(self) -> Tuple[str, int]:
     def get_min_max_sentinels(
         self,
         sentinel_prefix: Optional[str] = "<SENTINEL_ID_",
         integer_find_regex: str = r"\d{1,}",
     ) -> Tuple[int, int]:
         """
         returns a Tuple [min encountered sentinel name, max encountered sentinel name]
@@ -186,6 +186,22 @@
         """
         return self._tokenizer.get_expected_max_len(override_max_len=override_max_len)
 
+    def add_new_special_tokens(self, new_special_tokens: list[str]) -> int:
+        """Add new special tokens if they are not already in the tokenizer.
+        Skips special tokens that already exist.
+
+        Args:
+            new_special_tokens (list[str]): the tokens to add
+        Returns:
+            `int`: the number of tokens that were added to the vocabulary
+
+        Will raise an exception if any of the tokens is already present in the tokenizer as a _regular_ token.
+        """
+
+        tokenizer = self._tokenizer
+        num_new_tokens = tokenizer.add_special_tokens(new_special_tokens)
+        return num_new_tokens
+
     def __call__(
         self,
         sample_dict: NDict,
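
For orientation, here is a minimal, hypothetical usage sketch of the `add_new_special_tokens` method introduced above. The `ModularTokenizerOp` class name, the `tokenizer_path` constructor argument, and the token strings are illustrative assumptions and are not taken from this diff:

```python
from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp

# Load a previously built modular tokenizer (path and constructor argument are placeholders).
tokenizer_op = ModularTokenizerOp(tokenizer_path="/path/to/modular_tokenizer")

# Register extra special tokens; tokens that already exist as special tokens are skipped,
# and the call returns how many entries were actually added to the vocabulary.
num_added = tokenizer_op.add_new_special_tokens(["<TASK_A>", "<TASK_B>"])
print(f"added {num_added} new special tokens")
```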