Merge branch 'AntNLP:main' into main
Showing 8 changed files with 411 additions and 8 deletions.
## Tokenizer

To build a tokenizer, we need to perform three steps:
+ pre-process: split words on whitespace and punctuation, or with tools like [spaCy](https://spacy.io/) and [Moses](https://www.statmt.org/moses/?n=Development.GetStarted) (a rough splitter is sketched after this list).
+ train: build the vocabulary on the corpus.
+ encode: output sub-words according to the vocabulary.
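As a rough illustration of the pre-processing step only, a minimal whitespace-and-punctuation splitter could look like the sketch below. It is a simplified stand-in for spaCy or Moses, not what those tools actually do, and the function name `pre_tokenize` is made up.

```python
import re

def pre_tokenize(text: str) -> list:
    # Keep each run of word characters as a token and emit every
    # punctuation mark as its own token; a real pipeline would use
    # spaCy or Moses here instead of a regular expression.
    return re.findall(r"\w+|[^\w\s]", text)

print(pre_tokenize("Hello, y'all!"))  # ['Hello', ',', 'y', "'", 'all', '!']
```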
### Resources & References
+ Papers:
  + [Byte Pair Encoding (BPE)](https://aclanthology.org/P16-1162.pdf)
  + [WordPiece](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf)
  + [SentencePiece Unigram](https://arxiv.org/pdf/1804.10959.pdf)
+ Code:
  + [huggingface/tokenizers](https://github.com/huggingface/tokenizers) (Rust implementation)
  + [BPE](https://github.com/rsennrich/subword-nmt/tree/master/subword_nmt) (Python implementation)
  + [BPE (light version)](https://github.com/lovit/WordPieceModel/blob/master/wordpiecemodel/bpe.py) (Python implementation)
  + [SentencePiece](https://github.com/google/sentencepiece) (C++ implementation)
  + [WordPiece](https://github.com/google-research/bert/blob/master/tokenization.py) (Python implementation, without training code)
+ Blogs:
  + https://huggingface.co/docs/transformers/tokenizer_summary
  + https://medium.com/@makcedward/how-subword-helps-on-your-nlp-model-83dd1b836f46
  + https://towardsdatascience.com/wordpiece-subword-based-tokenization-algorithm-1fbd14394ed7
### Implementation
For clarity, we assume that the corpus has been pre-processed with spaCy. The [`Tokenizer` class](tokenizer.py) is then structured as follows (a sketch of BPE-style training follows this list):
+ `__init__`: initialize the tokenizer.
+ `train`: build the vocabulary.
+ `encode`: output sub-words according to the vocabulary.
+ `save`: save the tokenizer to a file.
+ `from_file`: instantiate a new tokenizer from a file.
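To make the `train` step concrete, the sketch below shows the core of BPE merge learning: count adjacent symbol pairs over the frequency-weighted words, merge the most frequent pair, and repeat. It is only an illustration under simplifying assumptions, not the reference implementation; the name `learn_bpe_merges` is made up, and end-of-word markers and the `##` continuation prefix are omitted for brevity.

```python
from collections import Counter
from typing import Dict, List, Tuple

def learn_bpe_merges(word_freqs: Dict[str, int], num_merges: int) -> List[Tuple[str, str]]:
    # word_freqs maps a whitespace-split word to its corpus frequency,
    # e.g. {"low": 5, "lower": 2, "newest": 6, "widest": 3}.
    # Each word starts out as a sequence of single characters.
    corpus = {tuple(word): freq for word, freq in word_freqs.items()}
    merges = []
    for _ in range(num_merges):
        # Count adjacent symbol pairs, weighted by word frequency.
        pair_counts = Counter()
        for symbols, freq in corpus.items():
            for pair in zip(symbols, symbols[1:]):
                pair_counts[pair] += freq
        if not pair_counts:
            break
        # Merge the most frequent pair everywhere it occurs.
        best = max(pair_counts, key=pair_counts.get)
        merges.append(best)
        new_corpus = {}
        for symbols, freq in corpus.items():
            out, i = [], 0
            while i < len(symbols):
                if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == best:
                    out.append(symbols[i] + symbols[i + 1])
                    i += 2
                else:
                    out.append(symbols[i])
                    i += 1
            new_corpus[tuple(out)] = freq
        corpus = new_corpus
    return merges

# e.g. learn_bpe_merges({"low": 5, "lower": 2, "newest": 6, "widest": 3}, 10)
```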
### Verification
##### Our Output
```python
import random
import numpy as np

# Step 1: Set the random seed
SEED = xxx
random.seed(SEED)
np.random.seed(SEED)

# Step 2: Prepare the corpus
# one line per sentence, words are split by whitespace
corpus_file = "xxx"

# Step 3: Build the vocabulary
tokenizer = Tokenizer(
    vocab=None,
    unk_token="[UNK]",
    ...
)
tokenizer.train(
    files=[corpus_file],
    vocab_size=30000,
    ...
)
tokenizer.save("tokenizer.json")

# Step 4: Tokenize
tokenizer = Tokenizer.from_file("tokenizer.json")
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
```
##### Hugging Face's Output
```python
import random
import numpy as np

# Step 1: Set the random seed
# IMPORTANT! SEED must be the same as ours!
SEED = xxx
random.seed(SEED)
np.random.seed(SEED)

# Step 2: Prepare the corpus
# one line per sentence, words are split by whitespace
corpus_file = "xxx"

# Step 3: Build the vocabulary, here taking BPE as an example
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Keep the same hyper-parameters as ours
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train([corpus_file], trainer)

tokenizer.save("tokenizer.json")

# Step 4: Tokenize
tokenizer = Tokenizer.from_file("tokenizer.json")
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
```
Lastly, to verify the correctness of our implementation, we compare Hugging Face's output with ours, as in the sketch below.
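A minimal comparison might look like the following sketch. Our `encode` returns a `List[str]`, whereas Hugging Face's `encode` returns an `Encoding` object whose `tokens` attribute holds the sub-word strings; the names `our_tokenizer` and `hf_tokenizer` are illustrative stand-ins for the two tokenizers built above.

```python
# Hypothetical names: `our_tokenizer` is the tokenizer from "Our Output",
# `hf_tokenizer` the one from "Hugging Face's Output", trained on the same
# corpus with the same seed and hyper-parameters.
sentence = "Hello, y'all! How are you 😁 ?"
ours = our_tokenizer.encode(sentence)           # List[str]
theirs = hf_tokenizer.encode(sentence).tokens   # Encoding -> list of sub-words
assert ours == theirs, f"Mismatch:\n{ours}\n{theirs}"
```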
tokenizer.py
from typing import Dict, List, Optional


class Tokenizer:
    """
    Construct a tokenizer, including training and encoding.
    """
    def __init__(self,
                 vocab: Optional[Dict[str, int]] = None,
                 unk_token: str = "[UNK]",
                 prefix: str = "##",
                 lowercase: bool = False,
                 **kwargs) -> None:
        """
        Args:
            vocab (`Dict[str, int]`, optional, defaults to `None`):
                A dictionary of string keys and their ids `{"am": 0,...}`.
            unk_token (`str`, optional, defaults to `[UNK]`):
                The unknown token to be used by the model.
            prefix (`str`, optional, defaults to `##`):
                A prefix to be used for every subword that is not a beginning-of-word.
            lowercase (`bool`, optional, defaults to `False`):
                Whether to lowercase the input.
        """

        if vocab is None:
            self.vocab = {}
        else:
            self.vocab = vocab

        pass

    def train(self,
              files: List[str],
              vocab_size: int = 30000,
              min_frequency: int = 2,
              special_tokens: List[str] = [
                  "[PAD]",
                  "[UNK]",
                  "[CLS]",
                  "[SEP]",
                  "[MASK]",
              ],
              limit_alphabet: int = 1000,
              initial_alphabet: List[str] = [],
              prefix: str = "##",
              **kwargs) -> None:
        """Build the vocabulary.
        Args:
            files (`List[str]`):
                A list of paths to the files that we should use for training.
            vocab_size (`int`, optional, defaults to `30000`):
                The size of the final vocabulary, including all tokens and the alphabet.
                Use 30000 for BPE and WordPiece, and 8000 for SentencePiece Unigram.
            min_frequency (`int`, optional, defaults to `2`):
                The minimum frequency a pair should have in order to be merged.
                Use 0 for WordPiece and SentencePiece Unigram, and 2 for BPE.
            special_tokens (`List[str]`, optional, defaults to `["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]`):
                A list of special tokens the model should know of.
            limit_alphabet (`int`, optional, defaults to `1000`):
                The maximum number of different characters to keep in the alphabet.
            initial_alphabet (`List[str]`, optional, defaults to `[]`):
                A list of characters to include in the initial alphabet, even if not
                seen in the training dataset. If a string contains more than one
                character, only the first one is kept.
            prefix (`str`, optional, defaults to `##`):
                A prefix to be used for every subword that is not a beginning-of-word.
        """
        pass

    def encode(self, sequence: str) -> List[str]:
        """Tokenize.
        Args:
            sequence (`str`):
                The raw text sequence we want to encode.
        """
        pass

    def save(self, path: str) -> None:
        """Save the tokenizer to the file at the given path.
        Args:
            path (`str`):
                A path to a file in which to save the serialized tokenizer.
        """
        pass

    @staticmethod
    def from_file(path: str) -> "Tokenizer":
        """Instantiate a new `Tokenizer` from the file at the given path.
        Args:
            path (`str`):
                A path to a local JSON file representing a previously serialized
                `Tokenizer`.
        """
        pass
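The `encode` stub above is left unimplemented. For illustration only, a greedy longest-match-first lookup in the spirit of WordPiece, using `prefix` for non-initial sub-words and assuming the input is a single pre-processed word, could look like this sketch (the name `wordpiece_encode` is made up, and this is not the reference solution):

```python
from typing import Dict, List

def wordpiece_encode(word: str, vocab: Dict[str, int],
                     unk_token: str = "[UNK]", prefix: str = "##") -> List[str]:
    # Repeatedly take the longest vocabulary entry that matches the start of
    # the remaining characters, prepending `prefix` to every non-initial piece.
    pieces, start = [], 0
    while start < len(word):
        end = len(word)
        piece = None
        while start < end:
            candidate = word[start:end]
            if start > 0:
                candidate = prefix + candidate
            if candidate in vocab:
                piece = candidate
                break
            end -= 1
        if piece is None:
            return [unk_token]  # no sub-word covers this span
        pieces.append(piece)
        start = end
    return pieces

# e.g. wordpiece_encode("unaffable", {"un": 0, "##aff": 1, "##able": 2})
# -> ['un', '##aff', '##able']
```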