diff --git a/examples/language_model/bigbird/run_classifier.py b/examples/language_model/bigbird/run_classifier.py index 21c30f0ada8dd..9e3c22e114e40 100644 --- a/examples/language_model/bigbird/run_classifier.py +++ b/examples/language_model/bigbird/run_classifier.py @@ -74,7 +74,7 @@ def _tokenize(text): input_ids = np.array(input_ids).astype('int64') return input_ids - def _collate_data(data, stack_fn=Stack()): + def _collate_data(data, stack_fn=Stack(dtype='int64')): num_fields = len(data[0]) out = [None] * num_fields out[0] = stack_fn( diff --git a/examples/language_model/bigbird/run_pretrain.py b/examples/language_model/bigbird/run_pretrain.py index 53df5c957ffb2..5b24579642d7c 100644 --- a/examples/language_model/bigbird/run_pretrain.py +++ b/examples/language_model/bigbird/run_pretrain.py @@ -48,7 +48,7 @@ def __init__(self, self.tokenizer = tokenizer self.max_encoder_length = max_encoder_length self.max_pred_length = max_pred_length - input_file = open(input_file, "r") + input_file = open(input_file, "r", encoding="utf-8") self.lines = input_file.readlines() self.vocab_size = tokenizer.vocab_size