Skip to content

Commit

Permalink
fixed CP=1 case
Browse files Browse the repository at this point in the history
Signed-off-by: Lifu Zhang <[email protected]>
  • Loading branch information
tomlifu committed Nov 9, 2024
1 parent d3e9354 commit 02bccd7
Showing 1 changed file with 8 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -650,14 +650,14 @@ def collate_fn(self, batch):
cu_seqlens[-1].append(max_length)

for i in range(len(item['seq_boundaries']) - 1):
# since the data is prepadded with tokenizer's eos_id, we can find out the index of all the eos_id
eos_idx = np.where(
np.array(item['input_ids'][item['seq_boundaries'][i] : item['seq_boundaries'][i + 1] - 1])
== self.tokenizer.eos_id
)

# The second eos_id index marks the length of the original unpadded sequence
seqlen_unpadded = eos_idx[0][0] + 1
current_seq = item['input_ids'][item['seq_boundaries'][i] : item['seq_boundaries'][i + 1] - 1]

# The data may have been pre-padded with the tokenizer's eos_id, so locate every eos_id position in this slice.
eos_idx = np.where(np.array(current_seq) == self.tokenizer.eos_id)

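# If the sequence was pre-padded (cp_size > 1), the first eos_id index + 1 (i.e. the position of the
# second eos_id) is the length of the original unpadded sequence. Otherwise there is no extra padding
# and the full slice length is used.
# NOTE(review): `.any()` checks for a non-zero index, not for a non-empty match, so a lone eos_id at
# position 0 also takes the fallback branch -- confirm whether `eos_idx[0].size > 0` was intended.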
seqlen_unpadded = eos_idx[0][0] + 1 if eos_idx[0].any() else len(current_seq)
cu_seqlens_unpadded[-1].append(cu_seqlens_unpadded[-1][-1] + seqlen_unpadded)

# if extra paddings are added in the packed sequence, they can't be counted as
Expand Down

0 comments on commit 02bccd7

Please sign in to comment.