fineweb.py
"""
Downloads and tokenises the fineweb-edu dataset, saving the data shards to disk.
"""
import os
import tqdm
import tiktoken
import datasets
import multiprocessing
import numpy as np

shard_size = int(1e8) # 100M tokens per shard, total of 100 shards

# Create the cache directory for fineweb if it doesn't exist
CACHE_DIR = os.path.join(os.path.dirname(__file__), 'cache/fineweb_edu_10B')
os.makedirs(CACHE_DIR, exist_ok=True)

tokeniser = tiktoken.get_encoding('gpt2') # or GPTTokeniser('gpt.tkn')

# Download the 10B FineWeb-Edu dataset from Hugging Face
dataset = datasets.load_dataset('HuggingFaceFW/fineweb-edu', name='sample-10BT', split='train')

def tokenise(doc: dict) -> np.ndarray:
    """Tokenise a document."""
    tokens = [tokeniser._special_tokens['<|endoftext|>']]
    tokens.extend(tokeniser.encode_ordinary(doc['text']))
    tokens = np.array(tokens)
    assert (0 <= tokens).all() and (tokens < 2**16).all(), 'Token dictionary exceeds bounds for uint16'
    return tokens.astype(np.uint16)
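
# A quick illustrative example (not part of the original script): for a document
# such as {'text': 'hello world'}, tokenise() returns a uint16 array that starts
# with 50256 (the gpt2 <|endoftext|> id) followed by the BPE ids of the text, so
# consecutive documents within a shard are separated by the <|endoftext|> delimiter.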

# Tokenise all documents and write output shards, each of shard_size tokens (last shard has remainder)
nprocs = max(1, os.cpu_count() - 2) # Use all but 2 CPU cores
with multiprocessing.Pool(nprocs) as pool:
    shard_idx = 0
    all_tokens = np.empty((shard_size,), dtype=np.uint16) # Pre-allocate the maximum shard size
    token_count = 0
    prog_bar = None
    for tokens in pool.imap(tokenise, dataset, chunksize=16):
        if token_count + len(tokens) < shard_size: # Shard not full yet
            # Append tokens to the current shard
            all_tokens[token_count : token_count + len(tokens)] = tokens
            token_count += len(tokens)
            # Update progress bar
            if prog_bar is None:
                prog_bar = tqdm.tqdm(total=shard_size, unit='tok', desc=f'Shard {shard_idx:04d}')
            prog_bar.update(len(tokens))
        else:
            # Write the current shard and reset for the next one
            split = 'val' if shard_idx == 0 else 'train'
            filename = os.path.join(CACHE_DIR, f'fineweb_edu_{split}_{shard_idx:04d}')
            # Pack document into this shard, remaining tokens go to next one
            remainder = shard_size - token_count
            prog_bar.update(remainder)
            all_tokens[token_count : token_count + remainder] = tokens[:remainder]
            np.save(filename, all_tokens)
            shard_idx += 1
            prog_bar = None
            # Start a new shard with the remaining tokens
            all_tokens[0 : len(tokens) - remainder] = tokens[remainder:]
            token_count = len(tokens) - remainder

    # Write remaining tokens to the last shard
    if token_count != 0:
        split = 'val' if shard_idx == 0 else 'train'
        filename = os.path.join(CACHE_DIR, f'fineweb_edu_{split}_{shard_idx:04d}')
        np.save(filename, all_tokens[:token_count])
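
# A minimal sketch of how the saved shards could be read back for training. This is
# not part of the original script; the shard listing, batch size B and context length T
# below are illustrative assumptions.
#
# def load_shard(path: str) -> np.ndarray:
#     """Memory-map a shard of uint16 token ids without loading it all into RAM."""
#     return np.load(path, mmap_mode='r')
#
# train_shards = sorted(os.path.join(CACHE_DIR, f) for f in os.listdir(CACHE_DIR) if '_train_' in f)
# tokens = load_shard(train_shards[0])
# B, T = 8, 1024 # batch size and context length
# buf = tokens[:B * T + 1].astype(np.int64) # widen for downstream embedding/loss ops
# x = buf[:-1].reshape(B, T) # inputs
# y = buf[1:].reshape(B, T)  # next-token targets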