-
Notifications
You must be signed in to change notification settings - Fork 0
/
split.py
executable file
·43 lines (34 loc) · 1001 Bytes
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env python3
from pickle import dump
import numpy as np
import pickle
def load(filename):
    """Load a pickled object from disk.

    Args:
        filename(str): path of the pickle file to read.
    Returns:
        The object that was stored in the file (whatever was pickled).
    """
    # NOTE(security): unpickling can execute arbitrary code, so this must
    # only be used on files produced by a trusted source.
    # The context manager closes the handle even if unpickling raises.
    with open(filename, 'rb') as source:
        return pickle.load(source)
def save_splitted(data, filename):
    """Pickle ``data`` to ``filename`` and report the path on stdout.

    Args:
        data: any picklable object to store.
        filename(str): path of the file to write; an existing file is
            overwritten because it is opened in 'wb' mode.
    Returns:
        None
    """
    # The context manager guarantees the file is flushed and closed before
    # the success message is printed.
    with open(filename, 'wb') as target:
        dump(data, target)
    print('Saved: {}'.format(filename))
    return None
# Size of the reduced dataset and of its training portion; the remaining
# N_SAMPLES - N_TRAIN rows become the test set.
N_SAMPLES = 20000
N_TRAIN = 16000

data = load('./data/english-russian.pkl')
# Keep only the first N_SAMPLES rows. The two-axis slice means `data` must
# support 2-D indexing (presumably a NumPy array of sentence pairs -- confirm
# against the script that produced the pickle).
data = data[:N_SAMPLES, :]
# Shuffle the rows in place. No seed is set, so the split differs per run.
np.random.shuffle(data)
# Split the shuffled rows into train and test portions.
train, test = data[:N_TRAIN], data[N_TRAIN:]
# Save both portions, plus the whole reduced (already shuffled) dataset.
save_splitted(train, './data/english-russian-train.pkl')
save_splitted(test, './data/english-russian-test.pkl')
save_splitted(data, './data/english-russian-reduced.pkl')