forked from harvardnlp/seq2seq-attn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_shrink.py
33 lines (30 loc) · 1.32 KB
/
data_shrink.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# -*- coding: utf-8 -*-
import os
import sys
import argparse
import numpy as np
import h5py
import itertools
from collections import defaultdict
def main(arguments):
    """Down-sample a parallel corpus by keeping every N-th sentence pair.

    The source and target files are read in lockstep, one line from each per
    step. A pair whose zero-based index is a multiple of ``--saveevery`` is
    written to the two output files; every ``--printevery`` pairs a progress
    line is printed. Iteration stops at the end of the shorter input file.

    The output files are opened in append mode, so running the script twice
    against the same output paths accumulates lines instead of overwriting.

    Args:
        arguments: Command-line arguments without the program name
            (typically ``sys.argv[1:]``).

    Returns:
        None. Passed to ``sys.exit`` this yields exit status 0.

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            when ``--saveevery`` / ``--printevery`` is not a positive integer.
        IOError / OSError: If any of the four files cannot be opened.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--srcfile', help="Path to source training data, ", required=True)
    parser.add_argument('--targetfile', help="Path to target training data, ", required=True)
    parser.add_argument('--srcoutputfile',
                        help="Path of the output file for the kept source sentences "
                             "(opened in append mode). ",
                        type=str, required=True)
    parser.add_argument('--targoutputfile',
                        help="Path of the output file for the kept target sentences "
                             "(opened in append mode). ",
                        type=str, required=True)
    parser.add_argument('--saveevery', help="save every #sentence", type=int, default=2)
    parser.add_argument('--printevery', help="print every #sentence", type=int, default=100000)
    args = parser.parse_args(arguments)

    # Both values are used as modulo divisors below; reject them up front with
    # a usage error rather than failing with ZeroDivisionError mid-run.
    if args.saveevery < 1:
        parser.error("--saveevery must be a positive integer")
    if args.printevery < 1:
        parser.error("--printevery must be a positive integer")

    # itertools.izip only exists on Python 2. On Python 3 the builtin zip is
    # already lazy, so fall back to it and keep the script runnable on both.
    lazy_zip = getattr(itertools, 'izip', zip)

    # Context managers guarantee that all four handles are flushed and closed,
    # even if reading or writing fails part-way through.
    with open(args.srcfile, 'r') as src_in, \
            open(args.targetfile, 'r') as targ_in, \
            open(args.srcoutputfile, "a") as filesrc, \
            open(args.targoutputfile, "a") as filetarg:
        for n, (src_orig, targ_orig) in enumerate(lazy_zip(src_in, targ_in)):
            if n % args.saveevery == 0:
                filesrc.write(src_orig)
                filetarg.write(targ_orig)
            if n % args.printevery == 0:
                print("finished {} sentences".format(n))
    print("process finished")
# Script entry point: run main() on the command-line arguments (program name
# stripped) and hand its return value to sys.exit as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)