-
Notifications
You must be signed in to change notification settings - Fork 17
/
chunk4turk.py
34 lines (30 loc) · 1.17 KB
/
chunk4turk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import argparse, csv, emoji
# Mechanical Turk can't handle emojis
def remove_emojis(s):
    """Return the string *s* with emoji removed.

    Uses ``emoji.replace_emoji`` when the installed ``emoji`` package
    provides it, so that emoji made of several code points (flags, ZWJ
    sequences, skin-tone variants) are stripped as a unit instead of
    leaving fragments behind.  On older releases that lack it, falls back
    to filtering individual characters against ``emoji.UNICODE_EMOJI``.
    """
    replace_emoji = getattr(emoji, 'replace_emoji', None)
    if replace_emoji is not None:
        return replace_emoji(s, replace='')

    # Legacy path.  Some releases nest the table per language
    # ({'en': {...}, ...}); a bare membership test against the outer dict
    # would then never match an emoji, so unwrap the English table if it
    # is present and otherwise use the flat table directly.
    table = emoji.UNICODE_EMOJI
    table = table.get('en', table)
    return ''.join(c for c in s if c not in table)
def main():
    """Pack a one-article-per-line text file into a fixed-width CSV.

    Command line: ``input_path output_path [-b CHUNK_SIZE]``.

    The output starts with a header row ``OUT0 .. OUT<b-1>``.  Each
    following row holds ``b`` consecutive input lines, one per column,
    after stripping surrounding whitespace, removing the literal
    ``<|endoftext|>`` marker and removing emoji.  Every field is quoted.

    Only complete rows are written: if the number of input lines is not a
    multiple of ``b`` the trailing remainder is left out (and reported on
    stdout) so that every row has a value for every header column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_path", type=str,
                        help="one-article per line file")
    parser.add_argument("output_path", type=str,
                        help="file to write output CSV to")
    parser.add_argument('-b', type=int, default=25,
                        help='chunk size')
    args = parser.parse_args()
    # A chunk size of 0 used to crash with ZeroDivisionError and negative
    # sizes produced meaningless rows; reject both up front.
    if args.b < 1:
        parser.error('chunk size (-b) must be a positive integer')
    print(args)

    # newline='' is what the csv module asks for on files it writes to
    # (otherwise rows can be separated by blank lines on some platforms).
    # The encoding is pinned so emoji-bearing input is decoded the same
    # way regardless of the platform's locale default.
    with open(args.input_path, 'r', encoding='utf-8') as input_file, \
            open(args.output_path, 'w', encoding='utf-8',
                 newline='') as output_file:
        writer = csv.writer(output_file, quoting=csv.QUOTE_ALL)
        writer.writerow(['OUT%d' % i for i in range(args.b)])

        cols = []
        for line in input_file:
            cols.append(
                remove_emojis(line.strip().replace('<|endoftext|>', '')))
            # Emit the row as soon as it is full.
            if len(cols) == args.b:
                writer.writerow(cols)
                cols = []

        # Anything still buffered is an incomplete row; it is not written,
        # but say so rather than discarding input silently.
        if cols:
            print('dropped %d trailing line(s) that did not fill a chunk of %d'
                  % (len(cols), args.b))
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()