-
Notifications
You must be signed in to change notification settings - Fork 45
/
file_util.py
executable file
·79 lines (70 loc) · 2.84 KB
/
file_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
A module with a handful of utility functions for dealing with
file I/O
"""
import codecs
import sys
import io
import os
import fileinput
# Names of the tab-separated columns of a token/word line, in file order.
COLNAMES = [
    u"ID", u"FORM", u"LEMMA", u"CPOSTAG", u"POSTAG",
    u"FEATS", u"HEAD", u"DEPREL", u"DEPS", u"MISC",
]
# Number of columns every token/word line must have.
COLCOUNT = len(COLNAMES)
# Column indices, so that e.g. cols[FORM] picks the FORM column of a line.
ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, DEPS, MISC = range(COLCOUNT)
def in_out(args, multiple_files=False):
    """Open the input/output data streams. If multiple_files is set to
    True, returns an iterator over lines. If set to False, returns an open file.
    This distinction is needed because validator.py checks the newlines property and
    needs to get the input as a file, but the other scripts just need the lines
    so they can work with several files.

    `args` an object with `input` and `output` attributes. None or "-" means
        stdin/stdout. Otherwise `input` is a file name (with multiple_files,
        whatever fileinput.input() accepts as `files`) and `output` is a
        file name.

    Returns the pair (inp, out). `inp` yields lines as unicode, decoded from
    UTF-8 with universal newlines. `out` has a write() which encodes unicode
    text to UTF-8.
    """
    def open_utf8(name, mode="r"):
        # `mode` is ignored; it is only in the signature because fileinput
        # passes it to its openhook.
        # io.open() works the same on Python 2 and 3, keeps the `newlines`
        # property, and treats only \n, \r and \r\n as line ends. A
        # codecs.getreader() stream would also break lines on characters
        # such as U+2028 or U+0085, cutting a token line in two.
        return io.open(name, "r", encoding="utf-8")

    #Decide where to get the data from
    if args.input is None or args.input == "-": #Stdin
        #closefd=False: closing inp must not close the process's stdin
        inp = io.open(0, "r", encoding="utf-8", closefd=False)
    elif multiple_files: #File names given, only the lines are needed
        inp_raw = fileinput.input(files=args.input, openhook=open_utf8)
        #fileinput reads a "-" entry straight from sys.stdin without calling
        #the hook, so byte lines may still show up (Python 2) and need decoding
        inp = (line.decode("utf-8") if isinstance(line, bytes) else line
               for line in inp_raw)
    else: #Single file name given, return the open file
        inp = open_utf8(args.input)
    #inp is now an iterator over lines, giving unicode strings

    utf8_writer = codecs.getwriter("utf-8")
    if args.output is None or args.output == "-": #stdout
        #The writer emits bytes: on Python 3 they must go to the binary
        #buffer under sys.stdout, on Python 2 sys.stdout takes them directly
        out = utf8_writer(getattr(sys.stdout, "buffer", sys.stdout))
    else: #File name given
        out = utf8_writer(open(args.output, "wb"))
    return inp, out
def print_tree(comments, tree, out):
    """Write one tree to `out`, followed by an empty separator line.

    `comments` a list of comment lines, written first, one per line.
        Nothing is written for an empty list.
    `tree` an iterable of token/word lines, each a list of column strings.
        The columns of a line are joined with tabs.
    `out` an object with a write() method which accepts unicode text.
    """
    # out.write() instead of the `print >> out` statement: the chevron form
    # only exists in Python 2 (on Python 3 it raises a TypeError), while
    # write() produces the same text on both.
    if comments:
        out.write(u"\n".join(comments) + u"\n")
    for cols in tree:
        out.write(u"\t".join(cols) + u"\n")
    out.write(u"\n") #The empty line which ends the tree
def trees(inp):
    """
    `inp` a file-like object (or any iterable) yielding lines as unicode

    Yields the input a tree at a time, as a pair (comments, lines).
    `comments` is the list of comment lines (starting with "#") collected
    for the tree. `lines` is the list of its token/word lines, each split
    on tabs into a list of COLCOUNT columns.

    Trailing whitespace is stripped from every line. An empty line ends the
    current tree. If no token/word line has been seen yet it is skipped and
    the comments collected so far are kept for the next tree.

    If a token/word line does not have COLCOUNT columns, or a line is
    neither empty, a comment, nor starts with a digit, a message is
    written to stderr and the program exits with status 1.
    """
    comments = [] #List of comment lines to go with the current tree
    lines = [] #List of token/word lines of the current tree
    for line_number, line in enumerate(inp, 1):
        line = line.rstrip()
        if not line: #empty line
            if lines: #Sentence done, yield. Skip otherwise.
                yield comments, lines
                comments = []
                lines = []
        elif line[0] == u"#":
            comments.append(line)
        elif line[0].isdigit():
            cols = line.split(u"\t")
            if len(cols) != COLCOUNT:
                _give_up(u"Line %d: The line has %d columns, but %d are expected. Giving up."%(line_number, len(cols), COLCOUNT))
            lines.append(cols)
        else: #A line which is not a comment, nor a token/word, nor empty. That's bad!
            _give_up(u"Line %d not conllu: Giving up."%line_number)
    #end of file
    if comments or lines: #Looks like a forgotten empty line at the end of the file, well, okay...
        yield comments, lines


def _give_up(message):
    """Write `message` to stderr and exit with status 1."""
    # sys.stderr.write() instead of `print >> sys.stderr`: the chevron form
    # is Python 2 only, on Python 3 it raises a TypeError before the message
    # is shown and before sys.exit() is reached.
    sys.stderr.write(message + u"\n")
    sys.exit(1) #Give a non-zero exit code