-
Notifications
You must be signed in to change notification settings - Fork 1
/
flatten
executable file
·232 lines (204 loc) · 9.86 KB
/
flatten
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/usr/local/bin/perl
# Author: Jason Eisner, University of Pennsylvania
# Usage: flatten [-f] [-l] [-u] [-w] [-1] [-b] [-B] [files ...]
#
# Flattens parses produced by headall.
# (Can also be used to eliminate just non-branching nodes in the
# output of headall or oneline: see -1 below.)
# The result is something like dependency grammar:
# Each constituent has one terminal on its RHS, which serves as the head.
#
# Example:
# (S (NP @(NNP @John)) @(VP @(VP @(VBZ @likes) (NP @(NNP @Mary))) (RB @tremendously)))
# becomes
# (S|VP|VP|VBZ (NP|NNP @John) @likes (NP|NNP @Mary) (RB @tremendously))
#
# While some information is lost in this transformation
# (namely, the relative obliqueness of the arguments to likes),
# most of it is kept by using complex tags. The
# tag for the constituent headed by "likes" in the output
# records the sequence of all nonterminals headed by
# the corresponding copy of "likes" in the input.
#
# -f says to keep only the first (i.e., maximal) tag from
# such a sequence:
# (S (NP John) likes (NP Mary) (RB tremendously))
#
# -l says to keep only the last (i.e., minimal, part-of-speech) tag
# from such a sequence:
# (VBZ (NNP John) likes (NNP Mary) (RB tremendously))
#
# If -f and -l are both specified, both the first and last tags are kept:
# (S|VBZ (NP|NNP John) likes (NP|NNP Mary) (RB|RB tremendously))
# Note that for consistency, even the single-tag sequence RB is turned into
# a pair RB|RB.
#
# -u basically says to run "uniq" over each tag sequence: for example,
# S|VP|VP|VBZ above would be collapsed into S|VP|VBZ. (That is, adjunct
# levels are eliminated.) This option is incompatible with -f or -l.
#
# -w says that the head lexeme should be lifted onto the tag sequence
# (in addition to any tags specified by -f, -l). Thus we get
# (S|VP|VP|VBZ|likes (NP|NNP|John @) @ (NP|NNP|Mary @) (RB|tremendously @))
# which might be easier for a human to read if sisterless @ were deleted:
# (S|VP|VP|VBZ|likes NP|NNP|John @ NP|NNP|Mary RB|tremendously)
#
# -1 says that instead of eliminating any nonterminal node that is the
# head of its parent, we should only eliminate any nonterminal node
# that is the sole child of its parent. In other words, just collapse
# unary (nonbranching) nodes.
#
# If this option is specified, head markings are irrelevant: they
# are not required in the input, but if they appear in the input,
# they will be preserved in the output.
#
# Note that if -1 is combined with -w, even terminals are collapsed:
# (NPR @John) becomes (NPR|John @) and (NPR John) becomes just NPR|John.
#
# The -b and -B options can be useful for studying the output,
# especially if listrules is subsequently to be applied. These
# "retain" the deleted internal parentheses of a maximal projection by
# adding labeled or unlabeled brackets as new children of the
# nonterminal. -b uses unlabeled brackets and omits those corresponding to unary rules.
# -B uses labeled brackets and preserves those corresponding to unary rules.
# (Note: One could mix and match these two options, which are controlled internally by
# $labbrackets and $trivbrackets - there's just not support for it on the command line
# right now.)
# The -B option makes the most sense together with -f.
#
# flatten -f -b:
# (S (NP John) [ [ likes (NP Mary) ] (RB tremendously) ] )
# which yields frame S --> NP [ [ likes NP ] RB ]
# flatten -f -B:
# (S (NP [NNP John ]) [VP [VP [VBZ likes ] (NP [NNP Mary ]) ] [RB tremendously ] ])
# which yields frame S --> NP [VP [VP [VBZ likes ] NP ] RB ]
#
# The ~ argument marker is handled as follows: it is removed from the sequence
# before any of these operations (including uniq). If it happened to appear on
# some sequence-maximal tag in the input, then it will be restored to
# the maximal tag in the corresponding output sequence. It is, however,
# retained on all slashed or plussed material contained within a tag.
#
# Note: When options such as -f are used, indices may no longer match
# up in the output. For example, if the input is
# (S\NP-SBJ-2
# (NP-SBJ\NP-SBJ-2
# @(-NONE- @0))
# @(VP @(VBZ @makes)
# (NP
# (NPR @(NNP @Kent))
# @(NP @(NNS @cigarettes)))))
# then flatten -w will produce
# (SBAR that
# (S\NP-SBJ-2
# (NP-SBJ\NP-SBJ-2 0)
# makes
# (NP (NPR Kent)
# cigarettes)))
# So notice that despite the index on /NP-SBJ-2, it never gets
# canceled against anything. The head of the SBAR is assumed
# to subcategorize for the gapped category S\NP-SBJ-2, just as
# if we had written S\NP-SBJ -- although we can't actually
# drop the index, since we need to track the propagation of the
# slash.
require("stamp.inc"); &stamp; # modify $0 and @INC, and print timestamp
$firsttag = 1, shift(@ARGV) if $ARGV[0] eq "-f"; # !!! options must be specified in order, yuck!
$lasttag = 1, shift(@ARGV) if $ARGV[0] eq "-l";
$uniqtag = 1, shift(@ARGV) if $ARGV[0] eq "-u";
$wordtag = 1, shift(@ARGV) if $ARGV[0] eq "-w";
$deunarize = 1, shift(@ARGV) if $ARGV[0] eq "-1";
$brackets = 1, shift(@ARGV) if $ARGV[0] eq "-b";
$brackets = 1, $labbrackets = 1, $trivbrackets = 1, shift(@ARGV) if $ARGV[0] eq "-B";
if ($firsttag && $lasttag) {
$twotags = 1;
$firsttag = $lasttag = 0;
} elsif (!$firsttag && !$lasttag) {
$alltags = 1;
}
die "$0: -u is incompatible with -f or -l, aborting\n" if $uniqtag && !$alltags;
die "$0: bad command line flags" if @ARGV && $ARGV[0] =~ /^-./;
$token = "[^ \t\n()]+"; # anything but parens or whitespace can be a token
$inittokenseg = "[^ \t\n()~\\\\\\/+|-]+"; # initial segment of a token (copied from markargs
# AND SLIGHTLY MODIFIED to add |)
while (<>) { # for each sentence
chop;
s/^(\S+:[0-9]+:\t)?//, $location = $&;
unless (/^\#/) { # unless a comment
($headmark, $tag, $kids, $numkids) = &constit; # eat a constit (sentence)
die "$0:$location more than one sentence on line ending $_" if $_;
die "$0:$location the whole sentence was unexpectedly marked as a head" if $headmark;
$_ = "$headmark($tag$kids)";
}
print "$location$_\n";
}
print STDERR "$0: $constit possibly trivial constituents in, ", $constit-$deletedconstit, " out\n";
# -------------------------
# Reads in the next constit, and following whitespace, from the front of $_.
# Any constit may start with @.
#
# output is a tuple:
# headmark ("@" or "") according to whether this is the head of its parent
# composite tag of the flattened constit, respecting $firsttag and $lasttag and $wordtag
# a string that starts with a space and gives all the subconstits of the flattened constit, including the head
# number of child subconstituents
# Discipline: each regexp that eats text is required to eat
# any following whitespace, too.
sub constit {
local($headmark, $tag, $kids, $numkids);
local($foundhead, $badkid);
local($argmark);
$headmark = "@" if s/^@//; # delete initial @ if any
$constit++;
s/^\(\s*// || die "$0:$location open paren expected to start $_"; # eat open paren
s/^($token)\s*//o || die "$0:$location no tag"; # eat tag
$tag = $1;
$argmark = 1 if $tag =~ s/^($inittokenseg)~/$1/g; # kill any arg mark on tag, but remember it. Don't touch arg marks on the slashed categories, though.
$foundhead = 0;
$badkid = 0;
if (/^@?\(/) { # if tag is followed by at least one subconstituent (possibly marked with @)
until (/^\)/) { # eat all the subconstits recursively and remember what they were
local($subheadmark, $subtag, $subkids, $subnumkids) = &constit;
$numkids++;
unless ($deunarize ? ($numkids==1 && /^\)/) : $subheadmark eq "@") { # no reason to absorb the kid: we're killing sole children ($deunarize) or head children (otherwise), and it's not one
$kids .= " $subheadmark" . (($subkids eq " ") ? $subtag : "($subtag$subkids)"); # subkids can be just " " if we specify -w and -1 and heads aren't marked in the input
} else { # this kid must be swallowed
$deletedconstit++;
# splice the kids in without a () wrapper or $subtag, except as dictated by $brackets
local($addbrackets) = $brackets
&& ($trivbrackets || $subnumkids > 1) # if trivial brackets aren't allowed, the head needs multiple kids to justify brackets around it
&& ($trivbrackets || $numkids > 1 || !/^\)/); # if trivial brackets aren't allowed, the head needs siblings to justify brackets around it
$kids .= " [" if $addbrackets;
$kids .= $subtag if $labbrackets;
$kids .= "$subkids";
$kids .= " ]" if $addbrackets;
# adjust the parent tag
$subtag =~ s/~//g; # kill any arg mark on subtag -- we'll just use the higher-up one we already found
if ($lasttag || ($uniqtag && $subtag =~ m%^$tag(|\|$)%)) {
$tag = $subtag; # discard $tag and just keep $subtag
} elsif ($alltags) {
$tag .= "|$subtag";
} elsif ($twotags || ($firsttag && $wordtag)) {
$subtag =~ s/[^|]*\|//; # kill first component of subtag
$tag .= "|$subtag";
} # else leave $tag alone
}
}
} else { # if tag is followed by just a lexical item
local($subheadmark);
$numkids = 1; # we assume there won't be any other kids after the lex item, thanks to input format
s/^@// && ($subheadmark = "@") || $deunarize || die "$0:$location lex item not marked as head and -1 not specified";
s/^($token)\s*//o || die "$0:$location no lex item";
$word = $1;
$kids = " $subheadmark$word";
if ($twotags) {
$tag .= "|$tag"; # use same tag for first & last
}
if ($wordtag) {
$tag .= "|$word";
$kids = " $subheadmark";
}
}
s/^\)\s*// || die "$0:$location close paren expected to start $_";
$tag =~ s/^$inittokenseg/$&~/o if $argmark; # put the arg mark on
($headmark, $tag, $kids, $numkids);
}