-
Notifications
You must be signed in to change notification settings - Fork 1
/
headify
executable file
·158 lines (128 loc) · 6.24 KB
/
headify
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/local/bin/perl -w
# Author: Jason Eisner, University of Pennsylvania
# Usage: headify -c file.mrk [files ...]
#
# Filters parses that are in the format produced by oneline.
# The effect is to mark the head subconstituent of each constituent,
# when it can be guessed, with @.
#
# (S (NP (NNP John)) (VP (VBZ likes) (NP (NNP Mary))
# will emerge with its terminal and nonterminal heads marked:
# (S (NP @(NNP @John)) @(VP @(VBZ @likes) (NP @(NNP @Mary))
#
# This is done by looking at each phrase structure rule and applying
# the prediction methods of predicthead (in predict.inc). file.mrk
# gives an exception list of phrase-structure rules where predicthead
# doesn't get it right. Such a file may be produced by a human using
# Emacs pshead-mode. (It lists phrase structure rules with heads that
# have been marked @ by the user; it does not include = marks, and is
# all in uppercase.)
#
# If a constituent's tag is marked as an argument, with ~, we drop that
# mark if the constituent turns out to be a head.
#
# If predicthead and file.mrk assume articulated rules, pipe the input
# through articulate first. Also, if they assume canonicalized tags,
# pipe the input through canonicalize first, or else use the -c flag
# to force headify to canonicalize tags internally while leaving them
# alone in the output. Finally, it may be useful to filter the
# output of articulate through discardconj first.
require("stamp.inc"); &stamp; # modify $0 and @INC, and print timestamp
require("predict.inc"); # this gives us predicthead
$canonicalize = 1, require("canon.inc"), shift(@ARGV) if $ARGV[0] eq "-c";
die "$0: bad command line flags" unless @ARGV && $ARGV[0] !~ /^-./;
$markfile = shift(@ARGV);
$token = "[^ \t\n()]+"; # anything but parens or whitespace can be a token
# Read in the user markings.
open(MARKEDRULES, $markfile) || die "$0: can't open $markfile";
print STDERR "$0: reading mark file $markfile ...\n";
while (<MARKEDRULES>) {
die "$0: .mrk file format no longer current (contains =).
Please use updatemrkformat to create a version in the new format,
and then rerun $0. You should also pass the .bug file through
updatemrkformat." if /=/;
next if /^\t/; # ignore example lines (these shouldn't occur, but...)
chop;
local($strippedrule) = $_;
warn "$0: wrong number of @ marks in $_ in mark file $markfile!" unless 1==($strippedrule =~ s/(\s)@/$1/g); # strip all @ from rule
if (defined $mark{$strippedrule} && $mark{$strippedrule} ne $_) {
warn "$0: conflict between \"$_\" and \"$mark{$strippedrule}\" in $markfile -- won't use either\n";
$mark{$strippedrule} = "*CONFLICT*";
} else {
$mark{$strippedrule} = $_;
}
}
# Okay, now let's go ahead and mark the heads.
while (<>) { # for each sentence
chop;
s/^(\S+:[0-9]+:\t)?//, $location = $&;
unless (/^\#/) { # unless a comment
$failures = 0;
($tag, $tree) = &constit; # eat a constit (sentence)
die "$0:$location more than one sentence on line ending $_" if $_;
$sent++;
$fullymarked++ unless $failures;
$_ = $tree;
}
print "$location$_\n";
}
print STDERR "$0: $constits nontrivial constits, $marks marked; $sent sentences, $fullymarked fully marked\n";
# -------------------------
# Reads in the next constit, and following whitespace, from the front of $_.
#
# input: none
# output: list of two scalars:
# - nonterminal of this constit, canonicalized if $canonicalize is on, and certainly stripped of its argument mark ~ if any
# - bracketed text version of this constit, with heads marked, to be output
# Discipline: each regexp that eats text is required to eat
# any following whitespace, too.
sub constit {
local($tag, $ctag, $tree, $subctag, $subtree, @subctags, @subtrees); # "tag" denotes input tag, "ctag" denotes version that may have been canonicalized (according to $canonicalize)
s/^\(\s*// || die "$0:$location open paren expected to start $_"; # eat open paren
s/^($token)\s*//o || die "$0:$location no tag"; # eat tag
$ctag = $tag = $1;
$ctag =~ s/~//; # strip argument mark
$ctag = &canonicalizetag($ctag) if $canonicalize;
if (/^\(/) { # if tag is followed by at least one subconstituent
until (/^\)/) { # eat all the subconstits recursively and remember what they were
($subctag, $subtree) = &constit(); # we could omit constits that are lexically null from our phrase structure rules, but we won't.
push(@subctags, $subctag);
push(@subtrees, $subtree);
}
$headkid = &truehead($ctag, @subctags); # guess the head
$constits++;
$marks++ if $headkid;
} else { # if tag is followed by just a lexical item
s/^($token)\s*//o || die "$0:$location no lex item";
@subctags = @subtrees = ($1);
$headkid = 1;
}
s/^\)\s*// || die "$0:$location close paren expected to start $_";
if ($headkid) {
substr($subtrees[$headkid-1],1,1) = "" if $subtrees[$headkid-1] =~ /^\(~/; # drop ~ mark, if any, on the subkid's nonterminal tag
$subtrees[$headkid-1] = "@" . $subtrees[$headkid-1]; # add @ mark to subconstit
}
else { $failures++; }
($ctag, "($tag @subtrees)");
}
# Same vanilla interface as predicthead.
# @RHS is canonicalized and the ~ tags have been stripped.
# Checks whether rule is an exception -- otherwise, looks it up.
# If no head can be found, because rule is unknown && unpredictable,
# return 0 or "".
sub truehead {
local($LHS,@RHS) = @_;
local($answer, $sure);
local($rule) = "$LHS\t@RHS";
# print STDERR "predicting $LHS\t@RHS";
if (defined $mark{$rule} && $mark{$rule} ne "*CONFLICT*") { # there's a marked version of this rule
$mark{$rule} =~ /@/ || die "$0:$location Internal error (annotated rule has no mark); stopped"; # find the mark
$tmp = $`; #Martin Cmejrek changed from Perl4 version
$answer = ($tmp =~ s/\s//g); # count spaces & tabs before mark
# print STDERR "... annotation says kid $answer\n"
} else {
($answer, $sure) = &predicthead($LHS,@RHS); # Try to predict on general principles. We don't care if we're not sure, so long as we get an answer.
# print STDERR "... general principles say $answer\n"
}
$answer;
}