-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.cp
executable file
·2499 lines (2255 loc) · 82.7 KB
/
parser.cp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// TABARI Project
//___________________________________________________________________________________
// parser.cp
// This file contains the parsing routines
// Text Filtering:
// There is a lot -- see the comments in filterText. There is possibly too much --
// for example "(...)" is converted to " , ... , "
// Lexical Markup:
// 1. The primary function of makeLex() is to assign each word the list of literals
// that match it. This is a list because a number of substrings could match:
// for example "ARABIANS" (as in "Saudi Arabians") could match the literals
// ARAB ARABIA ARABIAN and ARABIANS. This list is stored in literaList[],
// and lexArray[].ilit points to the start of the list. The end of the list is
// signaled by a zero cell.
//
// 2. Conditions on lexArray[].ilit:
// 0 : No literal was assigned
// < 0 : Contains the negative of the index of a *symbol*
// > 0 : Points to the first element of a list in literaList[]
//
// 3. literaList stores pairs of information on literals:
// i literal index
// i+1 0 if this was a partial match; 1 if it was an exact match.
// This is used to correctly deal with _ connectors (which didn't work
// consistently prior to version 0.5.1)
//
// Parser Markup:
// 1. Roots are assigned using matchLex(), which uses the literals identified in the
// lexical markup (which involve single words or fragments of words) to identify
// multiple-word roots. Because a word sequence could match multiple roots --
// "SAUDI ARABIANS" might match "SAUDI", "SAUDI ARABI", and "SAUDI ARABIAN" --
// the assignment is made to the root that has the *largest number of matched
// characters*.
//
// When root is identified:
// a. All of the literals that are in the root have their syntArray[].wtype
// changed to the wordtype of the root. This will, for example, cancel the
// conjunctions in roots such as "SHOT_AND_KILLED"
// b. headTag of the first literal and tailTag of the last literal are tagged
// with the wordtype.
// c. syntArray[].iroot of the head of the root contains the root index
// Notes:
// 1. At present the system -- using the Literals.check() function -- goes through
// and checks each possible literal. A lot of this could be bypassed through
// some additional processing of the dictionaries -- in the example given above,
// if we know that ARABIANS matches, then we already know that ARAB, ARABIA and
// ARABIAN matches. This requires only a small amount of additional storage
// and the code should be straightforward.
// 2. The maximum-matched-characters criterion for root matching differs a bit
// from that used in KEDS, where roots and phrases were evaluated in the order
// of the length (in characters) of the phrase and matching stopped at the
// first successful match. In the absence of conditional phrases, the KEDS
// system and the TABARI system will produce identical results. If conditional
// phrases are present, the two systems could produce different results, and
// dictionaries should be updated accordingly. The TABARI system is the more
// accurate of the two methods -- in the sense of correctly identifying the
// most specific phrase -- but KEDS dictionaries may occasionally be coding
// the right phrase for the wrong reasons.
// (ties are still resolved by the listing order in the dictionary -- the first
// phrase checked is selected -- and this could, on rare occasions, be used to
// resolve ambiguous situations)
// 3. Current handling of input error conditions:
//
// NO_TEXT_ERROR: No text in record. This is necessarily caused by a filter
// error. Don't code it; alert with a problem string.
//
// STORY_TOO_SHORT:Fewer than MIN_LEX words in sentence. This is usually caused
// by a filter error having incorrectly split a sentence, or else
// the start of a feature story. Don't code it; alert with a
// problem string.
//
// STORY_TOO_LONG: More than MAX_LEX-1 words in sentence. This is usually caused
// by a filter error concatenating sentences, so truncate and
// code it anyway.
// 4. tagArray holds additional syntactic markup information, organized in 3-word
// blocks. syntArray[].pheadtag holds the index of the start of a linked list of
// "head tags" marking the start of multi-word structures, and syntArray[].ptailtag
// holds the index of the start of a linked list of "tail tags" marking their end.
// Structure:
// i type of tag (e.g. noun, clause)
// i+1 optional index associating specific heads with tails; zero if not used
// i+2 tagArray index of next tag in the linked list associated with this word
//
// --- Word types ---
// Null -- default
// Actor
// Agent
// Verb
// Time -- time-shifting word
// Attrib -- attribution word
// Determ -- determiner: A_, AN_, THE_
// Noun -- verb shifted to noun by determiner or capitalization; null-coded actors;
// explicit noun
// Adjctv -- adjective; used to construct compound noun and adjective phrases
// Auxil -- auxiliary verb -- WAS_, WERE_, BEEN_ -- used in passive voice detection
// Byword -- BY_, used in passive voice detection
// Comma -- comma ",", used in subordinate clause detection and compounds
// Pronoun
// Conj -- AND_, BUT_
// Prep -- preposition; currently not used
// Plural -- plural; currently not used
// Number -- literals that begin with a digit except when it was part of a phrase,
// as in "Group of 77"
// Issue -- word in ISSUES set
// Synym -- currently not used
//
// --- Clause tags ---
// Clause -- clause in compound sentence
// Compound -- compound noun phrase
// Reference -- pronoun reference
// Subord -- subordinate clause
// Replace --
// NullTag -- deactivates tags in subordinate clauses
// Halt -- tag for end of text
//__________________________________________________________________________________
//
// Copyright (c) 2000 - 2012 Philip A. Schrodt. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted under the terms of the GNU General Public License:
// http://www.opensource.org/licenses/gpl-license.html
//
// Report bugs to: [email protected]
// The most recent version of this code is available from the KEDS Web site:
// http://eventdata.psu.edu
//___________________________________________________________________________________
// Headers
#include "TABARI.h"
ParserClass Parser;
//___________________________________________________________________________________
// Global Variables
extern TabariFlagsClass TabariFlags;
extern LiteralsClass Literals;
extern RootsClass Roots;
extern PhrasesClass Phrases;
extern ProcessorClass Processor;
//___________________________________________________________________________________
void ParserClass:: parseError(const char *s, int ierror)
// write the error string to <errorfile> and throw ierror
{
Processor.writeError("Parser error: ", (char*)s, ierror);
iparseError = ierror;
// throw ierror; ### <09.01.15> well, nice, but doesn't seem to be caught anywhere at the moment
} // parseError
void ParserClass:: getParseError(char *s)
// Copies the problem-string text that corresponds to iparseError into s.
//   s : destination buffer supplied by the caller; it must be large enough for
//       the longest message below (the size is not checked here).
// Any code without a dedicated message -- parseError() stores whatever int it is
// given, e.g. PRONOUN_REF_ERROR from getReference() -- gets the generic text, so
// s is always written.
{
	const char *text;

	switch (iparseError) {
		case PARSE_OK:
			text = "Incorrect call to Parse.getParseError";
			break;
		case NO_TEXT_ERROR:
			text = "Input error: no text in record";
			break;
		case TOO_LONG_ERROR:	// currently not used: long sentences are truncated
			text = "Input error: too many words in sentence";
			break;
		case TOO_SHORT_ERROR:
			text = "Input error: too few words in sentence";
			break;
		case HEAD_TAG_ERROR:
			text = "Parser error: Tag overflow in addHeadTag";
			break;
		case TAIL_TAG_ERROR:
			text = "Parser error: Tag overflow in addTailTag";
			break;
		case MARK_NOUN_ERROR:
			text = "Parser error: problem with noun marking";
			break;
		case CONVERT_AGENT_ERROR:
			text = "Parser error: no tail tag on agent";
			break;
		case COMPOUND_TAG_ERROR:
			text = "Parser error: problem with compound marking";
			break;
		case SUBORD_TAG_ERROR:
			text = "Parser error: problem with subordinant clause marking";
			break;
		case GENERIC_ERROR:
		default:				// unknown codes share the generic message
			text = "Unspecified problem encountered during parsing ";
			break;
	}
	strcpy(s, text);
} // getParseError
void ParserClass:: addSegHeadTag(int loc, toktype tag, toktype index)
// Appends the indexed tag (tag, index) to the head-tag list of syntArray[loc];
// used for segments.
//   loc   : word position in syntArray
//   tag   : tag type
//   index : value pairing this head with its tail
// A tag occupies three tagArray cells: type, index, link to the next tag (0 = end).
// If the tag store is full the error is reported through parseError() and nothing
// is written; previously the three cells were written regardless.
{
	if (iTag >= MAX_TAGS - 3) {		// keeps one block in reserve (presumably for the matching tail -- TODO confirm)
		parseError("Tag overflow in addHeadTag",HEAD_TAG_ERROR);
		return;						// do not write past the tag store
	}

	tokptr pt = &syntArray[loc].pheadtag;
	while (*pt) pt = &tagArray[*pt+2];	// walk to the terminating (zero) link of the list
	*pt = iTag;							// chain the new block onto the list
	tagArray[iTag++] = tag;				// tag type
	tagArray[iTag++] = index;			// pairing index
	tagArray[iTag++] = 0;				// end-of-list link
} // addSegHeadTag
void ParserClass:: addHeadTag(int loc, toktype tag)
// Appends the non-indexed tag 'tag' to the head-tag list of syntArray[loc].
//   loc : word position in syntArray
//   tag : tag type; the index cell of the new block is set to zero
// A tag occupies three tagArray cells: type, index, link to the next tag (0 = end).
// If the tag store is full the error is reported through parseError() and nothing
// is written; previously the three cells were written regardless.
{
	if (iTag >= MAX_TAGS - 3) {
		parseError("Tag overflow in addHeadTag",HEAD_TAG_ERROR);
		return;						// do not write past the tag store
	}

	tokptr pt = &syntArray[loc].pheadtag;
	while (*pt) pt = &tagArray[*pt+2];	// walk to the terminating (zero) link of the list
	*pt = iTag;							// chain the new block onto the list
	tagArray[iTag++] = tag;				// tag type
	tagArray[iTag++] = 0;				// no index
	tagArray[iTag++] = 0;				// end-of-list link
} // addHeadTag
void ParserClass:: addHeadTag(int loc, toktype tag, toktype &index)
// adds next indexed tag to snytArray[loc].pheadtag; returns value of index
{
tokptr pt = &syntArray[loc].pheadtag;
if (iTag >= MAX_TAGS - 3)
parseError("Tag overflow in addHeadTag",HEAD_TAG_ERROR); // error check, sort of -- this doesn't really recover, just notifies
index = 0;
while (*pt) {
if ((*(pt+1) == tag) && (*(pt+1)>index)) index = *(pt+1); // set index to maximum current value
pt = &tagArray[*pt+2]; // find last element in tag
}
++index; // set new index value
*pt = iTag; // reset the next element pointer
tagArray[iTag++] = tag; // add the tag
tagArray[iTag++] = index; // record index
tagArray[iTag++] = 0; // zero link to next element
} // addHeadTag
void ParserClass:: addSegTailTag(int loc, toktype tag, toktype index)
// Appends the indexed tag (tag, index) to the tail-tag list of syntArray[loc];
// used for segments.
//   loc   : word position in syntArray
//   tag   : tag type
//   index : value pairing this tail with its head
// A tag occupies three tagArray cells: type, index, link to the next tag (0 = end).
// The capacity test now requires room for all three cells (the old test,
// iTag >= MAX_TAGS, let a block start in the last two cells), and nothing is
// written once the error has been reported.
// NOTE(review): assumes tagArray holds MAX_TAGS cells -- confirm against the class declaration.
{
	if (iTag > MAX_TAGS - 3) {		// fewer than three cells left
		parseError("Tag overflow in addClTailTag",TAIL_TAG_ERROR);
		return;						// do not write past the tag store
	}

	tokptr pt = &syntArray[loc].ptailtag;
	while (*pt) pt = &tagArray[*pt+2];	// walk to the terminating (zero) link of the list
	*pt = iTag;							// chain the new block onto the list
	tagArray[iTag++] = tag;				// tag type
	tagArray[iTag++] = index;			// pairing index
	tagArray[iTag++] = 0;				// end-of-list link
} // addSegTailTag
void ParserClass:: addTailTag(int loc, toktype tag)
// Appends the non-indexed tag 'tag' to the tail-tag list of syntArray[loc].
//   loc : word position in syntArray
//   tag : tag type; the index cell of the new block is set to zero
// A tag occupies three tagArray cells: type, index, link to the next tag (0 = end).
// The capacity test now requires room for all three cells (the old test,
// iTag >= MAX_TAGS, let a block start in the last two cells), and nothing is
// written once the error has been reported.
// NOTE(review): assumes tagArray holds MAX_TAGS cells -- confirm against the class declaration.
{
	if (iTag > MAX_TAGS - 3) {		// fewer than three cells left
		parseError("Tag overflow in addTailTag",TAIL_TAG_ERROR);
		return;						// do not write past the tag store
	}

	tokptr pt = &syntArray[loc].ptailtag;
	while (*pt) pt = &tagArray[*pt+2];	// walk to the terminating (zero) link of the list
	*pt = iTag;							// chain the new block onto the list
	tagArray[iTag++] = tag;				// tag type
	tagArray[iTag++] = 0;				// no index
	tagArray[iTag++] = 0;				// end-of-list link
} // addTailTag
void ParserClass:: addTailTag(int loc, toktype tag, toktype index)
// Appends the indexed tag (tag, index) to the tail-tag list of syntArray[loc].
//   loc   : word position in syntArray
//   tag   : tag type
//   index : value stored with the tag (passed by value; it is not modified)
// A tag occupies three tagArray cells: type, index, link to the next tag (0 = end).
// The capacity test now requires room for all three cells (the old test,
// iTag >= MAX_TAGS, let a block start in the last two cells), and nothing is
// written once the error has been reported.
// NOTE(review): assumes tagArray holds MAX_TAGS cells -- confirm against the class declaration.
{
	if (iTag > MAX_TAGS - 3) {		// fewer than three cells left
		parseError("Tag overflow in addTailTag",TAIL_TAG_ERROR);
		return;						// do not write past the tag store
	}

	tokptr pt = &syntArray[loc].ptailtag;
	while (*pt) pt = &tagArray[*pt+2];	// walk to the terminating (zero) link of the list
	*pt = iTag;							// chain the new block onto the list
	tagArray[iTag++] = tag;				// tag type
	tagArray[iTag++] = index;			// pairing index
	tagArray[iTag++] = 0;				// end-of-list link
} // addTailTag
bool ParserClass:: hasHeadTag(int loc, toktype tag)
// True when the head-tag list of syntArray[loc] holds a tag of type 'tag'.
// Only the type cell is compared; the index cell is ignored.
{
	for (int itag = syntArray[loc].pheadtag; itag != 0; itag = tagArray[itag+2]) {
		if (tagArray[itag] == tag) return true;
	}
	return false;	// reached the zero link without a match
} // hasHeadTag
bool ParserClass:: hasHeadTag(int loc, toktype tag, toktype &index)
// Looks for the first tag of type 'tag' in the head-tag list of syntArray[loc].
//   index : (output) index cell of that tag when found, otherwise 0
// Returns true when such a tag exists.
{
	for (int itag = syntArray[loc].pheadtag; itag != 0; itag = tagArray[itag+2]) {
		if (tagArray[itag] != tag) continue;
		index = tagArray[itag+1];	// hand back the stored index
		return true;
	}
	index = 0;
	return false;
} // hasHeadTag
bool ParserClass:: checkHeadTag(int loc, toktype tag, toktype index)
// True when the head-tag list of syntArray[loc] holds a tag whose type is 'tag'
// AND whose index cell equals 'index'.
// ### note that this and checkTailTag do different things...probably should synchronize them
{
	int itag = syntArray[loc].pheadtag;

	while (itag != 0) {
		const bool sameType  = (tag == tagArray[itag]);
		if (sameType && (index == tagArray[itag+1])) return true;
		itag = tagArray[itag+2];	// next block in the list
	}
	return false;
} // checkHeadTag
bool ParserClass:: hasTailTag(int loc, toktype tag)
// True when the tail-tag list of syntArray[loc] holds a tag of type 'tag'.
// Only the type cell is compared; the index cell is ignored.
{
	for (int itag = syntArray[loc].ptailtag; itag != 0; itag = tagArray[itag+2]) {
		if (tagArray[itag] == tag) return true;
	}
	return false;	// reached the zero link without a match
} // hasTailTag
bool ParserClass:: hasTailTag(int loc, toktype tag, toktype index)
// True when the tail-tag list of syntArray[loc] holds the indexed tag "tag:index",
// i.e. a block whose type cell is 'tag' and whose index cell is 'index'.
{
	int itag = syntArray[loc].ptailtag;

	while (itag != 0) {
		const bool sameType = (tag == tagArray[itag]);
		if (sameType && (index == tagArray[itag+1])) return true;
		itag = tagArray[itag+2];	// next block in the list
	}
	return false;
} // hasTailTag
toktype ParserClass:: getTailTag(int loc, toktype tag)
// Returns the index cell of the first tag of type 'tag' in the tail-tag list of
// syntArray[loc], or 0 when the list has no such tag.
// Tags added without an index also store 0 in that cell, so a 0 result does not
// by itself tell the two cases apart.
{
	for (int itag = syntArray[loc].ptailtag; itag != 0; itag = tagArray[itag+2]) {
		if (tagArray[itag] == tag) return tagArray[itag+1];
	}
	return 0;
} // getTailTag
bool ParserClass:: checkTailTag(int loc, toktype tag)
// Confirms that the indexed head tag of type 'tag' at loc still has its matching
// tail somewhere after loc (positions loc+1 .. iSynt).
// The check is needed in case nullSubordinate eliminated the compound.
// Returns false as well when loc carries no head tag of that type.
{
	toktype tagidx;

	if (!hasHeadTag(loc, tag, tagidx)) return false;	// ### shouldn't hit this...

	for (int iword = loc + 1; iword <= iSynt; ++iword) {
		if (hasTailTag(iword, tag, tagidx)) return true;
	}
	return false;	// ran through iSynt without finding tag:tagidx
} // checkTailTag
void ParserClass:: changeHeadTag(int loc, toktype oldtag, toktype newtag)
// Rewrites the type of the first 'oldtag' entry in the head-tag list of
// syntArray[loc] (the pheadtag list) to 'newtag'. Later entries of the same type
// are left alone. If no such tag is present, does nothing.
{
	for (int itag = syntArray[loc].pheadtag; itag != 0; itag = tagArray[itag+2]) {
		if (tagArray[itag] != oldtag) continue;
		tagArray[itag] = newtag;
		break;		// only the first match is changed
	}
} // changeHeadTag
void ParserClass:: changeTailTag(int loc, toktype oldtag, toktype newtag)
// Rewrites the type of the first 'oldtag' entry in the tail-tag list of
// syntArray[loc] (the ptailtag list) to 'newtag'. Later entries of the same type
// are left alone. If no such tag is present, does nothing.
{
	for (int itag = syntArray[loc].ptailtag; itag != 0; itag = tagArray[itag+2]) {
		if (tagArray[itag] != oldtag) continue;
		tagArray[itag] = newtag;
		break;		// only the first match is changed
	}
} // changeTailTag
toktype ParserClass:: getReference(int loc)
// Gets a pronoun reference from the head-tag list of syntArray[loc].
// The first tag whose type cell is >= setRefer is taken to be the reference tag;
// its value masked with maskRefer is returned.
// If no such tag exists the problem is reported with PRONOUN_REF_ERROR and 0 is
// returned.
// The list is now read from this object's own syntArray, matching the tagArray
// it is paired with; the old code went through the global Parser instance for
// one array and 'this' for the other.
{
	int idx = syntArray[loc].pheadtag;

	while (idx) {
		if (tagArray[idx] >= setRefer) return (tagArray[idx] & maskRefer);
		idx = tagArray[idx+2];		// next block in the list
	}
	parseError("\aMissing pronoun reference tag in ParserClass:: getReference",PRONOUN_REF_ERROR); // we should never hit this...
	return 0;
} // getReference
void ParserClass:: addLitr (toktype ilit, toktype icomp)
// Stores one (literal index, comparison type) pair at the end of literaList.
//   ilit  : literal index
//   icomp : comparison type (per the file header: 0 = partial match, 1 = exact match)
// When fewer than two cells remain the pair is dropped and TOO_LONG_ERROR is reported.
// ### [09.06.26]: the caller gets no indication that the list is full, so this keeps
// being called without effect. Not a big deal but somewhat inefficient
{
	if (iLitr >= MAX_LITRLIST - 1) {
		parseError("Too many words in story (iLitr > MAX_LITRLIST in ParserClass:addLitr); remaining matches ignored",TOO_LONG_ERROR);
		return;
	}
	literaList[iLitr]   = ilit;
	literaList[iLitr+1] = icomp;
	iLitr += 2;
} // addLitr
int ParserClass:: hasLitr (int index, toktype litr)
// Searches the literal list attached to syntArray[index] for 'litr'.
// Returns
//   0 if litr is not in the list, or the word has no list (ilit <= 0)
//   1 if litr is in the list as a partial match
//   2 if litr is in the list as an exact match
// The list is a run of (literal, match-flag) pairs in literaList ended by a zero
// literal cell.
{
	const int istart = syntArray[index].ilit;

	if (istart <= 0) return 0;		// no literals, or a symbol code
	for (int k = istart; literaList[k] != 0; k += 2) {
		if (literaList[k] == litr) {
			return (literaList[k+1] != 0) ? 2 : 1;	// flag cell: non-zero = exact
		}
	}
	return 0;
} // hasLitr
bool ParserClass:: skipSegment(syntStruct * &psyn, wordtype wtype)
// If *psyn carries a head tag of type wtype (i.e. it starts a clause, compound or
// subord segment), advances psyn to the first word past the word carrying the
// matching tail tag and returns true. Returns false, leaving psyn untouched, when
// *psyn is not such a head.
// Notes:
// 1. psyn is modified through the reference, and the function is called from
//    inside 'if ()' conditions, so the side-effect is easy to miss.
// 2. NOTE(review): the forward scan has no upper bound; it relies on a tail tag
//    with the same type and index existing at or after the head.
{
	toktype segindex;

	if (!hasHeadTag(psyn->index,wtype, segindex)) return false;	// also picks up segindex
	while (true) {
		const bool atTail = hasTailTag(psyn->index,wtype, segindex);
		++psyn;						// step past the word just examined
		if (atTail) return true;	// psyn now points just beyond the segment
	}
} // skipSegment
bool ParserClass:: skipSegment(int &index, wordtype wtype)
// Index-based variant: if syntArray[index] carries a head tag of type wtype,
// advances index to the first position past the word carrying the matching tail
// tag and returns true; otherwise returns false with index unchanged.
// As with the pointer variant, the caller's variable is modified in place.
// NOTE(review): the forward scan has no upper bound; it relies on a tail tag with
// the same type and index existing at or after the head.
{
	toktype segindex;

	if (!hasHeadTag(index,wtype,segindex)) return false;
	while (true) {
		const bool atTail = hasTailTag(index,wtype,segindex);
		++index;					// step past the word just examined
		if (atTail) return true;	// index is now just beyond the segment
	}
} // skipSegment
void ParserClass:: filterText(void)
// Filters Processor.sentText into filtext in two passes.
// Pass 1 (character filtering):
//  1. ';' ':' '(' ')' are replaced by a space-delimited comma " , "
//  2. ',' becomes " , " unless it sits between two digits, in which case it is dropped
//  3. '.' is kept between two digits and directly after an upper-case letter
//     (e.g. U.S., U.N.); otherwise it becomes a blank
//  4. apostrophes and '$' are kept
//  5. '"' is space-delimited
//  6. every other punctuation character (including '?', '!', '-', brackets and
//     braces) becomes a blank
//  7. text inside /*...*/ is removed; an unterminated comment removes the rest of the text
//  8. in FBIS mode, text inside '[' ... ']' is removed and leading text is skipped
//     up to the first letter or '['
//  9. the result always ends in a single terminal blank
//  Also sets fRecCAPS: it stays true only if no lower-case letter was scanned.
// Pass 2: collapses runs of blanks and breaks any word of LIT_SIZE or more
//  characters by blanking a character, so no word is longer than LIT_SIZE - 1.
//
// Changes from the earlier version:
//  - ctype tests are made on unsigned char values (passing a negative char to
//    isalpha() etc. is undefined)
//  - the "previous character" tests no longer read in front of sentText when the
//    text starts with ',' or '.'
//  - the terminal-blank test no longer reads in front of filtext when nothing was emitted
//
// NOTE(review): a punctuation character can expand to three characters and the
// capacity of filtext is not checked here -- confirm it is sized accordingly.
//
// ### additional optional tasks this could do from the KEDS model [03.06.29]
//  1. Replacement rules
//  2. General Omit delimiters (/* */ is implemented)
{
	const char * const ptextStart = Processor.sentText;	// lower bound for look-behind
	char *pstx = Processor.sentText;
	char *psf = filtext;
	int kchar;

	fRecCAPS = true;
	if (TabariFlags.fFBISMode) {
		while ((*pstx) && ('[' != *pstx) && (!isalpha(static_cast<unsigned char>(*pstx)))) ++pstx;	// skip initial non-alphabetic text
	}

	while (*pstx) {
		const unsigned char curch = static_cast<unsigned char>(*pstx);

		if (fRecCAPS && (islower(curch))) fRecCAPS = false;	// check whether this is an all caps record

		if (('/' == *pstx) && ('*' == *(pstx+1))) {			// remove text between /* */
			while ((*pstx) && (('*' != *pstx) || ('/' != *(pstx+1)))) ++pstx;
			if ('*' == *pstx) pstx +=2;
		}
		else if ((TabariFlags.fFBISMode) && ('[' == *pstx)) {	// remove text between [ ] in FBISMode
			while ((*pstx) && (']' != *pstx)) ++pstx;
			if (']' == *pstx) ++pstx;						// #### error check needed here?
		}
		else if (ispunct(curch)) {
			// neighbouring characters of the source text; "previous" is NUL at the very start
			const unsigned char prevch = (pstx > ptextStart) ? static_cast<unsigned char>(*(pstx-1)) : 0;
			const unsigned char nextch = static_cast<unsigned char>(*(pstx+1));

			if ((';' == *pstx) || (':' == *pstx)			// replace ( ) ; and : with ,
				|| ('(' == *pstx) || (')' == *pstx)) {
				*psf++ = ' ';
				*psf++ = ',';
				*psf++ = ' ';
			}
			else if (',' == *pstx) {						// handle commas
				if (!isdigit(prevch) || !isdigit(nextch)) {	// space delimit comma; inside a number it is dropped
					*psf++ = ' ';
					*psf++ = ',';
					*psf++ = ' ';
				}
			}
			else if ('.' == *pstx) {						// handle periods
				if ((isdigit(prevch)) && (isdigit(nextch))) *psf++ = '.';	// carry through inside numbers
				else if (isupper(prevch)) *psf++ = '.';		// and in capitalized abbreviations
				else *psf++ = ' ';							// replace with space
			}
			else if ('\'' == *pstx) *psf++ = '\'';
			// hyphens are no longer carried through [09.06.29]; they fall to the final else
			else if ('$' == *pstx) *psf++ = '$';
			else if ('"' == *pstx) {
				*psf++ = ' ';
				*psf++ = '"';
				*psf++ = ' ';
			}
			else *psf++ = ' ';
			++pstx;											// move to next char
		}
		else *psf++ = *pstx++;								// transfer all other text
	} // while

	if ((psf == filtext) || (*(psf-1) != ' ')) *psf++ = ' ';	// make sure we've still got a terminal blank
	*psf = '\0';

	// eliminate consecutive blanks; make sure no word is longer than LIT_SIZE - 1
	psf = filtext;
	pstx = filtext;
	kchar = 0;												// number of non-blank chars in the current word
	while (*pstx) {
		if (' ' == *pstx) {
			kchar = 0;										// reset counter
			if (' ' == *(pstx+1)) ++pstx;					// eliminate consecutive blanks
			else *psf++ = *pstx++;
		}
		else {
			if (++kchar >= LIT_SIZE) {
				*(psf-1) = ' ';								// just blank it out -- this only occurs in junk text anyway
				kchar = 1;
			}
			*psf++ = *pstx++;
		}
	}
	*psf = '\0';
	// cout << filtext << endl; // *** debug
} // filterText
bool ParserClass:: matchLex(int &nmatch, tokptr phrase, wordtype wtype,int ibegin)
// Attempts to match the root phrase against the text, beginning at lexArray[ibegin].
//   nmatch : (output) total number of characters matched
//   phrase : zero-terminated list of alternating literal and connector cells
//   wtype  : word type of the root; selects which word endings are tolerated
//   ibegin : index in lexArray of the first word to test
// Returns true when the whole phrase matched. The lexArray index of every matched
// word is recorded in tempArray[0..iTemp-1].
// Note that this is called from the *syntactical* analysis level.
// <08.12.29>: Checks for the "S" and "'S" endings on actors and agents, and various
// regular verb endings
// Fix: a word's literal list is consulted only when ilit > 0. Negative ilit values
// are symbol codes (number, replace marker) and were previously used as an index
// into literaList.
{
	lexStruct * plex = &lexArray[ibegin];
	lexStruct * plast = &lexArray[iLex];
	bool fskip = false;			// ok to skip intermediate words?
	bool fmatch = false;
	toktype * ptok = 0;			// pointer to literal list
	char * pendtext = 0;		// pointer to end of literal in the text
#if DEBUG_LEX
	instring s; // *** debugging
	WriteLine("mLex >> enter"); // *** debugging
#endif

	iTemp = 0;
	nmatch = 0;
	while ((*phrase) && (plex <= plast)){ // go through the phrase list
#if DEBUG_LEX
		WriteLine("mLex check : ",Phrases.get(s, phrase)); // *** debugging
#endif
		fmatch = false;
		if (-COMMA_LIT == plex->ilit) {
			return false; // [05.06.24] don't allow matches across commas ### <09.01.02> more generally, we don't want to match across *clauses*, but we don't have this information yet...
		}
		if (plex->ilit > 0) { // word has a literal list: check whether there is a match anywhere in it
			ptok = &literaList[plex->ilit];
			while (*ptok) {
#if DEBUG_LEX
				WriteLine("mLex compare: ", Literals.litArray[*ptok].pchar); // *** debugging
#endif
				if (*phrase == *ptok) {
					fmatch = true;
					nmatch += Literals.litArray[*ptok].length;
					break;			// ptok is left on the matching literal; it is used below
				}
				ptok += 2;			// entries are (literal, match-flag) pairs
			} // while (*ptok)
		}
		if (!fmatch) { // mismatch
			if (fskip) { // okay to skip
				++plex; // get next entry in lexArray
				continue;
			}
			else return false;
		}

		// check if a partial match is ok by checking connector
#if DEBUG_LEX
		WriteLine("mLex checking connector");
#endif
		if (iTemp >= MAX_TEMP) return false; // bounds check
		tempArray[iTemp++] = (int)(plex - lexArray); // store the word index
		++phrase; // *phrase is now the connector
		pendtext = plex->ptext + Literals.litArray[*ptok].length; // pointer to end of literal in the text + 1
		if ((*pendtext != ' ') && // was the entire word matched?
			!(*phrase & Phrases.connFullSkip)) {
			if (*phrase == Phrases.connEqual) return false; // root ended with = so endings are not allowed
			if ((Actor == wtype) || (Agent == wtype) || (Noun == wtype)) { // check for possible noun phrase endings
				if (*(phrase+1)) return false; // not the last element in the phrase
				if (('S' == *pendtext) && (' ' == *(pendtext+1))) {		// ...S
					++nmatch;
					return true;
				}
				else if (('S' == *(pendtext+1)) && (' ' == *(pendtext+2)) && (('\'' == *pendtext) || ('E' == *pendtext))) {	// ...'S or ...ES
					nmatch += 2;
					return true;
				}
			}
			if (Verb == wtype) { // check for possible regular verb endings
				if (*(phrase+1)) return false; // not the last element in the phrase
				if (('S' == *pendtext) && (' ' == *(pendtext+1))) {		// ...S
					++nmatch;
					return true;
				}
				else if (('E' == *pendtext) && (' ' == *(pendtext+2)) && (('D' == *(pendtext+1)) || ('S' == *(pendtext+1)) || ('N' == *(pendtext+1)))) {	// ...ED, ...ES, ...EN
					nmatch += 2;
					return true;
				}
				else if (('I' == *pendtext) && ('N' == *(pendtext+1)) && ('G' == *(pendtext+2)) && (' ' == *(pendtext+3))) {	// ...ING
					nmatch += 3;
					return true;
				}
				else if (('E' == *(pendtext-1)) && ('D' == *pendtext) && (' ' == *(pendtext+1))) {	// literal ends in E, word adds D
					nmatch += 1;
					return true;
				}
			}
			return false; // return since this was the last part of the phrase
		}
		if (*phrase & Phrases.connPartNext) fskip = true; // check whether we need consecutive words
		else fskip = false;
		++plex; // get next entry lexArray
		++phrase; // get next part of phrase
	} // while

	if (*phrase) return false; // phrase was not completely matched; use this rather than "true"
	// since otherwise multi-word, skip-connected phrases will incorrectly match. See DEBUG-01
	else return fmatch;
} // matchLex
void ParserClass:: makeLex(void)
// Converts filtext into lexArray: one entry per blank-delimited word.
// For each word:
//   ptext : start of the word in filtext
//   flags : setInitCap / setMidCap for capitalized words (only when the record is
//           not all caps), setNumber, setComma or setReplace for symbols
//   ilit  : negative symbol code for numbers, commas and replace markers;
//           otherwise the start of the word's literal list in literaList if
//           Literals.check() reports a match, else 0
// Words are shifted to upper case in place before the literal check (unless the
// record is already all caps). Sets iLex to the index of the last entry and
// restarts the literal list at iLitr = 1.
// Input longer than MAX_LEX-2 words is truncated.
// Throws TOO_SHORT_ERROR (after storing it in iparseError) if iLex < minLex.
//
// Changes from the earlier version:
//  - ctype calls are made on unsigned char values (a negative char is undefined there)
//  - the upper-casing loop also stops at the terminating NUL
//  - the end of a number is located without assuming a following blank exists
{
	char *pst = filtext;
	char *psa;
	litstring s;
	lexStruct * plex;
	bool first = true;
	bool skip = false;
	toktype itok;

	iLex = -1;
	iLitr = 1; // literaList[0] is unused because ilit=0 signals no literals were matched
	while (' ' == *pst) ++pst; // go to first non-blank char

	while (*pst) {
		++iLex;
		if (iLex >= MAX_LEX-2) { // leave room for defaults; note that if we actually hit this, forwarding would also not work
			--iLex; // leave room for 'Halt' record in syntArray
			break; // truncate
		}
		plex = &lexArray[iLex];
		plex->ptext = pst; // record start of text in senfilt
		plex->flags = 0; // initialize flags
		if (!fRecCAPS) {
			if (isupper(static_cast<unsigned char>(*pst))) { // set the various flags before moving to upper case
				if (first) {
					plex->flags = setInitCap;
					first = false;
				}
				else plex->flags = setMidCap;
			}
		}

		if (isdigit(static_cast<unsigned char>(*pst))) { // check for <number>
			psa = Copy2Chr(s,pst,' ');
			if (!Literals.check(s)) { // check that this wasn't a literal, e.g. as in "Group of 77"
				char *pblank = strchr(pst,' ');
				plex->flags = setNumber;
				plex->ilit = -NUMBER_LIT; // reverse signs on symbols
				plex->pend = pblank ? (pblank - 1) : (pst + strlen(pst) - 1);	// last char of the number
				skip = true; // skip additional processing
			}
		}
		else if (',' == *pst) { // set to <comma>
			plex->flags = setComma;
			plex->ilit = -COMMA_LIT;
			plex->pend = pst;
			skip = true; // skip additional processing
		}
		else if (REPLACE_LIT == *pst) {
			plex->flags = setReplace;
			plex->ilit = -REPLACE_LIT;
			plex->pend = pst;
			skip = true; // skip additional processing
		}

		if (!skip) {
			psa = pst; // shift filtext to upper case now
			if (!fRecCAPS) {
				while ((*psa) && (*psa != ' ')) {
					*psa = static_cast<char>(toupper(static_cast<unsigned char>(*psa)));
					++psa;
				}
			}
			psa = Copy2Chr(s,pst,' ');
			itok = iLitr; // save the starting point of possible literal list
			if (Literals.check(s)) {
				plex->ilit = itok;
				plex->pend = pst + (psa - s); // set pend to work with filtext ### no longer needed, right?
			}
			else {
				plex->ilit = 0;
				plex->pend = NULL; // ### no longer needed??
			}
		}
		else skip = false; // reset skip

		pst = strchr(pst,' ');
		if (pst) ++pst;
		else break; // ### <08.06.19> since space is added at end, we should never hit this, but a text from
		// SAE containing massive numbers of non-printing chars (see Validation Record xx) led to an
		// error here before the 'else' was added. On the other hand,if pst == NULL we need to do
		// *something*, so the original code was flawed. Still don't understand what was happening
		// here, however...
		while (' ' == *pst) ++pst; // go to next non-blank char
	} // while

	if (iLex < minLex) {
		iparseError = TOO_SHORT_ERROR;
		throw TOO_SHORT_ERROR;
	}
#if FALSE
	for (int ka=0; ka<=iLex; ++ka) { // *** debugging output
		WriteLong("iLex ",(long)ka);
		WRITESTRING(" ptext ");
		pst = lexArray[ka].ptext;
		while (*pst != ' ') WRITECHAR(*pst++);
		WRITEEOL();
		WRITESTRING(" pend ");
		if (lexArray[ka].pend) WriteLine(*(lexArray[ka].pend));
		else WriteLine("null");
		WriteLong(" ilit ",lexArray[ka].ilit);
	}
	Pause();
#endif
} // makeLex
void ParserClass:: makeSynt(void)
// fill in syntArray from lexArray; assign roots
{
int ka;
syntStruct * psyn = syntArray;
tokindex itok;
RootsClass::rootStruct root;
int idxlit; // index to start of current literal list
toktype * ptok; // pointer to literal being checked
int nmatch; // chars matched for current root
int maxmatch; // info on root with maximum number of matched chars
wordtype maxword;
tokindex maxtok;
bool inreplace = false; // flag for inside replace markers
#if DEBUG_LEX
char sout[32];
#endif
if (fuseForward) setForward();
else iTag = 1; // 1 because 0 is used as the end-of-list indicator
for (ka = 0; ka<=iLex+1; ++ka) { // initialize array
syntArray[ka].index = ka; // index of array
syntArray[ka].ilit = lexArray[ka].ilit; // start of literaList list
syntArray[ka].iroot = 0; // index of rootArray entry
syntArray[ka].wtype = Null; // word type
syntArray[ka].pheadtag = 0; // tag list prior to word
syntArray[ka].ptailtag = 0; // tag list following word
syntArray[ka].flags = 0; // indicator flags
}
iSynt = iLex+1;
lastSynt = iSynt;
syntArray[iSynt].wtype = Halt; // signal end of sentence
syntArray[iSynt].ilit = 0;
while (psyn->wtype != Halt) { // assign types and roots
idxlit = psyn->ilit;
#if DEBUG_LEX
WriteLong("mSynt index : ", (long)psyn->index);
#endif
if ((inreplace) && (-REPLACE_LIT != idxlit)) { // don't process text inside replace markers
++psyn;
continue;
}
if ((idxlit) && (Null == psyn->wtype)) { // second condition prevent re-assigning types in compound roots
if (idxlit < 0) { // assign types to numbers, commas
idxlit = -idxlit; // symbol, so reverse sign
#if DEBUG_LEX
WriteLine("mSynt result >> symbol or number");
#endif
if (COMMA_LIT == idxlit) psyn->wtype = Comma;
else if (NUMBER_LIT == idxlit) psyn->wtype = Number;
else if (REPLACE_LIT == idxlit) {
psyn->wtype = Replace;
inreplace = !inreplace;
}
}
else { // check for root
maxmatch = 0;
ptok = &literaList[idxlit];
while (*ptok) { // go through the literals list
#if DEBUG_LEX
WRITESTRING("mSynt check'em lit :");
WRITESTRING(Literals.litArray[*ptok].pchar);
sprintf(sout," (index=%3d)",idxlit); // *** debug
WriteLine(sout);
#endif
itok = Literals.litArray[*ptok].istart; // go through istart list
while (itok) {
root = Roots.rootArray[Literals.litBuf[itok]];
if (Null == root.wtype) { // root has been deleted
itok = Literals.litBuf[itok+1]; // continue through the istart list
continue;
}
#if DEBUG_LEX
WriteLine("mSynt check root:",Literals.litArray[*(root.phrase)].pchar); // *** debug
#endif
if (matchLex(nmatch,root.phrase, root.wtype, psyn->index)) {
if (nmatch > maxmatch) { // save the phrase if it is longer than current best match
for (ka = 0; ka < iTemp; ++ka) matchArray[ka] = tempArray[ka];
iMatch = iTemp;
maxmatch = nmatch; // update the maximum chars matched
maxtok = itok; // save itok so we can retrieve root that generated this
maxword = root.wtype;
}
#if DEBUG_LEX
WriteLine("mSynt >> match");
#endif
}
#if DEBUG_LEX
else WriteLine("mSynt >> failed");
#endif
itok = Literals.litBuf[itok+1]; // continue through the istart list
} // while itok
// ++ptok; ### pre-continuation version
ptok += 2;
} // while (*ptok)
if (maxmatch) {
toktype index;
psyn->iroot = Literals.litBuf[maxtok]; // record the root
++Roots.rootArray[psyn->iroot].used; // increment .used counter
addHeadTag(psyn->index,maxword,index);
for (ka = 0; ka < iMatch; ++ka) syntArray[matchArray[ka]].wtype = maxword;
addTailTag(matchArray[iMatch-1],maxword,index);
}
} // else
} // if idxlit
#if DEBUG_LEX
else WriteLong("mSynt skip:type ",(long)psyn->wtype);
#endif
++psyn;
} // while psyn->wtype
} // makeSynt
void ParserClass:: setForward(void)
// check if forwarding can be done; transfer first actor and compound from
// previous record to area past 'Halt' record in syntArray, then move relevant
// tag information to beginning of tagArray, adjusting the pointer and links.
// Note that this operates on the syntArray and tagArray left over from the
// previous record, and must be called before these are over-written.
// Currently does not handle secondary forwards (i.e. forward from a pronoun
// reference)
{
idxForwActor = -1;
idxForwCompd = -1;
if (!fcanForward) { // new story or skipped sequence number
iTag = 1; // 1 because 0 is used as the end-of-list indicator
return;
}
int saveTag = iTag;
syntStruct * psyn = syntArray;
int itrans; // index of cells reference will transfer into
syntStruct * ptrans; // pointer to same
int savetrans;
if (lastSynt > iLex + 1) itrans = lastSynt + 1; // start transfer beyond previous and current iSynt
else itrans = iLex + 2;
ptrans = &syntArray[itrans];
savetrans = itrans;
while ((idxForwActor < 0) && (idxForwCompd < 0) &&
(psyn->wtype != Halt)) { // find first actor and compound
if (hasHeadTag(psyn->index,Compound)) { // transfer compound
if (idxForwCompd >= 0) { // we already have compound, so skip over this
while (!hasTailTag(psyn->index,Compound)) ++psyn;
}
else {
idxForwCompd = itrans;
ptrans->pheadtag = 0; // zero out whatever is left over here
ptrans->ptailtag = 0;
addHeadTag(itrans, Compound);
while (psyn->wtype != Halt) {
ptrans->index = itrans;
ptrans->iroot = psyn->iroot; // root and wtype are the only info we can save
ptrans->wtype = psyn->wtype;
ptrans->ilit = -1; // literal list no longer valid
ptrans->flags = setCompd;
if (hasHeadTag(psyn->index, Actor)) addHeadTag(itrans, Actor); // copy actor tags
if (hasTailTag(psyn->index, Actor)) addTailTag(itrans, Actor);
++ptrans;
++itrans;
ptrans->pheadtag = 0;