forked from DanielHsH/Regex-Matcher
-
Notifications
You must be signed in to change notification settings - Fork 0
/
reg_exp.cpp
552 lines (502 loc) · 31.6 KB
/
reg_exp.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
// Includes
#include "reg_exp.h"
#include <stdio.h> // printfs
#include <string.h> // memory operations like memcmp() memset()
/****************************************************************************/
// Basic definitions of types. Feel free to remove if you have them already defined.
#ifndef FALSE
#define FALSE (0)
#endif
#ifndef TRUE
#define TRUE (1)
#endif
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
typedef unsigned char MYBOOL, UCHAR; // Feel free to change boolean to 'int' or C++ 'bool'.
#if defined(__ANDROID__) || defined(__APPLE__) || defined(__MACH__)
typedef __int32_t INT32; // Android and iOS specific INT32. Feel free to switch to 64bits integer
#define strcpy_s(dst,size,src) strcpy(dst,src) // Forward compatibility to safe I/O methods. Activate only if needed (not needed for windows devices)
#else
typedef signed __int32 INT32; // Windows
#endif
/****************************************************************************/
// Fast Comparison of 8,16,32 bits int - without branching. Use in loops where speed is crucial.
#define is_int_negative( num1 ) ((((INT32) (num1) )&0x80000000)>>31) // (num1<0 ) ? 1 : 0
inline INT32 is_int_notZero( INT32 num1 ){ return is_int_negative(num1|-num1); } // (num1!=0 ) ? 1 : 0, Since (-X)|X is 0 for X==0 and negative for every other X.
#define is_int_notEqual( num1,num2) is_int_notZero(num1^num2) // (num1!=num2)? 1 : 0
#define is_int_equal( num1,num2) (is_int_notZero(num1^num2)^0x1) // (num1==num2)? 1 : 0
#define is_int_inRange(i,L,U) ((((INT32((i)-(L)))|(INT32((U)-(i))))^0x80000000)>>31) // ((i>=L)&&(i<=U) ? 1 : 0
#define isDigit( chr) is_int_inRange(chr,'0','9') // ('9'>=chr>='0') ? 1 : 0
/****************************************************************************/
static int atoui_simple(const char* s){ // Read positive integer from string. Like "123" is converted to 123.
int resI = 0;
for (;isDigit(*s); s++)
resI = resI*10 + (s[0]-'0');
return resI;
}
/****************************************************************************/
// Define the types of special regex character commands (as bits).
#define TYPE_CHAR (0) // Default 0 - Single character. Like: A b 7 .
#define TYPE_PREFIX (1) // Sepcial '\\' command for Abbreviations and using special characters. like \\?. This is the only command that actually uses 2 characters
#define TYPE_SUFFIX (2) // Command has iteration suffix like a*, z+, b{3,5}
#define TYPE_OPEN (4) // Left Parentheses: On of the following { ( [ . This rule opens a sub expression
#define TYPE_CLOSE (8) // Right parentheses: } ) ]
#define TYPE_RECURSION (16) // Termination of recursive call. (command is a suffix of previous one. Like previous is 'A' and current is '{2}') Todo: Not used yet
#define NO_MATCH (-1) // Returned when regular expression cannot be matched to the string.
/****************************************************************************/
// Define a command structure (single language rule)
typedef struct{
char id; // The character which represennts the command. Like: * ? [
char attr; // Type of the command. Can be combination of above types
void* f; // Pointer to function which proccesses the current command. Polymorphism (C style)
} Cmd;
/****************************************************************************/
// Total there are 6 functions
// First parameter is the pattern. Second is the sample string and optional third is pointer to the end of the pattern
// Each one returns the number of consumed characters in sample string or NO_MATCH if inapplicable
// (like 'a+' was applied on 'bbb'). Note 'a*' can be applied on 'bbb' and it consumes zero characters.
static inline int c_achar( const char* pat, const char* sam); // Handling single char comparison. Like 'b'
static inline int c_any( const char* pat, const char* sam); // Handlling 'any' comparison. '.' Note: '.' may consume few bytes (1 char) when working with unicodes, and only 1 byte (= 1 char) for ascii.
static inline int c_extended(const char* pat, const char* sam); // Handaling special extended abreviations starting with '\\'
static inline int c_group( const char* pat, const char* sam); // Sub pattern. Grouping characters. Like (the) when searching inside 'I am the master of the realm'
static inline int c_option( const char* pat, const char* sam); // Selection of one option to match [aA]ce mathces both words: Ace and ace
static inline int c_multi( const char* pat, const char* sam, const char* endp);// Multiple occurance of the character. Like A+, A* A{4} A?
// Define pointers to functions that process the special commands. 2 Types
typedef int (*Stand_Func)(const char* pat, const char* sam ); // Standard functions and characters like () . [] \\ A 7 ....
typedef int (*SuffixFunc)(const char* pat, const char* sam,const char* endp); // Suffix functions for multiple occurences like * + ? {}
/****************************************************************************/
// Define the table of command (regex rules). For each id, it's length, type of command and processing function
// Rules of commands: TYPE_CLOSE follows TYPE_OPEN immediately in command table
static const Cmd cmd_tbl[] = {
'(', TYPE_OPEN|TYPE_RECURSION, (void*)c_group,
')', TYPE_CLOSE|TYPE_RECURSION, (void*)NULL,
'|', TYPE_CLOSE|TYPE_RECURSION, (void*)NULL,
'[', TYPE_OPEN, (void*)c_option,
']', TYPE_CLOSE, (void*)NULL,
'{', TYPE_SUFFIX|TYPE_OPEN, (void*)c_multi,
'}', TYPE_CLOSE, (void*)NULL,
'*', TYPE_SUFFIX, (void*)c_multi,
'+', TYPE_SUFFIX, (void*)c_multi,
'?', TYPE_SUFFIX, (void*)c_multi,
'\\', TYPE_PREFIX, (void*)c_extended,
'.', TYPE_CHAR, (void*)c_any,
0, TYPE_CHAR, (void*)c_achar,
};
#define cmdLength( cmd) (1 + ((cmd)->attr&TYPE_PREFIX)) // All commands take 1 character + optional prefix character
#define isSuffix(cmd) ( (cmd)->attr&TYPE_SUFFIX) // Does current command is a suffix of previous one. Like previous is 'A' and current is '{2}'
#define isOpen( cmd) ( (cmd)->attr&TYPE_OPEN ) // Does this command opens a sub expression
// Inverse table of the above (given a character like '*', 'C', '\' get the appropriate command). We use a look up table for all possible ASCII characters
static const Cmd* get_cmd_byChar[128];
static int isInitialized = 0; // Was the look up table above initialized
#define get_cmd(c) get_cmd_byChar[(c)&0x7F] // Get the command strucutre by character. For '(' will return '(' command
#define isReservedSymbol(c) (get_cmd(c)->id != 0) // Is the given character a reserved symbol (used as regex command)
/****************************************************************************/
/****************************** Aux functions *******************************/
/****************************************************************************/
// Initialize the get_cmd_byChar[] look up table
static inline void init_regex_mechanism_private(void){
if (isInitialized) return;
const Cmd* cmd = cmd_tbl, *end = cmd_tbl;
while (end->id) end++; // Find the last default command (character processing) and store it in 'end'
for (int i=0; i<128; i++)
get_cmd_byChar[i] = end; // Set the whole look up table to point to the default command
for (; cmd<end; cmd++)
get_cmd_byChar[cmd->id] = cmd; // For all the real commands set an entry in the look up table
isInitialized = TRUE;
}
/****************************************************************************/
// Find the end of the string.
static inline const char* endOfString(const char *str){
while (*str) str++;
return str;
}
/****************************************************************************/
// Find first occurence of character in the string. Returns the poitner in 'str' starting at 'c' or NULL if not found
static inline const char* findFirstCinS(const char *str, const char c){
while ((*str)&&(*str-c)) str++;
return (*str) ? str : NULL;
}
/****************************************************************************/
// Given an expression starting with left parentheses 'p' return a pointer after the end of this expression (right parentheses 'rp').
// Example: given p=[a*(1+3)[z@][aa]]tp5 returns pointer to: 'tp5' by skipping
// the [a*(1+3)[z@][aa]] expression.
// Note: handles nested parentheses by remembring the parentheses depth. Example: (((X)Y)Z)A - X is in depth 3, Z - in depth 1, A in depth zero.
// Extension _uc means uncompiled. needs to read forward in real time to find the end of the expression.
static inline const char* findExpressionEnd_uc(const char *p, const char rp){
char lp = *p; // left parethesis
int depth = 1; // amount of '('- amount of ')'.
MYBOOL isValidCommand = TRUE; // \\[ doesn't count as valid parentheses since it should be treated as part of the text.
for (p++; is_int_notZero(*p)&is_int_notZero(depth); p++){
depth += isValidCommand * (is_int_equal(*p,lp) - is_int_equal(*p,rp)); // Update the depth only if parenthesis is valid. each 'lp' causes +1, rp causes -1
isValidCommand = is_int_notEqual(*p,'\\'); // If current charachter == '\' than the parentheses become invalid
}
return (depth==0) ? p : NULL; // (depth==0) -> Expresion parentheses was matched, otherwise end of string reached and Parentheses are not balanced
}
/****************************************************************************/
// Same as findExpressionEnd_uc() but supports the the set-union extended POSIX defenition.
// This method considers the symbol '|' as closing parentheses. So (aab|ccd|ef) will return 'aab' as first expression.
// On the second execution will return 'ccd' and on the third will return 'ef'.
// Extension _uc means uncompiled. needs to read forward in real time to find the end of the expression.
static inline const char* findExpressionEnd_UnionSet_uc(const char *p, const char lp, const char rp){
int depth = 1;
MYBOOL isValidCommand = TRUE;
for (p++; (*p!='\0')&&(depth!=0); p++){
depth += isValidCommand * (is_int_equal(*p,lp) - is_int_equal(*p,rp));
depth -= is_int_equal(depth,1)&is_int_equal(*p,'|'); // '|' affects only on top level. For example: (XXX(a|B|)DDD) the '|' are not possible alternatives
isValidCommand = is_int_notEqual(*p,'\\');
}
return (depth==0) ? p : NULL;
}
/****************************************************************************/
// Find next unit of pattern
static const char* goToNextPat_uc(const char* cur){
const Cmd* cmd = get_cmd(*cur); // Get the command
if isOpen(cmd) // If this is open command: [,(,{ than search for closeing character Otherwise just advance forward
return findExpressionEnd_uc(cur,(cmd+1)->id); // Find the closing parentheses of cur.
return cur + cmdLength(cmd); // Just skip the command
}
/****************************************************************************/
/****************************** Compilation *********************************/
/****************************************************************************/
// Assumes the pattern is legal. Compiles it into 'C'. Returns the end of the pattern on success or NULL on failure.
const char* tCompiledRegex::compile(const char *pat){
init_regex_mechanism_private();
start = pat; // Pointer to the pattern
// For each 'OPEN' rules calculate the length of the expression. Todo: make it O(n) instead of O(n^2) for worst case of "((((((((((A))))))))))"
for (int i = 0; *pat; pat++, i++)
exprLen[i] = (UCHAR)(goToNextPat_uc(pat) - pat); // 'i' is alwyas equals to (pat - start). Initialize the length of current
end = pat;
// For each '(' ')' rules calculate union set if relevant. Like (A(z*)A|BB|CC)
memset(unionLen,0,sizeof(*unionLen)*(end-start));
pat = start;
for (int i = 0; *pat; pat++, i++){
if ((*pat!='|')||(pat[-1]=='\\')) // We don't care about non unions.
unionLen[i] = exprLen[i];
else if (unionLen[i]==0){ // If we already calculated the length for current union, skip it.
// We are by definition at the first union. Example: For (AA|BB|CC), We are at |BB|CC).
int open;
for (open = i-1; start + open + exprLen[open] <= pat; open--); // Go backwards until we find the '(' of the current union.
// OK, now start+open points exactly to the '(' that opened a union. Moreover 'pat' points to the first '|'
const char *next = pat, *cur = start+open; // Iterate over all the '|' and for each store the length until the next '|'
while (*next =='|'){
unionLen[cur-start] = (UCHAR)(next - cur + 1); // Mark the current '|'
cur = next; // Advance to the next '|' or the terminating ')'
next = findExpressionEnd_UnionSet_uc(next, '(',')') -1;
}
unionLen[cur-start] = (UCHAR)(next - cur + 1); // MArk for the last '|' the length until the terminating ')'
}
}
return end;
}
static const tCompiledRegex* compiledRegexPtr = NULL; // Pointer to the current regex-used. This line ruins multi-threading capabilities. Feel free to change the system architecture to support mutli-threading
static const char* EOS = NULL; // Pointer to the end of current processed sample. Same reason as above
/****************************************************************************/
/**************************** Command Handlers ******************************/
/****************************************************************************/
// Main method. Matches pattern to sample string. Returns the number of used characters
// Or NO_MATCH if impossible to match.
// 'endp' is pointer to the end of the pattern.
// Declaration of the main mehtod is needed since it is recursivly called.
static int match(const char* pat, const char* sam, const char* endp);
/****************************************************************************/
// Any char comparison is always true
static inline int c_any( const char* pat, const char* sam){
return 1;
}
/****************************************************************************/
// Single char comparison. Match uses one charactr. Wrong returns NO_MATCH.
static inline int c_achar(const char* pat, const char* sam){
return (*pat == *sam) ? 1 : NO_MATCH;
}
/****************************************************************************/
static inline int c_group( const char* pat, const char* sam){
const char *close = compiledRegexPtr->getExpressionEnd_UnionSet(pat);
if (!close) return NO_MATCH; // Could not match the paretheses. Wrong expresion. Exit
int nCharsMatched;
while (close[-1]=='|'){
nCharsMatched = match(pat+1, sam, close-1); // +1 and -1 remove the parentheses
if (nCharsMatched >= 0)
return nCharsMatched;
pat = close-1; // Advance to the next alternative
close = compiledRegexPtr->getExpressionEnd_UnionSet(pat);
if (!close) return NO_MATCH; // Could not match the paretheses. Wrong expresion. Exit
}
return match(pat+1, sam, close-1); // Execute the final alternative.
}
/****************************************************************************/
// All possible abbreviations
static inline int c_extended( const char* pat, const char* sam){
#define ABB_LENGTH (32)
char abbr[ABB_LENGTH] = "";
switch (*++pat){
case 'd': strcpy_s(abbr, ABB_LENGTH, "[0-9]"); break; // Digit
case 'D': strcpy_s(abbr, ABB_LENGTH, "[^0-9]"); break; // Non-digit
case 'x': strcpy_s(abbr, ABB_LENGTH, "[0-9A-Fa-f]"); break; // Hex digit
case 'X': strcpy_s(abbr, ABB_LENGTH, "[^0-9A-Fa-f]"); break; // Non Hex
case 'w': strcpy_s(abbr, ABB_LENGTH, "[0-9A-Za-z_]"); break; // Word character
case 'W': strcpy_s(abbr, ABB_LENGTH, "[^0-9A-Za-z_]"); break;
case 'h': strcpy_s(abbr, ABB_LENGTH, "[0-9A-Za-z]"); break; // head of word character
case 'H': strcpy_s(abbr, ABB_LENGTH, "[^0-9A-Za-z]"); break;
case 'a': strcpy_s(abbr, ABB_LENGTH, "[A-Za-z]"); break; // Alphabetic character
case 'A': strcpy_s(abbr, ABB_LENGTH, "[^A-Za-z]"); break;
case 'l': strcpy_s(abbr, ABB_LENGTH, "[a-z]"); break; // Lowercase character
case 'L': strcpy_s(abbr, ABB_LENGTH, "[^a-z]"); break;
case 'u': strcpy_s(abbr, ABB_LENGTH, "[A-Z]"); break; // Uppercase character
case 'U': strcpy_s(abbr, ABB_LENGTH, "[^A-Z]"); break;
case 's': strcpy_s(abbr, ABB_LENGTH, "[ \t\r\n\v\f]"); break; // Whitespace characters
case 'S': strcpy_s(abbr, ABB_LENGTH, "[^ \t\r\n\v\f]"); break;
}
if (*abbr) return match(abbr, sam, endOfString(abbr));
else return c_achar(pat,sam); // Unknown abbreviation. Just assume that it is a character comparison
}
/****************************************************************************/
// Chose one of the options in []. Like [\\-0-9$_#]
static inline int c_option( const char* pat, const char* sam){
const char *from = NULL; // If we have [a-z] 'from' is 'a', 'to' is 'z'
const char *to = NULL; // If we have [qQ] 'from' is 'q' and 'Q', 'to' is not needed
const char *close = compiledRegexPtr->getExpressionEnd(pat);// Extract the expression inside the [] parentheses
pat++; close--; // +1 and -1 remove the parentheses
int negationOp = ((*pat == '^') ? NO_MATCH : 1); // Check for negation flag. Invert character [^a-z], representing negation operator
if (negationOp<0)
pat++;
while (pat < close){
if (*pat == '-' && from){ // Check for range selection. Like 0-9, where we already have the from
to = pat + 1; // Find the 'to'
if (*to == '\\') to++; // Comparison with reserved character. like \- or \*
// Test for range
if is_int_inRange(*sam,*from,*to)
return negationOp; // We have found a match. If 'not' is active than this is a violation of the pattern
pat = to + 1; // So *sam didn't match the current range, try the next range. Like a-z and A-Z
continue;
}
from = pat; // Beggining of the pattern. Initialize 'from'
if (*from == '\\'){
from++; pat++; // Comparison with reserved character. like \\* or \\?
}
if (*sam == *from)
return negationOp; // Comparison of single letter. Like [a-ZAB]
pat++;
}
return -negationOp; // We tested all the options and nothing was mathing.
}
/****************************************************************************/
// Multiple occurence of a character
static inline int c_multi( const char* pat, const char* sam, const char* endp){
const Cmd* cmd = get_cmd(*pat);
int nCharsMatched; // How many characters the multi repitition consumed.
int nRestCharsMatched = NO_MATCH; // How many characters the rest of the pattern consumes (if it exists).
int nRepitions; // Counter, how many repititions were made up to now in a loop
const char* start_sam = sam;
const char *ends = EOS; // Get the end of the sample (stored in cache, instead of recalculation)
const char *foundMatchAt = NULL; // We already found a match but want to try and find a longer matching string
const char *multi = compiledRegexPtr->getExpressionEnd(pat); // Multi occurence pattern: {}, *,?,+
const char *next_pat = compiledRegexPtr->getExpressionEnd(multi); // The rest of the pattern.
// Calculate Min/Max numbers of needed occurences
int min = 0, max = 1; // Initialization not really needed. Just in case.
switch (*multi){
case '{': // For range of repetition: {4} or {4-8}
{
const char* comma = findFirstCinS(multi, ',');
const char* rEnd = findFirstCinS(multi, '}');
// Read the minimum value
min = atoui_simple(multi+1);
// If comma exists inside {} than read also the maximum value
if (comma){
if (comma < rEnd-1) max = MAX(atoui_simple(comma + 1),1); // Read Max, Max must be at least 1;
else if (comma == rEnd-1) max = (1<<30); // Max does not exists: '{min,}', assume 1 billion is enough. Can use MAX_INT instead. Daniel did not want a dependency on <limits.h>
}
else max = MAX(min,1); // No range: Like {4}, Max must be at least 1;
}
break;
case '+': min = 1; max = (1<<30); break;
case '?': min = 0; max = 1; break;
case '*': min = 0; max = (1<<30); break;
}
// If (min==0), we first try to match the rest of pattern
if ((min==0)&&(*next_pat)){
nRestCharsMatched = match(next_pat, sam, endp);
if (nRestCharsMatched>=0)
foundMatchAt = start_sam + nRestCharsMatched; // Yes! The rest of the sample string matches the rest of the pattern. But maybe we can do more repititions and still be fine. like '.*b' matched to 'ab' but can also match 'abccqqb'
// Note: if nRestCharsMatched==0 than the rest of the pattern can be matched to an empty string. Success is guaranteed. Now we want to match as much repititions as we can. Like 'a*b?' was matched to first character of 'aaaz' but can be matched to 'aaa'.
}
// OK. We need to take at least one repitiotion. Enter the loop
nRepitions = 0;
while (sam < ends){
nCharsMatched = ((Stand_Func)cmd->f)(pat, sam); // Find the pattern for the i'th time.
if (nCharsMatched < 0){
// No more repetitions are possible
if (nRepitions < min) return NO_MATCH; // We need at least 'min' but failed
if (*next_pat){
// If (nRestCharsMatched < 0)
// We have enough iterations but we already know that the rest of pattern can't be matched. If we found a good solution earlier return it. Otherwise no solution for matching
// Else We have found a good solution right now and no more iterations possible. Return the good solution.
return (foundMatchAt) ? (int)(foundMatchAt-start_sam) : NO_MATCH;
}
return (int)(sam-start_sam); // Macth found. Use 'nRepitions'
}
sam += nCharsMatched; // Found 'i' repitiotions. Advance pointers
nRepitions++;
if (nRepitions < min) continue; // If we still havent reached the minimal amount of repititions than continue to gather more repetitions.
// OK, we have at least 'min' iterations, Time to check the if the rest of the
// pattern can be matched. If not we will look for more occurences.
// Otherwise we will use the current amount of occurences.
if (*next_pat){
nRestCharsMatched = match(next_pat, sam, endp);
if (nRestCharsMatched>=0)
foundMatchAt = sam + nRestCharsMatched; // See explanation of the code line 'foundMatchAt = start_sam + nRestCharsMatched;' above
}
if (nRepitions == max){ // Check the maximal limit of repititions.
if (*next_pat)
return (foundMatchAt) ? (int)(foundMatchAt-start_sam) : NO_MATCH; // See explanation for this exact code line above
return (int)(sam-start_sam); // Macth found. Use maximal possible amounts of repitions.
}
}
// None of the iterations yielded a consistent match. We exited the loop due to end of sample string.
if (nRepitions < min) return NO_MATCH; // Sample string terminated and we didn't get our minimal amount.
if ((*next_pat)&&(nRestCharsMatched < 0))
return (foundMatchAt) ? (int)(foundMatchAt-start_sam) : NO_MATCH; // Sample string terminated and the rest of the pattern cannot be matched to an empty string. If we found a good solution return it. Otherwise no solution for matching
return (int)(sam-start_sam); // No following patterns that require aditional characters and we got enough iterations. like a*z? on a string of 'aaaa'
}
/****************************************************************************/
// Match pattern to the 'sam' string from its beginning.
// Returns the amount of consumed characters if match was successfull. Otherwise returns NO_MATCH.
// Note: 0 means successfull match. For example 'a?' is matched to 'bc' with zero occurences of 'a'
static int match(const char* pat, const char* sam, const char* endp){
const Cmd* cmd;
int nCharsMatched;
const char* start_sam = sam, *next_pat;
if (!pat)
return NO_MATCH; // NULL pattern is illegal
while (pat < endp){
next_pat = compiledRegexPtr->getExpressionEnd(pat); // Find next pattern to see if it is a suffix like *, {x,y},?,+
if (next_pat==NULL)
return NO_MATCH; // Wrong regular expression. For example '(A)))'
cmd = get_cmd(*next_pat); // Check the next command if it is a suffix
if (isSuffix(cmd)){ // 'cmd' is indeed a suffix. like 'z{3,7}'. Activate {3,7} on pattern 'z'.
int matchedLen = ((SuffixFunc)cmd->f)(pat, sam, endp); // Execute the suffix
return (matchedLen>=0) ? (int)(sam-start_sam) + matchedLen : NO_MATCH;
}
else{ // No suffix
cmd = get_cmd(*pat);
// if (cmd->attr&TYPE_RECURSION){ To do: Handle the the case of the bug (.*)AB is not matched to 'ZAB' because .* consumes 3 letters
nCharsMatched = ((Stand_Func)cmd->f)(pat, sam);
if (nCharsMatched < 0)
return NO_MATCH; // If matching failed return NO_MATCH.
sam += nCharsMatched; // Advance to next pattern
pat = next_pat;
}
}
return (int)(sam-start_sam);
}
/****************************************************************************/
/******************************** API methods *******************************/
/****************************************************************************/
const char* regex_search(const char* pattern, const char* sampleString, int* resLen){
tCompiledRegex builtInCompiledRegex;
builtInCompiledRegex.compile(pattern);
return builtInCompiledRegex.search(sampleString, resLen);
}
/****************************************************************************/
const char* tCompiledRegex::search(const char* sampleString, int* resLen) const{
compiledRegexPtr = this; // Store 'this' as current regex
const char* pattern = start;
const char* endPattern = end;
const char* endOfSearch= EOS = endOfString(sampleString); // When comparint the pattern to sample string we will search the entire sample.
if (pattern == endPattern){ // Empty pattern is matched with zero length matching
*resLen = 0;
return sampleString;
}
if (pattern[0]=='^'){ // Check if first characters forces a matching to beggining of the string
pattern++; // Skip the '^'
endOfSearch = sampleString+1; // Allow match only for first position.
}
// Try to match from every possible place in the sample string
for (;sampleString != endOfSearch; sampleString++){
*resLen = match(pattern, sampleString, endPattern);
if (*resLen > 0) // Note the >0 comparison and not >=. We do not allow empty string match.
return sampleString; // Full match was found. Return the current location in sample string.
}
sampleString = EOS; // No match was found through the entire search.
*resLen = match(pattern, sampleString, endPattern); // Try to match pattern to empty string (since we didn't allow it before).
return (*resLen>= 0) ? sampleString : NULL; // Empty sample string was matched to pattern (like a*b*c*) (if==0). Otherwise no match
};
/****************************************************************************/
/****************************** UNITEST methods *****************************/
/****************************************************************************/
#ifdef REGEX_UNITEST
void regex_debug(const char* pattern, const char* sampleString, const char* trueAnswer){
int len = 0;
char output[256];
const char* res;
const char error[] = "@NO";
const char empty[] = "";
res = regex_search(pattern, sampleString, &len);
if (len>0){ memcpy(output,res, len); output[len] = '\0'; }
else if (len==0) memcpy(output,empty,strlen(empty)+1);
else memcpy(output,error,strlen(error)+1);
// Print only errors
if (strcmp(output,trueAnswer)) fprintf(stderr,"Error: Reg:%s\t\t in %s\t\t : %s,\n", pattern, sampleString, output);
else fprintf(stderr,"Test OK!\n");
}
/****************************************************************************/
// Sample of regular expression and true answers. Checks that mechanism works fine
static void regex_debug_private_tests(void){
regex_debug("(\\(*.\\[)[qQ]", "((a[Q]","((a[Q");
regex_debug("((b)c)", "abc","bc");
regex_debug("(a)((b)c)?", "abc","abc");
regex_debug("(a)(((b))c)(d)?", "abc","abc");
regex_debug("ba*(ac)?", "baaa","baaa");
regex_debug("a{1,2}c?q?", "aaa","aa");
regex_debug("a{3,}c?q?", "aaacq","aaacq");
regex_debug("a{2}v", "aaav","aav");
regex_debug("a*b*c*", "",""); // Testy empty matches
regex_debug("a*","zb","");
regex_debug("a*b*c*q", "","@NO");
regex_debug("ba*", "baaa","baaa");
regex_debug("ba*(ac)+", "baaac","baaac");
regex_debug("ba+", "baaa","baaa");
regex_debug("ba{0,3}", "baaa","baaa");
regex_debug("ba{1,2}c?q?", "baaa","baa");
regex_debug("[0-3]*(22a)", "12222a","12222a");
regex_debug("b[0-9]+", "abc123xyz","@NO");
regex_debug("ac?.[0-2]*", "abc123xyz","ab");
regex_debug("ac?b[q-z]*c{0,4}.[0-2]*(22a)", "abc12222a","abc12222a");
regex_debug("(6(0|4|5)_)+", "61_60_64_65_A","60_64_65_");
regex_debug("(6(0|4|5)_)*", "61_60_64_65_A","60_64_65_");
regex_debug("(6(011|44|5)_)*", "6012_6011_644_65_6541_","6011_644_65_");
regex_debug("^a[ \t\r\n\v\f]*", "za \n ","@NO"); // Test spaces
regex_debug("a[ \t\r\n\v\f]*b", "ga \tbv","a \tb");
regex_debug("a[ \t\r\n\v\f]*", "za \t\t \f\n ","a \t\t \f\n ");
regex_debug("f..k","zfolky","folk");
regex_debug("va*b","kkvaaaab","vaaaab");
regex_debug(".*b","ababc","abab");
regex_debug("z.*b","xyzababc","zabab");
regex_debug(".*b","bc","b");
regex_debug("[abc]1{2}(cat|pup+y|dog).{2}","_a11puppy11_","a11puppy11");
regex_debug("[abc]*\\|","zab|","ab|");
regex_debug("[abc]*\\|[AC-Z]*","zab|ACDB","ab|ACD");
regex_debug("[abc]*\\|[ABCK]*","zab|ACKvv","ab|ACK");
regex_debug("[ABCK\\+UZ0-9]*","AAZ+BUB","AAZ+BUB");
regex_debug("[ABCKUZ0-9\\-]*","AAZ-BUB","AAZ-BUB");
regex_debug("[ABCK\\-UZ0-9]*","AAZ-BUB","AAZ-BUB");
regex_debug("[abc]*\\|[A\\?B-Z]*","zab|A?BCD","ab|A?BCD");
regex_debug("[^xyz]","zab","a");
regex_debug("[^xyc]+","zab","zab");
regex_debug("([0-9]+a{2,4})+q","1aa23aaa445aaaaq","1aa23aaa445aaaaq");
regex_debug("([abc]?[01]?)*","a1b00aab","a1b00aab");
regex_debug("[a-c]+(x{2,4})*","cxxxxxxxx","cxxxxxxxx");
regex_debug("([a-c]+(x{1,2})?)+x","cxccxxccxxx","cxccxxccxxx");
regex_debug("a[0-9]+b", "za789b","a789b");
regex_debug("[^A-Za-z]*","abc12..","12..");
regex_debug("^a\\+{3}\\*\\?b","a+++*?bc","a+++*?b"); // Check special characters
regex_debug("\\[\\[","a[?][[1","[[");
regex_debug("(V\\.?A\\.?T\\.?|TAX|TVA)( *[^A-Z]? +)?[A-_\\. ]*([0-9]+([\\.,][0-9]{1,2})?%?[0-9]+([\\.,][0-9]{1,2})?%_? +)?([A-_\\. ]*)?([0-9]+[\\.,][0-9]{1,2} +)?[A-_]? *[\\-\\+]?[0-9]+(,[0-9]{3})*[\\.,][0-9]{1,2} *.? *\n"," VAT 3.93\n","VAT 3.93\n");
regex_debug("\\[\\]","a[?][]1","[]1");
regex_debug("(\\[\\?\\])*\\|(\\*)+","a[?][?]|**1","[?][?]|**");
regex_debug("(.+)(.+)","AB","AB"); // Known bug in Daniels implementation!
regex_debug("((.*A)BABA)Z","ZABABABABAZQ","ZABABABABAZ"); // Known bug in Daniels implementation!
}
#endif
/****************************************************************************/
// EOF.