Crossfire Server, Branch 1.12
R12190
|
00001 /* 00002 * static char *rcsid_player_c = 00003 * "$Id: re-cmp.c 11578 2009-02-23 22:02:27Z lalo $"; 00004 */ 00005 00022 #include <stdio.h> 00023 #include <stdlib.h> 00024 #include <memory.h> 00025 #include <limits.h> 00026 #include <re-cmp.h> 00027 #include <ctype.h> 00028 #include <global.h> 00029 #include <define.h> /* Needed for OUT_OF_MEMORY. */ 00030 00031 /* Get prototype functions to prevent warnings. */ 00032 #if defined(__sun__) && defined(StupidSunHeaders) 00033 # include <sys/types.h> 00034 # include <sys/time.h> 00035 # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */ 00036 #endif 00037 00038 /* P r o t o t y p e s 00039 */ 00040 const char *re_cmp(const char *, const char *); 00041 static Boolean re_cmp_step(const char *, const char *, unsigned, int); 00042 static void re_init(void); 00043 static Boolean re_match_token(uchar, selection *); 00044 static const char *re_get_token(selection *, const char *); 00045 #ifdef DEBUG2 00046 static void re_dump_sel(selection *); 00047 #endif 00048 00049 /* G l o b a l v a r i a b l e s 00050 */ 00051 static Boolean re_init_done = False; 00052 static selection *re_token[RE_TOKEN_MAX]; 00053 static const char *re_substr[RE_TOKEN_MAX]; 00054 static unsigned int re_token_depth; 00055 00056 /* E x t e r n a l f u n c t i o n 00057 */ 00058 00070 const char *re_cmp(const char *str, const char *regexp) { 00071 const char *next_regexp; 00072 Boolean once = False; 00073 Boolean matched; 00074 00075 if (re_init_done == False) 00076 re_init(); 00077 00078 #ifdef SAFE_CHECKS 00079 if (regexp == NULL || str == NULL) 00080 return NULL; 00081 #endif 00082 if (*regexp == '^') { 00083 once = True; 00084 ++regexp; 00085 } 00086 if (*regexp == 0) { 00087 /* // or /^/ matches any string */ 00088 return str; 00089 } 00090 00091 next_regexp = re_get_token(re_token[0], regexp); 00092 re_token_depth = 0; 00093 re_substr[0] = next_regexp; 00094 00095 matched = False; 00096 while (*str != '\0' && !(matched = re_match_token(*str, re_token[0]))) 00097 str++; 00098 00099 if (matched && *next_regexp == 0) 00100 return str; 00101 00102 /* Apologies for the nearly duplicated code below, hopefully it 00103 * speeds things up. 00104 */ 00105 if (once) { 00106 switch (re_token[0]->repeat) { 00107 case rep_once: 00108 if (matched == False) 00109 return NULL; 00110 break; 00111 00112 case rep_once_or_more: 00113 if (matched == False) 00114 return NULL; 00115 00116 if (re_cmp_step(str+1, regexp, 0, 1)) 00117 return str; 00118 break; 00119 00120 case rep_null_or_once: 00121 if (matched == False) 00122 return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL; 00123 break; 00124 00125 case rep_null_or_more: 00126 if (matched) { 00127 if (re_cmp_step(str+1, regexp, 0, 1)) 00128 return str; 00129 } else { 00130 return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL; 00131 } 00132 break; 00133 } 00134 00135 return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL; 00136 } 00137 00138 if (matched) { 00139 switch (re_token[0]->repeat) { 00140 case rep_once: 00141 case rep_null_or_once: 00142 break; 00143 00144 case rep_once_or_more: 00145 case rep_null_or_more: 00146 if (re_cmp_step(str+1, regexp, 0, 1)) 00147 return str; 00148 break; 00149 } 00150 00151 /* The logic here is that re_match_token only sees 00152 * if the one letter matches. Thus, if the 00153 * regex is like '@match eureca', and the 00154 * the user enters anything with an e, re_match_token 00155 * returns true, but they really need to match the 00156 * entire regexp, which re_cmp_step will do. 00157 * However, what happens is that there can be a case 00158 * where the string being match is something like 00159 * 'where is eureca'. In this case, the re_match_token 00160 * matches that first e, but the re_cmp_step below, 00161 * fails because the next character (r) doesn't match 00162 * the u. So we call re_cmp with the string 00163 * after the first r, so that it should hopefully match 00164 * up properly. 00165 */ 00166 if (re_cmp_step(str+1, next_regexp, 1, 0)) 00167 return str; 00168 else if (*(str+1) != 0) 00169 return re_cmp(str+1, regexp); 00170 } 00171 return NULL; 00172 } 00173 00174 /* A u x i l l i a r y f u n c t i o n s 00175 */ 00176 00191 static Boolean re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) { 00192 const char *next_regexp; 00193 Boolean matched; 00194 00195 #ifdef DEBUG 00196 /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/ 00197 #endif 00198 00199 if (*regexp == 0) { 00200 /* When we reach the end of the regexp, the match is a success */ 00201 return True; 00202 } 00203 00204 /* This chunk of code makes sure that the regexp-tokenising happens 00205 * only once. We only tokenise as much as we need. 00206 */ 00207 if (slot > re_token_depth) { 00208 re_token_depth = slot; 00209 if (re_token[slot] == NULL) 00210 re_token[slot] = (selection *)malloc(sizeof(selection)); 00211 next_regexp = re_get_token(re_token[slot], regexp); 00212 if (next_regexp == NULL) { 00213 /* Syntax error, what else can we do? */ 00214 return False; 00215 } 00216 re_substr[slot] = next_regexp; 00217 } else { 00218 next_regexp = re_substr[slot]; 00219 } 00220 00221 matched = re_match_token(*str, re_token[slot]); 00222 if (matched) 00223 ++matches; 00224 00225 if (*str == 0) 00226 return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched; 00227 00228 switch (re_token[slot]->repeat) { 00229 case rep_once: 00230 if (matches == 1) { /* (matches == 1) => (matched == True) */ 00231 return re_cmp_step(str+1, next_regexp, slot+1, 0); 00232 } 00233 return False; 00234 00235 case rep_once_or_more: 00236 if (matched) { /* (matched == True) => (matches >= 1) */ 00237 /* First check if the current token repeats more */ 00238 if (re_cmp_step(str+1, regexp, slot, matches)) 00239 return True; 00240 return re_cmp_step(str+1, next_regexp, slot+1, 0); 00241 } 00242 return False; 00243 00244 case rep_null_or_once: 00245 /* We must go on to the next token, but should we advance str? */ 00246 if (matches == 0) { 00247 return re_cmp_step(str, next_regexp, slot+1, 0); 00248 } else if (matches == 1) { 00249 return re_cmp_step(str+1, next_regexp, slot+1, 0); 00250 } 00251 return False; /* Not reached */ 00252 00253 case rep_null_or_more: 00254 if (matched) { 00255 /* Look for further repeats, advance str */ 00256 if (re_cmp_step(str+1, regexp, slot, matches)) 00257 return True; 00258 return re_cmp_step(str, next_regexp, slot+1, 0); 00259 } 00260 return re_cmp_step(str, next_regexp, slot+1, 0); 00261 } 00262 00263 return False; 00264 } 00265 00272 static void re_init(void) { 00273 int i; 00274 00275 re_token[0] = (selection *)malloc(sizeof(selection)); 00276 if (re_token[0] == NULL) 00277 fatal(OUT_OF_MEMORY); 00278 for (i = 1; i < RE_TOKEN_MAX; i++) 00279 re_token[i] = NULL; 00280 00281 re_init_done = True; 00282 } 00283 00294 static Boolean re_match_token(uchar c, selection *sel) { 00295 switch (sel->type) { 00296 case sel_any: 00297 return True; 00298 00299 case sel_end: 00300 return (c == 0); 00301 00302 case sel_single: 00303 return (tolower(c) == tolower(sel->u.single)); 00304 00305 case sel_range: 00306 return (c >= sel->u.range.low && c <= sel->u.range.high); 00307 00308 case sel_array: 00309 return (sel->u.array[c]); 00310 00311 case sel_not_single: 00312 return (tolower(c) != tolower(sel->u.single)); 00313 00314 case sel_not_range: 00315 return (c < sel->u.range.low && c > sel->u.range.high); 00316 } 00317 00318 return False; 00319 } 00320 00332 static const char *re_get_token(selection *sel, const char *regexp) { 00333 #ifdef SAFE_CHECKS 00334 # define exit_if_null if (*regexp == 0) return NULL 00335 #else 00336 # define exit_if_null 00337 #endif 00338 Boolean quoted = False; 00339 uchar looking_at; 00340 00341 #ifdef SAFE_CHECKS 00342 if (sel == NULL || regexp == NULL || *regexp == 0) 00343 return NULL; 00344 #endif 00345 00346 do { 00347 looking_at = *regexp++; 00348 switch (looking_at) { 00349 case '$': 00350 if (quoted) { 00351 quoted = False; 00352 sel->type = sel_single; 00353 sel->u.single = looking_at; 00354 } else { 00355 sel->type = sel_end; 00356 } 00357 break; 00358 00359 case '.': 00360 if (quoted) { 00361 quoted = False; 00362 sel->type = sel_single; 00363 sel->u.single = looking_at; 00364 } else { 00365 sel->type = sel_any; 00366 } 00367 break; 00368 00369 case '[': 00370 /* The fun stuff... perhaps a little obfuscated since I 00371 * don't trust the compiler to analyse liveness. 00372 */ 00373 if (quoted) { 00374 quoted = False; 00375 sel->type = sel_single; 00376 sel->u.single = looking_at; 00377 } else { 00378 Boolean neg = False; 00379 uchar first, last = 0; 00380 00381 exit_if_null; 00382 looking_at = *regexp++; 00383 00384 if (looking_at == '^') { 00385 neg = True; 00386 exit_if_null; 00387 looking_at = *regexp++; 00388 } 00389 first = looking_at; 00390 exit_if_null; 00391 looking_at = *regexp++; 00392 if (looking_at == ']') { 00393 /* On the form [q] or [^q] */ 00394 sel->type = neg ? sel_not_single : sel_single; 00395 sel->u.single = first; 00396 break; 00397 } else if (looking_at == '-') { 00398 exit_if_null; 00399 last = *regexp++; 00400 if (last == ']') { 00401 /* On the form [A-] or [^A-]. Checking for 00402 * [,-] and making it a range is probably not 00403 * worth it :-) 00404 */ 00405 sel->type = sel_array; 00406 memset(sel->u.array, neg, sizeof(sel->u.array)); 00407 sel->u.array[first] = sel->u.array['-'] = !neg; 00408 break; 00409 } else { 00410 exit_if_null; 00411 looking_at = *regexp++; 00412 if (looking_at == ']') { 00413 /* On the form [A-G] or [^A-G]. Note that [G-A] 00414 * is a syntax error. Fair enough, I think. 00415 */ 00416 #ifdef SAFE_CHECKS 00417 if (first > last) 00418 return NULL; 00419 #endif 00420 sel->type = neg ? sel_not_range : sel_range; 00421 sel->u.range.low = first; 00422 sel->u.range.high = last; 00423 break; 00424 } 00425 } 00426 } 00427 { 00428 /* The datastructure can only represent a RE this 00429 * complex with an array. 00430 */ 00431 int i; 00432 uchar previous; 00433 00434 sel->type = sel_array; 00435 memset(sel->u.array, neg, sizeof(sel->u.array)); 00436 if (last) { 00437 /* It starts with a range */ 00438 #ifdef SAFE_CHECKS 00439 if (first > last) 00440 return NULL; 00441 #endif 00442 for (i = first; i <= last; i++) { 00443 sel->u.array[i] = !neg; 00444 } 00445 } else { 00446 /* It begins with a "random" character */ 00447 sel->u.array[first] = !neg; 00448 } 00449 sel->u.array[looking_at] = !neg; 00450 00451 exit_if_null; 00452 previous = looking_at; 00453 looking_at = *regexp++; 00454 00455 /* Add more characters to the array until we reach 00456 * ]. Quoting doesn't and shouldn't work in here. 00457 * ("]" should be put first, and "-" last if they 00458 * are needed inside this construct.) 00459 * Look for ranges as we go along. 00460 */ 00461 while (looking_at != ']') { 00462 if (looking_at == '-') { 00463 exit_if_null; 00464 looking_at = *regexp++; 00465 if (looking_at != ']') { 00466 #ifdef SAFE_CHECKS 00467 if (previous > looking_at) 00468 return NULL; 00469 #endif 00470 for (i = previous+1; i < looking_at; i++) { 00471 /* previous has already been set and 00472 * looking_at is set below. 00473 */ 00474 sel->u.array[i] = !neg; 00475 } 00476 exit_if_null; 00477 } else { 00478 sel->u.array['-'] = !neg; 00479 break; 00480 } 00481 } 00482 sel->u.array[looking_at] = !neg; 00483 previous = looking_at; 00484 exit_if_null; 00485 looking_at = *regexp++; 00486 } 00487 } 00488 } 00489 break; 00490 00491 case '\\': 00492 if (quoted) { 00493 quoted = False; 00494 sel->type = sel_single; 00495 sel->u.single = looking_at; 00496 } else { 00497 quoted = True; 00498 } 00499 break; 00500 00501 default: 00502 quoted = False; 00503 sel->type = sel_single; 00504 sel->u.single = looking_at; 00505 break; 00506 } 00507 } while (quoted); 00508 00509 if (*regexp == '*') { 00510 sel->repeat = rep_null_or_more; 00511 ++regexp; 00512 } else if (*regexp == '?') { 00513 sel->repeat = rep_null_or_once; 00514 ++regexp; 00515 } else if (*regexp == '+') { 00516 sel->repeat = rep_once_or_more; 00517 ++regexp; 00518 } else { 00519 sel->repeat = rep_once; 00520 } 00521 00522 return regexp; 00523 } 00524 00525 /* D e b u g c o d e 00526 */ 00527 #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */ 00528 00534 static void re_dump_sel(selection *sel) { 00535 switch (sel->type) { 00536 case sel_any: 00537 printf("."); 00538 break; 00539 00540 case sel_end: 00541 printf("$"); 00542 break; 00543 00544 case sel_single: 00545 printf("<%c>", sel->u.single); 00546 break; 00547 00548 case sel_range: 00549 printf("[%c-%c]", sel->u.range.low, sel->u.range.high); 00550 break; 00551 00552 case sel_array: { 00553 int i; 00554 printf("["); 00555 for (i = 0; i < UCHAR_MAX; i++) { 00556 if (sel->u.array[i]) { 00557 printf("%c", i); 00558 } 00559 } 00560 printf("]"); 00561 } 00562 break; 00563 00564 case sel_not_single: 00565 printf("[^%c]", sel->u.single); 00566 break; 00567 00568 case sel_not_range: 00569 printf("[^%c-%c]", sel->u.range.low, sel->u.range.high); 00570 break; 00571 00572 default: 00573 printf("<UNKNOWN TOKEN!>"); 00574 break; 00575 } 00576 00577 switch (sel->repeat) { 00578 case rep_once: 00579 break; 00580 00581 case rep_null_or_once: 00582 printf("?"); 00583 break; 00584 00585 case rep_null_or_more: 00586 printf("*"); 00587 break; 00588 00589 case rep_once_or_more: 00590 printf("+"); 00591 break; 00592 00593 default: 00594 printf("<UNKNOWN REP-TOKEN!>"); 00595 break; 00596 } 00597 } 00598 00599 int main(int argc, char *argv[]) { 00600 char *re, *m; 00601 selection sel; 00602 00603 re = re_get_token(&sel, argv[1]); 00604 00605 printf("'%s' -> '%s'\n", argv[1], re); 00606 re_dump_sel(&sel); 00607 printf("\n"); 00608 m = re_cmp(argv[2], argv[1]); 00609 if (m) 00610 printf("MATCH! -> '%s'\n", m); 00611 return 0; 00612 } 00613 #endif