Crossfire Server, Branch 1.12  R12190
re-cmp.c
Go to the documentation of this file.
00001 /*
00002  * static char *rcsid_player_c =
00003  *   "$Id: re-cmp.c 11578 2009-02-23 22:02:27Z lalo $";
00004  */
00005 
00022 #include <stdio.h>
00023 #include <stdlib.h>
00024 #include <memory.h>
00025 #include <limits.h>
00026 #include <re-cmp.h>
00027 #include <ctype.h>
00028 #include <global.h>
00029 #include <define.h> /* Needed for OUT_OF_MEMORY. */
00030 
00031 /* Get prototype functions to prevent warnings. */
00032 #if defined(__sun__) && defined(StupidSunHeaders)
00033 #  include <sys/types.h>
00034 #  include <sys/time.h>
00035 #  include "sunos.h"   /* Prototypes for standard libraries, sunos lack those */
00036 #endif
00037 
00038 /*   P r o t o t y p e s
00039  */
00040 const char *re_cmp(const char *, const char *);
00041 static Boolean re_cmp_step(const char *, const char *, unsigned, int);
00042 static void re_init(void);
00043 static Boolean re_match_token(uchar, selection *);
00044 static const char *re_get_token(selection *, const char *);
00045 #ifdef DEBUG2
00046 static void re_dump_sel(selection *);
00047 #endif
00048 
00049 /*   G l o b a l   v a r i a b l e s
00050  */
00051 static Boolean      re_init_done = False;
00052 static selection    *re_token[RE_TOKEN_MAX];
00053 static const char   *re_substr[RE_TOKEN_MAX];
00054 static unsigned int re_token_depth;
00055 
00056 /*   E x t e r n a l   f u n c t i o n
00057  */
00058 
00070 const char *re_cmp(const char *str, const char *regexp) {
00071     const char *next_regexp;
00072     Boolean once = False;
00073     Boolean matched;
00074 
00075     if (re_init_done == False)
00076         re_init();
00077 
00078 #ifdef SAFE_CHECKS
00079     if (regexp == NULL || str == NULL)
00080         return NULL;
00081 #endif
00082     if (*regexp == '^') {
00083         once = True;
00084         ++regexp;
00085     }
00086     if (*regexp == 0) {
00087         /* // or /^/ matches any string */
00088         return str;
00089     }
00090 
00091     next_regexp = re_get_token(re_token[0], regexp);
00092     re_token_depth = 0;
00093     re_substr[0] = next_regexp;
00094 
00095     matched = False;
00096     while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
00097         str++;
00098 
00099     if (matched && *next_regexp == 0)
00100         return str;
00101 
00102     /* Apologies for the nearly duplicated code below, hopefully it
00103      * speeds things up.
00104      */
00105     if (once) {
00106         switch (re_token[0]->repeat) {
00107         case rep_once:
00108             if (matched == False)
00109                 return NULL;
00110             break;
00111 
00112         case rep_once_or_more:
00113             if (matched == False)
00114                 return NULL;
00115 
00116             if (re_cmp_step(str+1, regexp, 0, 1))
00117                 return str;
00118             break;
00119 
00120         case rep_null_or_once:
00121             if (matched == False)
00122                 return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
00123             break;
00124 
00125         case rep_null_or_more:
00126             if (matched) {
00127                 if (re_cmp_step(str+1, regexp, 0, 1))
00128                     return str;
00129             } else {
00130                 return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
00131             }
00132             break;
00133         }
00134 
00135         return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
00136     }
00137 
00138     if (matched) {
00139         switch (re_token[0]->repeat) {
00140         case rep_once:
00141         case rep_null_or_once:
00142             break;
00143 
00144         case rep_once_or_more:
00145         case rep_null_or_more:
00146             if (re_cmp_step(str+1, regexp, 0, 1))
00147                 return str;
00148             break;
00149         }
00150 
00151         /* The logic here is that re_match_token only sees
00152          * if the one letter matches.  Thus, if the
00153          * regex is like '@match eureca', and the
00154          * the user enters anything with an e, re_match_token
00155          * returns true, but they really need to match the
00156          * entire regexp, which re_cmp_step will do.
00157          * However, what happens is that there can be a case
00158          * where the string being match is something like
00159          * 'where is eureca'.  In this case, the re_match_token
00160          * matches that first e, but the re_cmp_step below,
00161          * fails because the next character (r) doesn't match
00162          * the u.  So we call re_cmp with the string
00163          * after the first r, so that it should hopefully match
00164          * up properly.
00165          */
00166         if (re_cmp_step(str+1, next_regexp, 1, 0))
00167             return str;
00168         else if (*(str+1) != 0)
00169             return re_cmp(str+1, regexp);
00170     }
00171     return NULL;
00172 }
00173 
00174 /*   A u x i l l i a r y   f u n c t i o n s
00175  */
00176 
00191 static Boolean re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) {
00192     const char *next_regexp;
00193     Boolean matched;
00194 
00195 #ifdef DEBUG
00196 /*    fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
00197 #endif
00198 
00199     if (*regexp == 0) {
00200         /* When we reach the end of the regexp, the match is a success */
00201         return True;
00202     }
00203 
00204     /* This chunk of code makes sure that the regexp-tokenising happens
00205      * only once. We only tokenise as much as we need.
00206      */
00207     if (slot > re_token_depth) {
00208         re_token_depth = slot;
00209         if (re_token[slot] == NULL)
00210             re_token[slot] = (selection *)malloc(sizeof(selection));
00211         next_regexp = re_get_token(re_token[slot], regexp);
00212         if (next_regexp == NULL) {
00213             /* Syntax error, what else can we do? */
00214             return False;
00215         }
00216         re_substr[slot] = next_regexp;
00217     } else {
00218         next_regexp = re_substr[slot];
00219     }
00220 
00221     matched = re_match_token(*str, re_token[slot]);
00222     if (matched)
00223         ++matches;
00224 
00225     if (*str == 0)
00226         return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
00227 
00228     switch (re_token[slot]->repeat) {
00229     case rep_once:
00230         if (matches == 1) { /* (matches == 1) => (matched == True) */
00231             return re_cmp_step(str+1, next_regexp, slot+1, 0);
00232         }
00233         return False;
00234 
00235     case rep_once_or_more:
00236         if (matched) { /* (matched == True) => (matches >= 1) */
00237             /* First check if the current token repeats more */
00238             if (re_cmp_step(str+1, regexp, slot, matches))
00239                 return True;
00240             return re_cmp_step(str+1, next_regexp, slot+1, 0);
00241         }
00242         return False;
00243 
00244     case rep_null_or_once:
00245         /* We must go on to the next token, but should we advance str? */
00246         if (matches == 0) {
00247             return re_cmp_step(str, next_regexp, slot+1, 0);
00248             } else if (matches == 1) {
00249             return re_cmp_step(str+1, next_regexp, slot+1, 0);
00250         }
00251         return False; /* Not reached */
00252 
00253     case rep_null_or_more:
00254         if (matched) {
00255             /* Look for further repeats, advance str */
00256             if (re_cmp_step(str+1, regexp, slot, matches))
00257                 return True;
00258             return re_cmp_step(str, next_regexp, slot+1, 0);
00259         }
00260         return re_cmp_step(str, next_regexp, slot+1, 0);
00261     }
00262 
00263     return False;
00264 }
00265 
00272 static void re_init(void) {
00273     int i;
00274 
00275     re_token[0] = (selection *)malloc(sizeof(selection));
00276     if (re_token[0] == NULL)
00277         fatal(OUT_OF_MEMORY);
00278     for (i = 1; i < RE_TOKEN_MAX; i++)
00279         re_token[i] = NULL;
00280 
00281     re_init_done = True;
00282 }
00283 
00294 static Boolean re_match_token(uchar c, selection *sel) {
00295     switch (sel->type) {
00296     case sel_any:
00297         return True;
00298 
00299     case sel_end:
00300         return (c == 0);
00301 
00302     case sel_single:
00303         return (tolower(c) == tolower(sel->u.single));
00304 
00305     case sel_range:
00306         return (c >= sel->u.range.low && c <= sel->u.range.high);
00307 
00308     case sel_array:
00309         return (sel->u.array[c]);
00310 
00311     case sel_not_single:
00312         return (tolower(c) != tolower(sel->u.single));
00313 
00314     case sel_not_range:
00315         return (c < sel->u.range.low && c > sel->u.range.high);
00316     }
00317 
00318     return False;
00319 }
00320 
00332 static const char *re_get_token(selection *sel, const char *regexp) {
00333 #ifdef SAFE_CHECKS
00334 #   define exit_if_null if (*regexp == 0) return NULL
00335 #else
00336 #   define exit_if_null
00337 #endif
00338     Boolean quoted = False;
00339     uchar looking_at;
00340 
00341 #ifdef SAFE_CHECKS
00342     if (sel == NULL || regexp == NULL || *regexp == 0)
00343         return NULL;
00344 #endif
00345 
00346     do {
00347         looking_at = *regexp++;
00348         switch (looking_at) {
00349         case '$':
00350             if (quoted) {
00351                 quoted = False;
00352                 sel->type = sel_single;
00353                 sel->u.single = looking_at;
00354             } else {
00355                 sel->type = sel_end;
00356             }
00357             break;
00358 
00359         case '.':
00360             if (quoted) {
00361                 quoted = False;
00362                 sel->type = sel_single;
00363                 sel->u.single = looking_at;
00364             } else {
00365                 sel->type = sel_any;
00366             }
00367             break;
00368 
00369         case '[':
00370             /* The fun stuff... perhaps a little obfuscated since I
00371              * don't trust the compiler to analyse liveness.
00372              */
00373             if (quoted) {
00374                 quoted = False;
00375                 sel->type = sel_single;
00376                 sel->u.single = looking_at;
00377             } else {
00378                 Boolean neg = False;
00379                 uchar first, last = 0;
00380 
00381                 exit_if_null;
00382                 looking_at = *regexp++;
00383 
00384                 if (looking_at == '^') {
00385                     neg = True;
00386                     exit_if_null;
00387                     looking_at = *regexp++;
00388                 }
00389                 first = looking_at;
00390                 exit_if_null;
00391                 looking_at = *regexp++;
00392                 if (looking_at == ']') {
00393                     /* On the form [q] or [^q] */
00394                     sel->type = neg ? sel_not_single : sel_single;
00395                     sel->u.single = first;
00396                     break;
00397                 } else if (looking_at == '-') {
00398                     exit_if_null;
00399                     last = *regexp++;
00400                     if (last == ']') {
00401                         /* On the form [A-] or [^A-]. Checking for
00402                          * [,-] and making it a range is probably not
00403                          * worth it :-)
00404                          */
00405                         sel->type = sel_array;
00406                         memset(sel->u.array, neg, sizeof(sel->u.array));
00407                         sel->u.array[first] = sel->u.array['-'] = !neg;
00408                         break;
00409                     } else {
00410                         exit_if_null;
00411                         looking_at = *regexp++;
00412                         if (looking_at == ']') {
00413                             /* On the form [A-G] or [^A-G]. Note that [G-A]
00414                              * is a syntax error. Fair enough, I think.
00415                              */
00416 #ifdef SAFE_CHECKS
00417                             if (first > last)
00418                                 return NULL;
00419 #endif
00420                             sel->type = neg ? sel_not_range : sel_range;
00421                             sel->u.range.low = first;
00422                             sel->u.range.high = last;
00423                             break;
00424                         }
00425                     }
00426                 }
00427                 {
00428                     /* The datastructure can only represent a RE this
00429                      * complex with an array.
00430                      */
00431                     int i;
00432                     uchar previous;
00433 
00434                     sel->type = sel_array;
00435                     memset(sel->u.array, neg, sizeof(sel->u.array));
00436                     if (last) {
00437                         /* It starts with a range */
00438 #ifdef SAFE_CHECKS
00439                         if (first > last)
00440                             return NULL;
00441 #endif
00442                         for (i = first; i <= last; i++) {
00443                             sel->u.array[i] = !neg;
00444                         }
00445                     } else {
00446                         /* It begins with a "random" character */
00447                         sel->u.array[first] = !neg;
00448                     }
00449                     sel->u.array[looking_at] = !neg;
00450 
00451                     exit_if_null;
00452                     previous = looking_at;
00453                     looking_at = *regexp++;
00454 
00455                     /* Add more characters to the array until we reach
00456                      * ]. Quoting doesn't and shouldn't work in here.
00457                      * ("]" should be put first, and "-" last if they
00458                      * are needed inside this construct.)
00459                      * Look for ranges as we go along.
00460                      */
00461                     while (looking_at != ']') {
00462                         if (looking_at == '-') {
00463                             exit_if_null;
00464                             looking_at = *regexp++;
00465                             if (looking_at != ']') {
00466 #ifdef SAFE_CHECKS
00467                                 if (previous > looking_at)
00468                                     return NULL;
00469 #endif
00470                                 for (i = previous+1; i < looking_at; i++) {
00471                                     /* previous has already been set and
00472                                      * looking_at is set below.
00473                                      */
00474                                     sel->u.array[i] = !neg;
00475                                 }
00476                                 exit_if_null;
00477                             } else {
00478                                 sel->u.array['-'] = !neg;
00479                                 break;
00480                             }
00481                         }
00482                         sel->u.array[looking_at] = !neg;
00483                         previous = looking_at;
00484                         exit_if_null;
00485                         looking_at = *regexp++;
00486                     }
00487                 }
00488             }
00489             break;
00490 
00491         case '\\':
00492             if (quoted) {
00493                 quoted = False;
00494                 sel->type = sel_single;
00495                 sel->u.single = looking_at;
00496             } else {
00497                 quoted = True;
00498             }
00499             break;
00500 
00501         default:
00502             quoted = False;
00503             sel->type = sel_single;
00504             sel->u.single = looking_at;
00505             break;
00506         }
00507     } while (quoted);
00508 
00509     if (*regexp == '*') {
00510         sel->repeat = rep_null_or_more;
00511         ++regexp;
00512     } else if (*regexp == '?') {
00513         sel->repeat = rep_null_or_once;
00514         ++regexp;
00515     } else if (*regexp == '+') {
00516         sel->repeat = rep_once_or_more;
00517         ++regexp;
00518     } else {
00519         sel->repeat = rep_once;
00520     }
00521 
00522     return regexp;
00523 }
00524 
00525 /*   D e b u g   c o d e
00526  */
00527 #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
00528 
00534 static void re_dump_sel(selection *sel) {
00535     switch (sel->type) {
00536     case sel_any:
00537         printf(".");
00538         break;
00539 
00540     case sel_end:
00541         printf("$");
00542         break;
00543 
00544     case sel_single:
00545         printf("<%c>", sel->u.single);
00546         break;
00547 
00548     case sel_range:
00549         printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
00550         break;
00551 
00552     case sel_array: {
00553             int i;
00554             printf("[");
00555             for (i = 0; i < UCHAR_MAX; i++) {
00556                 if (sel->u.array[i]) {
00557                     printf("%c", i);
00558                 }
00559             }
00560             printf("]");
00561         }
00562         break;
00563 
00564     case sel_not_single:
00565         printf("[^%c]", sel->u.single);
00566         break;
00567 
00568     case sel_not_range:
00569         printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
00570         break;
00571 
00572     default:
00573         printf("<UNKNOWN TOKEN!>");
00574         break;
00575     }
00576 
00577     switch (sel->repeat) {
00578     case rep_once:
00579         break;
00580 
00581     case rep_null_or_once:
00582         printf("?");
00583         break;
00584 
00585     case rep_null_or_more:
00586         printf("*");
00587         break;
00588 
00589     case rep_once_or_more:
00590         printf("+");
00591         break;
00592 
00593     default:
00594         printf("<UNKNOWN REP-TOKEN!>");
00595         break;
00596     }
00597 }
00598 
00599 int main(int argc, char *argv[]) {
00600     char *re, *m;
00601     selection sel;
00602 
00603     re = re_get_token(&sel, argv[1]);
00604 
00605     printf("'%s' -> '%s'\n", argv[1], re);
00606     re_dump_sel(&sel);
00607     printf("\n");
00608     m = re_cmp(argv[2], argv[1]);
00609     if (m)
00610         printf("MATCH! -> '%s'\n", m);
00611     return 0;
00612 }
00613 #endif