Crossfire Server, Branches 1.12  R18729
re-cmp.c
Go to the documentation of this file.
1 /*
2  * static char *rcsid_player_c =
3  * "$Id: re-cmp.c 11578 2009-02-23 22:02:27Z lalo $";
4  */
5 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <memory.h>
25 #include <limits.h>
26 #include <re-cmp.h>
27 #include <ctype.h>
28 #include <global.h>
29 #include <define.h> /* Needed for OUT_OF_MEMORY. */
30 
31 /* Get prototype functions to prevent warnings. */
32 #if defined(__sun__) && defined(StupidSunHeaders)
33 # include <sys/types.h>
34 # include <sys/time.h>
35 # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
36 #endif
37 
38 /* P r o t o t y p e s
39  */
40 const char *re_cmp(const char *, const char *);
41 static Boolean re_cmp_step(const char *, const char *, unsigned, int);
42 static void re_init(void);
44 static const char *re_get_token(selection *, const char *);
45 #ifdef DEBUG2
46 static void re_dump_sel(selection *);
47 #endif
48 
49 /* G l o b a l v a r i a b l e s
50  */
53 static const char *re_substr[RE_TOKEN_MAX];
54 static unsigned int re_token_depth;
55 
56 /* E x t e r n a l f u n c t i o n
57  */
58 
70 const char *re_cmp(const char *str, const char *regexp) {
71  const char *next_regexp;
72  Boolean once = False;
73  Boolean matched;
74 
75  if (re_init_done == False)
76  re_init();
77 
78 #ifdef SAFE_CHECKS
79  if (regexp == NULL || str == NULL)
80  return NULL;
81 #endif
82  if (*regexp == '^') {
83  once = True;
84  ++regexp;
85  }
86  if (*regexp == 0) {
87  /* // or /^/ matches any string */
88  return str;
89  }
90 
91  next_regexp = re_get_token(re_token[0], regexp);
92  re_token_depth = 0;
93  re_substr[0] = next_regexp;
94 
95  matched = False;
96  while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
97  str++;
98 
99  if (matched && *next_regexp == 0)
100  return str;
101 
102  /* Apologies for the nearly duplicated code below, hopefully it
103  * speeds things up.
104  */
105  if (once) {
106  switch (re_token[0]->repeat) {
107  case rep_once:
108  if (matched == False)
109  return NULL;
110  break;
111 
112  case rep_once_or_more:
113  if (matched == False)
114  return NULL;
115 
116  if (re_cmp_step(str+1, regexp, 0, 1))
117  return str;
118  break;
119 
120  case rep_null_or_once:
121  if (matched == False)
122  return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
123  break;
124 
125  case rep_null_or_more:
126  if (matched) {
127  if (re_cmp_step(str+1, regexp, 0, 1))
128  return str;
129  } else {
130  return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
131  }
132  break;
133  }
134 
135  return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
136  }
137 
138  if (matched) {
139  switch (re_token[0]->repeat) {
140  case rep_once:
141  case rep_null_or_once:
142  break;
143 
144  case rep_once_or_more:
145  case rep_null_or_more:
146  if (re_cmp_step(str+1, regexp, 0, 1))
147  return str;
148  break;
149  }
150 
151  /* The logic here is that re_match_token only sees
152  * if the one letter matches. Thus, if the
153  * regex is like '@match eureca', and the
154  * the user enters anything with an e, re_match_token
155  * returns true, but they really need to match the
156  * entire regexp, which re_cmp_step will do.
157  * However, what happens is that there can be a case
158  * where the string being match is something like
159  * 'where is eureca'. In this case, the re_match_token
160  * matches that first e, but the re_cmp_step below,
161  * fails because the next character (r) doesn't match
162  * the u. So we call re_cmp with the string
163  * after the first r, so that it should hopefully match
164  * up properly.
165  */
166  if (re_cmp_step(str+1, next_regexp, 1, 0))
167  return str;
168  else if (*(str+1) != 0)
169  return re_cmp(str+1, regexp);
170  }
171  return NULL;
172 }
173 
174 /* A u x i l l i a r y f u n c t i o n s
175  */
176 
191 static Boolean re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) {
192  const char *next_regexp;
193  Boolean matched;
194 
195 #ifdef DEBUG
196 /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
197 #endif
198 
199  if (*regexp == 0) {
200  /* When we reach the end of the regexp, the match is a success */
201  return True;
202  }
203 
204  /* This chunk of code makes sure that the regexp-tokenising happens
205  * only once. We only tokenise as much as we need.
206  */
207  if (slot > re_token_depth) {
208  re_token_depth = slot;
209  if (re_token[slot] == NULL)
210  re_token[slot] = (selection *)malloc(sizeof(selection));
211  next_regexp = re_get_token(re_token[slot], regexp);
212  if (next_regexp == NULL) {
213  /* Syntax error, what else can we do? */
214  return False;
215  }
216  re_substr[slot] = next_regexp;
217  } else {
218  next_regexp = re_substr[slot];
219  }
220 
221  matched = re_match_token(*str, re_token[slot]);
222  if (matched)
223  ++matches;
224 
225  if (*str == 0)
226  return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
227 
228  switch (re_token[slot]->repeat) {
229  case rep_once:
230  if (matches == 1) { /* (matches == 1) => (matched == True) */
231  return re_cmp_step(str+1, next_regexp, slot+1, 0);
232  }
233  return False;
234 
235  case rep_once_or_more:
236  if (matched) { /* (matched == True) => (matches >= 1) */
237  /* First check if the current token repeats more */
238  if (re_cmp_step(str+1, regexp, slot, matches))
239  return True;
240  return re_cmp_step(str+1, next_regexp, slot+1, 0);
241  }
242  return False;
243 
244  case rep_null_or_once:
245  /* We must go on to the next token, but should we advance str? */
246  if (matches == 0) {
247  return re_cmp_step(str, next_regexp, slot+1, 0);
248  } else if (matches == 1) {
249  return re_cmp_step(str+1, next_regexp, slot+1, 0);
250  }
251  return False; /* Not reached */
252 
253  case rep_null_or_more:
254  if (matched) {
255  /* Look for further repeats, advance str */
256  if (re_cmp_step(str+1, regexp, slot, matches))
257  return True;
258  return re_cmp_step(str, next_regexp, slot+1, 0);
259  }
260  return re_cmp_step(str, next_regexp, slot+1, 0);
261  }
262 
263  return False;
264 }
265 
272 static void re_init(void) {
273  int i;
274 
275  re_token[0] = (selection *)malloc(sizeof(selection));
276  if (re_token[0] == NULL)
278  for (i = 1; i < RE_TOKEN_MAX; i++)
279  re_token[i] = NULL;
280 
281  re_init_done = True;
282 }
283 
295  switch (sel->type) {
296  case sel_any:
297  return True;
298 
299  case sel_end:
300  return (c == 0);
301 
302  case sel_single:
303  return (tolower(c) == tolower(sel->u.single));
304 
305  case sel_range:
306  return (c >= sel->u.range.low && c <= sel->u.range.high);
307 
308  case sel_array:
309  return (sel->u.array[c]);
310 
311  case sel_not_single:
312  return (tolower(c) != tolower(sel->u.single));
313 
314  case sel_not_range:
315  return (c < sel->u.range.low && c > sel->u.range.high);
316  }
317 
318  return False;
319 }
320 
332 static const char *re_get_token(selection *sel, const char *regexp) {
333 #ifdef SAFE_CHECKS
334 # define exit_if_null if (*regexp == 0) return NULL
335 #else
336 # define exit_if_null
337 #endif
338  Boolean quoted = False;
339  uchar looking_at;
340 
341 #ifdef SAFE_CHECKS
342  if (sel == NULL || regexp == NULL || *regexp == 0)
343  return NULL;
344 #endif
345 
346  do {
347  looking_at = *regexp++;
348  switch (looking_at) {
349  case '$':
350  if (quoted) {
351  quoted = False;
352  sel->type = sel_single;
353  sel->u.single = looking_at;
354  } else {
355  sel->type = sel_end;
356  }
357  break;
358 
359  case '.':
360  if (quoted) {
361  quoted = False;
362  sel->type = sel_single;
363  sel->u.single = looking_at;
364  } else {
365  sel->type = sel_any;
366  }
367  break;
368 
369  case '[':
370  /* The fun stuff... perhaps a little obfuscated since I
371  * don't trust the compiler to analyse liveness.
372  */
373  if (quoted) {
374  quoted = False;
375  sel->type = sel_single;
376  sel->u.single = looking_at;
377  } else {
378  Boolean neg = False;
379  uchar first, last = 0;
380 
381  exit_if_null;
382  looking_at = *regexp++;
383 
384  if (looking_at == '^') {
385  neg = True;
386  exit_if_null;
387  looking_at = *regexp++;
388  }
389  first = looking_at;
390  exit_if_null;
391  looking_at = *regexp++;
392  if (looking_at == ']') {
393  /* On the form [q] or [^q] */
394  sel->type = neg ? sel_not_single : sel_single;
395  sel->u.single = first;
396  break;
397  } else if (looking_at == '-') {
398  exit_if_null;
399  last = *regexp++;
400  if (last == ']') {
401  /* On the form [A-] or [^A-]. Checking for
402  * [,-] and making it a range is probably not
403  * worth it :-)
404  */
405  sel->type = sel_array;
406  memset(sel->u.array, neg, sizeof(sel->u.array));
407  sel->u.array[first] = sel->u.array['-'] = !neg;
408  break;
409  } else {
410  exit_if_null;
411  looking_at = *regexp++;
412  if (looking_at == ']') {
413  /* On the form [A-G] or [^A-G]. Note that [G-A]
414  * is a syntax error. Fair enough, I think.
415  */
416 #ifdef SAFE_CHECKS
417  if (first > last)
418  return NULL;
419 #endif
420  sel->type = neg ? sel_not_range : sel_range;
421  sel->u.range.low = first;
422  sel->u.range.high = last;
423  break;
424  }
425  }
426  }
427  {
428  /* The datastructure can only represent a RE this
429  * complex with an array.
430  */
431  int i;
432  uchar previous;
433 
434  sel->type = sel_array;
435  memset(sel->u.array, neg, sizeof(sel->u.array));
436  if (last) {
437  /* It starts with a range */
438 #ifdef SAFE_CHECKS
439  if (first > last)
440  return NULL;
441 #endif
442  for (i = first; i <= last; i++) {
443  sel->u.array[i] = !neg;
444  }
445  } else {
446  /* It begins with a "random" character */
447  sel->u.array[first] = !neg;
448  }
449  sel->u.array[looking_at] = !neg;
450 
451  exit_if_null;
452  previous = looking_at;
453  looking_at = *regexp++;
454 
455  /* Add more characters to the array until we reach
456  * ]. Quoting doesn't and shouldn't work in here.
457  * ("]" should be put first, and "-" last if they
458  * are needed inside this construct.)
459  * Look for ranges as we go along.
460  */
461  while (looking_at != ']') {
462  if (looking_at == '-') {
463  exit_if_null;
464  looking_at = *regexp++;
465  if (looking_at != ']') {
466 #ifdef SAFE_CHECKS
467  if (previous > looking_at)
468  return NULL;
469 #endif
470  for (i = previous+1; i < looking_at; i++) {
471  /* previous has already been set and
472  * looking_at is set below.
473  */
474  sel->u.array[i] = !neg;
475  }
476  exit_if_null;
477  } else {
478  sel->u.array['-'] = !neg;
479  break;
480  }
481  }
482  sel->u.array[looking_at] = !neg;
483  previous = looking_at;
484  exit_if_null;
485  looking_at = *regexp++;
486  }
487  }
488  }
489  break;
490 
491  case '\\':
492  if (quoted) {
493  quoted = False;
494  sel->type = sel_single;
495  sel->u.single = looking_at;
496  } else {
497  quoted = True;
498  }
499  break;
500 
501  default:
502  quoted = False;
503  sel->type = sel_single;
504  sel->u.single = looking_at;
505  break;
506  }
507  } while (quoted);
508 
509  if (*regexp == '*') {
510  sel->repeat = rep_null_or_more;
511  ++regexp;
512  } else if (*regexp == '?') {
513  sel->repeat = rep_null_or_once;
514  ++regexp;
515  } else if (*regexp == '+') {
516  sel->repeat = rep_once_or_more;
517  ++regexp;
518  } else {
519  sel->repeat = rep_once;
520  }
521 
522  return regexp;
523 }
524 
525 /* D e b u g c o d e
526  */
527 #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
528 
534 static void re_dump_sel(selection *sel) {
535  switch (sel->type) {
536  case sel_any:
537  printf(".");
538  break;
539 
540  case sel_end:
541  printf("$");
542  break;
543 
544  case sel_single:
545  printf("<%c>", sel->u.single);
546  break;
547 
548  case sel_range:
549  printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
550  break;
551 
552  case sel_array: {
553  int i;
554  printf("[");
555  for (i = 0; i < UCHAR_MAX; i++) {
556  if (sel->u.array[i]) {
557  printf("%c", i);
558  }
559  }
560  printf("]");
561  }
562  break;
563 
564  case sel_not_single:
565  printf("[^%c]", sel->u.single);
566  break;
567 
568  case sel_not_range:
569  printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
570  break;
571 
572  default:
573  printf("<UNKNOWN TOKEN!>");
574  break;
575  }
576 
577  switch (sel->repeat) {
578  case rep_once:
579  break;
580 
581  case rep_null_or_once:
582  printf("?");
583  break;
584 
585  case rep_null_or_more:
586  printf("*");
587  break;
588 
589  case rep_once_or_more:
590  printf("+");
591  break;
592 
593  default:
594  printf("<UNKNOWN REP-TOKEN!>");
595  break;
596  }
597 }
598 
599 int main(int argc, char *argv[]) {
600  char *re, *m;
601  selection sel;
602 
603  re = re_get_token(&sel, argv[1]);
604 
605  printf("'%s' -> '%s'\n", argv[1], re);
606  re_dump_sel(&sel);
607  printf("\n");
608  m = re_cmp(argv[2], argv[1]);
609  if (m)
610  printf("MATCH! -> '%s'\n", m);
611  return 0;
612 }
613 #endif
selection_type type
Definition: re-cmp.h:72
#define OUT_OF_MEMORY
Definition: define.h:94
#define RE_TOKEN_MAX
Definition: re-cmp.h:21
static Boolean re_init_done
Definition: re-cmp.c:51
#define False
Definition: re-cmp.h:52
union selection::@1 u
uchar single
Definition: re-cmp.h:74
static void re_init(void)
Definition: re-cmp.c:272
static const char * re_substr[RE_TOKEN_MAX]
Definition: re-cmp.c:53
static int matches(const char *exp, const char *text)
Definition: dialog.c:86
static Boolean re_match_token(uchar, selection *)
Definition: re-cmp.c:294
#define exit_if_null
static const char * re_get_token(selection *, const char *)
Definition: re-cmp.c:332
void fatal(int err)
Definition: glue.c:60
const char * re_cmp(const char *, const char *)
Definition: re-cmp.c:70
struct selection::@1::@2 range
repetetion_type repeat
Definition: re-cmp.h:80
#define True
Definition: re-cmp.h:51
Boolean array[UCHAR_MAX]
Definition: re-cmp.h:78
#define tolower(C)
Definition: c_new.c:42
#define Boolean
Definition: re-cmp.h:50
static Boolean re_cmp_step(const char *, const char *, unsigned, int)
Definition: re-cmp.c:191
Definition: re-cmp.h:56
static selection * re_token[RE_TOKEN_MAX]
Definition: re-cmp.c:52
static unsigned int re_token_depth
Definition: re-cmp.c:54
Definition: re-cmp.h:55
int main(int argc, char **argv)
Definition: devel.c:50
#define uchar
Definition: re-cmp.h:49