Crossfire Server, Trunk  R20513
re-cmp.c
Go to the documentation of this file.
1 /*
2  * Crossfire -- cooperative multi-player graphical RPG and adventure game
3  *
4  * Copyright (c) 1999-2014 Mark Wedel and the Crossfire Development Team
5  * Copyright (c) 1992 Frank Tore Johansen
6  *
7  * Crossfire is free software and comes with ABSOLUTELY NO WARRANTY. You are
8  * welcome to redistribute it under certain conditions. For details, please
9  * see COPYING and LICENSE.
10  *
11  * The authors can be reached via e-mail at <crossfire@metalforge.org>.
12  */
13 
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <limits.h>
34 #include <re-cmp.h>
35 #include <ctype.h>
36 #include <global.h>
37 #include <define.h> /* Needed for OUT_OF_MEMORY. */
38 
39 
40 /* P r o t o t y p e s
41  */
42 const char *re_cmp(const char *, const char *);
43 static Boolean re_cmp_step(const char *, const char *, unsigned, int);
44 static void re_init(void);
46 static const char *re_get_token(selection *, const char *);
47 
48 /* G l o b a l v a r i a b l e s
49  */
52 static const char *re_substr[RE_TOKEN_MAX];
53 static unsigned int re_token_depth;
54 
55 /* E x t e r n a l f u n c t i o n
56  */
57 
69 const char *re_cmp(const char *str, const char *regexp) {
70  const char *next_regexp;
71  Boolean once = False;
72  Boolean matched;
73 
74  if (re_init_done == False)
75  re_init();
76 
77 #ifdef SAFE_CHECKS
78  if (regexp == NULL || str == NULL)
79  return NULL;
80 #endif
81  if (*regexp == '^') {
82  once = True;
83  ++regexp;
84  }
85  if (*regexp == 0) {
86  /* // or /^/ matches any string */
87  return str;
88  }
89 
90  next_regexp = re_get_token(re_token[0], regexp);
91  re_token_depth = 0;
92  re_substr[0] = next_regexp;
93 
94  matched = False;
95  while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
96  str++;
97 
98  if (matched && *next_regexp == 0)
99  return str;
100 
101  /* Apologies for the nearly duplicated code below, hopefully it
102  * speeds things up.
103  */
104  if (once) {
105  switch (re_token[0]->repeat) {
106  case rep_once:
107  if (matched == False)
108  return NULL;
109  break;
110 
111  case rep_once_or_more:
112  if (matched == False)
113  return NULL;
114 
115  if (re_cmp_step(str+1, regexp, 0, 1))
116  return str;
117  break;
118 
119  case rep_null_or_once:
120  if (matched == False)
121  return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
122  break;
123 
124  case rep_null_or_more:
125  if (matched) {
126  if (re_cmp_step(str+1, regexp, 0, 1))
127  return str;
128  } else {
129  return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
130  }
131  break;
132  }
133 
134  return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
135  }
136 
137  if (matched) {
138  switch (re_token[0]->repeat) {
139  case rep_once:
140  case rep_null_or_once:
141  break;
142 
143  case rep_once_or_more:
144  case rep_null_or_more:
145  if (re_cmp_step(str+1, regexp, 0, 1))
146  return str;
147  break;
148  }
149 
150  /* The logic here is that re_match_token only sees
151  * if the one letter matches. Thus, if the
152  * regex is like '@match eureca', and the
153  * the user enters anything with an e, re_match_token
154  * returns true, but they really need to match the
155  * entire regexp, which re_cmp_step will do.
156  * However, what happens is that there can be a case
157  * where the string being match is something like
158  * 'where is eureca'. In this case, the re_match_token
159  * matches that first e, but the re_cmp_step below,
160  * fails because the next character (r) doesn't match
161  * the u. So we call re_cmp with the string
162  * after the first r, so that it should hopefully match
163  * up properly.
164  */
165  if (re_cmp_step(str+1, next_regexp, 1, 0))
166  return str;
167  else if (*(str+1) != 0)
168  return re_cmp(str+1, regexp);
169  }
170  return NULL;
171 }
172 
173 /* A u x i l l i a r y f u n c t i o n s
174  */
175 
190 static Boolean re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) {
191  const char *next_regexp;
192  Boolean matched;
193 
194 #ifdef DEBUG
195 /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
196 #endif
197 
198  if (*regexp == 0) {
199  /* When we reach the end of the regexp, the match is a success */
200  return True;
201  }
202 
203  /* This chunk of code makes sure that the regexp-tokenising happens
204  * only once. We only tokenise as much as we need.
205  */
206  if (slot > re_token_depth) {
207  re_token_depth = slot;
208  if (re_token[slot] == NULL)
209  re_token[slot] = (selection *)malloc(sizeof(selection));
210  next_regexp = re_get_token(re_token[slot], regexp);
211  if (next_regexp == NULL) {
212  /* Syntax error, what else can we do? */
213  return False;
214  }
215  re_substr[slot] = next_regexp;
216  } else {
217  next_regexp = re_substr[slot];
218  }
219 
220  matched = re_match_token(*str, re_token[slot]);
221  if (matched)
222  ++matches;
223 
224  if (*str == 0)
225  return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
226 
227  switch (re_token[slot]->repeat) {
228  case rep_once:
229  if (matches == 1) { /* (matches == 1) => (matched == True) */
230  return re_cmp_step(str+1, next_regexp, slot+1, 0);
231  }
232  return False;
233 
234  case rep_once_or_more:
235  if (matched) { /* (matched == True) => (matches >= 1) */
236  /* First check if the current token repeats more */
237  if (re_cmp_step(str+1, regexp, slot, matches))
238  return True;
239  return re_cmp_step(str+1, next_regexp, slot+1, 0);
240  }
241  return False;
242 
243  case rep_null_or_once:
244  /* We must go on to the next token, but should we advance str? */
245  if (matches == 0) {
246  return re_cmp_step(str, next_regexp, slot+1, 0);
247  } else if (matches == 1) {
248  return re_cmp_step(str+1, next_regexp, slot+1, 0);
249  }
250  return False; /* Not reached */
251 
252  case rep_null_or_more:
253  if (matched) {
254  /* Look for further repeats, advance str */
255  if (re_cmp_step(str+1, regexp, slot, matches))
256  return True;
257  return re_cmp_step(str, next_regexp, slot+1, 0);
258  }
259  return re_cmp_step(str, next_regexp, slot+1, 0);
260  }
261 
262  return False;
263 }
264 
271 static void re_init(void) {
272  int i;
273 
274  re_token[0] = (selection *)malloc(sizeof(selection));
275  if (re_token[0] == NULL)
277  for (i = 1; i < RE_TOKEN_MAX; i++)
278  re_token[i] = NULL;
279 
280  re_init_done = True;
281 }
282 
294  switch (sel->type) {
295  case sel_any:
296  return True;
297 
298  case sel_end:
299  return (c == 0);
300 
301  case sel_single:
302  return (tolower(c) == tolower(sel->u.single));
303 
304  case sel_range:
305  return (c >= sel->u.range.low && c <= sel->u.range.high);
306 
307  case sel_array:
308  return (sel->u.array[c]);
309 
310  case sel_not_single:
311  return (tolower(c) != tolower(sel->u.single));
312 
313  case sel_not_range:
314  return (c < sel->u.range.low && c > sel->u.range.high);
315  }
316 
317  return False;
318 }
319 
331 static const char *re_get_token(selection *sel, const char *regexp) {
332 #ifdef SAFE_CHECKS
333 # define exit_if_null if (*regexp == 0) return NULL
334 #else
335 # define exit_if_null
336 #endif
337  Boolean quoted = False;
338  uchar looking_at;
339 
340 #ifdef SAFE_CHECKS
341  if (sel == NULL || regexp == NULL || *regexp == 0)
342  return NULL;
343 #endif
344 
345  do {
346  looking_at = *regexp++;
347  switch (looking_at) {
348  case '$':
349  if (quoted) {
350  quoted = False;
351  sel->type = sel_single;
352  sel->u.single = looking_at;
353  } else {
354  sel->type = sel_end;
355  }
356  break;
357 
358  case '.':
359  if (quoted) {
360  quoted = False;
361  sel->type = sel_single;
362  sel->u.single = looking_at;
363  } else {
364  sel->type = sel_any;
365  }
366  break;
367 
368  case '[':
369  /* The fun stuff... perhaps a little obfuscated since I
370  * don't trust the compiler to analyse liveness.
371  */
372  if (quoted) {
373  quoted = False;
374  sel->type = sel_single;
375  sel->u.single = looking_at;
376  } else {
377  Boolean neg = False;
378  uchar first, last = 0;
379 
380  exit_if_null;
381  looking_at = *regexp++;
382 
383  if (looking_at == '^') {
384  neg = True;
385  exit_if_null;
386  looking_at = *regexp++;
387  }
388  first = looking_at;
389  exit_if_null;
390  looking_at = *regexp++;
391  if (looking_at == ']') {
392  /* On the form [q] or [^q] */
393  sel->type = neg ? sel_not_single : sel_single;
394  sel->u.single = first;
395  break;
396  } else if (looking_at == '-') {
397  exit_if_null;
398  last = *regexp++;
399  if (last == ']') {
400  /* On the form [A-] or [^A-]. Checking for
401  * [,-] and making it a range is probably not
402  * worth it :-)
403  */
404  sel->type = sel_array;
405  memset(sel->u.array, neg, sizeof(sel->u.array));
406  sel->u.array[first] = sel->u.array['-'] = !neg;
407  break;
408  } else {
409  exit_if_null;
410  looking_at = *regexp++;
411  if (looking_at == ']') {
412  /* On the form [A-G] or [^A-G]. Note that [G-A]
413  * is a syntax error. Fair enough, I think.
414  */
415 #ifdef SAFE_CHECKS
416  if (first > last)
417  return NULL;
418 #endif
419  sel->type = neg ? sel_not_range : sel_range;
420  sel->u.range.low = first;
421  sel->u.range.high = last;
422  break;
423  }
424  }
425  }
426  {
427  /* The datastructure can only represent a RE this
428  * complex with an array.
429  */
430  int i;
431  uchar previous;
432 
433  sel->type = sel_array;
434  memset(sel->u.array, neg, sizeof(sel->u.array));
435  if (last) {
436  /* It starts with a range */
437 #ifdef SAFE_CHECKS
438  if (first > last)
439  return NULL;
440 #endif
441  for (i = first; i <= last; i++) {
442  sel->u.array[i] = !neg;
443  }
444  } else {
445  /* It begins with a "random" character */
446  sel->u.array[first] = !neg;
447  }
448  sel->u.array[looking_at] = !neg;
449 
450  exit_if_null;
451  previous = looking_at;
452  looking_at = *regexp++;
453 
454  /* Add more characters to the array until we reach
455  * ]. Quoting doesn't and shouldn't work in here.
456  * ("]" should be put first, and "-" last if they
457  * are needed inside this construct.)
458  * Look for ranges as we go along.
459  */
460  while (looking_at != ']') {
461  if (looking_at == '-') {
462  exit_if_null;
463  looking_at = *regexp++;
464  if (looking_at != ']') {
465 #ifdef SAFE_CHECKS
466  if (previous > looking_at)
467  return NULL;
468 #endif
469  for (i = previous+1; i < looking_at; i++) {
470  /* previous has already been set and
471  * looking_at is set below.
472  */
473  sel->u.array[i] = !neg;
474  }
475  exit_if_null;
476  } else {
477  sel->u.array['-'] = !neg;
478  break;
479  }
480  }
481  sel->u.array[looking_at] = !neg;
482  previous = looking_at;
483  exit_if_null;
484  looking_at = *regexp++;
485  }
486  }
487  }
488  break;
489 
490  case '\\':
491  if (quoted) {
492  quoted = False;
493  sel->type = sel_single;
494  sel->u.single = looking_at;
495  } else {
496  quoted = True;
497  }
498  break;
499 
500  default:
501  quoted = False;
502  sel->type = sel_single;
503  sel->u.single = looking_at;
504  break;
505  }
506  } while (quoted);
507 
508  if (*regexp == '*') {
509  sel->repeat = rep_null_or_more;
510  ++regexp;
511  } else if (*regexp == '?') {
512  sel->repeat = rep_null_or_once;
513  ++regexp;
514  } else if (*regexp == '+') {
515  sel->repeat = rep_once_or_more;
516  ++regexp;
517  } else {
518  sel->repeat = rep_once;
519  }
520 
521  return regexp;
522 }
523 
static Boolean re_match_token(uchar, selection *)
Tests if a char matches a token.
Definition: re-cmp.c:293
static Boolean re_cmp_step(const char *, const char *, unsigned, int)
Tries to match a string with a regexp.
Definition: re-cmp.c:190
selection_type type
Definition: re-cmp.h:72
Datastructures for representing a subset of regular expressions.
void fatal(enum fatal_error err)
fatal() is meant to be called whenever a fatal signal is intercepted.
Definition: utils.c:596
#define RE_TOKEN_MAX
Max amount of tokens in a regexp.
Definition: re-cmp.h:21
corresponds to eg *
Definition: re-cmp.h:68
static Boolean re_init_done
Definition: re-cmp.c:50
#define False
Definition: re-cmp.h:52
corresponds to +
Definition: re-cmp.h:66
union selection::@1 u
Global type definitions and header inclusions.
corresponds to eg q
Definition: re-cmp.h:57
uchar single
Definition: re-cmp.h:74
static void re_init(void)
Init the regular expression structures.
Definition: re-cmp.c:271
static const char * re_substr[RE_TOKEN_MAX]
Definition: re-cmp.c:52
static int matches(const char *exp, const char *text)
Does the text match the expression?
Definition: dialog.c:79
#define exit_if_null
static const char * re_get_token(selection *, const char *)
Get the first regular expression token found in regexp in sel.
Definition: re-cmp.c:331
corresponds to eg [A-F]
Definition: re-cmp.h:58
struct selection::@1::@2 range
corresponds to eg [AF-RqO-T]
Definition: re-cmp.h:59
repetetion_type repeat
Definition: re-cmp.h:80
#define True
Changing this value will break the code.
Definition: re-cmp.h:51
Boolean array[UCHAR_MAX]
Definition: re-cmp.h:78
const char * re_cmp(const char *, const char *)
re-cmp - get regular expression match.
Definition: re-cmp.c:69
#define tolower(C)
Simple macro to convert a letter to lowercase.
Definition: c_new.c:29
#define Boolean
Definition: re-cmp.h:50
corresponds to eg [^f]
Definition: re-cmp.h:60
corresponds to no meta-char
Definition: re-cmp.h:65
corresponds to eg $
Definition: re-cmp.h:56
static selection * re_token[RE_TOKEN_MAX]
Definition: re-cmp.c:51
static unsigned int re_token_depth
Definition: re-cmp.c:53
corresponds to eg [^A-F]
Definition: re-cmp.h:61
corresponds to eg .
Definition: re-cmp.h:55
#define uchar
Definition: re-cmp.h:49
corresponds to eg ?
Definition: re-cmp.h:67
Core defines: object types, flags, etc.