/* trunk/html/re-cmp_8c_source.html */

/*

 * Crossfire -- cooperative multi-player graphical RPG and adventure game

 *

 * Copyright (c) 1999-2014 Mark Wedel and the Crossfire Development Team

 * Copyright (c) 1992 Frank Tore Johansen

 *

 * Crossfire is free software and comes with ABSOLUTELY NO WARRANTY. You are

 * welcome to redistribute it under certain conditions. For details, please

 * see COPYING and LICENSE.

 *

 * The authors can be reached via e-mail at <crossfire@metalforge.org>.

 */


#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <limits.h>

#include <re-cmp.h>

#include <ctype.h>

#include <global.h>

#include <define.h> /* Needed for OUT_OF_MEMORY. */


/*   P r o t o t y p e s

 */

const char *re_cmp(const char *, const char *);

static Boolean re_cmp_step(const char *, const char *, unsigned, int);

static void re_init(void);

static Boolean re_match_token(uchar, selection *);

static const char *re_get_token(selection *, const char *);


/*   G l o b a l   v a r i a b l e s
 */

/* Becomes True once re_init() has prepared the token table. */
static Boolean re_init_done = False;

/* Parsed tokens, indexed by slot.  Slot 0 is allocated by re_init(),
 * the other slots are allocated on demand by re_cmp_step().
 */
static selection *re_token[RE_TOKEN_MAX];

/* For each slot, the part of the regexp that follows that slot's token. */
static const char *re_substr[RE_TOKEN_MAX];

/* Highest slot that has been tokenised for the regexp being matched;
 * reset to 0 by re_cmp().
 */
static unsigned re_token_depth;


/*   E x t e r n a l   f u n c t i o n

 */


/**
 * Search a string for a match of a (simplified) regular expression.
 *
 * The first token of the regexp is parsed into slot 0 here; the rest of
 * the regexp is handed to re_cmp_step(), which tokenises it as needed.
 * A leading '^' anchors the match at the first character of str.
 *
 * @param str
 * string to search.
 * @param regexp
 * pattern to look for. An empty pattern (or just "^") matches any string.
 * @return
 * pointer into str where the match starts, or NULL if nothing matched.
 * NULL is also returned when the first token cannot be parsed and, if
 * SAFE_CHECKS is defined, when either argument is NULL.
 */
const char *re_cmp(const char *str, const char *regexp) {
    const char *next_regexp;
    Boolean once = False;
    Boolean matched;

    if (re_init_done == False)
        re_init();

#ifdef SAFE_CHECKS
    if (regexp == NULL || str == NULL)
        return NULL;
#endif
    if (*regexp == '^') {
        once = True;
        ++regexp;
    }
    if (*regexp == 0) {
        /* // or /^/ matches any string */
        return str;
    }

    next_regexp = re_get_token(re_token[0], regexp);
    if (next_regexp == NULL) {
        /* Syntax error in the first token: report "no match" instead of
         * dereferencing a NULL pointer below.
         */
        return NULL;
    }
    re_token_depth = 0;
    re_substr[0] = next_regexp;

    if (once) {
        /* Anchored: only the first character of str may start the match,
         * so do not scan forward.  As in the unanchored scan, the
         * terminating NUL is never offered to the first token.
         */
        matched = (*str != '\0' && re_match_token(*str, re_token[0])) ? True : False;
    } else {
        /* Unanchored: skip ahead to the first character that the first
         * token accepts.
         */
        matched = False;
        while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
            str++;
    }

    /* A single-token regexp is fully matched by that one character. */
    if (matched && *next_regexp == 0)
        return str;

    /* Apologies for the nearly duplicated code below, hopefully it
     * speeds things up.
     */
    if (once) {
        switch (re_token[0]->repeat) {
        case rep_once:
            if (matched == False)
                return NULL;
            break;

        case rep_once_or_more:
            if (matched == False)
                return NULL;

            /* Try to let the first token consume more characters. */
            if (re_cmp_step(str+1, regexp, 0, 1))
                return str;
            break;

        case rep_null_or_once:
            /* Optional token absent: continue with the next token at
             * the same position.
             */
            if (matched == False)
                return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
            break;

        case rep_null_or_more:
            if (matched) {
                if (re_cmp_step(str+1, regexp, 0, 1))
                    return str;
            } else {
                return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
            }
            break;
        }

        /* Only reached when the first token matched exactly one
         * character; the rest of the regexp must match what follows.
         */
        return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
    }

    if (matched) {
        switch (re_token[0]->repeat) {
        case rep_once:
        case rep_null_or_once:
            break;

        case rep_once_or_more:
        case rep_null_or_more:
            if (re_cmp_step(str+1, regexp, 0, 1))
                return str;
            break;
        }

        /* re_match_token() only checks a single character.  With a
         * regexp like 'eureca' and a string like 'where is eureca',
         * the scan above stops at the first 'e' (in 'where'), but
         * re_cmp_step() then fails because the following 'r' does not
         * match 'u'.  In that case restart the whole search one
         * character further on, so that a later occurrence can still
         * be found.
         */
        if (re_cmp_step(str+1, next_regexp, 1, 0))
            return str;
        else if (*(str+1) != 0)
            return re_cmp(str+1, regexp);
    }
    return NULL;
}


/*   A u x i l i a r y   f u n c t i o n s
 */


static Boolean re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) {

    const char *next_regexp;

    Boolean matched;


#ifdef DEBUG

/*    fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/

#endif


    if (*regexp == 0) {

        /* When we reach the end of the regexp, the match is a success */

        return True;

    }


    /* This chunk of code makes sure that the regexp-tokenising happens

     * only once. We only tokenise as much as we need.

     */

    if (slot > re_token_depth) {

        re_token_depth = slot;

        if (re_token[slot] == NULL)

            re_token[slot] = (selection *)malloc(sizeof(selection));

        next_regexp = re_get_token(re_token[slot], regexp);

        if (next_regexp == NULL) {

            /* Syntax error, what else can we do? */

            return False;

        }

        re_substr[slot] = next_regexp;

    } else {

        next_regexp = re_substr[slot];

    }


    matched = re_match_token(*str, re_token[slot]);

    if (matched)

        ++matches;


    if (*str == 0)

        return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;


    switch (re_token[slot]->repeat) {

    case rep_once:

        if (matches == 1) { /* (matches == 1) => (matched == True) */

            return re_cmp_step(str+1, next_regexp, slot+1, 0);

        }

        return False;


    case rep_once_or_more:

        if (matched) { /* (matched == True) => (matches >= 1) */

            /* First check if the current token repeats more */

            if (re_cmp_step(str+1, regexp, slot, matches))

                return True;

            return re_cmp_step(str+1, next_regexp, slot+1, 0);

        }

        return False;


    case rep_null_or_once:

        /* We must go on to the next token, but should we advance str? */

        if (matches == 0) {

            return re_cmp_step(str, next_regexp, slot+1, 0);

            } else if (matches == 1) {

            return re_cmp_step(str+1, next_regexp, slot+1, 0);

        }

        return False; /* Not reached */


    case rep_null_or_more:

        if (matched) {

            /* Look for further repeats, advance str */

            if (re_cmp_step(str+1, regexp, slot, matches))

                return True;

            return re_cmp_step(str, next_regexp, slot+1, 0);

        }

        return re_cmp_step(str, next_regexp, slot+1, 0);

    }


    return False;

}


/**
 * One-time setup of the token table.
 *
 * Allocates the selection used for slot 0, clears every other slot so
 * that re_cmp_step() can allocate them on demand, and sets re_init_done.
 * Calls fatal(OUT_OF_MEMORY) if the allocation fails.
 */
static void re_init(void) {
    int slot = RE_TOKEN_MAX;

    re_token[0] = (selection *)malloc(sizeof(selection));
    if (!re_token[0])
        fatal(OUT_OF_MEMORY);

    /* Every slot except the first starts out unallocated. */
    while (--slot >= 1)
        re_token[slot] = NULL;

    re_init_done = True;
}


/**
 * Test a single character against a single parsed token.
 *
 * Single-character tokens (sel_single, sel_not_single) are compared
 * case-insensitively through tolower(); ranges and arrays are compared
 * on the raw character value.
 *
 * @param c
 * character to test; 0 stands for the end of the string.
 * @param sel
 * token to test against.
 * @return
 * True if c is accepted by the token, False otherwise (including for an
 * unknown selection type).
 */
static Boolean re_match_token(uchar c, selection *sel) {
    switch (sel->type) {
    case sel_any:
        return True;

    case sel_end:
        return (c == 0);

    case sel_single:
        return (tolower(c) == tolower(sel->u.single));

    case sel_range:
        return (c >= sel->u.range.low && c <= sel->u.range.high);

    case sel_array:
        return (sel->u.array[c]);

    case sel_not_single:
        return (tolower(c) != tolower(sel->u.single));

    case sel_not_range:
        /* Outside the range means below the low bound OR above the high
         * bound; requiring both at once can never be true.
         */
        return (c < sel->u.range.low || c > sel->u.range.high);
    }

    return False;
}


/**
 * Parse one token from the front of a regexp.
 *
 * Recognised forms are '$' (end of string), '.' (any character), bracket
 * expressions such as [q], [^q], [A-G], [^A-G] or [AF-RqO-T], a backslash
 * followed by a character (that character taken literally), and any other
 * character taken literally.  A token may be followed by '*', '?' or '+',
 * which is stored as its repeat type; otherwise the token repeats once.
 *
 * A backslash that is the last character of the regexp has nothing to
 * quote and is taken as a literal backslash.
 *
 * @param sel
 * where to store the parsed token.
 * @param regexp
 * regexp text, starting at the token to parse.
 * @return
 * pointer to the regexp text following the token (and its repeat
 * character, if any). When SAFE_CHECKS is defined, NULL is returned for
 * NULL/empty input and for syntax errors such as an unterminated bracket
 * expression or a reversed range.
 */
static const char *re_get_token(selection *sel, const char *regexp) {
#ifdef SAFE_CHECKS
#   define exit_if_null if (*regexp == 0) return NULL
#else
#   define exit_if_null
#endif
    Boolean quoted = False;
    uchar looking_at;

#ifdef SAFE_CHECKS
    if (sel == NULL || regexp == NULL || *regexp == 0)
        return NULL;
#endif

    do {
        looking_at = *regexp++;
        switch (looking_at) {
        case '$':
            if (quoted) {
                quoted = False;
                sel->type = sel_single;
                sel->u.single = looking_at;
            } else {
                sel->type = sel_end;
            }
            break;

        case '.':
            if (quoted) {
                quoted = False;
                sel->type = sel_single;
                sel->u.single = looking_at;
            } else {
                sel->type = sel_any;
            }
            break;

        case '[':
            /* The fun stuff... perhaps a little obfuscated since I
             * don't trust the compiler to analyse liveness.
             */
            if (quoted) {
                quoted = False;
                sel->type = sel_single;
                sel->u.single = looking_at;
            } else {
                Boolean neg = False;
                uchar first, last = 0;

                exit_if_null;
                looking_at = *regexp++;

                if (looking_at == '^') {
                    neg = True;
                    exit_if_null;
                    looking_at = *regexp++;
                }
                first = looking_at;
                exit_if_null;
                looking_at = *regexp++;
                if (looking_at == ']') {
                    /* On the form [q] or [^q] */
                    sel->type = neg ? sel_not_single : sel_single;
                    sel->u.single = first;
                    break;
                } else if (looking_at == '-') {
                    exit_if_null;
                    last = *regexp++;
                    if (last == ']') {
                        /* On the form [A-] or [^A-]. Checking for
                         * [,-] and making it a range is probably not
                         * worth it :-)
                         */
                        sel->type = sel_array;
                        memset(sel->u.array, neg, sizeof(sel->u.array));
                        sel->u.array[first] = sel->u.array['-'] = !neg;
                        break;
                    } else {
                        exit_if_null;
                        looking_at = *regexp++;
                        if (looking_at == ']') {
                            /* On the form [A-G] or [^A-G]. Note that [G-A]
                             * is a syntax error. Fair enough, I think.
                             */
#ifdef SAFE_CHECKS
                            if (first > last)
                                return NULL;
#endif
                            sel->type = neg ? sel_not_range : sel_range;
                            sel->u.range.low = first;
                            sel->u.range.high = last;
                            break;
                        }
                    }
                }
                {
                    /* The datastructure can only represent a RE this
                     * complex with an array.
                     */
                    int i;
                    uchar previous;

                    sel->type = sel_array;
                    memset(sel->u.array, neg, sizeof(sel->u.array));
                    if (last) {
                        /* It starts with a range */
#ifdef SAFE_CHECKS
                        if (first > last)
                            return NULL;
#endif
                        for (i = first; i <= last; i++) {
                            sel->u.array[i] = !neg;
                        }
                    } else {
                        /* It begins with a "random" character */
                        sel->u.array[first] = !neg;
                    }
                    sel->u.array[looking_at] = !neg;

                    exit_if_null;
                    previous = looking_at;
                    looking_at = *regexp++;

                    /* Add more characters to the array until we reach
                     * ]. Quoting doesn't and shouldn't work in here.
                     * ("]" should be put first, and "-" last if they
                     * are needed inside this construct.)
                     * Look for ranges as we go along.
                     */
                    while (looking_at != ']') {
                        if (looking_at == '-') {
                            exit_if_null;
                            looking_at = *regexp++;
                            if (looking_at != ']') {
#ifdef SAFE_CHECKS
                                if (previous > looking_at)
                                    return NULL;
#endif
                                for (i = previous+1; i < looking_at; i++) {
                                    /* previous has already been set and
                                     * looking_at is set below.
                                     */
                                    sel->u.array[i] = !neg;
                                }
                                exit_if_null;
                            } else {
                                sel->u.array['-'] = !neg;
                                break;
                            }
                        }
                        sel->u.array[looking_at] = !neg;
                        previous = looking_at;
                        exit_if_null;
                        looking_at = *regexp++;
                    }
                }
            }
            break;

        case '\\':
            if (quoted) {
                quoted = False;
                sel->type = sel_single;
                sel->u.single = looking_at;
            } else if (*regexp == 0) {
                /* Trailing backslash: there is nothing left to quote.
                 * Take it literally instead of consuming the string
                 * terminator and reading past the end of the regexp.
                 */
                sel->type = sel_single;
                sel->u.single = looking_at;
            } else {
                quoted = True;
            }
            break;

        default:
            quoted = False;
            sel->type = sel_single;
            sel->u.single = looking_at;
            break;
        }
    } while (quoted);

    /* An optional repeat character directly follows the token. */
    if (*regexp == '*') {
        sel->repeat = rep_null_or_more;
        ++regexp;
    } else if (*regexp == '?') {
        sel->repeat = rep_null_or_once;
        ++regexp;
    } else if (*regexp == '+') {
        sel->repeat = rep_once_or_more;
        ++regexp;
    } else {
        sel->repeat = rep_once;
    }

    return regexp;
}