You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
463 lines
12 KiB
463 lines
12 KiB
/*
|
|
* Knuth-Morris-Pratt's Substring Search for N given strings
|
|
*
|
|
* (c) 1999--2005, Robert Spalek <robert@ucw.cz>
|
|
* (c) 2006, Pavel Charvat <pchar@ucw.cz>
|
|
*
|
|
* (In fact, the algorithm is usually referred to as Aho-Corasick,
|
|
* but that's just an extension of KMP to multiple strings.)
|
|
*/
|
|
|
|
/*
|
|
* This is not a normal header file, it's a generator of KMP algorithm.
|
|
* Each time you include it with parameters set in the corresponding
|
|
* preprocessor macros, it generates KMP structures and functions
|
|
* with the parameters given.
|
|
*
|
|
* This file contains only construction of the automaton. The search
|
|
* itself can be generated by inclusion of file ucw/kmp-search.h.
|
|
* Separated headers allow the user to define multiple search
|
|
* routines for one common set of key strings.
|
|
*
|
|
* Example:
|
|
*
|
|
* #define KMP_PREFIX(x) kmp_##x
|
|
* #define KMP_WANT_CLEANUP
|
|
* #define KMP_WANT_SEARCH // includes ucw/kmp-search.h automatically
|
|
* #define KMPS_FOUND(kmp,src,s) printf("found\n")
|
|
* #include <ucw/kmp.h>
|
|
*
|
|
* [...]
|
|
*
|
|
* struct kmp_struct kmp; // a structure describing the whole automaton
|
|
* kmp_init(&kmp); // initialization (must be called before all other functions)
|
|
*
|
|
* // add key strings we want to search
|
|
* kmp_add(&kmp, "aaa");
|
|
* kmp_add(&kmp, "abc");
|
|
*
|
|
* // complete the automaton, no more strings can be added later
|
|
* kmp_build(&kmp);
|
|
*
|
|
* // example of search, should print single "found" to stdout
|
|
* kmp_run(&kmp, "aabaabca");
|
|
*
|
|
* // destroy all internal structures
|
|
* kmp_cleanup(&kmp);
|
|
*
|
|
*
|
|
* Brief description of all parameters:
|
|
*
|
|
* Basic parameters:
|
|
* KMP_PREFIX(x) macro to add a name prefix (used on all global names
|
|
* defined by the KMP generator); mandatory;
|
|
* we abbreviate this to P(x) below
|
|
*
|
|
* KMP_CHAR alphabet type, the default is u16
|
|
*
|
|
* KMP_SOURCE user-defined text source; KMP_GET_CHAR must
|
|
* KMP_GET_CHAR(kmp,src,c) return zero at the end or nonzero together with the next character in c otherwise;
|
|
* if not defined, zero-terminated array of bytes is used as the input
|
|
*
|
|
* KMP_VARS user-defined variables in 'struct P(struct)'
|
|
* -- a structure describing the whole automaton;
|
|
* these variables are stored in .u substructure to avoid collisions
|
|
* KMP_STATE_VARS user-defined variables in 'struct P(state)'
|
|
* -- created for each state of the automaton;
|
|
* these variables are stored in .u substructure to avoid collisions
|
|
*
|
|
* Parameters which select how the input is interpreted (if KMP_SOURCE is unset):
|
|
* KMP_USE_ASCII reads single bytes from the input (default)
|
|
* KMP_USE_UTF8 reads UTF-8 characters from the input (valid UTF-8 needed)
|
|
* KMP_TOLOWER converts all to lowercase
|
|
* KMP_UNACCENT removes accents
|
|
* KMP_ONLYALPHA converts non-alphas to KMP_CONTROL_CHAR (see below)
|
|
*
|
|
* Parameters controlling add(kmp, src):
|
|
* KMP_ADD_EXTRA_ARGS extra arguments, should be used carefully because of possible collisions
|
|
* KMP_ADD_INIT(kmp,src) called in the beginning of add(), before any character is read from src
|
|
* KMP_INIT_STATE(kmp,s) initialization of a new state s (called before KMP_ADD_{NEW,DUP});
|
|
* null state is not included and should be handled after init() if necessary;
|
|
* all user-defined data are filled by zeros before call to KMP_INIT_STATE
|
|
* KMP_ADD_NEW(kmp,src,s) initialize last state of every new key string (called after KMP_INIT_STATE);
|
|
* the string must be parsed before so src is after the last string's character
|
|
* KMP_ADD_DUP(kmp,src,s) analogy of KMP_ADD_NEW called for duplicates
|
|
*
|
|
* Parameters to build():
|
|
* KMP_BUILD_STATE(kmp,s) called for all states (including null) in order of non-decreasing tree depth
|
|
*
|
|
* Other parameters:
|
|
* KMP_WANT_CLEANUP define cleanup()
|
|
* KMP_WANT_SEARCH includes ucw/kmp-search.h with the same prefix;
|
|
* there can be multiple search variants for a single KMP automaton
|
|
* KMP_USE_POOL allocates in a given pool
|
|
* KMP_CONTROL_CHAR special control character (default is ':')
|
|
* KMP_GIVE_ALLOC if set, you must supply custom allocation functions:
|
|
* void *alloc(struct P(struct) *kmp, unsigned int size) -- allocate space for
* a state. Default is pooled allocation from a local pool or KMP_USE_POOL.
* void free(struct P(struct) *kmp, void *) -- the converse.
|
|
* KMP_GIVE_HASHFN if set, you must supply custom hash function:
|
|
* unsigned int hash(struct P(struct) *kmp, struct P(state) *state, KMP_CHAR c);
|
|
* default hash function works only for integer character types
|
|
* KMP_GIVE_EQ if set, you must supply custom compare function of two characters:
|
|
* int eq(struct P(struct) *kmp, KMP_CHAR a, KMP_CHAR b);
|
|
* default is 'a == b'
|
|
*/
|
|
|
|
/* The name prefix is the only mandatory parameter of the generator. */
#if !defined(KMP_PREFIX)
#error Missing KMP_PREFIX
#endif
|
|
|
|
#include <ucw/mempool.h>
|
|
#include <alloca.h>
|
|
#include <string.h>
|
|
|
|
#define P(x) KMP_PREFIX(x)
|
|
|
|
#ifdef KMP_CHAR
|
|
typedef KMP_CHAR P(char_t);
|
|
#else
|
|
typedef u16 P(char_t);
|
|
#endif
|
|
|
|
typedef u32 P(len_t);
|
|
|
|
#ifdef KMP_NODE
|
|
typedef KMP_NODE P(node_t);
|
|
#else
|
|
typedef struct {} P(node_t);
|
|
#endif
|
|
|
|
struct P(struct);
|
|
|
|
/*
 * One state of the automaton.
 *
 * While key strings are being added, `back' and `next' are borrowed to
 * hold a linked list of sons (`back' = first son, `next' = next sibling);
 * build() rewrites them to their final meaning described below.
 */
struct P(state) {
  /* Parent state, i.e. the state reached before the last character
   * (the states form a tree rooted in the null state). */
  struct P(state) *from;
  /* Backward edge: the longest shorter state sharing the same suffix. */
  struct P(state) *back;
  /* The longest of the shorter matches, or NULL if there is none. */
  struct P(state) *next;
  /* Depth of the state if it represents a key string, zero otherwise. */
  P(len_t) len;
  /* Last character of the string represented by this state. */
  P(char_t) c;
  /* User-defined per-state data (KMP_STATE_VARS). */
  struct {
#   ifdef KMP_STATE_VARS
    KMP_STATE_VARS
#   endif
  } u;
};
|
|
|
|
/* Control char */
|
|
static inline P(char_t)
|
|
P(control) (void)
|
|
{
|
|
# ifdef KMP_CONTROL_CHAR
|
|
return KMP_CONTROL_CHAR;
|
|
# else
|
|
return ':';
|
|
# endif
|
|
}
|
|
|
|
/* User-defined source */
|
|
struct P(hash_table);
|
|
|
|
#define HASH_GIVE_HASHFN
|
|
#ifdef KMP_GIVE_HASHFN
|
|
static inline uint
|
|
P(hash_hash) (struct P(hash_table) *t, struct P(state) *f, P(char_t) c)
|
|
{
|
|
return P(hash) ((struct P(struct) *) t, f, c);
|
|
}
|
|
#else
|
|
static inline uint
|
|
P(hash_hash) (struct P(hash_table) *t UNUSED, struct P(state) *f, P(char_t) c)
|
|
{
|
|
return (((uint)c) << 16) + (uint)(uintptr_t)f;
|
|
}
|
|
#endif
|
|
|
|
#ifndef KMP_GIVE_EQ
|
|
static inline int
|
|
P(eq) (struct P(struct) *kmp UNUSED, P(char_t) c1, P(char_t) c2)
|
|
{
|
|
return c1 == c2;
|
|
}
|
|
#endif
|
|
|
|
static inline int
|
|
P(is_control) (struct P(struct) *kmp, P(char_t) c)
|
|
{
|
|
return P(eq) (kmp, c, P(control)());
|
|
}
|
|
|
|
#define HASH_GIVE_EQ
|
|
static inline int
|
|
P(hash_eq) (struct P(hash_table) *t, struct P(state) *f1, P(char_t) c1, struct P(state) *f2, P(char_t) c2)
|
|
{
|
|
return f1 == f2 && P(eq)((struct P(struct) *) t, c1, c2);
|
|
}
|
|
|
|
#ifdef KMP_GIVE_ALLOC
/*
 * Custom allocation: forward the hash table's allocation requests to the
 * user-supplied P(alloc)() / P(free)(), handing them the table pointer
 * cast to the automaton.
 */
#define HASH_GIVE_ALLOC
static inline void *
P(hash_alloc) (struct P(hash_table) *t, uint size)
{
  struct P(struct) *kmp = (struct P(struct) *) t;
  return P(alloc) (kmp, size);
}

static inline void
P(hash_free) (struct P(hash_table) *t, void *ptr)
{
  struct P(struct) *kmp = (struct P(struct) *) t;
  P(free) (kmp, ptr);
}
#endif
|
|
|
|
#define HASH_GIVE_INIT_KEY
|
|
static inline void
|
|
P(hash_init_key) (struct P(hash_table) *t UNUSED, struct P(state) *s, struct P(state) *f, P(char_t) c)
|
|
{
|
|
bzero(s, sizeof(*s));
|
|
# ifdef KMP_INIT_STATE
|
|
UNUSED struct P(struct) *kmp = (struct P(struct) *)t;
|
|
{ KMP_INIT_STATE(kmp, s); }
|
|
# endif
|
|
s->from = f;
|
|
s->c = c;
|
|
s->next = f->back; /* the pointers hold the link-list of sons... changed in build() */
|
|
f->back = s;
|
|
}
|
|
|
|
/*
 * Instantiate the generic hash table for the state transitions.
 * P() is dropped before the inclusion and defined again afterwards
 * (presumably because ucw/hashtable.h uses the same helper name -- confirm).
 */
#undef P

/* Naming and key layout: nodes are states, keys are (from, c) pairs. */
#define HASH_PREFIX(x) KMP_PREFIX(hash_##x)
#define HASH_NODE struct KMP_PREFIX(state)
#define HASH_KEY_COMPLEX(x) x from, x c
#define HASH_KEY_DECL struct KMP_PREFIX(state) *from, KMP_PREFIX(char_t) c

/* Requested operations. */
#define HASH_WANT_NEW
#define HASH_WANT_FIND
#ifdef KMP_WANT_CLEANUP
#define HASH_WANT_CLEANUP
#endif

/* Allocation: the user's pool if given, an automatic pool otherwise. */
#ifndef KMP_USE_POOL
#define HASH_AUTO_POOL 4096
#else
#define HASH_USE_POOL KMP_USE_POOL
#endif

#define HASH_CONSERVE_SPACE
#define HASH_TABLE_DYNAMIC
#include <ucw/hashtable.h>

#define P(x) KMP_PREFIX(x)
|
|
|
|
/*
 * The whole automaton.  The hash table is the first member: the hash
 * callbacks in this file cast the table pointer to a pointer to this
 * structure.
 */
struct P(struct) {
  /* Hash table of state transitions. */
  struct P(hash_table) hash;
  /* The null state. */
  struct P(state) null;
  /* User-defined data (KMP_VARS). */
  struct {
#   ifdef KMP_VARS
    KMP_VARS
#   endif
  } u;
};
|
|
|
|
/* Type of the text source: KMP_SOURCE when supplied, `char *' otherwise. */
#ifndef KMP_SOURCE
typedef char *P(source_t);
#else
typedef KMP_SOURCE P(source_t);
#endif
|
|
|
|
#ifdef KMP_GET_CHAR
|
|
static inline int
|
|
P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src UNUSED, P(char_t) *c UNUSED)
|
|
{
|
|
return KMP_GET_CHAR(kmp, (*src), (*c));
|
|
}
|
|
#else
|
|
# if defined(KMP_USE_UTF8)
|
|
# include <ucw/unicode.h>
|
|
# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER) || defined(KMP_UNACCENT)
|
|
# include <charset/unicat.h>
|
|
# endif
|
|
# elif defined(KMP_USE_ASCII)
|
|
# if defined(KMP_ONLYALPHA) || defined(KMP_TOLOWER)
|
|
# include <ucw/chartype.h>
|
|
# endif
|
|
# endif
|
|
static inline int
|
|
P(get_char) (struct P(struct) *kmp UNUSED, P(source_t) *src, P(char_t) *c)
|
|
{
|
|
# ifdef KMP_USE_UTF8
|
|
uint cc;
|
|
*src = utf8_get(*src, &cc);
|
|
# ifdef KMP_ONLYALPHA
|
|
if (!cc) {}
|
|
else if (!Ualpha(cc))
|
|
cc = P(control)();
|
|
else
|
|
# endif
|
|
{
|
|
# ifdef KMP_TOLOWER
|
|
cc = Utolower(cc);
|
|
# endif
|
|
# ifdef KMP_UNACCENT
|
|
cc = Uunaccent(cc);
|
|
# endif
|
|
}
|
|
# else
|
|
uint cc = *(*src)++;
|
|
# ifdef KMP_ONLYALPHA
|
|
if (!cc) {}
|
|
else if (!Calpha(cc))
|
|
cc = P(control)();
|
|
else
|
|
# endif
|
|
# ifdef KMP_TOLOWER
|
|
cc = Clocase(cc);
|
|
# endif
|
|
# ifdef KMP_UNACCENT
|
|
# error Do not know how to unaccent ASCII characters
|
|
# endif
|
|
# endif
|
|
*c = cc;
|
|
return !!cc;
|
|
}
|
|
#endif
|
|
|
|
/*
 * P(add)(kmp, src [, extra args]) -- insert one key string.
 *
 * Walks the tree from the null state along the characters of the key,
 * creating the missing states on the way, and marks the final state with
 * the length of the key.
 *
 * Returns the state representing the key, or NULL if the source yields no
 * character at all.  KMP_ADD_NEW is invoked for a key seen for the first
 * time, KMP_ADD_DUP for a key whose final state already has a nonzero
 * length.
 */
static struct P(state) *
P(add) (struct P(struct) *kmp, P(source_t) src
#   ifdef KMP_ADD_EXTRA_ARGS
    , KMP_ADD_EXTRA_ARGS
#   endif
)
{
# ifdef KMP_ADD_INIT
  { KMP_ADD_INIT(kmp, src); }
# endif

  P(char_t) c;
  if (!P(get_char)(kmp, &src, &c))
    return NULL;

  struct P(state) *p = &kmp->null;
  struct P(state) *s = NULL;
  uint len = 0;
  int created = 0;		/* set once the first new state has been allocated */

  do
    {
      /* Once we have left the existing tree, no deeper state can exist,
       * so the lookup is skipped and states are only created. */
      if (!created)
	s = P(hash_find)(&kmp->hash, p, c);
      if (created || !s)
	{
	  s = P(hash_new)(&kmp->hash, p, c);
	  created = 1;
	}
      p = s;
      len++;
    }
  while (P(get_char)(kmp, &src, &c));

  /* Only a pre-existing final state is tested for being a duplicate. */
  if (!created && s->len)
    {
#     ifdef KMP_ADD_DUP
      { KMP_ADD_DUP(kmp, src, s); }
#     endif
      return s;
    }

  s->len = len;
# ifdef KMP_ADD_NEW
  { KMP_ADD_NEW(kmp, src, s); }
# endif
  return s;
}
|
|
|
|
static void
|
|
P(init) (struct P(struct) *kmp)
|
|
{
|
|
bzero(&kmp->null, sizeof(struct P(state)));
|
|
P(hash_init)(&kmp->hash);
|
|
}
|
|
|
|
#ifdef KMP_WANT_CLEANUP
/*
 * P(cleanup)(kmp) -- release the automaton by cleaning up its hash table.
 * Generated only on request (KMP_WANT_CLEANUP).
 */
static inline void
P(cleanup) (struct P(struct) *kmp)
{
  struct P(hash_table) *table = &kmp->hash;
  P(hash_cleanup)(table);
}
#endif
|
|
|
|
static inline int
|
|
P(empty) (struct P(struct) *kmp)
|
|
{
|
|
return !kmp->hash.hash_count;
|
|
}
|
|
|
|
/*
 * P(chain_start)(s) -- returns `s' itself when it has a nonzero length
 * (i.e., it represents a key string), otherwise s->next.
 */
static inline struct P(state) *
P(chain_start) (struct P(state) *s)
{
  if (s->len)
    return s;
  return s->next;
}
|
|
|
|
static void
|
|
P(build) (struct P(struct) *kmp)
|
|
{
|
|
if (P(empty)(kmp))
|
|
return;
|
|
uint read = 0, write = 0;
|
|
struct P(state) *fifo[kmp->hash.hash_count], *null = &kmp->null;
|
|
for (struct P(state) *s = null->back; s; s = s->next)
|
|
fifo[write++] = s;
|
|
null->back = NULL;
|
|
# ifdef KMP_BUILD_STATE
|
|
{ KMP_BUILD_STATE(kmp, null); }
|
|
# endif
|
|
while (read != write)
|
|
{
|
|
struct P(state) *s = fifo[read++], *t;
|
|
for (t = s->back; t; t = t->next)
|
|
fifo[write++] = t;
|
|
for (t = s->from->back; 1; t = t->back)
|
|
{
|
|
if (!t)
|
|
{
|
|
s->back = null;
|
|
s->next = NULL;
|
|
break;
|
|
}
|
|
s->back = P(hash_find)(&kmp->hash, t, s->c);
|
|
if (s->back)
|
|
{
|
|
s->next = s->back->len ? s->back : s->back->next;
|
|
break;
|
|
}
|
|
}
|
|
# ifdef KMP_BUILD_STATE
|
|
{ KMP_BUILD_STATE(kmp, s); }
|
|
# endif
|
|
}
|
|
}
|
|
|
|
/*
 * Forget all parameters, so that the generator can be included again with
 * a different configuration.  KMP_WANT_SEARCH and KMP_PREFIX are handled
 * separately afterwards.
 */
#undef P
#undef KMP_CHAR
#undef KMP_NODE
#undef KMP_SOURCE
#undef KMP_GET_CHAR
#undef KMP_VARS
#undef KMP_STATE_VARS
#undef KMP_CONTEXT
#undef KMP_USE_ASCII
#undef KMP_USE_UTF8
#undef KMP_TOLOWER
#undef KMP_UNACCENT
#undef KMP_ONLYALPHA
#undef KMP_CONTROL_CHAR
#undef KMP_ADD_EXTRA_ARGS
#undef KMP_ADD_INIT
#undef KMP_ADD_NEW
#undef KMP_ADD_DUP
#undef KMP_INIT_STATE
#undef KMP_BUILD_STATE
#undef KMP_WANT_CLEANUP
#undef KMP_USE_POOL
#undef KMP_GIVE_ALLOC
#undef KMP_GIVE_HASHFN
#undef KMP_GIVE_EQ
|
|
|
|
/*
 * On request, generate a search routine as well: include ucw/kmp-search.h
 * with the same prefix used both for the search and for the automaton.
 */
#if defined(KMP_WANT_SEARCH)
# undef KMP_WANT_SEARCH
# define KMPS_PREFIX(x) KMP_PREFIX(x)
# define KMPS_KMP_PREFIX(x) KMP_PREFIX(x)
# include <ucw/kmp-search.h>
#endif

/* The prefix goes last, because the search generator above still uses it. */
#undef KMP_PREFIX
|
|
|