351 lines
7.1 KiB
C
351 lines
7.1 KiB
C
/*
 *	UCW Library -- Interface to Regular Expression Libraries
 *
 *	(c) 1997--2004 Martin Mares <mj@ucw.cz>
 *	(c) 2001 Robert Spalek <robert@ucw.cz>
 *
 *	This software may be freely distributed and used according to the terms
 *	of the GNU Lesser General Public License.
 */
|
|
|
|
#include <ucw/lib.h>
|
|
#include <ucw/chartype.h>
|
|
#include <ucw/hashfunc.h>
|
|
#include <ucw/regex.h>
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
#ifdef CONFIG_UCW_POSIX_REGEX
|
|
|
|
/* POSIX regular expression library */
|
|
|
|
#include <regex.h>
|
|
|
|
/* A compiled pattern together with the offsets recorded by the last regexec() call. */
struct regex {
  regex_t rx;			/* Pattern compiled by regcomp() */
  regmatch_t matches[10];	/* Match offsets: [0] is the entire match, [1]..[9] the sub-expressions */
};
|
|
|
|
regex *
|
|
rx_compile(const char *p, int icase)
|
|
{
|
|
regex *r = xmalloc_zero(sizeof(regex));
|
|
|
|
int err = regcomp(&r->rx, p, REG_EXTENDED | (icase ? REG_ICASE : 0));
|
|
if (err)
|
|
{
|
|
char msg[256];
|
|
regerror(err, &r->rx, msg, sizeof(msg)-1);
|
|
/* regfree(&r->rx) not needed */
|
|
die("Error parsing regular expression `%s': %s", p, msg);
|
|
}
|
|
return r;
|
|
}
|
|
|
|
void
|
|
rx_free(regex *r)
|
|
{
|
|
regfree(&r->rx);
|
|
xfree(r);
|
|
}
|
|
|
|
int
|
|
rx_match(regex *r, const char *s)
|
|
{
|
|
int err = regexec(&r->rx, s, 10, r->matches, 0);
|
|
if (!err)
|
|
{
|
|
/* regexec doesn't support anchored expressions, so we have to check ourselves that the full string is matched */
|
|
return !(r->matches[0].rm_so || s[r->matches[0].rm_eo]);
|
|
}
|
|
else if (err == REG_NOMATCH)
|
|
return 0;
|
|
else if (err == REG_ESPACE)
|
|
die("Regex matching ran out of memory");
|
|
else
|
|
die("Regex matching failed with unknown error %d", err);
|
|
}
|
|
|
|
int
|
|
rx_subst(regex *r, const char *by, const char *src, char *dest, uint destlen)
|
|
{
|
|
char *end = dest + destlen - 1;
|
|
|
|
if (!rx_match(r, src))
|
|
return 0;
|
|
|
|
while (*by)
|
|
{
|
|
if (*by == '\\')
|
|
{
|
|
by++;
|
|
if (*by >= '0' && *by <= '9') /* \0 gets replaced by entire pattern */
|
|
{
|
|
uint j = *by++ - '0';
|
|
if (j <= r->rx.re_nsub && r->matches[j].rm_so >= 0)
|
|
{
|
|
const char *s = src + r->matches[j].rm_so;
|
|
uint i = r->matches[j].rm_eo - r->matches[j].rm_so;
|
|
if (dest + i >= end)
|
|
return -1;
|
|
memcpy(dest, s, i);
|
|
dest += i;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
if (dest < end)
|
|
*dest++ = *by++;
|
|
else
|
|
return -1;
|
|
}
|
|
*dest = 0;
|
|
return 1;
|
|
}
|
|
|
|
#elif defined(CONFIG_UCW_PCRE)
|
|
|
|
/* PCRE library */
|
|
|
|
#include <pcre.h>
|
|
|
|
struct regex {
|
|
pcre *rx;
|
|
pcre_extra *extra;
|
|
uint match_array_size;
|
|
uint real_matches;
|
|
int matches[0]; /* (max_matches+1) pairs (pos,len) plus some workspace */
|
|
};
|
|
|
|
regex *
|
|
rx_compile(const char *p, int icase)
|
|
{
|
|
const char *err;
|
|
int errpos, match_array_size, eno;
|
|
|
|
pcre *rx = pcre_compile(p, PCRE_ANCHORED | PCRE_EXTRA | (icase ? PCRE_CASELESS : 0), &err, &errpos, NULL);
|
|
if (!rx)
|
|
die("Error parsing regular expression `%s': %s at position %d", p, err, errpos);
|
|
eno = pcre_fullinfo(rx, NULL, PCRE_INFO_CAPTURECOUNT, &match_array_size);
|
|
if (eno)
|
|
die("Internal error: pcre_fullinfo() failed with error %d", eno);
|
|
match_array_size = 3*(match_array_size+1);
|
|
regex *r = xmalloc_zero(sizeof(regex) + match_array_size * sizeof(int));
|
|
r->rx = rx;
|
|
r->match_array_size = match_array_size;
|
|
r->extra = pcre_study(r->rx, 0, &err);
|
|
if (err)
|
|
die("Error studying regular expression `%s': %s", p, err);
|
|
return r;
|
|
}
|
|
|
|
void
|
|
rx_free(regex *r)
|
|
{
|
|
xfree(r->rx);
|
|
xfree(r->extra);
|
|
xfree(r);
|
|
}
|
|
|
|
int
|
|
rx_match(regex *r, const char *s)
|
|
{
|
|
int len = str_len(s);
|
|
int err = pcre_exec(r->rx, r->extra, s, len, 0, 0, r->matches, r->match_array_size);
|
|
if (err >= 0)
|
|
{
|
|
r->real_matches = err;
|
|
/* need to check that the full string matches */
|
|
return !(r->matches[0] || s[r->matches[1]]);
|
|
}
|
|
else if (err == PCRE_ERROR_NOMATCH)
|
|
return 0;
|
|
else if (err == PCRE_ERROR_NOMEMORY)
|
|
die("Regex matching ran out of memory");
|
|
else
|
|
die("Regex matching failed with unknown error %d", err);
|
|
}
|
|
|
|
int
|
|
rx_subst(regex *r, const char *by, const char *src, char *dest, uint destlen)
|
|
{
|
|
char *end = dest + destlen - 1;
|
|
|
|
if (!rx_match(r, src))
|
|
return 0;
|
|
|
|
while (*by)
|
|
{
|
|
if (*by == '\\')
|
|
{
|
|
by++;
|
|
if (*by >= '0' && *by <= '9') /* \0 gets replaced by entire pattern */
|
|
{
|
|
uint j = *by++ - '0';
|
|
if (j < r->real_matches && r->matches[2*j] >= 0)
|
|
{
|
|
const char *s = src + r->matches[2*j];
|
|
uint i = r->matches[2*j+1] - r->matches[2*j];
|
|
if (dest + i >= end)
|
|
return -1;
|
|
memcpy(dest, s, i);
|
|
dest += i;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
if (dest < end)
|
|
*dest++ = *by++;
|
|
else
|
|
return -1;
|
|
}
|
|
*dest = 0;
|
|
return 1;
|
|
}
|
|
|
|
#else
|
|
|
|
/* BSD regular expression library */
|
|
|
|
#include <regex.h>
|
|
|
|
/* Initial size in bytes of the buffer allocated for each compiled pattern */
#define INITIAL_MEM 1024

/* Number of characters in the character set, i.e., entries of the translation table */
#define CHAR_SET_SIZE 256
|
|
|
|
struct regex {
|
|
struct re_pattern_buffer buf;
|
|
struct re_registers regs; /* Must not change between re_match() calls */
|
|
int len_cache;
|
|
};
|
|
|
|
regex *
|
|
rx_compile(const char *p, int icase)
|
|
{
|
|
regex *r = xmalloc_zero(sizeof(regex));
|
|
const char *msg;
|
|
|
|
r->buf.buffer = xmalloc(INITIAL_MEM);
|
|
r->buf.allocated = INITIAL_MEM;
|
|
if (icase)
|
|
{
|
|
uint i;
|
|
r->buf.translate = xmalloc (CHAR_SET_SIZE);
|
|
/* Map uppercase characters to corresponding lowercase ones. */
|
|
for (i = 0; i < CHAR_SET_SIZE; i++)
|
|
r->buf.translate[i] = Cupcase(i);
|
|
}
|
|
else
|
|
r->buf.translate = NULL;
|
|
re_set_syntax(RE_SYNTAX_POSIX_EXTENDED);
|
|
msg = re_compile_pattern(p, strlen(p), &r->buf);
|
|
if (!msg)
|
|
return r;
|
|
die("Error parsing pattern `%s': %s", p, msg);
|
|
}
|
|
|
|
void
|
|
rx_free(regex *r)
|
|
{
|
|
xfree(r->buf.buffer);
|
|
if (r->buf.translate)
|
|
xfree(r->buf.translate);
|
|
xfree(r);
|
|
}
|
|
|
|
int
|
|
rx_match(regex *r, const char *s)
|
|
{
|
|
int len = strlen(s);
|
|
|
|
r->len_cache = len;
|
|
if (re_match(&r->buf, s, len, 0, &r->regs) < 0)
|
|
return 0;
|
|
if (r->regs.start[0] || r->regs.end[0] != len) /* XXX: Why regex doesn't enforce implicit "^...$" ? */
|
|
return 0;
|
|
return 1;
|
|
}
|
|
|
|
int
|
|
rx_subst(regex *r, const char *by, const char *src, char *dest, uint destlen)
|
|
{
|
|
char *end = dest + destlen - 1;
|
|
|
|
if (!rx_match(r, src))
|
|
return 0;
|
|
|
|
while (*by)
|
|
{
|
|
if (*by == '\\')
|
|
{
|
|
by++;
|
|
if (*by >= '0' && *by <= '9') /* \0 gets replaced by entire pattern */
|
|
{
|
|
uint j = *by++ - '0';
|
|
if (j < r->regs.num_regs)
|
|
{
|
|
const char *s = src + r->regs.start[j];
|
|
uint i = r->regs.end[j] - r->regs.start[j];
|
|
if (r->regs.start[j] > r->len_cache || r->regs.end[j] > r->len_cache)
|
|
return -1;
|
|
if (dest + i >= end)
|
|
return -1;
|
|
memcpy(dest, s, i);
|
|
dest += i;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
if (dest < end)
|
|
*dest++ = *by++;
|
|
else
|
|
return -1;
|
|
}
|
|
*dest = 0;
|
|
return 1;
|
|
}
|
|
|
|
#endif
|
|
|
|
#ifdef TEST
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
regex *r;
|
|
char buf1[4096], buf2[4096];
|
|
int opt_i = 0;
|
|
|
|
if (!strcmp(argv[1], "-i"))
|
|
{
|
|
opt_i = 1;
|
|
argv++;
|
|
argc--;
|
|
}
|
|
r = rx_compile(argv[1], opt_i);
|
|
while (fgets(buf1, sizeof(buf1), stdin))
|
|
{
|
|
char *p = strchr(buf1, '\n');
|
|
if (p)
|
|
*p = 0;
|
|
if (argc == 2)
|
|
{
|
|
if (rx_match(r, buf1))
|
|
puts("MATCH");
|
|
else
|
|
puts("NO MATCH");
|
|
}
|
|
else
|
|
{
|
|
int i = rx_subst(r, argv[2], buf1, buf2, sizeof(buf2));
|
|
if (i < 0)
|
|
puts("OVERFLOW");
|
|
else if (!i)
|
|
puts("NO MATCH");
|
|
else
|
|
puts(buf2);
|
|
}
|
|
}
|
|
rx_free(r);
|
|
}
|
|
|
|
#endif
|