You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
767 lines
16 KiB
767 lines
16 KiB
3 months ago
|
/*
|
||
|
* UCW Library -- URL Functions
|
||
|
*
|
||
|
* (c) 1997--2004 Martin Mares <mj@ucw.cz>
|
||
|
* (c) 2001--2005 Robert Spalek <robert@ucw.cz>
|
||
|
*
|
||
|
* This software may be freely distributed and used according to the terms
|
||
|
* of the GNU Lesser General Public License.
|
||
|
*
|
||
|
* XXX: The buffer handling in this module is really horrible, but it works.
|
||
|
*/
|
||
|
|
||
|
#include <ucw/lib.h>
|
||
|
#include <ucw/url.h>
|
||
|
#include <ucw/chartype.h>
|
||
|
#include <ucw/conf.h>
|
||
|
#include <ucw/prime.h>
|
||
|
|
||
|
#include <string.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <stdio.h>
|
||
|
#include <alloca.h>
|
||
|
|
||
|
/* Configuration */
|
||
|
|
||
|
/*
 * Tunables of the URL module.  Unless TEST is defined, they are exported
 * through the "URL" configuration section declared below.
 */

/* Non-zero: url_deescape() drops whitespace at the very start or end of the URL. */
static uint url_ignore_spaces = 0;

/* Non-zero: relpath_merge() silently ignores ".." segments climbing above the root. */
static uint url_ignore_underflow = 0;

/* Characters which delimit components for url_has_repeated_component(). */
static char *url_component_separators = "";

/* Number of consecutive repetitions required to report a repeated component sequence. */
static uint url_min_repeat_count = 0x7fffffff;

/* Longest sequence of components (in components) examined for repetition. */
static uint url_max_repeat_length = 0;

/* How many times a single component may occur before the URL is reported. */
static uint url_max_occurences = ~0U;
|
||
|
|
||
|
#ifndef TEST
|
||
|
static struct cf_section url_config = {
|
||
|
CF_ITEMS {
|
||
|
CF_UINT("IgnoreSpaces", &url_ignore_spaces),
|
||
|
CF_UINT("IgnoreUnderflow", &url_ignore_underflow),
|
||
|
CF_STRING("ComponentSeparators", &url_component_separators),
|
||
|
CF_UINT("MinRepeatCount", &url_min_repeat_count),
|
||
|
CF_UINT("MaxRepeatLength", &url_max_repeat_length),
|
||
|
CF_UINT("MaxOccurences", &url_max_occurences),
|
||
|
CF_END
|
||
|
}
|
||
|
};
|
||
|
|
||
|
/*
 * Register url_config under the section name "URL".
 * NOTE(review): CONSTRUCTOR presumably makes this run before main() -- confirm in ucw/lib.h.
 */
static void CONSTRUCTOR
url_init_config(void)
{
  cf_declare_section("URL", &url_config, 0);
}
|
||
|
#endif
|
||
|
|
||
|
/* Escaping and de-escaping */
|
||
|
|
||
|
static uint
|
||
|
enhex(uint x)
|
||
|
{
|
||
|
return (x<10) ? (x + '0') : (x - 10 + 'A');
|
||
|
}
|
||
|
|
||
|
int
|
||
|
url_deescape(const char *s, char *d)
|
||
|
{
|
||
|
char *dstart = d;
|
||
|
char *end = d + MAX_URL_SIZE - 10;
|
||
|
while (*s)
|
||
|
{
|
||
|
if (d >= end)
|
||
|
return URL_ERR_TOO_LONG;
|
||
|
if (*s == '%')
|
||
|
{
|
||
|
uint val;
|
||
|
if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
|
||
|
return URL_ERR_INVALID_ESCAPE;
|
||
|
val = Cxvalue(s[1])*16 + Cxvalue(s[2]);
|
||
|
if (val < 0x20)
|
||
|
return URL_ERR_INVALID_ESCAPED_CHAR;
|
||
|
switch (val)
|
||
|
{
|
||
|
case ';':
|
||
|
val = NCC_SEMICOLON; break;
|
||
|
case '/':
|
||
|
val = NCC_SLASH; break;
|
||
|
case '?':
|
||
|
val = NCC_QUEST; break;
|
||
|
case ':':
|
||
|
val = NCC_COLON; break;
|
||
|
case '@':
|
||
|
val = NCC_AT; break;
|
||
|
case '=':
|
||
|
val = NCC_EQUAL; break;
|
||
|
case '&':
|
||
|
val = NCC_AND; break;
|
||
|
case '#':
|
||
|
val = NCC_HASH; break;
|
||
|
case '$':
|
||
|
val = NCC_DOLLAR; break;
|
||
|
case '+':
|
||
|
val = NCC_PLUS; break;
|
||
|
case ',':
|
||
|
val = NCC_COMMA; break;
|
||
|
}
|
||
|
*d++ = val;
|
||
|
s += 3;
|
||
|
}
|
||
|
else if ((byte) *s > 0x20)
|
||
|
*d++ = *s++;
|
||
|
else if (Cspace(*s))
|
||
|
{
|
||
|
const char *s0 = s;
|
||
|
while (Cspace(*s))
|
||
|
s++;
|
||
|
if (!url_ignore_spaces || !(!*s || d == dstart))
|
||
|
{
|
||
|
while (Cspace(*s0))
|
||
|
{
|
||
|
if (d >= end)
|
||
|
return URL_ERR_TOO_LONG;
|
||
|
*d++ = *s0++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
return URL_ERR_INVALID_CHAR;
|
||
|
}
|
||
|
*d = 0;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
int
|
||
|
url_enescape(const char *s, char *d)
|
||
|
{
|
||
|
char *end = d + MAX_URL_SIZE - 10;
|
||
|
uint c;
|
||
|
|
||
|
while (c = *s)
|
||
|
{
|
||
|
if (d >= end)
|
||
|
return URL_ERR_TOO_LONG;
|
||
|
if (Calnum(c) || /* RFC 2396 (2.1-2.3): Only alphanumerics ... */
|
||
|
c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' || /* ... and some exceptions and reserved chars */
|
||
|
c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||
|
||
|
c == ',' || c == '=' || c == '&' || c == '#' || c == ';' ||
|
||
|
c == '/' || c == '?' || c == ':' || c == '@' || c == '~'
|
||
|
)
|
||
|
*d++ = *s++;
|
||
|
else
|
||
|
{
|
||
|
uint val = (byte)(((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s);
|
||
|
*d++ = '%';
|
||
|
*d++ = enhex(val >> 4);
|
||
|
*d++ = enhex(val & 0x0f);
|
||
|
s++;
|
||
|
}
|
||
|
}
|
||
|
*d = 0;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
int
|
||
|
url_enescape_friendly(const char *src, char *dest)
|
||
|
{
|
||
|
char *end = dest + MAX_URL_SIZE - 10;
|
||
|
const byte *srcb = src;
|
||
|
while (*srcb)
|
||
|
{
|
||
|
if (dest >= end)
|
||
|
return URL_ERR_TOO_LONG;
|
||
|
if ((byte)*srcb < NCC_MAX)
|
||
|
*dest++ = NCC_CHARS[*srcb++];
|
||
|
else if (*srcb >= 0x20 && *srcb < 0x7f)
|
||
|
*dest++ = *srcb++;
|
||
|
else
|
||
|
{
|
||
|
*dest++ = '%';
|
||
|
*dest++ = enhex((byte)*srcb >> 4);
|
||
|
*dest++ = enhex(*srcb++ & 0x0f);
|
||
|
}
|
||
|
}
|
||
|
*dest = 0;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Split a URL (several parts may be copied to the destination buffer) */
|
||
|
|
||
|
char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
|
||
|
static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;
|
||
|
|
||
|
uint
|
||
|
url_identify_protocol(const char *p)
|
||
|
{
|
||
|
uint i;
|
||
|
|
||
|
for(i=1; i<URL_PROTO_MAX; i++)
|
||
|
if (!strcasecmp(p, url_proto_names[i]))
|
||
|
return i;
|
||
|
return URL_PROTO_UNKNOWN;
|
||
|
}
|
||
|
|
||
|
int
|
||
|
url_split(char *s, struct url *u, char *d)
|
||
|
{
|
||
|
bzero(u, sizeof(struct url));
|
||
|
u->port = ~0;
|
||
|
u->bufend = d + MAX_URL_SIZE - 10;
|
||
|
|
||
|
if (s[0] != '/') /* Seek for "protocol:" */
|
||
|
{
|
||
|
char *p = s;
|
||
|
while (*p && Calnum(*p))
|
||
|
p++;
|
||
|
if (p != s && *p == ':')
|
||
|
{
|
||
|
u->protocol = d;
|
||
|
while (s < p)
|
||
|
*d++ = *s++;
|
||
|
*d++ = 0;
|
||
|
u->protoid = url_identify_protocol(u->protocol);
|
||
|
s++;
|
||
|
if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
|
||
|
{
|
||
|
/* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
|
||
|
int len = d - u->protocol;
|
||
|
d -= len;
|
||
|
s -= len;
|
||
|
u->protocol = NULL;
|
||
|
u->protoid = 0;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (s[0] == '/') /* Host spec or absolute path */
|
||
|
{
|
||
|
if (s[1] == '/') /* Host spec */
|
||
|
{
|
||
|
char *q, *e;
|
||
|
char *at = NULL;
|
||
|
char *ep;
|
||
|
|
||
|
s += 2;
|
||
|
q = d;
|
||
|
while (*s && *s != '/' && *s != '?') /* Copy user:passwd@host:port */
|
||
|
{
|
||
|
if (*s != '@')
|
||
|
*d++ = *s;
|
||
|
else if (!at)
|
||
|
{
|
||
|
*d++ = 0;
|
||
|
at = d;
|
||
|
}
|
||
|
else /* This shouldn't happen with sane URL's, but we need to be sure */
|
||
|
*d++ = NCC_AT;
|
||
|
s++;
|
||
|
}
|
||
|
*d++ = 0;
|
||
|
if (at) /* user:passwd present */
|
||
|
{
|
||
|
u->user = q;
|
||
|
if (e = strchr(q, ':'))
|
||
|
{
|
||
|
*e++ = 0;
|
||
|
u->pass = e;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
at = q;
|
||
|
e = strchr(at, ':');
|
||
|
if (e) /* host:port present */
|
||
|
{
|
||
|
uint p;
|
||
|
*e++ = 0;
|
||
|
p = strtoul(e, &ep, 10);
|
||
|
if (ep && *ep || p > 65535)
|
||
|
return URL_ERR_INVALID_PORT;
|
||
|
else if (p) /* Port 0 (e.g. in :/) is treated as default port */
|
||
|
u->port = p;
|
||
|
}
|
||
|
u->host = at;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
u->rest = s;
|
||
|
u->buf = d;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Normalization according to given base URL */
|
||
|
|
||
|
/* Default port numbers, indexed by protocol ID */
static uint std_ports[] = URL_DEFPORTS;
|
||
|
|
||
|
static int
|
||
|
relpath_merge(struct url *u, struct url *b)
|
||
|
{
|
||
|
char *a = u->rest;
|
||
|
char *o = b->rest;
|
||
|
char *d = u->buf;
|
||
|
char *e = u->bufend;
|
||
|
char *p;
|
||
|
|
||
|
if (a[0] == '/') /* Absolute path => OK */
|
||
|
return 0;
|
||
|
if (o[0] != '/' && o[0] != '?')
|
||
|
return URL_PATH_UNDERFLOW;
|
||
|
|
||
|
if (!a[0]) /* Empty URL -> inherit everything */
|
||
|
{
|
||
|
u->rest = b->rest;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
u->rest = d; /* We know we'll need to copy the path somewhere else */
|
||
|
|
||
|
if (a[0] == '#') /* Another fragment */
|
||
|
{
|
||
|
for(p=o; *p && *p != '#'; p++)
|
||
|
;
|
||
|
goto copy;
|
||
|
}
|
||
|
if (a[0] == '?') /* New query */
|
||
|
{
|
||
|
for(p=o; *p && *p != '#' && *p != '?'; p++)
|
||
|
;
|
||
|
goto copy;
|
||
|
}
|
||
|
|
||
|
p = NULL; /* Copy original path and find the last slash */
|
||
|
while (*o && *o != '?' && *o != '#')
|
||
|
{
|
||
|
if (d >= e)
|
||
|
return URL_ERR_TOO_LONG;
|
||
|
if ((*d++ = *o++) == '/')
|
||
|
p = d;
|
||
|
}
|
||
|
if (!p)
|
||
|
return URL_ERR_REL_NOTHING;
|
||
|
d = p;
|
||
|
|
||
|
while (*a)
|
||
|
{
|
||
|
if (a[0] == '.')
|
||
|
{
|
||
|
if (a[1] == '/' || !a[1]) /* Skip "./" and ".$" */
|
||
|
{
|
||
|
a++;
|
||
|
if (a[0])
|
||
|
a++;
|
||
|
continue;
|
||
|
}
|
||
|
else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
|
||
|
{
|
||
|
a += 2;
|
||
|
if (a[0])
|
||
|
a++;
|
||
|
if (d <= u->buf + 1)
|
||
|
{
|
||
|
/*
|
||
|
* RFC 1808 says we should leave ".." as a path segment, but
|
||
|
* we intentionally break the rule and refuse the URL.
|
||
|
*/
|
||
|
if (!url_ignore_underflow)
|
||
|
return URL_PATH_UNDERFLOW;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
d--; /* Discard trailing slash */
|
||
|
while (d[-1] != '/')
|
||
|
d--;
|
||
|
}
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
while (a[0] && a[0] != '/')
|
||
|
{
|
||
|
if (d >= e)
|
||
|
return URL_ERR_TOO_LONG;
|
||
|
*d++ = *a++;
|
||
|
}
|
||
|
if (a[0])
|
||
|
*d++ = *a++;
|
||
|
}
|
||
|
|
||
|
okay:
|
||
|
*d++ = 0;
|
||
|
u->buf = d;
|
||
|
return 0;
|
||
|
|
||
|
copy: /* Combine part of old URL with the new one */
|
||
|
while (o < p)
|
||
|
if (d < e)
|
||
|
*d++ = *o++;
|
||
|
else
|
||
|
return URL_ERR_TOO_LONG;
|
||
|
while (*a)
|
||
|
if (d < e)
|
||
|
*d++ = *a++;
|
||
|
else
|
||
|
return URL_ERR_TOO_LONG;
|
||
|
goto okay;
|
||
|
}
|
||
|
|
||
|
int
|
||
|
url_normalize(struct url *u, struct url *b)
|
||
|
{
|
||
|
int err;
|
||
|
|
||
|
/* Basic checks */
|
||
|
if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) ||
|
||
|
!u->host && u->user ||
|
||
|
!u->user && u->pass ||
|
||
|
!u->rest)
|
||
|
return URL_SYNTAX_ERROR;
|
||
|
|
||
|
if (!u->protocol)
|
||
|
{
|
||
|
/* Now we know it's a relative URL. Do we have any base? */
|
||
|
if (!b || !url_proto_path_flags[b->protoid])
|
||
|
return URL_ERR_REL_NOTHING;
|
||
|
u->protocol = b->protocol;
|
||
|
u->protoid = b->protoid;
|
||
|
|
||
|
/* Reference to the same host */
|
||
|
if (!u->host)
|
||
|
{
|
||
|
u->host = b->host;
|
||
|
u->user = b->user;
|
||
|
u->pass = b->pass;
|
||
|
u->port = b->port;
|
||
|
if (err = relpath_merge(u, b))
|
||
|
return err;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Change path "?" to "/?" because it's the true meaning */
|
||
|
if (u->rest[0] == '?')
|
||
|
{
|
||
|
int l = strlen(u->rest);
|
||
|
if (u->bufend - u->buf < l+1)
|
||
|
return URL_ERR_TOO_LONG;
|
||
|
u->buf[0] = '/';
|
||
|
memcpy(u->buf+1, u->rest, l+1);
|
||
|
u->rest = u->buf;
|
||
|
u->buf += l+2;
|
||
|
}
|
||
|
|
||
|
/* Fill in missing info */
|
||
|
if (u->port == ~0U)
|
||
|
u->port = std_ports[u->protoid];
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Name canonicalization */
|
||
|
|
||
|
/* Convert ASCII upper-case letters of @b to lower case in place; NULL is accepted and ignored. */
static void
lowercase(char *b)
{
  char *p;

  if (!b)
    return;
  for (p = b; *p; p++)
    if (*p >= 'A' && *p <= 'Z')
      *p += 0x20;
}
|
||
|
|
||
|
/*
 * Remove trailing dots from the string @b in place.  The first character
 * is never removed, so "." stays "." and "..." becomes ".".
 * NULL and the empty string are accepted and left alone.
 */
static void
kill_end_dot(char *b)
{
  size_t len;

  if (!b)
    return;
  /* Work with an index, so that no pointer before the start of @b is ever formed */
  len = strlen(b);
  while (len > 1 && b[len-1] == '.')
    b[--len] = 0;
}
|
||
|
|
||
|
int
|
||
|
url_canonicalize(struct url *u)
|
||
|
{
|
||
|
char *c;
|
||
|
|
||
|
lowercase(u->protocol);
|
||
|
lowercase(u->host);
|
||
|
kill_end_dot(u->host);
|
||
|
if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid])
|
||
|
u->rest = "/";
|
||
|
if (u->rest && (c = strchr(u->rest, '#'))) /* Kill fragment reference */
|
||
|
*c = 0;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Pack a broken-down URL */
|
||
|
|
||
|
/*
 * Append the string @s (without its terminating zero) at @d, never
 * writing at or behind @e.  Returns the new end of the output, or NULL if
 * the string didn't fit.  A NULL @d is passed through, so that calls can
 * be chained and the overflow checked only once at the end.
 */
static char *
append(char *d, const char *s, char *e)
{
  if (!d)
    return NULL;
  for (; *s; s++)
    {
      if (d >= e)
        return NULL;
      *d++ = *s;
    }
  return d;
}
|
||
|
|
||
|
int
|
||
|
url_pack(struct url *u, char *d)
|
||
|
{
|
||
|
char *e = d + MAX_URL_SIZE - 10;
|
||
|
|
||
|
if (u->protocol)
|
||
|
{
|
||
|
d = append(d, u->protocol, e);
|
||
|
d = append(d, ":", e);
|
||
|
u->protoid = url_identify_protocol(u->protocol);
|
||
|
}
|
||
|
if (u->host)
|
||
|
{
|
||
|
d = append(d, "//", e);
|
||
|
if (u->user)
|
||
|
{
|
||
|
d = append(d, u->user, e);
|
||
|
if (u->pass)
|
||
|
{
|
||
|
d = append(d, ":", e);
|
||
|
d = append(d, u->pass, e);
|
||
|
}
|
||
|
d = append(d, "@", e);
|
||
|
}
|
||
|
d = append(d, u->host, e);
|
||
|
if (u->port != std_ports[u->protoid] && u->port != ~0U)
|
||
|
{
|
||
|
char z[10];
|
||
|
sprintf(z, "%d", u->port);
|
||
|
d = append(d, ":", e);
|
||
|
d = append(d, z, e);
|
||
|
}
|
||
|
}
|
||
|
if (u->rest)
|
||
|
d = append(d, u->rest, e);
|
||
|
if (!d)
|
||
|
return URL_ERR_TOO_LONG;
|
||
|
*d = 0;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Error messages */
|
||
|
|
||
|
/* Human-readable messages indexed by error code; entry 0 also serves unknown codes (see url_error()). */
static char *errmsg[] = {
  [0] = "Something is wrong",
  [1] = "Too long",
  [2] = "Invalid character",
  [3] = "Invalid escape",
  [4] = "Invalid escaped character",
  [5] = "Invalid port number",
  [6] = "Relative URL not allowed",
  [7] = "Unknown protocol",
  [8] = "Syntax error",
  [9] = "Path underflow"
};
|
||
|
|
||
|
char *
|
||
|
url_error(uint err)
|
||
|
{
|
||
|
if (err >= sizeof(errmsg) / sizeof(char *))
|
||
|
err = 0;
|
||
|
return errmsg[err];
|
||
|
}
|
||
|
|
||
|
/* Standard cookbook recipes */
|
||
|
|
||
|
/*
 * Cookbook recipe: de-escape @u to @buf1, split it to @url (using @buf2
 * for the copied components), normalize it relative to @base and
 * canonicalize it.  Stops at the first failing step and returns its error
 * code; returns the result of url_canonicalize() otherwise.
 */
int
url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base)
{
  int err = url_deescape(u, buf1);

  if (!err)
    err = url_split(buf1, url, buf2);
  if (!err)
    err = url_normalize(url, base);
  if (!err)
    err = url_canonicalize(url);
  return err;
}
|
||
|
|
||
|
int
|
||
|
url_auto_canonicalize_rel(const char *src, char *dst, struct url *base)
|
||
|
{
|
||
|
char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
|
||
|
int err;
|
||
|
struct url ur;
|
||
|
|
||
|
(void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) ||
|
||
|
(err = url_pack(&ur, buf3)) ||
|
||
|
(err = url_enescape(buf3, dst)));
|
||
|
return err;
|
||
|
}
|
||
|
|
||
|
/* Testing */
|
||
|
|
||
|
#ifdef TEST
|
||
|
|
||
|
int main(int argc, char **argv)
|
||
|
{
|
||
|
char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
|
||
|
int err;
|
||
|
struct url url, url0;
|
||
|
char *base = "http://mj@www.hell.org/123/sub_dir;param/index.html;param?query&zzz/sub;query+#fragment?";
|
||
|
|
||
|
if (argc != 2 && argc != 3)
|
||
|
return 1;
|
||
|
if (argc == 3)
|
||
|
base = argv[2];
|
||
|
if (err = url_deescape(argv[1], buf1))
|
||
|
{
|
||
|
printf("deesc: error %d\n", err);
|
||
|
return 1;
|
||
|
}
|
||
|
printf("deesc: %s\n", buf1);
|
||
|
if (err = url_split(buf1, &url, buf2))
|
||
|
{
|
||
|
printf("split: error %d\n", err);
|
||
|
return 1;
|
||
|
}
|
||
|
printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
|
||
|
if (err = url_split(base, &url0, buf3))
|
||
|
{
|
||
|
printf("split base: error %d\n", err);
|
||
|
return 1;
|
||
|
}
|
||
|
if (err = url_normalize(&url0, NULL))
|
||
|
{
|
||
|
printf("normalize base: error %d\n", err);
|
||
|
return 1;
|
||
|
}
|
||
|
printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);
|
||
|
if (err = url_normalize(&url, &url0))
|
||
|
{
|
||
|
printf("normalize: error %d\n", err);
|
||
|
return 1;
|
||
|
}
|
||
|
printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
|
||
|
if (err = url_canonicalize(&url))
|
||
|
{
|
||
|
printf("canonicalize: error %d\n", err);
|
||
|
return 1;
|
||
|
}
|
||
|
printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);
|
||
|
if (err = url_pack(&url, buf4))
|
||
|
{
|
||
|
printf("pack: error %d\n", err);
|
||
|
return 1;
|
||
|
}
|
||
|
printf("pack: %s\n", buf4);
|
||
|
if (err = url_enescape(buf4, buf2))
|
||
|
{
|
||
|
printf("enesc: error %d\n", err);
|
||
|
return 1;
|
||
|
}
|
||
|
printf("enesc: %s\n", buf2);
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
#endif
|
||
|
|
||
|
struct component {
|
||
|
const char *start;
|
||
|
int length;
|
||
|
uint count;
|
||
|
u32 hash;
|
||
|
};
|
||
|
|
||
|
static inline u32
|
||
|
hashf(const char *start, int length)
|
||
|
{
|
||
|
u32 hf = length;
|
||
|
while (length-- > 0)
|
||
|
hf = (hf << 8 | hf >> 24) ^ *start++;
|
||
|
return hf;
|
||
|
}
|
||
|
|
||
|
static inline uint
|
||
|
repeat_count(struct component *comp, uint count, uint len)
|
||
|
{
|
||
|
struct component *orig_comp = comp;
|
||
|
uint found = 0;
|
||
|
while (1)
|
||
|
{
|
||
|
uint i;
|
||
|
comp += len;
|
||
|
count -= len;
|
||
|
found++;
|
||
|
if (count < len)
|
||
|
return found;
|
||
|
for (i=0; i<len; i++)
|
||
|
if (comp[i].hash != orig_comp[i].hash
|
||
|
|| comp[i].length != orig_comp[i].length
|
||
|
|| memcmp(comp[i].start, orig_comp[i].start, comp[i].length))
|
||
|
return found;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int
|
||
|
url_has_repeated_component(const char *url)
|
||
|
{
|
||
|
struct component *comp;
|
||
|
uint comps, comp_len, rep_prefix, hash_size, *hash, *next;
|
||
|
const char *c;
|
||
|
uint i, j, k;
|
||
|
|
||
|
for (comps=0, c=url; c; comps++)
|
||
|
{
|
||
|
c = strpbrk(c, url_component_separators);
|
||
|
if (c)
|
||
|
c++;
|
||
|
}
|
||
|
if (comps < url_min_repeat_count && comps <= url_max_occurences)
|
||
|
return 0;
|
||
|
comp = alloca(comps * sizeof(*comp));
|
||
|
for (i=0, c=url; c; i++)
|
||
|
{
|
||
|
comp[i].start = c;
|
||
|
c = strpbrk(c, url_component_separators);
|
||
|
if (c)
|
||
|
{
|
||
|
comp[i].length = c - comp[i].start;
|
||
|
c++;
|
||
|
}
|
||
|
else
|
||
|
comp[i].length = strlen(comp[i].start);
|
||
|
}
|
||
|
ASSERT(i == comps);
|
||
|
for (i=0; i<comps; i++)
|
||
|
comp[i].hash = hashf(comp[i].start, comp[i].length);
|
||
|
if (comps > url_max_occurences)
|
||
|
{
|
||
|
hash_size = next_table_prime(comps);
|
||
|
hash = alloca(hash_size * sizeof(*hash));
|
||
|
next = alloca(comps * sizeof(*next));
|
||
|
memset(hash, 255, hash_size * sizeof(*hash));
|
||
|
for (i=0; i<comps; i++)
|
||
|
{
|
||
|
j = comp[i].hash % hash_size;
|
||
|
for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
|
||
|
memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
|
||
|
if (!~k)
|
||
|
{
|
||
|
next[i] = hash[j];
|
||
|
hash[j] = i;
|
||
|
comp[i].count = 1;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if (comp[k].count++ >= url_max_occurences)
|
||
|
return 1;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
|
||
|
for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
|
||
|
if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
|
||
|
return comp_len;
|
||
|
return 0;
|
||
|
}
|