/* * UCW Library -- URL Functions * * (c) 1997--2004 Martin Mares * (c) 2001--2005 Robert Spalek * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. * * XXX: The buffer handling in this module is really horrible, but it works. */ #include #include #include #include #include #include #include #include #include /* Configuration */ static uint url_ignore_spaces; static uint url_ignore_underflow; static char *url_component_separators = ""; static uint url_min_repeat_count = 0x7fffffff; static uint url_max_repeat_length = 0; static uint url_max_occurences = ~0U; #ifndef TEST static struct cf_section url_config = { CF_ITEMS { CF_UINT("IgnoreSpaces", &url_ignore_spaces), CF_UINT("IgnoreUnderflow", &url_ignore_underflow), CF_STRING("ComponentSeparators", &url_component_separators), CF_UINT("MinRepeatCount", &url_min_repeat_count), CF_UINT("MaxRepeatLength", &url_max_repeat_length), CF_UINT("MaxOccurences", &url_max_occurences), CF_END } }; static void CONSTRUCTOR url_init_config(void) { cf_declare_section("URL", &url_config, 0); } #endif /* Escaping and de-escaping */ static uint enhex(uint x) { return (x<10) ? (x + '0') : (x - 10 + 'A'); } int url_deescape(const char *s, char *d) { char *dstart = d; char *end = d + MAX_URL_SIZE - 10; while (*s) { if (d >= end) return URL_ERR_TOO_LONG; if (*s == '%') { uint val; if (!Cxdigit(s[1]) || !Cxdigit(s[2])) return URL_ERR_INVALID_ESCAPE; val = Cxvalue(s[1])*16 + Cxvalue(s[2]); if (val < 0x20) return URL_ERR_INVALID_ESCAPED_CHAR; switch (val) { case ';': val = NCC_SEMICOLON; break; case '/': val = NCC_SLASH; break; case '?': val = NCC_QUEST; break; case ':': val = NCC_COLON; break; case '@': val = NCC_AT; break; case '=': val = NCC_EQUAL; break; case '&': val = NCC_AND; break; case '#': val = NCC_HASH; break; case '$': val = NCC_DOLLAR; break; case '+': val = NCC_PLUS; break; case ',': val = NCC_COMMA; break; } *d++ = val; s += 3; } else if ((byte) *s > 0x20) *d++ = *s++; else if (Cspace(*s)) { const char *s0 = s; while (Cspace(*s)) s++; if (!url_ignore_spaces || !(!*s || d == dstart)) { while (Cspace(*s0)) { if (d >= end) return URL_ERR_TOO_LONG; *d++ = *s0++; } } } else return URL_ERR_INVALID_CHAR; } *d = 0; return 0; } int url_enescape(const char *s, char *d) { char *end = d + MAX_URL_SIZE - 10; uint c; while (c = *s) { if (d >= end) return URL_ERR_TOO_LONG; if (Calnum(c) || /* RFC 2396 (2.1-2.3): Only alphanumerics ... */ c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' || /* ... and some exceptions and reserved chars */ c == '$' || c == '-' || c == '_' || c == '.' || c == '+' || c == ',' || c == '=' || c == '&' || c == '#' || c == ';' || c == '/' || c == '?' || c == ':' || c == '@' || c == '~' ) *d++ = *s++; else { uint val = (byte)(((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s); *d++ = '%'; *d++ = enhex(val >> 4); *d++ = enhex(val & 0x0f); s++; } } *d = 0; return 0; } int url_enescape_friendly(const char *src, char *dest) { char *end = dest + MAX_URL_SIZE - 10; const byte *srcb = src; while (*srcb) { if (dest >= end) return URL_ERR_TOO_LONG; if ((byte)*srcb < NCC_MAX) *dest++ = NCC_CHARS[*srcb++]; else if (*srcb >= 0x20 && *srcb < 0x7f) *dest++ = *srcb++; else { *dest++ = '%'; *dest++ = enhex((byte)*srcb >> 4); *dest++ = enhex(*srcb++ & 0x0f); } } *dest = 0; return 0; } /* Split an URL (several parts may be copied to the destination buffer) */ char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES; static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS; uint url_identify_protocol(const char *p) { uint i; for(i=1; iport = ~0; u->bufend = d + MAX_URL_SIZE - 10; if (s[0] != '/') /* Seek for "protocol:" */ { char *p = s; while (*p && Calnum(*p)) p++; if (p != s && *p == ':') { u->protocol = d; while (s < p) *d++ = *s++; *d++ = 0; u->protoid = url_identify_protocol(u->protocol); s++; if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/')) { /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */ int len = d - u->protocol; d -= len; s -= len; u->protocol = NULL; u->protoid = 0; } } } if (s[0] == '/') /* Host spec or absolute path */ { if (s[1] == '/') /* Host spec */ { char *q, *e; char *at = NULL; char *ep; s += 2; q = d; while (*s && *s != '/' && *s != '?') /* Copy user:passwd@host:port */ { if (*s != '@') *d++ = *s; else if (!at) { *d++ = 0; at = d; } else /* This shouldn't happen with sane URL's, but we need to be sure */ *d++ = NCC_AT; s++; } *d++ = 0; if (at) /* user:passwd present */ { u->user = q; if (e = strchr(q, ':')) { *e++ = 0; u->pass = e; } } else at = q; e = strchr(at, ':'); if (e) /* host:port present */ { uint p; *e++ = 0; p = strtoul(e, &ep, 10); if (ep && *ep || p > 65535) return URL_ERR_INVALID_PORT; else if (p) /* Port 0 (e.g. in :/) is treated as default port */ u->port = p; } u->host = at; } } u->rest = s; u->buf = d; return 0; } /* Normalization according to given base URL */ static uint std_ports[] = URL_DEFPORTS; /* Default port numbers */ static int relpath_merge(struct url *u, struct url *b) { char *a = u->rest; char *o = b->rest; char *d = u->buf; char *e = u->bufend; char *p; if (a[0] == '/') /* Absolute path => OK */ return 0; if (o[0] != '/' && o[0] != '?') return URL_PATH_UNDERFLOW; if (!a[0]) /* Empty URL -> inherit everything */ { u->rest = b->rest; return 0; } u->rest = d; /* We know we'll need to copy the path somewhere else */ if (a[0] == '#') /* Another fragment */ { for(p=o; *p && *p != '#'; p++) ; goto copy; } if (a[0] == '?') /* New query */ { for(p=o; *p && *p != '#' && *p != '?'; p++) ; goto copy; } p = NULL; /* Copy original path and find the last slash */ while (*o && *o != '?' && *o != '#') { if (d >= e) return URL_ERR_TOO_LONG; if ((*d++ = *o++) == '/') p = d; } if (!p) return URL_ERR_REL_NOTHING; d = p; while (*a) { if (a[0] == '.') { if (a[1] == '/' || !a[1]) /* Skip "./" and ".$" */ { a++; if (a[0]) a++; continue; } else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */ { a += 2; if (a[0]) a++; if (d <= u->buf + 1) { /* * RFC 1808 says we should leave ".." as a path segment, but * we intentionally break the rule and refuse the URL. */ if (!url_ignore_underflow) return URL_PATH_UNDERFLOW; } else { d--; /* Discard trailing slash */ while (d[-1] != '/') d--; } continue; } } while (a[0] && a[0] != '/') { if (d >= e) return URL_ERR_TOO_LONG; *d++ = *a++; } if (a[0]) *d++ = *a++; } okay: *d++ = 0; u->buf = d; return 0; copy: /* Combine part of old URL with the new one */ while (o < p) if (d < e) *d++ = *o++; else return URL_ERR_TOO_LONG; while (*a) if (d < e) *d++ = *a++; else return URL_ERR_TOO_LONG; goto okay; } int url_normalize(struct url *u, struct url *b) { int err; /* Basic checks */ if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host) || !u->host && u->user || !u->user && u->pass || !u->rest) return URL_SYNTAX_ERROR; if (!u->protocol) { /* Now we know it's a relative URL. Do we have any base? */ if (!b || !url_proto_path_flags[b->protoid]) return URL_ERR_REL_NOTHING; u->protocol = b->protocol; u->protoid = b->protoid; /* Reference to the same host */ if (!u->host) { u->host = b->host; u->user = b->user; u->pass = b->pass; u->port = b->port; if (err = relpath_merge(u, b)) return err; } } /* Change path "?" to "/?" because it's the true meaning */ if (u->rest[0] == '?') { int l = strlen(u->rest); if (u->bufend - u->buf < l+1) return URL_ERR_TOO_LONG; u->buf[0] = '/'; memcpy(u->buf+1, u->rest, l+1); u->rest = u->buf; u->buf += l+2; } /* Fill in missing info */ if (u->port == ~0U) u->port = std_ports[u->protoid]; return 0; } /* Name canonicalization */ static void lowercase(char *b) { if (b) while (*b) { if (*b >= 'A' && *b <= 'Z') *b = *b + 0x20; b++; } } static void kill_end_dot(char *b) { char *k; if (b) { k = b + strlen(b) - 1; while (k > b && *k == '.') *k-- = 0; } } int url_canonicalize(struct url *u) { char *c; lowercase(u->protocol); lowercase(u->host); kill_end_dot(u->host); if ((!u->rest || !*u->rest) && url_proto_path_flags[u->protoid]) u->rest = "/"; if (u->rest && (c = strchr(u->rest, '#'))) /* Kill fragment reference */ *c = 0; return 0; } /* Pack a broken-down URL */ static char * append(char *d, const char *s, char *e) { if (d) while (*s) { if (d >= e) return NULL; *d++ = *s++; } return d; } int url_pack(struct url *u, char *d) { char *e = d + MAX_URL_SIZE - 10; if (u->protocol) { d = append(d, u->protocol, e); d = append(d, ":", e); u->protoid = url_identify_protocol(u->protocol); } if (u->host) { d = append(d, "//", e); if (u->user) { d = append(d, u->user, e); if (u->pass) { d = append(d, ":", e); d = append(d, u->pass, e); } d = append(d, "@", e); } d = append(d, u->host, e); if (u->port != std_ports[u->protoid] && u->port != ~0U) { char z[10]; sprintf(z, "%d", u->port); d = append(d, ":", e); d = append(d, z, e); } } if (u->rest) d = append(d, u->rest, e); if (!d) return URL_ERR_TOO_LONG; *d = 0; return 0; } /* Error messages */ static char *errmsg[] = { "Something is wrong", "Too long", "Invalid character", "Invalid escape", "Invalid escaped character", "Invalid port number", "Relative URL not allowed", "Unknown protocol", "Syntax error", "Path underflow" }; char * url_error(uint err) { if (err >= sizeof(errmsg) / sizeof(char *)) err = 0; return errmsg[err]; } /* Standard cookbook recipes */ int url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base) { int err; if (err = url_deescape(u, buf1)) return err; if (err = url_split(buf1, url, buf2)) return err; if (err = url_normalize(url, base)) return err; return url_canonicalize(url); } int url_auto_canonicalize_rel(const char *src, char *dst, struct url *base) { char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE]; int err; struct url ur; (void)((err = url_canon_split_rel(src, buf1, buf2, &ur, base)) || (err = url_pack(&ur, buf3)) || (err = url_enescape(buf3, dst))); return err; } /* Testing */ #ifdef TEST int main(int argc, char **argv) { char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE]; int err; struct url url, url0; char *base = "http://mj@www.hell.org/123/sub_dir;param/index.html;param?query&zzz/sub;query+#fragment?"; if (argc != 2 && argc != 3) return 1; if (argc == 3) base = argv[2]; if (err = url_deescape(argv[1], buf1)) { printf("deesc: error %d\n", err); return 1; } printf("deesc: %s\n", buf1); if (err = url_split(buf1, &url, buf2)) { printf("split: error %d\n", err); return 1; } printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest); if (err = url_split(base, &url0, buf3)) { printf("split base: error %d\n", err); return 1; } if (err = url_normalize(&url0, NULL)) { printf("normalize base: error %d\n", err); return 1; } printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest); if (err = url_normalize(&url, &url0)) { printf("normalize: error %d\n", err); return 1; } printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest); if (err = url_canonicalize(&url)) { printf("canonicalize: error %d\n", err); return 1; } printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest); if (err = url_pack(&url, buf4)) { printf("pack: error %d\n", err); return 1; } printf("pack: %s\n", buf4); if (err = url_enescape(buf4, buf2)) { printf("enesc: error %d\n", err); return 1; } printf("enesc: %s\n", buf2); return 0; } #endif struct component { const char *start; int length; uint count; u32 hash; }; static inline u32 hashf(const char *start, int length) { u32 hf = length; while (length-- > 0) hf = (hf << 8 | hf >> 24) ^ *start++; return hf; } static inline uint repeat_count(struct component *comp, uint count, uint len) { struct component *orig_comp = comp; uint found = 0; while (1) { uint i; comp += len; count -= len; found++; if (count < len) return found; for (i=0; i url_max_occurences) { hash_size = next_table_prime(comps); hash = alloca(hash_size * sizeof(*hash)); next = alloca(comps * sizeof(*next)); memset(hash, 255, hash_size * sizeof(*hash)); for (i=0; i= url_max_occurences) return 1; } } } for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++) for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++) if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count) return comp_len; return 0; }