sksp2024-mcu/libucw/ucw/url.c


								/*

								 *	UCW Library -- URL Functions

								 *

								 *	(c) 1997--2004 Martin Mares <mj@ucw.cz>

								 *	(c) 2001--2005 Robert Spalek <robert@ucw.cz>

								 *

								 *	This software may be freely distributed and used according to the terms

								 *	of the GNU Lesser General Public License.

								 *

								 *	XXX: The buffer handling in this module is really horrible, but it works.

								 */


								#include <ucw/lib.h>

								#include <ucw/url.h>

								#include <ucw/chartype.h>

								#include <ucw/conf.h>

								#include <ucw/prime.h>


								#include <string.h>

								#include <stdlib.h>

								#include <stdio.h>

								#include <alloca.h>


								/* Configuration */


								/* Non-zero: url_deescape() drops whitespace runs found at the very start or very end of a URL. */
								static uint url_ignore_spaces;
								/* Non-zero: relpath_merge() silently ignores a ".." that would climb above the root instead of failing. */
								static uint url_ignore_underflow;
								/* Set of characters handed to strpbrk() to cut a URL into components; empty means a single component. */
								static char *url_component_separators = "";
								/* url_has_repeated_component() reports a sequence once it repeats at least this many times in a row. */
								static uint url_min_repeat_count = 0x7fffffff;
								/* Longest sequence (counted in components) that is searched for repetition; 0 skips the search. */
								static uint url_max_repeat_length = 0;
								/* Upper limit on how many times one identical component may occur in a URL. */
								static uint url_max_occurences = ~0U;


								#ifndef TEST

								/*
								 * Configuration section description: pairs each item name with the
								 * address of the static tunable it sets.
								 */
								static struct cf_section url_config = {
								  CF_ITEMS {
								    CF_UINT("IgnoreSpaces", &url_ignore_spaces),
								    CF_UINT("IgnoreUnderflow", &url_ignore_underflow),
								    CF_STRING("ComponentSeparators", &url_component_separators),
								    CF_UINT("MinRepeatCount", &url_min_repeat_count),
								    CF_UINT("MaxRepeatLength", &url_max_repeat_length),
								    CF_UINT("MaxOccurences", &url_max_occurences),
								    CF_END
								  }
								};


								/*
								 * Declares url_config under the section name "URL".
								 * NOTE(review): CONSTRUCTOR presumably makes this run automatically at
								 * program start-up -- confirm against its definition in ucw/lib.h.
								 */
								static void CONSTRUCTOR url_init_config(void)
								{
								  cf_declare_section("URL", &url_config, 0);
								}

								#endif


								/* Escaping and de-escaping */


								/* Convert a nibble value (0..15) to its upper-case hexadecimal digit character. */
								static uint
								enhex(uint x)
								{
								  if (x < 10)
								    return x + '0';
								  return x - 10 + 'A';
								}


								/*
								 * Decode %XX escapes of the URL `s' into the buffer `d'.
								 *
								 * Escaped reserved characters (; / ? : @ = & # $ + ,) are stored as their
								 * NCC_* codes rather than as the literal character; presumably these are
								 * placeholder values that keep them distinguishable from unescaped
								 * reserved characters (url_enescape() maps values below NCC_MAX back
								 * through NCC_CHARS).
								 *
								 * Whitespace runs are copied verbatim, except that with url_ignore_spaces
								 * set a run at the very beginning or very end of the URL is dropped.
								 *
								 * Returns 0 on success, or URL_ERR_TOO_LONG, URL_ERR_INVALID_ESCAPE,
								 * URL_ERR_INVALID_ESCAPED_CHAR or URL_ERR_INVALID_CHAR.
								 */
								int
								url_deescape(const char *s, char *d)
								{
								  char * const out_start = d;
								  char * const out_limit = d + MAX_URL_SIZE - 10;	/* keep some slack at the end */

								  while (*s)
								    {
								      if (d >= out_limit)
								        return URL_ERR_TOO_LONG;

								      if (*s == '%')
								        {
								          uint code;

								          if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
								            return URL_ERR_INVALID_ESCAPE;
								          code = Cxvalue(s[1])*16 + Cxvalue(s[2]);
								          if (code < 0x20)		/* escaped control characters are refused */
								            return URL_ERR_INVALID_ESCAPED_CHAR;
								          switch (code)
								            {
								            case ';': code = NCC_SEMICOLON; break;
								            case '/': code = NCC_SLASH; break;
								            case '?': code = NCC_QUEST; break;
								            case ':': code = NCC_COLON; break;
								            case '@': code = NCC_AT; break;
								            case '=': code = NCC_EQUAL; break;
								            case '&': code = NCC_AND; break;
								            case '#': code = NCC_HASH; break;
								            case '$': code = NCC_DOLLAR; break;
								            case '+': code = NCC_PLUS; break;
								            case ',': code = NCC_COMMA; break;
								            }
								          *d++ = code;
								          s += 3;
								          continue;
								        }

								      if ((byte) *s > 0x20)		/* ordinary character */
								        {
								          *d++ = *s++;
								          continue;
								        }

								      if (!Cspace(*s))			/* control character which is not whitespace */
								        return URL_ERR_INVALID_CHAR;

								      {
								        const char *run = s;		/* first character of the whitespace run */

								        while (Cspace(*s))
								          s++;
								        /* Leading or trailing run and we were asked to ignore those */
								        if (url_ignore_spaces && (!*s || d == out_start))
								          continue;
								        for (; Cspace(*run); run++)
								          {
								            if (d >= out_limit)
								              return URL_ERR_TOO_LONG;
								            *d++ = *run;
								          }
								      }
								    }
								  *d = 0;
								  return 0;
								}


								int

								url_enescape(const char *s, char *d)

								{

								  char *end = d + MAX_URL_SIZE - 10;

								  uint c;


								  while (c = *s)

								    {

								      if (d >= end)

									return URL_ERR_TOO_LONG;

								      if (Calnum(c) ||							/* RFC 2396 (2.1-2.3): Only alphanumerics ... */

									  c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||	/* ... and some exceptions and reserved chars */

									  c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||

									  c == ',' || c == '=' || c == '&' || c == '#' || c == ';' ||

									  c == '/' || c == '?' || c == ':' || c == '@' || c == '~'

									)

									*d++ = *s++;

								      else

									{

									  uint val = (byte)(((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s);

									  *d++ = '%';

									  *d++ = enhex(val >> 4);

									  *d++ = enhex(val & 0x0f);

									  s++;

									}

								    }

								  *d = 0;

								  return 0;

								}


								/*
								 * Produce a human-friendly form of the URL `src' in `dest'.
								 *
								 * Bytes below NCC_MAX are replaced by the character NCC_CHARS assigns to
								 * them, bytes in the range 0x20..0x7e are copied unchanged, and all
								 * remaining bytes are written as %XX.
								 *
								 * Returns 0 on success or URL_ERR_TOO_LONG.
								 */
								int
								url_enescape_friendly(const char *src, char *dest)
								{
								  char *end = dest + MAX_URL_SIZE - 10;
								  /* Explicit cast: char and byte pointers are not assignment-compatible */
								  const byte *srcb = (const byte *) src;

								  while (*srcb)
								    {
								      if (dest >= end)
								        return URL_ERR_TOO_LONG;
								      if (*srcb < NCC_MAX)
								        *dest++ = NCC_CHARS[*srcb++];
								      else if (*srcb >= 0x20 && *srcb < 0x7f)
								        *dest++ = *srcb++;
								      else
								        {
								          *dest++ = '%';
								          *dest++ = enhex(*srcb >> 4);
								          *dest++ = enhex(*srcb++ & 0x0f);
								        }
								    }
								  *dest = 0;
								  return 0;
								}


								/* Split an URL (several parts may be copied to the destination buffer) */


								/* Protocol names indexed by protocol id; searched by url_identify_protocol(). */
								char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
								/*
								 * Per-protocol flag, indexed by protocol id. Non-zero means the protocol
								 * needs a "//host" part: url_split() and url_normalize() test it for that.
								 */
								static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;


								/*
								 * Look up the protocol name `p' (case-insensitively) in url_proto_names.
								 * Index 0 is never matched. Returns the protocol id or URL_PROTO_UNKNOWN.
								 */
								uint
								url_identify_protocol(const char *p)
								{
								  uint id = 1;

								  while (id < URL_PROTO_MAX)
								    {
								      if (strcasecmp(p, url_proto_names[id]) == 0)
								        return id;
								      id++;
								    }
								  return URL_PROTO_UNKNOWN;
								}


								/*
								 * Split the (already de-escaped) URL `s' into its parts.
								 *
								 * The protocol, user, password and host are copied as NUL-terminated
								 * strings into the buffer `d'; u->rest points into `s' at whatever follows
								 * the host spec. u->buf is left pointing at the first unused byte of `d'
								 * and u->bufend at d + MAX_URL_SIZE - 10. u->port stays ~0 when no
								 * (non-zero) port was given.
								 *
								 * Returns 0 on success or URL_ERR_INVALID_PORT.
								 */
								int
								url_split(char *s, struct url *u, char *d)
								{
								  memset(u, 0, sizeof(*u));
								  u->port = ~0;
								  u->bufend = d + MAX_URL_SIZE - 10;

								  if (s[0] != '/')			/* Seek for "protocol:" */
								    {
								      char *p = s;
								      while (*p && Calnum(*p))
								        p++;
								      if (p != s && *p == ':')
								        {
								          u->protocol = d;
								          while (s < p)
								            *d++ = *s++;
								          *d++ = 0;
								          u->protoid = url_identify_protocol(u->protocol);
								          s++;				/* Skip the colon */
								          if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
								            {
								              /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
								              int len = d - u->protocol;	/* name + NUL == name + ':' in the source */
								              d -= len;
								              s -= len;
								              u->protocol = NULL;
								              u->protoid = 0;
								            }
								        }
								    }

								  if (s[0] == '/' && s[1] == '/')	/* Host spec (a single slash is an absolute path) */
								    {
								      char *q, *e;
								      char *at = NULL;
								      char *ep;

								      s += 2;
								      q = d;
								      while (*s && *s != '/' && *s != '?')	/* Copy user:passwd@host:port */
								        {
								          if (*s != '@')
								            *d++ = *s;
								          else if (!at)
								            {
								              *d++ = 0;			/* Terminate user:passwd, host starts here */
								              at = d;
								            }
								          else				/* This shouldn't happen with sane URL's, but we need to be sure */
								            *d++ = NCC_AT;
								          s++;
								        }
								      *d++ = 0;
								      if (at)				/* user:passwd present */
								        {
								          u->user = q;
								          e = strchr(q, ':');
								          if (e)
								            {
								              *e++ = 0;
								              u->pass = e;
								            }
								        }
								      else
								        at = q;
								      e = strchr(at, ':');
								      if (e)				/* host:port present */
								        {
								          /*
								           * Keep the full width of strtoul()'s result: narrowing it
								           * before the range check would let huge numbers wrap around
								           * to a seemingly valid port.
								           */
								          unsigned long p;
								          *e++ = 0;
								          p = strtoul(e, &ep, 10);
								          if ((ep && *ep) || p > 65535)
								            return URL_ERR_INVALID_PORT;
								          else if (p)			/* Port 0 (e.g. in :/) is treated as default port */
								            u->port = p;
								        }
								      u->host = at;
								    }

								  u->rest = s;
								  u->buf = d;
								  return 0;
								}


								/* Normalization according to given base URL */


								/*
								 * Indexed by protocol id. url_normalize() uses it to fill in a missing
								 * port and url_pack() to leave out a port equal to the default.
								 */
								static uint std_ports[] = URL_DEFPORTS;	/* Default port numbers */


								/*
								 * Resolve the relative path u->rest against the base path b->rest.
								 *
								 * When a new path has to be built, it is written to u->buf (bounded by
								 * u->bufend), u->rest is pointed at it and u->buf is advanced past its
								 * terminating NUL. An absolute u->rest is left untouched and an empty one
								 * simply inherits b->rest.
								 *
								 * Returns 0 on success, or URL_PATH_UNDERFLOW, URL_ERR_TOO_LONG or
								 * URL_ERR_REL_NOTHING.
								 */
								static int
								relpath_merge(struct url *u, struct url *b)
								{
								  char *a = u->rest;			/* Relative reference being resolved */
								  char *o = b->rest;			/* Path of the base URL */
								  char *d = u->buf;			/* Output cursor */
								  char *e = u->bufend;			/* Output limit */
								  char *p;

								  if (a[0] == '/')			/* Absolute path => OK */
								    return 0;
								  if (o[0] != '/' && o[0] != '?')	/* Base has no usable path to resolve against */
								    return URL_PATH_UNDERFLOW;

								  if (!a[0])				/* Empty URL -> inherit everything */
								    {
								      u->rest = b->rest;
								      return 0;
								    }

								  u->rest = d;				/* We know we'll need to copy the path somewhere else */

								  if (a[0] == '#')			/* Another fragment */
								    {
								      /* Keep the base up to (not including) its own fragment */
								      for(p=o; *p && *p != '#'; p++)
									;
								      goto copy;
								    }
								  if (a[0] == '?')			/* New query */
								    {
								      /* Keep the base up to (not including) its query or fragment */
								      for(p=o; *p && *p != '#' && *p != '?'; p++)
									;
								      goto copy;
								    }

								  p = NULL;				/* Copy original path and find the last slash */
								  while (*o && *o != '?' && *o != '#')
								    {
								      if (d >= e)
									return URL_ERR_TOO_LONG;
								      if ((*d++ = *o++) == '/')
									p = d;				/* Position just after the slash */
								    }
								  if (!p)
								    return URL_ERR_REL_NOTHING;
								  d = p;				/* Drop the last segment of the base path */

								  /* Append the relative path segment by segment, interpreting "." and ".." */
								  while (*a)
								    {
								      if (a[0] == '.')
									{
									  if (a[1] == '/' || !a[1])	/* Skip "./" and ".$" */
									    {
									      a++;
									      if (a[0])
										a++;
									      continue;
									    }
									  else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
									    {
									      a += 2;
									      if (a[0])
										a++;
									      /* u->buf still marks the start of the output, so only the root slash is left */
									      if (d <= u->buf + 1)
										{
										  /*
										   * RFC 1808 says we should leave ".." as a path segment, but
										   * we intentionally break the rule and refuse the URL.
										   */
										  if (!url_ignore_underflow)
										    return URL_PATH_UNDERFLOW;
										}
									      else
										{
										  d--;			/* Discard trailing slash */
										  while (d[-1] != '/')	/* Back up to just after the previous slash */
										    d--;
										}
									      continue;
									    }
									}
								      /* Ordinary segment: copy it together with the slash ending it, if any */
								      while (a[0] && a[0] != '/')
									{
									  if (d >= e)
									    return URL_ERR_TOO_LONG;
									  *d++ = *a++;
									}
								      if (a[0])
									*d++ = *a++;
								    }

								okay:
								  *d++ = 0;
								  u->buf = d;				/* The rest of the buffer remains free for later use */
								  return 0;

								copy:					/* Combine part of old URL with the new one */
								  while (o < p)				/* Base path up to the cut point found above */
								    if (d < e)
								      *d++ = *o++;
								    else
								      return URL_ERR_TOO_LONG;
								  while (*a)				/* Followed by the whole relative reference */
								    if (d < e)
								      *d++ = *a++;
								    else
								      return URL_ERR_TOO_LONG;
								  goto okay;
								}


								/*
								 * Normalize the split URL `u', resolving it against the base URL `b'
								 * when it is relative (`b' may be NULL if no base is available).
								 *
								 * A URL without protocol inherits the protocol of the base; if it has no
								 * host either, it inherits host, user, password and port as well and its
								 * path is merged with the base path. A path starting with "?" is
								 * rewritten to "/?" and a missing port is replaced by the protocol's
								 * default.
								 *
								 * Returns 0 on success, URL_SYNTAX_ERROR, URL_ERR_REL_NOTHING,
								 * URL_ERR_TOO_LONG or an error passed up from relpath_merge().
								 */
								int
								url_normalize(struct url *u, struct url *b)
								{
								  int err;

								  /* Basic checks */
								  if ((url_proto_path_flags[u->protoid] && (!u->host || !*u->host)) ||
								      (!u->host && u->user) ||
								      (!u->user && u->pass) ||
								      !u->rest)
								    return URL_SYNTAX_ERROR;

								  if (!u->protocol)
								    {
								      /* Now we know it's a relative URL. Do we have any base? */
								      if (!b || !url_proto_path_flags[b->protoid])
								        return URL_ERR_REL_NOTHING;
								      u->protocol = b->protocol;
								      u->protoid = b->protoid;

								      /* Reference to the same host */
								      if (!u->host)
								        {
								          u->host = b->host;
								          u->user = b->user;
								          u->pass = b->pass;
								          u->port = b->port;
								          err = relpath_merge(u, b);
								          if (err)
								            return err;
								        }
								    }

								  /* Change path "?" to "/?" because it's the true meaning */
								  if (u->rest[0] == '?')
								    {
								      int l = strlen(u->rest);
								      /* We store '/', the l characters of the path and the final NUL */
								      if (u->bufend - u->buf < l+2)
								        return URL_ERR_TOO_LONG;
								      u->buf[0] = '/';
								      memcpy(u->buf+1, u->rest, l+1);
								      u->rest = u->buf;
								      u->buf += l+2;
								    }

								  /* Fill in missing info */
								  if (u->port == ~0U)
								    u->port = std_ports[u->protoid];

								  return 0;
								}


								/* Name canonicalization */


								/* Convert ASCII upper-case letters of `b' to lower case in place; NULL is accepted and ignored. */
								static void
								lowercase(char *b)
								{
								  char *p;

								  if (!b)
								    return;
								  for (p = b; *p; p++)
								    if ('A' <= *p && *p <= 'Z')
								      *p += 'a' - 'A';
								}


								/*
								 * Strip trailing dots from `b' in place. The first character is never
								 * removed, so "." and "..." both end up as ".". NULL and the empty
								 * string are accepted and left alone.
								 */
								static void
								kill_end_dot(char *b)
								{
								  size_t len;

								  if (!b)
								    return;
								  /* Work with the length, so that we never form a pointer in front of the string */
								  len = strlen(b);
								  while (len > 1 && b[len-1] == '.')
								    b[--len] = 0;
								}


								/*
								 * Canonicalize names in the split URL `u': lower-case the protocol and
								 * the host, strip trailing dots from the host, replace an empty path by
								 * "/" for protocols flagged in url_proto_path_flags and cut off the
								 * fragment. Always returns 0.
								 */
								int
								url_canonicalize(struct url *u)
								{
								  char *frag;

								  lowercase(u->protocol);
								  lowercase(u->host);
								  kill_end_dot(u->host);

								  if (!u->rest || !*u->rest)
								    {
								      if (url_proto_path_flags[u->protoid])
								        u->rest = "/";
								    }

								  if (!u->rest)
								    return 0;
								  frag = strchr(u->rest, '#');		/* Kill fragment reference */
								  if (frag)
								    *frag = 0;
								  return 0;
								}


								/* Pack a broken-down URL */


								/*
								 * Copy the string `s' (without its NUL) to `d', never writing at or
								 * beyond `e'. Returns the new end of the output, or NULL if the string
								 * did not fit. A NULL `d' is passed through, so that calls can be chained
								 * and the error checked only once.
								 */
								static char *
								append(char *d, const char *s, char *e)
								{
								  if (!d)
								    return NULL;
								  for (; *s; s++)
								    {
								      if (d >= e)
								        return NULL;
								      *d++ = *s;
								    }
								  return d;
								}


								/*
								 * Assemble the split URL `u' back to a string in the buffer `d'.
								 *
								 * The port is written only when it is set (not ~0U) and differs from the
								 * default port of the protocol. As a side effect, u->protoid is
								 * recalculated from u->protocol.
								 *
								 * Returns 0 on success or URL_ERR_TOO_LONG.
								 */
								int
								url_pack(struct url *u, char *d)
								{
								  char *e = d + MAX_URL_SIZE - 10;

								  if (u->protocol)
								    {
								      d = append(d, u->protocol, e);
								      d = append(d, ":", e);
								      u->protoid = url_identify_protocol(u->protocol);
								    }
								  if (u->host)
								    {
								      d = append(d, "//", e);
								      if (u->user)
								        {
								          d = append(d, u->user, e);
								          if (u->pass)
								            {
								              d = append(d, ":", e);
								              d = append(d, u->pass, e);
								            }
								          d = append(d, "@", e);
								        }
								      d = append(d, u->host, e);
								      if (u->port != std_ports[u->protoid] && u->port != ~0U)
								        {
								          /* The port is unsigned; the buffer holds any 32-bit value */
								          char z[16];
								          snprintf(z, sizeof(z), "%u", u->port);
								          d = append(d, ":", e);
								          d = append(d, z, e);
								        }
								    }
								  if (u->rest)
								    d = append(d, u->rest, e);
								  if (!d)				/* Some of the append() calls ran out of space */
								    return URL_ERR_TOO_LONG;
								  *d = 0;
								  return 0;
								}


								/* Error messages */


								/* Messages indexed by error code; entry 0 also serves codes we don't know. */
								static char *errmsg[] = {
								  "Something is wrong",
								  "Too long",
								  "Invalid character",
								  "Invalid escape",
								  "Invalid escaped character",
								  "Invalid port number",
								  "Relative URL not allowed",
								  "Unknown protocol",
								  "Syntax error",
								  "Path underflow"
								};

								/* Translate the error code `err' to a message; out-of-range codes yield the generic one. */
								char *
								url_error(uint err)
								{
								  const uint known = sizeof(errmsg) / sizeof(errmsg[0]);

								  return errmsg[(err < known) ? err : 0];
								}


								/* Standard cookbook recipes */


								/*
								 * De-escape the URL `u' to `buf1', split it using `buf2', normalize it
								 * against `base' and canonicalize it, leaving the result in `url'.
								 * Stops at the first failing step and returns its error code; 0 on success.
								 */
								int
								url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base)
								{
								  int err = url_deescape(u, buf1);

								  if (!err)
								    err = url_split(buf1, url, buf2);
								  if (!err)
								    err = url_normalize(url, base);
								  if (!err)
								    err = url_canonicalize(url);
								  return err;
								}


								/*
								 * Canonicalize the URL `src' relative to `base' and store the packed and
								 * escaped result to `dst'. Returns 0 or the error code of the first step
								 * which failed.
								 */
								int
								url_auto_canonicalize_rel(const char *src, char *dst, struct url *base)
								{
								  char deesc[MAX_URL_SIZE], parts[MAX_URL_SIZE], packed[MAX_URL_SIZE];
								  struct url ur;
								  int err;

								  err = url_canon_split_rel(src, deesc, parts, &ur, base);
								  if (err)
								    return err;
								  err = url_pack(&ur, packed);
								  if (err)
								    return err;
								  return url_enescape(packed, dst);
								}


								/* Testing */


								#ifdef TEST


								/*
								 * Test driver: url [base]
								 * Runs the URL through all processing steps (relative to the given base
								 * or to a built-in one) and prints the result of each of them. Exits
								 * with 1 on bad usage or at the first step which fails.
								 */
								int main(int argc, char **argv)
								{
								  char deesc[MAX_URL_SIZE], parts[MAX_URL_SIZE], base_parts[MAX_URL_SIZE], packed[MAX_URL_SIZE];
								  struct url url, url0;
								  char *base = "http://mj@www.hell.org/123/sub_dir;param/index.html;param?query&zzz/sub;query+#fragment?";
								  int err;

								  if (argc != 2 && argc != 3)
								    return 1;
								  if (argc == 3)
								    base = argv[2];

								  err = url_deescape(argv[1], deesc);
								  if (err)
								    {
								      printf("deesc: error %d\n", err);
								      return 1;
								    }
								  printf("deesc: %s\n", deesc);

								  err = url_split(deesc, &url, parts);
								  if (err)
								    {
								      printf("split: error %d\n", err);
								      return 1;
								    }
								  printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

								  err = url_split(base, &url0, base_parts);
								  if (err)
								    {
								      printf("split base: error %d\n", err);
								      return 1;
								    }
								  err = url_normalize(&url0, NULL);
								  if (err)
								    {
								      printf("normalize base: error %d\n", err);
								      return 1;
								    }
								  printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);

								  err = url_normalize(&url, &url0);
								  if (err)
								    {
								      printf("normalize: error %d\n", err);
								      return 1;
								    }
								  printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

								  err = url_canonicalize(&url);
								  if (err)
								    {
								      printf("canonicalize: error %d\n", err);
								      return 1;
								    }
								  printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

								  err = url_pack(&url, packed);
								  if (err)
								    {
								      printf("pack: error %d\n", err);
								      return 1;
								    }
								  printf("pack: %s\n", packed);

								  /* The buffer used for splitting is no longer needed, so it is reused for the output */
								  err = url_enescape(packed, parts);
								  if (err)
								    {
								      printf("enesc: error %d\n", err);
								      return 1;
								    }
								  printf("enesc: %s\n", parts);
								  return 0;
								}


								#endif


								/* One component of a URL as cut out by url_has_repeated_component(). */
								struct component {
									const char *start;	/* First character of the component inside the URL (not NUL-terminated) */
									int length;		/* Number of characters, the separator excluded */
									uint count;		/* Occurrences seen so far; kept only in the first component of each kind */
									u32 hash;		/* hashf() of the component's text */
								};


								/*
								 * Hash `length' characters starting at `start': the accumulator starts at
								 * the length and for each character it is rotated left by 8 bits and
								 * XOR'ed with the character.
								 */
								static inline u32
								hashf(const char *start, int length)
								{
									u32 acc = length;
									int i;

									for (i = 0; i < length; i++)
									{
										u32 rotated = (acc << 8) | (acc >> 24);
										acc = rotated ^ start[i];
									}
									return acc;
								}


								/*
								 * Count how many times the sequence of the first `len' components of
								 * `comp' occurs in a row at the start of the array (which has `count'
								 * components). The sequence itself is included, so the result is at
								 * least 1. The caller must pass len >= 1 and count >= len.
								 */
								static inline uint
								repeat_count(struct component *comp, uint count, uint len)
								{
									struct component *first = comp;
									uint reps = 0;

									for (;;)
									{
										uint idx;

										/* Accept the current copy and step to the next candidate */
										comp += len;
										count -= len;
										reps++;
										if (count < len)	/* Not enough components left for another copy */
											break;
										for (idx = 0; idx < len; idx++)
										{
											const struct component *cur = &comp[idx];
											const struct component *ref = &first[idx];

											if (cur->hash != ref->hash)
												return reps;
											if (cur->length != ref->length)
												return reps;
											if (memcmp(cur->start, ref->start, cur->length) != 0)
												return reps;
										}
									}
									return reps;
								}


								/*
								 * Check the URL for suspicious repetition of its components, where the
								 * components are the pieces between characters of url_component_separators.
								 *
								 * Returns 1 when some component occurs more than url_max_occurences
								 * times, otherwise the length (in components) of the shortest sequence of
								 * at most url_max_repeat_length components which repeats at least
								 * url_min_repeat_count times in a row, otherwise 0. Note that a repeated
								 * sequence of length 1 returns 1 as well.
								 *
								 * All temporary arrays are allocated on the stack by alloca().
								 */
								int
								url_has_repeated_component(const char *url)
								{
									struct component *comp;
									uint comps, comp_len, rep_prefix, hash_size, *hash, *next;
									const char *c;
									uint i, j, k;

									/* First pass: count the components */
									for (comps=0, c=url; c; comps++)
									{
										c = strpbrk(c, url_component_separators);
										if (c)
											c++;
									}
									/* Too few components to trigger either of the limits */
									if (comps < url_min_repeat_count && comps <= url_max_occurences)
										return 0;
									/* Second pass: record start and length of each component */
									comp = alloca(comps * sizeof(*comp));
									for (i=0, c=url; c; i++)
									{
										comp[i].start = c;
										c = strpbrk(c, url_component_separators);
										if (c)
										{
											comp[i].length = c - comp[i].start;
											c++;
										}
										else
											comp[i].length = strlen(comp[i].start);
									}
									ASSERT(i == comps);
									for (i=0; i<comps; i++)
										comp[i].hash = hashf(comp[i].start, comp[i].length);
									/* Count occurrences of identical components; pointless unless there are more components than the limit */
									if (comps > url_max_occurences)
									{
										/* Chained hash table of component indices: hash[] holds bucket heads, next[] the links */
										hash_size = next_table_prime(comps);
										hash = alloca(hash_size * sizeof(*hash));
										next = alloca(comps * sizeof(*next));
										memset(hash, 255, hash_size * sizeof(*hash));	/* All bits set (~0U) marks the end of a chain */
										for (i=0; i<comps; i++)
										{
											j = comp[i].hash % hash_size;
											/* Walk the chain until an identical component or the end marker is found */
											for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
											    memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
											if (!~k)
											{
												/* Seen for the first time: put it at the head of the chain */
												next[i] = hash[j];
												hash[j] = i;
												comp[i].count = 1;
											}
											else
											{
												/* The counter lives in the first occurrence */
												if (comp[k].count++ >= url_max_occurences)
													return 1;
											}
										}
									}
									/* Try every sequence length and every starting position */
									for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
										for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
											if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
												return comp_len;
									return 0;
								}