/*
 *	UCW Library -- URL Functions
 *
 *	(c) 1997--2004 Martin Mares <mj@ucw.cz>
 *	(c) 2001--2005 Robert Spalek <robert@ucw.cz>
 *
 *	This software may be freely distributed and used according to the terms
 *	of the GNU Lesser General Public License.
 *
 *	XXX: The buffer handling in this module is really horrible, but it works.
 */

#include <ucw/lib.h>
#include <ucw/url.h>
#include <ucw/chartype.h>
#include <ucw/conf.h>
#include <ucw/prime.h>

#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <alloca.h>

/* Configuration */

/*
 * Tunables of this module. Unless compiled with TEST, they are registered
 * as items of the "URL" configuration section by url_config below.
 */

/* Nonzero: url_deescape() drops whitespace runs at the very start or end of the URL */
static uint url_ignore_spaces;
/* Nonzero: relpath_merge() silently ignores ".." segments which would climb above the root */
static uint url_ignore_underflow;
/* Set of characters which delimit components in url_has_repeated_component() */
static char *url_component_separators = "";
/* Number of consecutive repetitions of a component sequence which counts as a repeat */
static uint url_min_repeat_count = 0x7fffffff;
/* Longest sequence (in components) examined for consecutive repetition; 0 skips that search */
static uint url_max_repeat_length = 0;
/* A single component occurring more than this many times is reported as repeated */
static uint url_max_occurences = ~0U;

#ifndef TEST
/*
 * Description of the configuration section: maps each item name to the
 * static variable which receives its value. The item names are part of
 * the configuration file format.
 */
static struct cf_section url_config = {
  CF_ITEMS {
    CF_UINT("IgnoreSpaces", &url_ignore_spaces),
    CF_UINT("IgnoreUnderflow", &url_ignore_underflow),
    CF_STRING("ComponentSeparators", &url_component_separators),
    CF_UINT("MinRepeatCount", &url_min_repeat_count),
    CF_UINT("MaxRepeatLength", &url_max_repeat_length),
    CF_UINT("MaxOccurences", &url_max_occurences),
    CF_END
  }
};

/*
 * Register url_config under the section name "URL".
 * NOTE(review): CONSTRUCTOR presumably makes this run automatically before
 * main() -- confirm against its definition in the library headers.
 */
static void CONSTRUCTOR url_init_config(void)
{
  cf_declare_section("URL", &url_config, 0);
}
#endif

/* Escaping and de-escaping */

/*
 * Convert a value to a hexadecimal digit character: 0..9 yield '0'..'9',
 * larger values are mapped onto the uppercase letters starting at 'A' for 10.
 */
static uint
enhex(uint x)
{
  if (x < 10)
    return x + '0';
  return x - 10 + 'A';
}

/*
 * Decode %XX escapes of the URL @s into the buffer @d.
 *
 * Escaped reserved characters (; / ? : @ = & # $ + ,) are stored as their
 * NCC_xxx placeholder codes so that they cannot be confused with the real
 * separators later. Characters above 0x20 are copied verbatim. Runs of
 * whitespace are copied as well, except that with url_ignore_spaces set
 * a run at the very beginning or at the very end of the URL is dropped.
 *
 * Returns 0 on success (the output is then zero-terminated) or
 *   URL_ERR_TOO_LONG             if the output reaches MAX_URL_SIZE - 10 bytes,
 *   URL_ERR_INVALID_ESCAPE       if '%' is not followed by two hex digits,
 *   URL_ERR_INVALID_ESCAPED_CHAR if the escape encodes a value below 0x20,
 *   URL_ERR_INVALID_CHAR         for any other non-whitespace character <= 0x20.
 */
int
url_deescape(const char *s, char *d)
{
  char *const first = d;
  char *const limit = d + MAX_URL_SIZE - 10;
  const char *run;

  for (; *s; )
    {
      if (d >= limit)
	return URL_ERR_TOO_LONG;

      if (*s == '%')			/* An escape sequence */
	{
	  uint code;

	  if (!Cxdigit(s[1]) || !Cxdigit(s[2]))
	    return URL_ERR_INVALID_ESCAPE;
	  code = Cxvalue(s[1])*16 + Cxvalue(s[2]);
	  if (code < 0x20)
	    return URL_ERR_INVALID_ESCAPED_CHAR;

	  /* Reserved characters are replaced by their placeholder codes */
	  if (code == ';')
	    code = NCC_SEMICOLON;
	  else if (code == '/')
	    code = NCC_SLASH;
	  else if (code == '?')
	    code = NCC_QUEST;
	  else if (code == ':')
	    code = NCC_COLON;
	  else if (code == '@')
	    code = NCC_AT;
	  else if (code == '=')
	    code = NCC_EQUAL;
	  else if (code == '&')
	    code = NCC_AND;
	  else if (code == '#')
	    code = NCC_HASH;
	  else if (code == '$')
	    code = NCC_DOLLAR;
	  else if (code == '+')
	    code = NCC_PLUS;
	  else if (code == ',')
	    code = NCC_COMMA;

	  *d++ = code;
	  s += 3;
	  continue;
	}

      if ((byte) *s > 0x20)		/* An ordinary character */
	{
	  *d++ = *s++;
	  continue;
	}

      if (!Cspace(*s))
	return URL_ERR_INVALID_CHAR;

      /* Skip the whole run of whitespace first, then decide whether to keep it */
      run = s;
      while (Cspace(*s))
	s++;
      if (url_ignore_spaces && (!*s || d == first))
	continue;
      for (; Cspace(*run); run++)
	{
	  if (d >= limit)
	    return URL_ERR_TOO_LONG;
	  *d++ = *run;
	}
    }

  *d = 0;
  return 0;
}

int
url_enescape(const char *s, char *d)
{
  char *end = d + MAX_URL_SIZE - 10;
  uint c;

  while (c = *s)
    {
      if (d >= end)
	return URL_ERR_TOO_LONG;
      if (Calnum(c) ||							/* RFC 2396 (2.1-2.3): Only alphanumerics ... */
	  c == '!' || c == '*' || c == '\'' || c == '(' || c == ')' ||	/* ... and some exceptions and reserved chars */
	  c == '$' || c == '-' || c == '_' || c == '.' || c == '+' ||
	  c == ',' || c == '=' || c == '&' || c == '#' || c == ';' ||
	  c == '/' || c == '?' || c == ':' || c == '@' || c == '~'
	)
	*d++ = *s++;
      else
	{
	  uint val = (byte)(((byte)*s < NCC_MAX) ? NCC_CHARS[(byte)*s] : *s);
	  *d++ = '%';
	  *d++ = enhex(val >> 4);
	  *d++ = enhex(val & 0x0f);
	  s++;
	}
    }
  *d = 0;
  return 0;
}

/*
 * Escape the URL @src into @dest in a form meant for reading:
 * bytes below NCC_MAX are translated through the NCC_CHARS table,
 * bytes in the range 0x20..0x7e are copied verbatim and all remaining
 * bytes are written as %XX with uppercase hex digits.
 *
 * Returns 0 on success (the output is then zero-terminated) or
 * URL_ERR_TOO_LONG if the output reaches MAX_URL_SIZE - 10 bytes.
 */
int
url_enescape_friendly(const char *src, char *dest)
{
  char *end = dest + MAX_URL_SIZE - 10;
  /* Walk the input as unsigned bytes; the cast makes the signedness change explicit */
  const byte *srcb = (const byte *) src;

  while (*srcb)
    {
      if (dest >= end)
	return URL_ERR_TOO_LONG;
      if (*srcb < NCC_MAX)
	*dest++ = NCC_CHARS[*srcb];
      else if (*srcb >= 0x20 && *srcb < 0x7f)
	*dest++ = *srcb;
      else
	{
	  *dest++ = '%';
	  *dest++ = enhex(*srcb >> 4);
	  *dest++ = enhex(*srcb & 0x0f);
	}
      srcb++;
    }
  *dest = 0;
  return 0;
}

/* Split an URL (several parts may be copied to the destination buffer) */

/*
 * Per-protocol tables indexed by protocol ID (0 .. URL_PROTO_MAX-1):
 * url_proto_names holds the names matched by url_identify_protocol(),
 * url_proto_path_flags is nonzero for protocols whose URLs must carry
 * a "//host" part (see url_split() and url_normalize()).
 */
char *url_proto_names[URL_PROTO_MAX] = URL_PNAMES;
static int url_proto_path_flags[URL_PROTO_MAX] = URL_PATH_FLAGS;

/*
 * Look up the protocol name @p (case-insensitively) in url_proto_names.
 * Entry 0 of the table is never compared. Returns the index of the
 * matching entry or URL_PROTO_UNKNOWN if there is none.
 */
uint
url_identify_protocol(const char *p)
{
  uint id = 1;

  while (id < URL_PROTO_MAX)
    {
      if (strcasecmp(p, url_proto_names[id]) == 0)
	return id;
      id++;
    }
  return URL_PROTO_UNKNOWN;
}

/*
 * Split the URL @s to its components and describe them in @u.
 *
 * The protocol name and the "user:pass@host:port" part are copied to the
 * buffer @d and cut to zero-terminated pieces there; u->rest points
 * directly into @s (nothing of the path is copied) and u->buf is left
 * pointing to the first unused byte of @d. u->bufend is set to
 * @d + MAX_URL_SIZE - 10. Components which are not present stay NULL and
 * a missing or zero port is recorded as ~0.
 *
 * When the protocol is one which requires a host specification, but no
 * "//" follows the colon, the "protocol:" prefix is treated as a part of
 * a relative path instead.
 *
 * Returns 0 on success or URL_ERR_INVALID_PORT if the port is not a plain
 * decimal number in the range 0..65535.
 */
int
url_split(char *s, struct url *u, char *d)
{
  memset(u, 0, sizeof(struct url));
  u->port = ~0;
  u->bufend = d + MAX_URL_SIZE - 10;

  if (s[0] != '/')			/* Seek for "protocol:" */
    {
      char *p = s;
      while (*p && Calnum(*p))
	p++;
      if (p != s && *p == ':')
	{
	  u->protocol = d;
	  while (s < p)
	    *d++ = *s++;
	  *d++ = 0;
	  u->protoid = url_identify_protocol(u->protocol);
	  s++;				/* Skip the colon */
	  if (url_proto_path_flags[u->protoid] && (s[0] != '/' || s[1] != '/'))
	    {
	      /* The protocol requires complete host spec, but it's missing -> treat as a relative path instead */
	      int len = d - u->protocol;	/* Name + terminator in @d == name + colon in @s */
	      d -= len;
	      s -= len;
	      u->protocol = NULL;
	      u->protoid = 0;
	    }
	}
    }

  if (s[0] == '/' && s[1] == '/')	/* Host spec (a single slash is an absolute path) */
    {
      char *q, *e;
      char *at = NULL;

      s += 2;
      q = d;
      while (*s && *s != '/' && *s != '?')	/* Copy user:passwd@host:port */
	{
	  if (*s != '@')
	    *d++ = *s;
	  else if (!at)
	    {
	      *d++ = 0;			/* The first '@' terminates user:passwd */
	      at = d;
	    }
	  else				/* This shouldn't happen with sane URL's, but we need to be sure */
	    *d++ = NCC_AT;
	  s++;
	}
      *d++ = 0;

      if (at)				/* user:passwd present */
	{
	  u->user = q;
	  e = strchr(q, ':');
	  if (e)
	    {
	      *e++ = 0;
	      u->pass = e;
	    }
	}
      else
	at = q;

      e = strchr(at, ':');
      if (e)				/* host:port present */
	{
	  /*
	   * The value is kept in unsigned long, so that a huge number is
	   * rejected by the range check instead of being silently truncated
	   * to a valid-looking port.
	   */
	  unsigned long p;
	  char *ep;

	  *e++ = 0;
	  /* strtoul() would skip whitespace and accept a sign; a port consists of digits only */
	  if (*e && (*e < '0' || *e > '9'))
	    return URL_ERR_INVALID_PORT;
	  p = strtoul(e, &ep, 10);
	  if (*ep || p > 65535)
	    return URL_ERR_INVALID_PORT;
	  if (p)			/* Port 0 (e.g. in :/) is treated as default port */
	    u->port = p;
	}
      u->host = at;
    }

  u->rest = s;
  u->buf = d;
  return 0;
}

/* Normalization according to given base URL */

/*
 * Default port numbers indexed by protocol ID; url_normalize() fills them
 * in for URLs without an explicit port and url_pack() omits them again.
 */
static uint std_ports[] = URL_DEFPORTS;	/* Default port numbers */

/*
 * Resolve the path of the relative URL @u against the path of the base
 * URL @b.
 *
 * If u->rest is absolute (starts with '/'), nothing is done. If it is
 * empty, u->rest is simply pointed at b->rest. Otherwise the merged path
 * is built in the free space at u->buf, u->rest is pointed at it and
 * u->buf is advanced behind its terminating zero.
 *
 * A reference starting with '#' or '?' replaces just the fragment, or the
 * query and fragment, of the base. Any other reference replaces everything
 * behind the last slash of the base path, with "." segments dropped and
 * ".." segments removing the preceding directory.
 *
 * Returns 0 on success or
 *   URL_PATH_UNDERFLOW  if the base path starts with neither '/' nor '?',
 *                       or if ".." climbs above the root while
 *                       url_ignore_underflow is not set,
 *   URL_ERR_REL_NOTHING if the base path contains no slash to resolve against,
 *   URL_ERR_TOO_LONG    if the result does not fit before u->bufend.
 */
static int
relpath_merge(struct url *u, struct url *b)
{
  char *a = u->rest;			/* The relative reference */
  char *o = b->rest;			/* Path of the base URL */
  char *d = u->buf;			/* Output position */
  char *e = u->bufend;			/* Output limit */
  char *p;

  if (a[0] == '/')			/* Absolute path => OK */
    return 0;
  if (o[0] != '/' && o[0] != '?')
    return URL_PATH_UNDERFLOW;

  if (!a[0])				/* Empty URL -> inherit everything */
    {
      u->rest = b->rest;
      return 0;
    }

  u->rest = d;				/* We know we'll need to copy the path somewhere else */

  if (a[0] == '#')			/* Another fragment */
    {
      /* Keep the base up to (not including) its own fragment */
      for(p=o; *p && *p != '#'; p++)
	;
      goto copy;
    }
  if (a[0] == '?')			/* New query */
    {
      /* Keep the base up to (not including) its own query or fragment */
      for(p=o; *p && *p != '#' && *p != '?'; p++)
	;
      goto copy;
    }

  p = NULL;				/* Copy original path and find the last slash */
  while (*o && *o != '?' && *o != '#')
    {
      if (d >= e)
	return URL_ERR_TOO_LONG;
      if ((*d++ = *o++) == '/')
	p = d;				/* Remember the position just behind the slash */
    }
  if (!p)
    return URL_ERR_REL_NOTHING;
  d = p;				/* Forget the last segment of the base path */

  /* Append the reference segment by segment, interpreting "." and ".." */
  while (*a)
    {
      if (a[0] == '.')
	{
	  if (a[1] == '/' || !a[1])	/* Skip "./" and ".$" */
	    {
	      a++;
	      if (a[0])
		a++;
	      continue;
	    }
	  else if (a[1] == '.' && (a[2] == '/' || !a[2])) /* "../" */
	    {
	      a += 2;
	      if (a[0])
		a++;
	      if (d <= u->buf + 1)	/* Only the leading slash is left => nothing to remove */
		{
		  /*
		   * RFC 1808 says we should leave ".." as a path segment, but
		   * we intentionally break the rule and refuse the URL.
		   */
		  if (!url_ignore_underflow)
		    return URL_PATH_UNDERFLOW;
		}
	      else
		{
		  d--;			/* Discard trailing slash */
		  while (d[-1] != '/')	/* ... and the whole segment in front of it */
		    d--;
		}
	      continue;
	    }
	}
      /* An ordinary segment: copy it including the slash which ends it */
      while (a[0] && a[0] != '/')
	{
	  if (d >= e)
	    return URL_ERR_TOO_LONG;
	  *d++ = *a++;
	}
      if (a[0])
	*d++ = *a++;
    }

okay:
  *d++ = 0;
  u->buf = d;				/* The merged path now occupies the buffer up to here */
  return 0;

copy:					/* Combine part of old URL with the new one */
  while (o < p)
    if (d < e)
      *d++ = *o++;
    else
      return URL_ERR_TOO_LONG;
  while (*a)
    if (d < e)
      *d++ = *a++;
    else
      return URL_ERR_TOO_LONG;
  goto okay;
}

/*
 * Normalize the split URL @u, resolving it against the base URL @b
 * if it is a relative one (@b may be NULL when no base is available).
 *
 * A URL without a protocol inherits the protocol of the base; if it has
 * no host either, it also inherits host, user, password and port and its
 * path is merged with the base path by relpath_merge(). A path consisting
 * of a query only ("?...") is rewritten to "/?..." in the buffer of @u
 * and an unspecified port is replaced by the default port of the protocol.
 *
 * Returns 0 on success or
 *   URL_SYNTAX_ERROR    if the components are inconsistent (missing host for
 *                       a protocol requiring one, user without host,
 *                       password without user, no path at all),
 *   URL_ERR_REL_NOTHING if the URL is relative and there is no usable base,
 *   URL_ERR_TOO_LONG    if the buffer of @u is too small,
 * or whatever error relpath_merge() reports.
 */
int
url_normalize(struct url *u, struct url *b)
{
  /* Basic checks */
  if (url_proto_path_flags[u->protoid] && (!u->host || !*u->host))
    return URL_SYNTAX_ERROR;
  if (!u->host && u->user)
    return URL_SYNTAX_ERROR;
  if (!u->user && u->pass)
    return URL_SYNTAX_ERROR;
  if (!u->rest)
    return URL_SYNTAX_ERROR;

  if (!u->protocol)
    {
      /* Now we know it's a relative URL. Do we have any base? */
      if (!b || !url_proto_path_flags[b->protoid])
	return URL_ERR_REL_NOTHING;
      u->protocol = b->protocol;
      u->protoid = b->protoid;

      /* Reference to the same host */
      if (!u->host)
	{
	  int err;

	  u->host = b->host;
	  u->user = b->user;
	  u->pass = b->pass;
	  u->port = b->port;
	  err = relpath_merge(u, b);
	  if (err)
	    return err;
	}
    }

  /* Change path "?" to "/?" because it's the true meaning */
  if (u->rest[0] == '?')
    {
      int l = strlen(u->rest);
      char *fixed = u->buf;

      if (u->bufend - fixed < l+1)
	return URL_ERR_TOO_LONG;
      *fixed = '/';
      memcpy(fixed + 1, u->rest, l+1);	/* Including the terminating zero */
      u->rest = fixed;
      u->buf = fixed + l + 2;
    }

  /* Fill in missing info */
  if (u->port == ~0U)
    u->port = std_ports[u->protoid];

  return 0;
}

/* Name canonicalization */

/*
 * Convert the ASCII letters 'A'..'Z' of the string @b to lowercase in place.
 * All other bytes are left alone; a NULL pointer is accepted and ignored.
 */
static void
lowercase(char *b)
{
  char *p;

  if (!b)
    return;
  for (p = b; *p; p++)
    if (*p >= 'A' && *p <= 'Z')
      *p += 'a' - 'A';
}

/*
 * Remove all trailing dots from the string @b in place, but never touch
 * its first character (so "." stays "."). A NULL pointer is accepted and
 * ignored.
 *
 * The string is walked by index, so that no pointer in front of the
 * string is ever formed, not even for an empty string.
 */
static void
kill_end_dot(char *b)
{
  size_t len;

  if (!b)
    return;
  len = strlen(b);
  while (len > 1 && b[len-1] == '.')
    b[--len] = 0;
}

/*
 * Bring the normalized URL @u to its canonical form: protocol and host
 * name are converted to lowercase, trailing dots of the host name are
 * removed, a missing or empty path becomes "/" for protocols which
 * require a host, and the fragment (everything from the first '#')
 * is cut off the path in place.
 *
 * Always returns 0.
 */
int
url_canonicalize(struct url *u)
{
  char *frag;

  lowercase(u->protocol);
  lowercase(u->host);
  kill_end_dot(u->host);

  if (!u->rest || !*u->rest)
    {
      if (url_proto_path_flags[u->protoid])
	u->rest = "/";
    }

  if (u->rest)
    {
      frag = strchr(u->rest, '#');	/* Kill fragment reference */
      if (frag)
	*frag = 0;
    }
  return 0;
}

/* Pack a broken-down URL */

/*
 * Copy the string @s (without its terminating zero) to @d and return the
 * position just behind the copy. Nothing is ever written at or behind @e;
 * if the string does not fit, NULL is returned. A NULL @d is passed
 * through, so that calls can be chained and checked once at the end.
 */
static char *
append(char *d, const char *s, char *e)
{
  if (!d)
    return NULL;
  for (; *s; s++)
    {
      if (d >= e)
	return NULL;
      *d++ = *s;
    }
  return d;
}

/*
 * Pack the broken-down URL @u to a string in the buffer @d:
 * "protocol:", then "//user:pass@host:port" (only the parts which are
 * present; the port is omitted when it is unset or equal to the default
 * port of the protocol), then the path.
 *
 * As a side effect, u->protoid is recalculated from u->protocol whenever
 * a protocol is present.
 *
 * Returns 0 on success (the output is then zero-terminated) or
 * URL_ERR_TOO_LONG if the result would reach MAX_URL_SIZE - 10 bytes.
 */
int
url_pack(struct url *u, char *d)
{
  char *e = d + MAX_URL_SIZE - 10;

  /* append() passes NULL through, so overflow is checked only once at the end */
  if (u->protocol)
    {
      d = append(d, u->protocol, e);
      d = append(d, ":", e);
      u->protoid = url_identify_protocol(u->protocol);
    }
  if (u->host)
    {
      d = append(d, "//", e);
      if (u->user)
	{
	  d = append(d, u->user, e);
	  if (u->pass)
	    {
	      d = append(d, ":", e);
	      d = append(d, u->pass, e);
	    }
	  d = append(d, "@", e);
	}
      d = append(d, u->host, e);
      if (u->port != std_ports[u->protoid] && u->port != ~0U)
	{
	  /*
	   * The port is unsigned, so it is printed as such, and the buffer is
	   * large enough for any 32-bit value; snprintf() bounds it anyway.
	   */
	  char z[16];
	  snprintf(z, sizeof(z), "%u", (unsigned) u->port);
	  d = append(d, ":", e);
	  d = append(d, z, e);
	}
    }
  if (u->rest)
    d = append(d, u->rest, e);
  if (!d)
    return URL_ERR_TOO_LONG;
  *d = 0;
  return 0;
}

/* Error messages */

/*
 * Messages returned by url_error(), indexed by the error code. Entry 0
 * also serves as the answer for codes outside the table.
 * NOTE(review): the order presumably mirrors the URL_ERR_xxx constants
 * declared in the header -- keep the two in sync.
 */
static char *errmsg[] = {
  "Something is wrong",
  "Too long",
  "Invalid character",
  "Invalid escape",
  "Invalid escaped character",
  "Invalid port number",
  "Relative URL not allowed",
  "Unknown protocol",
  "Syntax error",
  "Path underflow"
};

/*
 * Translate the error code @err to a message. Codes which lie outside
 * the errmsg table yield the generic message stored at index 0.
 */
char *
url_error(uint err)
{
  uint known = sizeof(errmsg) / sizeof(errmsg[0]);

  return errmsg[(err < known) ? err : 0];
}

/* Standard cookbook recipes */

/*
 * The usual pipeline in one call: de-escape @u to @buf1, split it to @url
 * (using @buf2 for the components), normalize it relative to @base and
 * canonicalize the result. The first failing step ends the pipeline and
 * its error code is returned; 0 means success.
 */
int
url_canon_split_rel(const char *u, char *buf1, char *buf2, struct url *url, struct url *base)
{
  int err = url_deescape(u, buf1);

  if (!err)
    err = url_split(buf1, url, buf2);
  if (!err)
    err = url_normalize(url, base);
  if (!err)
    err = url_canonicalize(url);
  return err;
}

/*
 * Canonicalize the URL @src relative to @base and store the result,
 * packed and escaped again, to @dst. All intermediate buffers are local.
 * Returns 0 on success or the error code of the first step which failed.
 */
int
url_auto_canonicalize_rel(const char *src, char *dst, struct url *base)
{
  char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE];
  struct url ur;
  int err;

  err = url_canon_split_rel(src, buf1, buf2, &ur, base);
  if (err)
    return err;
  err = url_pack(&ur, buf3);
  if (err)
    return err;
  return url_enescape(buf3, dst);
}

/* Testing */

#ifdef TEST

/*
 * Test driver: "prog <url> [<base>]".
 *
 * Runs <url> through all stages (de-escape, split, normalize against the
 * base, canonicalize, pack, escape) and prints the result of each of them.
 * The first failing stage prints its error code and ends the program with
 * exit code 1, as does a wrong number of arguments.
 */
int main(int argc, char **argv)
{
  char buf1[MAX_URL_SIZE], buf2[MAX_URL_SIZE], buf3[MAX_URL_SIZE], buf4[MAX_URL_SIZE];
  struct url url, url0;
  char *base = "http://mj@www.hell.org/123/sub_dir;param/index.html;param?query&zzz/sub;query+#fragment?";
  int err;

  if (argc < 2 || argc > 3)
    return 1;
  if (argc == 3)
    base = argv[2];

  /* The examined URL: de-escape and split */
  err = url_deescape(argv[1], buf1);
  if (err)
    {
      printf("deesc: error %d\n", err);
      return 1;
    }
  printf("deesc: %s\n", buf1);

  err = url_split(buf1, &url, buf2);
  if (err)
    {
      printf("split: error %d\n", err);
      return 1;
    }
  printf("split: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

  /* The base URL: split and normalize on its own */
  err = url_split(base, &url0, buf3);
  if (err)
    {
      printf("split base: error %d\n", err);
      return 1;
    }
  err = url_normalize(&url0, NULL);
  if (err)
    {
      printf("normalize base: error %d\n", err);
      return 1;
    }
  printf("base: @%s@%s@%s@%s@%d@%s\n", url0.protocol, url0.user, url0.pass, url0.host, url0.port, url0.rest);

  /* The rest of the pipeline for the examined URL */
  err = url_normalize(&url, &url0);
  if (err)
    {
      printf("normalize: error %d\n", err);
      return 1;
    }
  printf("normalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

  err = url_canonicalize(&url);
  if (err)
    {
      printf("canonicalize: error %d\n", err);
      return 1;
    }
  printf("canonicalize: @%s@%s@%s@%s@%d@%s\n", url.protocol, url.user, url.pass, url.host, url.port, url.rest);

  err = url_pack(&url, buf4);
  if (err)
    {
      printf("pack: error %d\n", err);
      return 1;
    }
  printf("pack: %s\n", buf4);

  err = url_enescape(buf4, buf2);
  if (err)
    {
      printf("enesc: error %d\n", err);
      return 1;
    }
  printf("enesc: %s\n", buf2);
  return 0;
}

#endif

/* One component of a URL as seen by url_has_repeated_component() */
struct component {
	const char *start;	/* First character of the component inside the URL (not zero-terminated) */
	int length;		/* Number of characters, the separator not included */
	uint count;		/* Occurrences seen so far; maintained only in the first occurrence of each distinct component */
	u32 hash;		/* hashf() of the component text */
};

/*
 * Hash @length characters starting at @start: the value is seeded with
 * the length and for each character it is rotated left by 8 bits and
 * XORed with the character. A non-positive length yields the seed itself.
 */
static inline u32
hashf(const char *start, int length)
{
	u32 hf = length;
	int i;

	for (i = 0; i < length; i++)
		hf = ((hf << 8) | (hf >> 24)) ^ start[i];
	return hf;
}

/*
 * Count how many times the sequence of the first @len components of @comp
 * occurs consecutively at the beginning of the array of @count components.
 * The sequence itself is counted, so the result is at least 1.
 * The caller must guarantee that @count >= @len.
 */
static inline uint
repeat_count(struct component *comp, uint count, uint len)
{
	struct component *pattern = comp;
	uint found;

	for (found = 1; ; found++)
	{
		uint i;

		/* Move on to the next candidate; stop when it would not fit */
		comp += len;
		count -= len;
		if (count < len)
			return found;

		for (i = 0; i < len; i++)
		{
			struct component *cand = &comp[i];
			struct component *ref = &pattern[i];

			if (cand->hash != ref->hash)
				return found;
			if (cand->length != ref->length)
				return found;
			if (memcmp(cand->start, ref->start, cand->length))
				return found;
		}
	}
}

/*
 * Detect suspicious repetition of components in @url. The URL is cut to
 * components at every character of url_component_separators.
 *
 * Returns
 *   1        if a single component occurs more than url_max_occurences
 *            times anywhere in the URL,
 *   comp_len if a sequence of comp_len components (at most
 *            url_max_repeat_length of them) is repeated at least
 *            url_min_repeat_count times in a row,
 *   0        otherwise.
 * Note that the value 1 can come from either of the two tests.
 *
 * All work arrays are allocated on the stack by alloca(), so the stack
 * usage grows with the number of components.
 */
int
url_has_repeated_component(const char *url)
{
	struct component *comp;
	uint comps, comp_len, rep_prefix, hash_size, *hash, *next;
	const char *c;
	uint i, j, k;

	/* First pass: just count the components */
	for (comps=0, c=url; c; comps++)
	{
		c = strpbrk(c, url_component_separators);
		if (c)
			c++;
	}
	/* Too few components to trigger either of the tests */
	if (comps < url_min_repeat_count && comps <= url_max_occurences)
		return 0;
	/* Second pass: remember where each component starts and how long it is */
	comp = alloca(comps * sizeof(*comp));
	for (i=0, c=url; c; i++)
	{
		comp[i].start = c;
		c = strpbrk(c, url_component_separators);
		if (c)
		{
			comp[i].length = c - comp[i].start;
			c++;
		}
		else
			comp[i].length = strlen(comp[i].start);
	}
	ASSERT(i == comps);
	for (i=0; i<comps; i++)
		comp[i].hash = hashf(comp[i].start, comp[i].length);
	/* Test 1: count occurrences of each distinct component using a chained hash table */
	if (comps > url_max_occurences)
	{
		hash_size = next_table_prime(comps);
		hash = alloca(hash_size * sizeof(*hash));
		next = alloca(comps * sizeof(*next));
		/* All bits set (~0U) marks an empty bucket and the end of a chain */
		memset(hash, 255, hash_size * sizeof(*hash));
		for (i=0; i<comps; i++)
		{
			j = comp[i].hash % hash_size;
			/* Walk the chain of the bucket until an equal component is found or the chain ends */
			for (k = hash[j]; ~k && (comp[i].hash != comp[k].hash || comp[i].length != comp[k].length ||
			    memcmp(comp[k].start, comp[i].start, comp[i].length)); k = next[k]);
			if (!~k)
			{
				/* First occurrence: link it to the head of the chain */
				next[i] = hash[j];
				hash[j] = i;
				comp[i].count = 1;
			}
			else
			{
				/* Another occurrence: the counter lives in the first one */
				if (comp[k].count++ >= url_max_occurences)
					return 1;
			}
		}
	}
	/* Test 2: try every sequence length and every starting position */
	for (comp_len = 1; comp_len <= url_max_repeat_length && comp_len <= comps; comp_len++)
		for (rep_prefix = 0; rep_prefix <= comps - comp_len; rep_prefix++)
			if (repeat_count(comp + rep_prefix, comps - rep_prefix, comp_len) >= url_min_repeat_count)
				return comp_len;
	return 0;
}