sksp2024-mcu/libucw/ucw-xml/source.c

487 lines
13 KiB
C
Raw Normal View History

2024-09-14 21:50:33 +02:00
/*
* UCW Library -- A simple XML parser
*
* (c) 2007--2008 Pavel Charvat <pchar@ucw.cz>
*
* This software may be freely distributed and used according to the terms
* of the GNU Lesser General Public License.
*/
#undef LOCAL_DEBUG
#include <ucw/lib.h>
#include <ucw-xml/xml.h>
#include <ucw-xml/dtd.h>
#include <ucw-xml/internals.h>
#include <ucw/unicode.h>
#include <ucw/ff-unicode.h>
#include <charset/charconv.h>
#include <charset/fb-charconv.h>
/*** Character categorization ***/
#include "obj/ucw-xml/unicat.c"
/*
 * Select the character category masks used by the parser according to
 * the XML version recorded in ctx->flags (XML_VERSION_1_1 set => XML 1.1,
 * otherwise XML 1.0).
 */
static void
xml_init_cats(struct xml_context *ctx)
{
  if (ctx->flags & XML_VERSION_1_1)
    {
      /* XML 1.1: valid and unrestricted characters are distinct sets */
      ctx->cat_chars = XML_CHAR_VALID_1_1;
      ctx->cat_unrestricted = XML_CHAR_UNRESTRICTED_1_1;
      ctx->cat_new_line = XML_CHAR_NEW_LINE_1_1;
      ctx->cat_name = XML_CHAR_NAME_1_1;
      ctx->cat_sname = XML_CHAR_SNAME_1_1;
      return;
    }

  /* XML 1.0: the same mask serves for both valid and unrestricted characters */
  ctx->cat_chars = XML_CHAR_VALID_1_0;
  ctx->cat_unrestricted = XML_CHAR_VALID_1_0;
  ctx->cat_new_line = XML_CHAR_NEW_LINE_1_0;
  ctx->cat_name = XML_CHAR_NAME_1_0;
  ctx->cat_sname = XML_CHAR_SNAME_1_0;
}
/*** Reading of document/external entities ***/
/*
 * Record an "Unexpected EOF" error (code XML_ERR_EOF) in the context and
 * hand control to xml_throw().  Declared NONRET.
 */
static void NONRET
xml_eof(struct xml_context *ctx)
{
  ctx->err_code = XML_ERR_EOF;
  ctx->err_msg = "Unexpected EOF";
  xml_throw(ctx);
}
/*
 * Report a fatal error saying that an entity is not nested correctly.
 * Simply forwards a fixed message to xml_fatal(); declared NONRET.
 */
void NONRET
xml_fatal_nested(struct xml_context *ctx)
{
xml_fatal(ctx, "Entity is not nested correctly");
}
/*
 * Append one character to the decoded buffer.  Each character occupies
 * two consecutive u32 slots: the code point followed by its category as
 * computed by xml_char_cat().  *bstop is advanced past both slots.
 */
static inline void
xml_add_char(u32 **bstop, uint c)
{
  u32 *slot = *bstop;
  slot[0] = c;
  slot[1] = xml_char_cat(c);
  *bstop = slot + 2;
}
/*
 * Push a new, zero-initialized input source on top of the source stack.
 *
 * The read window (bptr/bstop) of the currently active source, if any, is
 * saved into it so that xml_pop_source() can restore it later.  The new
 * source remembers the current nesting depth, the context depth is reset
 * to zero, the per-source flags are cleared and the read window is set to
 * the (empty) start of the new source's buffer.
 *
 * Returns the newly allocated source (allocated from ctx->stack).
 */
struct xml_source *
xml_push_source(struct xml_context *ctx)
{
  xml_push(ctx);

  /* Park the read position of the enclosing source */
  struct xml_source *parent = ctx->src;
  if (parent)
    {
      parent->bptr = ctx->bptr;
      parent->bstop = ctx->bstop;
    }

  struct xml_source *fresh = mp_alloc_zero(ctx->stack, sizeof(*fresh));
  fresh->next = parent;
  fresh->saved_depth = ctx->depth;

  ctx->src = fresh;
  ctx->depth = 0;
  ctx->bptr = fresh->buf;
  ctx->bstop = fresh->buf;
  ctx->flags &= ~(XML_SRC_EOF | XML_SRC_EXPECTED_DECL | XML_SRC_DOCUMENT);
  return fresh;
}
/*
 * Push a new input source that reads from the given fastbuf.
 * Returns the new source; xml_close_source() later calls bclose() on fb.
 */
struct xml_source *
xml_push_fastbuf(struct xml_context *ctx, struct fastbuf *fb)
{
  struct xml_source *pushed = xml_push_source(ctx);
  pushed->fb = fb;
  return pushed;
}
/*
 * Close the fastbuf(s) belonging to a source: first the one currently
 * used for reading, then the original stream it wraps (set by
 * xml_init_charconv()), if there is one.
 */
static void
xml_close_source(struct xml_source *src)
{
  bclose(src->fb);

  struct fastbuf *inner = src->wrapped_fb;
  if (inner != NULL)
    bclose(inner);
}
/*
 * Finish the current input source and return to the enclosing one.
 *
 * It is a fatal error if the source ends at a non-zero nesting depth or
 * if there is no source at all.  The source's streams are closed, the
 * saved depth is restored and the read window of the enclosing source is
 * reinstated.  When no enclosing source remains, xml_eof() is raised.
 */
static void
xml_pop_source(struct xml_context *ctx)
{
  TRACE(ctx, "pop_source");
  if (unlikely(ctx->depth != 0))
    xml_fatal(ctx, "Unexpected end of entity");

  struct xml_source *finished = ctx->src;
  if (!finished)
    xml_fatal(ctx, "Undefined source");
  xml_close_source(finished);
  ctx->depth = finished->saved_depth;

  /* Switch back to the enclosing source and its saved read window */
  struct xml_source *outer = finished->next;
  ctx->src = outer;
  if (outer)
    {
      ctx->bptr = outer->bptr;
      ctx->bstop = outer->bstop;
    }
  xml_pop(ctx);

  if (unlikely(!outer))
    xml_eof(ctx);
}
void
xml_sources_cleanup(struct xml_context *ctx)
{
struct xml_source *s;
while (s = ctx->src)
{
ctx->src = s->next;
xml_close_source(s);
}
}
static void xml_refill_utf8(struct xml_context *ctx);
/*
 * Entity resolver which does not resolve anything: it only reports an
 * error that references to external entities are not supported.  The
 * entity argument is unused.  (Presumably installed as the default
 * h_resolve_entity hook -- confirm at the place where the context is
 * initialized.)
 */
void
xml_def_resolve_entity(struct xml_context *ctx, struct xml_dtd_entity *ent UNUSED)
{
xml_error(ctx, "References to external entities are not supported");
}
/*
 * Start reading the replacement text of an entity.
 *
 * Internal entities get a fresh source reading directly from the
 * in-memory entity text.  For external entities the h_resolve_entity
 * hook (which must be set) is called, a declaration is expected at the
 * start of the input and whatever source is current after the hook
 * returns is used.
 *
 * In both cases the source starts with the UTF-8 reader and with the
 * category masks for ordinary content.
 */
void
xml_push_entity(struct xml_context *ctx, struct xml_dtd_entity *ent)
{
  TRACE(ctx, "xml_push_entity");
  struct xml_source *src;

  if (!(ent->flags & XML_DTD_ENTITY_EXTERNAL))
    {
      /* Internal entity: wrap its text by the fastbuf embedded in the source */
      src = xml_push_source(ctx);
      src->fb = &src->wrap_fb;
      fbbuf_init_read(src->fb, ent->text, strlen(ent->text), 0);
    }
  else
    {
      ASSERT(ctx->h_resolve_entity);
      ctx->h_resolve_entity(ctx, ent);
      ctx->flags |= XML_SRC_EXPECTED_DECL;
      src = ctx->src;
    }

  src->refill = xml_refill_utf8;
  src->refill_cat2 = ctx->cat_new_line;
  src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line;
}
/*
 * Report a character which must not appear in the input.
 *
 * The value ~1U is reported as "Corrupted encoding" (the refill
 * functions pass it to the decoders as their extra argument); any other
 * value is reported as a restricted character.  Always returns
 * UNI_REPLACEMENT, which the caller stores instead of the offending
 * character.
 */
static uint
xml_error_restricted(struct xml_context *ctx, uint c)
{
  if (c != ~1U)
    xml_error(ctx, "Restricted char U+%04X", c);
  else
    xml_error(ctx, "Corrupted encoding");
  return UNI_REPLACEMENT;
}
static void xml_parse_decl(struct xml_context *ctx);
/*
 * REFILL(ctx, func, params...) -- body shared by all xml_refill_*() functions.
 *
 * Decodes characters from the current source's fastbuf by calling
 * func(fb, params...) and appends them to the source buffer as pairs of
 * (code point, category) until the buffer is full, EOF is reached or a '>'
 * not covered by refill_cat1 is read.
 *
 *  - If the buffer has been completely consumed (bptr == bstop), writing
 *    restarts at the beginning of src->buf; otherwise new data is
 *    appended after bstop.
 *  - Characters whose category intersects refill_cat1 are stored as they are.
 *  - Characters whose category intersects refill_cat2 are line breaks.
 *    Each is stored as a single 0xA and the row counter is incremented.
 *    last_0xd marks the position right after a newline produced by 0xD;
 *    a line-break character other than 0x2028 which immediately follows
 *    it is dropped, so that the two-character sequences listed below
 *    yield only one newline.  The state is kept across calls in
 *    src->pending_0xd.
 *  - '>' reaches its own branch only when its category is excluded from
 *    refill_cat1 (xml_parse_decl() does that); it is stored and the loop
 *    stops so that the caller can switch the encoding.
 *  - Any other value with some bit cleared (~c != 0) is reported by
 *    xml_error_restricted() and replaced by its return value.
 *  - A value with all bits set sets XML_SRC_EOF and stops the loop.
 *
 * The macro declares local variables (src, fb, c, t1, t2, row, bend, bstop,
 * last_0xd), so it can be expanded only once per function.
 * NOTE(review): the loop stores before checking bstop < bend, so it seems to
 * rely on the caller never invoking it with a completely full buffer -- confirm.
 */
#define REFILL(ctx, func, params...) \
struct xml_source *src = ctx->src; \
struct fastbuf *fb = src->fb; \
if (ctx->bptr == ctx->bstop) \
ctx->bptr = ctx->bstop = src->buf; \
uint c, t1 = src->refill_cat1, t2 = src->refill_cat2, row = src->row; \
u32 *bend = src->buf + ARRAY_SIZE(src->buf), *bstop = ctx->bstop, \
*last_0xd = src->pending_0xd ? bstop : NULL; \
do \
{ \
c = func(fb, ##params); \
uint t = xml_char_cat(c); \
if (t & t1) \
/* Typical branch */ \
*bstop++ = c, *bstop++ = t; \
else if (t & t2) \
{ \
/* New line */ \
/* XML 1.0: 0xA | 0xD | 0xD 0xA */ \
/* XML 1.1: 0xA | 0xD | 0xD 0xA | 0x85 | 0xD 0x85 | 0x2028 */ \
if (c == 0xd) \
last_0xd = bstop + 2; \
else if (c != 0x2028 && last_0xd == bstop) \
{ \
last_0xd = NULL; \
continue; \
} \
xml_add_char(&bstop, 0xa), row++; \
} \
else if (c == '>') \
{ \
/* Used only in XML/TextDecl to switch the encoding */ \
*bstop++ = c, *bstop++ = t; \
break; \
} \
else if (~c) \
/* Restricted character */ \
xml_add_char(&bstop, xml_error_restricted(ctx, c)); \
else \
{ \
/* EOF */ \
ctx->flags |= XML_SRC_EOF; \
break; \
} \
} \
while (bstop < bend); \
src->pending_0xd = (last_0xd == bstop); \
ctx->bstop = bstop; \
src->row = row;
/*
 * Refill callback for UTF-8 input: expands REFILL with bget_utf8_repl() as
 * the decoder.  The extra argument ~1U is the value xml_error_restricted()
 * reports as "Corrupted encoding" (presumably what the decoder returns for
 * malformed sequences -- confirm in ucw/ff-unicode.h).
 */
static void
xml_refill_utf8(struct xml_context *ctx)
{
REFILL(ctx, bget_utf8_repl, ~1U);
}
/*
 * Refill callback for little-endian UTF-16 input: expands REFILL with
 * bget_utf16_le_repl() as the decoder.  The extra argument ~1U is the value
 * xml_error_restricted() reports as "Corrupted encoding" (presumably what
 * the decoder returns for malformed sequences -- confirm in ucw/ff-unicode.h).
 */
static void
xml_refill_utf16_le(struct xml_context *ctx)
{
REFILL(ctx, bget_utf16_le_repl, ~1U);
}
/*
 * Refill callback for big-endian UTF-16 input: expands REFILL with
 * bget_utf16_be_repl() as the decoder.  The extra argument ~1U is the value
 * xml_error_restricted() reports as "Corrupted encoding" (presumably what
 * the decoder returns for malformed sequences -- confirm in ucw/ff-unicode.h).
 */
static void
xml_refill_utf16_be(struct xml_context *ctx)
{
REFILL(ctx, bget_utf16_be_repl, ~1U);
}
#undef REFILL
/*
 * Make at least one character available in the read window.
 *
 * Repeats until bptr != bstop.  In each round exactly one action is taken:
 * a source which has hit EOF is popped (which raises an error when no
 * source remains), a pending XML/text declaration is parsed, or the
 * source's refill callback is invoked.
 */
void
xml_refill(struct xml_context *ctx)
{
  for (;;)
    {
      if (ctx->flags & XML_SRC_EOF)
        xml_pop_source(ctx);
      else if (!(ctx->flags & XML_SRC_EXPECTED_DECL))
        {
          ctx->src->refill(ctx);
          TRACE(ctx, "refilled %u characters", (uint)((ctx->bstop - ctx->bptr) / 2));
        }
      else
        xml_parse_decl(ctx);

      if (ctx->bptr != ctx->bstop)
        return;
    }
}
/*
 * Compute the 1-based row of the current read position in a source.
 *
 * src->row counts all line breaks decoded into the buffer so far; the
 * line breaks which have been buffered but not yet consumed (those between
 * bptr and bstop) are subtracted.  The buffer holds (character, category)
 * pairs, so the category of each entry sits in the odd slot.
 */
static uint
xml_source_row(struct xml_context *ctx, struct xml_source *src)
{
  uint unread_breaks = 0;
  for (u32 *entry = ctx->bptr; entry != ctx->bstop; entry += 2)
    if (entry[1] & src->refill_cat2)
      unread_breaks++;
  return src->row - unread_breaks + 1;
}
/*
 * Return the row of the current read position in the active source,
 * or 0 when the context has no source.
 */
uint
xml_row(struct xml_context *ctx)
{
  if (!ctx->src)
    return 0;
  return xml_source_row(ctx, ctx->src);
}
/* Document/external entity header */
/*
 * Parse a quoted encoding name and return it as a zero-terminated string
 * allocated from ctx->pool.
 *
 * The first character must belong to XML_CHAR_ENC_SNAME and every following
 * one to XML_CHAR_ENC_NAME; parsing stops at the closing quote, which must
 * match the opening one.  Any other character is a fatal error.
 */
static char *
xml_parse_encoding_name(struct xml_context *ctx)
{
  /* EncName ::= '"' [A-Za-z] ([A-Za-z0-9._] | '-')* '"' | "'" [A-Za-z] ([A-Za-z0-9._] | '-')* "'" */
  char *out = mp_start_noalign(ctx->pool, 1);
  uint quote = xml_parse_quote(ctx);

  if (unlikely(!(xml_get_cat(ctx) & XML_CHAR_ENC_SNAME)))
    xml_fatal(ctx, "Invalid character in the encoding name");

  for (;;)
    {
      /* Keep room for the current character and the final terminator */
      out = mp_spread(ctx->pool, out, 2);
      *out++ = xml_last_char(ctx);
      if (xml_get_char(ctx) == quote)
        break;
      if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_ENC_NAME)))
        xml_fatal(ctx, "Invalid character in the encoding name");
    }

  *out++ = 0;
  return mp_end(ctx->pool, out);
}
/*
 * Make the current source read through a charset-converting wrapper.
 *
 * The stream used so far is remembered in src->wrapped_fb (so that
 * xml_close_source() closes it as well) and src->fb is replaced by the
 * result of fb_wrap_charconv_in() converting from charset cs to UTF-8.
 */
static void
xml_init_charconv(struct xml_context *ctx, int cs)
{
  // XXX: with a direct access to libucw-charset tables could be faster
  struct xml_source *src = ctx->src;
  TRACE(ctx, "wrapping charset %s", charset_name(cs));

  struct fastbuf *raw = src->fb;
  src->wrapped_fb = raw;
  src->fb = fb_wrap_charconv_in(raw, cs, CONV_CHARSET_UTF8);
}
/*
 * Parse the XMLDecl (document entity) or the optional TextDecl (external
 * entity) at the beginning of the current source and choose the decoder.
 *
 * Steps:
 *  1. Clear XML_SRC_EXPECTED_DECL and set up category masks under which
 *     refilling stops after every '>' (XML_CHAR_GT is removed from
 *     refill_cat1), so the encoding can be switched right after "?>".
 *  2. Pick the initial decoder.  Without src->fb_encoding the first byte is
 *     examined (0xfe => UTF-16BE, 0xff => UTF-16LE, otherwise UTF-8).
 *     With src->fb_encoding a charset known to find_charset_by_name() is
 *     wrapped by a converter to UTF-8; otherwise the UTF-16 names are
 *     recognized and anything else is reported as unknown.
 *  3. Skip the byte order mark (its absence is an error for UTF-16).
 *  4. Look for "<?xml" followed by white space.  A missing declaration is
 *     fatal for the document entity; for other entities it is an error only
 *     when an encoding other than UTF-8/UTF-16 is expected.
 *  5. Parse version, encoding and (document entity only) standalone.
 *  6. Switch to / verify the declared encoding.
 *  7. Restore the category masks for ordinary content.
 *
 * Fixes against the previous revision:
 *  - the UTF-16 name tests were missing the negation of strcasecmp(), so
 *    every name except "UTF-16" was treated as UTF-16 and the
 *    "Unknown encoding" branch could never run;
 *  - the missing-header check passed src->expected_encoding to strcasecmp()
 *    although only the local expected_encoding was tested for NULL.
 */
static void
xml_parse_decl(struct xml_context *ctx)
{
  TRACE(ctx, "xml_parse_decl");
  struct xml_source *src = ctx->src;
  ctx->flags &= ~XML_SRC_EXPECTED_DECL;
  uint doc = ctx->flags & XML_SRC_DOCUMENT;

  /* Setup valid Unicode ranges and force the reader to abort refill() after each '>', where we can switch encoding or XML version */
  if (doc)
    xml_init_cats(ctx);
  src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line & ~XML_CHAR_GT;
  src->refill_cat2 = ctx->cat_new_line;

  /* Initialize the supplied charset (if any) or try to guess it */
  char *expected_encoding = src->expected_encoding;
  src->refill = xml_refill_utf8;
  int bom = bpeekc(src->fb);
  if (bom < 0)
    ctx->flags |= XML_SRC_EOF;
  if (!src->fb_encoding)
    {
      /* Guess UTF-16 from the first byte of a possible byte order mark */
      if (bom == 0xfe)
        src->refill = xml_refill_utf16_be;
      else if (bom == 0xff)
        src->refill = xml_refill_utf16_le;
    }
  else
    {
      int cs = find_charset_by_name(src->fb_encoding);
      if (cs == CONV_CHARSET_UTF8)
        {}
      else if (cs >= 0)
        {
          xml_init_charconv(ctx, cs);
          /* The converter produces UTF-8, the peeked byte no longer says anything about a BOM */
          bom = 0;
        }
      else if (!strcasecmp(src->fb_encoding, "UTF-16"))
        {
          /* Endianity not given: big-endian unless the first byte says otherwise */
          src->refill = xml_refill_utf16_be;
          if (bom == 0xff)
            src->refill = xml_refill_utf16_le;
        }
      else if (!strcasecmp(src->fb_encoding, "UTF-16BE"))
        src->refill = xml_refill_utf16_be;
      else if (!strcasecmp(src->fb_encoding, "UTF-16LE"))
        src->refill = xml_refill_utf16_le;
      else
        {
          xml_error(ctx, "Unknown encoding '%s'", src->fb_encoding);
          expected_encoding = NULL;
        }
    }
  uint utf16 = src->refill == xml_refill_utf16_le || src->refill == xml_refill_utf16_be;
  if (utf16)
    src->fb_encoding = (src->refill == xml_refill_utf16_be) ? "UTF-16BE" : "UTF-16LE";
  if (!expected_encoding)
    expected_encoding = src->fb_encoding;
  if (bom > 0 && xml_peek_char(ctx) == 0xfeff)
    xml_skip_char(ctx);
  else if (utf16)
    xml_error(ctx, "Missing or corrupted BOM");
  TRACE(ctx, "Initial encoding=%s", src->fb_encoding ? : "?");

  /* Look ahead for presence of XMLDecl or optional TextDecl */
  if (!(ctx->flags & XML_SRC_EOF) && ctx->bstop != src->buf + ARRAY_SIZE(src->buf))
    xml_refill(ctx);
  u32 *bptr = ctx->bptr;
  /* The buffer holds (char, category) pairs: 6 characters "<?xml" + white space occupy 12 slots */
  uint have_decl = (12 <= ctx->bstop - ctx->bptr && (bptr[11] & XML_CHAR_WHITE) &&
    bptr[0] == '<' && bptr[2] == '?' && (bptr[4] & 0xdf) == 'X' && (bptr[6] & 0xdf) == 'M' && (bptr[8] & 0xdf) == 'L');
  if (!have_decl)
    {
      if (doc)
        xml_fatal(ctx, "Missing or corrupted XML header");
      else if (expected_encoding && strcasecmp(expected_encoding, "UTF-8") && !utf16)
        xml_error(ctx, "Missing or corrupted entity header");
      goto exit;
    }
  ctx->bptr = bptr + 12;
  xml_parse_white(ctx, 0);

  /* Parse version string (mandatory in XMLDecl, optional in TextDecl) */
  if (xml_peek_char(ctx) == 'v')
    {
      xml_parse_seq(ctx, "version");
      xml_parse_eq(ctx);
      char *version = xml_parse_pubid_literal(ctx, ctx->pool);
      TRACE(ctx, "version=%s", version);
      uint v = 0;
      if (!strcmp(version, "1.1"))
        v = XML_VERSION_1_1;
      else if (strcmp(version, "1.0"))
        {
          xml_error(ctx, "Unknown XML version string '%s'", version);
          version = "1.0";
        }
      if (doc)
        {
          ctx->version_str = version;
          ctx->flags |= v;
        }
      else if (v > (ctx->flags & XML_VERSION_1_1))
        xml_error(ctx, "XML 1.1 external entity included from XML 1.0 document");
      if (!xml_parse_white(ctx, !doc))
        goto end;
    }
  else if (doc)
    {
      xml_error(ctx, "Expected XML version");
      ctx->version_str = "1.0";
    }

  /* Parse encoding string (optional in XMLDecl, mandatory in TextDecl) */
  if (xml_peek_char(ctx) == 'e')
    {
      xml_parse_seq(ctx, "encoding");
      xml_parse_eq(ctx);
      src->decl_encoding = xml_parse_encoding_name(ctx);
      TRACE(ctx, "encoding=%s", src->decl_encoding);
      if (!xml_parse_white(ctx, 0))
        goto end;
    }
  else if (!doc)
    xml_error(ctx, "Expected XML encoding");

  /* Parse whether the document is standalone (optional in XMLDecl) */
  if (doc && xml_peek_char(ctx) == 's')
    {
      xml_parse_seq(ctx, "standalone");
      xml_parse_eq(ctx);
      uint c = xml_parse_quote(ctx);
      if ((ctx->standalone = (xml_peek_char(ctx) == 'y')))
        xml_parse_seq(ctx, "yes");
      else
        xml_parse_seq(ctx, "no");
      xml_parse_char(ctx, c);
      TRACE(ctx, "standalone=%d", ctx->standalone);
      xml_parse_white(ctx, 0);
    }
end:
  xml_parse_seq(ctx, "?>");

  /* Switch to the final encoding */
  if (src->decl_encoding)
    {
      int cs = find_charset_by_name(src->decl_encoding);
      if (cs < 0 && !expected_encoding)
        xml_error(ctx, "Unknown encoding '%s'", src->decl_encoding);
      else if (!src->fb_encoding && cs >= 0 && cs != CONV_CHARSET_UTF8)
        {
          xml_init_charconv(ctx, cs);
          src->fb_encoding = src->decl_encoding;
        }
      /* A declared UTF-16 variant is accepted as long as it does not contradict the detected endianity */
      else if (expected_encoding && strcasecmp(src->decl_encoding, expected_encoding) && (!utf16 ||
        !(!strcasecmp(src->decl_encoding, "UTF-16") ||
         (!strcasecmp(src->decl_encoding, "UTF-16BE") && strcasecmp(expected_encoding, "UTF-16LE")) ||
         (!strcasecmp(src->decl_encoding, "UTF-16LE") && strcasecmp(expected_encoding, "UTF-16BE")))))
        xml_error(ctx, "The header contains encoding '%s' instead of expected '%s'", src->decl_encoding, expected_encoding);
    }
  if (!src->fb_encoding)
    src->fb_encoding = "UTF-8";
  TRACE(ctx, "Final encoding=%s", src->fb_encoding);

exit:
  /* Update valid Unicode ranges */
  if (doc)
    xml_init_cats(ctx);
  src->refill_cat1 = ctx->cat_unrestricted & ~ctx->cat_new_line;
  src->refill_cat2 = ctx->cat_new_line;
}