/* * UCW Library -- A simple XML parser * * (c) 2007--2008 Pavel Charvat * (c) 2015 Martin Mares * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. */ #undef LOCAL_DEBUG #include #include #include #include #include #include #include #include #include #include /*** Basic parsing ***/ void NONRET xml_fatal_expected(struct xml_context *ctx, uint c) { if (c >= 32 && c < 127) xml_fatal(ctx, "Expected '%c'", c); else xml_fatal(ctx, "Expected U+%04x", c); } void NONRET xml_fatal_expected_white(struct xml_context *ctx) { xml_fatal(ctx, "Expected a white space"); } void NONRET xml_fatal_expected_quot(struct xml_context *ctx) { xml_fatal(ctx, "Expected a quotation mark"); } void xml_parse_eq(struct xml_context *ctx) { /* Eq ::= S? '=' S? */ xml_parse_white(ctx, 0); xml_parse_char(ctx, '='); xml_parse_white(ctx, 0); } /*** Memory management ***/ void *xml_do_push(struct xml_context *ctx, uint size) { /* Saves ctx->stack and ctx->flags state */ struct mempool_state state; mp_save(ctx->stack, &state); struct xml_stack *s = mp_alloc(ctx->stack, size); s->state = state; s->flags = ctx->flags; s->next = ctx->stack_list; ctx->stack_list = s; return s; } void xml_do_pop(struct xml_context *ctx, struct xml_stack *s) { /* Restore ctx->stack and ctx->flags state */ ctx->stack_list = s->next; ctx->flags = s->flags; mp_restore(ctx->stack, &s->state); } struct xml_node *xml_push_dom(struct xml_context *ctx, struct mempool_state *state) { /* Create a new DOM node */ TRACE(ctx, "push_dom"); struct xml_dom_stack *s = xml_do_push(ctx, sizeof(*s)); if (state) s->state = *state; else mp_save(ctx->pool, &s->state); struct xml_node *n = mp_alloc(ctx->pool, sizeof(*n)); n->user = NULL; if (n->parent = ctx->node) clist_add_tail(&n->parent->sons, &n->n); return ctx->node = n; } void xml_pop_dom(struct xml_context *ctx, uint free) { /* Leave DOM subtree */ TRACE(ctx, "pop_dom"); ASSERT(ctx->node); struct xml_node *p = ctx->node->parent; struct xml_dom_stack *s = (void *)ctx->stack_list; if (free) { /* See xml_pop_element() for cleanup of attribute hash table */ if (p) clist_remove(&ctx->node->n); mp_restore(ctx->pool, &s->state); } ctx->node = p; xml_do_pop(ctx, &s->stack); } /*** Basics ***/ uint xml_parse_white(struct xml_context *ctx, uint mandatory) { /* mandatory=1 -> S ::= (#x20 | #x9 | #xD | #xA)+ * mandatory=0 -> S? */ uint cnt = 0; while (xml_peek_cat(ctx) & XML_CHAR_WHITE) { xml_skip_char(ctx); cnt++; } if (unlikely(mandatory && !cnt)) xml_fatal_expected_white(ctx); return cnt; } /*** Names and nmtokens ***/ static char * xml_parse_string(struct xml_context *ctx, struct mempool *pool, uint first_cat, uint next_cat, char *err) { char *p = mp_start_noalign(pool, 2); *p++ = '<'; /* We always prepend a '<', so we can seek backwards in the string */ if (unlikely(!(xml_peek_cat(ctx) & first_cat))) xml_fatal(ctx, "%s", err); do { p = mp_spread(pool, p, 5); p = utf8_32_put(p, xml_skip_char(ctx)); } while (xml_peek_cat(ctx) & next_cat); *p++ = 0; return mp_end(pool, p) + 1; } static void xml_skip_string(struct xml_context *ctx, uint first_cat, uint next_cat, char *err) { if (unlikely(!(xml_get_cat(ctx) & first_cat))) xml_fatal(ctx, "%s", err); while (xml_peek_cat(ctx) & next_cat) xml_skip_char(ctx); } char * xml_parse_name(struct xml_context *ctx, struct mempool *pool) { /* Name ::= NameStartChar (NameChar)* */ return xml_parse_string(ctx, pool, ctx->cat_sname, ctx->cat_name, "Expected a name"); } void xml_skip_name(struct xml_context *ctx) { xml_skip_string(ctx, ctx->cat_sname, ctx->cat_name, "Expected a name"); } char * xml_parse_nmtoken(struct xml_context *ctx, struct mempool *pool) { /* Nmtoken ::= (NameChar)+ */ return xml_parse_string(ctx, pool, ctx->cat_name, ctx->cat_name, "Expected a nmtoken"); } /*** Simple literals ***/ char * xml_parse_system_literal(struct xml_context *ctx, struct mempool *pool) { /* SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") */ char *p = mp_start_noalign(pool, 1); uint q = xml_parse_quote(ctx), c; while ((c = xml_get_char(ctx)) != q) { p = mp_spread(pool, p, 5); p = utf8_32_put(p, c); } *p++ = 0; return mp_end(pool, p); } char * xml_parse_pubid_literal(struct xml_context *ctx, struct mempool *pool) { /* PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" */ char *p = mp_start_noalign(pool, 1); uint q = xml_parse_quote(ctx), c; while ((c = xml_get_char(ctx)) != q) { if (unlikely(!(xml_last_cat(ctx) & XML_CHAR_PUBID))) xml_fatal(ctx, "Expected a pubid character"); p = mp_spread(pool, p, 2); *p++ = c; } *p++ = 0; return mp_end(pool, p); } /*** Comments ***/ void xml_push_comment(struct xml_context *ctx) { TRACE(ctx, "push_comment"); /* Comment ::= '' * Already parsed: 'type = XML_NODE_COMMENT; char *p = mp_start_noalign(ctx->pool, 6); while (1) { if (xml_get_char(ctx) == '-') if (xml_get_char(ctx) == '-') break; else *p++ = '-'; p = utf8_32_put(p, xml_last_char(ctx)); p = mp_spread(ctx->pool, p, 6); } xml_parse_char(ctx, '>'); *p = 0; n->len = p - (char *)mp_ptr(ctx->pool); n->text = mp_end(ctx->pool, p + 1); if ((ctx->flags & XML_REPORT_COMMENTS) && ctx->h_comment) ctx->h_comment(ctx); } void xml_pop_comment(struct xml_context *ctx) { xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_COMMENTS)); xml_dec(ctx); TRACE(ctx, "pop_comment"); } void xml_skip_comment(struct xml_context *ctx) { TRACE(ctx, "skip_comment"); xml_parse_char(ctx, '-'); while (xml_get_char(ctx) != '-' || xml_get_char(ctx) != '-'); xml_parse_char(ctx, '>'); xml_dec(ctx); } /*** Processing instructions ***/ void xml_push_pi(struct xml_context *ctx) { TRACE(ctx, "push_pi"); /* Parses a PI to ctx->value and ctx->name: * PI ::= '' Char*)))? '?>' * PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) * Already parsed: 'type = XML_NODE_PI; n->name = xml_parse_name(ctx, ctx->pool); if (unlikely(!strcasecmp(n->name, "xml"))) xml_error(ctx, "Reserved PI target"); char *p = mp_start_noalign(ctx->pool, 5); if (!xml_parse_white(ctx, 0)) xml_parse_seq(ctx, "?>"); else while (1) { if (xml_get_char(ctx) == '?') if (xml_peek_char(ctx) == '>') { xml_skip_char(ctx); break; } else *p++ = '?'; else p = utf8_32_put(p, xml_last_char(ctx)); p = mp_spread(ctx->pool, p, 5); } *p = 0; n->len = p - (char *)mp_ptr(ctx->pool); n->text = mp_end(ctx->pool, p + 1); if ((ctx->flags & XML_REPORT_PIS) && ctx->h_pi) ctx->h_pi(ctx); } void xml_pop_pi(struct xml_context *ctx) { xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_PIS)); xml_dec(ctx); TRACE(ctx, "pop_pi"); } void xml_skip_pi(struct xml_context *ctx) { TRACE(ctx, "skip_pi"); if (ctx->flags & XML_VALIDATING) { struct mempool_state state; mp_save(ctx->stack, &state); if (unlikely(!strcasecmp(xml_parse_name(ctx, ctx->stack), "xml"))) xml_error(ctx, "Reserved PI target"); mp_restore(ctx->stack, &state); if (!xml_parse_white(ctx, 0)) { xml_parse_seq(ctx, "?>"); xml_dec(ctx); return; } } while (1) if (xml_get_char(ctx) == '?') if (xml_peek_char(ctx) == '>') break; xml_skip_char(ctx); xml_dec(ctx); } /*** Character references ***/ uint xml_parse_char_ref(struct xml_context *ctx) { TRACE(ctx, "parse_char_ref"); /* CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' * Already parsed: '&#' */ uint v = 0; if (xml_get_char(ctx) == 'x') { if (!(xml_get_cat(ctx) & XML_CHAR_XDIGIT)) { xml_error(ctx, "Expected a hexadecimal value of character reference"); goto recover; } do { v = (v << 4) + Cxvalue(xml_last_char(ctx)); } while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_XDIGIT)); } else { if (!(xml_last_cat(ctx) & XML_CHAR_DIGIT)) { xml_error(ctx, "Expected a numeric value of character reference"); goto recover; } do { v = v * 10 + xml_last_char(ctx) - '0'; } while (v < 0x110000 && (xml_get_cat(ctx) & XML_CHAR_DIGIT)); } uint cat = xml_char_cat(v); if (!(cat & ctx->cat_unrestricted)) { xml_error(ctx, "Character reference out of range"); goto recover; } if (xml_last_char(ctx) == ';') { xml_dec(ctx); return v; } xml_error(ctx, "Expected ';'"); recover: while (xml_last_char(ctx) != ';') xml_get_char(ctx); xml_dec(ctx); return UNI_REPLACEMENT; } /*** References to general entities ***/ static void xml_parse_ref(struct xml_context *ctx) { /* Reference ::= EntityRef | CharRef * EntityRef ::= '&' Name ';' * Already parsed: '&' */ struct fastbuf *out = &ctx->chars; if (xml_peek_char(ctx) == '#') { xml_skip_char(ctx); bput_utf8_32(out, xml_parse_char_ref(ctx)); } else { TRACE(ctx, "parse_ge_ref"); struct mempool_state state; mp_save(ctx->stack, &state); char *name = xml_parse_name(ctx, ctx->stack); xml_parse_char(ctx, ';'); struct xml_dtd_entity *ent = xml_dtd_find_entity(ctx, name); if (!ent) { xml_error(ctx, "Unknown entity &%s;", name); bputc(out, '&'); bputs(out, name); bputc(out, ';'); } else if (ent->flags & XML_DTD_ENTITY_TRIVIAL) { TRACE(ctx, "Trivial entity &%s;", name); bputs(out, ent->text); } else { TRACE(ctx, "Pushed entity &%s;", name); mp_restore(ctx->stack, &state); xml_dec(ctx); xml_push_entity(ctx, ent); return; } mp_restore(ctx->stack, &state); xml_dec(ctx); } } /*** Character data ***/ void xml_spout_chars(struct fastbuf *fb) { if (fb->bptr < fb->bufend) return; struct xml_context *ctx = SKIP_BACK(struct xml_context, chars, fb); struct mempool *pool = ctx->pool; if (fb->bufend != fb->buffer) { TRACE(ctx, "growing chars"); uint len = fb->bufend - fb->buffer; uint reported = fb->bstop - fb->buffer; fb->buffer = mp_expand(pool); fb->bufend = fb->buffer + mp_avail(pool); fb->bptr = fb->buffer + len; fb->bstop = fb->buffer + reported; } else { TRACE(ctx, "starting chars"); mp_save(pool, &ctx->chars_state); fb->bptr = fb->buffer = fb->bstop = mp_start_noalign(pool, 2); fb->bufend = fb->buffer + mp_avail(pool) - 1; } } static uint xml_end_chars(struct xml_context *ctx, char **out) { struct fastbuf *fb = &ctx->chars; uint len = fb->bptr - fb->buffer; if (len) { TRACE(ctx, "ending chars"); *fb->bptr = 0; *out = mp_end(ctx->pool, fb->bptr + 1); fb->bufend = fb->bstop = fb->bptr = fb->buffer; } return len; } static uint xml_report_chars(struct xml_context *ctx, char **out) { struct fastbuf *fb = &ctx->chars; uint len = fb->bptr - fb->buffer; if (len) { *fb->bptr = 0; *out = fb->bstop; fb->bstop = fb->bptr; } return len; } static uint xml_flush_chars(struct xml_context *ctx) { char *text, *rtext; uint len = xml_end_chars(ctx, &text), rlen; if (len) { if (ctx->flags & XML_NO_CHARS) { if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_ignorable) ctx->h_ignorable(ctx, text, len); mp_restore(ctx->pool, &ctx->chars_state); return 0; } if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) ctx->h_block(ctx, rtext, rlen); if (!(ctx->flags & XML_ALLOC_CHARS) && !(ctx->flags & XML_REPORT_CHARS)) { mp_restore(ctx->pool, &ctx->chars_state); return 0; } struct xml_node *n = xml_push_dom(ctx, &ctx->chars_state); n->type = XML_NODE_CHARS; n->text = text; n->len = len; if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_chars) ctx->h_chars(ctx); } return len; } static void xml_pop_chars(struct xml_context *ctx) { xml_pop_dom(ctx, !(ctx->flags & XML_ALLOC_CHARS)); TRACE(ctx, "pop_chars"); } static void xml_append_chars(struct xml_context *ctx) { TRACE(ctx, "append_chars"); struct fastbuf *out = &ctx->chars; if (ctx->flags & XML_NO_CHARS) while (xml_get_char(ctx) != '<') if (xml_last_cat(ctx) & XML_CHAR_WHITE) bput_utf8_32(out, xml_last_char(ctx)); else { xml_error(ctx, "This element must not contain character data"); while (xml_get_char(ctx) != '<'); break; } else while (xml_get_char(ctx) != '<') if (xml_last_char(ctx) == '&') { xml_inc(ctx); xml_parse_ref(ctx); } else bput_utf8_32(out, xml_last_char(ctx)); xml_unget_char(ctx); } /*** CDATA sections ***/ static void xml_skip_cdata(struct xml_context *ctx) { TRACE(ctx, "skip_cdata"); xml_parse_seq(ctx, "CDATA["); while (xml_get_char(ctx) != ']' || xml_get_char(ctx) != ']' || xml_get_char(ctx) != '>'); xml_dec(ctx); } static void xml_append_cdata(struct xml_context *ctx) { /* CDSect :== '' Char*)) ']]>' * Already parsed: 'flags & XML_NO_CHARS) { xml_error(ctx, "This element must not contain CDATA"); xml_skip_cdata(ctx); return; } xml_parse_seq(ctx, "CDATA["); struct fastbuf *out = &ctx->chars; uint rlen; char *rtext; if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_block && (rlen = xml_report_chars(ctx, &rtext))) ctx->h_block(ctx, rtext, rlen); while (1) { if (xml_get_char(ctx) == ']') { if (xml_get_char(ctx) == ']') if (xml_get_char(ctx) == '>') break; else bputc(out, ']'); bputc(out, ']'); } bput_utf8_32(out, xml_last_char(ctx)); } if ((ctx->flags & XML_REPORT_CHARS) && ctx->h_cdata && (rlen = xml_report_chars(ctx, &rtext))) ctx->h_cdata(ctx, rtext, rlen); xml_dec(ctx); } /*** Attribute values ***/ char * xml_parse_attr_value(struct xml_context *ctx, struct xml_dtd_attr *attr UNUSED) { TRACE(ctx, "parse_attr_value"); /* AttValue ::= '"' ([^<&"] | Reference)* '"' | "'" ([^<&'] | Reference)* "'" */ /* FIXME: -- check value constrains / normalize leading/trailing WS and repeated WS */ struct mempool_state state; uint quote = xml_parse_quote(ctx); mp_save(ctx->stack, &state); struct fastbuf *out = &ctx->chars; struct xml_source *src = ctx->src; while (1) { uint c = xml_get_char(ctx); if (c == '&') { xml_inc(ctx); xml_parse_ref(ctx); } else if (c == quote && src == ctx->src) break; else if (c == '<') xml_error(ctx, "Attribute value must not contain '<'"); else if (xml_last_cat(ctx) & XML_CHAR_WHITE) bputc(out, ' '); else bput_utf8_32(out, c); } mp_restore(ctx->stack, &state); char *text; return xml_end_chars(ctx, &text) ? text : ""; } uint xml_normalize_white(struct xml_context *ctx UNUSED, char *text) { char *s = text, *d = text; while (*s == 0x20) s++; while (1) { while (*s & ~0x20) *d++ = *s++; if (!*s) break; while (*++s == 0x20); *d++ = 0x20; } if (d != text && d[-1] == 0x20) d--; *d = 0; return d - text; } /*** Attributes ***/ static void xml_raw_add_attr(struct xml_context *ctx, struct xml_node *e, char *name, char *value) { struct xml_attr *a = mp_alloc(ctx->pool, sizeof(*a)); a->elem = e; a->ns = 0; /* Namespaces will be resolved later */ a->name = name; a->val = value; a->dtd = NULL; a->user = NULL; /* a->hash will be calculated later */ slist_add_tail(&e->attrs, &a->n); } static inline uint xml_attr_hash(uint ns, char *name) { return hash_string(name) ^ hash_u32(ns); } static void xml_parse_attr(struct xml_context *ctx) { TRACE(ctx, "parse_attr"); /* Attribute ::= Name Eq AttValue */ struct xml_node *e = ctx->node; char *n = xml_parse_name(ctx, ctx->pool); xml_parse_eq(ctx); char *v = xml_parse_attr_value(ctx, NULL); xml_raw_add_attr(ctx, e, n, v); } static void xml_process_attr(struct xml_context *ctx, struct xml_attr *a) { struct xml_node *e = a->elem; a->hash = xml_attr_hash(a->ns, a->name); XML_ATTR_FOR_EACH(a2, e) { if (a2 == a) break; if (a2->hash == a->hash && a2->ns == a->ns && !strcmp(a2->name, a->name)) xml_error(ctx, "Attribute %s is not unique in element <%s>", xml_attr_qname(ctx, a), xml_node_qname(ctx, e)); } } struct xml_attr * xml_attr_find(struct xml_context *ctx, struct xml_node *node, char *name) { return xml_attr_find_ns(ctx, node, 0, name); } struct xml_attr * xml_attr_find_ns(struct xml_context *ctx UNUSED, struct xml_node *node, uint ns, char *name) { ASSERT(node->type == XML_NODE_ELEM); uint hash = xml_attr_hash(ns, name); XML_ATTR_FOR_EACH(a, node) if (a->hash == hash && a->ns == ns && !strcmp(a->name, name)) return a; return NULL; } char * xml_attr_value_ns(struct xml_context *ctx, struct xml_node *node, uint ns, char *name) { struct xml_attr *attr = xml_attr_find_ns(ctx, node, ns, name); if (attr) return attr->val; if (!node->dtd) return NULL; if (ns) /* So far, our DTD support is not namespace-aware */ return NULL; struct xml_dtd_attr *dtd = xml_dtd_find_attr(ctx, node->dtd, name); return dtd ? dtd->default_value : NULL; } char * xml_attr_value(struct xml_context *ctx, struct xml_node *node, char *name) { return xml_attr_value_ns(ctx, node, 0, name); } char * xml_attr_qname(struct xml_context *ctx UNUSED, struct xml_attr *attr) { char *n = attr->name; while (n[-1] != '<') n--; return n; } /*** Elements ***/ static uint xml_validate_element(struct xml_dtd_elem_node *root, struct xml_dtd_elem *elem) { if (root->elem) return elem == root->elem; else SLIST_FOR_EACH(struct xml_dtd_elem_node *, son, root->sons) if (xml_validate_element(son, elem)) return 1; return 0; } static void xml_push_element(struct xml_context *ctx) { TRACE(ctx, "push_element"); /* EmptyElemTag | STag * EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' * STag ::= '<' Name (S Attribute)* S? '>' * Already parsed: '<' */ struct xml_node *e = xml_push_dom(ctx, NULL); clist_init(&e->sons); e->type = XML_NODE_ELEM; e->name = xml_parse_name(ctx, ctx->pool); slist_init(&e->attrs); if (!e->parent) { ctx->dom = e; if (ctx->doctype && strcmp(e->name, ctx->doctype)) xml_error(ctx, "The root element <%s> does not match the document type <%s>", e->name, ctx->doctype); } if (!ctx->dtd) e->dtd = NULL; else if (!(e->dtd = xml_dtd_find_elem(ctx, e->name))) xml_error(ctx, "Undefined element <%s>", e->name); else { struct xml_dtd_elem *dtd = e->dtd, *parent_dtd = e->parent ? e->parent->dtd : NULL; if (dtd->type == XML_DTD_ELEM_MIXED) ctx->flags &= ~XML_NO_CHARS; else ctx->flags |= XML_NO_CHARS; if (parent_dtd) if (parent_dtd->type == XML_DTD_ELEM_EMPTY) xml_error(ctx, "Empty element must not contain children"); else if (parent_dtd->type != XML_DTD_ELEM_ANY) { // FIXME: validate regular expressions if (!xml_validate_element(parent_dtd->node, dtd)) xml_error(ctx, "Unexpected element <%s>", e->name); } } /* Parse attributes */ while (1) { uint white = xml_parse_white(ctx, 0); uint c = xml_get_char(ctx); if (c == '/') { xml_parse_char(ctx, '>'); ctx->flags |= XML_EMPTY_ELEM_TAG; break; } else if (c == '>') break; else if (!white) xml_fatal_expected_white(ctx); xml_unget_char(ctx); xml_parse_attr(ctx); } /* Resolve namespaces */ xml_ns_push_element(ctx); /* Once we have namespaces, hash attribute names */ XML_ATTR_FOR_EACH(a, e) xml_process_attr(ctx, a); /* FIXME: DTD logic is not namespace-aware */ if (e->dtd) { XML_ATTR_FOR_EACH(a, e) { if (!(a->dtd = xml_dtd_find_attr(ctx, e->dtd, a->name))) xml_error(ctx, "Undefined attribute %s in element <%s>", a->name, e->name); else xml_validate_attr(ctx, a->dtd, a->val); } SLIST_FOR_EACH(struct xml_dtd_attr *, a, e->dtd->attrs) { if (a->default_mode == XML_ATTR_REQUIRED) { if (!xml_attr_find(ctx, e, a->name)) xml_error(ctx, "Missing required attribute %s in element <%s>", a->name, e->name); } else if (a->default_mode != XML_ATTR_IMPLIED && ctx->flags & XML_ALLOC_DEFAULT_ATTRS) { if (!xml_attr_find(ctx, e, a->name)) xml_raw_add_attr(ctx, e, a->name, a->default_value); } } } if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_stag) ctx->h_stag(ctx); } static void xml_pop_element(struct xml_context *ctx) { TRACE(ctx, "pop_element"); if ((ctx->flags & XML_REPORT_TAGS) && ctx->h_etag) ctx->h_etag(ctx); xml_ns_pop_element(ctx); struct xml_node *e = ctx->node; uint free = !(ctx->flags & XML_ALLOC_TAGS); if (free) { if (!e->parent) ctx->dom = NULL; #if 0 /* * With the current data structures, freeing of attributes is not necessary, * but it might be if we switch to a global hash table of large elements. */ SLIST_FOR_EACH(struct xml_attr *, a, e->attrs) xml_attrs_remove(ctx->tab_attrs, a); struct xml_node *n; while (n = clist_head(&e->sons)) { if (n->type == XML_NODE_ELEM) { SLIST_FOR_EACH(struct xml_attr *, a, n->attrs) xml_attrs_remove(ctx->tab_attrs, a); clist_insert_list_after(&n->sons, &n->n); } clist_remove(&n->n); } #endif } xml_pop_dom(ctx, free); xml_dec(ctx); } static void xml_parse_etag(struct xml_context *ctx) { /* ETag ::= '' * Already parsed: '<' */ struct xml_node *e = ctx->node; ASSERT(e); char *n = xml_node_qname(ctx, e); while (*n) { uint c; n = utf8_32_get(n, &c); if (xml_get_char(ctx) != c) goto recover; } xml_parse_white(ctx, 0); if (xml_get_char(ctx) != '>') { recover: xml_error(ctx, "Invalid ETag, expected ", xml_node_qname(ctx, e)); while (xml_get_char(ctx) != '>'); } xml_dec(ctx); } char * xml_node_qname(struct xml_context *ctx UNUSED, struct xml_node *node) { ASSERT(node->type == XML_NODE_ELEM); char *n = node->name; while (n[-1] != '<') n--; return n; } /*** Document type declaration ***/ static void xml_parse_doctype_decl(struct xml_context *ctx) { TRACE(ctx, "parse_doctype_decl"); /* doctypedecl ::= '' * Already parsed: '' */ if (ctx->doctype) xml_fatal(ctx, "Multiple document types not allowed"); xml_parse_seq(ctx, "DOCTYPE"); xml_parse_white(ctx, 1); ctx->doctype = xml_parse_name(ctx, ctx->pool); TRACE(ctx, "doctype=%s", ctx->doctype); uint c; if (xml_parse_white(ctx, 0) && ((c = xml_peek_char(ctx)) == 'S' || c == 'P')) { if (c == 'S') { xml_parse_seq(ctx, "SYSTEM"); xml_parse_white(ctx, 1); ctx->system_id = xml_parse_system_literal(ctx, ctx->pool); } else { xml_parse_seq(ctx, "PUBLIC"); xml_parse_white(ctx, 1); ctx->public_id = xml_parse_pubid_literal(ctx, ctx->pool); xml_parse_white(ctx, 1); ctx->system_id = xml_parse_system_literal(ctx, ctx->pool); } xml_parse_white(ctx, 0); ctx->flags |= XML_HAS_EXTERNAL_SUBSET; } if (xml_peek_char(ctx) == '[') { ctx->flags |= XML_HAS_INTERNAL_SUBSET; xml_skip_char(ctx); xml_inc(ctx); } if (ctx->h_doctype_decl) ctx->h_doctype_decl(ctx); } /////////////////////////////////////////////////////////////////////////////////////////////////////////// /* DTD: Internal subset */ static void xml_parse_subset(struct xml_context *ctx, uint external) { // FIXME: // -- comments/pi have no parent // -- conditional sections in external subset // -- check corectness of parameter entities /* '[' intSubset ']' * intSubset :== (markupdecl | DeclSep) * Already parsed: '[' * * extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* */ while (1) { xml_parse_white(ctx, 0); uint c = xml_get_char(ctx); xml_inc(ctx); if (c == '<') if ((c = xml_get_char(ctx)) == '!') switch (c = xml_get_char(ctx)) { case '-': xml_push_comment(ctx); xml_pop_comment(ctx); break; case 'N': xml_parse_seq(ctx, "OTATION"); xml_parse_notation_decl(ctx); break; case 'E': if ((c = xml_get_char(ctx)) == 'N') { xml_parse_seq(ctx, "TITY"); xml_parse_entity_decl(ctx); } else if (c == 'L') { xml_parse_seq(ctx, "EMENT"); xml_parse_element_decl(ctx); } else goto invalid_markup; break; case 'A': xml_parse_seq(ctx, "TTLIST"); xml_parse_attr_list_decl(ctx); break; default: goto invalid_markup; } else if (c == '?') { xml_push_pi(ctx); xml_pop_pi(ctx); } else goto invalid_markup; else if (c == '%') xml_parse_pe_ref(ctx); else if (c == ']' && !external) { break; } else if (c == '>' && external) { break; } else goto invalid_markup; } xml_dec(ctx); return; invalid_markup: ; xml_fatal(ctx, "Invalid markup in the %s subset", external ? "external" : "internal"); } /*** The State Machine ***/ uint xml_next(struct xml_context *ctx) { /* A nasty state machine */ #define PULL(x) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##x; /* fall-thru */ case XML_STATE_##x: ; } while (0) #define PULL_STATE(x, s) do { if (ctx->pull & XML_PULL_##x) return ctx->state = XML_STATE_##s, XML_STATE_##x; /* fall-thru */ case XML_STATE_##s: ; } while (0) TRACE(ctx, "xml_next (state=%u)", ctx->state); jmp_buf throw_buf; ctx->throw_buf = &throw_buf; if (setjmp(throw_buf)) { error: if (ctx->err_code == XML_ERR_EOF && ctx->h_fatal) ctx->h_fatal(ctx); TRACE(ctx, "raised fatal error"); return ctx->state = XML_STATE_EOF; } uint c; switch (ctx->state) { case XML_STATE_START: TRACE(ctx, "entering prolog"); ctx->flags |= XML_SRC_DOCUMENT | XML_SRC_EXPECTED_DECL; if (ctx->h_document_start) ctx->h_document_start(ctx); /* XMLDecl */ xml_refill(ctx); if (ctx->h_xml_decl) ctx->h_xml_decl(ctx); PULL(XML_DECL); /* Misc* (doctypedecl Misc*)? */ while (1) { xml_parse_white(ctx, 0); xml_parse_char(ctx, '<'); xml_inc(ctx); if ((c = xml_get_char(ctx)) == '?') /* Processing intruction */ if (!(ctx->flags & XML_REPORT_PIS)) xml_skip_pi(ctx); else { xml_push_pi(ctx); PULL_STATE(PI, PROLOG_PI); xml_pop_pi(ctx); } else if (c != '!') { /* Found the root tag */ xml_unget_char(ctx); goto first_tag; } else if (xml_get_char(ctx) == '-') if (!(ctx->flags & XML_REPORT_COMMENTS)) xml_skip_comment(ctx); else { xml_push_comment(ctx); PULL_STATE(COMMENT, PROLOG_COMMENT); xml_pop_comment(ctx); } else { /* DocTypeDecl */ xml_unget_char(ctx); xml_parse_doctype_decl(ctx); PULL(DOCTYPE_DECL); if (ctx->flags & XML_HAS_DTD) if (ctx->flags & XML_PARSE_DTD) { xml_dtd_init(ctx); if (ctx->h_dtd_start) ctx->h_dtd_start(ctx); if (ctx->flags & XML_HAS_INTERNAL_SUBSET) { xml_parse_subset(ctx, 0); xml_dec(ctx); } if (ctx->flags & XML_HAS_EXTERNAL_SUBSET) { struct xml_dtd_entity ent = { .system_id = ctx->system_id, .public_id = ctx->public_id, }; xml_parse_white(ctx, 0); xml_parse_char(ctx, '>'); xml_unget_char(ctx); ASSERT(ctx->h_resolve_entity); ctx->h_resolve_entity(ctx, &ent); ctx->flags |= XML_SRC_EXPECTED_DECL; xml_parse_subset(ctx, 1); xml_unget_char(ctx);; } if (ctx->h_dtd_end) ctx->h_dtd_end(ctx); } else if (ctx->flags & XML_HAS_INTERNAL_SUBSET) xml_skip_internal_subset(ctx); xml_parse_white(ctx, 0); xml_parse_char(ctx, '>'); xml_dec(ctx); } } case XML_STATE_CHARS: while (1) { if (xml_peek_char(ctx) != '<') { /* CharData */ xml_append_chars(ctx); continue; } else xml_skip_char(ctx); xml_inc(ctx); first_tag: if ((c = xml_get_char(ctx)) == '?') { /* PI */ if (!(ctx->flags & (XML_REPORT_PIS | XML_ALLOC_PIS))) xml_skip_pi(ctx); else { if (xml_flush_chars(ctx)) { PULL_STATE(CHARS, CHARS_BEFORE_PI); xml_pop_chars(ctx); } xml_push_pi(ctx); PULL(PI); xml_pop_pi(ctx); } } else if (c == '!') if ((c = xml_get_char(ctx)) == '-') { /* Comment */ if (!(ctx->flags & (XML_REPORT_COMMENTS | XML_ALLOC_COMMENTS))) xml_skip_comment(ctx); else { if (xml_flush_chars(ctx)) { PULL_STATE(CHARS, CHARS_BEFORE_COMMENT); xml_pop_chars(ctx); } xml_push_comment(ctx); PULL(COMMENT); xml_pop_comment(ctx); } } else if (c == '[') { /* CDATA */ xml_append_cdata(ctx); } else xml_fatal(ctx, "Unexpected character after 'flags & XML_EMPTY_ELEM_TAG) goto pop_element; } else { /* ETag */ if (xml_flush_chars(ctx)) { PULL_STATE(CHARS, CHARS_BEFORE_ETAG); xml_pop_chars(ctx); } xml_parse_etag(ctx); pop_element: PULL(ETAG); xml_pop_element(ctx); if (!ctx->node) goto epilog; } } epilog: /* Misc* */ TRACE(ctx, "entering epilog"); while (1) { /* Epilog whitespace is the only place, where a valid document can reach EOF */ if (setjmp(throw_buf)) if (ctx->err_code == XML_ERR_EOF) { TRACE(ctx, "reached EOF"); ctx->state = XML_STATE_EOF; if (ctx->h_document_end) ctx->h_document_end(ctx); case XML_STATE_EOF: ctx->err_code = 0; ctx->err_msg = NULL; return XML_STATE_EOF; } else goto error; xml_parse_white(ctx, 0); if (setjmp(throw_buf)) goto error; /* Misc */ xml_parse_char(ctx, '<'); xml_inc(ctx); if ((c = xml_get_char(ctx)) == '?') /* Processing instruction */ if (!(ctx->flags & XML_REPORT_PIS)) xml_skip_pi(ctx); else { xml_push_pi(ctx); PULL_STATE(PI, EPILOG_PI); xml_pop_pi(ctx); } else if (c == '!') { xml_parse_char(ctx, '-'); /* Comment */ if (!(ctx->flags & XML_REPORT_COMMENTS)) xml_skip_comment(ctx); else { xml_push_comment(ctx); PULL_STATE(COMMENT, EPILOG_COMMENT); xml_pop_comment(ctx); } } else xml_fatal(ctx, "Syntax error in the epilog"); } } ASSERT(0); } uint xml_next_state(struct xml_context *ctx, uint pull) { uint saved = ctx->pull; ctx->pull = pull; uint res = xml_next(ctx); ctx->pull = saved; return res; } uint xml_skip_element(struct xml_context *ctx) { ASSERT(ctx->state == XML_STATE_STAG); struct xml_node *node = ctx->node; uint saved = ctx->pull, res; ctx->pull = XML_PULL_ETAG; while ((res = xml_next(ctx)) && ctx->node != node); ctx->pull = saved; return res; } uint xml_parse(struct xml_context *ctx) { /* This cycle should run only once unless the user overrides the value of ctx->pull in a SAX handler */ do { ctx->pull = 0; } while (xml_next(ctx)); return ctx->err_code; } char * xml_merge_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) { ASSERT(node->type == XML_NODE_ELEM); char *p = mp_start_noalign(pool, 1); XML_NODE_FOR_EACH(son, node) if (son->type == XML_NODE_CHARS) { p = mp_spread(pool, p, son->len + 1); memcpy(p, son->text, son->len); p += son->len; } *p++ = 0; return mp_end(pool, p); } static char * xml_append_dom_chars(char *p, struct mempool *pool, struct xml_node *node) { XML_NODE_FOR_EACH(son, node) if (son->type == XML_NODE_CHARS) { p = mp_spread(pool, p, son->len + 1); memcpy(p, son->text, son->len); p += son->len; } else if (son->type == XML_NODE_ELEM) p = xml_append_dom_chars(p, pool, son); return p; } char * xml_merge_dom_chars(struct xml_context *ctx UNUSED, struct xml_node *node, struct mempool *pool) { ASSERT(node->type == XML_NODE_ELEM); char *p = mp_start_noalign(pool, 1); p = xml_append_dom_chars(p, pool, node); *p++ = 0; return mp_end(pool, p); }