/* * UCW Library: Reading and writing of UTF-8 on Fastbuf Streams * * (c) 2001--2015 Martin Mares * (c) 2004 Robert Spalek * * This software may be freely distributed and used according to the terms * of the GNU Lesser General Public License. */ #include #include #include #include #include /*** UTF-8 ***/ int bget_utf8_slow(struct fastbuf *b, uint repl) { int c = bgetc(b); int code; if (c < 0x80) /* Includes EOF */ return c; if (c < 0xc0) /* Incorrect combination */ return repl; if (c >= 0xf0) /* Too large, skip it */ { while ((c = bgetc(b)) >= 0x80 && c < 0xc0) ; goto wrong; } if (c >= 0xe0) /* 3 bytes */ { code = c & 0x0f; if ((c = bgetc(b)) < 0x80 || c >= 0xc0) goto wrong; code = (code << 6) | (c & 0x3f); if ((c = bgetc(b)) < 0x80 || c >= 0xc0) goto wrong; code = (code << 6) | (c & 0x3f); if (code < 0x800) goto wrong2; } else /* 2 bytes */ { code = c & 0x1f; if ((c = bgetc(b)) < 0x80 || c >= 0xc0) goto wrong; code = (code << 6) | (c & 0x3f); if (code < 0x80) goto wrong2; } return code; wrong: if (c >= 0) bungetc(b); wrong2: return repl; } int bget_utf8_32_slow(struct fastbuf *b, uint repl) { int c = bgetc(b); int code; int nr; int limit; if (c < 0x80) /* Includes EOF */ return c; if (c < 0xc0) /* Incorrect combination */ return repl; if (c < 0xe0) { code = c & 0x1f; nr = 1; limit = 0x80; } else if (c < 0xf0) { code = c & 0x0f; nr = 2; limit = 0x800; } else if (c < 0xf8) { code = c & 0x07; nr = 3; limit = 1 << 16; } else if (c < 0xfc) { code = c & 0x03; nr = 4; limit = 1 << 21; } else if (c < 0xfe) { code = c & 0x01; nr = 5; limit = 1 << 26; } else /* Too large */ goto wrong2; while (nr-- > 0) { if ((c = bgetc(b)) < 0x80 || c >= 0xc0) goto wrong; code = (code << 6) | (c & 0x3f); } if (code < limit) goto wrong2; return code; wrong: if (c >= 0) bungetc(b); wrong2: return repl; } void bput_utf8_slow(struct fastbuf *b, uint u) { ASSERT(u < 65536); if (u < 0x80) bputc(b, u); else { if (u < 0x800) bputc(b, 0xc0 | (u >> 6)); else { bputc(b, 0xe0 | (u >> 12)); bputc(b, 0x80 | ((u >> 6) & 0x3f)); } bputc(b, 0x80 | (u & 0x3f)); } } void bput_utf8_32_slow(struct fastbuf *b, uint u) { ASSERT(u < (1U<<31)); if (u < 0x80) bputc(b, u); else { if (u < 0x800) bputc(b, 0xc0 | (u >> 6)); else { if (u < (1<<16)) bputc(b, 0xe0 | (u >> 12)); else { if (u < (1<<21)) bputc(b, 0xf0 | (u >> 18)); else { if (u < (1<<26)) bputc(b, 0xf8 | (u >> 24)); else { bputc(b, 0xfc | (u >> 30)); bputc(b, 0x80 | ((u >> 24) & 0x3f)); } bputc(b, 0x80 | ((u >> 18) & 0x3f)); } bputc(b, 0x80 | ((u >> 12) & 0x3f)); } bputc(b, 0x80 | ((u >> 6) & 0x3f)); } bputc(b, 0x80 | (u & 0x3f)); } } /*** UTF-16 ***/ int bget_utf16_be_slow(struct fastbuf *b, uint repl) { if (bpeekc(b) < 0) return -1; uint u = bgetw_be(b), x, y; if ((int)u < 0) return repl; if ((x = u - 0xd800) >= 0x800) return u; if (x >= 0x400 || bpeekc(b) < 0 || (y = bgetw_be(b) - 0xdc00) >= 0x400) return repl; return 0x10000 + (x << 10) + y; } int bget_utf16_le_slow(struct fastbuf *b, uint repl) { if (bpeekc(b) < 0) return -1; uint u = bgetw_le(b), x, y; if ((int)u < 0) return repl; if ((x = u - 0xd800) >= 0x800) return u; if (x >= 0x400 || bpeekc(b) < 0 || (y = bgetw_le(b) - 0xdc00) >= 0x400) return repl; return 0x10000 + (x << 10) + y; } void bput_utf16_be_slow(struct fastbuf *b, uint u) { if (u < 0xd800 || (u < 0x10000 && u >= 0xe000)) { bputc(b, u >> 8); bputc(b, u & 0xff); } else if ((u -= 0x10000) < 0x100000) { bputc(b, 0xd8 | (u >> 18)); bputc(b, (u >> 10) & 0xff); bputc(b, 0xdc | ((u >> 8) & 0x3)); bputc(b, u & 0xff); } else ASSERT(0); } void bput_utf16_le_slow(struct fastbuf *b, uint u) { if (u < 0xd800 || (u < 0x10000 && u >= 0xe000)) { bputc(b, u & 0xff); bputc(b, u >> 8); } else if ((u -= 0x10000) < 0x100000) { bputc(b, (u >> 10) & 0xff); bputc(b, 0xd8 | (u >> 18)); bputc(b, u & 0xff); bputc(b, 0xdc | ((u >> 8) & 0x3)); } else ASSERT(0); } #ifdef TEST #include #include int main(int argc, char **argv) { #define FUNCS \ F(BGET_UTF8) F(BGET_UTF8_32) F(BGET_UTF16_BE) F(BGET_UTF16_LE) \ F(BPUT_UTF8) F(BPUT_UTF8_32) F(BPUT_UTF16_BE) F(BPUT_UTF16_LE) enum { #define F(x) FUNC_##x, FUNCS #undef F }; char *names[] = { #define F(x) [FUNC_##x] = #x, FUNCS #undef F }; uint func = ~0U; if (argc > 1) for (uint i = 0; i < ARRAY_SIZE(names); i++) if (!strcasecmp(names[i], argv[1])) func = i; if (!~func) { fprintf(stderr, "Invalid usage!\n"); return 1; } struct fastbuf *b = fbgrow_create(8); if (func < FUNC_BPUT_UTF8) { uint u; while (scanf("%x", &u) == 1) bputc(b, u); fbgrow_rewind(b); while (bpeekc(b) >= 0) { if (btell(b)) putchar(' '); switch (func) { case FUNC_BGET_UTF8: u = bget_utf8_slow(b, UNI_REPLACEMENT); break; case FUNC_BGET_UTF8_32: u = bget_utf8_32_slow(b, UNI_REPLACEMENT); break; case FUNC_BGET_UTF16_BE: u = bget_utf16_be_slow(b, UNI_REPLACEMENT); break; case FUNC_BGET_UTF16_LE: u = bget_utf16_le_slow(b, UNI_REPLACEMENT); break; default: ASSERT(0); } printf("%04x", u); } putchar('\n'); } else { uint u, i = 0; while (scanf("%x", &u) == 1) { switch (func) { case FUNC_BPUT_UTF8: bput_utf8_slow(b, u); break; case FUNC_BPUT_UTF8_32: bput_utf8_32_slow(b, u); break; case FUNC_BPUT_UTF16_BE: bput_utf16_be_slow(b, u); break; case FUNC_BPUT_UTF16_LE: bput_utf16_le_slow(b, u); break; default: ASSERT(0); } fbgrow_rewind(b); u = 0; while (bpeekc(b) >= 0) { if (i++) putchar(' '); printf("%02x", bgetc(b)); } fbgrow_reset(b); } putchar('\n'); } bclose(b); return 0; } #endif