You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
357 lines
6.5 KiB
357 lines
6.5 KiB
3 months ago
|
/*
|
||
|
* UCW Library: Reading and writing of UTF-8 on Fastbuf Streams
|
||
|
*
|
||
|
* (c) 2001--2015 Martin Mares <mj@ucw.cz>
|
||
|
* (c) 2004 Robert Spalek <robert@ucw.cz>
|
||
|
*
|
||
|
* This software may be freely distributed and used according to the terms
|
||
|
* of the GNU Lesser General Public License.
|
||
|
*/
|
||
|
|
||
|
#include <ucw/lib.h>
|
||
|
#include <ucw/fastbuf.h>
|
||
|
#include <ucw/unicode.h>
|
||
|
#include <ucw/ff-unicode.h>
|
||
|
#include <ucw/ff-binary.h>
|
||
|
|
||
|
/*** UTF-8 ***/
|
||
|
|
||
|
int
|
||
|
bget_utf8_slow(struct fastbuf *b, uint repl)
|
||
|
{
|
||
|
int c = bgetc(b);
|
||
|
int code;
|
||
|
|
||
|
if (c < 0x80) /* Includes EOF */
|
||
|
return c;
|
||
|
if (c < 0xc0) /* Incorrect combination */
|
||
|
return repl;
|
||
|
if (c >= 0xf0) /* Too large, skip it */
|
||
|
{
|
||
|
while ((c = bgetc(b)) >= 0x80 && c < 0xc0)
|
||
|
;
|
||
|
goto wrong;
|
||
|
}
|
||
|
if (c >= 0xe0) /* 3 bytes */
|
||
|
{
|
||
|
code = c & 0x0f;
|
||
|
if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
|
||
|
goto wrong;
|
||
|
code = (code << 6) | (c & 0x3f);
|
||
|
if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
|
||
|
goto wrong;
|
||
|
code = (code << 6) | (c & 0x3f);
|
||
|
if (code < 0x800)
|
||
|
goto wrong2;
|
||
|
}
|
||
|
else /* 2 bytes */
|
||
|
{
|
||
|
code = c & 0x1f;
|
||
|
if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
|
||
|
goto wrong;
|
||
|
code = (code << 6) | (c & 0x3f);
|
||
|
if (code < 0x80)
|
||
|
goto wrong2;
|
||
|
}
|
||
|
return code;
|
||
|
|
||
|
wrong:
|
||
|
if (c >= 0)
|
||
|
bungetc(b);
|
||
|
wrong2:
|
||
|
return repl;
|
||
|
}
|
||
|
|
||
|
int
|
||
|
bget_utf8_32_slow(struct fastbuf *b, uint repl)
|
||
|
{
|
||
|
int c = bgetc(b);
|
||
|
int code;
|
||
|
int nr;
|
||
|
int limit;
|
||
|
|
||
|
if (c < 0x80) /* Includes EOF */
|
||
|
return c;
|
||
|
if (c < 0xc0) /* Incorrect combination */
|
||
|
return repl;
|
||
|
if (c < 0xe0)
|
||
|
{
|
||
|
code = c & 0x1f;
|
||
|
nr = 1;
|
||
|
limit = 0x80;
|
||
|
}
|
||
|
else if (c < 0xf0)
|
||
|
{
|
||
|
code = c & 0x0f;
|
||
|
nr = 2;
|
||
|
limit = 0x800;
|
||
|
}
|
||
|
else if (c < 0xf8)
|
||
|
{
|
||
|
code = c & 0x07;
|
||
|
nr = 3;
|
||
|
limit = 1 << 16;
|
||
|
}
|
||
|
else if (c < 0xfc)
|
||
|
{
|
||
|
code = c & 0x03;
|
||
|
nr = 4;
|
||
|
limit = 1 << 21;
|
||
|
}
|
||
|
else if (c < 0xfe)
|
||
|
{
|
||
|
code = c & 0x01;
|
||
|
nr = 5;
|
||
|
limit = 1 << 26;
|
||
|
}
|
||
|
else /* Too large */
|
||
|
goto wrong2;
|
||
|
while (nr-- > 0)
|
||
|
{
|
||
|
if ((c = bgetc(b)) < 0x80 || c >= 0xc0)
|
||
|
goto wrong;
|
||
|
code = (code << 6) | (c & 0x3f);
|
||
|
}
|
||
|
if (code < limit)
|
||
|
goto wrong2;
|
||
|
return code;
|
||
|
|
||
|
wrong:
|
||
|
if (c >= 0)
|
||
|
bungetc(b);
|
||
|
wrong2:
|
||
|
return repl;
|
||
|
}
|
||
|
|
||
|
void
|
||
|
bput_utf8_slow(struct fastbuf *b, uint u)
|
||
|
{
|
||
|
ASSERT(u < 65536);
|
||
|
if (u < 0x80)
|
||
|
bputc(b, u);
|
||
|
else
|
||
|
{
|
||
|
if (u < 0x800)
|
||
|
bputc(b, 0xc0 | (u >> 6));
|
||
|
else
|
||
|
{
|
||
|
bputc(b, 0xe0 | (u >> 12));
|
||
|
bputc(b, 0x80 | ((u >> 6) & 0x3f));
|
||
|
}
|
||
|
bputc(b, 0x80 | (u & 0x3f));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void
|
||
|
bput_utf8_32_slow(struct fastbuf *b, uint u)
|
||
|
{
|
||
|
ASSERT(u < (1U<<31));
|
||
|
if (u < 0x80)
|
||
|
bputc(b, u);
|
||
|
else
|
||
|
{
|
||
|
if (u < 0x800)
|
||
|
bputc(b, 0xc0 | (u >> 6));
|
||
|
else
|
||
|
{
|
||
|
if (u < (1<<16))
|
||
|
bputc(b, 0xe0 | (u >> 12));
|
||
|
else
|
||
|
{
|
||
|
if (u < (1<<21))
|
||
|
bputc(b, 0xf0 | (u >> 18));
|
||
|
else
|
||
|
{
|
||
|
if (u < (1<<26))
|
||
|
bputc(b, 0xf8 | (u >> 24));
|
||
|
else
|
||
|
{
|
||
|
bputc(b, 0xfc | (u >> 30));
|
||
|
bputc(b, 0x80 | ((u >> 24) & 0x3f));
|
||
|
}
|
||
|
bputc(b, 0x80 | ((u >> 18) & 0x3f));
|
||
|
}
|
||
|
bputc(b, 0x80 | ((u >> 12) & 0x3f));
|
||
|
}
|
||
|
bputc(b, 0x80 | ((u >> 6) & 0x3f));
|
||
|
}
|
||
|
bputc(b, 0x80 | (u & 0x3f));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*** UTF-16 ***/
|
||
|
|
||
|
int
|
||
|
bget_utf16_be_slow(struct fastbuf *b, uint repl)
|
||
|
{
|
||
|
if (bpeekc(b) < 0)
|
||
|
return -1;
|
||
|
uint u = bgetw_be(b), x, y;
|
||
|
if ((int)u < 0)
|
||
|
return repl;
|
||
|
if ((x = u - 0xd800) >= 0x800)
|
||
|
return u;
|
||
|
if (x >= 0x400 || bpeekc(b) < 0 || (y = bgetw_be(b) - 0xdc00) >= 0x400)
|
||
|
return repl;
|
||
|
return 0x10000 + (x << 10) + y;
|
||
|
}
|
||
|
|
||
|
int
|
||
|
bget_utf16_le_slow(struct fastbuf *b, uint repl)
|
||
|
{
|
||
|
if (bpeekc(b) < 0)
|
||
|
return -1;
|
||
|
uint u = bgetw_le(b), x, y;
|
||
|
if ((int)u < 0)
|
||
|
return repl;
|
||
|
if ((x = u - 0xd800) >= 0x800)
|
||
|
return u;
|
||
|
if (x >= 0x400 || bpeekc(b) < 0 || (y = bgetw_le(b) - 0xdc00) >= 0x400)
|
||
|
return repl;
|
||
|
return 0x10000 + (x << 10) + y;
|
||
|
}
|
||
|
|
||
|
void
|
||
|
bput_utf16_be_slow(struct fastbuf *b, uint u)
|
||
|
{
|
||
|
if (u < 0xd800 || (u < 0x10000 && u >= 0xe000))
|
||
|
{
|
||
|
bputc(b, u >> 8);
|
||
|
bputc(b, u & 0xff);
|
||
|
}
|
||
|
else if ((u -= 0x10000) < 0x100000)
|
||
|
{
|
||
|
bputc(b, 0xd8 | (u >> 18));
|
||
|
bputc(b, (u >> 10) & 0xff);
|
||
|
bputc(b, 0xdc | ((u >> 8) & 0x3));
|
||
|
bputc(b, u & 0xff);
|
||
|
}
|
||
|
else
|
||
|
ASSERT(0);
|
||
|
}
|
||
|
|
||
|
void
|
||
|
bput_utf16_le_slow(struct fastbuf *b, uint u)
|
||
|
{
|
||
|
if (u < 0xd800 || (u < 0x10000 && u >= 0xe000))
|
||
|
{
|
||
|
bputc(b, u & 0xff);
|
||
|
bputc(b, u >> 8);
|
||
|
}
|
||
|
else if ((u -= 0x10000) < 0x100000)
|
||
|
{
|
||
|
bputc(b, (u >> 10) & 0xff);
|
||
|
bputc(b, 0xd8 | (u >> 18));
|
||
|
bputc(b, u & 0xff);
|
||
|
bputc(b, 0xdc | ((u >> 8) & 0x3));
|
||
|
}
|
||
|
else
|
||
|
ASSERT(0);
|
||
|
}
|
||
|
|
||
|
#ifdef TEST
|
||
|
|
||
|
#include <stdlib.h>
|
||
|
#include <stdio.h>
|
||
|
|
||
|
int main(int argc, char **argv)
|
||
|
{
|
||
|
#define FUNCS \
|
||
|
F(BGET_UTF8) F(BGET_UTF8_32) F(BGET_UTF16_BE) F(BGET_UTF16_LE) \
|
||
|
F(BPUT_UTF8) F(BPUT_UTF8_32) F(BPUT_UTF16_BE) F(BPUT_UTF16_LE)
|
||
|
|
||
|
enum {
|
||
|
#define F(x) FUNC_##x,
|
||
|
FUNCS
|
||
|
#undef F
|
||
|
};
|
||
|
char *names[] = {
|
||
|
#define F(x) [FUNC_##x] = #x,
|
||
|
FUNCS
|
||
|
#undef F
|
||
|
};
|
||
|
|
||
|
uint func = ~0U;
|
||
|
if (argc > 1)
|
||
|
for (uint i = 0; i < ARRAY_SIZE(names); i++)
|
||
|
if (!strcasecmp(names[i], argv[1]))
|
||
|
func = i;
|
||
|
if (!~func)
|
||
|
{
|
||
|
fprintf(stderr, "Invalid usage!\n");
|
||
|
return 1;
|
||
|
}
|
||
|
|
||
|
struct fastbuf *b = fbgrow_create(8);
|
||
|
if (func < FUNC_BPUT_UTF8)
|
||
|
{
|
||
|
uint u;
|
||
|
while (scanf("%x", &u) == 1)
|
||
|
bputc(b, u);
|
||
|
fbgrow_rewind(b);
|
||
|
while (bpeekc(b) >= 0)
|
||
|
{
|
||
|
if (btell(b))
|
||
|
putchar(' ');
|
||
|
switch (func)
|
||
|
{
|
||
|
case FUNC_BGET_UTF8:
|
||
|
u = bget_utf8_slow(b, UNI_REPLACEMENT);
|
||
|
break;
|
||
|
case FUNC_BGET_UTF8_32:
|
||
|
u = bget_utf8_32_slow(b, UNI_REPLACEMENT);
|
||
|
break;
|
||
|
case FUNC_BGET_UTF16_BE:
|
||
|
u = bget_utf16_be_slow(b, UNI_REPLACEMENT);
|
||
|
break;
|
||
|
case FUNC_BGET_UTF16_LE:
|
||
|
u = bget_utf16_le_slow(b, UNI_REPLACEMENT);
|
||
|
break;
|
||
|
default:
|
||
|
ASSERT(0);
|
||
|
}
|
||
|
printf("%04x", u);
|
||
|
}
|
||
|
putchar('\n');
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
uint u, i = 0;
|
||
|
while (scanf("%x", &u) == 1)
|
||
|
{
|
||
|
switch (func)
|
||
|
{
|
||
|
case FUNC_BPUT_UTF8:
|
||
|
bput_utf8_slow(b, u);
|
||
|
break;
|
||
|
case FUNC_BPUT_UTF8_32:
|
||
|
bput_utf8_32_slow(b, u);
|
||
|
break;
|
||
|
case FUNC_BPUT_UTF16_BE:
|
||
|
bput_utf16_be_slow(b, u);
|
||
|
break;
|
||
|
case FUNC_BPUT_UTF16_LE:
|
||
|
bput_utf16_le_slow(b, u);
|
||
|
break;
|
||
|
default:
|
||
|
ASSERT(0);
|
||
|
}
|
||
|
fbgrow_rewind(b);
|
||
|
u = 0;
|
||
|
while (bpeekc(b) >= 0)
|
||
|
{
|
||
|
if (i++)
|
||
|
putchar(' ');
|
||
|
printf("%02x", bgetc(b));
|
||
|
}
|
||
|
fbgrow_reset(b);
|
||
|
}
|
||
|
putchar('\n');
|
||
|
}
|
||
|
bclose(b);
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
#endif
|