# (Source-listing residue: 195 lines, 4.3 KiB)
# Character Set Table Generator 1.0
# (c) 1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
# This program can be freely distributed and used according to the terms
# of the GNU General Public License.
# Internal codes 0..255 are mapped to Unicode U+0000..U+00FF
# Internal code 256 is the replacement character (U+FFFD)
# Number of charset definition files collected in @charsets.
$ncs = 0;

print "/* Generated by tabgen 1.0, please don't edit manually. */\n\n";

# Phase 1: read the charset list.  Each non-blank, non-comment input line
# (from the files named on the command line, or standard input if none)
# is taken as the name of one charset definition file.
print STDERR "Charset list...\n";

while (<>) {
    chomp;
    # Skip blank lines and '#' comments.  The test must be \s*, not \w*:
    # \w* would also discard any file name made only of word characters.
    next if /^\s*$/ || /^#/;
    $charsets[$ncs++] = $_;
}
print STDERR "Found $ncs charsets, counting unique codes...\n";

# Phase 2: seed the internal-code <-> Unicode maps.
#   %u2x : Unicode code point -> internal code
#   @x2u : internal code      -> Unicode code point
# Internal codes 0..255 are identical to Unicode 0..255.
for ($unique = 0; $unique < 256; $unique++) {
    $u2x{$unique} = $unique;
    $x2u[$unique] = $unique;
}

# Internal code 256 is the replacement character U+FFFD.  After this,
# $unique (257) is the next free internal code.
$u2x{0xFFFD} = $unique;
$x2u[$unique++] = 0xFFFD;
# Phase 3: emit input_to_x[charset][byte] -> internal code, allocating a new
# internal code for every Unicode value not seen before.  Also fills
#   $cc[charset][byte]   = internal code for that byte
#   $cx[charset]{code}   = byte producing that internal code (last one wins)
print "static unsigned short int input_to_x[$ncs][256] = {\n";

for my $cs (0 .. $ncs - 1) {
    # Trim surrounding whitespace (including a leftover newline): the old
    # two-argument open() did this implicitly, three-argument open() does not.
    (my $file = $charsets[$cs]) =~ s/^\s+|\s+$//g;
    print "\n/* $file */\n{\n";

    # Charset file format: <hex byte> TAB <hex Unicode> [TAB <anything>].
    open(my $fh, '<', $file) or die "Error opening $file: $!";
    while (<$fh>) {
        chomp;
        next if /^\s*$/ || /^#/;
        my ($byte_hex, $uni_hex) = split /\t/;
        next unless defined $uni_hex;      # malformed line: no mapping on it
        $cc[$cs][hex $byte_hex] = $uni_hex;
    }
    close $fh;

    for my $byte (0 .. 255) {
        # Bytes the charset file does not define map to U+FFFD.
        my $uni = hex(defined $cc[$cs][$byte] ? $cc[$cs][$byte] : "FFFD");
        if (!defined $u2x{$uni}) {
            $x2u[$unique] = $uni;
            $u2x{$uni}    = $unique++;
        }
        my $code = $u2x{$uni};
        print "$code,", ($byte % 16 == 15) ? "\n" : " ";
        $cc[$cs][$byte] = $code;
        $cx[$cs]{$code} = $byte;
    }
    print "},\n";
}
print "};\n\n";
print STDERR "$unique unique codes...\n";

# Phase 4: emit x_to_uni[internal code] -> Unicode code point,
# sixteen entries per output line.
print "static unsigned short int x_to_uni[$unique] = {\n";
for my $code (0 .. $unique - 1) {
    print "$x2u[$code],", ($code % 16 == 15) ? "\n" : " ";
}
# Terminate the last row if it was not a full sixteen entries.
print "\n" if $unique % 16;
print "};\n\n";
# Phase 5: emit the two-level Unicode -> internal code lookup:
# one 256-entry table per used page (high byte), plus an index of pages.
print STDERR "UNICODE table...\n";

# Mark every 256-code-point page that holds at least one known character.
for my $code (0 .. $unique - 1) {
    $pg[ $x2u[$code] >> 8 ] = 1;
}

for my $page (0 .. 255) {
    next unless $pg[$page];
    print "static unsigned short int uni_to_x_$page\[256\] = {\n";
    for my $low (0 .. 255) {
        my $uni = 256 * $page + $low;
        # Code points with no internal code map to 256 (the replacement).
        my $code = defined $u2x{$uni} ? $u2x{$uni} : 256;
        print "$code,", ($low % 16 == 15) ? "\n" : " ";
    }
    print "};\n\n";
}

print "static unsigned short int *uni_to_x[256] = {\n";

# Page 255 is reused below as the table for every unused page, so it must
# not map anything except U+FFFD itself.
for my $uni (0xFF00 .. 0xFFFF) {
    die "Invalid replacement strategy!"
        if defined $u2x{$uni} && $uni != 0xFFFD;
}

for my $page (0 .. 255) {
    print "uni_to_x_", ($pg[$page] ? $page : "255"), ",",
        ($page % 4 == 3) ? "\n" : " ";
}
print "};\n\n";
# Phase 6: load decompositions from UnicodeData.txt into %expand, which maps
# a decimal code point to a space-separated list of decimal code points.
print STDERR "UniData file...\n";

open(my $unidata, '<', "unidata/UnicodeData.txt")
    or die "No UnicodeData file: $!";
while (<$unidata>) {
    chomp;
    # Only field 0 (code point, hex) and field 5 (decomposition) are used.
    my ($num, undef, undef, undef, undef, $exp) = split /;/;
    next if !defined $exp || $exp eq "";

    # Drop a leading "<tag>" marker, then convert the hex code points to
    # decimal, leaving out U+0020 (space).
    $exp =~ s/^<.*> *//;
    my @parts = map { hex($_) } grep { $_ ne "0020" } split / /, $exp;
    $expand{hex $num} = join " ", @parts;
}
close $unidata;
# Phase 7: optional user accent rules, one "<decimal> <decimal>" pair per
# line, overriding %expand.  A missing file is not an error.
print STDERR "Accent rules\n";

if (open(my $acc, '<', "misc/user_unacc")) {
    while (<$acc>) {
        next if /^\s*$/ || /^#/;
        # Trailing whitespace (e.g. a CR from DOS line ends) is tolerated.
        my ($from, $to) = /^(\d+)\s+(\d+)\s*$/
            or die "Syntax error in user accent rules";
        $expand{$from} = $to;
    }
    close $acc;
}
# Phase 8: optional user expansions, "<decimal> <replacement list>" per line,
# overriding %expand.  A missing file is not an error.
print STDERR "Character expansions\n";

if (open(my $extra, '<', "misc/user_expand")) {
    while (<$extra>) {
        next if /^\s*$/ || /^#/;
        # The replacement may be empty; trailing whitespace is not part of it.
        my ($from, $to) = /^(\d+)\s+(.*?)\s*$/
            or die "Syntax error in user expansions";
        $expand{$from} = $to;
    }
    close $extra;
}
# Phase 9: emit x_to_output[charset][internal code].  An entry below 256 is
# the single output byte; any other entry is an offset into string_table,
# recorded in %string (bytes -> offset) and %strval (offset -> bytes).
print "static unsigned short int x_to_output[$ncs][$unique] = {\n";

# String-table offsets start at 256 so they cannot collide with byte values.
$pstr = 256;
for my $cs (0 .. $ncs - 1) {
    print "\n/* $charsets[$cs] */\n{\n";
    for my $code (0 .. $unique - 1) {
        # Rewrite the code-point list until it stops changing: keep what the
        # charset can encode, substitute %expand for what it cannot, and
        # drop code points that have neither.
        my $seq = $x2u[$code];
        my $prev;
        my $passes = 0;
        do {
            # Mutually recursive user expansions would never settle.
            die sprintf("Expansion of U+%04X does not terminate", $x2u[$code])
                if ++$passes > 100;
            $prev = $seq;
            my @next;
            for my $uni (split / /, $prev) {
                my $k = $u2x{$uni};
                if (defined $k && defined $cx[$cs]{$k}) {
                    push @next, $uni;
                }
                elsif (defined $expand{$uni}) {
                    push @next, $expand{$uni};
                }
            }
            $seq = join " ", @next;
        } while ($prev ne $seq);

        # Encode the settled list as charset bytes.  Internal code 256 (the
        # replacement character) never produces output.
        my $bytes = "";
        for my $uni (split / /, $prev) {
            my $k = $u2x{$uni};
            next unless defined $k && $k != 256;
            my $byte = $cx[$cs]{$k};
            $bytes .= pack("C", $byte) if defined $byte;
        }

        my $out;
        if (length($bytes) == 1) {
            $out = unpack("C", $bytes);
        }
        else {
            # Empty or multi-byte result: share one string-table entry per
            # distinct byte string (1 length byte + the bytes).
            if (!defined $string{$bytes}) {
                die "Expansion string too long for string_table"
                    if length($bytes) > 255;
                die "string_table offset exceeds unsigned short"
                    if $pstr > 0xFFFF;
                $string{$bytes} = $pstr;
                $strval{$pstr}  = $bytes;
                $pstr += 1 + length $bytes;
            }
            $out = $string{$bytes};
        }
        print "$out,", ($code % 16 == 15) ? "\n" : " ";
    }
    # Terminate the last row if it was not a full sixteen entries.
    print "\n" if $unique % 16;
    print "},\n";
}
print "};\n\n";
# Phase 10: emit string_table, one row per string in offset order:
# a length byte followed by the string's bytes.
print STDERR "And Tubular Bells...\n";

print "static unsigned char string_table[] = {\n";
# Offsets start at 256 and each entry occupies 1 + length bytes, so stepping
# by that amount visits exactly the offsets stored in %strval.
for (my $off = 256; $off < $pstr; ) {
    my $str = $strval{$off};
    print length($str), ",";
    print " $_," for unpack("C*", $str);
    print "\n";
    $off += 1 + length $str;
}
print "};\n";

print STDERR "Done.\n";