Page Menu
Home
Phorge
Search
Configure Global Search
Log In
Files
F120823940
mkchartable.pl
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Authored By
Unknown
Size
17 KB
Referenced Files
None
Subscribers
None
mkchartable.pl
View Options
#
!
/
usr
/
bin
/
perl
#
#
mkchartable
.
pl
--
Generate
character
set
mapping
table
#
#
Copyright
(
c
)
1994
-
2008
Carnegie
Mellon
University
.
All
rights
reserved
.
#
#
Redistribution
and
use
in
source
and
binary
forms
,
with
or
without
#
modification
,
are
permitted
provided
that
the
following
conditions
#
are
met
:
#
#
1.
Redistributions
of
source
code
must
retain
the
above
copyright
#
notice
,
this
list
of
conditions
and
the
following
disclaimer
.
#
#
2.
Redistributions
in
binary
form
must
reproduce
the
above
copyright
#
notice
,
this
list
of
conditions
and
the
following
disclaimer
in
#
the
documentation
and
/
or
other
materials
provided
with
the
#
distribution
.
#
#
3.
The
name
"Carnegie Mellon University"
must
not
be
used
to
#
endorse
or
promote
products
derived
from
this
software
without
#
prior
written
permission
.
For
permission
or
any
legal
#
details
,
please
contact
#
Carnegie
Mellon
University
#
Center
for
Technology
Transfer
and
Enterprise
Creation
#
4615
Forbes
Avenue
#
Suite
302
#
Pittsburgh
,
PA
15213
#
(
412
)
268
-
7393
,
fax
:
(
412
)
268
-
7395
#
innovation@andrew
.
cmu
.
edu
#
#
4.
Redistributions
of
any
form
whatsoever
must
retain
the
following
#
acknowledgment
:
#
"This product includes software developed by Computing Services
# at Carnegie Mellon University (http://www.cmu.edu/computing/)."
#
#
CARNEGIE
MELLON
UNIVERSITY
DISCLAIMS
ALL
WARRANTIES
WITH
REGARD
TO
#
THIS
SOFTWARE
,
INCLUDING
ALL
IMPLIED
WARRANTIES
OF
MERCHANTABILITY
#
AND
FITNESS
,
IN
NO
EVENT
SHALL
CARNEGIE
MELLON
UNIVERSITY
BE
LIABLE
#
FOR
ANY
SPECIAL
,
INDIRECT
OR
CONSEQUENTIAL
DAMAGES
OR
ANY
DAMAGES
#
WHATSOEVER
RESULTING
FROM
LOSS
OF
USE
,
DATA
OR
PROFITS
,
WHETHER
IN
#
AN
ACTION
OF
CONTRACT
,
NEGLIGENCE
OR
OTHER
TORTIOUS
ACTION
,
ARISING
#
OUT
OF
OR
IN
CONNECTION
WITH
THE
USE
OR
PERFORMANCE
OF
THIS
SOFTWARE
.
use
strict
;
use
warnings
;
#
n
.
b
.
this
script
MUST
NOT
'use locale'
as
it
depends
on
the
stability
#
of
perl's default sort order (which is changed by locale-awareness)
use IO::File;
use Getopt::Long;
my @maps;
my @charsets;
my $aliasfile;
my %codemap;
my $output;
GetOptions( 'map
|
m
=
s' => \@maps, #strings
'aliases
|
a
=
s' => \$aliasfile, #string
'output
|
o
=
s' => \$output); #string
open (OUTPUT, ">$output");
# charsets were probably specified in shell-glob order, which can vary by locale.
# sort them here, so that the ordering in our generated files is stable.
@charsets = sort @ARGV;
printheader(\@maps, \@charsets);
# first we parse the chartable unicode mappings and the fixes
# file to build the unicode to search canonical form tables.
foreach my $map (@maps) {
readmapfile(\%codemap, $map);
}
# we follow any mappings repeatedly until nothing in the
# table doesn't
change
any
more
mungemap
(
\
%codemap);
#
then
print
out
the
translation
tables
printmap
(
\
%codemap, $aliasfile);
#
XXX
-
should
probably
require
all
files
that
are
#
mentioned
in
the
lookup
table
to
be
specified
,
#
or
this
sucker
aintn't gunna compile.
foreach my $opt (@charsets) {
print "mkchartable: mapping $opt...\n";
my $table = readcharfile($opt);
printtable($table, $opt);
}
printlookup();
close (OUTPUT);
exit 0;
sub usage {
print "usage: mkchartable -m mapfile -o outputfile charsetfile...\n";
exit(1);
}
# Read a Unicode table, deriving useful mappings from it
sub readmapfile {
my ($codemap, $name) = @_;
my $mapfile = IO::File->new($name, 'r') || die "Failed to open $name\n";
while (my $line = $mapfile->getline()) {
chomp $line;
$line =~ s/^\s+//; # strip leading space
next if $line =~ m/^\#/; # comment
next if $line eq ''; # blank
my ($hexcode, $name, $category, $combiningclass, $bidicat,
$decomposition, $decimal, $digit, $numeric, $mirroredchar,
$uni1name, $comment, $upper, $lower, $title, @rest) = split '
;
', $line;
my $code = hex($hexcode);
# This is not RFC 5051
if ($code != 32 and $category =~ m/^Z/) {
$codemap->{$code}{chars} = [32]; # space
next;
}
# has a mapping to titlecase
$codemap->{$code}{title} = hex($title)
if $title;
# Compatability mapping, skip over the <type>
while ($decomposition ne '') {
# This is not RFC 5051
if ($decomposition =~ s/^<[^>]*>\s+//) {
# Ignore compat mappings to SP followed by combining char
$decomposition = '' if $decomposition =~ m/^0020 /
}
if ($decomposition =~ s/([0-9a-fA-F]+)\s*//) {
push @{$codemap->{$code}{chars}}, hex($1);
}
}
$codemap->{$code}{chars} ||= [$code];
}
}
# Perform the transitive closure on the unicode mapping table
# Calculate translations for mappings
sub mungemap {
my ($codemap) = @_;
my $total = keys %$codemap;
my $changed;
# Keep scanning the table until no changes are made
do {
$changed = 0;
foreach my $code (sort { $a <=> $b } keys %$codemap) {
my @new;
my $chars = $codemap->{$code}{chars};
# check if there are any translations for the mapped chars
foreach my $char (@$chars) {
if ($codemap->{$char}) {
my $newchars = $codemap->{$char}{chars};
push @new, @$newchars;
}
else {
push @new, $char;
}
}
# strip all whitespace, but put back one if nothing left
if (grep { $_ == 32 } @new) {
@new = grep { $_ != 32 } @new;
@new = (32) unless @new;
}
# no change
next if ("@new" eq "@$chars");
$changed++;
$codemap->{$code}{chars} = \@new;
}
print "mkchartable: expanded unicode mappings... ($changed/$total)\n"
if $changed;
} while ($changed);
print "mkchartable: building expansion table...\n";
print OUTPUT <<EOF;
/* Table of translations */
HIDDEN const int chartables_translation_multichar[] = {
0, /* the index of 0 is reserved to mean "no translation" */
EOF
my $offset = 1;
my $maxlen = 1;
foreach my $code (sort { $a <=> $b } keys %$codemap) {
my $chars = $codemap->{$code}{chars};
if (@$chars > 1) {
$maxlen = @$chars if $maxlen < @$chars;
# add to the translation table
print OUTPUT " ";
print OUTPUT join(", ", (map { sprintf("0x%04x", $_) } @$chars));
printf OUTPUT ", 0, /* Translation for %04x (offset %d) */\n", $code, $offset;
# update tracking
$codemap->{$code}{trans} = $offset;
$offset += @$chars + 1;
}
}
print OUTPUT <<EOF;
};
EOF
}
# output the tables used for canonising the unicode
# into search normal form.
sub printmap {
my ($codemap) = @_;
print "mkchartable: building translation table...\n";
# record which blocks we need mappings for
my @needblock;
foreach my $code (keys %$codemap) {
$needblock[($code >> 16) & 0xff][($code >> 8) & 0xff] = 1;
}
print OUTPUT << "EOF";
/* The next two tables are used for doing translations from
* 24-bit unicode values to canonical form. First look up the
* code >> 16 (highest order block) in the block16 table to
* find the index to the block8 table for that block.
* If the index is 255, there are no translations for that
* block, so return the same value. Otherwise, repeat for
* code >> 8 (middle block) to get an index into the
* direct translation block. Again, 255 means no translations
* for that block. Finally the translation can be one of.
*
* 0: no output
* +ve char: return this single char
* -ve number: offset into the chartables_translation_multichar
* table. Read chars until 0 encountered.
*/
HIDDEN const unsigned char chartables_translation_block16[256] = {
EOF
my $n16 = 0;
foreach my $block16 (0..255) {
if ($needblock[$block16]) {
printf OUTPUT (" %3d,", $n16++);
} else {
printf OUTPUT " 255,";
}
print OUTPUT "\n" if ($block16 % 8 == 7);
}
print OUTPUT <<EOF;
};
HIDDEN const unsigned char chartables_translation_block8[$n16][256] = {
EOF
my $n8 = 0;
foreach my $block16 (0..255) {
my $need8 = $needblock[$block16];
next unless $need8;
print OUTPUT " { /* translation for 16 bit offset $block16 */\n ";
foreach my $block8 (0..255) {
if ($need8->[$block8]) {
printf OUTPUT (" %3d,", $n8++);
} else {
printf OUTPUT " 255,";
}
print OUTPUT "\n " if ($block8 % 8 == 7);
}
print OUTPUT "},\n";
}
print OUTPUT <<EOF;
};
/* NOTE: Unlike other charset translation tables, the
* chartables_translation table is NOT used to directly parse
* a charset. Instead, it's
used
to
convert
from
a
unicode
*
character
to
the
"canonical form"
,
possibly
multiple
*
characters
.
*/
HIDDEN
const
int
chartables_translation
[
$
n8
][
256
]
=
{
EOF
foreach
my
$
block16
(
0..255
)
{
my
$
need8
=
$
needblock
[
$
block16
];
next
unless
$
need8
;
foreach
my
$
block8
(
0..255
)
{
next
unless
$
need8->
[
$
block8
];
print
OUTPUT
" { /* Mapping for unicode chars in block $block16 $block8 */\n "
;
foreach
my
$
i
(
0..255
)
{
my
$
codepoint
=
(
$
block16
<<
16
)
+
(
$
block8
<<
8
)
+
$
i
;
my
$
titlepoint
=
$
codemap->
{
$
codepoint
}{
title
}
||
$
codepoint
;
if
(
not
$
codemap->
{
$
titlepoint
}
or
not
keys
%{$codemap->{$titlepoint}}) {
printf
OUTPUT
" 0x%04x,"
,
$
titlepoint
;
}
elsif
(
$
codemap->
{
$
titlepoint
}{
trans
})
{
printf
OUTPUT
" - %4d,"
,
$
codemap->
{
$
titlepoint
}{
trans
};
}
else
{
printf
OUTPUT
" 0x%04x,"
,
$
codemap->
{
$
titlepoint
}{
chars
}[
0
];
}
print
OUTPUT
"\n "
if
(
$
i
% 8 == 7);
}
print
OUTPUT
"},\n"
;
}
}
printf
OUTPUT
"};\n\n"
;
}
#
read
a
charset
table
,
building
intermediate
state
tables
#
for
multibyte
sequences
and
named
state
tables
for
mode
#
switches
sub
readcharfile
{
my
(
$
name
)
=
@
_
;
my
$
charfile
=
IO
::
File
->
new
(
$
name
,
'r'
)
||
die
"Failed to read $name"
;
my
%data = (
currstate
=>
-
1
,
num
=>
0
,
tables
=>
[],
states
=>
{},
);
my
$
state
;
while
(
my
$
line
=
$
charfile->
getline
())
{
chomp
$
line
;
my
$
comment
=
$
line
;
$
line
=~
s/^\s+//
;
#
strip
leading
space
next
if
$
line
=~
m/^\#/
;
#
comment
next
if
$
line
eq
''
;
#
blank
if
(
$
line
=~
m/^:
(
\
S
+
)
/
)
{
#
New
state
$
state
=
getstate
(
\
%data, $1);
next
;
}
$
state
||
=
getstate
(
\
%data, "");
die
"Invalid data line $line\n"
unless
$
line
=~
s/^
([
0
-
9
a
-
fA
-
F
]
+
)
\s+//
;
my
$
code
=
hex
(
$
1
);
my
$
basestate
=
$
state
;
if
(
$
code
>
0xffffff
)
{
my
$
char
=
(
$
code
>>
24
)
&
0xff
;
my
$
newname
=
sprintf
"%s_%02x"
,
$
state->
{
name
}
||
'state'
,
$
char
;
my
$
newstate
=
getstate
(
\
%data, $newname);
$
state->
{
chars
}[
$
char
]
=
[
0
,
$
newstate->
{
num
},
"Auto multibyte state 4 bytes $newname"
];
$
state
=
$
newstate
;
}
if
(
$
code
>
0xffff
)
{
my
$
char
=
(
$
code
>>
16
)
&
0xff
;
my
$
newname
=
sprintf
"%s_%02x"
,
$
state->
{
name
}
||
'state'
,
$
char
;
my
$
newstate
=
getstate
(
\
%data, $newname);
$
state->
{
chars
}[
$
char
]
=
[
0
,
$
newstate->
{
num
},
"Auto multibyte state 3 bytes $newname"
];
$
state
=
$
newstate
;
}
if
(
$
code
>
0xff
)
{
my
$
char
=
(
$
code
>>
8
)
&
0xff
;
my
$
newname
=
sprintf
"%s_%02x"
,
$
state->
{
name
}
||
'state'
,
$
char
;
my
$
newstate
=
getstate
(
\
%data, $newname);
$
state->
{
chars
}[
$
char
]
=
[
0
,
$
newstate->
{
num
},
"Auto multibyte state 2 bytes $newname"
];
$
state
=
$
newstate
;
}
my
$
char
=
$
code
&
0xff
;
die
"Duplicate defs for $char in $state->{name}"
if
$
state->
{
chars
}[
$
char
];
#
nothing
if
(
$
line
=~
m/^\?/
)
{
next
;
}
#
state
switch
if
(
$
line
=~
m/^:
(
\
S
*
)
/
)
{
my
$
targetstate
=
getstate
(
\
%data, $1);
$
state->
{
chars
}[
$
char
]
=
[
0
,
$
targetstate->
{
num
},
$
comment
];
}
else
{
#
otherwise
it's a regular char
die "Invalid data line $line\n" unless $line =~ s/^([0-9a-fA-F]+)\s+//;
my $target = hex($1);
$state->{chars}[$char] = [$target, $basestate->{num}, $comment];
}
$state = $basestate;
}
return \%data;
}
# helper function to create a new state within a charset
sub getstate {
my ($data, $name) = @_;
if (exists $data->{states}{$name}) { # could be 0
return $data->{tables}[$data->{states}{$name}];
}
my $num = $data->{num};
my $next = $num;
if ($name =~ s/ \<$//) {
$next = -1;
}
my $state = $data->{tables}[$num] = {
name => $name,
num => $num,
next => $next,
codes => {},
};
$data->{states}{$name} = $num;
$data->{num}++;
return $state;
}
# output the table used for charset->unicode translation
sub printtable {
my ($data, $name) = @_;
my $num = $data->{num};
my $tables = $data->{tables};
$name =~ s{.*[\\/]}{}; # strip anything up to the last separator;
$name =~ s{\..*}{}; # after a dot
$name =~ s{-}{_}g; # underscores
print OUTPUT "static const struct charmap chartables_$name\[$num][256] = {\n";
foreach my $table (@$tables) {
my $chars = $table->{chars};
print OUTPUT " {";
if ($table->{name}) {
print OUTPUT " /* $table->{name} */";
}
print OUTPUT "\n";
foreach my $i (0..255) {
my $char = $chars->[$i];
if ($char) {
print OUTPUT " { $char->[0], $char->[1] }, /* $char->[2] */\n";
}
else {
print OUTPUT " { 0xfffd, 0 }, /* no such character */\n";
}
}
print OUTPUT " },\n";
}
print OUTPUT "};\n\n";
}
# print the header of the chartable.c file
sub printheader {
my ($maps, $charsets) = @_;
print OUTPUT <<EOF;
/* This file is generated by mkchartable.pl with the following arguments
*
EOF
foreach my $map (@$maps) {
my $sha1 = getsha1($map);
print OUTPUT " * map: $sha1 $map\n";
}
foreach my $charset (@$charsets) {
my $sha1 = getsha1($charset);
print OUTPUT " * charset: $sha1 $charset\n";
}
print OUTPUT <<EOF;
*/
#include "chartable.h"
#include "config.h"
EOF
}
# print the lookup table for charactersets at the end
# of the chartable.c file.
sub printlookup {
print OUTPUT <<EOF;
/*
* Mapping of character sets to tables. This table is deprecated and
* is only used for backward compatibility.
*/
HIDDEN const struct charset chartables_charset_table[] = {
{ "us-ascii", chartables_us_ascii }, /* US-ASCII must be charset number 0 */
{ "utf-8", 0 }, /* handled directly */
/* All remaining legacy charsets are handled by ICU */
{ "utf-7", 0 },
{ "iso-8859-1", 0 },
{ "iso-8859-2", 0 },
{ "iso-8859-3", 0 },
{ "iso-8859-4", 0 },
{ "iso-8859-5", 0 },
{ "iso-8859-6", 0 },
{ "iso-8859-7", 0 },
{ "iso-8859-8", 0 },
{ "iso-8859-9", 0 },
{ "koi8-r", 0 },
{ "iso-2022-jp", 0 },
{ "iso-2022-kr", 0 },
{ "gb2312", 0 },
{ "big5", 0 },
/* Compatibility names */
{ "unicode-1-1-utf-7", 0 },
{ "unicode-2-0-utf-7", 0 },
{ "x-unicode-2-0-utf-7", 0 },
/* End Compatibility Names */
{ "iso-8859-15", 0 },
{ "windows-1252", 0 },
{ "windows-1256", 0 },
{ "windows-1250", 0 },
{ "windows-1251", 0 },
{ "windows-1255", 0 },
{ "iso-8859-10", 0 },
{ "iso-8859-11", 0 },
{ "iso-8859-13", 0 },
{ "iso-8859-14", 0 },
{ "iso-8859-16", 0 },
{ "windows-1254", 0 },
{ "windows-1258", 0 },
{ "windows-874", 0 },
{ "imap-utf-7", 0 },
{ "windows-850", 0 },
{ "windows-31j", 0 },
{ "windows-936", 0 },
{ "windows-1257", 0 },
{ "koi8-u", 0 },
/* New character sets should only be added to end so that
* cache files stay with valid information */
};
HIDDEN const struct charset_alias charset_aliases[] = {
EOF
my %aliases;
if ($aliasfile and -f $aliasfile) {
open(FH, "<$aliasfile") || die "can't
read
alias
file
$
aliasfile
"
;
while
(
<
FH
>
)
{
next
if
m/^#/
;
next
unless
m/
(
\
S
+
)
\s+
(
\
S
+
)
/
;
$
aliases
{
$
1
}
=
$
2
;
}
close
(
FH
);
}
foreach
my
$
alias
(
sort
keys
%aliases) {
print
OUTPUT
qq
[
{
"$alias"
,
"$aliases{$alias}"
},
\n
];
}
print
OUTPUT
<<
EOF
;
{
0
,
0
}
};
HIDDEN
const
int
chartables_num_charsets
=
(
sizeof
(
chartables_charset_table
)
/
sizeof
(
*
chartables_charset_table
));
EOF
}
#
calculate
the
sha1
of
a
file
sub
getsha1
{
my
$
file
=
shift
;
my
$
fh
=
IO
::
File
->
new
(
$
file
,
'r'
)
||
return
"<none>"
;
if
(
eval
{
require
"Digest/SHA1.pm"
})
{
my
$
digest
=
Digest
::
SHA1
->
new
();
$
digest->
addfile
(
$
fh
);
return
$
digest->
hexdigest
();
}
else
{
return
"(install Digest::SHA1 for checksums)"
;
}
}
__
END__
File Metadata
Details
Attached
Mime Type
text/x-perl
Expires
Fri, Apr 24, 10:14 AM (2 d, 4 h ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
18849162
Default Alt Text
mkchartable.pl (17 KB)
Attached To
Mode
R111 cyrus-imapd
Attached
Detach File
Event Timeline