mkchartable.pl
No OneTemporary
Actions

Authored By

Unknown

Size

17 KB

Referenced Files

None

Subscribers

None

mkchartable.pl
View Options

	#!/usr/bin/perl
	#
	# mkchartable.pl -- Generate character set mapping table
	#
	# Copyright (c) 1994-2008 Carnegie Mellon University. All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions
	# are met:
	#
	# 1. Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	#
	# 2. Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in
	# the documentation and/or other materials provided with the
	# distribution.
	#
	# 3. The name "Carnegie Mellon University" must not be used to
	# endorse or promote products derived from this software without
	# prior written permission. For permission or any legal
	# details, please contact
	# Carnegie Mellon University
	# Center for Technology Transfer and Enterprise Creation
	# 4615 Forbes Avenue
	# Suite 302
	# Pittsburgh, PA 15213
	# (412) 268-7393, fax: (412) 268-7395
	# innovation@andrew.cmu.edu
	#
	# 4. Redistributions of any form whatsoever must retain the following
	# acknowledgment:
	# "This product includes software developed by Computing Services
	# at Carnegie Mellon University (http://www.cmu.edu/computing/)."
	#
	# CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
	# THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
	# AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
	# FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
	# AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
	# OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

	use strict;
	use warnings;

	# n.b. this script MUST NOT 'use locale' as it depends on the stability
	# of perl's default sort order (which is changed by locale-awareness)

	use IO::File;
	use Getopt::Long;

	my @maps;
	my @charsets;
	my $aliasfile;
	my %codemap;
	my $output;

	GetOptions( 'map\|m=s' => \@maps, #strings
	'aliases\|a=s' => \$aliasfile, #string
	'output\|o=s' => \$output); #string
	open (OUTPUT, ">$output");

	# charsets were probably specified in shell-glob order, which can vary by locale.
	# sort them here, so that the ordering in our generated files is stable.
	@charsets = sort @ARGV;

	printheader(\@maps, \@charsets);

	# first we parse the chartable unicode mappings and the fixes
	# file to build the unicode to search canonical form tables.
	foreach my $map (@maps) {
	readmapfile(\%codemap, $map);
	}

	# we follow any mappings repeatedly until nothing in the
	# table doesn't change any more
	mungemap(\%codemap);

	# then print out the translation tables
	printmap(\%codemap, $aliasfile);

	# XXX - should probably require all files that are
	# mentioned in the lookup table to be specified,
	# or this sucker aintn't gunna compile.
	foreach my $opt (@charsets) {
	print "mkchartable: mapping $opt...\n";
	my $table = readcharfile($opt);
	printtable($table, $opt);
	}

	printlookup();
	close (OUTPUT);

	exit 0;

	sub usage {
	print "usage: mkchartable -m mapfile -o outputfile charsetfile...\n";
	exit(1);
	}

	# Read a Unicode table, deriving useful mappings from it
	sub readmapfile {
	my ($codemap, $name) = @_;

	my $mapfile = IO::File->new($name, 'r') \|\| die "Failed to open $name\n";

	while (my $line = $mapfile->getline()) {
	chomp $line;
	$line =~ s/^\s+//; # strip leading space
	next if $line =~ m/^\#/; # comment
	next if $line eq ''; # blank

	my ($hexcode, $name, $category, $combiningclass, $bidicat,
	$decomposition, $decimal, $digit, $numeric, $mirroredchar,
	$uni1name, $comment, $upper, $lower, $title, @rest) = split ';', $line;
	my $code = hex($hexcode);

	# This is not RFC 5051
	if ($code != 32 and $category =~ m/^Z/) {
	$codemap->{$code}{chars} = [32]; # space
	next;
	}

	# has a mapping to titlecase
	$codemap->{$code}{title} = hex($title)
	if $title;

	# Compatability mapping, skip over the <type>
	while ($decomposition ne '') {
	# This is not RFC 5051
	if ($decomposition =~ s/^<[^>]*>\s+//) {
	# Ignore compat mappings to SP followed by combining char
	$decomposition = '' if $decomposition =~ m/^0020 /
	}

	if ($decomposition =~ s/([0-9a-fA-F]+)\s*//) {
	push @{$codemap->{$code}{chars}}, hex($1);
	}
	}

	$codemap->{$code}{chars} \|\|= [$code];
	}
	}

	# Perform the transitive closure on the unicode mapping table
	# Calculate translations for mappings
	sub mungemap {
	my ($codemap) = @_;

	my $total = keys %$codemap;
	my $changed;

	# Keep scanning the table until no changes are made
	do {
	$changed = 0;

	foreach my $code (sort { $a <=> $b } keys %$codemap) {
	my @new;
	my $chars = $codemap->{$code}{chars};

	# check if there are any translations for the mapped chars
	foreach my $char (@$chars) {
	if ($codemap->{$char}) {
	my $newchars = $codemap->{$char}{chars};
	push @new, @$newchars;
	}
	else {
	push @new, $char;
	}
	}

	# strip all whitespace, but put back one if nothing left
	if (grep { $_ == 32 } @new) {
	@new = grep { $_ != 32 } @new;
	@new = (32) unless @new;
	}

	# no change
	next if ("@new" eq "@$chars");

	$changed++;
	$codemap->{$code}{chars} = \@new;
	}

	print "mkchartable: expanded unicode mappings... ($changed/$total)\n"
	if $changed;
	} while ($changed);

	print "mkchartable: building expansion table...\n";

	print OUTPUT <<EOF;
	/* Table of translations */
	HIDDEN const int chartables_translation_multichar[] = {
	0, /* the index of 0 is reserved to mean "no translation" */
	EOF

	my $offset = 1;
	my $maxlen = 1;

	foreach my $code (sort { $a <=> $b } keys %$codemap) {
	my $chars = $codemap->{$code}{chars};
	if (@$chars > 1) {
	$maxlen = @$chars if $maxlen < @$chars;

	# add to the translation table
	print OUTPUT " ";
	print OUTPUT join(", ", (map { sprintf("0x%04x", $_) } @$chars));
	printf OUTPUT ", 0, /* Translation for %04x (offset %d) */\n", $code, $offset;

	# update tracking
	$codemap->{$code}{trans} = $offset;
	$offset += @$chars + 1;
	}
	}

	print OUTPUT <<EOF;
	};

	EOF
	}

	# output the tables used for canonising the unicode
	# into search normal form.
	sub printmap {
	my ($codemap) = @_;

	print "mkchartable: building translation table...\n";

	# record which blocks we need mappings for
	my @needblock;
	foreach my $code (keys %$codemap) {
	$needblock[($code >> 16) & 0xff][($code >> 8) & 0xff] = 1;
	}

	print OUTPUT << "EOF";
	/* The next two tables are used for doing translations from
	* 24-bit unicode values to canonical form. First look up the
	* code >> 16 (highest order block) in the block16 table to
	* find the index to the block8 table for that block.
	* If the index is 255, there are no translations for that
	* block, so return the same value. Otherwise, repeat for
	* code >> 8 (middle block) to get an index into the
	* direct translation block. Again, 255 means no translations
	* for that block. Finally the translation can be one of.
	*
	* 0: no output
	* +ve char: return this single char
	* -ve number: offset into the chartables_translation_multichar
	* table. Read chars until 0 encountered.
	*/
	HIDDEN const unsigned char chartables_translation_block16[256] = {
	EOF

	my $n16 = 0;
	foreach my $block16 (0..255) {
	if ($needblock[$block16]) {
	printf OUTPUT (" %3d,", $n16++);
	} else {
	printf OUTPUT " 255,";
	}
	print OUTPUT "\n" if ($block16 % 8 == 7);
	}

	print OUTPUT <<EOF;
	};

	HIDDEN const unsigned char chartables_translation_block8[$n16][256] = {
	EOF
	my $n8 = 0;
	foreach my $block16 (0..255) {
	my $need8 = $needblock[$block16];
	next unless $need8;
	print OUTPUT " { /* translation for 16 bit offset $block16 */\n ";
	foreach my $block8 (0..255) {
	if ($need8->[$block8]) {
	printf OUTPUT (" %3d,", $n8++);
	} else {
	printf OUTPUT " 255,";
	}
	print OUTPUT "\n " if ($block8 % 8 == 7);
	}
	print OUTPUT "},\n";
	}

	print OUTPUT <<EOF;
	};

	/* NOTE: Unlike other charset translation tables, the
	* chartables_translation table is NOT used to directly parse
	* a charset. Instead, it's used to convert from a unicode
	* character to the "canonical form", possibly multiple
	* characters.
	*/
	HIDDEN const int chartables_translation[$n8][256] = {
	EOF

	foreach my $block16 (0..255) {
	my $need8 = $needblock[$block16];
	next unless $need8;
	foreach my $block8 (0..255) {
	next unless $need8->[$block8];
	print OUTPUT " { /* Mapping for unicode chars in block $block16 $block8 */\n ";
	foreach my $i (0..255) {
	my $codepoint = ($block16 << 16) + ($block8 << 8) + $i;
	my $titlepoint = $codemap->{$codepoint}{title} \|\| $codepoint;
	if (not $codemap->{$titlepoint} or not keys %{$codemap->{$titlepoint}}) {
	printf OUTPUT " 0x%04x,", $titlepoint;
	}
	elsif ($codemap->{$titlepoint}{trans}) {
	printf OUTPUT " - %4d,", $codemap->{$titlepoint}{trans};
	}
	else {
	printf OUTPUT " 0x%04x,", $codemap->{$titlepoint}{chars}[0];
	}
	print OUTPUT "\n " if ($i % 8 == 7);
	}
	print OUTPUT "},\n";
	}
	}
	printf OUTPUT "};\n\n";
	}

	# read a charset table, building intermediate state tables
	# for multibyte sequences and named state tables for mode
	# switches
	sub readcharfile {
	my ($name) = @_;

	my $charfile = IO::File->new($name, 'r') \|\| die "Failed to read $name";

	my %data = (
	currstate => -1,
	num => 0,
	tables => [],
	states => {},
	);

	my $state;

	while (my $line = $charfile->getline()) {
	chomp $line;
	my $comment = $line;
	$line =~ s/^\s+//; # strip leading space
	next if $line =~ m/^\#/; # comment
	next if $line eq ''; # blank

	if ($line =~ m/^:(\S+)/) {
	# New state
	$state = getstate(\%data, $1);
	next;
	}

	$state \|\|= getstate(\%data, "");

	die "Invalid data line $line\n" unless $line =~ s/^([0-9a-fA-F]+)\s+//;

	my $code = hex($1);

	my $basestate = $state;

	if ($code > 0xffffff) {
	my $char = ($code >> 24) & 0xff;
	my $newname = sprintf "%s_%02x", $state->{name} \|\| 'state', $char;
	my $newstate = getstate(\%data, $newname);
	$state->{chars}[$char] = [0, $newstate->{num}, "Auto multibyte state 4 bytes $newname"];
	$state = $newstate;
	}
	if ($code > 0xffff) {
	my $char = ($code >> 16) & 0xff;
	my $newname = sprintf "%s_%02x", $state->{name} \|\| 'state', $char;
	my $newstate = getstate(\%data, $newname);
	$state->{chars}[$char] = [0, $newstate->{num}, "Auto multibyte state 3 bytes $newname"];
	$state = $newstate;
	}
	if ($code > 0xff) {
	my $char = ($code >> 8) & 0xff;
	my $newname = sprintf "%s_%02x", $state->{name} \|\| 'state', $char;
	my $newstate = getstate(\%data, $newname);
	$state->{chars}[$char] = [0, $newstate->{num}, "Auto multibyte state 2 bytes $newname"];
	$state = $newstate;
	}

	my $char = $code & 0xff;
	die "Duplicate defs for $char in $state->{name}"
	if $state->{chars}[$char];

	# nothing
	if ($line =~ m/^\?/) {
	next;
	}

	# state switch
	if ($line =~ m/^:(\S*)/) {
	my $targetstate = getstate(\%data, $1);
	$state->{chars}[$char] = [0, $targetstate->{num}, $comment];
	}
	else {
	# otherwise it's a regular char
	die "Invalid data line $line\n" unless $line =~ s/^([0-9a-fA-F]+)\s+//;
	my $target = hex($1);
	$state->{chars}[$char] = [$target, $basestate->{num}, $comment];
	}

	$state = $basestate;
	}

	return \%data;
	}

	# helper function to create a new state within a charset
	sub getstate {
	my ($data, $name) = @_;

	if (exists $data->{states}{$name}) { # could be 0
	return $data->{tables}[$data->{states}{$name}];
	}

	my $num = $data->{num};

	my $next = $num;
	if ($name =~ s/ \<$//) {
	$next = -1;
	}

	my $state = $data->{tables}[$num] = {
	name => $name,
	num => $num,
	next => $next,
	codes => {},
	};
	$data->{states}{$name} = $num;

	$data->{num}++;

	return $state;
	}

	# output the table used for charset->unicode translation
	sub printtable {
	my ($data, $name) = @_;

	my $num = $data->{num};
	my $tables = $data->{tables};

	$name =~ s{.*[\\/]}{}; # strip anything up to the last separator;
	$name =~ s{\..*}{}; # after a dot
	$name =~ s{-}{_}g; # underscores

	print OUTPUT "static const struct charmap chartables_$name\[$num][256] = {\n";

	foreach my $table (@$tables) {
	my $chars = $table->{chars};
	print OUTPUT " {";
	if ($table->{name}) {
	print OUTPUT " /* $table->{name} */";
	}
	print OUTPUT "\n";
	foreach my $i (0..255) {
	my $char = $chars->[$i];
	if ($char) {
	print OUTPUT " { $char->[0], $char->[1] }, /* $char->[2] */\n";
	}
	else {
	print OUTPUT " { 0xfffd, 0 }, /* no such character */\n";
	}
	}
	print OUTPUT " },\n";
	}
	print OUTPUT "};\n\n";
	}

	# print the header of the chartable.c file
	sub printheader {
	my ($maps, $charsets) = @_;

	print OUTPUT <<EOF;
	/* This file is generated by mkchartable.pl with the following arguments
	*
	EOF
	foreach my $map (@$maps) {
	my $sha1 = getsha1($map);
	print OUTPUT " * map: $sha1 $map\n";
	}
	foreach my $charset (@$charsets) {
	my $sha1 = getsha1($charset);
	print OUTPUT " * charset: $sha1 $charset\n";
	}
	print OUTPUT <<EOF;
	*/

	#include "chartable.h"
	#include "config.h"

	EOF
	}

	# print the lookup table for charactersets at the end
	# of the chartable.c file.
	sub printlookup {
	print OUTPUT <<EOF;

	/*
	* Mapping of character sets to tables. This table is deprecated and
	* is only used for backward compatibility.
	*/

	HIDDEN const struct charset chartables_charset_table[] = {
	{ "us-ascii", chartables_us_ascii }, /* US-ASCII must be charset number 0 */
	{ "utf-8", 0 }, /* handled directly */
	/* All remaining legacy charsets are handled by ICU */
	{ "utf-7", 0 },
	{ "iso-8859-1", 0 },
	{ "iso-8859-2", 0 },
	{ "iso-8859-3", 0 },
	{ "iso-8859-4", 0 },
	{ "iso-8859-5", 0 },
	{ "iso-8859-6", 0 },
	{ "iso-8859-7", 0 },
	{ "iso-8859-8", 0 },
	{ "iso-8859-9", 0 },
	{ "koi8-r", 0 },
	{ "iso-2022-jp", 0 },
	{ "iso-2022-kr", 0 },
	{ "gb2312", 0 },
	{ "big5", 0 },
	/* Compatibility names */
	{ "unicode-1-1-utf-7", 0 },
	{ "unicode-2-0-utf-7", 0 },
	{ "x-unicode-2-0-utf-7", 0 },
	/* End Compatibility Names */
	{ "iso-8859-15", 0 },
	{ "windows-1252", 0 },
	{ "windows-1256", 0 },
	{ "windows-1250", 0 },
	{ "windows-1251", 0 },
	{ "windows-1255", 0 },
	{ "iso-8859-10", 0 },
	{ "iso-8859-11", 0 },
	{ "iso-8859-13", 0 },
	{ "iso-8859-14", 0 },
	{ "iso-8859-16", 0 },
	{ "windows-1254", 0 },
	{ "windows-1258", 0 },
	{ "windows-874", 0 },
	{ "imap-utf-7", 0 },
	{ "windows-850", 0 },
	{ "windows-31j", 0 },
	{ "windows-936", 0 },
	{ "windows-1257", 0 },
	{ "koi8-u", 0 },

	/* New character sets should only be added to end so that
	* cache files stay with valid information */
	};

	HIDDEN const struct charset_alias charset_aliases[] = {
	EOF

	my %aliases;
	if ($aliasfile and -f $aliasfile) {
	open(FH, "<$aliasfile") \|\| die "can't read alias file $aliasfile";
	while (<FH>) {
	next if m/^#/;
	next unless m/(\S+)\s+(\S+)/;
	$aliases{$1} = $2;
	}
	close(FH);
	}

	foreach my $alias (sort keys %aliases) {
	print OUTPUT qq[ { "$alias", "$aliases{$alias}" },\n];
	}

	print OUTPUT <<EOF;
	{ 0, 0 }
	};

	HIDDEN const int chartables_num_charsets = (sizeof(chartables_charset_table)/sizeof(*chartables_charset_table));
	EOF
	}

	# calculate the sha1 of a file
	sub getsha1 {
	my $file = shift;
	my $fh = IO::File->new($file, 'r') \|\| return "<none>";
	if (eval { require "Digest/SHA1.pm" }) {
	my $digest = Digest::SHA1->new();
	$digest->addfile($fh);
	return $digest->hexdigest();
	}
	else {
	return "(install Digest::SHA1 for checksums)";
	}
	}

	__END__

File Metadata

Mime Type: text/x-perl
Expires: Fri, Apr 24, 10:14 AM (2 d, 4 h ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 18849162
Default Alt Text: mkchartable.pl (17 KB)

mkchartable.plNo OneTemporaryActions

mkchartable.plView Options

File Metadata

Event Timeline

mkchartable.pl
No OneTemporary
Actions

mkchartable.pl
View Options