charset.c
No OneTemporary
Actions

Authored By

Unknown

Size

36 KB

Referenced Files

None

Subscribers

None

charset.c
View Options

	/* charset.c -- International character set support
	*
	* Copyright (c) 1994-2008 Carnegie Mellon University. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* 3. The name "Carnegie Mellon University" must not be used to
	* endorse or promote products derived from this software without
	* prior written permission. For permission or any legal
	* details, please contact
	* Carnegie Mellon University
	* Center for Technology Transfer and Enterprise Creation
	* 4615 Forbes Avenue
	* Suite 302
	* Pittsburgh, PA 15213
	* (412) 268-7393, fax: (412) 268-7395
	* innovation@andrew.cmu.edu
	*
	* 4. Redistributions of any form whatsoever must retain the following
	* acknowledgment:
	* "This product includes software developed by Computing Services
	* at Carnegie Mellon University (http://www.cmu.edu/computing/)."
	*
	* CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
	* AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
	* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
	* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
	* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
	*
	* $Id: charset.c,v 1.50 2008/03/24 17:43:08 murch Exp $
	*/

	#include <config.h>
	#include <ctype.h>
	#include <stdlib.h>
	#include <string.h>

	#include "assert.h"
	#include "charset.h"
	#include "xmalloc.h"
	#include "chartable.h"
	#include "util.h"

	extern const unsigned char chartables_long_translations[];
	extern const int charset_max_translation;
	extern const unsigned char chartables_unicode_block[256];
	extern const unsigned char chartables_unicode[][256][4];
	extern const unsigned char chartables_us_ascii[][256][4];
	extern const struct charset chartables_charset_table[];
	extern const int chartables_num_charsets;

	struct decode_state {
	const unsigned char (*curtable)[256][4];
	const unsigned char (*lasttable)[256][4];
	const unsigned char (*initialtable)[256][4];
	unsigned utfcode;
	unsigned num_bits;
	unsigned b64_value;
	};
	#define START(state,table) \
	((state).curtable = (state.initialtable) = (table)); \
	((state).lasttable = NULL); \
	((state).utfcode = 0); \
	((state).num_bits = 0); \
	((state).b64_value = 0);


	static int xlate(int index, char *to);
	static int writeutf8(unsigned utfcode, char *to);

	#define TRANSLATE(state,c,ptr,idx) \
	{ \
	unsigned char _ch; \
	const unsigned char *_translation = (state).curtable[0][(unsigned char)(c) & 0xff]; \
	for (;;) { \
	switch (_ch = *_translation++) { \
	case JSR: \
	(state).lasttable = (state).curtable; \
	/* FALL THROUGH */ \
	case JMP: \
	(state).curtable = ((state).initialtable + \
	(_translation[0]<<8) + (_translation[1])); \
	break; \
	\
	case RET: \
	(state).curtable = (state).lasttable; \
	/* FALL THROUGH */ \
	case END: \
	break; \
	\
	case U7F: \
	(state).b64_value = 0; \
	(state).num_bits = 0; \
	(state).curtable = ((state).initialtable + 1); \
	/* FALL THROUGH */ \
	case U7N: \
	(state).b64_value <<= 6; \
	(state).b64_value += index_64[(unsigned char)(c) & 0xff]; \
	(state).num_bits += 6; \
	if ((state).num_bits >= 16) { \
	(state).num_bits -= 16; \
	(state).utfcode = \
	((state).b64_value >> (state).num_bits) & 0x7fff; \
	idx += writeutf8((state).utfcode, ptr+idx); \
	} \
	break; \
	\
	case U83: \
	(state).lasttable = (state).curtable; \
	(state).utfcode = (c & 0x0f) << 12; \
	(state).curtable = ((state).initialtable + 1); \
	break; \
	\
	case U83_2: \
	(state).utfcode += (c & 0x3f) << 6; \
	(state).curtable = ((state).initialtable + 2); \
	break; \
	\
	case U83_3: \
	(state).utfcode += (c & 0x03f); \
	(state).curtable = (state).initialtable; \
	idx += writeutf8((state).utfcode, ptr+idx); \
	break; \
	\
	case XLT: \
	idx += xlate((_translation[0]<<8) + (_translation[1]), ptr+idx); \
	_translation += 2; /* next translation is a RET or END */ \
	continue; \
	\
	default: \
	(ptr)[(idx)++] = _ch; \
	continue; \
	} \
	break; \
	} \
	}

	/* for a comp_pat, ascii[0x80] == 0 if there are any non-ascii characters
	in the pattern */
	struct comp_pat_s {
	int pat[256]; /* boyer-moore skip table */
	int ascii[256]; /* case-mapped version of table */
	int patlen;
	int patlastchar; /* last character in the pattern */
	int patotherlastchar; /* case-flip of the last character */
	};

	#define PATASCII(pat) (pat+256)
	#define PATLEN(pat) ((pat)[512])
	#define PATLASTCHAR(pat) ((pat)[513]) /* last character in the pattern */
	#define PATOTHERLASTCHAR(pat) ((pat)[514]) /* case-flip of the pattern */
	#define PATSIZE 515

	#define GROWSIZE 100

	#define XX 127
	/*
	* Table for decoding hexadecimal in quoted-printable
	*/
	static const char index_hex[256] = {
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,XX,XX, XX,XX,XX,XX,
	XX,10,11,12, 13,14,15,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,10,11,12, 13,14,15,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	};
	#define HEXCHAR(c) (index_hex[(unsigned char)(c)])

	/*
	* Table for decoding base64
	*/
	static const char index_64[256] = {
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,62, XX,XX,XX,63,
	52,53,54,55, 56,57,58,59, 60,61,XX,XX, XX,XX,XX,XX,
	XX, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
	15,16,17,18, 19,20,21,22, 23,24,25,XX, XX,XX,XX,XX,
	XX,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
	41,42,43,44, 45,46,47,48, 49,50,51,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
	};
	#define CHAR64(c) (index_64[(unsigned char)(c)])

	#define USASCII(c) (chartables_us_ascii[0][(unsigned char)(c)][0])

	/*
	* Lookup the character set 'name'. Returns the character set number
	* or -1 if there is no matching character set.
	*/
	int charset_lookupname(const char *name)
	{
	int i;

	for (i=0; i<chartables_num_charsets; i++) {
	if (!strcasecmp(name, chartables_charset_table[i].name)) return i;
	}
	return -1;
	}

	/*
	* Convert the string 's' in the character set numbered 'charset'
	* into canonical searching form. Decodes into 'retval', which
	* must be reallocable and currently at least size 'alloced'.
	*/
	char charset_convert(const char s, int charset, char *retval,
	int alloced)
	{
	int pos = 0;
	struct decode_state state;

	if (!s) return 0;

	if (charset < 0 \|\| charset >= chartables_num_charsets) return xstrdup(EMPTY_STRING);

	START(state,chartables_charset_table[charset].table);

	if (!alloced) {
	alloced = GROWSIZE;
	retval = xmalloc(alloced);
	}
	*retval = '\0';

	while (*s) {
	if (pos + charset_max_translation >= alloced) {
	alloced += GROWSIZE;
	retval = xrealloc(retval, alloced);
	}
	TRANSLATE(state, *s, retval, pos);
	s++;
	}

	retval[pos] = '\0';
	return retval;
	}

	/*
	* Decode MIME strings (per RFC 2047) in 's'. It writes the decoded
	* string to 'retval', calling realloc() as needed. (Thus retval may
	* be NULL.) Returns retval, contining 's' in canonical searching form.
	*/
	char charset_decode_mimeheader(const char s, char *retval, int alloced)
	{
	int eatspace = 0;
	const char start, endcharset, encoding, end;
	const char *p;
	int i, c, c1, c2, c3, c4;
	struct decode_state state;
	int pos = 0;
	int len;

	if (!s) return 0;

	START(state,chartables_charset_table[0].table); /* just for msvc lint */
	start = s;
	while ((start = (const char*) strchr(start, '=')) != 0) {
	start++;
	if (*start != '?') continue;
	encoding = (const char*) strchr(start+1, '?');
	if (!encoding) continue;
	endcharset =
	(const char) strchr(start+1, ''); /* Language code delimiter */
	if (!endcharset \|\| endcharset > encoding) endcharset = encoding;
	if (encoding[1] != 'b' && encoding[1] != 'B' &&
	encoding[1] != 'q' && encoding[1] != 'Q') continue;
	if (encoding[2] != '?') continue;
	end = (const char*) strchr(encoding+3, '?');
	if (!end \|\| end[1] != '=') continue;

	/*
	* We have recognized a valid 1522-word.
	* Copy over leading text, unless it consists entirely of
	* whitespace and is between two 1522-words.
	*/
	if (eatspace) {
	for (p = s; p < (start-1) && isspace((int) *p); p++);
	if (p < (start-1)) eatspace = 0;
	}
	if (!eatspace) {
	len = start - s - 1;
	if (pos + len >= alloced) {
	alloced += len + GROWSIZE;
	retval = xrealloc(retval, alloced);
	}
	while (len--) {
	c = USASCII(*s);
	if (c != END) retval[pos++] = (char)c;
	s++;
	}
	}

	/*
	* Get the 1522-word's character set
	*/
	start++;
	for (i=0; i<chartables_num_charsets; i++) {
	if ((int)strlen(chartables_charset_table[i].name) == endcharset-start &&
	!strncasecmp(start, chartables_charset_table[i].name, endcharset-start)) {
	START(state,chartables_charset_table[i].table);
	break;
	}
	}

	if (i == chartables_num_charsets) {
	/* Unrecognized charset, nothing will match here */
	if (pos + 2 >= alloced) {
	alloced += 2 + GROWSIZE;
	retval = xrealloc(retval, alloced);
	}
	strcpy(retval+pos, EMPTY_STRING);
	pos += 1;
	}
	else if (encoding[1] == 'q' \|\| encoding[1] == 'Q') {
	/* Decode 'Q' encoding */
	p = encoding+3;
	while (p < end) {
	c = *p++;
	if (c == '=') {
	c = HEXCHAR(*p);
	p++;
	i = HEXCHAR(*p);
	p++;
	if (c == XX \|\| i == XX) {
	c = '\0';
	}
	else {
	c = (char)((c << 4) + i);
	}
	}
	else if (c == '_') c = ' ';

	if (pos + charset_max_translation >= alloced) {
	alloced += GROWSIZE;
	retval = xrealloc(retval, alloced);
	}
	TRANSLATE(state, c, retval, pos);
	}
	}
	else {
	/* Decode 'B' encoding */
	p = encoding+3;
	while (p < end) {
	if (pos + charset_max_translation*3 >= alloced) {
	alloced += GROWSIZE;
	retval = xrealloc(retval, alloced);
	}
	c1 = CHAR64(p[0]);
	if (c1 == XX) break;
	c2 = CHAR64(p[1]);
	if (c2 == XX) break;
	TRANSLATE(state,((c1<<2) \| ((c2&0x30)>>4)), retval, pos);

	c3 = CHAR64(p[2]);
	if (c3 == XX) break;
	TRANSLATE(state,(((c2&0XF)<<4) \| ((c3&0x3C)>>2)), retval, pos);

	c4 = CHAR64(p[3]);
	if (c4 == XX) break;
	TRANSLATE(state,(((c3&0x03) <<6) \| c4), retval, pos);

	p += 4;
	}
	}

	/* Prepare for the next iteration */
	s = start = end+2;
	eatspace = 1;
	}

	/* Copy over the tail part of the input string */
	len = strlen(s);
	if (pos + len >= alloced) {
	alloced += len + 1;
	retval = xrealloc(retval, alloced);
	}
	while (len--) {
	c = USASCII(*s);
	if (c != END) retval[pos++] = (char)c;
	s++;
	}
	retval[pos] = '\0';
	return retval;
	}

	/*
	* Compile the pattern 's' and return a pointer to the compiled form
	*/
	comp_pat charset_compilepat(const char s)
	{
	comp_pat *pat;
	int i, c, len;

	pat = (comp_pat )xmalloc(PATSIZE sizeof(comp_pat));
	PATLEN(pat) = len = strlen(s);
	if (len) {
	PATLASTCHAR(pat) = c = (unsigned char)s[len-1];
	if (isupper(c)) PATOTHERLASTCHAR(pat) = TOLOWER(c);
	else if (islower(c)) PATOTHERLASTCHAR(pat) = TOUPPER(c);
	else PATOTHERLASTCHAR(pat) = c;
	}
	for (i=0; i<512; i++) pat[i] = len;
	for (i=0; i<len; i++) {
	c = (unsigned char)s[i];
	PATASCII(pat)[c] = pat[c] = len-i-1;
	if (c & 0x80) PATASCII(pat)[0x80] = 0;
	}
	for (i='A'; i<='Z'; i++) {
	PATASCII(pat)[i] = PATASCII(pat)[i-'A'+'a'];
	}
	return pat;
	}

	/*
	* Free the compiled pattern 'pat'
	*/
	void charset_freepat(comp_pat *pat)
	{
	free((char *)pat);
	}

	/*
	* Search for the string 'substr', with compiled pattern 'pat'
	* in the string 's', with length 'len'. Return nonzero if match
	*/
	int charset_searchstring(const char substr, comp_pat pat,
	const char *s, int len)
	{
	int i, j, large;

	assert(pat != NULL);
	i = PATLEN(pat) - 1;
	if (i < 0) return 1;
	pat[PATLASTCHAR(pat)] = large = len + i + 2;
	for (;;) {
	/* Inner loop -- scan until last char match or end of string */
	while (i < len) {
	i += pat[(unsigned char)s[i]];
	}

	/* End of string */
	if (i < large) return 0;

	/* Last char match--back up and do compare */
	i -= large + 1;
	j = PATLEN(pat) - 2;
	while (j >= 0 && s[i] == substr[j]) {
	i--;
	j--;
	}
	if (j < 0) return 1; /* Found match */
	if (pat[(unsigned char)s[i]] == large \|\|
	pat[(unsigned char)s[i]] < PATLEN(pat)-j) {
	i += PATLEN(pat) - j;
	}
	else {
	i += pat[(unsigned char)s[i]];
	}
	}
	}

	static int xlate(int index, char *to) {
	const unsigned char *from = chartables_long_translations + index;
	int i = 0;

	while ((to++ = from++) != END) i++;
	return i;
	}

	static int writeutf8(unsigned utfcode, char *to)
	{
	int table = chartables_unicode_block[utfcode>>8];
	int idx = 0;
	struct decode_state state;

	if (table == 255) {
	/* No translations in this block */
	if (utfcode > 0x7ff) {
	to[0] = (char)(0xE0 + (utfcode >> 12));
	to[1] = (char)(0x80 + ((utfcode >> 6) & 0x3f));
	to[2] = (char)(0x80 + (utfcode & 0x3f));
	return 3;
	}
	if (utfcode > 0x7f) {
	to[0] = (char)(0xC0 + (utfcode >> 6));
	to[1] = (char)(0x80 + (utfcode & 0x3f));
	return 2;
	}
	to[0] = (char)utfcode;
	return 1;
	}

	START(state, chartables_unicode + table);
	TRANSLATE(state, (utfcode & 0xff), to, idx);

	return idx;

	}

	/*
	* The various charset_searchfile() helper functions
	*/
	struct input_state;
	typedef int rawproc_t(struct input_state state, char buf, int size);

	static int charset_readconvert(struct input_state state, char buf, int size);
	static rawproc_t charset_readplain;
	static rawproc_t charset_readplain_nospc;
	static rawproc_t charset_readmapnl;
	static rawproc_t charset_readqp;
	static rawproc_t charset_readqp_nospc;
	static rawproc_t charset_readqpmapnl;
	static rawproc_t charset_readbase64;
	static rawproc_t charset_readbase64_nospc;

	/*
	* State for the various charset_searchfile() helper functions
	*/
	struct input_state {
	rawproc_t rawproc; / Function to read and transfer-decode data */
	const char rawbase; / Location in mapped file of raw data */
	int rawlen; /* # bytes raw data left to read from file */
	char decodebuf[2048]; /* Buffer of data deocded, but not converted
	* into canonical searching form */
	int decodestart, decodeleft; /* Location/count of decoded data */
	struct decode_state decodestate; /* Charset state to convert decoded data
	* into canonical searching form */
	};


	/*
	* Search for the string 'substr' in the next 'len' bytes of
	* 'msg_base'. If 'mapnl' is nonzero, then LF characters in the file
	* map to CR LF and count as 2 bytes w.r.t. the value of 'len'.
	* 'charset' and 'encoding' specify the character set and
	* content transfer encoding of the data, respectively.
	* Returns nonzero iff the string was found.
	*/
	int charset_searchfile(const char substr, comp_pat pat,
	const char *msg_base, int mapnl, int len, int charset, int encoding)
	{
	int substrlen = PATLEN(pat);
	char *buf, smallbuf[2048];
	int bufsize;
	int n;
	int i, j, large;
	struct input_state state;

	/* Initialize character set mapping */
	if (charset < 0 \|\| charset >= chartables_num_charsets) return 0;
	START(state.decodestate, chartables_charset_table[charset].table);
	state.decodeleft = 0;

	/* check for trivial search */
	if (substrlen == 0) return 1;

	/*
	* Select buffer to hold canonical searching fomat data to
	* search
	*/
	if (substrlen < (int) sizeof(smallbuf)/2) {
	bufsize = sizeof(smallbuf);
	buf = smallbuf;
	}
	else {
	bufsize = substrlen+sizeof(smallbuf);
	buf = xmalloc(bufsize);
	}

	/* Optimized searching of us-ascii, using boyer-moore */
	if (charset == 0) {
	/* Initialize transfer-decoding */
	state.rawbase = msg_base;
	state.rawlen = len;
	/* don't need to special case mapnl since all such chars will
	be ignored, anyway */
	switch (encoding) {
	case ENCODING_NONE:
	state.rawproc = charset_readplain_nospc;
	break;

	case ENCODING_QP:
	state.rawproc = charset_readqp_nospc;
	break;

	case ENCODING_BASE64:
	state.rawproc = charset_readbase64_nospc;
	/* XXX have to have nl-mapping base64 in order to
	* properly count \n as 2 raw characters
	*/
	break;

	default:
	/* Don't know encoding--nothing can match */
	return 0;
	}

	if (PATASCII(pat)[0x80] == 0) {
	/* 8-bit chars in pattern--search must fail */
	if (buf != smallbuf) free(buf);
	return 0;
	}

	n = (*state.rawproc)(&state, buf, bufsize);
	if (n < substrlen) {
	if (buf != smallbuf) free(buf);
	return 0;
	}
	i = substrlen - 1;
	PATASCII(pat)[PATLASTCHAR(pat)] =
	PATASCII(pat)[PATOTHERLASTCHAR(pat)] = large = bufsize + i + 2;

	for (;;) {
	/* Inner loop -- scan until last char match or end of buffer */
	while (i < n) {
	i += PATASCII(pat)[(unsigned char)buf[i]];
	}

	/* End of buffer */
	if (i < large) {
	/* Read in more stuff */
	j = i-n;
	strncpy(buf, buf+i-(substrlen-1), substrlen-1-j);
	n = (*state.rawproc)(&state, buf+substrlen-1-j, bufsize-substrlen+1+j);
	i = substrlen-1;
	if (n > 0) {
	n += i-j;
	continue;
	}
	if (buf != smallbuf) free(buf);
	return 0;
	}

	/* Last char match--back up and do compare */
	i -= large + 1;
	j = PATLEN(pat) - 2;
	while (j >= 0 && TOLOWER(buf[i]) == TOLOWER(substr[j])) {
	i--;
	j--;
	}
	if (j < 0) {
	/* Found match */
	if (buf != smallbuf) free(buf);
	return 1;
	}
	if (PATASCII(pat)[(unsigned char)buf[i]] == large \|\|
	PATASCII(pat)[(unsigned char)buf[i]] < PATLEN(pat)-j) {
	i += PATLEN(pat) - j;
	}
	else {
	i += PATASCII(pat)[(unsigned char)buf[i]];
	}
	}
	/* NOTREACHED */
	}

	/* Do the (generalized) search */

	/* Initialize transfer-decoding */
	state.rawbase = msg_base;
	state.rawlen = len;
	switch (encoding) {
	case ENCODING_NONE:
	state.rawproc = mapnl ? charset_readmapnl : charset_readplain;
	break;

	case ENCODING_QP:
	state.rawproc = mapnl ? charset_readqpmapnl : charset_readqp;
	break;

	case ENCODING_BASE64:
	state.rawproc = charset_readbase64;
	/* XXX have to have nl-mapping base64 in order to
	* properly count \n as 2 raw characters
	*/
	break;

	default:
	/* Don't know encoding--nothing can match */
	return 0;
	}

	n = charset_readconvert(&state, buf, bufsize);
	if (n < substrlen) {
	if (buf != smallbuf) free(buf);
	return 0;
	}
	i = substrlen - 1;
	pat[PATLASTCHAR(pat)] = large = bufsize + i + 2;
	for (;;) {
	/* Inner loop -- scan until last char match or end of buffer */
	while (i < n) {
	i += pat[(unsigned char)buf[i]];
	}

	/* End of buffer */
	if (i < large) {
	/* Read in more stuff */
	j = i-n;
	strncpy(buf, buf+i-(substrlen-1), substrlen-1-j);
	n = charset_readconvert(&state, buf+substrlen-1-j,
	bufsize-substrlen+1+j);
	i = substrlen-1;
	if (n > 0) {
	n += i-j;
	continue;
	}
	if (buf != smallbuf) free(buf);
	return 0;
	}

	/* Last char match--back up and do compare */
	i -= large + 1;
	j = PATLEN(pat) - 2;
	while (j >= 0 && buf[i] == substr[j]) {
	i--;
	j--;
	}
	if (j < 0) {
	/* Found match */
	if (buf != smallbuf) free(buf);
	return 1;
	}
	if (pat[(unsigned char)buf[i]] == large \|\|
	pat[(unsigned char)buf[i]] < PATLEN(pat)-j) {
	i += PATLEN(pat) - j;
	}
	else {
	i += pat[(unsigned char)buf[i]];
	}
	}
	}

	/* This is based on charset_searchfile above. */
	int charset_extractfile(index_search_text_receiver_t receiver,
	void* rock, int uid, const char *msg_base, int mapnl, int len, int charset,
	int encoding) {
	char buf[2048];
	int n;
	struct input_state state;

	/* Initialize character set mapping */
	if (charset < 0 \|\| charset >= chartables_num_charsets) return 0;
	START(state.decodestate, chartables_charset_table[charset].table);
	state.decodeleft = 0;

	/* Initialize transfer-decoding */
	state.rawbase = msg_base;
	state.rawlen = len;
	switch (encoding) {
	case ENCODING_NONE:
	state.rawproc = mapnl ? charset_readmapnl : charset_readplain;
	break;

	case ENCODING_QP:
	state.rawproc = mapnl ? charset_readqpmapnl : charset_readqp;
	break;

	case ENCODING_BASE64:
	state.rawproc = charset_readbase64;
	/* XXX have to have nl-mapping base64 in order to
	* properly count \n as 2 raw characters
	*/
	break;

	default:
	/* Don't know encoding--nothing can match */
	return 0;
	}

	/* We don't need to do anything tricky. Just read and convert each block of the
	text, then hand the converted text down to the receiver. */
	do {
	n = charset_readconvert(&state, buf, sizeof(buf));
	if (n > 0) {
	receiver(uid, SEARCHINDEX_PART_BODY,
	SEARCHINDEX_CMD_APPENDPART, buf, n, rock);
	}
	} while (n > 0);

	return 1;
	}

	/*
	* Helper function to read at most 'size' bytes of converted
	* (into canonical searching format) data into 'buf'. Returns
	* the number of converted bytes, or 0 for end-of-data.
	*/
	static int charset_readconvert(struct input_state state, char buf, int size)
	{
	int retval = 0;

	if (state->decodeleft && state->decodestart != 0) {
	memmove(state->decodebuf, state->decodebuf+state->decodestart,
	state->decodeleft);
	}
	state->decodestart = 0;

	state->decodeleft += (*state->rawproc)(state,
	state->decodebuf+state->decodeleft,
	sizeof(state->decodebuf)-state->decodeleft);

	while (state->decodeleft) {
	if (retval + charset_max_translation > size) {
	return retval;
	}
	TRANSLATE(state->decodestate, state->decodebuf[state->decodestart], buf, retval);
	state->decodestart++;
	state->decodeleft--;
	}
	return retval;
	}

	/*
	* Decode the MIME body part (per RFC 2045) of 'len' bytes located at
	* 'msg_base' having the content transfer 'encoding'. Decodes into
	* 'retval' (if necessary), which must be reallocable and currently at
	* least size 'alloced'. Returns the number of decoded bytes in
	* 'outlen'.
	*/
	char charset_decode_mimebody(const char msg_base, int len, int encoding,
	char *retval, int alloced, int outlen)
	{
	struct input_state state;

	/* Initialize transfer-decoding */
	state.rawbase = msg_base;
	state.rawlen = len;

	switch (encoding) {
	case ENCODING_NONE:
	*outlen = len;
	return (char *) msg_base;

	case ENCODING_QP:
	state.rawproc = charset_readqp;
	break;

	case ENCODING_BASE64:
	state.rawproc = charset_readbase64;
	break;

	default:
	/* Don't know encoding--nothing can match */
	return NULL;
	}

	if (alloced < len+1) retval = xrealloc(retval, len+1);
	outlen = (state.rawproc)(&state, *retval, len);
	(retval)[outlen] = '\0';
	return *retval;
	}

	/*
	* Helper function to read at most 'size' bytes of trivial
	* transfer-decoded data into 'buf'. Returns the number of decoded
	* bytes, or 0 for end-of-data.
	*/
	static int charset_readplain(struct input_state state, char buf, int size)
	{
	if (size > state->rawlen) size = state->rawlen;
	if (!size) return 0;

	memcpy(buf, state->rawbase, size);
	state->rawlen -= size;
	state->rawbase += size;

	return size;
	}

	/*
	* Helper function to read at most 'size' bytes of trivial
	* transfer-decoded data into 'buf'. Removes any US-ASCII whitespace.
	* Returns the number of decoded bytes, or 0 for end-of-data.
	*/
	static int charset_readplain_nospc(struct input_state *state,
	char *buf, int size)
	{
	int i;

	for (i = 0; i < size; i++) {
	/* remove any whitespace at the beginning of rawbase */
	while (state->rawlen > 0 && USASCII(*state->rawbase) == END) {
	state->rawlen--;
	state->rawbase++;
	}

	if (state->rawlen == 0) break;

	/* copy a char */
	buf[i] = *state->rawbase++;
	state->rawlen--;
	}

	return i;
	}

	/*
	* Helper function to read at most 'size' bytes of trivial newline-mapped
	* transfer-decoded data into 'buf'. Returns the number of decoded
	* bytes, or 0 for end-of-data.
	*/
	static int charset_readmapnl(struct input_state state, char buf, int size)
	{
	int retval = 0;
	char c;

	while (size && state->rawlen > 0) {
	c = *state->rawbase;
	if (c == '\n') {
	if (size < 2) {
	return retval;
	}
	*buf++ = '\r';
	retval++;
	size--;
	state->rawlen--;
	}
	*buf++ = c;
	state->rawbase++;
	state->rawlen--;
	retval++;
	size--;
	}
	return retval;
	}

	/*
	* Helper function to read at most 'size' bytes of quoted-printable
	* transfer-decoded data into 'buf'. Returns the number of decoded
	* bytes, or 0 for end-of-data.
	*/
	static int charset_readqp(struct input_state state, char buf, int size)
	{
	int retval = 0;
	int c, c1, c2;
	const char nextline, endline;

	nextline = endline = state->rawbase;

	while (size && state->rawlen) {
	if (state->rawbase >= nextline) {
	/* Ignore trailing whitespace at end of line */

	nextline =
	(const char*) memchr(state->rawbase+1, '\r', state->rawlen-1);
	if (!nextline) nextline = state->rawbase + state->rawlen;
	endline = nextline;
	while (endline > state->rawbase &&
	(endline[-1] == ' ' \|\| endline[-1] == '\t')) {
	endline--;
	}
	}
	if (state->rawbase >= endline) {
	state->rawbase += nextline - endline;
	state->rawlen -= nextline - endline;
	continue;
	}

	c = state->rawbase[0];
	if (c == '=') {
	if (state->rawlen < 3) {
	return retval;
	}
	c1 = state->rawbase[1];
	c2 = state->rawbase[2];
	state->rawbase += 3;
	state->rawlen -= 3;
	c1 = HEXCHAR(c1);
	c2 = HEXCHAR(c2);
	/* Following line also takes care of soft line breaks */
	if (c1 == XX && c2 == XX) continue;
	*buf++ = (char)((c1 << 4) + c2);
	retval++;
	size--;
	}
	else {
	state->rawbase++;
	state->rawlen--;
	*buf++ = (char)c;
	retval++;
	size--;
	}
	}
	return retval;
	}

	/*
	* Helper function to read at most 'size' bytes of quoted-printable
	* transfer-decoded data into 'buf'. Returns the number of decoded
	* bytes, or 0 for end-of-data. Removes any US-ASCII whitespace.
	* Since it just throws out \r's anyway, it's simplier than paying
	* attention to them
	*/
	static int charset_readqp_nospc(struct input_state state, char buf, int size)
	{
	int retval = 0;
	int c, c1, c2;
	char dec;
	const char nextline, endline;

	nextline = endline = state->rawbase;

	while (size && state->rawlen) {
	if (state->rawbase >= nextline) {
	/* Ignore trailing whitespace at end of line */

	nextline =
	(const char*) memchr(state->rawbase+1, '\n', state->rawlen-1);
	if (!nextline) nextline = state->rawbase + state->rawlen;
	endline = nextline;
	while (endline > state->rawbase && (USASCII(endline[-1]) == END)) {
	endline--;
	}
	}
	if (state->rawbase >= endline) {
	state->rawbase += nextline - endline;
	state->rawlen -= nextline - endline;
	continue;
	}

	c = state->rawbase[0];
	if (c == '=') {
	if (state->rawlen < 3) {
	return retval;
	}
	c1 = state->rawbase[1];
	c2 = state->rawbase[2];
	state->rawbase += 3;
	state->rawlen -= 3;
	c1 = HEXCHAR(c1);
	c2 = HEXCHAR(c2);
	/* Following line also takes care of soft line breaks */
	if (c1 == XX && c2 == XX) continue;
	dec = (char)((c1 << 4) + c2);
	if (USASCII(dec) != END) {
	/* non-whitespace, take it */
	*buf++ = (char)((c1 << 4) + c2);
	retval++;
	size--;
	}
	}
	else {
	state->rawbase++;
	state->rawlen--;
	if (USASCII(c) != END) {
	/* non-whitespace, grab it */
	*buf++ = (char)c;
	retval++;
	size--;
	}
	}
	}
	return retval;
	}

	/*
	* Helper function to read at most 'size' bytes of QP newline-mapped
	* transfer-decoded data into 'buf'. Returns the number of decoded
	* bytes, or 0 for end-of-data.
	*/
	static int charset_readqpmapnl(struct input_state state, char buf, int size)
	{
	int retval = 0;
	int c, c1, c2;
	const char nextline, endline;

	nextline = endline = state->rawbase;

	while (size && state->rawlen > 0) {
	if (state->rawbase >= nextline) {
	/* Ignore trailing whitespace at end of line */

	nextline = (const char*)
	memchr(state->rawbase+1, '\n', state->rawlen - 1);
	if (!nextline) nextline = state->rawbase + state->rawlen;
	endline = nextline;
	while (endline > state->rawbase &&
	(endline[-1] == ' ' \|\| endline[-1] == '\t')) {
	endline--;
	}
	}
	if (state->rawbase >= endline) {
	state->rawbase += nextline - endline;
	state->rawlen -= nextline - endline;
	continue;
	}

	c = state->rawbase[0];
	if (c == '=') {
	if (state->rawbase+1 == endline) {
	state->rawbase = nextline + 1;
	state->rawlen -= 3 + (nextline - endline);

	continue;
	}
	if (state->rawlen < 3) {
	return retval;
	}
	c1 = state->rawbase[1];
	c2 = state->rawbase[2];
	state->rawbase += 3;
	state->rawlen -= 3;
	if (c2 == '\n') state->rawlen--;
	c1 = HEXCHAR(c1);
	c2 = HEXCHAR(c2);
	if (c1 == XX && c2 == XX) continue;
	*buf++ = (char)((c1 << 4) + c2);
	retval++;
	size--;
	}
	else if (c == '\n') {
	if (size < 2) {
	return retval;
	}
	state->rawbase++;
	state->rawlen -= 2;
	*buf++ = '\r';
	*buf++ = '\n';
	retval += 2;
	size -= 2;
	}
	else {
	state->rawbase++;
	state->rawlen--;
	*buf++ = (char)c;
	retval++;
	size--;
	}
	}
	return retval;
	}

	/*
	* Helper function to read at most 'size' bytes of base64
	* transfer-decoded data into 'buf'. Returns the number of decoded
	* bytes, or 0 for end-of-data.
	*/
	static int charset_readbase64(struct input_state state, char buf, int size)
	{
	int retval = 0;
	int c1, c2, c3, c4;

	while (size >= 3 && state->rawlen) {
	do {
	c1 = *state->rawbase++;
	state->rawlen--;
	if (c1 == '=') {
	state->rawlen = 0;
	return retval;
	}
	} while (state->rawlen && CHAR64(c1) == XX);
	if (!state->rawlen) {
	return retval;
	}

	do {
	c2 = *state->rawbase++;
	state->rawlen--;
	if (c2 == '=') {
	state->rawlen = 0;
	return retval;
	}
	} while (state->rawlen && CHAR64(c2) == XX);
	if (!state->rawlen) {
	return retval;
	}

	do {
	c3 = *state->rawbase++;
	state->rawlen--;
	if (c3 == '=') {
	*buf++ = (char)((CHAR64(c1)<<2) \| ((CHAR64(c2)&0x30)>>4));
	retval++;
	state->rawlen = 0;
	return retval;
	}
	} while (state->rawlen && CHAR64(c3) == XX);
	if (!state->rawlen) {
	return retval;
	}

	do {
	c4 = *state->rawbase++;
	state->rawlen--;
	if (c4 == '=') {
	*buf++ = (char)((CHAR64(c1)<<2) \| ((CHAR64(c2)&0x30)>>4));
	*buf++ = (char)(((CHAR64(c2)&0xf)<<4) \| ((CHAR64(c3)&0x3c)>>2));
	retval += 2;
	state->rawlen = 0;
	return retval;
	}
	} while (state->rawlen && CHAR64(c4) == XX);
	if (CHAR64(c4) == XX) {
	return retval;
	}

	*buf++ = (char)((CHAR64(c1)<<2) \| ((CHAR64(c2)&0x30)>>4));
	*buf++ = (char)(((CHAR64(c2)&0xf)<<4) \| ((CHAR64(c3)&0x3c)>>2));
	*buf++ = (char)(((CHAR64(c3)&0x3)<<6) \| CHAR64(c4));
	retval += 3;
	size -= 3;
	}
	return retval;
	}

	/*
	* Helper function to read at most 'size' bytes of base64
	* transfer-decoded data into 'buf'. Returns the number of decoded
	* bytes, or 0 for end-of-data. Removes any US-ASCII whitespace.
	*/
	static int charset_readbase64_nospc(struct input_state *state,
	char *buf, int size)
	{
	int retval = 0;
	int c1, c2, c3, c4;
	char dec;

	while (size >= 3 && state->rawlen) {
	do {
	c1 = *state->rawbase++;
	state->rawlen--;
	if (c1 == '=') {
	state->rawlen = 0;
	return retval;
	}
	} while (state->rawlen && CHAR64(c1) == XX);
	if (!state->rawlen) {
	return retval;
	}

	do {
	c2 = *state->rawbase++;
	state->rawlen--;
	if (c2 == '=') {
	state->rawlen = 0;
	return retval;
	}
	} while (state->rawlen && CHAR64(c2) == XX);
	if (!state->rawlen) {
	return retval;
	}

	do {
	c3 = *state->rawbase++;
	state->rawlen--;
	if (c3 == '=') {
	dec = (char)((CHAR64(c1)<<2) \| ((CHAR64(c2)&0x30)>>4));
	if (USASCII(dec) != END) {
	*buf++ = dec;
	retval++;
	}
	state->rawlen = 0;
	return retval;
	}
	} while (state->rawlen && CHAR64(c3) == XX);
	if (!state->rawlen) {
	return retval;
	}

	do {
	c4 = *state->rawbase++;
	state->rawlen--;
	if (c4 == '=') {
	dec = (char)((CHAR64(c1)<<2) \| ((CHAR64(c2)&0x30)>>4));
	if (USASCII(dec) != END) {
	*buf++ = dec;
	retval++;
	}
	dec = (char)(((CHAR64(c2)&0xf)<<4) \| ((CHAR64(c3)&0x3c)>>2));
	if (USASCII(dec) != END) {
	*buf++ = dec;
	retval++;
	}
	state->rawlen = 0;
	return retval;
	}
	} while (state->rawlen && CHAR64(c4) == XX);
	if (CHAR64(c4) == XX) {
	return retval;
	}

	dec = (char)((CHAR64(c1)<<2) \| ((CHAR64(c2)&0x30)>>4));
	if (USASCII(dec) != END) {
	*buf++ = dec;
	retval++;
	size--;
	}
	dec = (char)(((CHAR64(c2)&0xf)<<4) \| ((CHAR64(c3)&0x3c)>>2));
	if (USASCII(dec) != END) {
	*buf++ = dec;
	retval++;
	size--;
	}
	dec = (char)(((CHAR64(c3)&0x3)<<6) \| CHAR64(c4));
	if (USASCII(dec) != END) {
	*buf++ = dec;
	retval++;
	size--;
	}
	}
	return retval;
	}

	/*
	* Base64 encode the MIME body part (per RFC 2045) of 'len' bytes located at
	* 'msg_base'. Encodes into 'retval' which must large enough to
	* accomodate the encoded data. Returns the number of encoded bytes in
	* 'outlen' and the number of encoded lines in 'outlines'.
	*
	* May be called with 'msg_base' as NULL to get the number of encoded
	* bytes for allocating 'retval' of the proper size.
	*/
	#define BASE64_MAX_LINE_LEN 72

	static char base_64[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

	char charset_encode_mimebody(const char msg_base, int len,
	char retval, int outlen, int *outlines)
	{
	const unsigned char *s;
	unsigned char s0, s1, s2;
	char *d;
	int b64_len, b64_lines, cnt;

	b64_len = ((len + 2) / 3) * 4;
	b64_lines = (b64_len + BASE64_MAX_LINE_LEN - 1) / BASE64_MAX_LINE_LEN;

	/* account for CRLF added to each line */
	b64_len += 2 * b64_lines;

	if (outlen) *outlen = b64_len;
	if (outlines) *outlines = b64_lines;

	if (!msg_base) return NULL;

	for (s = (const unsigned char*) msg_base, d = retval, cnt = 0; len;
	s += 3, d += 4, cnt += 4) { /* process tuplets */
	if (cnt == BASE64_MAX_LINE_LEN) {
	/* reset line len count, add CRLF */
	cnt = 0;
	*d++ = '\r';
	*d++ = '\n';
	}

	s0 = s[0];
	s1 = --len ? s[1] : 0;
	/* byte 1: high 6 bits (1) */
	d[0] = base_64[s0 >> 2];
	/* byte 2: low 2 bits (1), high 4 bits (2) */
	d[1] = base_64[((s0 & 0x3) << 4) \| ((s1 & 0xf0) >> 4)];
	if (len) {
	s2 = --len ? s[2] : 0;
	/* byte 3: low 4 bits (2), high 2 bits (3) */
	d[2] = base_64[((s1 & 0xf) << 2) \| ((s2 & 0xc0) >> 6)];
	} else {
	/* byte 3: pad */
	d[2] = '=';
	}
	if (len) {
	--len;
	/* byte 4: low 6 bits (3) */
	d[3] = base_64[s2 & 0x3f];
	} else {
	/* byte 4: pad */
	d[3] = '=';
	}
	}

	/* add final CRLF */
	*d++ = '\r';
	*d++ = '\n';

	return (b64_len ? retval : NULL);
	}

File Metadata

Mime Type: text/x-c
Expires: Mon, Apr 6, 2:25 AM (1 w, 4 d ago)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 18832057
Default Alt Text: charset.c (36 KB)

charset.cNo OneTemporaryActions

charset.cView Options

File Metadata

Event Timeline

charset.c
No OneTemporary
Actions

charset.c
View Options