Page MenuHomePhorge

charset.c
No OneTemporary

Authored By
Unknown
Size
33 KB
Referenced Files
None
Subscribers
None

charset.c

/* charset.c -- International character set support
*
* Copyright (c) 1994-2008 Carnegie Mellon University. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The name "Carnegie Mellon University" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For permission or any legal
* details, please contact
* Carnegie Mellon University
* Center for Technology Transfer and Enterprise Creation
* 4615 Forbes Avenue
* Suite 302
* Pittsburgh, PA 15213
* (412) 268-7393, fax: (412) 268-7395
* innovation@andrew.cmu.edu
*
* 4. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by Computing Services
* at Carnegie Mellon University (http://www.cmu.edu/computing/)."
*
* CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* $Id: charset.c,v 1.55 2010/01/06 17:01:44 murch Exp $
*/
#include <config.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "assert.h"
#include "charset.h"
#include "xmalloc.h"
#include "chartable.h"
#include "util.h"
/* Decoder state used by qp2byte (quoted-printable decoding). */
struct qp_state {
/* non-zero: qp2byte decodes '_' as a space (set from qp_init's argument) */
int isheader;
/* hex digits still expected after an '='; 0 when not inside an escape */
int bytesleft;
/* value accumulated from the hex digits, or -1 once a non-hex digit was seen */
int codepoint;
};
/* Decoder state used by b64_2byte (base64 decoding). */
struct b64_state {
/* position inside the current 4-character group: 0 = waiting for the
 * first character, then 3, 2, 1 as further characters are expected */
int bytesleft;
/* 6-bit value of the previously consumed base64 character */
int codepoint;
};
/* State shared by the charset-to-unicode stages (table2uni, utf8_2uni,
 * utf7_2uni).  table_switch() zeroes it whenever the charset changes. */
struct table_state {
/* table2uni: table used to look up the next input byte */
const struct charmap (*curtable)[256];
/* table2uni: first table of the charset; 'next' offsets are relative to it */
const struct charmap (*initialtable)[256];
/* utf8_2uni: continuation bytes still expected */
int bytesleft;
/* utf8_2uni / utf7_2uni: bits accumulated for the value being built */
int codepoint;
/* utf7_2uni: 0 = plain ASCII, 1 = just saw '+', 2 = inside base64 data */
int mode;
/* utf7_2uni: number of not-yet-emitted bits held in 'codepoint' */
int num_bits;
};
/* State used by uni2searchform (canonicalisation into search form). */
struct canon_state {
/* CHARSET_* flag bits tested by uni2searchform (SKIPSPACE, MERGESPACE,
 * SKIPDIACRIT) */
int flags;
/* set once a merged space has been emitted; cleared by the next
 * non-whitespace translation */
int seenspace;
};
/* Compiled search pattern, produced by charset_compilepat(). */
struct comp_pat_s {
/* number of characters in the pattern equal to its first character;
 * used as the size of the search_state 'starts' array */
int max_start;
/* length of the pattern in bytes */
size_t patlen;
};
/* State used by byte2search (incremental substring matching). */
struct search_state {
/* stream offsets at which a partial match began; -1 marks an unused slot */
ssize_t *starts;
/* number of slots in 'starts' (copied from the compiled pattern) */
int max_start;
/* set to 1 as soon as the whole pattern has been matched */
int havematch;
/* the pattern being searched for; not copied, points at the caller's string */
unsigned char *substr;
/* length of the pattern in bytes (copied from the compiled pattern) */
size_t patlen;
/* number of bytes consumed from the stream so far */
size_t offset;
};
struct convert_rock;
/* Handler invoked for every value pushed into a stage. */
typedef void convertproc_t(struct convert_rock *rock, int c);
/* Destructor for a stage that needs more than basic_free(). */
typedef void freeconvert_t(struct convert_rock *rock);
/* One stage of a conversion pipeline.  Stages are chained through 'next';
 * values enter at the head and each handler forwards its output to
 * rock->next. */
struct convert_rock {
/* handler called by convert_putc() */
convertproc_t *f;
/* optional destructor; convert_free() falls back to basic_free() if NULL */
freeconvert_t *cleanup;
/* downstream stage, or NULL for a terminal stage */
struct convert_rock *next;
/* handler-private state (one of the *_state structs or a struct buf) */
void *state;
};
#define GROWSIZE 100
#define XX 127
/*
* Table for decoding hexadecimal in quoted-printable
*/
static const char index_hex[256] = {
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,XX,XX, XX,XX,XX,XX,
XX,10,11,12, 13,14,15,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,10,11,12, 13,14,15,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
};
#define HEXCHAR(c) (index_hex[(unsigned char)(c)])
/*
 * Table for decoding base64.
 * Indexed by byte value: 'A'-'Z' map to 0-25, 'a'-'z' to 26-51,
 * '0'-'9' to 52-61, '+' to 62 and '/' to 63; everything else
 * (including '=' and whitespace) is XX.
 */
static const char index_64[256] = {
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,62, XX,XX,XX,63,
52,53,54,55, 56,57,58,59, 60,61,XX,XX, XX,XX,XX,XX,
XX, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,XX, XX,XX,XX,XX,
XX,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
41,42,43,44, 45,46,47,48, 49,50,51,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
};
/* 6-bit value of base64 character c, or XX; the cast keeps the index in 0..255 */
#define CHAR64(c) (index_64[(unsigned char)(c)])
/* Push one value into the stage 'rock' by invoking its handler. */
static inline void convert_putc(struct convert_rock *rock, int c)
{
    (*rock->f)(rock, c);
}
/* Feed every byte of the NUL-terminated string 's' into 'rock'.
 * Each byte is passed as an unsigned value (0..255). */
void convert_cat(struct convert_rock *rock, const char *s)
{
    const char *p;

    for (p = s; *p != '\0'; p++)
        convert_putc(rock, (unsigned char)*p);
}
/* Feed exactly 'len' bytes starting at 's' into 'rock'.  Each byte is
 * passed as an unsigned value (0..255); embedded NULs are not special. */
void convert_catn(struct convert_rock *rock, const char *s, size_t len)
{
    size_t i;

    for (i = 0; i < len; i++)
        convert_putc(rock, (unsigned char)s[i]);
}
/* convertproc_t conversion functions */
/* Quoted-printable decoder stage.
 *
 * Outside an escape, '=' opens a two-digit hex escape, '_' becomes a
 * space when the state is in header mode, and every other value is
 * forwarded unchanged.
 *
 * Inside an escape, each valid hex digit is shifted into the pending
 * value; after the second digit the byte is emitted (0xfffd if the
 * first digit was invalid).  A non-hex character marks the escape
 * invalid and returns immediately, so when the *second* character is
 * non-hex nothing at all is emitted (this is what makes "=\r\n"
 * vanish, since neither '\r' nor '\n' is a hex digit). */
void qp2byte(struct convert_rock *rock, int c)
{
    struct qp_state *qp = (struct qp_state *)rock->state;
    int digit;

    if (!qp->bytesleft) {
        /* not inside an escape */
        if (c == '=') {
            qp->bytesleft = 2;
            qp->codepoint = 0;
            return;
        }
        if (qp->isheader && c == '_')
            c = ' ';
        convert_putc(rock->next, c);
        return;
    }

    /* consuming one of the two characters after '=' */
    qp->bytesleft--;
    digit = HEXCHAR(c);
    if (digit == XX) {
        qp->codepoint = -1;
        return;
    }

    /* keep the invalid marker intact, but still absorb this digit */
    if (qp->codepoint != -1)
        qp->codepoint = (qp->codepoint << 4) + digit;

    if (qp->bytesleft)
        return;

    if (qp->codepoint == -1)
        convert_putc(rock->next, 0xfffd);
    else
        convert_putc(rock->next, qp->codepoint & 0xff);
}
/* Base64 decoder stage.  Characters that are not in the base64 alphabet
 * (whitespace, '=' padding, anything else) are silently skipped.  Every
 * alphabet character after the first of a group of four completes one
 * output byte from the previous and the current 6-bit values. */
void b64_2byte(struct convert_rock *rock, int c)
{
    struct b64_state *st = (struct b64_state *)rock->state;
    int sextet = CHAR64(c);
    int prev;

    if (sextet == XX)
        return;

    prev = st->codepoint;

    if (st->bytesleft == 0) {
        /* first character of a group: nothing to emit yet */
        st->codepoint = sextet;
        st->bytesleft = 3;
    }
    else if (st->bytesleft == 3) {
        convert_putc(rock->next, ((prev << 2) | (sextet >> 4)) & 0xff);
        st->codepoint = sextet;
        st->bytesleft = 2;
    }
    else if (st->bytesleft == 2) {
        convert_putc(rock->next, ((prev << 4) | (sextet >> 2)) & 0xff);
        st->codepoint = sextet;
        st->bytesleft = 1;
    }
    else if (st->bytesleft == 1) {
        convert_putc(rock->next, ((prev << 6) | sextet) & 0xff);
        st->codepoint = 0;
        st->bytesleft = 0;
    }
}
/* Filter stage: drops CR and LF, forwards everything else unchanged. */
void stripnl2uni(struct convert_rock *rock, int c)
{
    if (c == '\r' || c == '\n')
        return;

    convert_putc(rock->next, c);
}
void table2uni(struct convert_rock *rock, int c)
{
struct table_state *s = (struct table_state *)rock->state;
struct charmap *map = (struct charmap *)&s->curtable[0][c & 0xff];
/* propagate errors */
if (c == 0xfffd) {
convert_putc(rock->next, c);
return;
}
if (map->c)
convert_putc(rock->next, map->c);
s->curtable = s->initialtable + map->next;
}
/* UTF-8 decoder stage.
 *
 * A lead byte (110xxxxx, 1110xxxx, 11110xxx) records how many
 * continuation bytes are expected and starts a new value; continuation
 * bytes (10xxxxxx) are folded in and the value is emitted when the last
 * one arrives.  A continuation byte that is not expected is dropped.
 * Anything else is forwarded as-is and resets the state.
 *
 * NOTE(review): bytes 0xf8..0xff match none of the lead/continuation
 * patterns and therefore take the final "forward as-is" branch. */
void utf8_2uni(struct convert_rock *rock, int c)
{
    struct table_state *st = (struct table_state *)rock->state;

    /* propagate errors */
    if (c == 0xfffd) {
        convert_putc(rock->next, c);
        return;
    }

    if ((c & 0xf8) == 0xf0) {           /* 11110xxx: lead of 4 */
        st->bytesleft = 3;
        st->codepoint = c & 0x07;
        return;
    }

    if ((c & 0xf0) == 0xe0) {           /* 1110xxxx: lead of 3 */
        st->bytesleft = 2;
        st->codepoint = c & 0x0f;
        return;
    }

    if ((c & 0xe0) == 0xc0) {           /* 110xxxxx: lead of 2 */
        st->bytesleft = 1;
        st->codepoint = c & 0x1f;
        return;
    }

    if ((c & 0xc0) == 0x80) {           /* 10xxxxxx: continuation */
        if (st->bytesleft <= 0)
            return;                     /* unexpected, ignore */

        st->codepoint = (st->codepoint << 6) + (c & 0x3f);
        st->bytesleft--;
        if (st->bytesleft == 0) {
            convert_putc(rock->next, st->codepoint);
            st->codepoint = 0;
        }
        return;
    }

    /* single-byte value */
    convert_putc(rock->next, c);
    st->bytesleft = 0;
    st->codepoint = 0;
}
/* UTF-7 decoder stage.
 *
 * In ASCII mode (mode == 0) values are forwarded unchanged, except '+'
 * which switches to base64 mode.  In base64 mode each alphabet
 * character contributes 6 bits; whenever 16 bits are available one
 * 16-bit unit is emitted and the remaining bits are kept.  '-' ends the
 * base64 run (and "+-" yields a literal '+'); any other non-alphabet
 * character ends the run and is itself forwarded.
 *
 * Values with bit 7 set are invalid in UTF-7 and are replaced by
 * 0xfffd; an incoming 0xfffd is forwarded untouched.
 *
 * The emitted unit is masked with 0xffff so that all 16 bits survive
 * (a 0x7fff mask would clear bit 15 and corrupt units >= 0x8000). */
void utf7_2uni (struct convert_rock *rock, int c)
{
    struct table_state *s = (struct table_state *)rock->state;

    /* propagate errors */
    if (c == 0xfffd) {
        convert_putc(rock->next, c);
        return;
    }

    /* 8-bit input is not valid UTF-7 */
    if (c & 0x80) {
        convert_putc(rock->next, 0xfffd);
        return;
    }

    /* regular ASCII mode */
    if (!s->mode) {
        if (c == '+') {
            /* switch mode, but no content processed yet */
            s->mode = 1;
            s->codepoint = 0;
            s->num_bits = 0;
        }
        else {
            convert_putc(rock->next, c);
        }
        return;
    }

    /* inside a base64 fragment: '-' marks its end */
    if (c == '-') {
        /* special case: the sequence "+-" produces '+' */
        if (s->mode == 1)
            convert_putc(rock->next, '+');
        s->mode = 0;
        s->num_bits = 0;
        s->codepoint = 0;
        return;
    }

    /* any other non-base64 character ends the fragment and is passed on */
    if (CHAR64(c) == XX) {
        convert_putc(rock->next, c);
        s->mode = 0;
        /* XXX: warn if num_bits > 4 or codepoint != 0 */
        s->num_bits = 0;
        s->codepoint = 0;
        return;
    }

    /* base64 character: we now have content, so "+-" no longer applies */
    s->mode = 2;
    s->codepoint = (s->codepoint << 6) + CHAR64(c);
    s->num_bits += 6;

    /* a full 16-bit unit is available: emit it and keep the remainder */
    if (s->num_bits >= 16) {
        s->num_bits -= 16;
        convert_putc(rock->next, (s->codepoint >> s->num_bits) & 0xffff);
        s->codepoint &= ((1 << s->num_bits) - 1); /* avoid overflow by trimming */
    }
}
/* Canonicalisation stage: maps a code point to its search form using
 * the chartables_translation* tables.
 *
 *  - 0xfffd becomes 0xff (illegal in UTF-8, so it will never match).
 *  - Code points whose block has no translation (table value 255) are
 *    forwarded unchanged.
 *  - A translation of 0 produces no output.
 *  - A translation of ' ', '\r' or '\n' is dropped when
 *    CHARSET_SKIPSPACE is set, or collapsed into a single ' ' per run
 *    when CHARSET_MERGESPACE is set.
 *  - A positive translation is emitted as one value.
 *  - A negative translation is the negated index of a 0-terminated
 *    sequence in chartables_translation_multichar; with
 *    CHARSET_SKIPDIACRIT, members in 0x300..0x3ff are skipped. */
void uni2searchform(struct convert_rock *rock, int c)
{
    struct canon_state *canon = (struct canon_state *)rock->state;
    unsigned char block16, block8;
    int xlat;
    int idx;

    /* invalid character becomes an 0xff */
    if (c == 0xfffd) {
        convert_putc(rock->next, 0xff);
        return;
    }

    block16 = chartables_translation_block16[(c>>16) & 0xff];
    if (block16 == 255) {
        /* no translations */
        convert_putc(rock->next, c);
        return;
    }

    block8 = chartables_translation_block8[block16][(c>>8) & 0xff];
    if (block8 == 255) {
        /* no translations */
        convert_putc(rock->next, c);
        return;
    }

    xlat = chartables_translation[block8][c & 0xff];

    /* zero length output */
    if (xlat == 0)
        return;

    if (xlat == ' ' || xlat == '\r' || xlat == '\n') {
        /* whitespace handling */
        if (canon->flags & CHARSET_SKIPSPACE)
            return;
        if (canon->flags & CHARSET_MERGESPACE) {
            if (canon->seenspace)
                return;
            canon->seenspace = 1;
            xlat = ' ';     /* one SPACE char */
        }
    }
    else {
        canon->seenspace = 0;
    }

    /* single value output */
    if (xlat > 0) {
        convert_putc(rock->next, xlat);
        return;
    }

    /* multiple values; whitespace was already stripped from these
     * sequences */
    for (idx = -xlat; chartables_translation_multichar[idx]; idx++) {
        int piece = chartables_translation_multichar[idx];

        /* diacritical range; duplicates the behaviour of Cyrus
         * versions before 2.5 */
        if ((canon->flags & CHARSET_SKIPDIACRIT) && (piece & ~0xff) == 0x300)
            continue;

        convert_putc(rock->next, piece);
    }
}
/* UTF-8 encoder stage: writes the code point 'c' downstream as 1 to 4
 * bytes depending on its magnitude.  Values up to 0x7f (including any
 * negative value) are forwarded as a single unit unchanged. */
void uni2utf8(struct convert_rock *rock, int c)
{
    struct convert_rock *out = rock->next;

    if (c <= 0x7f) {
        convert_putc(out, c);
    }
    else if (c <= 0x7ff) {
        convert_putc(out, 0xC0 + ((c >> 6) & 0x1f));
        convert_putc(out, 0x80 + (c & 0x3f));
    }
    else if (c <= 0xffff) {
        convert_putc(out, 0xE0 + ((c >> 12) & 0x0f));
        convert_putc(out, 0x80 + ((c >> 6) & 0x3f));
        convert_putc(out, 0x80 + (c & 0x3f));
    }
    else {
        convert_putc(out, 0xF0 + ((c >> 18) & 0x07));
        convert_putc(out, 0x80 + ((c >> 12) & 0x3f));
        convert_putc(out, 0x80 + ((c >> 6) & 0x3f));
        convert_putc(out, 0x80 + (c & 0x3f));
    }
}
/* Terminal search stage: consumes one byte of the stream and advances
 * every partial match of the pattern.
 *
 * s->starts holds the stream offsets at which still-viable partial
 * matches began (-1 = unused slot).  For each incoming byte the live
 * entries are compacted to the front of the array, entries whose next
 * pattern byte does not match are dropped, and an entry that reaches
 * the last pattern byte sets s->havematch.  If the byte equals the
 * first pattern byte a new partial match is started at the current
 * offset (or, for a one-byte pattern, the match is reported at once).
 *
 * An incoming 0xfffd is treated as the byte 0xff.  The byte value is
 * derived *after* that remapping so the substitution actually takes
 * effect. */
void byte2search(struct convert_rock *rock, int c)
{
    struct search_state *s = (struct search_state *)rock->state;
    int i, cur;
    unsigned char b;

    if (c == 0xfffd) {
        c = 0xff; /* searchable by invalid character! */
    }
    b = (unsigned char)c;

    /* check our "in_progress" matches to see if they're still valid */
    for (i = 0, cur = 0; i < s->max_start; i++) {
        /* no more active offsets */
        if (s->starts[i] == -1)
            break;

        /* if we've passed one that's not ongoing, copy back */
        if (cur < i)
            s->starts[cur] = s->starts[i];

        /* check that the substring is still matching */
        if (b == s->substr[s->offset - s->starts[i]]) {
            if (s->offset - s->starts[i] == s->patlen - 1) {
                /* we're there! */
                s->havematch = 1;
            }
            else {
                /* keep this one, it's ongoing */
                cur++;
            }
        }
    }

    /* starting a new one! */
    if (b == s->substr[0]) {
        /* a one-byte pattern is complete immediately */
        if (s->patlen == 1)
            s->havematch = 1;
        else
            s->starts[cur++] = s->offset;
    }

    /* empty out any others that aren't being kept */
    while (cur < i)
        s->starts[cur++] = -1;

    /* increment the offset counter */
    s->offset++;
}
/* Terminal buffer stage: appends the low 8 bits of 'c' to the
 * struct buf stored in the stage's state. */
void byte2buffer(struct convert_rock *rock, int c)
{
    buf_putc((struct buf *)rock->state, c & 0xff);
}
/* convert_rock manipulation routines */
/* Re-target a charset decoder stage at charset number 'charset_num'.
 * The stage's table_state is zeroed, then the handler is chosen:
 * table2uni when the charset has a lookup table, otherwise utf8_2uni or
 * utf7_2uni when the charset name contains "utf-8" / "utf-7".  Any
 * other charset terminates the process with exit(1).
 * 'charset_num' is used as an index without a range check. */
void table_switch(struct convert_rock *rock, int charset_num)
{
    struct table_state *ts = (struct table_state *)rock->state;
    const char *name;

    /* wipe any current state */
    memset(ts, 0, sizeof(*ts));

    /* table based lookup */
    if (chartables_charset_table[charset_num].table) {
        ts->initialtable = chartables_charset_table[charset_num].table;
        ts->curtable = ts->initialtable;
        rock->f = table2uni;
        return;
    }

    name = chartables_charset_table[charset_num].name;

    /* special case UTF-8 */
    if (strstr(name, "utf-8")) {
        rock->f = utf8_2uni;
        return;
    }

    /* special case UTF-7 */
    if (strstr(name, "utf-7")) {
        rock->f = utf7_2uni;
        return;
    }

    /* should never happen */
    exit(1);
}
/* Hand the accumulated contents of a buffer stage over to the caller via
 * buf_release().  The caller owns (and must free) the returned memory.
 * If this is never called, buffer_free() disposes of the buffer. */
static char *buffer_cstring(struct convert_rock *rock)
{
    return buf_release((struct buf *)rock->state);
}
static inline int search_havematch(struct convert_rock *rock)
{
struct search_state *s = (struct search_state *)rock->state;
return s->havematch;
}
/* conversion cleanup routines */
/* Free a single stage and its state block.  Does not follow 'next'.
 * A NULL 'rock' is accepted and ignored. */
void basic_free(struct convert_rock *rock)
{
    if (!rock)
        return;

    free(rock->state);      /* free(NULL) is a no-op */
    free(rock);
}
/* Destructor for a search stage: releases the 'starts' array held in
 * its search_state, then the state and the stage itself. */
void search_free(struct convert_rock *rock)
{
    struct search_state *st =
        rock ? (struct search_state *)rock->state : NULL;

    if (st)
        free(st->starts);

    basic_free(rock);
}
static void buffer_free(struct convert_rock *rock)
{
if (rock && rock->state) {
struct buf *buf = (struct buf *)rock->state;
buf_free(buf);
}
basic_free(rock);
}
/* Free a whole chain of stages starting at 'rock', following 'next'.
 * Each stage is released with its own cleanup handler when it has one,
 * otherwise with basic_free(). */
void convert_free(struct convert_rock *rock)
{
    struct convert_rock *following;

    for (; rock; rock = following) {
        /* read the link before the stage is destroyed */
        following = rock->next;

        if (rock->cleanup)
            rock->cleanup(rock);
        else
            basic_free(rock);
    }
}
/* converter initialisation routines */
/* Create a quoted-printable decoder stage feeding 'next'.
 * 'isheader' non-zero makes the decoder turn '_' into a space.
 * Fields not set here keep whatever xzmalloc() put there (presumably
 * zero -- convert_free() tests 'cleanup', which is never assigned). */
struct convert_rock *qp_init(int isheader, struct convert_rock *next)
{
    struct qp_state *state = xzmalloc(sizeof(*state));
    struct convert_rock *stage = xzmalloc(sizeof(*stage));

    state->isheader = isheader;

    stage->f = qp2byte;
    stage->state = state;
    stage->next = next;

    return stage;
}
struct convert_rock *b64_init(struct convert_rock *next)
{
struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock));
rock->state = xzmalloc(sizeof(struct b64_state));
rock->f = b64_2byte;
rock->next = next;
return rock;
}
/* Create a stage that removes CR and LF before feeding 'next'.
 * The stage carries no state. */
struct convert_rock *stripnl_init(struct convert_rock *next)
{
    struct convert_rock *stage = xzmalloc(sizeof(*stage));

    stage->next = next;
    stage->f = stripnl2uni;

    return stage;
}
/* Create a canonicalisation (search-form) stage feeding 'next'.
 * 'flags' is the set of CHARSET_* bits consulted by uni2searchform. */
struct convert_rock *canon_init(int flags, struct convert_rock *next)
{
    struct canon_state *state = xzmalloc(sizeof(*state));
    struct convert_rock *stage = xzmalloc(sizeof(*stage));

    state->flags = flags;

    stage->state = state;
    stage->f = uni2searchform;
    stage->next = next;

    return stage;
}
/* Create a UTF-8 encoder stage feeding 'next'.  The stage carries no
 * state. */
struct convert_rock *uni_init(struct convert_rock *next)
{
    struct convert_rock *stage = xzmalloc(sizeof(*stage));

    stage->next = next;
    stage->f = uni2utf8;

    return stage;
}
/* Create a charset decoder stage for charset number 'charset_num'
 * feeding 'next'.  The handler is selected by table_switch(). */
struct convert_rock *table_init(int charset_num, struct convert_rock *next)
{
    struct convert_rock *stage = xzmalloc(sizeof(*stage));

    stage->next = next;
    stage->state = xzmalloc(sizeof(struct table_state));
    table_switch(stage, charset_num);

    return stage;
}
/* Create a terminal search stage that looks for 'substr' in the byte
 * stream it is fed.
 *
 * 'pat' must be the compiled form of 'substr' (see
 * charset_compilepat()); its max_start and patlen are copied.
 * 'substr' is not copied, so it must outlive the stage.
 *
 * The stage installs search_free() as its cleanup handler.  The
 * 'starts' array is sized from its own element type rather than from an
 * unrelated type of (usually) the same width. */
struct convert_rock *search_init(const char *substr, comp_pat *pat)
{
    struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock));
    struct search_state *s = xzmalloc(sizeof(struct search_state));
    struct comp_pat_s *p = (struct comp_pat_s *)pat;
    int i;

    /* copy in tracking vars */
    s->max_start = p->max_start;
    s->patlen = p->patlen;
    s->substr = (unsigned char *)substr;

    /* allocate tracking space and initialise to "no match" */
    s->starts = xmalloc(s->max_start * sizeof(*s->starts));
    for (i = 0; i < s->max_start; i++) {
        s->starts[i] = -1;
    }

    /* set up the rock */
    rock->f = byte2search;
    rock->cleanup = search_free;
    rock->state = (void *)s;

    return rock;
}
/* Create a terminal stage that collects bytes into a freshly allocated
 * struct buf.  The stage installs buffer_free() as its cleanup handler. */
static struct convert_rock *buffer_init(void)
{
    struct convert_rock *stage = xzmalloc(sizeof(*stage));

    stage->state = xzmalloc(sizeof(struct buf));
    stage->f = byte2buffer;
    stage->cleanup = buffer_free;

    return stage;
}
/* API */
/*
 * Look up the character set called 'name' (case-insensitively).
 * The name is first mapped through charset_aliases to its canonical
 * form, then searched for in chartables_charset_table.
 * Returns the character set number, or -1 if there is no match.
 */
int charset_lookupname(const char *name)
{
    const char *canon = name;
    int i;

    /* translate to canonical name; the first matching alias wins */
    for (i = 0; charset_aliases[i].name; i++) {
        if (strcasecmp(name, charset_aliases[i].name) == 0) {
            canon = charset_aliases[i].canon_name;
            break;
        }
    }

    /* look up canonical name */
    for (i = 0; i < chartables_num_charsets; i++) {
        if (strcasecmp(canon, chartables_charset_table[i].name) == 0)
            return i;
    }

    return -1;
}
/* Same as charset_lookupname(), for a name given as the first 'len'
 * bytes of 'buf' (not necessarily NUL-terminated).  A temporary copy is
 * made with xstrndup() and freed before returning. */
static int lookup_buf(const char *buf, int len)
{
    char *copy = xstrndup(buf, len);
    int charset = charset_lookupname(copy);

    free(copy);
    return charset;
}
/*
 * Convert the string 's', encoded in character set number 'charset',
 * into canonical searching form (UTF-8, canonicalised per 'flags').
 * Returns a newly allocated string which the caller must free().
 * Returns NULL when 's' is NULL, and a copy of "X" when 'charset' is
 * out of range.
 */
char *charset_convert(const char *s, int charset, int flags)
{
    struct convert_rock *chain, *sink;
    char *out;

    if (s == NULL)
        return NULL;

    if (charset < 0 || charset >= chartables_num_charsets)
        return xstrdup("X");

    /* pipeline: charset decode -> canonicalise -> UTF-8 -> buffer */
    sink = buffer_init();
    chain = table_init(charset,
                       canon_init(flags,
                                  uni_init(sink)));

    convert_cat(chain, s);

    /* take the result out of the buffer before tearing the chain down */
    out = buffer_cstring(sink);
    convert_free(chain);

    return out;
}
/* Convert the UTF-8 string 's' into canonical searching form.
 * Shorthand for charset_convert() with the "utf-8" character set. */
char *charset_utf8_to_searchform(const char *s, int flags)
{
    return charset_convert(s, charset_lookupname("utf-8"), flags);
}
/* Convert 'len' bytes at 'msg_base', in character set number 'charset'
 * and content transfer encoding 'encoding', into UTF-8.
 * Returns a newly allocated string to be freed by the caller; an empty
 * string when 'len' is 0; NULL when 'charset' is out of range or
 * 'encoding' is not one of ENCODING_NONE / ENCODING_QP /
 * ENCODING_BASE64. */
char *charset_to_utf8(const char *msg_base, size_t len, int charset, int encoding)
{
    struct convert_rock *chain, *sink;
    char *out;

    if (charset < 0 || charset >= chartables_num_charsets)
        return 0;

    /* nothing to convert */
    if (len == 0)
        return xstrdup("");

    /* pipeline: [transfer decode ->] charset decode -> UTF-8 -> buffer */
    sink = buffer_init();
    chain = uni_init(sink);
    chain = table_init(charset, chain);

    if (encoding == ENCODING_QP) {
        chain = qp_init(0, chain);
    }
    else if (encoding == ENCODING_BASE64) {
        /* XXX have to have nl-mapping base64 in order to
         * properly count \n as 2 raw characters
         */
        chain = b64_init(chain);
    }
    else if (encoding != ENCODING_NONE) {
        /* unknown encoding: give up */
        convert_free(chain);
        return 0;
    }

    convert_catn(chain, msg_base, len);

    out = buffer_cstring(sink);
    convert_free(chain);

    return out;
}
void mimeheader_cat(struct convert_rock *target, const char *s)
{
struct convert_rock *input, *stripnl;
int eatspace = 0;
const char *start, *endcharset, *encoding, *end;
int len;
int charset;
const char *p;
if (!s) return;
/* set up the conversion path */
input = table_init(0, target);
/* note: we assume the caller of this function has already
* determined that all newlines are followed by whitespace */
stripnl = stripnl_init(input);
start = s;
while ((start = (const char*) strchr(start, '=')) != 0) {
start++;
if (*start != '?') continue;
encoding = (const char*) strchr(start+1, '?');
if (!encoding) continue;
endcharset =
(const char*) strchr(start+1, '*'); /* Language code delimiter */
if (!endcharset || endcharset > encoding) endcharset = encoding;
if (encoding[1] != 'b' && encoding[1] != 'B' &&
encoding[1] != 'q' && encoding[1] != 'Q') continue;
if (encoding[2] != '?') continue;
end = (const char*) strchr(encoding+3, '?');
if (!end || end[1] != '=') continue;
/*
* We have recognized a valid 1522-word.
* Copy over leading text, unless it consists entirely of
* whitespace and is between two 1522-words.
*/
if (eatspace) {
for (p = s; p < (start-1) && Uisspace(*p); p++);
if (p < (start-1)) eatspace = 0;
}
if (!eatspace) {
len = start - s - 1;
table_switch(input, 0); /* US_ASCII */
convert_catn(stripnl, s, len);
}
/*
* Get the 1522-word's character set
*/
start++;
charset = lookup_buf(start, endcharset-start);
if (charset < 0) {
/* Unrecognized charset, nothing will match here */
convert_putc(input, 0xfffd); /* unknown character */
}
else {
struct convert_rock *extract;
table_switch(input, charset);
/* choose decoder */
if (encoding[1] == 'q' || encoding[1] == 'Q') {
extract = qp_init(1, input);
}
else {
extract = b64_init(input);
}
/* convert */
p = encoding+3;
convert_catn(extract, p, end - p);
/* clean up */
basic_free(extract);
}
/* Prepare for the next iteration */
s = start = end+2;
eatspace = 1;
}
/* Copy over the tail part of the input string */
if (*s) {
table_switch(input, 0); /* US_ASCII */
convert_cat(stripnl, s);
}
/* just free these ones, the rest can be cleaned up by the sender */
basic_free(stripnl);
basic_free(input);
}
/*
 * Decode MIME strings (per RFC 2047) in 's'.  Returns a newly allocated
 * string, containing 's' in canonical searching form, which must be
 * free()d by the caller.  Returns NULL when 's' is NULL.
 */
char *charset_decode_mimeheader(const char *s, int flags)
{
    struct convert_rock *sink, *chain;
    char *out;

    if (s == NULL)
        return NULL;

    /* pipeline: canonicalise -> UTF-8 -> buffer */
    sink = buffer_init();
    chain = canon_init(flags, uni_init(sink));

    mimeheader_cat(chain, s);

    out = buffer_cstring(sink);
    convert_free(chain);

    return out;
}
/*
 * Decode MIME strings (per RFC 2047) in 's' into UTF-8, without any
 * search-form canonicalisation.  Returns a newly allocated string which
 * must be free()d by the caller, or NULL when 's' is NULL.
 */
char *charset_parse_mimeheader(const char *s)
{
    struct convert_rock *sink, *chain;
    char *out;

    if (s == NULL)
        return NULL;

    /* pipeline: UTF-8 -> buffer */
    sink = buffer_init();
    chain = uni_init(sink);

    mimeheader_cat(chain, s);

    out = buffer_cstring(sink);
    convert_free(chain);

    return out;
}
/* Search for 'substr' (with compiled pattern 'pat') in the MIME header
 * text 's' after RFC 2047 decoding and canonicalisation per 'flags'.
 * Returns non-zero if the pattern was found. */
int charset_search_mimeheader(const char *substr, comp_pat *pat,
                              const char *s, int flags)
{
    struct convert_rock *matcher, *chain;
    int found;

    /* pipeline: canonicalise -> UTF-8 -> search */
    matcher = search_init(substr, pat);
    chain = canon_init(flags, uni_init(matcher));

    mimeheader_cat(chain, s);

    /* read the verdict before the search stage is destroyed */
    found = search_havematch(matcher);
    convert_free(chain);

    return found;
}
/* Compile the search pattern 's' for later comparison.  We record the
 * length of the string and how many times its first character occurs
 * in it.  That count is an upper bound on the number of partial matches
 * that can be in flight at once; deeper analysis could lower it, but it
 * only costs a few extra bytes of allocation.
 * The result must be released with charset_freepat(). */
comp_pat *charset_compilepat(const char *s)
{
    struct comp_pat_s *compiled = xzmalloc(sizeof(*compiled));
    const char first = *s;
    size_t i;

    /* count occurrences of the first character */
    for (i = 0; s[i] != '\0'; i++) {
        if (s[i] == first)
            compiled->max_start++;
    }
    compiled->patlen = i;

    return (comp_pat *)compiled;
}
/*
 * Release a compiled pattern obtained from charset_compilepat().
 */
void charset_freepat(comp_pat *pat)
{
    struct comp_pat_s *compiled = (struct comp_pat_s *)pat;

    free(compiled);
}
/*
 * Search for the string 'substr', with compiled pattern 'pat', in the
 * 'len' bytes at 's'.  Returns non-zero on a match; an empty 'substr'
 * always matches.
 *
 * 's' is assumed to already be in search normal form (e.g. from a
 * cache file); it is run through the "utf-8" decode / canonicalise /
 * UTF-8 encode pipeline and into the search stage.
 */
int charset_searchstring(const char *substr, comp_pat *pat,
                         const char *s, size_t len, int flags)
{
    struct convert_rock *matcher;
    struct convert_rock *chain;
    int utf8 = charset_lookupname("utf-8");
    int found;
    size_t i;

    /* zero length string always matches */
    if (substr[0] == '\0')
        return 1;

    /* pipeline: UTF-8 decode -> canonicalise -> UTF-8 -> search */
    matcher = search_init(substr, pat);
    chain = uni_init(matcher);
    chain = canon_init(flags, chain);
    chain = table_init(utf8, chain);

    /* feed byte by byte so we can stop at the first match */
    for (i = 0; i < len; i++) {
        convert_putc(chain, (unsigned char)s[i]);
        if (search_havematch(matcher))
            break;
    }

    /* copy the verdict before the chain is destroyed */
    found = search_havematch(matcher);
    convert_free(chain);

    return found;
}
/*
 * Search for the string 'substr' (with compiled pattern 'pat') in the
 * next 'len' bytes of 'msg_base'.
 * 'charset' and 'encoding' specify the character set and content
 * transfer encoding of the data, respectively.
 * Returns nonzero iff the string was found.  Returns 0 when 'charset'
 * is out of range or 'encoding' is unknown; an empty 'substr' matches
 * anything in a valid charset.
 *
 * Input bytes are passed to the pipeline as unsigned values, like
 * convert_cat()/convert_catn() do, so that bytes >= 0x80 are not
 * sign-extended on platforms where plain char is signed.
 */
int charset_searchfile(const char *substr, comp_pat *pat,
                       const char *msg_base, size_t len,
                       int charset, int encoding, int flags)
{
    struct convert_rock *input, *tosearch;
    size_t i;
    int res;

    /* Initialize character set mapping */
    if (charset < 0 || charset >= chartables_num_charsets)
        return 0;

    /* check for trivial search */
    if (strlen(substr) == 0)
        return 1;

    /* set up the conversion path */
    tosearch = search_init(substr, pat);
    input = uni_init(tosearch);
    input = canon_init(flags, input);
    input = table_init(charset, input);

    /* choose encoding extraction if needed */
    switch (encoding) {
    case ENCODING_NONE:
        break;

    case ENCODING_QP:
        input = qp_init(0, input);
        break;

    case ENCODING_BASE64:
        input = b64_init(input);
        /* XXX have to have nl-mapping base64 in order to
         * properly count \n as 2 raw characters
         */
        break;

    default:
        /* Don't know encoding--nothing can match */
        convert_free(input);
        return 0;
    }

    /* implement the loop here so we can check on the search each time */
    for (i = 0; i < len; i++) {
        convert_putc(input, (unsigned char)msg_base[i]);
        if (search_havematch(tosearch)) break;
    }

    res = search_havematch(tosearch); /* copy before we free it */
    convert_free(input);

    return res;
}
/* Convert 'len' bytes at 'msg_base' (character set number 'charset',
 * content transfer encoding 'encoding') into canonical search form and
 * hand the result to 'receiver' in blocks.  Based on
 * charset_searchfile().
 *
 * 'receiver' is called as receiver(uid, rpart, rcmd, data, datalen,
 * rock) each time more than 4096 bytes of output have accumulated, and
 * once more at the end for any remainder.
 *
 * Returns 1 on success, 0 when 'charset' is out of range or 'encoding'
 * is unknown (in which case 'receiver' is never called).
 *
 * Input bytes are passed to the pipeline as unsigned values, like
 * convert_cat()/convert_catn() do, so that bytes >= 0x80 are not
 * sign-extended on platforms where plain char is signed. */
int charset_extractitem(index_search_text_receiver_t receiver,
                        void *rock, int uid,
                        const char *msg_base, size_t len,
                        int charset, int encoding, int flags,
                        int rpart, int rcmd)
{
    struct convert_rock *input, *tobuffer;
    struct buf *out;
    size_t i;

    /* Initialize character set mapping */
    if (charset < 0 || charset >= chartables_num_charsets)
        return 0;

    /* set up the conversion path */
    tobuffer = buffer_init();
    input = uni_init(tobuffer);
    input = canon_init(flags, input);
    input = table_init(charset, input);

    switch (encoding) {
    case ENCODING_NONE:
        break;

    case ENCODING_QP:
        input = qp_init(0, input);
        break;

    case ENCODING_BASE64:
        input = b64_init(input);
        /* XXX have to have nl-mapping base64 in order to
         * properly count \n as 2 raw characters
         */
        break;

    default:
        /* Don't know encoding--nothing can match */
        convert_free(input);
        return 0;
    }

    /* point to the buffer for easy block sending */
    out = (struct buf *)tobuffer->state;

    for (i = 0; i < len; i++) {
        convert_putc(input, (unsigned char)msg_base[i]);

        /* process a block of output every so often */
        if (buf_len(out) > 4096) {
            receiver(uid, rpart, rcmd, out->s, out->len, rock);
            buf_reset(out);
        }
    }

    if (out->len) { /* finish it */
        receiver(uid, rpart, rcmd, out->s, out->len, rock);
    }

    convert_free(input);

    return 1;
}
/* Extract the search form of a message body: charset_extractitem() with
 * the part fixed to SEARCHINDEX_PART_BODY and the command fixed to
 * SEARCHINDEX_CMD_APPENDPART.  Returns what charset_extractitem()
 * returns. */
int charset_extractfile(index_search_text_receiver_t receiver, void *rock,
                        int uid, const char *msg_base, size_t len,
                        int charset, int encoding, int flags)
{
    int ok;

    ok = charset_extractitem(receiver, rock, uid, msg_base, len,
                             charset, encoding, flags,
                             SEARCHINDEX_PART_BODY,
                             SEARCHINDEX_CMD_APPENDPART);
    return ok;
}
/*
 * Decode the MIME body part (per RFC 2045) of @len bytes located at
 * @msg_base having the content transfer @encoding.  Returns a pointer
 * to the decoded bytes and stores their count in *@outlen.
 *
 * For ENCODING_NONE the input itself is returned.  For ENCODING_QP and
 * ENCODING_BASE64 a newly allocated buffer is stored in *@decbuf and
 * returned; the caller must free() *@decbuf if it is not NULL, but only
 * after it is done with the return value (which may alias it).  If
 * decoding yields no buffer, @msg_base and @len are returned instead.
 * For any other encoding NULL is returned with *@outlen set to 0.
 */
const char *charset_decode_mimebody(const char *msg_base, size_t len, int encoding,
                                    char **decbuf, size_t *outlen)
{
    struct convert_rock *chain, *sink;
    struct buf *collected;

    *decbuf = NULL;
    *outlen = 0;

    /* identity encoding: nothing to decode */
    if (encoding == ENCODING_NONE) {
        *outlen = len;
        return msg_base;
    }

    /* Don't know encoding--nothing can match */
    if (encoding != ENCODING_QP && encoding != ENCODING_BASE64)
        return NULL;

    sink = buffer_init();
    if (encoding == ENCODING_QP)
        chain = qp_init(0, sink);
    else
        chain = b64_init(sink);

    convert_catn(chain, msg_base, len);

    /* extract the string from the buffer; the length is read first,
     * before the buffer gives up its contents */
    collected = (struct buf *)sink->state;
    *outlen = collected->len;
    *decbuf = buf_release(collected);

    convert_free(chain);

    if (*decbuf)
        return *decbuf;

    /* didn't get a result - maybe blank input, don't return NULL */
    *outlen = len;
    return msg_base;
}
/*
 * Base64 encode the MIME body part (per RFC 2045) of 'len' bytes located
 * at 'msg_base'.  Encodes into 'retval', which must be large enough to
 * accommodate the encoded data.  Stores the number of encoded bytes in
 * '*outlen' and the number of encoded lines in '*outlines' (either may
 * be NULL).  Every output line holds at most BASE64_MAX_LINE_LEN
 * characters and is terminated by CRLF; no NUL terminator is written.
 *
 * May be called with 'msg_base' as NULL to get the number of encoded
 * bytes for allocating 'retval' of the proper size.
 *
 * Returns 'retval', or NULL when nothing was encoded ('msg_base' or
 * 'retval' is NULL, or 'len' is 0).  In the 'len' == 0 case nothing is
 * written to 'retval', matching the reported '*outlen' of 0.
 */
#define BASE64_MAX_LINE_LEN 72
static const char base_64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
char *charset_encode_mimebody(const char *msg_base, size_t len,
                              char *retval, size_t *outlen, int *outlines)
{
    const unsigned char *s = (const unsigned char *)msg_base;
    char *d = retval;
    size_t cnt = 0;
    /* sizes are computed in size_t so large inputs do not overflow int */
    size_t b64_len = ((len + 2) / 3) * 4;
    size_t b64_lines =
        (b64_len + BASE64_MAX_LINE_LEN - 1) / BASE64_MAX_LINE_LEN;

    /* account for CRLF added to each line */
    b64_len += 2 * b64_lines;

    if (outlen) *outlen = b64_len;
    if (outlines) *outlines = (int)b64_lines;

    /* size query, missing output buffer, or empty input: write nothing */
    if (!msg_base || !retval || !len)
        return NULL;

    while (len) {
        /* number of input bytes in this group (1..3) */
        size_t n = len < 3 ? len : 3;
        unsigned char s0 = s[0];
        unsigned char s1 = n > 1 ? s[1] : 0;
        unsigned char s2 = n > 2 ? s[2] : 0;

        if (cnt == BASE64_MAX_LINE_LEN) {
            /* reset line len count, add CRLF */
            cnt = 0;
            *d++ = '\r';
            *d++ = '\n';
        }

        /* byte 1: high 6 bits (1) */
        d[0] = base_64[s0 >> 2];
        /* byte 2: low 2 bits (1), high 4 bits (2) */
        d[1] = base_64[((s0 & 0x3) << 4) | ((s1 & 0xf0) >> 4)];
        /* byte 3: low 4 bits (2), high 2 bits (3), or pad */
        d[2] = n > 1 ? base_64[((s1 & 0xf) << 2) | ((s2 & 0xc0) >> 6)] : '=';
        /* byte 4: low 6 bits (3), or pad */
        d[3] = n > 2 ? base_64[s2 & 0x3f] : '=';

        s += n;
        len -= n;
        d += 4;
        cnt += 4;
    }

    /* add final CRLF */
    *d++ = '\r';
    *d++ = '\n';

    return retval;
}

File Metadata

Mime Type
text/x-c
Expires
Fri, Apr 24, 10:19 AM (5 h, 4 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
18721896
Default Alt Text
charset.c (33 KB)

Event Timeline