Page Menu
Home
Phorge
Search
Configure Global Search
Log In
Files
F120824244
charset.c
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Authored By
Unknown
Size
33 KB
Referenced Files
None
Subscribers
None
charset.c
View Options
/* charset.c -- International character set support
*
* Copyright (c) 1994-2008 Carnegie Mellon University. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The name "Carnegie Mellon University" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For permission or any legal
* details, please contact
* Carnegie Mellon University
* Center for Technology Transfer and Enterprise Creation
* 4615 Forbes Avenue
* Suite 302
* Pittsburgh, PA 15213
* (412) 268-7393, fax: (412) 268-7395
* innovation@andrew.cmu.edu
*
* 4. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by Computing Services
* at Carnegie Mellon University (http://www.cmu.edu/computing/)."
*
* CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* $Id: charset.c,v 1.55 2010/01/06 17:01:44 murch Exp $
*/
#include
<config.h>
#include
<ctype.h>
#include
<stdlib.h>
#include
<string.h>
#include
"assert.h"
#include
"charset.h"
#include
"xmalloc.h"
#include
"chartable.h"
#include
"util.h"
struct
qp_state
{
int
isheader
;
int
bytesleft
;
int
codepoint
;
};
struct
b64_state
{
int
bytesleft
;
int
codepoint
;
};
/* State shared by the charset-to-unicode converters
 * (table2uni, utf8_2uni and utf7_2uni); table_switch() zeroes it. */
struct table_state {
    const struct charmap (*curtable)[256];      /* table2uni: table for the next input byte */
    const struct charmap (*initialtable)[256];  /* table2uni: base that 'next' offsets are added to */
    int bytesleft;  /* utf8_2uni: continuation bytes still expected */
    int codepoint;  /* utf8_2uni / utf7_2uni: bits accumulated so far */
    int mode;       /* utf7_2uni: 0 = ASCII, 1 = just saw '+', 2 = inside base64 data */
    int num_bits;   /* utf7_2uni: number of valid bits held in codepoint */
};
struct
canon_state
{
int
flags
;
int
seenspace
;
};
/* Compiled search pattern, as built by charset_compilepat(). */
struct comp_pat_s {
    int max_start;  /* how many times the pattern's first character occurs in it */
    size_t patlen;  /* length of the pattern in bytes */
};
/* State for the substring matcher (byte2search). */
struct search_state {
    ssize_t *starts;        /* offsets where a partial match began; -1 = unused slot */
    int max_start;          /* number of slots in 'starts' */
    int havematch;          /* set to 1 once the whole pattern has been seen */
    unsigned char *substr;  /* the pattern bytes (not owned; never freed here) */
    size_t patlen;          /* length of the pattern */
    size_t offset;          /* number of input bytes consumed so far */
};
struct
convert_rock
;
typedef
void
convertproc_t
(
struct
convert_rock
*
rock
,
int
c
);
typedef
void
freeconvert_t
(
struct
convert_rock
*
rock
);
/* One stage of a conversion pipeline; stages are chained through 'next'. */
struct convert_rock {
    convertproc_t *f;           /* handler invoked by convert_putc() */
    freeconvert_t *cleanup;     /* optional destructor; convert_free() uses basic_free() when NULL */
    struct convert_rock *next;  /* stage that receives this stage's output */
    void *state;                /* stage-private state, type depends on 'f' */
};
#define GROWSIZE 100

/* Marker stored in the decode tables for "not a valid digit". */
#define XX 127

/*
 * Hexadecimal digit values for quoted-printable decoding, indexed by
 * byte value.  '0'-'9' map to 0-9, 'A'-'F' and 'a'-'f' map to 10-15,
 * every other byte maps to XX.
 */
static const char index_hex[256] = {
    /* 0x00 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x10 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x20 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x30 */  0, 1, 2, 3,  4, 5, 6, 7,  8, 9,XX,XX, XX,XX,XX,XX,
    /* 0x40 */ XX,10,11,12, 13,14,15,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x50 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x60 */ XX,10,11,12, 13,14,15,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x70 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x80 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x90 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xa0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xb0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xc0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xd0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xe0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xf0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
};

/* Hex value of byte 'c', or XX if it is not a hex digit. */
#define HEXCHAR(c)  (index_hex[(unsigned char)(c)])
/*
 * Base64 digit values, indexed by byte value.  'A'-'Z' map to 0-25,
 * 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and '/' to 63; every
 * other byte (including '=') maps to XX.
 */
static const char index_64[256] = {
    /* 0x00 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x10 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x20 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,62, XX,XX,XX,63,
    /* 0x30 */ 52,53,54,55, 56,57,58,59, 60,61,XX,XX, XX,XX,XX,XX,
    /* 0x40 */ XX, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
    /* 0x50 */ 15,16,17,18, 19,20,21,22, 23,24,25,XX, XX,XX,XX,XX,
    /* 0x60 */ XX,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
    /* 0x70 */ 41,42,43,44, 45,46,47,48, 49,50,51,XX, XX,XX,XX,XX,
    /* 0x80 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0x90 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xa0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xb0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xc0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xd0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xe0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
    /* 0xf0 */ XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX,
};

/* Base64 value of byte 'c', or XX if it is not a base64 digit. */
#define CHAR64(c)  (index_64[(unsigned char)(c)])
/* Hand the value 'c' to the stage 'rock' by invoking its handler. */
static inline void convert_putc(struct convert_rock *rock, int c)
{
    (*rock->f)(rock, c);
}
/* Feed every byte of the NUL-terminated string 's' into 'rock'.
 * Bytes are passed as unsigned values (0-255). */
void convert_cat(struct convert_rock *rock, const char *s)
{
    const unsigned char *p;

    for (p = (const unsigned char *)s; *p != '\0'; p++)
        convert_putc(rock, *p);
}
/* Feed exactly 'len' bytes starting at 's' into 'rock'.
 * Bytes are passed as unsigned values (0-255); NULs are not special. */
void convert_catn(struct convert_rock *rock, const char *s, size_t len)
{
    size_t i;

    for (i = 0; i < len; i++)
        convert_putc(rock, (unsigned char)s[i]);
}
/* convertproc_t conversion functions */
/*
 * Quoted-printable decoder stage.
 *
 * Outside an escape: '=' opens a two-digit hex escape, '_' becomes a
 * space when the state's isheader flag is set, and anything else is
 * passed through to the next stage unchanged.
 *
 * Inside an escape: each character consumes one of the two expected
 * digits.  A non-hex character marks the escape invalid and produces no
 * output for that character; if the escape is invalid when a valid
 * final digit arrives, 0xfffd is emitted instead of a byte.
 */
void qp2byte(struct convert_rock *rock, int c)
{
    struct qp_state *qp = (struct qp_state *)rock->state;
    int digit;

    if (!qp->bytesleft) {
        /* not inside an escape */
        if (c == '=') {
            /* start an encoded byte */
            qp->bytesleft = 2;
            qp->codepoint = 0;
            return;
        }

        /* underscores are space in headers */
        if (qp->isheader && c == '_')
            c = ' ';

        convert_putc(rock->next, c);
        return;
    }

    /* inside an escape: this character uses up one digit slot */
    qp->bytesleft--;
    digit = HEXCHAR(c);

    if (digit == XX) {
        /* mark invalid regardless of which digit it was */
        qp->codepoint = -1;
        return;
    }

    /* keep the invalid marker intact, but still absorb the digit */
    if (qp->codepoint != -1)
        qp->codepoint = (qp->codepoint << 4) + digit;

    if (qp->bytesleft)
        return;     /* second digit still to come */

    convert_putc(rock->next,
                 (qp->codepoint == -1) ? 0xfffd : (qp->codepoint & 0xff));
}
/*
 * Base64 decoder stage.  Characters that are not base64 digits
 * (whitespace, '=' padding, anything else) are silently ignored.  The
 * first digit of a group is only stored; each of the following three
 * digits completes one output byte, which is sent to the next stage.
 */
void b64_2byte(struct convert_rock *rock, int c)
{
    struct b64_state *st = (struct b64_state *)rock->state;
    char sextet = CHAR64(c);
    int out;

    /* could just be whitespace, ignore it */
    if (sextet == XX)
        return;

    switch (st->bytesleft) {
    case 0:
        /* first digit of a group: nothing to emit yet */
        st->codepoint = sextet;
        st->bytesleft = 3;
        return;

    case 3:
        out = (st->codepoint << 2) | (sextet >> 4);
        break;

    case 2:
        out = (st->codepoint << 4) | (sextet >> 2);
        break;

    case 1:
        /* last digit of the group: emit and reset */
        convert_putc(rock->next, ((st->codepoint << 6) | sextet) & 0xff);
        st->codepoint = 0;
        st->bytesleft = 0;
        return;

    default:
        return;
    }

    /* middle of a group: emit a byte, keep this digit for the next one */
    convert_putc(rock->next, out & 0xff);
    st->codepoint = sextet;
    st->bytesleft--;
}
/* Pass-through stage that drops CR and LF and forwards everything else. */
void stripnl2uni(struct convert_rock *rock, int c)
{
    if (c == '\r' || c == '\n')
        return;

    convert_putc(rock->next, c);
}
/*
 * Table-driven charset decoder stage.  Looks the low 8 bits of 'c' up in
 * the current table, emits the mapped character if it is non-zero, and
 * then selects the table for the next byte as initialtable + entry->next.
 * The error marker 0xfffd is forwarded untouched and leaves the table
 * selection unchanged.
 */
void table2uni(struct convert_rock *rock, int c)
{
    struct table_state *ts = (struct table_state *)rock->state;
    struct charmap *entry;

    /* propagate errors */
    if (c == 0xfffd) {
        convert_putc(rock->next, c);
        return;
    }

    entry = (struct charmap *)&ts->curtable[0][c & 0xff];

    if (entry->c)
        convert_putc(rock->next, entry->c);

    ts->curtable = ts->initialtable + entry->next;
}
/*
 * UTF-8 decoder stage.  Lead bytes record how many continuation bytes
 * are expected and the payload bits they carry; continuation bytes add
 * six bits each and the code point is emitted when the last one arrives.
 * Continuation bytes that arrive when none is expected are dropped.
 * Any other byte is forwarded as-is and resets the decoder.  The error
 * marker 0xfffd is forwarded untouched.
 */
void utf8_2uni(struct convert_rock *rock, int c)
{
    struct table_state *u = (struct table_state *)rock->state;

    /* propagate errors */
    if (c == 0xfffd) {
        convert_putc(rock->next, c);
        return;
    }

    /* 10xxxxxx: continuation char, handle only if expected */
    if ((c & 0xc0) == 0x80) {
        if (u->bytesleft <= 0)
            return;

        u->codepoint = (u->codepoint << 6) + (c & 0x3f);
        u->bytesleft--;
        if (u->bytesleft == 0) {
            convert_putc(rock->next, u->codepoint);
            u->codepoint = 0;
        }
        return;
    }

    /* 110xxxxx: first of a 2 char sequence */
    if ((c & 0xe0) == 0xc0) {
        u->bytesleft = 1;
        u->codepoint = c & 0x1f;
        return;
    }

    /* 1110xxxx: first of a 3 char sequence */
    if ((c & 0xf0) == 0xe0) {
        u->bytesleft = 2;
        u->codepoint = c & 0x0f;
        return;
    }

    /* 11110xxx: first of a 4 char sequence */
    if ((c & 0xf8) == 0xf0) {
        u->bytesleft = 3;
        u->codepoint = c & 0x07;
        return;
    }

    /* anything else is passed through as a plain char */
    convert_putc(rock->next, c);
    u->bytesleft = 0;
    u->codepoint = 0;
}
/*
 * UTF-7 decoder stage.
 *
 * In ASCII mode (mode 0) characters are forwarded unchanged, except '+'
 * which switches to base64 mode.  In base64 mode each base64 digit adds
 * six bits to the accumulator; whenever 16 bits are available one 16-bit
 * unit is emitted and the leftover bits are kept.  '-' ends the base64
 * run (and "+-" yields a literal '+'); any other non-base64 character
 * ends the run and is itself forwarded.
 *
 * Bytes with the top bit set are not valid UTF-7 and produce 0xfffd; an
 * incoming 0xfffd is forwarded untouched.
 *
 * NOTE(review): units are emitted one at a time, so surrogate pairs are
 * not combined here - confirm whether a later stage is expected to.
 */
void utf7_2uni(struct convert_rock *rock, int c)
{
    struct table_state *s = (struct table_state *)rock->state;

    /* propagate errors */
    if (c == 0xfffd) {
        convert_putc(rock->next, c);
        return;
    }

    /* 8-bit chars are invalid in UTF-7 */
    if (c & 0x80) {
        convert_putc(rock->next, 0xfffd);
        return;
    }

    if (s->mode) {
        /* inside a base64 encoded unicode fragment */
        if (c == '-') {
            /* '-' marks the end of a fragment.
             * special case: sequence +- creates output '+' */
            if (s->mode == 1)
                convert_putc(rock->next, '+');
            /* otherwise no output for the '-' */
            s->mode = 0;
            s->num_bits = 0;
            s->codepoint = 0;
        }
        else if (CHAR64(c) == XX) {
            /* a normal char drops us out of base64 mode: pass it on */
            convert_putc(rock->next, c);
            /* and switch back to ASCII mode */
            s->mode = 0;
            /* XXX: warn if num_bits > 4 or codepoint != 0 */
            s->num_bits = 0;
            s->codepoint = 0;
        }
        else {
            /* base64 char - we have content now, so "+-" no longer applies */
            s->mode = 2;
            /* add the 6 bits of value from this character */
            s->codepoint = (s->codepoint << 6) + CHAR64(c);
            s->num_bits += 6;
            /* if we've got a full unit's worth of bits, send it down the
             * line and keep the remainder for the next one */
            if (s->num_bits >= 16) {
                s->num_bits -= 16;
                /* a unit is a full 16 bits wide: masking with 0x7fff
                 * would clear bit 15 and corrupt U+8000..U+FFFF */
                convert_putc(rock->next,
                             (s->codepoint >> s->num_bits) & 0xffff);
                /* avoid overflow by trimming to the leftover bits */
                s->codepoint &= ((1 << s->num_bits) - 1);
            }
        }
    }
    else {
        /* regular ASCII mode */
        if (c == '+') {
            /* '+' switches to base64 mode, but no content processed yet */
            s->mode = 1;
            s->codepoint = 0;
            s->num_bits = 0;
        }
        else {
            /* regular ASCII char */
            convert_putc(rock->next, c);
        }
    }
}
/*
 * Canonicalise one unicode character into "search form" using the
 * chartables_translation* tables, honouring the CHARSET_* flags held in
 * the stage's canon_state:
 *   - 0xfffd becomes 0xff, which is illegal UTF-8 and so never matches;
 *   - characters with no translation table are forwarded unchanged;
 *   - a translation of 0 means "emit nothing";
 *   - space/CR/LF are dropped (CHARSET_SKIPSPACE) or collapsed to a
 *     single space (CHARSET_MERGESPACE);
 *   - a positive translation is a single replacement character;
 *   - a negative translation indexes a zero-terminated run in
 *     chartables_translation_multichar.
 */
void uni2searchform(struct convert_rock *rock, int c)
{
    struct canon_state *cs = (struct canon_state *)rock->state;
    unsigned char blk16, blk8;
    int xlat;
    int idx;

    /* invalid character becomes 0xff */
    if (c == 0xfffd) {
        convert_putc(rock->next, 0xff);
        return;
    }

    /* first level: bits 16-23 */
    blk16 = chartables_translation_block16[(c >> 16) & 0xff];
    if (blk16 == 255) {
        /* no translations */
        convert_putc(rock->next, c);
        return;
    }

    /* second level: bits 8-15 */
    blk8 = chartables_translation_block8[blk16][(c >> 8) & 0xff];
    if (blk8 == 255) {
        /* no translations */
        convert_putc(rock->next, c);
        return;
    }

    /* third level: the actual translation */
    xlat = chartables_translation[blk8][c & 0xff];

    /* zero length output */
    if (xlat == 0)
        return;

    if (xlat == ' ' || xlat == '\r' || xlat == '\n') {
        /* whitespace handling */
        if (cs->flags & CHARSET_SKIPSPACE)
            return;

        if (cs->flags & CHARSET_MERGESPACE) {
            if (cs->seenspace)
                return;
            cs->seenspace = 1;
            xlat = ' ';     /* one SPACE char */
        }
    }
    else {
        cs->seenspace = 0;
    }

    /* one character output */
    if (xlat > 0) {
        convert_putc(rock->next, xlat);
        return;
    }

    /* multiple characters: zero-terminated run starting at -xlat */
    for (idx = -xlat; chartables_translation_multichar[idx]; idx++) {
        int piece = chartables_translation_multichar[idx];

        /* optionally drop the combining diacritical range (0x300-0x3ff);
         * this duplicates the behaviour of Cyrus versions before 2.5 */
        if ((cs->flags & CHARSET_SKIPDIACRIT) && (piece & ~0xff) == 0x300)
            continue;

        /* note: whitespace already stripped from multichar sequences... */
        convert_putc(rock->next, piece);
    }
}
/*
 * Encode the character 'c' as UTF-8 and send the resulting bytes to the
 * next stage: one byte up to 0x7f, two up to 0x7ff, three up to 0xffff
 * and four above that.
 */
void uni2utf8(struct convert_rock *rock, int c)
{
    struct convert_rock *out = rock->next;

    if (c <= 0x7f) {
        convert_putc(out, c);
        return;
    }

    if (c <= 0x7ff) {
        convert_putc(out, 0xC0 | ((c >> 6) & 0x1f));
        convert_putc(out, 0x80 | (c & 0x3f));
        return;
    }

    if (c <= 0xffff) {
        convert_putc(out, 0xE0 | ((c >> 12) & 0x0f));
        convert_putc(out, 0x80 | ((c >> 6) & 0x3f));
        convert_putc(out, 0x80 | (c & 0x3f));
        return;
    }

    convert_putc(out, 0xF0 | ((c >> 18) & 0x07));
    convert_putc(out, 0x80 | ((c >> 12) & 0x3f));
    convert_putc(out, 0x80 | ((c >> 6) & 0x3f));
    convert_putc(out, 0x80 | (c & 0x3f));
}
/*
 * Substring matcher stage: consumes one byte of search-form text and
 * updates the set of partial matches held in the stage's search_state.
 *
 * 'starts' holds the offsets at which still-viable partial matches
 * began, packed at the front and followed by -1 entries.  For each input
 * byte every active start is checked against the corresponding pattern
 * byte; survivors are compacted to the front, a new start is recorded if
 * the byte equals the first pattern byte, and 'havematch' is set as soon
 * as any candidate reaches the end of the pattern.
 *
 * The error marker 0xfffd is matched as the byte 0xff.
 */
void byte2search(struct convert_rock *rock, int c)
{
    struct search_state *s = (struct search_state *)rock->state;
    int i, cur;
    unsigned char b;

    if (c == 0xfffd) {
        c = 0xff; /* searchable by invalid character! */
    }

    /* derive the byte AFTER the substitution above, otherwise the
     * substitution has no effect and 0xfffd would compare as 0xfd */
    b = (unsigned char)c;

    /* check our "in_progress" matches to see if they're still valid */
    for (i = 0, cur = 0; i < s->max_start; i++) {
        /* no more active offsets */
        if (s->starts[i] == -1)
            break;

        /* if we've passed one that's not ongoing, copy back */
        if (cur < i)
            s->starts[cur] = s->starts[i];

        /* check that the substring is still matching */
        if (b == s->substr[s->offset - s->starts[i]]) {
            if (s->offset - s->starts[i] == s->patlen - 1) {
                /* we're there! */
                s->havematch = 1;
            }
            else {
                /* keep this one, it's ongoing */
                cur++;
            }
        }
    }

    /* starting a new one! */
    if (b == s->substr[0]) {
        /* a one-byte pattern is complete immediately */
        if (s->patlen == 1)
            s->havematch = 1;
        else
            s->starts[cur++] = s->offset;
    }

    /* empty out any others that aren't being kept */
    while (cur < i)
        s->starts[cur++] = -1;

    /* increment the offset counter */
    s->offset++;
}
/* Terminal stage: append the low 8 bits of 'c' to the stage's struct buf. */
void byte2buffer(struct convert_rock *rock, int c)
{
    struct buf *dest = (struct buf *)rock->state;

    buf_putc(dest, c & 0xff);
}
/* convert_rock manipulation routines */
/*
 * Re-target an existing charset decoder stage at the charset numbered
 * 'charset_num'.  The stage's table_state is wiped, then the handler is
 * chosen: table2uni when the charset has a translation table, otherwise
 * utf8_2uni or utf7_2uni when the charset name contains "utf-8" or
 * "utf-7".  Any other charset terminates the process with exit(1).
 */
void table_switch(struct convert_rock *rock, int charset_num)
{
    struct table_state *ts = (struct table_state *)rock->state;

    /* wipe any current state */
    memset(ts, 0, sizeof(*ts));

    /* it's a table based lookup */
    if (chartables_charset_table[charset_num].table) {
        ts->initialtable = chartables_charset_table[charset_num].table;
        ts->curtable = ts->initialtable;
        rock->f = table2uni;
        return;
    }

    /* special case UTF-8 */
    if (strstr(chartables_charset_table[charset_num].name, "utf-8")) {
        rock->f = utf8_2uni;
        return;
    }

    /* special case UTF-7 */
    if (strstr(chartables_charset_table[charset_num].name, "utf-7")) {
        rock->f = utf7_2uni;
        return;
    }

    /* should never happen */
    exit(1);
    /* do something fatal here! */
}
/* Detach the accumulated contents of a buffer stage and return them via
 * buf_release().  NOTE: the caller owns the returned memory and must
 * free it.  This resets the buffer's state; if this function is never
 * called, buffer_free will clean up instead. */
static char *buffer_cstring(struct convert_rock *rock)
{
    return buf_release((struct buf *)rock->state);
}
/* Report whether the search stage 'rock' has seen a complete match. */
static inline int search_havematch(struct convert_rock *rock)
{
    return ((struct search_state *)rock->state)->havematch;
}
/* conversion cleanup routines */
/* Default stage destructor: free the stage's state (if any) and the
 * stage itself.  A NULL 'rock' is ignored. */
void basic_free(struct convert_rock *rock)
{
    if (!rock)
        return;

    free(rock->state);  /* free(NULL) is a no-op */
    free(rock);
}
/* Destructor for search stages: releases the 'starts' array held in the
 * search_state, then the state and the stage via basic_free(). */
void search_free(struct convert_rock *rock)
{
    struct search_state *st = rock ? (struct search_state *)rock->state : NULL;

    if (st)
        free(st->starts);   /* free(NULL) is a no-op */

    basic_free(rock);
}
/* Destructor for buffer stages: releases the buffer's contents with
 * buf_free(), then the struct buf and the stage via basic_free(). */
static void buffer_free(struct convert_rock *rock)
{
    struct buf *held = rock ? (struct buf *)rock->state : NULL;

    if (held)
        buf_free(held);

    basic_free(rock);
}
void
convert_free
(
struct
convert_rock
*
rock
)
{
struct
convert_rock
*
next
;
while
(
rock
)
{
next
=
rock
->
next
;
if
(
rock
->
cleanup
)
rock
->
cleanup
(
rock
);
else
basic_free
(
rock
);
rock
=
next
;
}
}
/* converter initialisation routines */
/* Create a quoted-printable decoder stage feeding 'next'.  A non-zero
 * 'isheader' makes the decoder turn '_' into a space. */
struct convert_rock *qp_init(int isheader, struct convert_rock *next)
{
    struct convert_rock *stage = xzmalloc(sizeof(*stage));
    struct qp_state *qp = xzmalloc(sizeof(*qp));

    qp->isheader = isheader;

    stage->f = qp2byte;
    stage->state = qp;
    stage->next = next;

    return stage;
}
struct
convert_rock
*
b64_init
(
struct
convert_rock
*
next
)
{
struct
convert_rock
*
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
rock
->
state
=
xzmalloc
(
sizeof
(
struct
b64_state
));
rock
->
f
=
b64_2byte
;
rock
->
next
=
next
;
return
rock
;
}
/* Create a stateless stage that strips CR/LF before feeding 'next'. */
struct convert_rock *stripnl_init(struct convert_rock *next)
{
    struct convert_rock *stage = xzmalloc(sizeof(*stage));

    stage->next = next;
    stage->f = stripnl2uni;

    return stage;
}
/* Create a search-form canonicaliser stage feeding 'next'; 'flags' is
 * the set of CHARSET_* options it should apply. */
struct convert_rock *canon_init(int flags, struct convert_rock *next)
{
    struct convert_rock *stage = xzmalloc(sizeof(*stage));
    struct canon_state *cs = xzmalloc(sizeof(*cs));

    cs->flags = flags;

    stage->state = cs;
    stage->f = uni2searchform;
    stage->next = next;

    return stage;
}
/* Create a stateless unicode-to-UTF-8 encoder stage feeding 'next'. */
struct convert_rock *uni_init(struct convert_rock *next)
{
    struct convert_rock *stage = xzmalloc(sizeof(*stage));

    stage->next = next;
    stage->f = uni2utf8;

    return stage;
}
/* Create a charset decoder stage for charset 'charset_num' feeding
 * 'next'.  The handler is selected by table_switch(). */
struct convert_rock *table_init(int charset_num, struct convert_rock *next)
{
    struct convert_rock *stage = xzmalloc(sizeof(*stage));

    stage->state = xzmalloc(sizeof(struct table_state));
    stage->next = next;

    /* picks the handler and the initial table */
    table_switch(stage, charset_num);

    return stage;
}
/*
 * Create a terminal search stage that looks for 'substr', using the
 * sizes recorded in the compiled pattern 'pat'.
 *
 * The stage keeps a pointer to 'substr' rather than a copy, so the
 * string must outlive the stage.  The stage is released by search_free().
 */
struct convert_rock *search_init(const char *substr, comp_pat *pat)
{
    struct convert_rock *rock = xzmalloc(sizeof(struct convert_rock));
    struct search_state *s = xzmalloc(sizeof(struct search_state));
    struct comp_pat_s *p = (struct comp_pat_s *)pat;
    int i;

    /* copy in tracking vars */
    s->max_start = p->max_start;
    s->patlen = p->patlen;
    s->substr = (unsigned char *)substr;

    /* allocate tracking space and initialise to "no match".
     * Size the array from its element type: 'starts' holds ssize_t
     * values, not size_t. */
    s->starts = xmalloc(s->max_start * sizeof(*s->starts));
    for (i = 0; i < s->max_start; i++) {
        s->starts[i] = -1;
    }

    /* set up the rock */
    rock->f = byte2search;
    rock->cleanup = search_free;
    rock->state = (void *)s;

    return rock;
}
/* Create a terminal stage that collects bytes into a freshly allocated
 * struct buf.  The stage is released by buffer_free(). */
static struct convert_rock *buffer_init(void)
{
    struct convert_rock *stage = xzmalloc(sizeof(*stage));

    stage->state = xzmalloc(sizeof(struct buf));
    stage->f = byte2buffer;
    stage->cleanup = buffer_free;

    return stage;
}
/* API */
/*
 * Look up the character set called 'name' (case-insensitively).  An
 * alias is first replaced by its canonical name.  Returns the character
 * set number, or -1 if there is no matching character set.
 */
int charset_lookupname(const char *name)
{
    int idx;

    /* translate to canonical name */
    idx = 0;
    while (charset_aliases[idx].name) {
        if (strcasecmp(name, charset_aliases[idx].name) == 0) {
            name = charset_aliases[idx].canon_name;
            break;
        }
        idx++;
    }

    /* look up canonical name */
    idx = 0;
    while (idx < chartables_num_charsets) {
        if (strcasecmp(name, chartables_charset_table[idx].name) == 0)
            return idx;
        idx++;
    }

    return -1;
}
/* Look up a charset name given as the first 'len' bytes of 'buf' (not
 * NUL-terminated).  Returns what charset_lookupname() returns. */
static int lookup_buf(const char *buf, int len)
{
    char *copy = xstrndup(buf, len);
    int found = charset_lookupname(copy);

    free(copy);
    return found;
}
/*
 * Convert the string 's', which is in the character set numbered
 * 'charset', into canonical searching form (UTF-8), applying 'flags'.
 * Returns a newly allocated string which must be free()d by the caller.
 * Returns 0 when 's' is NULL, and a copy of "X" when 'charset' is out
 * of range.
 */
char *charset_convert(const char *s, int charset, int flags)
{
    struct convert_rock *chain, *sink;
    char *converted;

    if (s == NULL)
        return 0;

    if (charset < 0 || charset >= chartables_num_charsets)
        return xstrdup("X");

    /* build the pipeline back to front:
     * charset -> canonical form -> utf-8 -> buffer */
    sink = buffer_init();
    chain = table_init(charset, canon_init(flags, uni_init(sink)));

    /* do the conversion */
    convert_cat(chain, s);

    /* extract the result before tearing the pipeline down */
    converted = buffer_cstring(sink);
    convert_free(chain);

    return converted;
}
/* Convenience wrapper: convert the UTF-8 string 's' into canonical
 * searching form via charset_convert(), looking "utf-8" up by name. */
char *charset_utf8_to_searchform(const char *s, int flags)
{
    return charset_convert(s, charset_lookupname("utf-8"), flags);
}
/*
 * Convert 'len' bytes at 'msg_base' from the given charset and content
 * transfer encoding into UTF-8.  Returns a newly allocated string, or 0
 * when 'charset' is out of range or 'encoding' is not recognised.
 * Zero-length input yields a newly allocated empty string.
 */
char *charset_to_utf8(const char *msg_base, size_t len, int charset, int encoding)
{
    struct convert_rock *chain, *sink;
    char *utf8;

    /* validate the character set number */
    if (charset < 0 || charset >= chartables_num_charsets)
        return 0;

    /* trivial case */
    if (len == 0)
        return xstrdup("");

    /* charset -> utf-8 -> buffer */
    sink = buffer_init();
    chain = table_init(charset, uni_init(sink));

    /* put a transfer-encoding decoder in front if one is needed */
    switch (encoding) {
    case ENCODING_NONE:
        break;

    case ENCODING_QP:
        chain = qp_init(0, chain);
        break;

    case ENCODING_BASE64:
        /* XXX have to have nl-mapping base64 in order to
         * properly count \n as 2 raw characters
         */
        chain = b64_init(chain);
        break;

    default:
        /* Don't know encoding--nothing can match */
        convert_free(chain);
        return 0;
    }

    convert_catn(chain, msg_base, len);

    utf8 = buffer_cstring(sink);
    convert_free(chain);

    return utf8;
}
void
mimeheader_cat
(
struct
convert_rock
*
target
,
const
char
*
s
)
{
struct
convert_rock
*
input
,
*
stripnl
;
int
eatspace
=
0
;
const
char
*
start
,
*
endcharset
,
*
encoding
,
*
end
;
int
len
;
int
charset
;
const
char
*
p
;
if
(
!
s
)
return
;
/* set up the conversion path */
input
=
table_init
(
0
,
target
);
/* note: we assume the caller of this function has already
* determined that all newlines are followed by whitespace */
stripnl
=
stripnl_init
(
input
);
start
=
s
;
while
((
start
=
(
const
char
*
)
strchr
(
start
,
'='
))
!=
0
)
{
start
++
;
if
(
*
start
!=
'?'
)
continue
;
encoding
=
(
const
char
*
)
strchr
(
start
+
1
,
'?'
);
if
(
!
encoding
)
continue
;
endcharset
=
(
const
char
*
)
strchr
(
start
+
1
,
'*'
);
/* Language code delimiter */
if
(
!
endcharset
||
endcharset
>
encoding
)
endcharset
=
encoding
;
if
(
encoding
[
1
]
!=
'b'
&&
encoding
[
1
]
!=
'B'
&&
encoding
[
1
]
!=
'q'
&&
encoding
[
1
]
!=
'Q'
)
continue
;
if
(
encoding
[
2
]
!=
'?'
)
continue
;
end
=
(
const
char
*
)
strchr
(
encoding
+
3
,
'?'
);
if
(
!
end
||
end
[
1
]
!=
'='
)
continue
;
/*
* We have recognized a valid 1522-word.
* Copy over leading text, unless it consists entirely of
* whitespace and is between two 1522-words.
*/
if
(
eatspace
)
{
for
(
p
=
s
;
p
<
(
start
-1
)
&&
Uisspace
(
*
p
);
p
++
);
if
(
p
<
(
start
-1
))
eatspace
=
0
;
}
if
(
!
eatspace
)
{
len
=
start
-
s
-
1
;
table_switch
(
input
,
0
);
/* US_ASCII */
convert_catn
(
stripnl
,
s
,
len
);
}
/*
* Get the 1522-word's character set
*/
start
++
;
charset
=
lookup_buf
(
start
,
endcharset
-
start
);
if
(
charset
<
0
)
{
/* Unrecognized charset, nothing will match here */
convert_putc
(
input
,
0xfffd
);
/* unknown character */
}
else
{
struct
convert_rock
*
extract
;
table_switch
(
input
,
charset
);
/* choose decoder */
if
(
encoding
[
1
]
==
'q'
||
encoding
[
1
]
==
'Q'
)
{
extract
=
qp_init
(
1
,
input
);
}
else
{
extract
=
b64_init
(
input
);
}
/* convert */
p
=
encoding
+
3
;
convert_catn
(
extract
,
p
,
end
-
p
);
/* clean up */
basic_free
(
extract
);
}
/* Prepare for the next iteration */
s
=
start
=
end
+
2
;
eatspace
=
1
;
}
/* Copy over the tail part of the input string */
if
(
*
s
)
{
table_switch
(
input
,
0
);
/* US_ASCII */
convert_cat
(
stripnl
,
s
);
}
/* just free these ones, the rest can be cleaned up by the sender */
basic_free
(
stripnl
);
basic_free
(
input
);
}
/*
 * Decode MIME strings (per RFC 2047) in 's'.  Returns a newly allocated
 * string, containing 's' in canonical searching form, which must be
 * free()d by the caller.  Returns NULL when 's' is NULL.
 */
char *charset_decode_mimeheader(const char *s, int flags)
{
    struct convert_rock *sink, *chain;
    char *decoded;

    if (s == NULL)
        return NULL;

    /* canonical form -> utf-8 -> buffer */
    sink = buffer_init();
    chain = canon_init(flags, uni_init(sink));

    mimeheader_cat(chain, s);

    decoded = buffer_cstring(sink);
    convert_free(chain);

    return decoded;
}
/*
 * Decode MIME strings (per RFC 2047) in 's' without canonicalising.
 * Returns a newly allocated string, containing the decoded string as
 * UTF-8, which must be free()d by the caller.  Returns NULL when 's'
 * is NULL.
 */
char *charset_parse_mimeheader(const char *s)
{
    struct convert_rock *sink, *chain;
    char *decoded;

    if (s == NULL)
        return NULL;

    /* utf-8 -> buffer */
    sink = buffer_init();
    chain = uni_init(sink);

    mimeheader_cat(chain, s);

    decoded = buffer_cstring(sink);
    convert_free(chain);

    return decoded;
}
/*
 * Search for 'substr' (with compiled pattern 'pat') in the MIME header
 * text 's', decoding RFC 2047 words and canonicalising with 'flags'
 * along the way.  Returns non-zero if a match was found.
 */
int charset_search_mimeheader(const char *substr, comp_pat *pat,
                              const char *s, int flags)
{
    struct convert_rock *matcher = search_init(substr, pat);
    struct convert_rock *chain = canon_init(flags, uni_init(matcher));
    int found;

    mimeheader_cat(chain, s);

    /* read the result before the pipeline is destroyed */
    found = search_havematch(matcher);
    convert_free(chain);

    return found;
}
/* Compile a search pattern for later comparison.  We just count how
 * long the string is, and how many times its first character occurs.
 * Later optimisation could reduce max_start by deeper analysis of the
 * possible paths through the string, but this is a good absolute
 * maximum, and it just means a few more bytes get allocated...
 * The result is released with charset_freepat(). */
comp_pat *charset_compilepat(const char *s)
{
    struct comp_pat_s *compiled = xzmalloc(sizeof(*compiled));
    const char *cursor;

    /* count occurrences of the first character, and the total length */
    for (cursor = s; *cursor != '\0'; cursor++) {
        if (*cursor == *s)
            compiled->max_start++;
        compiled->patlen++;
    }

    return (comp_pat *)compiled;
}
/*
 * Release the compiled pattern 'pat' (as returned by
 * charset_compilepat).
 */
void charset_freepat(comp_pat *pat)
{
    struct comp_pat_s *compiled = (struct comp_pat_s *)pat;

    free(compiled);
}
/*
 * Search for the string 'substr', with compiled pattern 'pat', in the
 * 'len' bytes at 's'.  Returns non-zero on a match; an empty 'substr'
 * always matches.
 *
 * Feeds the search stage directly and assumes 's' is already in search
 * normal form (i.e. from a cache file), decoded here as UTF-8.
 */
int charset_searchstring(const char *substr, comp_pat *pat,
                         const char *s, size_t len, int flags)
{
    struct convert_rock *matcher;
    struct convert_rock *chain;
    int utf8 = charset_lookupname("utf-8");
    int found;
    size_t pos;

    /* zero length string always matches */
    if (substr[0] == '\0')
        return 1;

    /* utf-8 decode -> canonical form -> utf-8 -> search */
    matcher = search_init(substr, pat);
    chain = table_init(utf8, canon_init(flags, uni_init(matcher)));

    /* feed the handler one byte at a time, stopping at the first match */
    for (pos = 0; pos < len; pos++) {
        convert_putc(chain, (unsigned char)s[pos]);
        if (search_havematch(matcher))
            break;
    }

    /* copy the value before the pipeline is destroyed */
    found = search_havematch(matcher);
    convert_free(chain);

    return found;
}
/*
 * Search for the string 'substr' in the next 'len' bytes of
 * 'msg_base'.
 * 'charset' and 'encoding' specify the character set and
 * content transfer encoding of the data, respectively.
 * Returns nonzero iff the string was found.  An out-of-range 'charset'
 * or an unrecognised 'encoding' never matches; an empty 'substr'
 * always matches.
 */
int charset_searchfile(const char *substr, comp_pat *pat,
                       const char *msg_base, size_t len,
                       int charset, int encoding, int flags)
{
    struct convert_rock *input, *tosearch;
    size_t i;
    int res;

    /* Initialize character set mapping */
    if (charset < 0 || charset >= chartables_num_charsets)
        return 0;

    /* check for trivial search */
    if (strlen(substr) == 0)
        return 1;

    /* set up the conversion path */
    tosearch = search_init(substr, pat);
    input = uni_init(tosearch);
    input = canon_init(flags, input);
    input = table_init(charset, input);

    /* choose encoding extraction if needed */
    switch (encoding) {
    case ENCODING_NONE:
        break;

    case ENCODING_QP:
        input = qp_init(0, input);
        break;

    case ENCODING_BASE64:
        input = b64_init(input);
        /* XXX have to have nl-mapping base64 in order to
         * properly count \n as 2 raw characters
         */
        break;

    default:
        /* Don't know encoding--nothing can match */
        convert_free(input);
        return 0;
    }

    /* implement the loop here so we can check on the search each time */
    for (i = 0; i < len; i++) {
        /* feed the byte as 0..255: plain 'char' may be signed, and a
         * negative value would reach the converters for bytes >= 0x80,
         * unlike convert_catn() and charset_searchstring() */
        convert_putc(input, (unsigned char)msg_base[i]);
        if (search_havematch(tosearch))
            break;
    }

    res = search_havematch(tosearch); /* copy before we free it */

    convert_free(input);

    return res;
}
/*
 * Convert 'len' bytes at 'msg_base' (in the given 'charset' and content
 * transfer 'encoding') into canonical searching form and hand the
 * result to 'receiver' in blocks, tagged with 'uid', 'rpart' and
 * 'rcmd'; 'rock' is passed through to the receiver untouched.
 *
 * Returns 1 on success, 0 when 'charset' is out of range or 'encoding'
 * is not recognised (in which case the receiver is never called).
 *
 * This is based on charset_searchfile above.
 */
int charset_extractitem(index_search_text_receiver_t receiver,
                        void *rock, int uid,
                        const char *msg_base, size_t len,
                        int charset, int encoding, int flags,
                        int rpart, int rcmd)
{
    struct convert_rock *input, *tobuffer;
    struct buf *out;
    size_t i;

    /* Initialize character set mapping */
    if (charset < 0 || charset >= chartables_num_charsets)
        return 0;

    /* set up the conversion path */
    tobuffer = buffer_init();
    input = uni_init(tobuffer);
    input = canon_init(flags, input);
    input = table_init(charset, input);

    switch (encoding) {
    case ENCODING_NONE:
        break;

    case ENCODING_QP:
        input = qp_init(0, input);
        break;

    case ENCODING_BASE64:
        input = b64_init(input);
        /* XXX have to have nl-mapping base64 in order to
         * properly count \n as 2 raw characters
         */
        break;

    default:
        /* Don't know encoding--nothing can match */
        convert_free(input);
        return 0;
    }

    /* point to the buffer for easy block sending */
    out = (struct buf *)tobuffer->state;

    for (i = 0; i < len; i++) {
        /* feed the byte as 0..255: plain 'char' may be signed, and a
         * negative value would reach the converters for bytes >= 0x80,
         * unlike convert_catn() */
        convert_putc(input, (unsigned char)msg_base[i]);

        /* process a block of output every so often */
        if (buf_len(out) > 4096) {
            receiver(uid, rpart, rcmd, out->s, out->len, rock);
            buf_reset(out);
        }
    }

    /* flush whatever is left */
    if (out->len) {
        receiver(uid, rpart, rcmd, out->s, out->len, rock);
    }

    convert_free(input);

    return 1;
}
/* Extract a message body for indexing: charset_extractitem() with the
 * part fixed to SEARCHINDEX_PART_BODY and the command fixed to
 * SEARCHINDEX_CMD_APPENDPART. */
int charset_extractfile(index_search_text_receiver_t receiver,
                        void *rock, int uid,
                        const char *msg_base, size_t len,
                        int charset, int encoding, int flags)
{
    const int part = SEARCHINDEX_PART_BODY;
    const int cmd = SEARCHINDEX_CMD_APPENDPART;

    return charset_extractitem(receiver, rock, uid, msg_base, len,
                               charset, encoding, flags, part, cmd);
}
/*
 * Decode the MIME body part (per RFC 2045) of @len bytes located at
 * @msg_base having the content transfer @encoding.  Returns a pointer
 * to the decoded bytes and stores their count in *@outlen.
 *
 * For ENCODING_NONE the input itself is returned.  For quoted-printable
 * and base64 a newly allocated buffer is written to *@decbuf, which the
 * caller should free() if it is non-NULL.  If decoding yields no buffer
 * (e.g. blank input) the input is returned instead, with *@outlen set
 * to @len.  An unrecognised encoding returns NULL.
 *
 * Because the return value may point to either @msg_base or *@decbuf,
 * *@decbuf must not be freed until the return value has been used.
 */
const char *charset_decode_mimebody(const char *msg_base, size_t len,
                                    int encoding, char **decbuf,
                                    size_t *outlen)
{
    struct convert_rock *decoder, *sink;
    struct buf *collected;

    *decbuf = NULL;
    *outlen = 0;

    /* nothing to decode */
    if (encoding == ENCODING_NONE) {
        *outlen = len;
        return msg_base;
    }

    /* Don't know encoding--nothing can match */
    if (encoding != ENCODING_QP && encoding != ENCODING_BASE64)
        return NULL;

    sink = buffer_init();
    decoder = (encoding == ENCODING_QP) ? qp_init(0, sink) : b64_init(sink);

    convert_catn(decoder, msg_base, len);

    /* extract the string from the buffer; read the length first,
     * since releasing the buffer resets it */
    collected = (struct buf *)sink->state;
    *outlen = collected->len;
    *decbuf = buf_release(collected);

    convert_free(decoder);

    if (*decbuf == NULL) {
        /* didn't get a result - maybe blank input, don't return NULL */
        *outlen = len;
        return msg_base;
    }

    return *decbuf;
}
/*
 * Base64 encode the MIME body part (per RFC 2045) of 'len' bytes located at
 * 'msg_base'.  Encodes into 'retval', which must be large enough to
 * accommodate the encoded data (it is NOT NUL-terminated).  Returns the
 * number of encoded bytes in 'outlen' and the number of encoded lines in
 * 'outlines'; either may be NULL.  Every line, including the last, ends
 * with CRLF and holds at most BASE64_MAX_LINE_LEN encoded characters.
 *
 * May be called with 'msg_base' as NULL to get the number of encoded
 * bytes for allocating 'retval' of the proper size.
 *
 * Returns 'retval', or NULL when 'msg_base' is NULL or 'len' is 0.  In
 * both of those cases nothing is written to 'retval'.
 */
#define BASE64_MAX_LINE_LEN 72

static const char base_64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

char *charset_encode_mimebody(const char *msg_base, size_t len,
                              char *retval, size_t *outlen, int *outlines)
{
    const unsigned char *s;
    unsigned char s0, s1, s2 = 0;
    char *d;
    size_t b64_len;     /* size_t: an int would overflow for large bodies */
    int b64_lines, cnt;

    /* 4 output chars per (possibly partial) 3-byte group */
    b64_len = ((len + 2) / 3) * 4;
    b64_lines = (int)((b64_len + BASE64_MAX_LINE_LEN - 1) / BASE64_MAX_LINE_LEN);

    /* account for CRLF added to each line */
    b64_len += 2 * (size_t)b64_lines;

    if (outlen) *outlen = b64_len;
    if (outlines) *outlines = b64_lines;

    if (!msg_base) return NULL;

    /* Empty input encodes to zero bytes.  Bail out before the trailing
     * CRLF is written: the caller was told the output needs 0 bytes, so
     * writing two would overrun the buffer. */
    if (!len) return NULL;

    for (s = (const unsigned char *)msg_base, d = retval, cnt = 0;
         len; s += 3, d += 4, cnt += 4) {
        /* process tuplets */
        if (cnt == BASE64_MAX_LINE_LEN) {
            /* reset line len count, add CRLF */
            cnt = 0;
            *d++ = '\r';
            *d++ = '\n';
        }

        /* 'len' counts the input bytes not yet consumed; missing bytes
         * of a final partial group are treated as zero */
        s0 = s[0];
        s1 = --len ? s[1] : 0;

        /* byte 1: high 6 bits (1) */
        d[0] = base_64[s0 >> 2];
        /* byte 2: low 2 bits (1), high 4 bits (2) */
        d[1] = base_64[((s0 & 0x3) << 4) | ((s1 & 0xf0) >> 4)];

        if (len) {
            s2 = --len ? s[2] : 0;
            /* byte 3: low 4 bits (2), high 2 bits (3) */
            d[2] = base_64[((s1 & 0xf) << 2) | ((s2 & 0xc0) >> 6)];
        }
        else {
            /* byte 3: pad */
            d[2] = '=';
        }

        if (len) {
            --len;
            /* byte 4: low 6 bits (3) */
            d[3] = base_64[s2 & 0x3f];
        }
        else {
            /* byte 4: pad */
            d[3] = '=';
        }
    }

    /* add final CRLF */
    *d++ = '\r';
    *d++ = '\n';

    return retval;
}
File Metadata
Details
Attached
Mime Type
text/x-c
Expires
Fri, Apr 24, 10:19 AM (6 h, 51 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
18721896
Default Alt Text
charset.c (33 KB)
Attached To
Mode
R111 cyrus-imapd
Attached
Detach File
Event Timeline