Page Menu
Home
Phorge
Search
Configure Global Search
Log In
Files
F120827270
charset.c
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Authored By
Unknown
Size
107 KB
Referenced Files
None
Subscribers
None
charset.c
View Options
/* charset.c -- International character set support
*
* Copyright (c) 1994-2008 Carnegie Mellon University. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The name "Carnegie Mellon University" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For permission or any legal
* details, please contact
* Carnegie Mellon University
* Center for Technology Transfer and Enterprise Creation
* 4615 Forbes Avenue
* Suite 302
* Pittsburgh, PA 15213
* (412) 268-7393, fax: (412) 268-7395
* innovation@andrew.cmu.edu
*
* 4. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by Computing Services
* at Carnegie Mellon University (http://www.cmu.edu/computing/)."
*
* CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include
<config.h>
#include
<stdlib.h>
#include
<string.h>
#include
"unicode/ucnv.h"
#include
"assert.h"
#include
"charset.h"
#include
"xmalloc.h"
#include
"chartable.h"
#include
"hash.h"
#include
"htmlchar.h"
#include
"util.h"
#include
<unicode/ustring.h>
#include
<unicode/unorm2.h>
#include
<unicode/utf8.h>
#define U_REPLACEMENT 0xfffd
#define unicode_isvalid(c) \
(!((c >= 0xd800 && c <= 0xdfff) || ((unsigned)c > 0x10ffff)))
char
QPSAFECHAR
[
256
]
=
{
/* control chars are unsafe */
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
/* http://en.wikipedia.org/wiki/Quoted-printable */
/* All printable ASCII characters (decimal values between 33 and 126) */
/* may be represented by themselves, except "=" (decimal 61). */
0
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
0
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
0
,
/* all high bits are unsafe */
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
};
struct
qp_state
{
int
isheader
;
int
len
;
unsigned
char
buf
[
1000
];
};
struct
b64_state
{
int
bytesleft
;
int
codepoint
;
};
struct
unfold_state
{
int
state
;
int
skipws
;
};
struct
canon_state
{
int
flags
;
int
seenspace
;
};
struct
comp_pat_s
{
int
max_start
;
size_t
patlen
;
};
struct
search_state
{
ssize_t
*
starts
;
int
max_start
;
int
havematch
;
unsigned
char
*
substr
;
size_t
patlen
;
size_t
offset
;
};
enum
html_state
{
HDATA
,
HTAGOPEN
,
HENDTAGOPEN
,
HTAGNAME
,
HSCTAG
,
HTAGPARAMS
,
HCHARACTER
,
HCHARACTER2
,
HCHARACTERHASH
,
HCHARACTERHEX
,
HCHARACTERDEC
,
HSCRIPTDATA
,
HSCRIPTLT
,
HSTYLEDATA
,
HSTYLELT
,
HBOGUSCOMM
,
HMUDECOPEN
,
HCOMMSTART
,
HCOMMSTARTDASH
,
HCOMM
,
HCOMMENDDASH
,
HCOMMEND
,
HCOMMENDBANG
};
struct
striphtml_state
{
struct
buf
name
;
#define HBEGIN (1<<0)
#define HEND (1<<1)
unsigned
int
ends
;
/* state stack */
int
depth
;
enum
html_state
stack
[
2
];
};
#define CHARSET_ICUBUF_BUFFER_SIZE 4096
struct
charset_charset
{
/* An open ICU converter for ICU backed converters. Or NULL. */
UConverter
*
conv
;
/* Cyrus-canonical charset name for this converter.
* Might differ from ICU name.*/
char
*
canon_name
;
/* Alias name of as provided by caller in charset_lookupname */
char
*
alias_name
;
/* The numeric charset identifier for table converters. Or -1 */
int
num
;
/* Table converter backend state */
const
struct
charmap
(
*
curtable
)[
256
];
const
struct
charmap
(
*
initialtable
)[
256
];
int
bytesleft
;
int
codepoint
;
int
mode
;
int
num_bits
;
/* ICU converter backend state */
short
flush
;
/* set if conv should be flushed */
char
*
buf
;
/* Target and source cache */
size_t
buf_size
;
char
*
tgt_base
;
char
*
tgt_top
;
char
*
tgt_next
;
char
*
src_base
;
char
*
src_top
;
char
*
src_next
;
};
struct
convert_rock
;
static
void
icu_reset
(
struct
convert_rock
*
rock
,
int
to_uni
);
static
void
icu_flush
(
struct
convert_rock
*
rock
);
static
void
icu_cleanup
(
struct
convert_rock
*
rock
,
int
is_free
);
static
void
table_reset
(
struct
convert_rock
*
rock
,
int
to_uni
);
static
void
table_cleanup
(
struct
convert_rock
*
rock
,
int
is_free
);
typedef
void
convertproc_t
(
struct
convert_rock
*
rock
,
uint32_t
c
);
typedef
void
cleanupconvert_t
(
struct
convert_rock
*
rock
,
int
is_free
);
typedef
void
flushproc_t
(
struct
convert_rock
*
rock
);
struct
convert_rock
{
convertproc_t
*
f
;
cleanupconvert_t
*
cleanup
;
flushproc_t
*
flush
;
struct
convert_rock
*
next
;
void
*
state
;
int
dont_free_state
;
/* flag for basic_free */
};
#define GROWSIZE 100
int
charset_debug
;
static
const
char
*
convert_name
(
struct
convert_rock
*
rock
);
#define XX 127
/*
* Table for decoding hexadecimal in quoted-printable
*/
static
const
unsigned
char
index_hex
[
256
]
=
{
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
10
,
11
,
12
,
13
,
14
,
15
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
10
,
11
,
12
,
13
,
14
,
15
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
};
#define HEXCHAR(c) (index_hex[(unsigned char)(c)])
/*
* Table for decoding base64
*/
static
const
char
index_64
[
256
]
=
{
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
62
,
XX
,
XX
,
XX
,
63
,
52
,
53
,
54
,
55
,
56
,
57
,
58
,
59
,
60
,
61
,
XX
,
XX
,
XX
,
64
,
XX
,
XX
,
XX
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
26
,
27
,
28
,
29
,
30
,
31
,
32
,
33
,
34
,
35
,
36
,
37
,
38
,
39
,
40
,
41
,
42
,
43
,
44
,
45
,
46
,
47
,
48
,
49
,
50
,
51
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
XX
,
};
#define CHAR64(c) (index_64[(unsigned char)(c)])
EXPORTED
int
encoding_lookupname
(
const
char
*
s
)
{
if
(
!
s
)
return
ENCODING_NONE
;
switch
(
s
[
0
])
{
case
'7'
:
if
(
!
strcasecmp
(
s
,
"7BIT"
))
return
ENCODING_NONE
;
// non-standard stuff seen in the wild
if
(
!
strcasecmp
(
s
,
"7-BIT"
))
return
ENCODING_NONE
;
break
;
case
'8'
:
if
(
!
strcasecmp
(
s
,
"8BIT"
))
return
ENCODING_NONE
;
// non-standard stuff seen in the wild
if
(
!
strcasecmp
(
s
,
"8-BIT"
))
return
ENCODING_NONE
;
break
;
case
'B'
:
case
'b'
:
if
(
!
strcasecmp
(
s
,
"BASE64"
))
return
ENCODING_BASE64
;
if
(
!
strcasecmp
(
s
,
"BINARY"
))
return
ENCODING_NONE
;
break
;
case
'N'
:
if
(
!
strcasecmp
(
s
,
"NONE"
))
return
ENCODING_NONE
;
break
;
case
'Q'
:
case
'q'
:
if
(
!
strcasecmp
(
s
,
"QUOTED-PRINTABLE"
))
return
ENCODING_QP
;
break
;
case
'u'
:
case
'U'
:
// this is rubbish, but it's been seen in the wild
if
(
!
strcasecmp
(
s
,
"UTF-8"
))
return
ENCODING_NONE
;
if
(
!
strcasecmp
(
s
,
"UTF8"
))
return
ENCODING_NONE
;
break
;
}
return
ENCODING_UNKNOWN
;
}
EXPORTED
const
char
*
encoding_name
(
int
encoding
)
{
switch
(
encoding
)
{
case
ENCODING_NONE
:
return
"NONE"
;
case
ENCODING_QP
:
return
"QUOTED-PRINTABLE"
;
case
ENCODING_BASE64
:
return
"BASE64"
;
case
ENCODING_UNKNOWN
:
return
"UNKNOWN"
;
default
:
return
"WTF"
;
}
}
static
void
convert_flush
(
struct
convert_rock
*
rock
)
{
while
(
rock
)
{
if
(
rock
->
flush
)
rock
->
flush
(
rock
);
rock
=
rock
->
next
;
}
}
static
inline
void
convert_putc
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
if
(
charset_debug
)
{
if
(
c
<
0xff
)
fprintf
(
stderr
,
"%s(0x%x = '%c')
\n
"
,
convert_name
(
rock
),
c
,
c
);
else
fprintf
(
stderr
,
"%s(0x%x)
\
n
", convert_name(rock), c)
;
}
rock
->
f
(
rock
,
c
);
}
static
void
convert_cat
(
struct
convert_rock
*
rock
,
const
char
*
s
)
{
while
(
*
s
)
{
convert_putc
(
rock
,
(
unsigned
char
)
*
s
);
s
++
;
}
convert_flush
(
rock
);
}
static
void
convert_catn
(
struct
convert_rock
*
rock
,
const
char
*
s
,
size_t
len
)
{
while
(
len
--
>
0
)
{
convert_putc
(
rock
,
(
unsigned
char
)
*
s
);
s
++
;
}
convert_flush
(
rock
);
}
/* convertproc_t conversion functions */
static
void
qp_flushline
(
struct
convert_rock
*
rock
,
int
endline
)
{
struct
qp_state
*
s
=
(
struct
qp_state
*
)
rock
->
state
;
int
i
;
/* strip trailing whitespace: RFC 2405 transport-padding */
while
(
s
->
len
&&
(
s
->
buf
[
s
->
len
-1
]
==
' '
||
s
->
buf
[
s
->
len
-1
]
==
'\t'
))
s
->
len
--
;
for
(
i
=
0
;
i
<
s
->
len
;
i
++
)
{
switch
(
s
->
buf
[
i
])
{
case
'='
:
if
(
i
+
1
>=
s
->
len
)
{
/* soft linebreak */
endline
=
0
;
break
;
}
if
(
i
+
2
<
s
->
len
)
{
int
val1
=
HEXCHAR
(
s
->
buf
[
i
+
1
]);
int
val2
=
HEXCHAR
(
s
->
buf
[
i
+
2
]);
if
(
val1
!=
XX
&&
val2
!=
XX
)
{
convert_putc
(
rock
->
next
,
(
val1
<<
4
)
+
val2
);
i
+=
2
;
break
;
}
}
/* otherwise too close to the end or invalid, just eject
* a literal '=' and keep going */
convert_putc
(
rock
->
next
,
'='
);
break
;
case
'_'
:
/* underscores are space in headers */
convert_putc
(
rock
->
next
,
s
->
isheader
?
' '
:
'_'
);
break
;
default
:
convert_putc
(
rock
->
next
,
s
->
buf
[
i
]);
break
;
}
}
if
(
endline
)
{
convert_putc
(
rock
->
next
,
'\r'
);
convert_putc
(
rock
->
next
,
'\n'
);
}
s
->
len
=
0
;
}
static
void
qp_flush
(
struct
convert_rock
*
rock
)
{
qp_flushline
(
rock
,
0
);
}
static
void
qp2byte
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
qp_state
*
s
=
(
struct
qp_state
*
)
rock
->
state
;
assert
(
c
==
U_REPLACEMENT
||
(
unsigned
)
c
<=
0xff
);
switch
(
c
)
{
case
U_REPLACEMENT
:
/* just skip invalid characters */
break
;
case
'\r'
:
// XXX - handle \r embedded in lines?
break
;
case
'\n'
:
qp_flushline
(
rock
,
1
);
break
;
default
:
s
->
buf
[
s
->
len
++
]
=
c
;
/* really overlength line? just flush now */
if
(
s
->
len
>
998
)
qp_flushline
(
rock
,
0
);
break
;
}
}
static
void
b64_2byte
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
b64_state
*
s
=
(
struct
b64_state
*
)
rock
->
state
;
char
b
=
CHAR64
(
c
);
/* could just be whitespace, ignore it */
if
(
b
==
XX
)
return
;
/* the padding character, reset state */
if
(
b
==
64
)
{
s
->
codepoint
=
0
;
s
->
bytesleft
=
0
;
return
;
}
switch
(
s
->
bytesleft
)
{
case
0
:
s
->
codepoint
=
b
;
s
->
bytesleft
=
3
;
break
;
case
3
:
convert_putc
(
rock
->
next
,
((
s
->
codepoint
<<
2
)
|
(
b
>>
4
))
&
0xff
);
s
->
codepoint
=
b
;
s
->
bytesleft
=
2
;
break
;
case
2
:
convert_putc
(
rock
->
next
,
((
s
->
codepoint
<<
4
)
|
(
b
>>
2
))
&
0xff
);
s
->
codepoint
=
b
;
s
->
bytesleft
=
1
;
break
;
case
1
:
convert_putc
(
rock
->
next
,
((
s
->
codepoint
<<
6
)
|
b
)
&
0xff
);
s
->
codepoint
=
0
;
s
->
bytesleft
=
0
;
}
}
/*
* This filter unfolds folded RFC 2822 header field lines, i.e. it strips
* a CRLF pair only if the first character after the CRLF is LWS, and
* leaves other CRLF or lone CR or LF alone. In particular the CRLFs
* which *separate* different header fields are preserved. This allows
* the 'keep' and 'merge' whitespace options to behave as expected when
* the search engine is term-based, i.e. uses whitespace and punctuation
* to find indexing terms.
*/
static
void
unfold2uni
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
unfold_state
*
s
=
(
struct
unfold_state
*
)
rock
->
state
;
switch
(
s
->
state
)
{
case
0
:
if
(
c
==
'\r'
)
s
->
state
=
1
;
else
convert_putc
(
rock
->
next
,
c
);
break
;
case
1
:
if
(
c
==
'\n'
)
s
->
state
=
2
;
else
{
convert_putc
(
rock
->
next
,
'\r'
);
convert_putc
(
rock
->
next
,
c
);
s
->
state
=
0
;
}
break
;
case
2
:
if
(
c
!=
' '
&&
c
!=
'\t'
)
{
convert_putc
(
rock
->
next
,
'\r'
);
convert_putc
(
rock
->
next
,
'\n'
);
convert_putc
(
rock
->
next
,
c
);
}
else
if
(
!
s
->
skipws
)
{
convert_putc
(
rock
->
next
,
c
);
}
s
->
state
=
0
;
break
;
}
}
/*
* Given a Unicode codepoint, emit one or more Unicode codepoints in
* search-normalised form (having applied recursive Unicode
* decomposition, like U+2026 HORIZONTAL ELLIPSIS to the three
* characters U+2E U+2E U+2E).
*/
static
void
uni2searchform
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
canon_state
*
s
=
(
struct
canon_state
*
)
rock
->
state
;
int
i
;
int
code
;
unsigned
char
table16
,
table8
;
if
(
c
==
U_REPLACEMENT
)
{
convert_putc
(
rock
->
next
,
c
);
return
;
}
table16
=
chartables_translation_block16
[(
c
>>
16
)
&
0xff
];
/* no translations */
if
(
table16
==
255
)
{
convert_putc
(
rock
->
next
,
c
);
return
;
}
table8
=
chartables_translation_block8
[
table16
][(
c
>>
8
)
&
0xff
];
/* no translations */
if
(
table8
==
255
)
{
convert_putc
(
rock
->
next
,
c
);
return
;
}
/* use the xlate table */
code
=
chartables_translation
[
table8
][
c
&
0xff
];
/* case - zero length output */
if
(
code
==
0
)
{
return
;
}
/* special case: whitespace or control characters */
if
(
code
==
' '
||
code
==
'\r'
||
code
==
'\n'
)
{
if
(
s
->
flags
&
CHARSET_SKIPSPACE
)
{
return
;
}
if
(
s
->
flags
&
CHARSET_MERGESPACE
)
{
if
(
s
->
seenspace
)
return
;
s
->
seenspace
=
1
;
code
=
' '
;
/* one SPACE char */
}
}
else
s
->
seenspace
=
0
;
/* case - one character output */
if
(
code
>
0
)
{
/* diacritical character range. This duplicates the
* behaviour of Cyrus versions before 2.5 */
if
(
s
->
flags
&
CHARSET_SKIPDIACRIT
)
{
if
(
0x300
<=
code
&&
code
<=
0x36f
)
return
;
}
convert_putc
(
rock
->
next
,
code
);
return
;
}
/* case - multiple characters */
for
(
i
=
-
code
;
chartables_translation_multichar
[
i
];
i
++
)
{
int
c
=
chartables_translation_multichar
[
i
];
/* diacritical character range. This duplicates the
* behaviour of Cyrus versions before 2.5 */
if
(
s
->
flags
&
CHARSET_SKIPDIACRIT
)
{
/* XXX combining diacritical marks only range from 0x300 to 0x36f
* but this would break backwards compatibility */
if
((
c
&
~
0xff
)
==
0x300
)
continue
;
}
/* note: whitespace already stripped from multichar sequences... */
convert_putc
(
rock
->
next
,
c
);
}
}
/*
* Given a Unicode codepoint, emit one or more Unicode codepoints in
* HTML form, suitable for generating search snippets.
*/
static
void
uni2html
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
canon_state
*
s
=
(
struct
canon_state
*
)
rock
->
state
;
if
(
c
==
U_REPLACEMENT
)
{
convert_putc
(
rock
->
next
,
c
);
return
;
}
if
(
s
->
flags
&
CHARSET_ESCAPEHTML
)
{
if
(
c
==
'<'
)
{
convert_cat
(
rock
->
next
,
"<"
);
return
;
}
if
(
c
==
'>'
)
{
convert_cat
(
rock
->
next
,
">"
);
return
;
}
if
(
c
==
'&'
)
{
convert_cat
(
rock
->
next
,
"&"
);
return
;
}
}
/* special case: whitespace or control characters */
if
(
c
==
' '
||
c
==
'\r'
||
c
==
'\n'
)
{
if
(
s
->
flags
&
CHARSET_SKIPSPACE
)
{
return
;
}
if
(
s
->
flags
&
CHARSET_MERGESPACE
)
{
if
(
s
->
seenspace
)
return
;
s
->
seenspace
=
1
;
c
=
' '
;
/* one SPACE char */
}
}
else
s
->
seenspace
=
0
;
convert_putc
(
rock
->
next
,
c
);
}
static
void
byte2search
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
search_state
*
s
=
(
struct
search_state
*
)
rock
->
state
;
int
i
,
cur
;
unsigned
char
b
=
(
unsigned
char
)
c
;
if
(
c
==
U_REPLACEMENT
)
{
c
=
0xff
;
/* searchable by invalid character! */
}
/* check our "in_progress" matches to see if they're still valid */
for
(
i
=
0
,
cur
=
0
;
i
<
s
->
max_start
;
i
++
)
{
/* no more active offsets */
if
(
s
->
starts
[
i
]
==
-1
)
break
;
/* if we've passed one that's not ongoing, copy back */
if
(
cur
<
i
)
s
->
starts
[
cur
]
=
s
->
starts
[
i
];
/* check that the substring is still matching */
if
(
b
==
s
->
substr
[
s
->
offset
-
s
->
starts
[
i
]])
{
if
(
s
->
offset
-
s
->
starts
[
i
]
==
s
->
patlen
-
1
)
{
/* we're there! */
s
->
havematch
=
1
;
}
else
{
/* keep this one, it's ongoing */
cur
++
;
}
}
}
/* starting a new one! */
if
(
b
==
s
->
substr
[
0
])
{
/* have to treat this one specially! */
if
(
s
->
patlen
==
1
)
s
->
havematch
=
1
;
else
s
->
starts
[
cur
++
]
=
s
->
offset
;
}
/* empty out any others that aren't being kept */
while
(
cur
<
i
)
s
->
starts
[
cur
++
]
=
-1
;
/* increment the offset counter */
s
->
offset
++
;
}
/* Given an octet, append it to a buffer */
static
void
byte2buffer
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
buf
*
buf
=
(
struct
buf
*
)
rock
->
state
;
buf_putc
(
buf
,
c
&
0xff
);
}
/* Given an octet c and an icu converter, convert c to
* its Unicode codepoint. During a flush, c is ignored.
*/
static
void
icu2uni
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
charset_charset
*
s
=
(
struct
charset_charset
*
)
rock
->
state
;
UErrorCode
err
;
/* Assert a sane state. */
assert
(
s
->
flush
||
((
unsigned
)
c
)
<=
0xff
||
c
==
U_REPLACEMENT
);
/* Fill up the buffer until its either full or we are flushing. */
if
(
!
s
->
flush
&&
c
<=
0xff
)
{
*
s
->
src_next
++
=
c
;
/* Is there still space in the buffer? */
if
(
s
->
src_next
<
s
->
src_top
)
return
;
}
do
{
size_t
n
;
UChar32
cp
;
/* Set up the target buffer. */
const
UChar
*
tgt_limit
=
(
const
UChar
*
)
s
->
tgt_top
;
UChar
*
tgt
=
(
UChar
*
)
s
->
tgt_next
;
/* Set up the source buffer */
const
char
*
src
=
s
->
src_base
;
const
char
*
src_limit
=
s
->
src_next
;
/* Convert the source buffer to Unicode. */
err
=
U_ZERO_ERROR
;
ucnv_toUnicode
(
s
->
conv
,
&
tgt
,
tgt_limit
,
&
src
,
src_limit
,
NULL
,
s
->
flush
||
c
==
U_REPLACEMENT
,
&
err
);
/* Keep any bytes left in the source buffer. */
n
=
src_limit
-
src
;
if
(
n
)
memmove
(
s
->
src_base
,
src
,
n
);
s
->
src_next
=
s
->
src_base
+
n
;
/* Bail out on errors. */
if
(
U_FAILURE
(
err
)
&&
err
!=
U_BUFFER_OVERFLOW_ERROR
)
{
convert_putc
(
rock
->
next
,
U_REPLACEMENT
);
return
;
}
/* Emit any complete codepoints. */
UChar
*
t
=
(
UChar
*
)
s
->
tgt_base
;
while
(
t
<
tgt
&&
(
U16_IS_SINGLE
(
*
t
)
||
t
<
tgt
-1
))
{
ssize_t
i
=
0
;
U16_NEXT
(
t
,
i
,
tgt
-
t
,
cp
);
convert_putc
(
rock
->
next
,
cp
);
t
+=
i
;
}
/* Keep any incomplete codepoints and reset the target buffer. */
n
=
(
tgt
-
t
)
*
sizeof
(
UChar
);
if
(
n
)
memmove
(
s
->
tgt_base
,
t
,
n
);
s
->
tgt_next
=
s
->
tgt_base
+
n
;
}
while
(
err
==
U_BUFFER_OVERFLOW_ERROR
);
/* Pass any errors down the pipeline. */
if
(
c
==
U_REPLACEMENT
)
{
convert_putc
(
rock
->
next
,
c
);
}
}
/* Given Unicode codepoint c and an icu converter, convert c and emit
* its octets. During a flush, c is ignored. */
static
void
uni2icu
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
charset_charset
*
s
=
(
struct
charset_charset
*
)
rock
->
state
;
UErrorCode
err
;
UChar
*
src_next
=
(
UChar
*
)
s
->
src_next
;
/* Fill up the buffer until its either full or we are flushing. */
if
(
!
s
->
flush
)
{
if
(
U16_LENGTH
(
c
)
==
1
)
{
*
src_next
++
=
c
;
}
else
{
*
src_next
++
=
U16_LEAD
(
c
);
*
src_next
++
=
U16_TRAIL
(
c
);
}
s
->
src_next
=
(
char
*
)
src_next
;
/* Can we buffer at least one more 32-bit codepoint. */
if
(
s
->
src_next
<
(
s
->
src_top
-
2
*
sizeof
(
UChar
)))
return
;
}
do
{
size_t
n
;
char
*
t
;
/* Set up the target buffer. */
char
*
tgt
=
s
->
tgt_base
;
const
char
*
tgt_limit
=
s
->
tgt_top
;
/* Set up the source buffer */
const
UChar
*
src
=
(
const
UChar
*
)
s
->
src_base
;
const
UChar
*
src_limit
=
(
const
UChar
*
)
s
->
src_next
;
/* Convert the source buffer from Unicode. */
err
=
U_ZERO_ERROR
;
ucnv_fromUnicode
(
s
->
conv
,
&
tgt
,
tgt_limit
,
&
src
,
src_limit
,
NULL
,
s
->
flush
,
&
err
);
/* Keep any bytes left in the source buffer. */
n
=
(
src_limit
-
src
)
*
sizeof
(
UChar
);
if
(
n
)
memmove
(
s
->
src_base
,
src
,
n
);
s
->
src_next
=
s
->
src_base
+
n
;
/* Bail out on errors. */
if
(
U_FAILURE
(
err
)
&&
err
!=
U_BUFFER_OVERFLOW_ERROR
)
{
convert_putc
(
rock
->
next
,
U_REPLACEMENT
);
return
;
}
/* Emit any converted octets. */
for
(
t
=
s
->
tgt_base
;
t
<
tgt
;
t
++
)
convert_putc
(
rock
->
next
,
*
t
);
/* Reset the target buffer. */
s
->
tgt_next
=
s
->
tgt_base
;
}
while
(
err
==
U_BUFFER_OVERFLOW_ERROR
);
}
/* Given an octet in a UTF-8 encoded string, possibly emit a Unicode
* code point */
static
void
utf8_2uni
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
charset_charset
*
s
=
(
struct
charset_charset
*
)
rock
->
state
;
if
(
c
==
U_REPLACEMENT
)
{
emit_replacement
:
convert_putc
(
rock
->
next
,
U_REPLACEMENT
);
s
->
bytesleft
=
0
;
s
->
codepoint
=
0
;
return
;
}
assert
((
unsigned
)
c
<=
0xff
);
/*
* The following bytes are never valid in UTF-8 streams:
*
* C0-C1 could only be used for overlong encoding of
* basic ASCII characters
* F5-FD start bytes of sequences that could only encode
* numbers larger than the 0x10FFFF limit of Unicode
* FE-FF not valid start bytes or anything else
*
* Thanks to http://en.wikipedia.org/wiki/UTF-8
*
* These checks are interspersed with bitwise checks below.
*
* When we see a valid leading character but a sequence has
* not finished, we have detected an ill-formed sequence. The
* correct thing to do according to Section 3.9 of the Unicode
* standard 6.0 is to jettison the current sequence, emit the
* Replacement character and begin a new sequence with the new
* character. From the standard:
*
* For example, with the input UTF-8 code unit sequence
* <C2 41 42>, such a UTF-8 conversion process must not
* return <U+FFFD> or <U+FFFD, U+0042>, because either of
* those outputs would be the result of misinterpreting
* a well-formed subsequence as being part of the ill-formed
* subsequence. The expected return value for such a process
* would instead be <U+FFFD, U+0041, U+0042>.
*/
if
((
c
&
0xf8
)
==
0xf0
)
{
/* 11110xxx */
/* first of a 4 char sequence */
if
(
s
->
bytesleft
)
/* incomplete sequence */
convert_putc
(
rock
->
next
,
U_REPLACEMENT
);
if
(
c
>=
0xf5
&&
c
<=
0xf7
)
goto
emit_replacement
;
s
->
bytesleft
=
3
;
s
->
codepoint
=
c
&
0x07
;
/* 00000111 */
}
else
if
((
c
&
0xf0
)
==
0xe0
)
{
/* 1110xxxx */
/* first of a 3 char sequence */
if
(
s
->
bytesleft
)
/* incomplete sequence */
convert_putc
(
rock
->
next
,
U_REPLACEMENT
);
s
->
bytesleft
=
2
;
s
->
codepoint
=
c
&
0x0f
;
/* 00001111 */
}
else
if
((
c
&
0xe0
)
==
0xc0
)
{
/* 110xxxxx */
/* first of a 2 char sequence */
if
(
s
->
bytesleft
)
/* incomplete sequence */
convert_putc
(
rock
->
next
,
U_REPLACEMENT
);
if
(
c
==
0xc0
||
c
==
0xc1
)
goto
emit_replacement
;
s
->
bytesleft
=
1
;
s
->
codepoint
=
c
&
0x1f
;
/* 00011111 */
}
else
if
((
c
&
0xc0
)
==
0x80
)
{
/* 10xxxxxx */
/* continuation char, handle only if expected */
if
(
s
->
bytesleft
>
0
)
{
s
->
codepoint
=
(
s
->
codepoint
<<
6
)
+
(
c
&
0x3f
);
/* 00111111 */
s
->
bytesleft
--
;
if
(
!
s
->
bytesleft
)
{
convert_putc
(
rock
->
next
,
s
->
codepoint
);
s
->
codepoint
=
0
;
}
}
else
goto
emit_replacement
;
}
else
if
(
c
>=
0xf8
&&
c
<=
0xff
)
{
goto
emit_replacement
;
}
else
{
/* plain ASCII char */
if
(
s
->
bytesleft
)
/* incomplete sequence */
convert_putc
(
rock
->
next
,
U_REPLACEMENT
);
convert_putc
(
rock
->
next
,
c
);
s
->
bytesleft
=
0
;
s
->
codepoint
=
0
;
}
}
/* Given a Unicode codepoint, emit valid UTF-8 encoded octets */
static
void
uni2utf8
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
if
(
!
unicode_isvalid
(
c
))
c
=
U_REPLACEMENT
;
/* UTF-8 can encode code points up to 0x7fffffff, but the currently
* defined last valid codepoint is 0x10ffff so we only handle that
* range. */
if
(
c
>
0xffff
)
{
convert_putc
(
rock
->
next
,
0xF0
+
((
c
>>
18
)
&
0x07
));
convert_putc
(
rock
->
next
,
0x80
+
((
c
>>
12
)
&
0x3f
));
convert_putc
(
rock
->
next
,
0x80
+
((
c
>>
6
)
&
0x3f
));
convert_putc
(
rock
->
next
,
0x80
+
(
c
&
0x3f
));
}
else
if
(
c
>
0x7ff
)
{
convert_putc
(
rock
->
next
,
0xE0
+
((
c
>>
12
)
&
0x0f
));
convert_putc
(
rock
->
next
,
0x80
+
((
c
>>
6
)
&
0x3f
));
convert_putc
(
rock
->
next
,
0x80
+
(
c
&
0x3f
));
}
else
if
(
c
>
0x7f
)
{
convert_putc
(
rock
->
next
,
0xC0
+
((
c
>>
6
)
&
0x1f
));
convert_putc
(
rock
->
next
,
0x80
+
(
c
&
0x3f
));
}
else
{
convert_putc
(
rock
->
next
,
c
);
}
}
/* Given an octet which is a codepoint in some 7bit or 8bit character
* set, or the Unicode replacement character, emit the corresponding
* Unicode codepoint. */
static
void
table2uni
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
charset_charset
*
s
=
(
struct
charset_charset
*
)
rock
->
state
;
struct
charmap
*
map
;
if
(
c
==
U_REPLACEMENT
)
{
convert_putc
(
rock
->
next
,
c
);
return
;
}
assert
((
unsigned
)
c
<=
0xff
);
map
=
(
struct
charmap
*
)
&
s
->
curtable
[
0
][
c
&
0xff
];
if
(
map
->
c
)
convert_putc
(
rock
->
next
,
map
->
c
);
s
->
curtable
=
s
->
initialtable
+
map
->
next
;
}
/*
* The HTML5 standard mandates that certain Unicode code points
* cannot be generated using &#nnn; numerical character references,
* and should generate a parse error. This function detects them.
*/
static
int
html_uiserror
(
uint32_t
c
)
{
static
const
struct
{
unsigned
int
mask
,
lo
,
hi
;
}
ranges
[]
=
{
{
~
0U
,
0x0001
,
0x0008
},
{
~
0U
,
0x000b
,
0x000b
},
{
~
0U
,
0x000e
,
0x001f
},
{
~
0U
,
0x007f
,
0x009f
},
{
~
0U
,
0xfdd0
,
0xfdef
},
{
0xffff
,
0xfffe
,
0xffff
}
};
unsigned
int
i
;
for
(
i
=
0
;
i
<
VECTOR_SIZE
(
ranges
)
;
i
++
)
{
unsigned
c2
=
(
unsigned
)
c
&
ranges
[
i
].
mask
;
if
(
c2
>=
ranges
[
i
].
lo
&&
c2
<=
ranges
[
i
].
hi
)
return
1
;
}
return
0
;
}
static
void
html_saw_character
(
struct
convert_rock
*
rock
)
{
struct
striphtml_state
*
s
=
(
struct
striphtml_state
*
)
rock
->
state
;
const
char
*
ent
=
buf_cstring
(
&
s
->
name
);
int
c
;
static
const
int
compat_chars
[]
=
{
/* Mappings of old numeric character codepoints between 0x80 and
* 0x9f inclusive, defined for backwards compatibility by HTML5,
* to Unicode code points. Note that some of these are mapped
* to codepoints which are mandated to be parse errors. */
0x20AC
,
/* EURO SIGN (€) */
0x0081
,
/* <control> */
0x201A
,
/* SINGLE LOW-9 QUOTATION MARK (‚) */
0x0192
,
/* LATIN SMALL LETTER F WITH HOOK (ƒ) */
0x201E
,
/* DOUBLE LOW-9 QUOTATION MARK („) */
0x2026
,
/* HORIZONTAL ELLIPSIS (…) */
0x2020
,
/* DAGGER (†) */
0x2021
,
/* DOUBLE DAGGER (‡) */
0x02C6
,
/* MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ) */
0x2030
,
/* PER MILLE SIGN (‰) */
0x0160
,
/* LATIN CAPITAL LETTER S WITH CARON (Š) */
0x2039
,
/* SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹) */
0x0152
,
/* LATIN CAPITAL LIGATURE OE (Œ) */
0x008D
,
/* <control> */
0x017D
,
/* LATIN CAPITAL LETTER Z WITH CARON (Ž) */
0x008F
,
/* <control> */
0x0090
,
/* <control> */
0x2018
,
/* LEFT SINGLE QUOTATION MARK (‘) */
0x2019
,
/* RIGHT SINGLE QUOTATION MARK (’) */
0x201C
,
/* LEFT DOUBLE QUOTATION MARK (“) */
0x201D
,
/* RIGHT DOUBLE QUOTATION MARK (”) */
0x2022
,
/* BULLET (•) */
0x2013
,
/* EN DASH (–) */
0x2014
,
/* EM DASH (—) */
0x02DC
,
/* SMALL TILDE (˜) */
0x2122
,
/* TRADE MARK SIGN (™) */
0x0161
,
/* LATIN SMALL LETTER S WITH CARON (š) */
0x203A
,
/* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›) */
0x0153
,
/* LATIN SMALL LIGATURE OE (œ) */
0x009D
,
/* <control> */
0x017E
,
/* LATIN SMALL LETTER Z WITH CARON (ž) */
0x0178
/* LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ) */
};
if
(
charset_debug
)
fprintf
(
stderr
,
"html_saw_character(%s)
\n
"
,
ent
);
if
(
ent
[
0
]
==
'#'
)
{
if
(
ent
[
1
]
==
'x'
||
ent
[
1
]
==
'X'
)
c
=
strtoul
(
ent
+
2
,
NULL
,
16
);
else
c
=
strtoul
(
ent
+
1
,
NULL
,
10
);
/* no need for format error checking, the lexer did that */
/* Perform character mapping and validation per
* http://dev.w3.org/html5/spec/tokenization.html#consume-a-character-reference
*/
if
(
c
==
0
)
c
=
U_REPLACEMENT
;
else
if
(
c
>=
0x80
&&
c
<=
0x9f
)
c
=
compat_chars
[
c
-0x80
];
else
if
(
!
unicode_isvalid
(
c
))
c
=
U_REPLACEMENT
;
/* invalid Unicode codepoint */
if
(
html_uiserror
(
c
))
{
/* the HTML5 spec says this is a parse error, but it's
* unclear what that means for us; we choose to strip the
* character but could also emit the replacement character. */
return
;
}
}
else
{
c
=
htmlchar_from_string
(
ent
);
if
(
c
==
-1
)
{
c
=
U_REPLACEMENT
;
/* unknown character */
}
else
if
(
c
>
0xffff
)
{
/* Hack to handle a small minority of named characters
* which map to a sequence of two Unicode codepoints. */
convert_putc
(
rock
->
next
,
(
c
>>
16
)
&
0xffff
);
convert_putc
(
rock
->
next
,
c
&
0xffff
);
return
;
}
}
convert_putc
(
rock
->
next
,
c
);
}
static
const
char
*
html_state_as_string
(
enum
html_state
state
)
{
switch
(
state
)
{
case
HDATA
:
return
"HDATA"
;
case
HTAGOPEN
:
return
"HTAGOPEN"
;
case
HENDTAGOPEN
:
return
"HENDTAGOPEN"
;
case
HTAGNAME
:
return
"HTAGNAME"
;
case
HSCTAG
:
return
"HSCTAG"
;
case
HTAGPARAMS
:
return
"HTAGPARAMS"
;
case
HCHARACTER
:
return
"HCHARACTER"
;
case
HCHARACTER2
:
return
"HCHARACTER2"
;
case
HCHARACTERHASH
:
return
"HCHARACTERHASH"
;
case
HCHARACTERHEX
:
return
"HCHARACTERHEX"
;
case
HCHARACTERDEC
:
return
"HCHARACTERDEC"
;
case
HSCRIPTDATA
:
return
"HSCRIPTDATA"
;
case
HSCRIPTLT
:
return
"HSCRIPTLT"
;
case
HSTYLEDATA
:
return
"HSTYLEDATA"
;
case
HSTYLELT
:
return
"HSTYLELT"
;
case
HBOGUSCOMM
:
return
"HBOGUSCOMM"
;
case
HMUDECOPEN
:
return
"HMUDECOPEN"
;
case
HCOMMSTART
:
return
"HCOMMSTART"
;
case
HCOMMSTARTDASH
:
return
"HCOMMSTARTDASH"
;
case
HCOMM
:
return
"HCOMM"
;
case
HCOMMENDDASH
:
return
"HCOMMENDDASH"
;
case
HCOMMEND
:
return
"HCOMMEND"
;
case
HCOMMENDBANG
:
return
"HCOMMENDBANG"
;
}
return
"wtf?"
;
}
static
void
html_push
(
struct
striphtml_state
*
s
,
enum
html_state
state
)
{
assert
(
s
->
depth
<
(
int
)(
sizeof
(
s
->
stack
)
/
sizeof
(
s
->
stack
[
0
])));
if
(
charset_debug
)
fprintf
(
stderr
,
"html_push(%s)
\n
"
,
html_state_as_string
(
state
));
s
->
stack
[
s
->
depth
++
]
=
state
;
}
static
int
html_pop
(
struct
striphtml_state
*
s
)
{
assert
(
s
->
depth
>
0
);
if
(
charset_debug
)
fprintf
(
stderr
,
"html_pop()
\n
"
);
return
s
->
stack
[
--
s
->
depth
];
}
static
void
html_go
(
struct
striphtml_state
*
s
,
enum
html_state
state
)
{
assert
(
s
->
depth
>
0
);
if
(
charset_debug
)
fprintf
(
stderr
,
"html_go(%s)
\n
"
,
html_state_as_string
(
state
));
s
->
stack
[
s
->
depth
-1
]
=
state
;
}
static
enum
html_state
html_top
(
struct
striphtml_state
*
s
)
{
assert
(
s
->
depth
>
0
);
return
s
->
stack
[
s
->
depth
-1
];
}
static
int
is_phrasing
(
char
*
tag
)
{
static
const
char
*
const
phrasing_tags
[]
=
{
"a"
,
"q"
,
"cite"
,
"em"
,
"strong"
,
"small"
,
"mark"
,
"dfn"
,
"abbr"
,
"time"
,
"progress"
,
"meter"
,
"code"
,
"var"
,
"samp"
,
"kbd"
,
"sub"
,
"sup"
,
"span"
,
"i"
,
"b"
,
"bdo"
,
"ruby"
,
"ins"
,
"del"
};
static
struct
hash_table
hash
=
HASH_TABLE_INITIALIZER
;
if
(
hash
.
table
==
NULL
)
{
unsigned
int
i
;
construct_hash_table
(
&
hash
,
VECTOR_SIZE
(
phrasing_tags
),
0
);
for
(
i
=
0
;
i
<
VECTOR_SIZE
(
phrasing_tags
)
;
i
++
)
hash_insert
(
phrasing_tags
[
i
],
(
void
*
)
1
,
&
hash
);
}
return
(
hash_lookup
(
lcase
(
tag
),
&
hash
)
==
(
void
*
)
1
);
}
static
void
html_saw_tag
(
struct
convert_rock
*
rock
)
{
struct
striphtml_state
*
s
=
(
struct
striphtml_state
*
)
rock
->
state
;
char
*
tag
;
enum
html_state
state
=
html_top
(
s
);
buf_cstring
(
&
s
->
name
);
tag
=
s
->
name
.
s
;
if
(
charset_debug
)
fprintf
(
stderr
,
"html_saw_tag() state=%s tag=
\"
%s
\"
ends=%s,%s
\n
"
,
html_state_as_string
(
state
),
tag
,
(
s
->
ends
&
HBEGIN
?
"BEGIN"
:
"-"
),
(
s
->
ends
&
HEND
?
"END"
:
"-"
));
/* gnb:TODO: what are we supposed to do with a nested <script> tag? */
if
(
!
strcasecmp
(
tag
,
"script"
))
{
if
(
state
==
HDATA
&&
s
->
ends
==
HBEGIN
)
html_go
(
s
,
HSCRIPTDATA
);
else
if
(
state
==
HSCRIPTDATA
&&
s
->
ends
==
HEND
)
html_go
(
s
,
HDATA
);
/* BEGIN,END pair is doesn't affect state */
}
else
if
(
!
strcasecmp
(
tag
,
"style"
))
{
if
(
state
==
HDATA
&&
s
->
ends
==
HBEGIN
)
html_go
(
s
,
HSTYLEDATA
);
else
if
(
state
==
HSTYLEDATA
&&
s
->
ends
==
HEND
)
html_go
(
s
,
HDATA
);
/* BEGIN,END pair is doesn't affect state */
}
else
if
(
!
is_phrasing
(
tag
))
{
convert_putc
(
rock
->
next
,
' '
);
}
/* otherwise, no change */
}
/*
* Note we don't use isalnum - the test has to be in US-ASCII always
* regardless of the charset of the text or the locale of this process
*/
#define html_isalpha(c) \
(((c) >= 'a' && (c) <= 'z') || \
((c) >= 'A' && (c) <= 'Z'))
#define html_isxdigit(c) \
(((c) >= '0' && (c) <= '9') || \
((c) >= 'a' && (c) <= 'f') || \
((c) >= 'A' && (c) <= 'F'))
#define html_isdigit(c) \
(((c) >= '0' && (c) <= '9'))
#define html_isspace(c) \
((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
void
striphtml2uni
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
striphtml_state
*
s
=
(
struct
striphtml_state
*
)
rock
->
state
;
restart
:
switch
(
html_top
(
s
))
{
case
HDATA
:
if
(
c
==
'<'
)
{
html_push
(
s
,
HTAGOPEN
);
buf_reset
(
&
s
->
name
);
}
else
if
(
c
==
'&'
)
{
html_go
(
s
,
HCHARACTER
);
buf_reset
(
&
s
->
name
);
}
else
{
convert_putc
(
rock
->
next
,
c
);
}
break
;
case
HSCRIPTDATA
:
/* 8.2.4.6 Script data state */
if
(
c
==
'<'
)
{
html_push
(
s
,
HSCRIPTLT
);
}
/* else, strip the character */
break
;
case
HSCRIPTLT
:
/* 8.2.4.17 Script data less-than sign state */
/* TODO: deal with <! inside SCRIPT tags properly per
* http://dev.w3.org/html5/spec/tokenization.html#script-data-less-than-sign-state
*/
if
(
c
==
'/'
)
{
s
->
ends
=
HEND
;
html_go
(
s
,
HTAGNAME
);
buf_reset
(
&
s
->
name
);
}
else
{
/* naked <, emit it and restart with current character */
convert_putc
(
rock
->
next
,
c
);
html_pop
(
s
);
goto
restart
;
}
break
;
case
HSTYLEDATA
:
if
(
c
==
'<'
)
{
html_push
(
s
,
HSTYLELT
);
}
/* else, strip the character */
break
;
case
HSTYLELT
:
if
(
c
==
'/'
)
{
s
->
ends
=
HEND
;
html_go
(
s
,
HTAGNAME
);
buf_reset
(
&
s
->
name
);
}
else
{
/* naked <, emit it and restart with current character */
convert_putc
(
rock
->
next
,
c
);
html_pop
(
s
);
goto
restart
;
}
break
;
case
HCHARACTER
:
/* 8.2.4.2 Character reference in data state */
/* This is the "consume a character reference" algorithm
* rewritten to use additional lexical states */
if
(
c
==
'#'
)
{
buf_putc
(
&
s
->
name
,
c
);
html_go
(
s
,
HCHARACTERHASH
);
}
else
if
(
html_isalpha
(
c
))
{
buf_putc
(
&
s
->
name
,
c
);
html_go
(
s
,
HCHARACTER2
);
}
else
{
/* naked & - emit the & and restart */
convert_putc
(
rock
->
next
,
'&'
);
html_go
(
s
,
HDATA
);
goto
restart
;
}
break
;
case
HCHARACTERHASH
:
/* '&' then '#' */
if
(
c
==
'x'
||
c
==
'X'
)
{
buf_putc
(
&
s
->
name
,
c
);
html_go
(
s
,
HCHARACTERHEX
);
}
else
if
(
html_isdigit
(
c
))
{
buf_putc
(
&
s
->
name
,
c
);
html_go
(
s
,
HCHARACTERDEC
);
}
else
{
/* naked &# - emit the &# and restart */
convert_putc
(
rock
->
next
,
'&'
);
convert_putc
(
rock
->
next
,
'#'
);
html_go
(
s
,
HDATA
);
goto
restart
;
}
break
;
case
HCHARACTER2
:
if
(
html_isalpha
(
c
))
{
buf_putc
(
&
s
->
name
,
c
);
/* TODO: we're supposed to look this up
* to see if it's an known character so that
* ¬it; is parsed as the 4 chars
* '¬' 'i' 't' ';' */
}
else
{
html_saw_character
(
rock
);
html_go
(
s
,
HDATA
);
if
(
c
!=
';'
)
{
/* character reference not correctly terminated -
* restart with the next character */
goto
restart
;
}
}
break
;
case
HCHARACTERHEX
:
if
(
html_isxdigit
(
c
))
{
buf_putc
(
&
s
->
name
,
c
);
}
else
{
html_saw_character
(
rock
);
html_go
(
s
,
HDATA
);
if
(
c
!=
';'
)
{
/* character reference not correctly terminated -
* restart with the next character */
goto
restart
;
}
}
break
;
case
HCHARACTERDEC
:
if
(
html_isdigit
(
c
))
{
buf_putc
(
&
s
->
name
,
c
);
}
else
{
html_saw_character
(
rock
);
html_go
(
s
,
HDATA
);
if
(
c
!=
';'
)
{
/* character reference not correctly terminated -
* restart with the next character */
goto
restart
;
}
}
break
;
case
HTAGOPEN
:
/* 8.2.4.8 Tag open state */
if
(
c
==
'!'
)
{
/* markup declaration open delimiter - let's just assume
* it's a comment */
html_pop
(
s
);
html_go
(
s
,
HMUDECOPEN
);
buf_reset
(
&
s
->
name
);
}
else
if
(
c
==
'/'
)
{
html_go
(
s
,
HENDTAGOPEN
);
}
else
if
(
html_isalpha
(
c
))
{
s
->
ends
=
HBEGIN
;
buf_putc
(
&
s
->
name
,
c
);
html_go
(
s
,
HTAGNAME
);
}
else
{
/* apparently a naked <, emit the < and restart */
convert_putc
(
rock
->
next
,
'<'
);
html_pop
(
s
);
goto
restart
;
}
break
;
case
HENDTAGOPEN
:
/* 8.2.4.9 End tag open state */
if
(
html_isalpha
(
c
))
{
s
->
ends
=
HEND
;
buf_putc
(
&
s
->
name
,
c
);
html_go
(
s
,
HTAGNAME
);
}
else
{
/* error */
html_pop
(
s
);
}
break
;
case
HTAGNAME
:
/* 8.2.4.10 Tag name state */
/* gnb:TODO handle > embedded in "param" */
if
(
html_isspace
(
c
))
{
html_go
(
s
,
HTAGPARAMS
);
}
else
if
(
c
==
'/'
)
{
html_go
(
s
,
HSCTAG
);
}
else
if
(
c
==
'>'
)
{
html_pop
(
s
);
html_saw_tag
(
rock
);
}
else
if
(
html_isalpha
(
c
))
{
buf_putc
(
&
s
->
name
,
c
);
}
else
{
/* error */
html_pop
(
s
);
}
break
;
case
HSCTAG
:
/* 8.2.4.43 Self-closing start tag state */
if
(
c
==
'>'
)
{
s
->
ends
=
HBEGIN
|
HEND
;
html_pop
(
s
);
html_saw_tag
(
rock
);
}
else
{
/* whatever, keep stripping tag parameters */
html_go
(
s
,
HTAGPARAMS
);
}
break
;
case
HTAGPARAMS
:
/* ignores all text until next '>' */
if
(
c
==
'>'
)
{
html_pop
(
s
);
html_saw_tag
(
rock
);
}
else
if
(
c
==
'/'
)
{
html_go
(
s
,
HSCTAG
);
}
break
;
case
HBOGUSCOMM
:
/* 8.2.4.44 Bogus comment state */
/* strip all text until closing > */
if
(
c
==
'>'
)
{
html_go
(
s
,
HDATA
);
}
break
;
case
HMUDECOPEN
:
/* 8.2.4.45 Markup declaration open state */
if
(
c
==
'-'
)
{
buf_putc
(
&
s
->
name
,
c
);
if
(
s
->
name
.
len
==
2
)
html_go
(
s
,
HCOMMSTART
);
}
else
{
/* ignore DOCTYPE or CDATA */
html_go
(
s
,
HBOGUSCOMM
);
goto
restart
;
/* in case it's a > */
}
break
;
case
HCOMMSTART
:
/* 8.2.4.46 Comment start state */
if
(
c
==
'-'
)
html_go
(
s
,
HCOMMSTARTDASH
);
else
if
(
c
==
'>'
)
html_go
(
s
,
HDATA
);
/* very short comment <!--> */
else
html_go
(
s
,
HCOMM
);
break
;
case
HCOMMSTARTDASH
:
/* 8.2.4.47 Comment start dash state */
if
(
c
==
'-'
)
html_go
(
s
,
HCOMMEND
);
else
if
(
c
==
'>'
)
html_go
(
s
,
HDATA
);
/* incorrectly formed -> comment end */
else
html_go
(
s
,
HCOMM
);
/* else strip */
break
;
case
HCOMM
:
/* 8.2.4.48 Comment state */
if
(
c
==
'-'
)
html_go
(
s
,
HCOMMENDDASH
);
/* else strip */
break
;
case
HCOMMENDDASH
:
/* 8.2.4.49 Comment end dash state */
if
(
c
==
'-'
)
html_go
(
s
,
HCOMMEND
);
/* -- pair in comment */
else
html_go
(
s
,
HCOMM
);
/* lone - in comment */
break
;
case
HCOMMEND
:
/* 8.2.4.50 Comment end state */
if
(
c
==
'>'
)
html_go
(
s
,
HDATA
);
/* correctly formed --> comment end */
else
if
(
c
==
'!'
)
html_go
(
s
,
HCOMMENDBANG
);
/* --! in a comment */
else
if
(
c
!=
'-'
)
html_go
(
s
,
HCOMM
);
/* -- in the middle of a comment */
/* else, --- in comment, strip */
break
;
case
HCOMMENDBANG
:
/* 8.2.4.51 Comment end bang state */
if
(
c
==
'-'
)
html_go
(
s
,
HCOMMENDDASH
);
/* --!- in comment */
else
if
(
c
==
'>'
)
html_go
(
s
,
HDATA
);
/* --!> at end of comment */
else
html_go
(
s
,
HCOMM
);
/* --! in the middle of a comment */
break
;
}
}
static
const
char
*
convert_name
(
struct
convert_rock
*
rock
)
{
if
(
rock
->
f
==
b64_2byte
)
return
"b64_2byte"
;
if
(
rock
->
f
==
byte2buffer
)
return
"byte2buffer"
;
if
(
rock
->
f
==
byte2search
)
return
"byte2search"
;
if
(
rock
->
f
==
qp2byte
)
return
"qp2byte"
;
if
(
rock
->
f
==
striphtml2uni
)
return
"striphtml2uni"
;
if
(
rock
->
f
==
unfold2uni
)
return
"unfold2uni"
;
if
(
rock
->
f
==
uni2searchform
)
return
"uni2searchform"
;
if
(
rock
->
f
==
uni2html
)
return
"uni2html"
;
if
(
rock
->
f
==
table2uni
)
return
"table2uni"
;
if
(
rock
->
f
==
uni2utf8
)
return
"uni2utf8"
;
if
(
rock
->
f
==
utf8_2uni
)
return
"utf8_2uni"
;
if
(
rock
->
f
==
uni2icu
)
return
"uni2icu"
;
if
(
rock
->
f
==
icu2uni
)
return
"icu2uni"
;
return
"wtf"
;
}
/* convert_rock manipulation routines */
/* Extract a cstring from a buffer. NOTE: caller must free the memory
* themselves once this is called. Resets the state. If you don't
* call this function then buffer_cleanup will clean up */
static
char
*
buffer_cstring
(
struct
convert_rock
*
rock
)
{
struct
buf
*
buf
=
(
struct
buf
*
)
rock
->
state
;
return
buf_release
(
buf
);
}
static
void
buffer_trim
(
struct
convert_rock
*
rock
)
{
struct
buf
*
buf
=
(
struct
buf
*
)
rock
->
state
;
buf_trim
(
buf
);
}
static
inline
int
search_havematch
(
struct
convert_rock
*
rock
)
{
struct
search_state
*
s
=
(
struct
search_state
*
)
rock
->
state
;
return
s
->
havematch
;
}
/* conversion cleanup routines */
static
void
basic_free
(
struct
convert_rock
*
rock
)
{
if
(
rock
)
{
if
(
!
rock
->
dont_free_state
)
free
(
rock
->
state
);
free
(
rock
);
}
}
static
void
icu_flush
(
struct
convert_rock
*
rock
)
{
struct
charset_charset
*
s
=
(
struct
charset_charset
*
)
rock
->
state
;
s
->
flush
=
1
;
if
(
rock
->
f
==
icu2uni
)
{
icu2uni
(
rock
,
-1
);
}
else
if
(
rock
->
f
==
uni2icu
)
{
uni2icu
(
rock
,
U_REPLACEMENT
);
}
s
->
flush
=
0
;
}
static
void
icu_cleanup
(
struct
convert_rock
*
rock
,
int
is_free
)
{
if
(
rock
)
{
if
(
rock
->
state
)
icu_reset
(
rock
,
-1
/*don't care*/
);
if
(
is_free
)
free
(
rock
);
}
}
static
void
icu_reset
(
struct
convert_rock
*
rock
,
int
to_uni
)
{
struct
charset_charset
*
s
=
(
struct
charset_charset
*
)
rock
->
state
;
size_t
buf_size
=
CHARSET_ICUBUF_BUFFER_SIZE
;
if
(
s
->
buf_size
<
buf_size
)
{
s
->
buf
=
xrealloc
(
s
->
buf
,
buf_size
*
2
);
s
->
buf_size
=
buf_size
;
}
ucnv_reset
(
s
->
conv
);
s
->
tgt_base
=
s
->
buf
;
s
->
tgt_top
=
s
->
tgt_base
+
s
->
buf_size
;
s
->
tgt_next
=
s
->
tgt_base
;
s
->
src_base
=
s
->
buf
+
s
->
buf_size
;
s
->
src_top
=
s
->
src_base
+
s
->
buf_size
;
s
->
src_next
=
s
->
src_base
;
rock
->
f
=
to_uni
?
icu2uni
:
uni2icu
;
rock
->
flush
=
icu_flush
;
rock
->
cleanup
=
icu_cleanup
;
}
static
void
table_cleanup
(
struct
convert_rock
*
rock
,
int
is_free
)
{
if
(
is_free
)
free
(
rock
);
}
static
void
table_reset
(
struct
convert_rock
*
rock
,
int
to_uni
)
{
struct
charset_charset
*
s
=
(
struct
charset_charset
*
)
rock
->
state
;
if
(
chartables_charset_table
[
s
->
num
].
table
)
{
s
->
initialtable
=
chartables_charset_table
[
s
->
num
].
table
;
s
->
curtable
=
s
->
initialtable
;
}
if
(
strstr
(
chartables_charset_table
[
s
->
num
].
name
,
"utf-8"
))
{
rock
->
f
=
to_uni
?
utf8_2uni
:
uni2utf8
;
}
else
{
/* A truly table-based converter may never convert from Unicode
* to its charmap. This has been implicitly assumed in the existing
* code, but let's be explicit here. */
assert
(
to_uni
);
rock
->
f
=
table2uni
;
}
s
->
bytesleft
=
0
;
s
->
codepoint
=
0
;
s
->
mode
=
0
;
s
->
num_bits
=
0
;
rock
->
cleanup
=
table_cleanup
;
}
static
void
convert_switch
(
struct
convert_rock
*
rock
,
charset_t
to
,
int
to_uni
)
{
struct
charset_charset
*
s
=
(
struct
charset_charset
*
)
rock
->
state
;
/* make sure that the new state is sane */
assert
((
to
->
conv
==
NULL
)
!=
(
to
->
num
==
-1
));
/* flush any cached bytes in the ICU converter */
if
(
s
->
conv
)
{
icu_flush
(
rock
);
icu_reset
(
rock
,
to_uni
);
}
else
{
table_reset
(
rock
,
to_uni
);
}
/* how the new state in the pipeline */
rock
->
state
=
to
;
if
(
to
->
conv
)
{
icu_reset
(
rock
,
to_uni
);
}
else
{
table_reset
(
rock
,
to_uni
);
}
}
static
void
search_cleanup
(
struct
convert_rock
*
rock
,
int
is_free
)
{
if
(
rock
&&
rock
->
state
)
{
struct
search_state
*
s
=
(
struct
search_state
*
)
rock
->
state
;
if
(
s
->
starts
&&
!
is_free
)
{
int
i
;
for
(
i
=
0
;
i
<
s
->
max_start
;
i
++
)
{
s
->
starts
[
i
]
=
-1
;
}
}
else
free
(
s
->
starts
);
}
if
(
is_free
)
basic_free
(
rock
);
}
static
void
buffer_cleanup
(
struct
convert_rock
*
rock
,
int
is_free
)
{
if
(
rock
&&
rock
->
state
)
{
struct
buf
*
buf
=
(
struct
buf
*
)
rock
->
state
;
if
(
is_free
)
buf_free
(
buf
);
else
buf_reset
(
buf
);
}
if
(
is_free
)
basic_free
(
rock
);
}
static
void
dont_free
(
struct
convert_rock
*
rock
,
int
is_free
)
{
if
(
!
rock
||
!
is_free
)
return
;
/* NULL out state owned by caller, so we won't free in basic_free */
if
(
rock
)
rock
->
state
=
NULL
;
basic_free
(
rock
);
}
static
void
striphtml_cleanup
(
struct
convert_rock
*
rock
,
int
is_free
)
{
if
(
rock
&&
rock
->
state
)
{
struct
striphtml_state
*
s
=
(
struct
striphtml_state
*
)
rock
->
state
;
if
(
is_free
)
buf_free
(
&
s
->
name
);
else
buf_reset
(
&
s
->
name
);
}
if
(
is_free
)
basic_free
(
rock
);
}
static
void
convert_ncleanup
(
struct
convert_rock
*
rock
,
int
n
,
int
is_free
)
{
struct
convert_rock
*
next
;
int
i
=
0
;
while
(
rock
&&
(
!
n
||
(
i
++
<
n
)))
{
next
=
rock
->
next
;
if
(
rock
->
cleanup
)
rock
->
cleanup
(
rock
,
is_free
);
else
if
(
is_free
)
basic_free
(
rock
);
rock
=
next
;
}
}
#define convert_free(rock) convert_ncleanup(rock, 0, 1)
/* converter initialisation routines */
static
struct
convert_rock
*
qp_init
(
int
isheader
,
struct
convert_rock
*
next
)
{
struct
convert_rock
*
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
struct
qp_state
*
s
=
xzmalloc
(
sizeof
(
struct
qp_state
));
s
->
isheader
=
isheader
;
rock
->
state
=
(
void
*
)
s
;
rock
->
f
=
qp2byte
;
rock
->
flush
=
qp_flush
;
rock
->
next
=
next
;
return
rock
;
}
static
struct
convert_rock
*
b64_init
(
struct
convert_rock
*
next
)
{
struct
convert_rock
*
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
rock
->
state
=
xzmalloc
(
sizeof
(
struct
b64_state
));
rock
->
f
=
b64_2byte
;
rock
->
next
=
next
;
return
rock
;
}
static
struct
convert_rock
*
unfold_init
(
int
skipws
,
struct
convert_rock
*
next
)
{
struct
convert_rock
*
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
struct
unfold_state
*
s
=
xzmalloc
(
sizeof
(
struct
unfold_state
));
s
->
skipws
=
skipws
;
rock
->
state
=
s
;
rock
->
f
=
unfold2uni
;
rock
->
next
=
next
;
return
rock
;
}
static
struct
convert_rock
*
canon_init
(
int
flags
,
struct
convert_rock
*
next
)
{
struct
convert_rock
*
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
struct
canon_state
*
s
=
xzmalloc
(
sizeof
(
struct
canon_state
));
s
->
flags
=
flags
;
if
((
flags
&
CHARSET_KEEPCASE
))
rock
->
f
=
uni2html
;
else
rock
->
f
=
uni2searchform
;
rock
->
state
=
s
;
rock
->
next
=
next
;
return
rock
;
}
static
struct
convert_rock
*
convert_init
(
struct
charset_charset
*
s
,
int
to_uni
,
struct
convert_rock
*
next
)
{
struct
convert_rock
*
rock
;
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
rock
->
state
=
s
;
rock
->
next
=
next
;
/* Assert a sane state */
assert
((
s
->
conv
==
NULL
)
!=
(
s
->
num
==
-1
));
/* Initialize rock based on the converter type */
if
(
s
->
conv
)
{
icu_reset
(
rock
,
to_uni
);
}
else
{
table_reset
(
rock
,
to_uni
);
}
return
rock
;
}
static
struct
convert_rock
*
search_init
(
const
char
*
substr
,
comp_pat
*
pat
)
{
struct
convert_rock
*
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
struct
search_state
*
s
=
xzmalloc
(
sizeof
(
struct
search_state
));
struct
comp_pat_s
*
p
=
(
struct
comp_pat_s
*
)
pat
;
int
i
;
/* copy in tracking vars */
s
->
max_start
=
p
->
max_start
;
s
->
patlen
=
p
->
patlen
;
s
->
substr
=
(
unsigned
char
*
)
substr
;
/* allocate tracking space and initialise to "no match" */
s
->
starts
=
xmalloc
(
s
->
max_start
*
sizeof
(
s
->
starts
[
0
]));
for
(
i
=
0
;
i
<
s
->
max_start
;
i
++
)
{
s
->
starts
[
i
]
=
-1
;
}
/* set up the rock */
rock
->
f
=
byte2search
;
rock
->
cleanup
=
search_cleanup
;
rock
->
state
=
(
void
*
)
s
;
return
rock
;
}
static
struct
convert_rock
*
buffer_init
(
size_t
hint
)
{
struct
convert_rock
*
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
struct
buf
*
buf
=
xzmalloc
(
sizeof
(
struct
buf
));
if
(
hint
)
buf_ensure
(
buf
,
hint
);
rock
->
f
=
byte2buffer
;
rock
->
cleanup
=
buffer_cleanup
;
rock
->
state
=
(
void
*
)
buf
;
return
rock
;
}
static
struct
convert_rock
*
buffer_initm
(
size_t
hint
,
struct
buf
*
buf
)
{
struct
convert_rock
*
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
if
(
hint
)
buf_ensure
(
buf
,
hint
);
rock
->
f
=
byte2buffer
;
rock
->
cleanup
=
buffer_cleanup
;
rock
->
state
=
(
void
*
)
buf
;
rock
->
dont_free_state
=
1
;
return
rock
;
}
static
void
buffer_setbuf
(
struct
convert_rock
*
rock
,
struct
buf
*
dst
)
{
if
(
rock
->
state
)
{
buf_free
(
rock
->
state
);
free
(
rock
->
state
);
}
rock
->
state
=
dst
;
rock
->
cleanup
=
dont_free
;
}
struct
convert_rock
*
striphtml_init
(
struct
convert_rock
*
next
)
{
struct
convert_rock
*
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
struct
striphtml_state
*
s
=
xzmalloc
(
sizeof
(
struct
striphtml_state
));
/* gnb:TODO: if a DOCTYPE is present, sniff it to detect XHTML rules */
html_push
(
s
,
HDATA
);
rock
->
state
=
(
void
*
)
s
;
rock
->
f
=
striphtml2uni
;
rock
->
cleanup
=
striphtml_cleanup
;
rock
->
next
=
next
;
return
rock
;
}
struct
unorm_state
{
const
UNormalizer2
*
unorm
;
UChar
*
u16buf
;
int32_t
u16cap
;
UChar32
*
u32buf
;
int32_t
u32cap
;
int32_t
u32len
;
int32_t
spanlen
;
};
static
void
unorm_append
(
struct
unorm_state
*
st
,
uint32_t
c
)
{
if
(
st
->
u32len
==
st
->
u32cap
)
{
st
->
u32cap
+=
8
;
st
->
u32buf
=
xrealloc
(
st
->
u32buf
,
sizeof
(
UChar32
)
*
st
->
u32cap
);
}
if
(
!
st
->
spanlen
&&
!
unorm2_getCombiningClass
(
st
->
unorm
,
c
))
{
/* End of the first span of composable codepoints */
st
->
spanlen
=
st
->
u32len
;
}
st
->
u32buf
[
st
->
u32len
++
]
=
c
;
}
static
void
unorm_drain
(
struct
convert_rock
*
rock
,
int
is_flush
)
{
struct
unorm_state
*
st
=
rock
->
state
;
/* Have we reached the end of a composable span? */
if
(
!
st
->
spanlen
)
{
if
(
!
is_flush
)
{
return
;
}
st
->
spanlen
=
st
->
u32len
;
}
if
(
!
st
->
spanlen
)
return
;
/* Insertion-sort span by combining class */
int
i
;
for
(
i
=
1
;
i
<
st
->
spanlen
;
i
++
)
{
UChar32
c
=
st
->
u32buf
[
i
];
int
j
=
i
-
1
;
while
(
j
>=
0
)
{
if
(
unorm2_getCombiningClass
(
st
->
unorm
,
st
->
u32buf
[
j
])
<=
unorm2_getCombiningClass
(
st
->
unorm
,
c
))
{
break
;
}
st
->
u32buf
[
j
+
1
]
=
st
->
u32buf
[
j
];
j
=
j
-
1
;
}
st
->
u32buf
[
j
+
1
]
=
c
;
}
/* Emit composed codepoints in span */
UChar32
u1
=
st
->
u32buf
[
0
];
for
(
i
=
1
;
i
<
st
->
spanlen
;
i
++
)
{
UChar32
u2
=
unorm2_composePair
(
st
->
unorm
,
u1
,
st
->
u32buf
[
i
]);
if
(
u2
<
0
)
{
convert_putc
(
rock
->
next
,
u1
);
u1
=
st
->
u32buf
[
i
];
}
else
u1
=
u2
;
}
convert_putc
(
rock
->
next
,
u1
);
/* Keep any remaining code points */
int
j
;
for
(
i
=
0
,
j
=
st
->
spanlen
;
j
<
st
->
u32len
;
j
++
)
{
st
->
u32buf
[
i
++
]
=
st
->
u32buf
[
j
];
}
st
->
u32len
-=
st
->
spanlen
;
st
->
spanlen
=
0
;
for
(
i
=
0
;
i
<
st
->
u32len
;
i
++
)
{
if
(
!
unorm2_getCombiningClass
(
st
->
unorm
,
st
->
u32buf
[
i
]))
{
st
->
spanlen
=
i
;
break
;
}
}
/* In case of flush, drain all we got */
if
(
is_flush
&&
st
->
u32len
)
{
unorm_drain
(
rock
,
is_flush
);
}
}
static
void
unorm_cleanup
(
struct
convert_rock
*
rock
,
int
is_free
)
{
if
(
!
rock
||
!
rock
->
state
)
return
;
struct
unorm_state
*
st
=
rock
->
state
;
if
(
is_free
)
{
free
(
st
->
u16buf
);
free
(
st
->
u32buf
);
free
(
st
);
free
(
rock
);
}
else
{
int32_t
i
;
for
(
i
=
0
;
i
<
st
->
u16cap
;
i
++
)
{
st
->
u16buf
[
i
]
=
0
;
}
for
(
i
=
0
;
i
<
st
->
u32cap
;
i
++
)
{
st
->
u32buf
[
i
]
=
0
;
}
st
->
u32len
=
0
;
}
}
static
void
unorm_flush
(
struct
convert_rock
*
rock
)
{
unorm_drain
(
rock
,
1
);
}
static
void
unorm_convert
(
struct
convert_rock
*
rock
,
uint32_t
c
)
{
struct
unorm_state
*
st
=
rock
->
state
;
UErrorCode
err
=
U_ZERO_ERROR
;
int32_t
len
=
unorm2_getDecomposition
(
st
->
unorm
,
c
,
NULL
,
0
,
&
err
);
if
(
len
>
0
)
{
/* Decompose c into NFD */
if
(
len
>
st
->
u16cap
)
{
st
->
u16buf
=
xrealloc
(
st
->
u16buf
,
sizeof
(
UChar
)
*
len
);
st
->
u16cap
=
len
;
}
err
=
U_ZERO_ERROR
;
unorm2_getDecomposition
(
st
->
unorm
,
c
,
st
->
u16buf
,
st
->
u16cap
,
&
err
);
/* Append NFD codepoints */
if
(
U_SUCCESS
(
err
))
{
int32_t
i
=
0
;
while
(
i
<
len
)
{
U16_NEXT
(
st
->
u16buf
,
i
,
len
,
c
);
unorm_append
(
st
,
c
);
}
}
}
if
(
len
<
0
||
U_FAILURE
(
err
))
{
/* Append verbatim */
unorm_append
(
st
,
c
);
}
unorm_drain
(
rock
,
0
);
}
static
struct
convert_rock
*
unorm_init
(
struct
convert_rock
*
next
)
{
struct
convert_rock
*
rock
=
xzmalloc
(
sizeof
(
struct
convert_rock
));
struct
unorm_state
*
st
=
xzmalloc
(
sizeof
(
struct
unorm_state
));
UErrorCode
err
=
U_ZERO_ERROR
;
st
->
unorm
=
unorm2_getNFCInstance
(
&
err
);
assert
(
U_SUCCESS
(
err
));
st
->
u16cap
=
8
;
st
->
u16buf
=
xmalloc
(
sizeof
(
UChar
)
*
st
->
u16cap
);
st
->
u32cap
=
8
;
st
->
u32buf
=
xmalloc
(
sizeof
(
UChar32
)
*
st
->
u32cap
);
rock
->
f
=
unorm_convert
;
rock
->
flush
=
unorm_flush
;
rock
->
cleanup
=
unorm_cleanup
;
rock
->
next
=
next
;
rock
->
state
=
st
;
return
rock
;
}
static
char
*
convert_to_name
(
const
char
*
to
,
charset_t
charset
,
const
char
*
src
,
size_t
len
)
{
UErrorCode
err
=
U_ZERO_ERROR
;
const
char
*
from
;
char
*
res
=
NULL
;
size_t
n
;
/* determine the name of the source encoding */
from
=
charset_canon_name
(
charset
);
/* allocate the target buffer */
/* we preflight to compromise between memory and runtime efficiency */
n
=
ucnv_convert
(
to
,
from
,
res
,
0
,
src
,
len
,
&
err
)
+
1
;
if
(
err
!=
U_BUFFER_OVERFLOW_ERROR
)
return
NULL
;
res
=
xmalloc
(
n
);
/* run the conversion */
err
=
U_ZERO_ERROR
;
ucnv_convert
(
to
,
from
,
res
,
n
,
src
,
len
,
&
err
);
if
(
U_FAILURE
(
err
))
{
free
(
res
);
return
NULL
;
}
return
res
;
}
static
charset_t
lookup_buf
(
const
char
*
buf
,
size_t
len
)
{
char
*
name
=
xstrndup
(
buf
,
len
);
charset_t
cs
=
charset_lookupname
(
name
);
free
(
name
);
return
cs
;
}
/* RFC 2047: In this case the set of characters that may be used in a “Q”-encoded
‘encoded-word’ is restricted to: <upper and lower case ASCII
letters, decimal digits, "!", "*", "+", "-", "/", "=", and "_" */
/* of course = and _ are not included in the set, because they themselves
need to be quoted it’s just saying they can be present in the Q wordi
itself, because they’re part of the quoting system */
char
QPMIMEPHRASESAFECHAR
[
256
]
=
{
/* control chars are unsafe */
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
/* ASCII map... */
/* ! " # $ % & ' ( ) * + , - . / */
0
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
1
,
0
,
1
,
0
,
1
,
/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
0
,
0
,
0
,
0
,
0
,
0
,
/* @ A B C D E F G H I J K L M N O */
0
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
/* P Q R S T U V W X Y Z [ \\ ] ^ _ */
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
0
,
0
,
0
,
0
,
0
,
/* ` a b c d e f g h i j k l m n o */
0
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
/* p q r s t u v w x y z { | } ~ DEL */
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
0
,
0
,
0
,
0
,
0
,
/* all high bits are unsafe */
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
};
/* API */
EXPORTED
const
char
*
charset_alias_name
(
charset_t
cs
)
{
if
(
cs
&&
cs
->
alias_name
)
{
return
cs
->
alias_name
;
}
return
charset_canon_name
(
cs
);
}
/*
* Return the name of the given character set number, or "unknown" if
* not known.
*/
EXPORTED
const
char
*
charset_canon_name
(
charset_t
cs
)
{
if
(
!
cs
)
goto
done
;
if
(
cs
->
canon_name
)
return
cs
->
canon_name
;
if
(
cs
->
conv
)
{
UErrorCode
err
=
U_ZERO_ERROR
;
const
char
*
name
=
ucnv_getName
(
cs
->
conv
,
&
err
);
if
(
U_SUCCESS
(
err
))
return
name
;
}
else
if
(
cs
->
num
>=
0
&&
cs
->
num
<
chartables_num_charsets
)
{
return
chartables_charset_table
[
cs
->
num
].
name
;
}
done
:
return
"unknown"
;
}
/*
* Lookup the character set 'name'. Returns the character set
* or CHARSET_UNKNOWN_CHARSET if there is no matching character set.
*/
EXPORTED
charset_t
charset_lookupname
(
const
char
*
name
)
{
int
i
;
struct
charset_charset
*
cs
;
UErrorCode
err
;
UConverter
*
conv
;
/* create the converter */
cs
=
xzmalloc
(
sizeof
(
struct
charset_charset
));
cs
->
num
=
-1
;
if
(
!
name
)
{
cs
->
num
=
0
;
// us-ascii
return
cs
;
}
cs
->
alias_name
=
xstrdup
(
name
);
/* translate alias to canonical name */
for
(
i
=
0
;
charset_aliases
[
i
].
name
;
i
++
)
{
if
(
!
strcasecmp
(
name
,
charset_aliases
[
i
].
name
))
{
cs
->
canon_name
=
xstrdup
(
charset_aliases
[
i
].
canon_name
);
break
;
}
}
if
(
!
cs
->
canon_name
)
{
/* otherwise use canonical name, if defined */
for
(
i
=
0
;
charset_aliases
[
i
].
name
;
i
++
)
{
if
(
!
strcasecmp
(
name
,
charset_aliases
[
i
].
canon_name
))
{
cs
->
canon_name
=
xstrdup
(
charset_aliases
[
i
].
canon_name
);
break
;
}
}
}
/* Is it a table based lookup, or UTF-8? */
if
(
cs
->
canon_name
)
{
for
(
i
=
0
;
i
<
chartables_num_charsets
;
i
++
)
{
if
(
!
strcasecmp
(
cs
->
canon_name
,
chartables_charset_table
[
i
].
name
))
{
if
((
chartables_charset_table
[
i
].
table
)
||
!
strcmp
(
cs
->
canon_name
,
"utf-8"
))
{
cs
->
num
=
i
;
return
cs
;
}
}
}
}
/* Otherwise, let's see if we can fallback to ICU */
err
=
U_ZERO_ERROR
;
conv
=
ucnv_open
(
cs
->
canon_name
?
cs
->
canon_name
:
name
,
&
err
);
if
(
U_SUCCESS
(
err
))
{
cs
->
conv
=
conv
;
return
cs
;
}
/* Still here? This means we don't know this charset name */
free
(
cs
->
alias_name
);
free
(
cs
->
canon_name
);
free
(
cs
);
return
CHARSET_UNKNOWN_CHARSET
;
}
EXPORTED
void
charset_free
(
charset_t
*
charsetp
)
{
if
(
charsetp
&&
*
charsetp
!=
CHARSET_UNKNOWN_CHARSET
)
{
struct
charset_charset
*
s
=
*
charsetp
;
/* Close the ICU converter */
if
(
s
->
conv
)
ucnv_close
(
s
->
conv
);
/* Free up memory. */
if
(
s
->
buf
)
free
(
s
->
buf
);
free
(
s
->
alias_name
);
free
(
s
->
canon_name
);
/* Release the converter */
free
(
s
);
*
charsetp
=
CHARSET_UNKNOWN_CHARSET
;
}
}
/* Lookup charset for the legacy numeric charset identifier id */
EXPORTED
charset_t
charset_lookupnumid
(
int
id
)
{
if
(
id
<
0
||
id
>=
chartables_num_charsets
)
return
CHARSET_UNKNOWN_CHARSET
;
if
(
!
chartables_charset_table
[
id
].
name
)
return
CHARSET_UNKNOWN_CHARSET
;
return
charset_lookupname
(
chartables_charset_table
[
id
].
name
);
}
struct
charset_conv
{
struct
convert_rock
*
input
;
charset_t
charset
;
charset_t
utf8
;
struct
buf
dst
;
};
EXPORTED
charset_conv_t
*
charset_conv_new
(
charset_t
charset
,
int
flags
)
{
struct
charset_conv
*
conv
=
xzmalloc
(
sizeof
(
struct
charset_conv
));
conv
->
charset
=
charset
;
conv
->
utf8
=
charset_lookupname
(
"utf-8"
);
/* set up the conversion path */
struct
convert_rock
*
input
,
*
tobuffer
;
tobuffer
=
buffer_initm
(
0
,
&
conv
->
dst
);
input
=
convert_init
(
conv
->
utf8
,
0
/*to_uni*/
,
tobuffer
);
input
=
canon_init
(
flags
,
input
);
if
(
flags
&
CHARSET_UNORM_NFC
)
{
input
=
unorm_init
(
input
);
}
input
=
convert_init
(
conv
->
charset
,
1
/*to_uni*/
,
input
);
conv
->
input
=
input
;
return
conv
;
}
EXPORTED
const
char
*
charset_conv_convert
(
charset_conv_t
*
conv
,
const
char
*
s
)
{
if
(
!
s
)
return
NULL
;
convert_ncleanup
(
conv
->
input
,
0
,
0
);
buf_reset
(
&
conv
->
dst
);
if
(
conv
->
charset
==
CHARSET_UNKNOWN_CHARSET
)
buf_setcstr
(
&
conv
->
dst
,
"X"
);
else
convert_cat
(
conv
->
input
,
s
);
return
buf_cstring
(
&
conv
->
dst
);
}
EXPORTED
void
charset_conv_free
(
charset_conv_t
**
convp
)
{
if
(
!
convp
||
!*
convp
)
return
;
charset_conv_t
*
conv
=
*
convp
;
convert_free
(
conv
->
input
);
charset_free
(
&
conv
->
utf8
);
buf_free
(
&
conv
->
dst
);
free
(
conv
);
*
convp
=
NULL
;
}
/*
* Convert the string 's' in the character set numbered 'charset'
* into canonical searching form. Returns a newly allocated string
* which must be free()d by the caller.
*/
EXPORTED
char
*
charset_convert
(
const
char
*
s
,
charset_t
charset
,
int
flags
)
{
charset_conv_t
*
conv
=
charset_conv_new
(
charset
,
flags
);
char
*
ret
=
NULL
;
if
(
charset_conv_convert
(
conv
,
s
))
{
ret
=
buf_release
(
&
conv
->
dst
);
}
charset_conv_free
(
&
conv
);
return
ret
;
}
/* Convert from a given charset and encoding into IMAP UTF-7 */
EXPORTED
char
*
charset_to_imaputf7
(
const
char
*
msg_base
,
size_t
len
,
charset_t
charset
,
int
encoding
)
{
struct
convert_rock
*
input
,
*
tobuffer
;
char
*
res
;
charset_t
imaputf7
;
/* Initialize character set mapping */
if
(
charset
==
CHARSET_UNKNOWN_CHARSET
)
return
0
;
/* check for trivial case */
if
(
len
==
0
)
return
xstrdup
(
""
);
/* check if we can convert the whole block at once */
if
(
encoding
==
ENCODING_NONE
)
return
convert_to_name
(
"imap-mailbox-name"
,
charset
,
msg_base
,
len
);
/* set up the conversion path */
imaputf7
=
charset_lookupname
(
"imap-mailbox-name"
);
tobuffer
=
buffer_init
(
len
);
input
=
convert_init
(
imaputf7
,
0
/*to_uni*/
,
tobuffer
);
input
=
convert_init
(
charset
,
1
/*to_uni*/
,
input
);
/* choose encoding extraction if needed */
switch
(
encoding
)
{
case
ENCODING_NONE
:
break
;
case
ENCODING_QP
:
input
=
qp_init
(
0
,
input
);
break
;
case
ENCODING_BASE64
:
input
=
b64_init
(
input
);
/* XXX have to have nl-mapping base64 in order to
* properly count \n as 2 raw characters
*/
break
;
default
:
/* Don't know encoding--nothing can match */
convert_free
(
input
);
charset_free
(
&
imaputf7
);
return
0
;
}
/* do the conversion */
convert_catn
(
input
,
msg_base
,
len
);
/* extract the result */
res
=
buffer_cstring
(
tobuffer
);
/* clean up */
convert_free
(
input
);
charset_free
(
&
imaputf7
);
return
res
;
}
EXPORTED
char
*
charset_utf8_to_searchform
(
const
char
*
s
,
int
flags
)
{
charset_t
utf8
=
charset_lookupname
(
"utf-8"
);
char
*
ret
=
charset_convert
(
s
,
utf8
,
flags
);
charset_free
(
&
utf8
);
return
ret
;
}
EXPORTED
char
*
charset_utf8_normalize
(
const
char
*
src
)
{
int32_t
srclen
=
strlen
(
src
);
UChar
*
uni
=
NULL
;
int32_t
unilen
=
0
;
UChar
*
nfc
=
NULL
;
int32_t
nfclen
=
0
;
char
*
ret
=
NULL
;
int32_t
retlen
=
0
;
/* Fast-path for ASCII.
* Unicode Standard Annex #15, section 1.3: "Text exclusively
* containing ASCII characters (U+0000..U+007F) is left
* unaffected by all of the Normalization Forms."
* See http://www.unicode.org/reports/tr15/#Description_Norm
* */
const
char
*
p
;
for
(
p
=
src
;
*
p
&&
isascii
(
*
p
);
p
++
)
{}
if
(
*
p
==
'\0'
)
{
return
xstrdup
(
src
);
}
/* Convert the UTF-8 string to UChar */
UErrorCode
err
=
U_ZERO_ERROR
;
u_strFromUTF8
(
uni
,
unilen
,
&
unilen
,
src
,
srclen
,
&
err
);
if
(
U_FAILURE
(
err
)
&&
err
!=
U_BUFFER_OVERFLOW_ERROR
)
{
goto
done
;
}
err
=
U_ZERO_ERROR
;
unilen
++
;
uni
=
xzmalloc
(
unilen
*
sizeof
(
UChar
));
u_strFromUTF8
(
uni
,
unilen
,
&
unilen
,
src
,
srclen
,
&
err
);
if
(
U_FAILURE
(
err
))
{
goto
done
;
}
/* Normalize the UChars to NFC */
err
=
U_ZERO_ERROR
;
const
UNormalizer2
*
norm
=
unorm2_getNFCInstance
(
&
err
);
if
(
!
norm
||
U_FAILURE
(
err
))
{
goto
done
;
}
/* Quick-check if the Unicode string requires normalization.
* Skip normalization only if libicu is certain not to need
* to do anything. */
if
(
unorm2_quickCheck
(
norm
,
uni
,
unilen
,
&
err
)
==
UNORM_YES
)
{
nfc
=
uni
;
nfclen
=
unilen
;
uni
=
NULL
;
}
else
{
err
=
U_ZERO_ERROR
;
nfclen
=
unorm2_normalize
(
norm
,
uni
,
unilen
,
nfc
,
0
,
&
err
)
+
1
;
if
(
U_FAILURE
(
err
)
&&
err
!=
U_BUFFER_OVERFLOW_ERROR
)
{
goto
done
;
}
err
=
U_ZERO_ERROR
;
nfclen
++
;
nfc
=
xzmalloc
(
nfclen
*
sizeof
(
UChar
));
unorm2_normalize
(
norm
,
uni
,
unilen
,
nfc
,
nfclen
,
&
err
);
if
(
U_FAILURE
(
err
))
{
goto
done
;
}
free
(
uni
);
uni
=
NULL
;
}
/* Convert the NFC UChars back to UTF-8 */
err
=
U_ZERO_ERROR
;
u_strToUTF8
(
ret
,
0
,
&
retlen
,
nfc
,
nfclen
,
&
err
);
if
(
U_FAILURE
(
err
)
&&
err
!=
U_BUFFER_OVERFLOW_ERROR
)
{
goto
done
;
}
err
=
U_ZERO_ERROR
;
retlen
++
;
ret
=
xzmalloc
(
retlen
);
u_strToUTF8
(
ret
,
retlen
,
&
retlen
,
nfc
,
nfclen
,
&
err
);
if
(
U_FAILURE
(
err
)
&&
err
!=
U_BUFFER_OVERFLOW_ERROR
)
{
goto
done
;
}
done
:
free
(
uni
);
free
(
nfc
);
return
ret
;
}
/* Convert from a given charset and encoding into utf8 */
EXPORTED
char
*
charset_to_utf8
(
const
char
*
msg_base
,
size_t
len
,
charset_t
charset
,
int
encoding
)
{
struct
convert_rock
*
input
,
*
tobuffer
;
char
*
res
;
charset_t
utf8
;
/* Initialize character set mapping */
if
(
charset
==
CHARSET_UNKNOWN_CHARSET
)
return
NULL
;
/* check for trivial search */
if
(
len
==
0
)
return
xstrdup
(
""
);
/* check if we can convert the whole block at once */
if
(
encoding
==
ENCODING_NONE
)
return
convert_to_name
(
"utf-8"
,
charset
,
msg_base
,
len
);
/* set up the conversion path */
utf8
=
charset_lookupname
(
"utf-8"
);
tobuffer
=
buffer_init
(
len
);
input
=
convert_init
(
utf8
,
0
/*to_uni*/
,
tobuffer
);
input
=
convert_init
(
charset
,
1
/*to_uni*/
,
input
);
/* choose encoding extraction if needed */
switch
(
encoding
)
{
case
ENCODING_NONE
:
break
;
case
ENCODING_QP
:
input
=
qp_init
(
0
,
input
);
break
;
case
ENCODING_BASE64
:
input
=
b64_init
(
input
);
/* XXX have to have nl-mapping base64 in order to
* properly count \n as 2 raw characters
*/
break
;
default
:
/* Don't know encoding--nothing can match */
convert_free
(
input
);
charset_free
(
&
utf8
);
return
0
;
}
convert_catn
(
input
,
msg_base
,
len
);
res
=
buffer_cstring
(
tobuffer
);
convert_free
(
input
);
charset_free
(
&
utf8
);
return
res
;
}
/* Decode bytes from src into buffer dst */
EXPORTED
int
charset_decode
(
struct
buf
*
dst
,
const
char
*
src
,
size_t
len
,
int
encoding
)
{
struct
convert_rock
*
input
;
buf_reset
(
dst
);
/* check for trivial decode */
if
(
len
==
0
||
encoding
==
ENCODING_NONE
)
{
buf_setmap
(
dst
,
src
,
len
);
return
0
;
}
/* set up the conversion path */
input
=
buffer_init
(
len
);
buffer_setbuf
(
input
,
dst
);
/* choose encoding extraction if needed */
switch
(
encoding
)
{
case
ENCODING_NONE
:
break
;
case
ENCODING_QP
:
input
=
qp_init
(
0
,
input
);
break
;
case
ENCODING_BASE64
:
input
=
b64_init
(
input
);
/* XXX have to have nl-mapping base64 in order to
* properly count \n as 2 raw characters
*/
break
;
default
:
/* Don't know encoding--nothing can match */
convert_free
(
input
);
return
-1
;
}
convert_catn
(
input
,
src
,
len
);
convert_free
(
input
);
return
0
;
}
static
void
mimeheader_cat
(
struct
convert_rock
*
target
,
const
char
*
s
,
int
flags
)
{
struct
convert_rock
*
input
,
*
unfold
;
int
eatspace
=
0
;
const
char
*
start
,
*
endcharset
,
*
encoding
,
*
end
;
int
len
;
const
char
*
p
;
charset_t
defaultcs
,
cs
;
if
(
!
s
)
return
;
/* Keep track of the decoding pipeline before the current
* encoded-word. This allows to share decoding state for
* multi-octet characters that are broken across words. */
int
lastenc
=
ENCODING_UNKNOWN
;
charset_t
lastcs
=
CHARSET_UNKNOWN_CHARSET
;
struct
convert_rock
*
extract
=
NULL
;
/* set up the conversion path */
if
(
flags
&
CHARSET_MIME_UTF8
)
{
defaultcs
=
charset_lookupname
(
"utf-8"
);
}
else
{
defaultcs
=
charset_lookupname
(
"us-ascii"
);
}
input
=
convert_init
(
defaultcs
,
1
/*to_uni*/
,
target
);
/* note: we assume the caller of this function has already
* determined that all newlines are followed by whitespace */
unfold
=
unfold_init
(
0
/*skipws*/
,
input
);
start
=
s
;
while
((
start
=
(
const
char
*
)
strchr
(
start
,
'='
))
!=
0
)
{
start
++
;
if
(
*
start
!=
'?'
)
continue
;
endcharset
=
NULL
;
for
(
p
=
start
+
1
;
*
p
&&
*
p
!=
'='
&&
*
p
!=
'?'
;
++
p
)
if
(
'*'
==
*
p
&&
!
endcharset
)
endcharset
=
p
;
if
(
*
p
!=
'?'
)
{
start
=
p
;
continue
;
}
encoding
=
p
;
if
(
!
endcharset
)
endcharset
=
p
;
if
((
encoding
[
1
]
!=
'b'
&&
encoding
[
1
]
!=
'B'
&&
encoding
[
1
]
!=
'q'
&&
encoding
[
1
]
!=
'Q'
)
||
(
encoding
[
2
]
!=
'?'
))
{
start
=
p
;
continue
;
}
for
(
p
=
encoding
+
3
;
*
p
&&
*
p
!=
'?'
;
++
p
)
if
(
'='
==
*
p
&&
'?'
==
p
[
1
]
&&
'='
!=
p
[
2
])
break
;
if
(
*
p
!=
'?'
||
p
[
1
]
!=
'='
)
{
start
=
p
;
continue
;
}
end
=
p
;
if
(
!
end
||
end
[
1
]
!=
'='
)
continue
;
/*
* We have recognized a valid 1522-word.
* Copy over leading text, unless it consists entirely of
* whitespace and is between two 1522-words.
*/
if
(
eatspace
)
{
for
(
p
=
s
;
p
<
(
start
-1
)
&&
Uisspace
(
*
p
);
p
++
);
if
(
p
<
(
start
-1
))
eatspace
=
0
;
}
if
(
!
eatspace
)
{
len
=
start
-
s
-
1
;
convert_switch
(
input
,
defaultcs
,
1
/*to_uni*/
);
convert_catn
(
unfold
,
s
,
len
);
/* Reset decoder pipeline */
charset_free
(
&
lastcs
);
lastenc
=
ENCODING_UNKNOWN
;
basic_free
(
extract
);
extract
=
NULL
;
}
/*
* Get the 1522-word's character set
*/
start
++
;
cs
=
lookup_buf
(
start
,
endcharset
-
start
);
int
enc
=
encoding
[
1
]
==
'q'
||
encoding
[
1
]
==
'Q'
?
ENCODING_QP
:
ENCODING_BASE64
;
if
(
cs
==
CHARSET_UNKNOWN_CHARSET
)
{
/* Unrecognized charset, nothing will match here */
convert_putc
(
input
,
U_REPLACEMENT
);
/* unknown character */
charset_free
(
&
cs
);
convert_switch
(
input
,
defaultcs
,
1
/*to_uni*/
);
/* Reset decoder pipeline */
charset_free
(
&
lastcs
);
lastcs
=
CHARSET_UNKNOWN_CHARSET
;
lastenc
=
ENCODING_UNKNOWN
;
basic_free
(
extract
);
extract
=
NULL
;
}
else
if
(
!
strcmp
(
charset_canon_name
(
cs
),
charset_canon_name
(
lastcs
))
&&
enc
==
lastenc
&&
enc
)
{
/* Reuse the previous decoder */
charset_free
(
&
cs
);
p
=
encoding
+
3
;
convert_catn
(
extract
,
p
,
end
-
p
);
}
else
{
/* Reset the previous decoder and start a new decoding pipeline */
convert_switch
(
input
,
cs
,
1
/*to_uni*/
);
charset_free
(
&
lastcs
);
lastcs
=
CHARSET_UNKNOWN_CHARSET
;
lastenc
=
ENCODING_UNKNOWN
;
basic_free
(
extract
);
extract
=
NULL
;
/* choose decoder */
if
(
enc
==
ENCODING_QP
)
{
extract
=
qp_init
(
1
,
input
);
}
else
{
extract
=
b64_init
(
input
);
}
/* convert */
p
=
encoding
+
3
;
convert_catn
(
extract
,
p
,
end
-
p
);
lastcs
=
cs
;
lastenc
=
enc
;
}
/* Prepare for the next iteration */
s
=
start
=
end
+
2
;
eatspace
=
1
;
}
/* Copy over the tail part of the input string */
if
(
*
s
)
{
convert_switch
(
input
,
defaultcs
,
1
/*to_uni*/
);
/* US_ASCII */
convert_cat
(
unfold
,
s
);
}
/* just free the first two items, the rest can be cleaned up by the sender */
basic_free
(
unfold
);
convert_ncleanup
(
input
,
1
,
1
);
charset_free
(
&
defaultcs
);
charset_free
(
&
lastcs
);
basic_free
(
extract
);
}
/*
* Decode MIME strings (per RFC 2047) in 's'. Returns a newly allocated
* string, containing 's' in canonical searching form, which must be
* free()d by the caller.
*/
EXPORTED
char
*
charset_decode_mimeheader
(
const
char
*
s
,
int
flags
)
{
struct
convert_rock
*
tobuffer
,
*
input
;
char
*
res
;
charset_t
utf8
;
if
(
!
s
)
return
NULL
;
utf8
=
charset_lookupname
(
"utf-8"
);
tobuffer
=
buffer_init
(
0
);
input
=
convert_init
(
utf8
,
0
/*to_uni*/
,
tobuffer
);
input
=
canon_init
(
flags
,
input
);
mimeheader_cat
(
input
,
s
,
flags
);
res
=
buffer_cstring
(
tobuffer
);
convert_free
(
input
);
charset_free
(
&
utf8
);
return
res
;
}
/*
* Unfold len bytes of string s into a new string, which must be freed()
* by the caller. Unfolding removes any CRLF that is immediately followed
* by a tab or space character. If flags sets CHARSET_UNFOLD_SKIPWS, then
* the whitespace character is also omitted.
*/
EXPORTED
char
*
charset_unfold
(
const
char
*
s
,
size_t
len
,
int
flags
)
{
struct
convert_rock
*
tobuffer
,
*
input
;
char
*
res
;
if
(
!
s
)
return
NULL
;
tobuffer
=
buffer_init
(
len
);
input
=
unfold_init
(
flags
&
CHARSET_UNFOLD_SKIPWS
,
tobuffer
);
convert_catn
(
input
,
s
,
len
);
res
=
buffer_cstring
(
tobuffer
);
convert_free
(
input
);
return
res
;
}
/*
* Decode MIME strings (per RFC 2047) in 's'. Returns a newly allocated
* string, containing the decoded string, which must be free()d by the
* caller.
*/
EXPORTED
char
*
charset_parse_mimeheader
(
const
char
*
s
,
int
flags
)
{
struct
convert_rock
*
tobuffer
,
*
input
;
char
*
res
;
charset_t
utf8
;
if
(
!
s
)
return
NULL
;
utf8
=
charset_lookupname
(
"utf-8"
);
tobuffer
=
buffer_init
(
0
);
input
=
convert_init
(
utf8
,
0
/*to_uni*/
,
tobuffer
);
mimeheader_cat
(
input
,
s
,
flags
);
if
(
flags
&
CHARSET_TRIMWS
)
buffer_trim
(
tobuffer
);
res
=
buffer_cstring
(
tobuffer
);
convert_free
(
input
);
charset_free
(
&
utf8
);
return
res
;
}
/* Decode extended MIME values (per RFC 2231). Returns a newly allocated UTF-8
* encoded string, containing the decoded string, which must be free()d by the
* caller.
*
* If lang is not NULL, sets the buffer contents to the 'language' * field as
* encoded in the MIME value.
*
* If s can't be decoded to an extended value as defined in RFC 2231, then
* return NULL. E.g. it does not attempt to decode the value as RFC 2047 MIME
* header, nor returns the value verbatim. */
EXPORTED
char
*
charset_parse_mimexvalue
(
const
char
*
s
,
struct
buf
*
lang
)
{
if
(
!
s
)
return
NULL
;
const
char
*
p
,
*
q
;
struct
buf
buf
=
BUF_INITIALIZER
;
charset_t
cs
;
char
*
ret
=
NULL
;
/* Determine charset */
p
=
s
;
q
=
strchr
(
p
,
'\''
);
if
(
!
q
)
return
NULL
;
buf_setmap
(
&
buf
,
p
,
q
-
p
);
cs
=
charset_lookupname
(
buf_cstring
(
&
buf
));
if
(
cs
==
CHARSET_UNKNOWN_CHARSET
)
goto
done
;
/* Determine language */
p
=
q
+
1
;
q
=
strchr
(
p
,
'\''
);
if
(
!
q
)
goto
done
;
if
(
lang
)
{
buf_setmap
(
lang
,
p
,
q
-
p
);
}
/* Decode octects */
buf_reset
(
&
buf
);
p
=
q
+
1
;
while
(
*
p
)
{
if
(
*
p
==
'%'
)
{
char
c
;
if
(
*
(
p
+
1
)
==
0
||
*
(
p
+
2
)
==
0
)
goto
done
;
if
(
hex_to_bin
(
p
+
1
,
2
,
&
c
)
==
-1
)
goto
done
;
buf_appendmap
(
&
buf
,
&
c
,
1
);
p
+=
3
;
}
else
{
buf_appendmap
(
&
buf
,
p
++
,
1
);
}
}
ret
=
charset_to_utf8
(
buf_base
(
&
buf
),
buf_len
(
&
buf
),
cs
,
0
);
done
:
charset_free
(
&
cs
);
buf_free
(
&
buf
);
return
ret
;
}
/* Encode UTF-8 encoded string s as extended MIME RFC 2231 value.
* If lang is non-NULL, set the extended value language property
* accordingly. The returned string must be free()d by caller. */
EXPORTED
char
*
charset_encode_mimexvalue
(
const
char
*
s
,
const
char
*
lang
)
{
const
unsigned
char
*
p
;
struct
buf
buf
=
BUF_INITIALIZER
;
if
(
!
s
)
return
NULL
;
buf_printf
(
&
buf
,
"utf-8'%s'"
,
lang
?
lang
:
""
);
for
(
p
=
(
const
unsigned
char
*
)
s
;
*
p
;
p
++
)
{
if
((
*
p
>=
'0'
&&
*
p
<=
'9'
)
||
(
*
p
>=
'a'
&&
*
p
<=
'z'
)
||
(
*
p
>=
'A'
&&
*
p
<=
'Z'
)
||
(
strchr
(
"!#$&+-.^_`|~"
,
*
p
)))
{
// Safe attr char
buf_putc
(
&
buf
,
*
p
);
}
else
buf_printf
(
&
buf
,
"%%%X%X"
,
*
p
>>
4
,
*
p
&
0xf
);
}
return
buf_release
(
&
buf
);
}
EXPORTED
int
charset_search_mimeheader
(
const
char
*
substr
,
comp_pat
*
pat
,
const
char
*
s
,
int
flags
)
{
struct
convert_rock
*
input
,
*
tosearch
;
int
res
;
charset_t
utf8
=
charset_lookupname
(
"utf-8"
);
tosearch
=
search_init
(
substr
,
pat
);
input
=
convert_init
(
utf8
,
0
/*to_uni*/
,
tosearch
);
input
=
canon_init
(
flags
,
input
);
mimeheader_cat
(
input
,
s
,
flags
);
res
=
search_havematch
(
tosearch
);
convert_free
(
input
);
charset_free
(
&
utf8
);
return
res
;
}
/* Compile a search pattern for later comparison. We just count
* how long the string is, and how many times the first character
* occurs. Later optimisation could reduce the max_start by
* deeper analysis of the possible paths through the string, but
* this is a good absolute maximum, and it just means a few more
* bytes get allocated... */
EXPORTED
comp_pat
*
charset_compilepat
(
const
char
*
s
)
{
struct
comp_pat_s
*
pat
=
xzmalloc
(
sizeof
(
struct
comp_pat_s
));
const
char
*
p
=
s
;
/* count occurrences */
while
(
*
p
)
{
if
(
*
p
==
*
s
)
pat
->
max_start
++
;
pat
->
patlen
++
;
p
++
;
}
return
(
comp_pat
*
)
pat
;
}
/*
* Free the compiled pattern 'pat'
*/
EXPORTED
void
charset_freepat
(
comp_pat
*
pat
)
{
free
((
struct
comp_pat_s
*
)
pat
);
}
/*
* Search for the string 'substr', with compiled pattern 'pat'
* in the string 's', with length 'len'. Return nonzero if match
*
* Uses the to_search target directly. Assumes 's' is already
* in search normal form (i.e. from a cache file)
*/
EXPORTED
int
charset_searchstring
(
const
char
*
substr
,
comp_pat
*
pat
,
const
char
*
s
,
size_t
len
,
int
flags
)
{
if
(
!
substr
[
0
])
return
1
;
/* zero length string always matches */
struct
convert_rock
*
tosearch
;
struct
convert_rock
*
input
;
int
res
;
charset_t
utf8from
,
utf8to
;
utf8from
=
charset_lookupname
(
"utf-8"
);
utf8to
=
charset_lookupname
(
"utf-8"
);
/* set up the search handler */
tosearch
=
search_init
(
substr
,
pat
);
/* and the input stream */
input
=
convert_init
(
utf8to
,
0
/*to_uni*/
,
tosearch
);
input
=
canon_init
(
flags
,
input
);
input
=
convert_init
(
utf8from
,
1
/*to_uni*/
,
input
);
/* feed the handler */
while
(
len
--
>
0
)
{
convert_putc
(
input
,
(
unsigned
char
)
*
s
++
);
if
(
search_havematch
(
tosearch
))
break
;
/* shortcut if there's a match */
}
/* copy the value */
res
=
search_havematch
(
tosearch
);
/* clean up */
convert_free
(
input
);
charset_free
(
&
utf8from
);
charset_free
(
&
utf8to
);
return
res
;
}
/*
* Search for the string 'substr' in the next 'len' bytes of
* 'msg_base'.
* 'charset' and 'encoding' specify the character set and
* content transfer encoding of the data, respectively.
* Returns nonzero iff the string was found.
*/
EXPORTED
int
charset_searchfile
(
const
char
*
substr
,
comp_pat
*
pat
,
const
char
*
msg_base
,
size_t
len
,
charset_t
charset
,
int
encoding
,
int
flags
)
{
struct
convert_rock
*
input
,
*
tosearch
;
size_t
i
;
int
res
;
charset_t
utf8
;
/* Initialize character set mapping */
if
(
charset
==
CHARSET_UNKNOWN_CHARSET
)
return
0
;
/* check for trivial search */
if
(
strlen
(
substr
)
==
0
)
return
1
;
/* set up the conversion path */
utf8
=
charset_lookupname
(
"utf-8"
);
tosearch
=
search_init
(
substr
,
pat
);
input
=
convert_init
(
utf8
,
0
/*to_uni*/
,
tosearch
);
input
=
canon_init
(
flags
,
input
);
input
=
convert_init
(
charset
,
1
/*to_uni*/
,
input
);
/* choose encoding extraction if needed */
switch
(
encoding
)
{
case
ENCODING_NONE
:
break
;
case
ENCODING_QP
:
input
=
qp_init
(
0
,
input
);
break
;
case
ENCODING_BASE64
:
input
=
b64_init
(
input
);
/* XXX have to have nl-mapping base64 in order to
* properly count \n as 2 raw characters
*/
break
;
default
:
/* Don't know encoding--nothing can match */
convert_free
(
input
);
charset_free
(
&
utf8
);
return
0
;
}
/* implement the loop here so we can check on the search each time */
for
(
i
=
0
;
i
<
len
;
i
++
)
{
convert_putc
(
input
,
(
unsigned
char
)
msg_base
[
i
]);
if
(
search_havematch
(
tosearch
))
break
;
}
res
=
search_havematch
(
tosearch
);
/* copy before we free it */
convert_free
(
input
);
charset_free
(
&
utf8
);
return
res
;
}
/* This is based on charset_searchfile above. */
EXPORTED
int
charset_extract
(
void
(
*
cb
)(
const
struct
buf
*
,
void
*
),
void
*
rock
,
const
struct
buf
*
data
,
charset_t
charset
,
int
encoding
,
const
char
*
subtype
,
int
flags
)
{
struct
convert_rock
*
input
,
*
tobuffer
;
struct
buf
*
out
;
size_t
i
;
charset_t
utf8
;
if
(
charset_debug
)
fprintf
(
stderr
,
"charset_extract()
\n
"
);
/* Initialize character set mapping */
if
(
charset
==
CHARSET_UNKNOWN_CHARSET
)
return
0
;
/* set up the conversion path */
utf8
=
charset_lookupname
(
"utf-8"
);
tobuffer
=
buffer_init
(
buf_len
(
data
));
input
=
convert_init
(
utf8
,
0
/*to_uni*/
,
tobuffer
);
input
=
canon_init
(
flags
,
input
);
if
(
!
strcmpsafe
(
subtype
,
"HTML"
))
{
if
((
flags
&
CHARSET_SKIPHTML
))
{
/* silently pretend we indexed it, but actually ignore it */
convert_free
(
input
);
charset_free
(
&
utf8
);
return
1
;
}
/* this is text/html data, so we can make ourselves useful by
* stripping html tags, css and js. */
if
(
!
(
flags
&
CHARSET_KEEPHTML
))
{
input
=
striphtml_init
(
input
);
}
}
input
=
convert_init
(
charset
,
1
/*to_uni*/
,
input
);
switch
(
encoding
)
{
case
ENCODING_NONE
:
break
;
case
ENCODING_QP
:
input
=
qp_init
(
0
,
input
);
break
;
case
ENCODING_BASE64
:
input
=
b64_init
(
input
);
/* XXX have to have nl-mapping base64 in order to
* properly count \n as 2 raw characters
*/
break
;
default
:
/* Don't know encoding--nothing can match */
convert_free
(
input
);
charset_free
(
&
utf8
);
return
0
;
}
/* point to the buffer for easy block sending */
out
=
(
struct
buf
*
)
tobuffer
->
state
;
for
(
i
=
0
;
i
<
data
->
len
;
i
++
)
{
convert_putc
(
input
,
(
unsigned
char
)
data
->
s
[
i
]);
/* process a block of output every so often */
if
(
buf_len
(
out
)
>
4096
)
{
cb
(
out
,
rock
);
buf_reset
(
out
);
}
}
/* finish it */
convert_flush
(
input
);
if
(
out
->
len
)
{
cb
(
out
,
rock
);
}
convert_free
(
input
);
charset_free
(
&
utf8
);
return
1
;
}
/*
* Decode the MIME body part (per RFC 2045) of @len bytes located at
* @msg_base having the content transfer @encoding. Returns a pointer
* to decoded bytes. The number of decoded bytes is returned in
* *@outlen. Depending on the encoding, a newly allocated buffer may be
* written to *@decbuf, which should be free()d by the caller if it not
* zero. Note that the return value may point to either @msg_base or
* @decbuf, so @decbuf should not be free()d until the return value has
* been used.
*/
EXPORTED
const
char
*
charset_decode_mimebody
(
const
char
*
msg_base
,
size_t
len
,
int
encoding
,
char
**
decbuf
,
size_t
*
outlen
)
{
struct
convert_rock
*
input
,
*
tobuffer
;
*
decbuf
=
NULL
;
*
outlen
=
0
;
switch
(
encoding
)
{
case
ENCODING_NONE
:
*
outlen
=
len
;
return
msg_base
;
case
ENCODING_QP
:
tobuffer
=
buffer_init
(
len
);
input
=
qp_init
(
0
,
tobuffer
);
break
;
case
ENCODING_BASE64
:
tobuffer
=
buffer_init
(
len
);
input
=
b64_init
(
tobuffer
);
break
;
default
:
/* Don't know encoding--nothing can match */
return
NULL
;
}
convert_catn
(
input
,
msg_base
,
len
);
/* extract the string from the buffer */
{
struct
buf
*
buf
=
(
struct
buf
*
)
tobuffer
->
state
;
*
outlen
=
buf
->
len
;
*
decbuf
=
buf_release
(
buf
);
}
convert_free
(
input
);
if
(
!*
decbuf
)
{
/* didn't get a result - maybe blank input, don't return NULL */
*
outlen
=
len
;
return
msg_base
;
}
return
*
decbuf
;
}
/* Maximum octect length of base64 or QP-encoded text lines, excluding
* terminating CR LF and encoding-specific padding/footer. */
#define ENCODED_MAX_LINE_LEN 72
/*
* Base64 encode the MIME body part (per RFC 2045) of 'len' bytes located at
* 'msg_base'. Encodes into 'retval' which must large enough to
* accomodate the encoded data. Returns the number of encoded bytes in
* 'outlen' and the number of encoded lines in 'outlines'.
*
* May be called with 'msg_base' as NULL to get the number of encoded
* bytes for allocating 'retval' of the proper size.
*/
static
const
char
base_64
[]
=
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
;
EXPORTED
char
*
charset_encode_mimebody
(
const
char
*
msg_base
,
size_t
len
,
char
*
retval
,
size_t
*
outlen
,
int
*
outlines
,
int
wrap
)
{
const
unsigned
char
*
s
;
unsigned
char
s0
,
s1
,
s2
;
char
*
d
;
int
b64_len
,
b64_lines
,
cnt
;
b64_len
=
((
len
+
2
)
/
3
)
*
4
;
if
(
wrap
)
{
b64_lines
=
(
b64_len
+
ENCODED_MAX_LINE_LEN
-
1
)
/
ENCODED_MAX_LINE_LEN
;
/* account for CRLF added to each line */
b64_len
+=
2
*
b64_lines
;
}
else
b64_lines
=
1
;
if
(
outlen
)
*
outlen
=
b64_len
;
if
(
outlines
)
*
outlines
=
b64_lines
;
if
(
!
msg_base
)
return
NULL
;
for
(
s
=
(
const
unsigned
char
*
)
msg_base
,
d
=
retval
,
cnt
=
0
;
len
;
s
+=
3
,
d
+=
4
,
cnt
+=
4
)
{
/* process tuplets */
if
(
wrap
&&
cnt
==
ENCODED_MAX_LINE_LEN
)
{
/* reset line len count, add CRLF */
cnt
=
0
;
*
d
++
=
'\r'
;
*
d
++
=
'\n'
;
}
s0
=
s
[
0
];
s1
=
--
len
?
s
[
1
]
:
0
;
/* byte 1: high 6 bits (1) */
d
[
0
]
=
base_64
[
s0
>>
2
];
/* byte 2: low 2 bits (1), high 4 bits (2) */
d
[
1
]
=
base_64
[((
s0
&
0x3
)
<<
4
)
|
((
s1
&
0xf0
)
>>
4
)];
if
(
len
)
{
s2
=
--
len
?
s
[
2
]
:
0
;
/* byte 3: low 4 bits (2), high 2 bits (3) */
d
[
2
]
=
base_64
[((
s1
&
0xf
)
<<
2
)
|
((
s2
&
0xc0
)
>>
6
)];
}
else
{
/* byte 3: pad */
d
[
2
]
=
'='
;
}
if
(
len
)
{
--
len
;
/* byte 4: low 6 bits (3) */
d
[
3
]
=
base_64
[
s2
&
0x3f
];
}
else
{
/* byte 4: pad */
d
[
3
]
=
'='
;
}
}
if
(
wrap
)
{
/* add final CRLF */
*
d
++
=
'\r'
;
*
d
++
=
'\n'
;
}
return
(
b64_len
?
retval
:
NULL
);
}
#define ATOM_SPECIALS "()<>[]:;@\\,.\" \t"
/* Find the first email address (addr-spec) in data */
static
const
char
*
find_addr
(
const
char
*
data
,
size_t
datalen
,
size_t
*
addrlen
)
{
const
char
*
end
=
data
+
datalen
;
const
char
*
s
,
*
e
,
*
at
;
int
angle_addr
=
0
;
if
(
datalen
<
3
)
return
NULL
;
at
=
strchr
(
data
+
1
,
'@'
);
if
(
!
at
||
at
>=
end
-
1
)
return
NULL
;
/* find end of domain */
e
=
at
+
1
;
if
(
*
e
==
'['
)
{
/* find end of domain-literal */
while
(
++
e
<
end
&&
!
(
*
e
==
'['
||
*
e
==
']'
||
*
e
==
'\\'
));
/* domain-literal MUST end with ']' */
if
(
*
e
++
!=
']'
)
return
NULL
;
}
else
if
(
!
isspace
(
*
e
)
&&
!
strchr
(
ATOM_SPECIALS
,
*
e
))
{
/* find end of dot-atom */
while
(
++
e
<
end
&&
(
*
e
==
'.'
||
!
strchr
(
ATOM_SPECIALS
,
*
e
)));
/* atom MUST NOT end with '.' */
if
(
*
(
e
-1
)
==
'.'
)
return
NULL
;
}
else
return
NULL
;
if
(
e
<
end
)
{
/* gobble trailing data */
if
(
*
e
==
'>'
)
{
angle_addr
=
1
;
e
++
;
}
/* gobble trailing whitespace */
while
(
e
<
end
&&
isspace
(
*
e
))
e
++
;
/* multiple addresses MUST only be separated with ',' */
if
(
e
<
end
&&
*
e
++
!=
','
)
return
NULL
;
/* gobble trailing whitespace */
while
(
e
<
end
&&
isspace
(
*
e
))
e
++
;
}
/* find start of localpart */
s
=
at
-
1
;
if
(
*
s
==
'\"'
)
{
/* find start of quoted-string */
while
(
--
s
>=
data
&&
(
*
s
!=
'"'
||
(
--
s
>=
data
&&
*
s
==
'\\'
)));
/* quoted-string must start with '"' */
if
(
*
(
s
+
1
)
!=
'"'
)
return
NULL
;
}
else
if
(
!
isspace
(
*
s
)
&&
!
strchr
(
ATOM_SPECIALS
,
*
s
))
{
/* find start of dot-atom */
while
(
--
s
>=
data
&&
(
*
s
==
'.'
||
!
strchr
(
ATOM_SPECIALS
,
*
s
)));
/* atom MUST NOT start with '.' */
if
(
*
(
s
+
1
)
==
'.'
)
return
NULL
;
}
else
return
NULL
;
if
(
s
<
data
)
s
=
data
;
else
if
(
angle_addr
)
{
/* angle-addr MUST start with '<' */
if
(
*
s
!=
'<'
)
return
NULL
;
/* gobble leading whitespace */
while
(
s
>
data
&&
isspace
(
*
(
s
-1
)))
s
--
;
}
else
if
(
!
(
isspace
(
*
s
)
||
*
s
==
','
))
{
/* invalid separator */
return
NULL
;
}
/* found a valid address */
*
addrlen
=
e
-
s
;
return
s
;
}
/*
* If 'isheader' is non-zero "Q" encode (per RFC 2047), otherwise
* quoted-printable encode (per RFC 2045), the 'data' of 'len' bytes.
* Returns a buffer which the caller must free.
* Returns the number of encoded bytes in 'outlen'.
*/
static
char
*
qp_encode
(
const
char
*
data
,
size_t
len
,
int
isheader
,
int
force_quote
,
size_t
*
outlen
)
{
struct
buf
buf
=
BUF_INITIALIZER
;
size_t
n
;
int
need_quote
=
0
,
need_fold
=
0
;
if
(
!
force_quote
)
{
size_t
prev_lf
=
0
;
for
(
n
=
0
;
n
<
len
;
n
++
)
{
unsigned
char
this
=
data
[
n
];
unsigned
char
next
=
(
n
<
len
-
1
)
?
data
[
n
+
1
]
:
'\0'
;
if
(
QPSAFECHAR
[
this
]
||
this
==
'='
||
this
==
' '
||
this
==
'\t'
)
{
/* per RFC 5322: printable ASCII (decimal 33 - 126), SP, HTAB */
/* but only if the line doesn't exceed the 76 octet limit */
if
(
n
-
prev_lf
<=
74
)
continue
;
if
(
isheader
)
{
need_fold
=
1
;
continue
;
}
}
else
if
(
!
isheader
&&
this
==
'\r'
&&
next
==
'\n'
)
{
/* line break (CRLF) */
n
++
;
prev_lf
=
n
;
continue
;
}
need_quote
=
1
;
break
;
}
}
else
{
need_quote
=
1
;
}
if
(
need_quote
)
{
int
cnt
=
0
;
if
(
isheader
)
{
buf_appendcstr
(
&
buf
,
"=?UTF-8?Q?"
);
cnt
=
10
;
}
for
(
n
=
0
;
n
<
len
;
n
++
)
{
unsigned
char
this
=
data
[
n
];
unsigned
char
next
=
(
n
<
len
-
1
)
?
data
[
n
+
1
]
:
'\0'
;
/* Insert line break before exceeding line length limits */
if
(
isheader
)
{
/* RFC 2047 forbids splitting multi-octet characters */
int
needbytes
;
if
(
this
<
0x80
)
needbytes
=
0
;
else
if
(
this
<
0xc0
)
needbytes
=
0
;
// UTF-8 continuation
else
if
(
this
<
0xe0
)
needbytes
=
3
;
else
if
(
this
<
0xf0
)
needbytes
=
6
;
else
if
(
this
<
0xf8
)
needbytes
=
9
;
else
needbytes
=
0
;
// impossible UTF-8 encoding
if
(
cnt
+
needbytes
>=
ENCODED_MAX_LINE_LEN
)
{
buf_appendcstr
(
&
buf
,
"?=
\r\n
=?UTF-8?Q?"
);
cnt
=
11
;
}
}
else
if
(
cnt
>=
ENCODED_MAX_LINE_LEN
)
{
/* add soft line break to body */
buf_appendcstr
(
&
buf
,
"=
\r\n
"
);
cnt
=
0
;
}
if
((
QPSAFECHAR
[
this
]
/* per RFC 2047: '?' and '_' in header aren't safe */
&&
!
(
isheader
&&
(
this
==
'?'
||
this
==
'_'
)))
/* per RFC 2045: non-trailing whitespace in body is safe */
||
(
!
isheader
&&
(
this
==
' '
||
this
==
'\t'
)
&&
!
(
next
==
'\0'
||
next
==
'\r'
||
next
==
'\n'
)))
{
/* literal representation */
buf_putc
(
&
buf
,
(
char
)
this
);
cnt
++
;
}
else
if
(
isheader
&&
this
==
' '
)
{
/* per RFC 2047: represent SP in header as '_' for legibility */
buf_putc
(
&
buf
,
'_'
);
cnt
++
;
}
else
if
(
this
==
'\r'
&&
next
==
'\n'
)
{
if
(
isheader
)
{
/* folded header, split encoded token */
if
(
n
<
len
-
2
)
{
/* only if actually a folding character, skip it */
if
(
data
[
n
+
2
]
==
' '
||
data
[
n
+
2
]
==
'\t'
)
n
++
;
buf_appendcstr
(
&
buf
,
"?=
\r\n
=?UTF-8?Q?"
);
cnt
=
12
;
}
}
else
{
/* line break (CRLF) in body */
buf_appendcstr
(
&
buf
,
"
\r\n
"
);
cnt
=
0
;
}
n
++
;
}
else
{
/* 8-bit representation */
buf_printf
(
&
buf
,
"=%02X"
,
this
);
cnt
+=
3
;
}
}
if
(
isheader
)
buf_appendcstr
(
&
buf
,
"?="
);
}
else
if
(
need_fold
)
{
/* fold header every 78 characters (if possible) */
size_t
i
=
0
,
j
=
0
,
last_wsp
=
0
;
while
((
len
-
i
>
78
)
&&
(
j
<
len
))
{
j
+=
strcspn
(
data
+
j
,
"
\t
"
);
if
(
last_wsp
&&
(
j
-
i
>
78
))
{
buf_appendmap
(
&
buf
,
data
+
i
,
last_wsp
-
i
);
buf_appendcstr
(
&
buf
,
"
\r\n
"
);
i
=
last_wsp
;
}
last_wsp
=
j
++
;
}
buf_appendcstr
(
&
buf
,
data
+
i
);
}
else
{
buf_setmap
(
&
buf
,
data
,
len
);
}
if
(
outlen
)
*
outlen
=
buf_len
(
&
buf
);
return
buf_release
(
&
buf
);
}
/*
* Quoted-Printable encode the MIME body part (per RFC 2045) of 'len' bytes
* located at 'msg_base'.
* Returns a buffer which the caller must free.
* Returns the number of encoded bytes in 'outlen'.
*/
EXPORTED
char
*
charset_qpencode_mimebody
(
const
char
*
msg_base
,
size_t
len
,
int
force_quote
,
size_t
*
outlen
)
{
if
(
!
msg_base
)
return
NULL
;
return
qp_encode
(
msg_base
,
len
,
0
,
force_quote
,
outlen
);
}
static
void
encode_mimephrase
(
const
char
*
data
,
size_t
len
,
struct
buf
*
buf
,
int
*
cnt
);
static
char
*
encode_addrheader
(
const
char
*
header
,
size_t
len
,
int
force_quote
,
const
char
*
addr
,
size_t
addr_len
)
{
struct
buf
buf
=
BUF_INITIALIZER
;
size_t
n
=
0
;
int
cnt
=
0
;
do
{
size_t
phrase_len
=
addr
?
(
size_t
)
(
addr
-
(
header
+
n
))
:
len
-
n
;
if
(
phrase_len
)
{
/* display-name precedes address */
const
char
*
phrase
=
header
+
n
;
int
need_encode
=
0
,
need_bytes
=
phrase_len
;
const
char
*
c
;
for
(
c
=
phrase
;
c
<
addr
;
c
++
)
{
if
(
force_quote
||
(
*
c
&
0x80
))
{
need_encode
=
1
;
need_bytes
=
3
*
phrase_len
+
12
;
// assume max size
break
;
}
}
/* don't fold in the middle of a phrase */
if
(
cnt
+
need_bytes
>=
ENCODED_MAX_LINE_LEN
)
{
buf_appendcstr
(
&
buf
,
"
\r\n
"
);
cnt
=
1
;
}
if
(
need_encode
)
{
encode_mimephrase
(
phrase
,
phrase_len
,
&
buf
,
&
cnt
);
}
else
{
buf_appendmap
(
&
buf
,
phrase
,
phrase_len
);
cnt
+=
phrase_len
;
}
}
if
(
addr
)
{
/* don't fold in the middle of an address */
if
(
cnt
+
addr_len
>=
ENCODED_MAX_LINE_LEN
)
{
buf_appendcstr
(
&
buf
,
"
\r\n
"
);
cnt
=
1
;
}
buf_appendmap
(
&
buf
,
addr
,
addr_len
);
cnt
+=
addr_len
;
}
/* jump to end of address */
n
+=
phrase_len
+
addr_len
;
}
while
((
addr
=
find_addr
(
header
+
n
,
len
-
n
,
&
addr_len
))
||
n
<
len
);
return
buf_release
(
&
buf
);
}
/* "Q" encode the header field body (per RFC 2047) of 'len' bytes
* located at 'header'.
* Returns a buffer which the caller must free.
*/
EXPORTED
char
*
charset_encode_mimeheader
(
const
char
*
header
,
size_t
len
,
int
force_quote
)
{
if
(
!
header
)
return
NULL
;
if
(
!
len
)
len
=
strlen
(
header
);
size_t
addr_len
=
0
;
const
char
*
addr
=
find_addr
(
header
,
len
,
&
addr_len
);
if
(
addr
)
{
/* "Q" encode as an address header */
return
encode_addrheader
(
header
,
len
,
force_quote
,
addr
,
addr_len
);
}
return
qp_encode
(
header
,
len
,
1
,
force_quote
,
NULL
);
}
/*
* If 'isheader' is non-zero "Q" encode (per RFC 2047), otherwise
* quoted-printable encode (per RFC 2045), the 'data' of 'len' bytes.
* Returns a buffer which the caller must free.
* Returns the number of encoded bytes in 'outlen'.
*/
static
void
encode_mimephrase
(
const
char
*
data
,
size_t
len
,
struct
buf
*
buf
,
int
*
cnt
)
{
size_t
n
;
buf_appendcstr
(
buf
,
"=?UTF-8?Q?"
);
*
cnt
+=
10
;
for
(
n
=
0
;
n
<
len
;
n
++
)
{
unsigned
char
this
=
data
[
n
];
/* RFC 2047 forbids splitting multi-octet characters */
int
needbytes
;
if
(
this
<
0x80
)
needbytes
=
0
;
else
if
(
this
<
0xc0
)
needbytes
=
0
;
// UTF-8 continuation
else
if
(
this
<
0xe0
)
needbytes
=
3
;
else
if
(
this
<
0xf0
)
needbytes
=
6
;
else
if
(
this
<
0xf8
)
needbytes
=
9
;
else
needbytes
=
0
;
// impossible UTF-8 encoding
if
(
*
cnt
+
needbytes
>=
ENCODED_MAX_LINE_LEN
)
{
buf_appendcstr
(
buf
,
"?=
\r\n
=?UTF-8?Q?"
);
*
cnt
=
11
;
}
if
(
QPMIMEPHRASESAFECHAR
[
this
])
{
/* literal representation */
buf_putc
(
buf
,
(
char
)
this
);
*
cnt
+=
1
;
}
else
if
(
this
==
' '
)
{
/* per RFC 2047: represent SP in header as '_' for legibility */
buf_putc
(
buf
,
'_'
);
*
cnt
+=
1
;
}
else
{
/* 8-bit representation */
buf_printf
(
buf
,
"=%02X"
,
this
);
*
cnt
+=
3
;
}
}
buf_appendcstr
(
buf
,
"?="
);
}
EXPORTED
char
*
charset_encode_mimephrase
(
const
char
*
data
)
{
struct
buf
buf
=
BUF_INITIALIZER
;
int
cnt
=
0
;
encode_mimephrase
(
data
,
strlen
(
data
),
&
buf
,
&
cnt
);
return
buf_release
(
&
buf
);
}
static
void
extract_plain_cb
(
const
struct
buf
*
buf
,
void
*
rock
)
{
struct
buf
*
dst
=
(
struct
buf
*
)
rock
;
const
char
*
p
;
int
seenspace
=
0
;
/* Just merge multiple space into one. That's similar to
* charset_extract's MERGE_SPACE but since we don't want
* it to canonify the text into search form */
for
(
p
=
buf_base
(
buf
);
p
<
buf_base
(
buf
)
+
buf_len
(
buf
)
&&
*
p
;
p
++
)
{
if
(
*
p
==
' '
)
{
if
(
seenspace
)
continue
;
seenspace
=
1
;
}
else
{
seenspace
=
0
;
}
buf_appendmap
(
dst
,
p
,
1
);
}
}
EXPORTED
char
*
charset_extract_plain
(
const
char
*
html
)
{
struct
buf
src
=
BUF_INITIALIZER
;
struct
buf
dst
=
BUF_INITIALIZER
;
charset_t
utf8
=
charset_lookupname
(
"utf8"
);
char
*
text
;
char
*
tmp
,
*
q
;
const
char
*
p
;
/* Replace <br> and <p> with newlines */
q
=
tmp
=
xstrdup
(
html
);
p
=
html
;
while
(
*
p
)
{
if
(
!
strncmp
(
p
,
"<br>"
,
4
)
||
!
strncmp
(
p
,
"</p>"
,
4
))
{
*
q
++
=
'\n'
;
p
+=
4
;
}
else
if
(
!
strncmp
(
p
,
"p>"
,
3
))
{
p
+=
3
;
}
else
{
*
q
++
=
*
p
++
;
}
}
*
q
=
0
;
/* Strip html tags */
buf_init_ro
(
&
src
,
tmp
,
q
-
tmp
);
buf_setcstr
(
&
dst
,
""
);
charset_extract
(
&
extract_plain_cb
,
&
dst
,
&
src
,
utf8
,
ENCODING_NONE
,
"HTML"
,
CHARSET_KEEPCASE
);
buf_cstring
(
&
dst
);
/* Trim text */
buf_trim
(
&
dst
);
text
=
buf_releasenull
(
&
dst
);
if
(
!
strlen
(
text
))
{
free
(
text
);
text
=
NULL
;
}
buf_free
(
&
src
);
free
(
tmp
);
charset_free
(
&
utf8
);
return
text
;
}
EXPORTED
struct
char_counts
charset_count_validutf8
(
const
char
*
data
,
size_t
datalen
)
{
if
(
datalen
>
INT32_MAX
)
{
datalen
=
INT32_MAX
;
}
struct
char_counts
counts
=
{
0
,
0
,
0
};
int32_t
i
=
0
;
int32_t
length
=
(
int32_t
)
datalen
;
const
uint8_t
*
data8
=
(
const
uint8_t
*
)
data
;
while
(
i
<
length
)
{
UChar32
c
;
U8_NEXT
(
data8
,
i
,
length
,
c
);
if
(
c
==
0xfffd
)
counts
.
replacement
++
;
else
if
(
c
>=
0
)
counts
.
valid
++
;
else
counts
.
invalid
++
;
}
return
counts
;
}
File Metadata
Details
Attached
Mime Type
text/x-c
Expires
Fri, Apr 24, 11:08 AM (1 w, 5 d ago)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
ad/2f/fd465ae2065c61e844c3fc819d16
Default Alt Text
charset.c (107 KB)
Attached To
Mode
R111 cyrus-imapd
Attached
Detach File
Event Timeline