Page MenuHomePhorge

No OneTemporary

Authored By
Unknown
Size
275 KB
Referenced Files
None
Subscribers
None
This file is larger than 256 KB, so syntax highlighting was skipped.
diff --git a/imap/jmap_mail_query.c b/imap/jmap_mail_query.c
index 89edb078a..7a23f7f7a 100644
--- a/imap/jmap_mail_query.c
+++ b/imap/jmap_mail_query.c
@@ -1,1261 +1,1261 @@
/* jmap_mail_query.c -- Helper routines for JMAP Email/query
*
* Copyright (c) 1994-2018 Carnegie Mellon University. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The name "Carnegie Mellon University" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For permission or any legal
* details, please contact
* Carnegie Mellon University
* Center for Technology Transfer and Enterprise Creation
* 4615 Forbes Avenue
* Suite 302
* Pittsburgh, PA 15213
* (412) 268-7393, fax: (412) 268-7395
* innovation@andrew.cmu.edu
*
* 4. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by Computing Services
* at Carnegie Mellon University (http://www.cmu.edu/computing/)."
*
* CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*/
#include <config.h>
#include <string.h>
#include <syslog.h>
#include <errno.h>
#include "libconfig.h"
#include "jmap_mail_query.h"
#include "jmap_util.h"
#include "json_support.h"
#include "search_engines.h"
#include "imap/imap_err.h"
#ifndef JMAP_URN_MAIL
#define JMAP_URN_MAIL "urn:ietf:params:jmap:mail"
#endif
#ifndef JMAP_MAIL_EXTENSION
#define JMAP_MAIL_EXTENSION "https://cyrusimap.org/ns/jmap/mail"
#endif
/*
 * Report whether a keyword may be used in a thread-keyword filter.
 *
 * "$Seen" is always accepted.  Any other keyword is accepted only if it
 * is listed in the space-separated conversations_counted_flags option,
 * compared case-insensitively.  A configured flag that begins with a
 * backslash only matches a keyword that begins with '$'; both prefix
 * characters are skipped before the names are compared.
 *
 * Returns 1 if the keyword is supported, 0 otherwise.
 */
static int _email_threadkeyword_is_valid(const char *keyword)
{
    /* \Seen is always supported */
    if (strcasecmp(keyword, "$Seen") == 0) {
        return 1;
    }

    const char *configured = config_getstring(IMAPOPT_CONVERSATIONS_COUNTED_FLAGS);
    if (configured == NULL) {
        return 0;
    }

    /* The option is split again on every call, which is wasteful */
    strarray_t *counted = strarray_split(configured, " ", STRARRAY_TRIM);
    int found = 0;
    int idx = 0;

    while (!found && idx < counted->count) {
        const char *candidate = strarray_nth(counted, idx++);
        const char *name = keyword;

        if (*candidate == '\\') {
            /* special case: "\Flag" in the config pairs with "$Flag" */
            if (*name != '$') continue;
            candidate++;
            name++;
        }
        found = (strcasecmp(candidate, name) == 0);
    }

    strarray_free(counted);
    return found;
}
#ifdef WITH_DAV
#include "annotate.h"
#include "carddav_db.h"
#include "global.h"
#include "hash.h"
#include "index.h"
#include "search_query.h"
#include "times.h"
/*
 * Zero *cfilter and bind it to accountid (the pointer is stored, not
 * copied).  When addressbookid is non-NULL, the value returned by
 * carddav_mboxname() is stored in cfilter->addrbook, which
 * jmap_email_contactfilter_fini() later passes to free().
 */
HIDDEN void jmap_email_contactfilter_init(const char *accountid,
                                          const char *addressbookid,
                                          struct email_contactfilter *cfilter)
{
    memset(cfilter, 0, sizeof(*cfilter));
    cfilter->accountid = accountid;

    if (!addressbookid) return;

    cfilter->addrbook = carddav_mboxname(accountid, addressbookid);
}
/*
 * Release everything a contact filter holds: the CardDAV database handle
 * (if one was opened), the address book name, and the contact group
 * table together with the strarray_t stored under each key.
 * The cfilter struct itself is not freed.
 */
HIDDEN void jmap_email_contactfilter_fini(struct email_contactfilter *cfilter)
{
    if (cfilter->carddavdb != NULL) {
        carddav_close(cfilter->carddavdb);
    }

    free(cfilter->addrbook);

    /* Table values are strarray_t pointers */
    void (*release_members)(void *) = (void (*)(void *)) strarray_free;
    free_hash_table(&cfilter->contactgroups, release_members);
}
/*
 * Mailbox tree callback: stop at the first address book mailbox whose
 * last name component is "Shared".
 *
 * rock points at an mbname_t* that receives the parsed mailbox name on a
 * hit; the receiver then owns it.  Returns CYRUSDB_DONE on a hit and 0
 * for every other entry.
 */
static int _get_sharedaddressbook_cb(const mbentry_t *mbentry, void *rock)
{
    if (mbentry == NULL || !(mbentry->mbtype & MBTYPE_ADDRESSBOOK)) {
        return 0;
    }

    mbname_t *candidate = mbname_from_intname(mbentry->name);
    const char *leaf = strarray_nth(mbname_boxes(candidate), -1);

    if (strcmpsafe(leaf, "Shared") != 0) {
        mbname_free(&candidate);
        return 0;
    }

    mbname_t **found = rock;
    *found = candidate;
    return CYRUSDB_DONE;
}
/*
 * Walk the mailbox tree of userid with _get_sharedaddressbook_cb and
 * return the mailbox name it picked, or NULL if the walk did not end
 * with CYRUSDB_DONE.  The caller releases the result with mbname_free().
 */
static mbname_t *_get_sharedaddressbookuser(const char *userid)
{
    mbname_t *shared = NULL;
    const int treeflags = MBOXTREE_PLUS_RACL
                        | MBOXTREE_SKIP_ROOT
                        | MBOXTREE_SKIP_CHILDREN;

    // XXX - do we need to pass req->authstate right through??
    int rc = mboxlist_usermboxtree(userid, NULL, _get_sharedaddressbook_cb,
                                   &shared, treeflags);
    if (rc != CYRUSDB_DONE) {
        mbname_free(&shared);
        return NULL;
    }

    return shared;
}
/*
 * Filter condition properties that are resolved through contact lookups.
 * isany == 0: the property value is a contact group id string.
 * isany == 1: the property is a boolean meaning "any contact".
 * The table ends with an entry whose field is NULL.
 */
static const struct contactfilters_t {
    const char *field;
    int isany;
} contactfilters[] = {
    { .field = "fromContactGroupId", .isany = 0 },
    { .field = "toContactGroupId",   .isany = 0 },
    { .field = "ccContactGroupId",   .isany = 0 },
    { .field = "bccContactGroupId",  .isany = 0 },
    { .field = "fromAnyContact",     .isany = 1 },
    { .field = "toAnyContact",       .isany = 1 },
    { .field = "ccAnyContact",       .isany = 1 },
    { .field = "bccAnyContact",      .isany = 1 },
    { .field = NULL,                 .isany = 0 }
};
/*
 * Map the JSON argument of one contactfilters[] property to the group id
 * used for lookups: the string value for group-id properties, "" for an
 * "any contact" property set to true, NULL when the argument is unusable.
 */
static const char *_contactfilter_groupid(const struct contactfilters_t *cf,
                                          json_t *arg)
{
    if (!cf->isany) {
        return json_string_value(arg);
    }
    return json_is_true(arg) ? "" : NULL;
}

/*
 * Load into cfilter->contactgroups the member lists of every contact
 * group that the single filter condition `filter` refers to.
 *
 * The group table and the CardDAV database handle are created on first
 * need and kept in cfilter for later calls.  A property whose group
 * cannot be fetched is reported through jmap_parser_invalid().
 *
 * Returns 0 on success (including when the condition uses no contact
 * properties) or CYRUSDB_INTERNAL if the CardDAV database cannot be
 * opened.
 */
HIDDEN int jmap_email_contactfilter_from_filtercondition(struct jmap_parser *parser,
                                                         json_t *filter,
                                                         struct email_contactfilter *cfilter)
{
    const struct contactfilters_t *cf;
    mbname_t *othermb = NULL;
    int r = 0;

    /* prefilter to see if there are any fields that we will need to look up */
    int wanted = 0;
    for (cf = contactfilters; cf->field && !wanted; cf++) {
        json_t *arg = json_object_get(filter, cf->field);
        /* avoid looking up if invalid! */
        if (arg && _contactfilter_groupid(cf, arg)) {
            wanted = 1;
        }
    }
    if (!wanted) goto done;

    /* ensure we have preconditions for lookups */
    if (!cfilter->contactgroups.size) {
        /* Initialize groups lookup table */
        construct_hash_table(&cfilter->contactgroups, 32, 0);
    }

    if (!cfilter->carddavdb) {
        /* Open CardDAV db first time we need it */
        cfilter->carddavdb = carddav_open_userid(cfilter->accountid);
        if (!cfilter->carddavdb) {
            syslog(LOG_ERR, "jmap: carddav_open_userid(%s) failed",
                   cfilter->accountid);
            r = CYRUSDB_INTERNAL;
            goto done;
        }
    }

    othermb = _get_sharedaddressbookuser(cfilter->accountid);
    if (othermb) {
        /* A failure here is only logged; the lookups below still run */
        if (carddav_set_otheruser(cfilter->carddavdb, mbname_userid(othermb))) {
            syslog(LOG_NOTICE, "DBNOTICE: failed to open otheruser %s contacts for %s",
                   mbname_userid(othermb), cfilter->accountid);
        }
    }

    /* fetch members for each filter referenced */
    for (cf = contactfilters; cf->field; cf++) {
        json_t *arg = json_object_get(filter, cf->field);
        const char *groupid = arg ? _contactfilter_groupid(cf, arg) : NULL;
        if (!groupid) continue;

        /* Already loaded by an earlier property or call */
        if (hash_lookup(groupid, &cfilter->contactgroups)) continue;

        /* Lookup group member email addresses */
        strarray_t *members = carddav_getgroup(cfilter->carddavdb,
                                               cfilter->addrbook,
                                               groupid, othermb);
        if (members) {
            hash_insert(groupid, members, &cfilter->contactgroups);
        }
        else {
            jmap_parser_invalid(parser, cf->field);
        }
    }

done:
    mbname_free(&othermb);
    return r;
}
HIDDEN void jmap_emailbodies_fini(struct emailbodies *bodies)
{
ptrarray_fini(&bodies->attslist);
ptrarray_fini(&bodies->textlist);
ptrarray_fini(&bodies->htmllist);
}
/*
 * Sort the `nparts` body parts starting at `parts` into text, HTML and
 * attachment lists, recursing into multipart parts.
 *
 * multipart_type: subtype of the enclosing multipart; it is compared
 *                 against "RELATED" and "ALTERNATIVE".
 * in_alternative: non-zero once any enclosing multipart was ALTERNATIVE.
 * textlist, htmllist: may be NULL (checked before every use); attslist is
 *                 appended to without a NULL check.
 *
 * The lists receive pointers into `parts`; nothing is copied.
 * Always returns 0.
 */
static int _email_extract_bodies_internal(const struct body *parts,
int nparts,
const char *multipart_type,
int in_alternative,
ptrarray_t *textlist,
ptrarray_t *htmllist,
ptrarray_t *attslist)
{
int i;
/* NOTE(review): MESSAGE is declared but never assigned in this function */
enum parttype { OTHER, PLAIN, HTML, MULTIPART, INLINE_MEDIA, MESSAGE };
/* Remember the list sizes on entry (-1 if the list is absent) so the
 * ALTERNATIVE fix-up at the end can tell what this call appended */
int textlist_count = textlist ? textlist->count : -1;
int htmllist_count = htmllist ? htmllist->count : -1;
for (i = 0; i < nparts; i++) {
const struct body *part = parts + i;
/* Determine part type */
enum parttype parttype = OTHER;
if (!strcmp(part->type, "TEXT") && !strcmp(part->subtype, "PLAIN"))
parttype = PLAIN;
else if (!strcmp(part->type, "TEXT") && !strcmp(part->subtype, "RICHTEXT"))
parttype = PLAIN; // RFC 1341
else if (!strcmp(part->type, "TEXT") && !strcmp(part->subtype, "ENRICHED"))
parttype = PLAIN; // RFC 1563
else if (!strcmp(part->type, "TEXT") && !strcmp(part->subtype, "HTML"))
parttype = HTML;
else if (!strcmp(part->type, "MULTIPART"))
parttype = MULTIPART;
else if (!strcmp(part->type, "IMAGE") || !strcmp(part->type, "AUDIO") || !strcmp(part->type, "VIDEO"))
parttype = INLINE_MEDIA;
/* Determine disposition name, if any. */
const char *dispname = NULL;
struct param *param;
/* Prefix match: any attribute starting with "filename" counts */
for (param = part->disposition_params; param; param = param->next) {
if (!strncasecmp(param->attribute, "filename", 8)) {
dispname = param->value;
break;
}
}
if (!dispname) {
/* Fall back to a content-type parameter starting with "name" */
for (param = part->params; param; param = param->next) {
if (!strncasecmp(param->attribute, "name", 4)) {
dispname = param->value;
break;
}
}
}
/* Determine if it's an inlined part */
int is_inline =
(!part->disposition || strcmp(part->disposition, "ATTACHMENT")) &&
/* Must be one of the allowed body types */
(parttype == PLAIN || parttype == HTML || parttype == INLINE_MEDIA) &&
/* If multipart/related, only the first part can be inline
* If a text part with a filename, and not the first item in the
* multipart, assume it is an attachment */
(i == 0 || (strcmp(multipart_type, "RELATED") &&
(parttype == INLINE_MEDIA || !dispname)));
/* Handle by part type */
if (parttype == MULTIPART) {
/* Recurse; the return value (always 0) is ignored */
_email_extract_bodies_internal(part->subpart, part->numparts,
part->subtype,
in_alternative || !strcmp(part->subtype, "ALTERNATIVE"),
textlist, htmllist, attslist);
}
else if (is_inline) {
if (!strcmp(multipart_type, "ALTERNATIVE")) {
/* Direct child of multipart/alternative: each part goes to
 * exactly one list */
if (parttype == PLAIN && textlist) {
ptrarray_append(textlist, (void*) part);
}
else if (parttype == HTML && htmllist) {
ptrarray_append(htmllist, (void*) part);
}
else {
ptrarray_append(attslist, (void*) part);
}
continue;
}
else if (in_alternative) {
/* Nested below an alternative: clearing the local list pointer
 * stops appends to that list for this part and for every later
 * part handled by this call */
if (parttype == PLAIN)
htmllist = NULL;
if (parttype == HTML)
textlist = NULL;
}
if (textlist)
ptrarray_append(textlist, (void*) part);
if (htmllist)
ptrarray_append(htmllist, (void*) part);
/* Inline media that did not land in both lists is also listed as
 * an attachment */
if ((!textlist || !htmllist) && parttype == INLINE_MEDIA)
ptrarray_append(attslist, (void*) part);
}
else {
ptrarray_append(attslist, (void*) part);
}
}
if (!strcmp(multipart_type, "ALTERNATIVE")) {
int j;
/* Found HTML part only */
if (textlist && htmllist && textlist_count == textlist->count) {
for (j = htmllist_count; j < htmllist->count; j++)
ptrarray_append(textlist, ptrarray_nth(htmllist, j));
}
/* Found TEXT part only */
if (htmllist && textlist && htmllist_count == htmllist->count) {
for (j = textlist_count; j < textlist->count; j++)
ptrarray_append(htmllist, ptrarray_nth(textlist, j));
}
}
return 0;
}
HIDDEN int jmap_emailbodies_extract(const struct body *root,
struct emailbodies *bodies)
{
return _email_extract_bodies_internal(root, 1, "MIXED", 0,
&bodies->textlist, &bodies->htmllist,
&bodies->attslist);
}
/*
 * Search text receiver used by the _matchmime_tr_* callbacks.  `super` is
 * the first member, and the callbacks cast their search_text_receiver_t
 * argument to this struct to reach `dbw` and `buf`.
 */
struct matchmime_receiver {
struct search_text_receiver super;
/* handle passed to the xapian_dbw_* calls */
xapian_dbw_t *dbw;
/* text gathered for the current part; appends stop at SEARCH_MAX_PARTS_SIZE */
struct buf buf;
};
/* Receiver callback: nothing to do when a mailbox begins; returns 0. */
static int _matchmime_tr_begin_mailbox(
        search_text_receiver_t *rx __attribute__((unused)),
        struct mailbox *mailbox __attribute__((unused)),
        int incremental __attribute__((unused)))
{
    return 0;
}
/* Receiver callback: always reports UID 1 as the first unindexed one. */
static uint32_t _matchmime_tr_first_unindexed_uid(
        search_text_receiver_t *rx __attribute__((unused)))
{
    return 1;
}
/* Receiver callback: always returns 0, i.e. no message counts as indexed. */
static uint8_t _matchmime_tr_is_indexed(
        search_text_receiver_t *rx __attribute__((unused)),
        message_t *msg __attribute__((unused)))
{
    return 0;
}
/*
 * Receiver callback: begin a new Xapian document for msg, identified by
 * the message GUID with document type 'G'.  Returns the error from
 * message_get_guid() if that fails, else the xapian_dbw_begin_doc() result.
 */
static int _matchmime_tr_begin_message(search_text_receiver_t *rx, message_t *msg)
{
    const struct message_guid *guid;
    int rc = message_get_guid(msg, &guid);

    if (rc != 0) {
        return rc;
    }

    struct matchmime_receiver *self = (struct matchmime_receiver *) rx;
    return xapian_dbw_begin_doc(self->dbw, guid, 'G');
}
/* Receiver callback: body part boundaries are ignored; returns 0. */
static int _matchmime_tr_begin_bodypart(
        search_text_receiver_t *rx __attribute__((unused)),
        const char *partid __attribute__((unused)),
        const struct message_guid *content_guid __attribute__((unused)),
        const char *type __attribute__((unused)),
        const char *subtype __attribute__((unused)))
{
    return 0;
}
/* Receiver callback: nothing to prepare when a search part begins. */
static void _matchmime_tr_begin_part(
        search_text_receiver_t *rx __attribute__((unused)),
        int part __attribute__((unused)))
{
    /* intentionally empty */
}
/*
 * Receiver callback: append `text` to the receiver's buffer, never
 * letting the buffer grow beyond SEARCH_MAX_PARTS_SIZE bytes.  Text that
 * does not fit is dropped.
 */
static void _matchmime_tr_append_text(search_text_receiver_t *rx,
                                      const struct buf *text)
{
    struct matchmime_receiver *self = (struct matchmime_receiver *) rx;

    /* Buffer already full: ignore further text */
    if (buf_len(&self->buf) >= SEARCH_MAX_PARTS_SIZE) return;

    size_t room = SEARCH_MAX_PARTS_SIZE - buf_len(&self->buf);
    size_t take = (room > buf_len(text)) ? buf_len(text) : room;

    buf_appendmap(&self->buf, buf_base(text), take);
}
/*
 * Receiver callback: hand the text gathered for `part` to the Xapian
 * writer, then reset the buffer for the next part.
 */
static void _matchmime_tr_end_part(search_text_receiver_t *rx, int part)
{
    struct matchmime_receiver *self = (struct matchmime_receiver *) rx;
    struct buf *collected = &self->buf;

    xapian_dbw_doc_part(self->dbw, collected, part);
    buf_reset(collected);
}
/* Receiver callback: nothing to do when a body part ends. */
static void _matchmime_tr_end_bodypart(
        search_text_receiver_t *rx __attribute__((unused)))
{
    /* intentionally empty */
}
/*
 * Receiver callback: finish the current Xapian document at the given
 * index level and return the xapian_dbw_end_doc() result.
 */
static int _matchmime_tr_end_message(search_text_receiver_t *rx, uint8_t indexlevel)
{
    struct matchmime_receiver *self = (struct matchmime_receiver *) rx;
    xapian_dbw_t *writer = self->dbw;

    return xapian_dbw_end_doc(writer, indexlevel);
}
/* Receiver callback: nothing to do when a mailbox ends; returns 0. */
static int _matchmime_tr_end_mailbox(
        search_text_receiver_t *rx __attribute__((unused)),
        struct mailbox *mailbox __attribute__((unused)))
{
    return 0;
}
/* Receiver callback: flushing is a no-op here; returns 0. */
static int _matchmime_tr_flush(
        search_text_receiver_t *rx __attribute__((unused)))
{
    return 0;
}
/* Receiver callback: auditing is not performed; returns 0. */
static int _matchmime_tr_audit_mailbox(
        search_text_receiver_t *rx __attribute__((unused)),
        bitvector_t *unindexed __attribute__((unused)))
{
    return 0;
}
/* Receiver callback: return base_flags with CHARSET_KEEPCASE added. */
static int _matchmime_tr_index_charset_flags(int base_flags)
{
    int flags = base_flags;

    flags |= CHARSET_KEEPCASE;
    return flags;
}
/* Receiver callback: always selects MESSAGE_SNIPPET, whatever was asked. */
static int _matchmime_tr_index_message_format(
        int format __attribute__((unused)),
        int is_snippet __attribute__((unused)))
{
    return MESSAGE_SNIPPET;
}
/*
 * Query result callback: store 1 in the int that `rock` points at when
 * the query returned at least one result, 0 otherwise.  Always returns 0.
 */
static int _email_matchmime_evaluate_xcb(void *data __attribute__((unused)),
                                         size_t n, void *rock)
{
    int *matched = rock;

    /* Only one message lives in the in-memory database, so any hit is
     * that message; its guid need not be compared. */
    *matched = (n != 0) ? 1 : 0;
    return 0;
}
/*
 * Build a query that matches when search part `part` contains the address
 * of any member of contact group `groupid`.
 *
 * Members are read from cfilter->contactgroups; entries without an '@'
 * are skipped.  If the group is unknown, empty, or yields no usable
 * sub-query, the result is NOT(match-all), i.e. a query built to match
 * nothing.
 *
 * The returned query belongs to the caller.
 */
static xapian_query_t *_email_matchmime_contactgroup(const char *groupid,
                                                     int part,
                                                     xapian_db_t *db,
                                                     struct email_contactfilter *cfilter)
{
    xapian_query_t *xq = NULL;

    if (cfilter->contactgroups.size) {
        strarray_t *members = hash_lookup(groupid, &cfilter->contactgroups);
        if (members && strarray_size(members)) {
            ptrarray_t xsubqs = PTRARRAY_INITIALIZER;
            int i;
            for (i = 0; i < strarray_size(members); i++) {
                const char *member = strarray_nth(members, i);
                if (!strchr(member, '@')) continue;
                xapian_query_t *xsubq = xapian_query_new_match(db, part, member);
                if (xsubq) ptrarray_append(&xsubqs, xsubq);
            }
            if (ptrarray_size(&xsubqs)) {
                xq = xapian_query_new_compound(db, /*is_or*/1,
                        (xapian_query_t **) xsubqs.data, xsubqs.count);
            }
            /* Release the pointer array itself; it was leaked before.
             * build_type_query() releases its array the same way right
             * after building the compound query. */
            ptrarray_fini(&xsubqs);
        }
    }

    if (!xq) {
        xq = xapian_query_new_not(db, xapian_query_new_matchall(db));
    }
    return xq;
}
/*
 * Build a query on SEARCH_PART_TYPE for an attachment type.
 *
 * The aliases "image", "document", "spreadsheet", "presentation", "email"
 * and "pdf" (case-insensitive) expand to fixed lists of MIME types; any
 * other value is matched as given.  The per-type sub-queries are OR-ed.
 *
 * Returns whatever xapian_query_new_compound() returns.
 */
static xapian_query_t *build_type_query(xapian_db_t *db, const char *type)
{
    static const char *const image_types[] = {
        "image/gif",
        "image/jpeg",
        "image/pjpeg",
        "image/jpg",
        "image/png",
        "image/bmp",
        "image/tiff",
        NULL
    };
    static const char *const document_types[] = {
        "application/msword",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
        "application/vnd.sun.xml.writer",
        "application/vnd.sun.xml.writer.template",
        "application/vnd.oasis.opendocument.text",
        "application/vnd.oasis.opendocument.text-template",
        "application/x-iwork-pages-sffpages",
        "application/vnd.apple.pages",
        NULL
    };
    static const char *const spreadsheet_types[] = {
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
        "application/vnd.sun.xml.calc",
        "application/vnd.sun.xml.calc.template",
        "application/vnd.oasis.opendocument.spreadsheet",
        "application/vnd.oasis.opendocument.spreadsheet-template",
        "application/x-iwork-numbers-sffnumbers",
        "application/vnd.apple.numbers",
        NULL
    };
    static const char *const presentation_types[] = {
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.openxmlformats-officedocument.presentationml.template",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.sun.xml.impress",
        "application/vnd.sun.xml.impress.template",
        "application/vnd.oasis.opendocument.presentation",
        "application/vnd.oasis.opendocument.presentation-template",
        "application/x-iwork-keynote-sffkey",
        "application/vnd.apple.keynote",
        NULL
    };
    static const char *const email_types[] = { "message/rfc822", NULL };
    static const char *const pdf_types[] = { "application/pdf", NULL };

    /* Handle type wildcards */
    static const struct {
        const char *alias;
        const char *const *types;
    } wildcards[] = {
        { "image",        image_types },
        { "document",     document_types },
        { "spreadsheet",  spreadsheet_types },
        { "presentation", presentation_types },
        { "email",        email_types },
        { "pdf",          pdf_types }
    };

    /* Not an alias: match the type exactly as requested */
    const char *const verbatim[] = { type, NULL };
    const char *const *expansion = verbatim;
    size_t w;

    for (w = 0; w < sizeof(wildcards) / sizeof(wildcards[0]); w++) {
        if (!strcasecmp(type, wildcards[w].alias)) {
            expansion = wildcards[w].types;
            break;
        }
    }

    /* Build expression */
    ptrarray_t subqueries = PTRARRAY_INITIALIZER;
    for (; *expansion; expansion++) {
        xapian_query_t *sub = xapian_query_new_match(db, SEARCH_PART_TYPE,
                                                     *expansion);
        if (sub) ptrarray_append(&subqueries, sub);
    }

    xapian_query_t *xq = xapian_query_new_compound(db, /*is_or*/1,
            (xapian_query_t **) subqueries.data, subqueries.count);

    ptrarray_fini(&subqueries);
    return xq;
}
/*
 * Evaluate a JMAP email filter against the single message `m`.
 *
 * filter:       JSON filter; an empty object matches everything.  An
 *               object with a "conditions" array is treated as a
 *               FilterOperator (AND/OR/NOT, compared case-insensitively)
 *               and evaluated recursively.  Anything else is treated as a
 *               FilterCondition.
 * db:           query handle used for all text-based criteria.
 * cfilter:      contact group members for the *ContactGroupId and
 *               *AnyContact criteria.
 * internaldate: timestamp compared against "before" and "after".
 *
 * For a FilterCondition the function counts how many of its properties
 * were checked and satisfied (have_matches) and compares that with the
 * number of properties in the object (need_matches), so a property this
 * function does not handle makes the condition fail.
 *
 * Returns 1 if the message matches, 0 otherwise.
 */
static int _email_matchmime_evaluate(json_t *filter,
message_t *m,
xapian_db_t *db,
struct email_contactfilter *cfilter,
time_t internaldate)
{
if (!json_object_size(filter)) {
/* Match all */
return 1;
}
json_t *conditions = json_object_get(filter, "conditions");
if (json_is_array(conditions)) {
/* Evaluate FilterOperator */
const char *strop = json_string_value(json_object_get(filter, "operator"));
enum search_op op = SEOP_UNKNOWN;
/* `matches` holds the result used when the loop below does not
 * return early */
int matches;
if (!strcasecmpsafe(strop, "AND")) {
op = SEOP_AND;
matches = 1;
}
else if (!strcasecmpsafe(strop, "OR")) {
op = SEOP_OR;
/* an OR over zero conditions matches */
matches = json_array_size(conditions) == 0;
}
else if (!strcasecmpsafe(strop, "NOT")) {
op = SEOP_NOT;
/* a NOT over zero conditions does not match */
matches = json_array_size(conditions) != 0;
}
else return 0;
json_t *condition;
size_t i;
json_array_foreach(conditions, i, condition) {
int cond_matches = _email_matchmime_evaluate(condition, m, db, cfilter, internaldate);
if (op == SEOP_AND && !cond_matches) {
return 0;
}
if (op == SEOP_OR && cond_matches) {
return 1;
}
if (op == SEOP_NOT && cond_matches) {
return 0;
}
}
return matches;
}
/* Evaluate FilterCondition */
int need_matches = json_object_size(filter);
int have_matches = 0;
json_t *jval;
/* Substitute a match-all query when a criterion produced no query */
#define MATCHMIME_XQ_OR_MATCHALL(_xq) \
((_xq) ? _xq : xapian_query_new_matchall(db))
/* Xapian-backed criteria */
ptrarray_t xqs = PTRARRAY_INITIALIZER;
const char *match;
if ((match = json_string_value(json_object_get(filter, "text")))) {
/* "text" is an OR over all search parts except the ones skipped in
 * the switch below */
ptrarray_t childqueries = PTRARRAY_INITIALIZER;
int i;
for (i = 0 ; i < SEARCH_NUM_PARTS ; i++) {
switch (i) {
case SEARCH_PART_LISTID:
case SEARCH_PART_TYPE:
case SEARCH_PART_LANGUAGE:
case SEARCH_PART_PRIORITY:
case SEARCH_PART_ATTACHMENTBODY:
continue;
}
void *xq = xapian_query_new_match(db, i, match);
if (xq) ptrarray_push(&childqueries, xq);
}
xapian_query_t *xq = xapian_query_new_compound(db, /*is_or*/1,
(xapian_query_t **)childqueries.data,
childqueries.count);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
ptrarray_fini(&childqueries);
}
if ((match = json_string_value(json_object_get(filter, "from")))) {
xapian_query_t *xq = xapian_query_new_match(db, SEARCH_PART_FROM, match);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "to")))) {
xapian_query_t *xq = xapian_query_new_match(db, SEARCH_PART_TO, match);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "cc")))) {
xapian_query_t *xq = xapian_query_new_match(db, SEARCH_PART_CC, match);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "bcc")))) {
xapian_query_t *xq = xapian_query_new_match(db, SEARCH_PART_BCC, match);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "deliveredTo")))) {
xapian_query_t *xq = xapian_query_new_match(db, SEARCH_PART_DELIVEREDTO, match);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "subject")))) {
xapian_query_t *xq = xapian_query_new_match(db, SEARCH_PART_SUBJECT, match);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "body")))) {
xapian_query_t *xq = xapian_query_new_match(db, SEARCH_PART_BODY, match);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
/* Contact group criteria: the property value is the group id */
if ((match = json_string_value(json_object_get(filter, "fromContactGroupId")))) {
xapian_query_t *xq = _email_matchmime_contactgroup(match, SEARCH_PART_FROM, db, cfilter);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "toContactGroupId")))) {
xapian_query_t *xq = _email_matchmime_contactgroup(match, SEARCH_PART_TO, db, cfilter);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "ccContactGroupId")))) {
xapian_query_t *xq = _email_matchmime_contactgroup(match, SEARCH_PART_CC, db, cfilter);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "bccContactGroupId")))) {
xapian_query_t *xq = _email_matchmime_contactgroup(match, SEARCH_PART_BCC, db, cfilter);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
/* "Any contact" criteria use the empty string as group id and are only
 * evaluated when the property is JSON true */
if ((json_is_true(json_object_get(filter, "fromAnyContact")))) {
xapian_query_t *xq = _email_matchmime_contactgroup("", SEARCH_PART_FROM, db, cfilter);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((json_is_true(json_object_get(filter, "toAnyContact")))) {
xapian_query_t *xq = _email_matchmime_contactgroup("", SEARCH_PART_TO, db, cfilter);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((json_is_true(json_object_get(filter, "ccAnyContact")))) {
xapian_query_t *xq = _email_matchmime_contactgroup("", SEARCH_PART_CC, db, cfilter);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((json_is_true(json_object_get(filter, "bccAnyContact")))) {
xapian_query_t *xq = _email_matchmime_contactgroup("", SEARCH_PART_BCC, db, cfilter);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "attachmentName")))) {
xapian_query_t *xq = xapian_query_new_match(db, SEARCH_PART_ATTACHMENTNAME, match);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
if ((match = json_string_value(json_object_get(filter, "attachmentType")))) {
xapian_query_t *xq = build_type_query(db, match);
ptrarray_append(&xqs, MATCHMIME_XQ_OR_MATCHALL(xq));
}
/* Unlike the criteria above, listId and isHighPriority add nothing when
 * no query could be built, so they are then not counted as matched */
if ((match = json_string_value(json_object_get(filter, "listId")))) {
xapian_query_t *xq = xapian_query_new_match(db, SEARCH_PART_LISTID, match);
if (xq) ptrarray_append(&xqs, xq);
}
if (JNOTNULL(jval = json_object_get(filter, "isHighPriority"))) {
xapian_query_t *xq = xapian_query_new_match(db, SEARCH_PART_PRIORITY, "1");
/* isHighPriority: false is expressed as NOT(priority == "1") */
if (xq && !json_boolean_value(jval)) {
xq = xapian_query_new_not(db, xq);
}
if (xq) ptrarray_append(&xqs, xq);
}
// ignore attachmentBody
#undef MATCHMIME_XQ_OR_MATCHALL
if (xqs.count) {
/* AND all collected queries together and run them once */
int matches = 0;
xapian_query_t *xq = xapian_query_new_compound(db, /*is_or*/0,
(xapian_query_t **) xqs.data, xqs.count);
/* NOTE(review): the next two lines still carry unified-diff markers
 * ('-' for the old call, '+' for the new one) from the patch view */
- xapian_query_run(db, xq, 0, _email_matchmime_evaluate_xcb, &matches);
+ xapian_query_run(db, xq, _email_matchmime_evaluate_xcb, &matches);
xapian_query_free(xq);
size_t xqs_count = xqs.count;
ptrarray_fini(&xqs);
if (matches) {
have_matches += xqs_count; // assumes one xapian query per criteria
}
else return 0;
}
/* size */
if (json_object_get(filter, "minSize") || json_object_get(filter, "maxSize")) {
uint32_t size;
/* If the size cannot be read, neither property is counted */
if (message_get_size(m, &size) == 0) {
json_int_t jint;
/* values that are not positive integers are skipped, not counted */
if ((jint = json_integer_value(json_object_get(filter, "minSize"))) > 0) {
if (size >= jint) {
have_matches++;
}
else return 0;
}
if ((jint = json_integer_value(json_object_get(filter, "maxSize"))) > 0) {
/* maxSize is exclusive */
if (size < jint) {
have_matches++;
}
else return 0;
}
}
}
/* hasAttachment */
if (JNOTNULL(jval = json_object_get(filter, "hasAttachment"))) {
const struct body *body;
if (message_get_cachebody(m, &body) == 0) {
struct emailbodies bodies = EMAILBODIES_INITIALIZER;
if (jmap_emailbodies_extract(body, &bodies) == 0) {
/* "has attachment" means the attachment list is non-empty */
int have = ptrarray_size(&bodies.attslist) > 0;
int want = jval == json_true();
jmap_emailbodies_fini(&bodies);
if (have == want) {
have_matches++;
}
else return 0;
}
}
}
/* header */
if (JNOTNULL((jval = json_object_get(filter, "header")))) {
/* The array is [name], [name, value] or [name, value, comparator];
 * the cases fall through so that longer arrays set every field */
const char *hdr = NULL, *val = "", *cmp = NULL;
switch (json_array_size(jval)) {
case 3:
cmp = json_string_value(json_array_get(jval, 2));
GCC_FALLTHROUGH
case 2:
val = json_string_value(json_array_get(jval, 1));
GCC_FALLTHROUGH
case 1:
hdr = json_string_value(json_array_get(jval, 0));
break;
default:
return 0;
}
/* NOTE(review): hdr and val are passed on without a NULL check here;
 * presumably the filter was validated earlier - confirm */
struct jmap_headermatch *hm = jmap_headermatch_new(hdr, val, cmp);
int matches = jmap_headermatch_match(hm, m);
jmap_headermatch_free(&hm);
if (matches) {
have_matches++;
} else return 0;
}
/* before */
if (JNOTNULL(jval = json_object_get(filter, "before"))) {
time_t t;
/* NOTE(review): the result of time_from_iso8601() is not checked */
time_from_iso8601(json_string_value(jval), &t);
if (internaldate < t) {
have_matches++;
} else return 0;
}
/* after */
if (JNOTNULL(jval = json_object_get(filter, "after"))) {
time_t t;
/* NOTE(review): the result of time_from_iso8601() is not checked */
time_from_iso8601(json_string_value(jval), &t);
/* "after" is inclusive */
if (internaldate >= t) {
have_matches++;
} else return 0;
}
/* Every property in the condition must have been checked and satisfied */
return need_matches == have_matches;
}
/*
 * Filter parser callback: report `field` as invalid on the JMAP parser
 * held in the struct jmap_email_filter_parser_rock that `rock` points at.
 */
HIDDEN void jmap_filter_parser_invalid(const char *field, void *rock)
{
    struct jmap_email_filter_parser_rock *ctx = rock;

    jmap_parser_invalid(ctx->parser, field);
}
/*
 * Filter parser callback: forward field, index and name to
 * jmap_parser_push_index() on the JMAP parser held in the struct
 * jmap_email_filter_parser_rock that `rock` points at.
 */
HIDDEN void jmap_filter_parser_push_index(const char *field, size_t index,
                                          const char *name, void *rock)
{
    struct jmap_email_filter_parser_rock *ctx = rock;

    jmap_parser_push_index(ctx->parser, field, index, name);
}
/*
 * Filter parser callback: call jmap_parser_pop() on the JMAP parser held
 * in the struct jmap_email_filter_parser_rock that `rock` points at.
 */
HIDDEN void jmap_filter_parser_pop(void *rock)
{
    struct jmap_email_filter_parser_rock *ctx = rock;

    jmap_parser_pop(ctx->parser);
}
/*
 * Filter parser callback: validate one filter condition property.
 *
 *  - inMailbox must be a string, inMailboxOtherThan an array.
 *  - The three *InThreadHaveKeyword properties must be valid keywords;
 *    a valid keyword that is not supported for thread filters is added
 *    to frock->unsupported as {field: keyword} instead of being rejected.
 *  - hasKeyword and notKeyword must be valid keywords.
 *  - Every other field is reported as invalid.
 *
 * `rock` points at a struct jmap_email_filter_parser_rock.
 */
HIDDEN void jmap_email_filtercondition_validate(const char *field, json_t *arg,
                                                void *rock)
{
    struct jmap_email_filter_parser_rock *frock = rock;
    struct jmap_parser *parser = frock->parser;

    if (!strcmp(field, "inMailbox")) {
        if (!json_is_string(arg)) {
            jmap_parser_invalid(parser, field);
        }
        return;
    }

    if (!strcmp(field, "inMailboxOtherThan")) {
        if (!json_is_array(arg)) {
            jmap_parser_invalid(parser, field);
        }
        return;
    }

    if (!strcmp(field, "allInThreadHaveKeyword") ||
        !strcmp(field, "someInThreadHaveKeyword") ||
        !strcmp(field, "noneInThreadHaveKeyword")) {

        const char *kw = json_is_string(arg) ? json_string_value(arg) : NULL;

        if (!kw || !jmap_email_keyword_is_valid(kw)) {
            jmap_parser_invalid(parser, field);
        }
        else if (!_email_threadkeyword_is_valid(kw)) {
            /* Well-formed, but not usable in thread filters */
            json_array_append_new(frock->unsupported,
                                  json_pack("{s:s}", field, kw));
        }
        return;
    }

    if (!strcmp(field, "hasKeyword") || !strcmp(field, "notKeyword")) {
        if (!json_is_string(arg) ||
            !jmap_email_keyword_is_valid(json_string_value(arg))) {
            jmap_parser_invalid(parser, field);
        }
        return;
    }

    /* Unknown property */
    jmap_parser_invalid(parser, field);
}
/*
 * Prepare a MIME message for filter matching.
 *
 * The message in `mime` is parsed, a temporary directory is created
 * under the configured temp path, a Xapian writer is opened in it, and
 * the message text is indexed into that database.
 *
 * mime: raw message; the pointer is stored in the result, not copied.
 * err:  receives a JMAP server error object when NULL is returned.
 *
 * Returns a handle to release with jmap_email_matchmime_free(), or NULL
 * on failure (everything created so far is cleaned up first).
 */
HIDDEN matchmime_t *jmap_email_matchmime_init(const struct buf *mime, json_t **err)
{
    matchmime_t *matchmime = xzmalloc(sizeof(matchmime_t));
    int r = 0;

    matchmime->mime = mime;

    /* Parse message into memory */
    matchmime->m = message_new_from_data(buf_base(mime), buf_len(mime));
    if (!matchmime->m) {
        syslog(LOG_ERR, "jmap_matchmime: can't create Cyrus message");
        /* r is still 0 at this point, so passing it on would build the
         * error from a "no error" code; report an internal error like
         * the tempdir failure below does */
        *err = jmap_server_error(IMAP_INTERNAL);
        jmap_email_matchmime_free(&matchmime);
        return NULL;
    }

    /* Open temporary database */
    matchmime->dbpath = create_tempdir(config_getstring(IMAPOPT_TEMP_PATH), "matchmime");
    if (!matchmime->dbpath) {
        syslog(LOG_ERR, "jmap_matchmime: can't create tempdir: %s", strerror(errno));
        *err = jmap_server_error(IMAP_INTERNAL);
        jmap_email_matchmime_free(&matchmime);
        return NULL;
    }

    /* Open search index in temp directory */
    const char *paths[2];
    paths[0] = matchmime->dbpath;
    paths[1] = NULL;
    r = xapian_dbw_open(paths, &matchmime->dbw, /*mode*/0, /*nosync*/1);
    if (r) {
        syslog(LOG_ERR, "jmap_matchmime: can't open search backend: %s",
               error_message(r));
        *err = jmap_server_error(r);
        jmap_email_matchmime_free(&matchmime);
        return NULL;
    }

    /* Index message bodies in-memory */
    struct matchmime_receiver tr = {
        {
            _matchmime_tr_begin_mailbox,
            _matchmime_tr_first_unindexed_uid,
            _matchmime_tr_is_indexed,
            _matchmime_tr_begin_message,
            _matchmime_tr_begin_bodypart,
            _matchmime_tr_begin_part,
            _matchmime_tr_append_text,
            _matchmime_tr_end_part,
            _matchmime_tr_end_bodypart,
            _matchmime_tr_end_message,
            _matchmime_tr_end_mailbox,
            _matchmime_tr_flush,
            _matchmime_tr_audit_mailbox,
            _matchmime_tr_index_charset_flags,
            _matchmime_tr_index_message_format
        },
        matchmime->dbw, BUF_INITIALIZER
    };
    r = index_getsearchtext(matchmime->m, NULL, (struct search_text_receiver*) &tr, 0);
    /* The receiver's scratch buffer is no longer needed either way */
    buf_free(&tr.buf);
    if (r) {
        syslog(LOG_ERR, "jmap_matchmime: can't index MIME message: %s",
               error_message(r));
        *err = jmap_server_error(r);
        jmap_email_matchmime_free(&matchmime);
        return NULL;
    }

    return matchmime;
}
/*
 * Release a handle created by jmap_email_matchmime_init() and set
 * *matchmimep to NULL.  Members that were never set are skipped, so a
 * partially initialised handle is fine.  The temporary database
 * directory is removed.  Does nothing when *matchmimep is already NULL.
 */
HIDDEN void jmap_email_matchmime_free(matchmime_t **matchmimep)
{
    matchmime_t *mm = *matchmimep;

    if (mm == NULL) {
        return;
    }

    if (mm->m != NULL) {
        message_unref(&mm->m);
    }
    if (mm->dbw != NULL) {
        xapian_dbw_close(mm->dbw);
    }
    if (mm->dbpath != NULL) {
        removedir(mm->dbpath);
    }

    free(mm->dbpath);
    free(mm);
    *matchmimep = NULL;
}
/*
 * Match a prepared MIME message against a JMAP email filter.
 *
 * matchmime:    handle from jmap_email_matchmime_init().
 * jfilter:      filter to evaluate; it is parsed and validated first.
 * accountid:    account whose contacts resolve contact group criteria.
 * internaldate: timestamp used for the "before"/"after" criteria.
 * err:          set to a JMAP error object when validation, contact
 *               lookup or opening the query backend fails.
 *
 * Returns non-zero if the message matches.  Returns 0 if it does not
 * match or if an error was stored in *err.
 */
HIDDEN int jmap_email_matchmime(matchmime_t *matchmime,
                                json_t *jfilter,
                                const char *accountid,
                                time_t internaldate,
                                json_t **err)
{
    int matched = 0;
    int rc = 0;
    struct jmap_parser parser = JMAP_PARSER_INITIALIZER;
    strarray_t capabilities = STRARRAY_INITIALIZER;
    struct email_contactfilter cfilter;
    json_t *unsupported = json_array();
    struct jmap_email_filter_parser_rock frock = { &parser, unsupported };
    jmap_email_filter_parse_ctx_t parse_ctx = {
        jmap_email_filtercondition_validate,
        jmap_filter_parser_invalid,
        jmap_filter_parser_push_index,
        jmap_filter_parser_pop,
        &capabilities,
        &frock
    };

    /* Parse filter */
    strarray_append(&capabilities, JMAP_URN_MAIL);
    strarray_append(&capabilities, JMAP_MAIL_EXTENSION);
    jmap_email_filter_parse(jfilter, &parse_ctx);

    /* Gather contactgroup ids: walk the filter tree with an explicit
     * stack, visiting every operator and condition node */
    jmap_email_contactfilter_init(accountid, /*addressbookid*/NULL, &cfilter);

    ptrarray_t pending = PTRARRAY_INITIALIZER;
    ptrarray_push(&pending, jfilter);

    json_t *node;
    for (node = ptrarray_pop(&pending); node; node = ptrarray_pop(&pending)) {
        size_t idx;
        json_t *child;
        json_array_foreach(json_object_get(node, "conditions"), idx, child) {
            ptrarray_push(&pending, child);
        }
        rc = jmap_email_contactfilter_from_filtercondition(&parser, node, &cfilter);
        if (rc) break;
    }
    ptrarray_fini(&pending);

    if (rc) {
        syslog(LOG_ERR, "jmap_matchmime: can't load contactgroups from filter: %s",
               error_message(rc));
        *err = jmap_server_error(rc);
        goto done;
    }
    if (json_array_size(parser.invalid)) {
        *err = json_pack("{s:s s:O}", "type", "invalidArguments",
                         "arguments", parser.invalid);
        goto done;
    }
    if (json_array_size(unsupported)) {
        *err = json_pack("{s:s s:O}", "type", "unsupportedFilter",
                         "filters", unsupported);
        goto done;
    }

    /* Evaluate filter */
    xapian_db_t *db = NULL;
    rc = xapian_db_opendbw(matchmime->dbw, &db);
    if (rc) {
        syslog(LOG_ERR, "jmap_matchmime: can't open query backend: %s",
               error_message(rc));
        *err = jmap_server_error(rc);
        goto done;
    }
    matched = _email_matchmime_evaluate(jfilter, matchmime->m, db,
                                        &cfilter, internaldate);
    xapian_db_close(db);

done:
    jmap_email_contactfilter_fini(&cfilter);
    jmap_parser_fini(&parser);
    strarray_fini(&capabilities);
    json_decref(unsupported);
    return matched;
}
#endif /* WITH_DAV */
/*
 * Normalize the header value in val in place so that it can be compared
 * against other values normalized by this function.
 *
 * Empty values are left untouched. The value is trimmed, then scanned:
 * as long as it only consists of ASCII characters without two
 * consecutive whitespace characters, it is upper-cased in place and no
 * conversion is needed. As soon as a non-ASCII byte or a whitespace run
 * is seen, the whole value is instead passed through the charset
 * converter hm->conv and replaced with the conversion result.
 */
static void headermatch_normalize(struct jmap_headermatch *hm, struct buf *val)
{
    if (!buf_len(val)) return;

    buf_cstring(val);
    buf_trim(val);

    /* Fast-path ASCII without consecutive whitespace */
    char *s;
    for (s = val->s; *s; s++) {
        /* The <ctype.h> classifiers are only defined for values that are
         * representable as unsigned char (or EOF). Header bytes may have
         * the high bit set, so never hand them a plain (possibly
         * negative) char. */
        unsigned char c = (unsigned char) *s;
        if (!isascii(c)) {
            break;
        }
        if (isspace(c) && isspace((unsigned char) s[1])) {
            break;
        }
        *s = toupper(c);
    }
    if (!*s) return;

    /* Convert value */
    buf_setcstr(val, charset_conv_convert(hm->conv, buf_cstring(val)));
}
/*
 * Prepare the charset state of a header matcher: look up the "utf8"
 * charset and create a converter for it with the CHARSET_SKIPDIACRIT,
 * CHARSET_MERGESPACE, CHARSET_TRIMWS and CHARSET_UNORM_NFC flags set.
 * Both handles are released again by jmap_headermatch_free().
 */
static void jmap_headermatch_init(struct jmap_headermatch *hm)
{
    const int convflags = CHARSET_SKIPDIACRIT | CHARSET_MERGESPACE |
                          CHARSET_TRIMWS | CHARSET_UNORM_NFC;

    hm->utf8 = charset_lookupname("utf8");
    hm->conv = charset_conv_new(hm->utf8, convflags);
}
/*
 * Allocate a matcher for the header named `header`.
 *
 * strop selects the comparison: "equals", "startsWith" or "endsWith".
 * Any other operator name selects substring ("contains") matching.
 * The header name is stored lower-cased and the search value is stored
 * in normalized form (see headermatch_normalize) along with its length.
 *
 * The returned matcher must be released with jmap_headermatch_free().
 */
HIDDEN struct jmap_headermatch *jmap_headermatch_new(const char *header,
                                                     const char *value,
                                                     const char *strop)
{
    struct jmap_headermatch *hm = xzmalloc(sizeof(struct jmap_headermatch));
    jmap_headermatch_init(hm);

    /* The first scratch buffer holds the value while it is normalized */
    struct buf *scratch = &hm->tmp[0];

    /* Pick the comparison operator; unknown names mean "contains" */
    if (strcmpsafe(strop, "equals") == 0)
        hm->op = HEADERMATCH_EQUALS;
    else if (strcmpsafe(strop, "startsWith") == 0)
        hm->op = HEADERMATCH_STARTS;
    else if (strcmpsafe(strop, "endsWith") == 0)
        hm->op = HEADERMATCH_ENDS;
    else
        hm->op = HEADERMATCH_CONTAINS;

    hm->header = lcase(xstrdup(header));

    buf_setcstr(scratch, value);
    headermatch_normalize(hm, scratch);
    hm->len = buf_len(scratch);
    hm->value = xstrdup(buf_cstring(scratch));

    /* Leave the scratch buffers empty for the first match call */
    buf_reset(&hm->tmp[1]);
    buf_reset(&hm->tmp[0]);

    return hm;
}
/*
 * Release a header matcher and everything it owns, then set *hmp to
 * NULL. Passing NULL or a pointer to NULL is a no-op.
 */
HIDDEN void jmap_headermatch_free(struct jmap_headermatch **hmp)
{
    struct jmap_headermatch *hm = hmp ? *hmp : NULL;
    int i;

    if (hm == NULL) return;

    free(hm->header);
    free(hm->value);

    /* all three scratch buffers, in index order */
    for (i = 0; i < 3; i++) {
        buf_free(&hm->tmp[i]);
    }

    charset_conv_free(&hm->conv);
    charset_free(&hm->utf8);
    free(hm);

    *hmp = NULL;
}
/*
 * Create an independent copy of the matcher hm, or return NULL if hm
 * is NULL. The copy gets its own charset state and its own copies of
 * the header name and normalized value; scratch buffers start empty.
 */
HIDDEN struct jmap_headermatch *jmap_headermatch_dup(struct jmap_headermatch *hm)
{
    if (hm == NULL) return NULL;

    struct jmap_headermatch *copy = xzmalloc(sizeof(struct jmap_headermatch));
    jmap_headermatch_init(copy);

    copy->op = hm->op;
    copy->header = xstrdup(hm->header);
    copy->value = xstrdup(hm->value);
    copy->len = hm->len;

    return copy;
}
/*
* Test whether any occurrence of header hm->header in msg matches the
* normalized value of hm, using the comparison operator hm->op.
*
* Returns 1 on match and 0 otherwise. A matcher with an empty value
* matches any message in which the header field is present with
* non-empty raw content. The scratch buffers hm->tmp[0..2] are reset
* before returning.
*/
HIDDEN int jmap_headermatch_match(struct jmap_headermatch *hm, message_t *msg)
{
int match = 0;
struct buf *msgbuf = &hm->tmp[0];
/* NOTE(review): a zero return from message_get_field is treated as
* success here; any other return leaves match at 0 */
if (!message_get_field(msg, hm->header,
MESSAGE_RAW|MESSAGE_APPEND|MESSAGE_MULTIPLE, msgbuf)) {
if (!buf_len(msgbuf)) {
/* nothing was fetched for this header */
match = 0;
goto done;
}
else if (!*hm->value) {
/* empty search value: presence of the header is enough */
match = 1;
goto done;
}
/* Iterate header values until match found */
const char *p = buf_cstring(msgbuf);
do {
struct buf *val = &hm->tmp[1];
/* Extract value, including optional line folds */
const char *q;
for (q = p; *q; q++) {
if (*q == '\r') {
/* CRLF followed by tab or space is a fold: keep scanning.
* Any other CR ends this value. */
if (q[1] == '\n' && (q[2] == '\t' || q[2] == ' ')) {
q++;
}
else {
break;
}
}
}
buf_setmap(val, p, q - p);
/* Match header value */
headermatch_normalize(hm, val);
/* a value shorter than the search value can never match; this also
* keeps the ENDS offset below from going negative */
if (buf_len(val) >= hm->len) {
const char *v = buf_cstring(val);
switch (hm->op) {
case HEADERMATCH_EQUALS:
match = !strcmp(v, hm->value);
break;
case HEADERMATCH_STARTS:
match = !strncmp(v, hm->value, hm->len);
break;
case HEADERMATCH_ENDS:
/* compare the last hm->len bytes of the value */
match = !strcmp(v + buf_len(val) - hm->len, hm->value);
break;
default:
/* HEADERMATCH_CONTAINS and anything else: substring search */
match = strstr(v, hm->value) != NULL;
}
}
/* Find next header value, if any */
/* skip the CR (and LF, if present) that ended this value */
if (*q) q += (q[1] == '\n') ? 2 : 1;
p = q;
/* the next value starts after the first ':' that follows, i.e.
* presumably after the next field name -- TODO confirm raw format */
q = strchr(p, ':');
if (q) p = q + 1;
} while(!match && *p);
}
done:
buf_reset(&hm->tmp[0]);
buf_reset(&hm->tmp[1]);
buf_reset(&hm->tmp[2]);
return match;
}
diff --git a/imap/search_xapian.c b/imap/search_xapian.c
index 36840928e..d8e5c5b64 100644
--- a/imap/search_xapian.c
+++ b/imap/search_xapian.c
@@ -1,4229 +1,3978 @@
/* search_xapian.c -- glue code for searching with Xapian
*
* Copyright (c) 1994-2012 Carnegie Mellon University. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The name "Carnegie Mellon University" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For permission or any legal
* details, please contact
* Carnegie Mellon University
* Center for Technology Transfer and Enterprise Creation
* 4615 Forbes Avenue
* Suite 302
* Pittsburgh, PA 15213
* (412) 268-7393, fax: (412) 268-7395
* innovation@andrew.cmu.edu
*
* 4. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by Computing Services
* at Carnegie Mellon University (http://www.cmu.edu/computing/)."
*
* CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <config.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <syslog.h>
#include <string.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <dirent.h>
#include "assert.h"
#include "bitvector.h"
#include "bloom.h"
#include "global.h"
#include "ptrarray.h"
#include "user.h"
#include "xmalloc.h"
#include "xstrlcpy.h"
#include "xstrlcat.h"
#include "mappedfile.h"
#include "mboxlist.h"
#include "mboxname.h"
#include "xstats.h"
#include "search_engines.h"
#include "sequence.h"
#include "cyr_lock.h"
#include "xapian_wrap.h"
#include "command.h"
/* generated headers are not necessarily in current directory */
#include "imap/imap_err.h"
/* version number written in front of each sequence stored in the
* indexed database (see store_indexed and parse_indexed) */
#define INDEXEDDB_VERSION 2
/* file name of the indexed database, appended to a database directory */
#define INDEXEDDB_FNAME "/cyrus.indexed.db"
/* directory name appended to the per-user base directory of a tier */
#define XAPIAN_DIRNAME "/xapian"
/* name of the per-user meta file that lists the active databases */
#define ACTIVEFILE_METANAME "xapianactive"
/* prefix of the per-user name lock (see xapiandb_namelock_fname_from_userid) */
#define XAPIAN_NAME_LOCK_PREFIX "$XAPIAN$"
// this seems to translate for 4Gb-ish - units are 10 bytes?
#define XAPIAN_REINDEX_TEMPDIR_SIZE 419430400
#define XAPIAN_REINDEX_TEMPDIR_COUNT 64000
/* Name of columns */
#define COL_CYRUSID "cyrusid"
/* A piece of message text together with the identifiers it belongs to.
* NOTE(review): the code using this struct is outside this view; the
* field notes below only restate what the declarations show. */
struct segment
{
int part; /* presumably a SEARCH_PART_* value -- TODO confirm */
struct message_guid guid;
char doctype;
int sequence; /* forces stable sort order JIC */
int is_finished;
char *partid;
struct buf text;
};
static const char *xapian_rootdir(const char *tier, const char *partition);
static int xapian_basedir(const char *tier, const char *mboxname, const char *part,
const char *root, char **basedir);
/* ====================================================================== */
/*
 * Verify the configuration prerequisites of the Xapian search backend:
 * the conversations option must be enabled and a non-empty default
 * search tier must be configured.
 *
 * Returns 0 if both hold. Otherwise logs the problem and returns
 * IMAP_NOTFOUND or IMAP_PARTITION_UNKNOWN respectively; if errstr is
 * not NULL, *errstr is set to a newly allocated description.
 */
static int check_config(char **errstr)
{
    if (!config_getswitch(IMAPOPT_CONVERSATIONS)) {
        syslog(LOG_ERR, "ERROR: conversations required but not enabled");
        if (errstr) {
            *errstr = xstrdup("xapian: conversations required but not enabled");
        }
        return IMAP_NOTFOUND;
    }

    const char *tier = config_getstring(IMAPOPT_DEFAULTSEARCHTIER);
    if (tier == NULL || *tier == '\0') {
        syslog(LOG_ERR, "ERROR: no default search tier configured");
        if (errstr) {
            *errstr = xstrdup("xapian: no default search tier configured");
        }
        return IMAP_PARTITION_UNKNOWN;
    }

    return 0;
}
/* ====================================================================== */
/* the "activefile" file lists the tiers and generations of all the
* currently active search databases. The format is space separated
* records tiername:generation, i.e. "meta:0". If there is no file present,
* it is created by finding all the existing search directories (from
* filesystem inspection) and prepending default:nextgen where default
* is the searchdefaulttier value and nextgen is one higher than the
* largest generation found. In the simplest configuration this is
* just ":0" */
struct activeitem {
char *tier;
int generation;
};
/* Kind of lock activefile_open() takes on the activefile */
enum LockType {
AF_LOCK_READ = 0, /* take a read lock */
AF_LOCK_WRITE = 1, /* take a write lock */
};
/*
 * Parse one activefile record of the form "tier:generation".
 *
 * The record is split at its last ':'. Returns a newly allocated item
 * (release with activeitem_free), or NULL if input contains no ':'.
 */
static struct activeitem *activeitem_parse(const char *input)
{
    const char *colon = strrchr(input, ':');
    if (colon == NULL) return NULL;

    struct activeitem *item = xzmalloc(sizeof(struct activeitem));
    item->tier = xstrndup(input, colon - input);
    item->generation = atoi(colon + 1);

    return item;
}
/* Release an item returned by activeitem_parse(). NULL is accepted. */
static void activeitem_free(struct activeitem *item)
{
    if (item != NULL) {
        free(item->tier);
        free(item);
    }
}
char *activeitem_generate(const char *tier, int generation)
{
struct buf buf = BUF_INITIALIZER;
buf_printf(&buf, "%s:%d", tier, generation);
return buf_release(&buf);
}
/*
 * Calculate the next record name for tier: its generation is one
 * higher than the highest generation among the active records of the
 * same tier, or 0 if the tier has no active record yet.
 * Returns a newly allocated "tier:generation" string.
 */
static char *activefile_nextname(const strarray_t *active, const char *tier)
{
    int highest = -1;
    int idx = 0;

    while (idx < active->count) {
        struct activeitem *item = activeitem_parse(strarray_nth(active, idx));
        idx++;

        /* records that don't parse can't contribute a generation */
        if (item == NULL) continue;

        if (strcmp(item->tier, tier) == 0 && item->generation > highest) {
            highest = item->generation;
        }
        activeitem_free(item);
    }

    return activeitem_generate(tier, highest + 1);
}
/* filter a list of active records to only those in certain tiers.
* Used to calculate which databases to use as sources for compression */
static strarray_t *activefile_filter(const strarray_t *active, const strarray_t *tiers, const char *partition)
{
strarray_t *res = strarray_new();
int i;
for (i = 0; i < active->count; i++) {
const char *name = strarray_nth(active, i);
struct activeitem *item = activeitem_parse(name);
/* we want to compress anything which can't possibly exist as well
* as anything which matches the filter tiers */
if (!item || strarray_find(tiers, item->tier, 0) >= 0
|| strarray_find(tiers, name, 0) >= 0
|| !xapian_rootdir(item->tier, partition))
strarray_append(res, name);
activeitem_free(item);
}
return res;
}
/*
 * The activefile is a per-user meta file: derive its file name from the
 * user that owns mboxname. Returns a newly allocated path, or NULL if
 * no userid can be determined for the mailbox name.
 */
static char *activefile_fname(const char *mboxname)
{
    char *fname = NULL;
    char *userid = mboxname_to_userid(mboxname);

    if (userid != NULL) {
        fname = user_hash_meta(userid, ACTIVEFILE_METANAME);
        free(userid);
    }

    return fname;
}
/*
 * The file format is very simple: the mapped content is handed to
 * strarray_nsplit() with a NULL separator argument and the result is
 * returned as a newly allocated array of records.
 */
static strarray_t *activefile_read(struct mappedfile *activefile)
{
    const char *content = mappedfile_base(activefile);
    size_t content_len = mappedfile_size(activefile);

    return strarray_nsplit(content, content_len, NULL, 1);
}
/* to write an activefile file safely, we need to do the create .NEW,
* write, fsync, rename dance. This unlocks the original file, so
* callers will need to lock again if they need a locked file.
* The 'mappedfile' API isn't a perfect match for what we need here,
* but it's close enough, and avoids open coding the lock dance.
*
* The records in 'new' are written space-separated. Returns 0 on
* success, IMAP_IOERROR if the write fails, or the error of the failing
* mappedfile call.
*
* NOTE(review): 'mf' is only unlocked once the rename has been
* attempted. If opening, locking, writing or committing the .NEW file
* fails, this returns early with 'mf' still locked. */
static int activefile_write(struct mappedfile *mf, const strarray_t *new)
{
char *newname = strconcat(mappedfile_fname(mf), ".NEW", (char *)NULL);
struct mappedfile *newfile = NULL;
int r;
ssize_t nwritten;
char *towrite = NULL;
/* create and lock the temporary file next to the original */
r = mappedfile_open(&newfile, newname, MAPPEDFILE_CREATE|MAPPEDFILE_RW);
if (r) goto done;
r = mappedfile_writelock(newfile);
if (r) goto done;
/* serialise the records and write them at offset 0 */
towrite = strarray_join(new, " ");
nwritten = mappedfile_pwrite(newfile, towrite, strlen(towrite), 0);
free(towrite);
if (nwritten < 0) {
/* commit anyway so mappedfile doesn't have kittens
* about the map being closed dirty */
r = IMAP_IOERROR;
mappedfile_commit(newfile);
goto done;
}
r = mappedfile_commit(newfile);
if (r) goto done;
/* move the new file over the original; drop it if that fails */
r = mappedfile_rename(newfile, mappedfile_fname(mf));
if (r) unlink(newname);
/* we lose control over the lock here, so we have to release */
mappedfile_unlock(mf);
done:
if (newfile) {
mappedfile_unlock(newfile);
mappedfile_close(&newfile);
}
free(newname);
return r;
}
/* if the mappedfile has no content, it needs to be initialised
* with some dummy data. Strictly it doesn't, but it makes
* reasoning about everything else easier if there's always a
* file */
static void inspect_filesystem(const char *mboxname, const char *partition,
strarray_t *found, strarray_t *bogus);
/*
 * Populate an empty activefile. Takes a write lock, re-checks that the
 * file is still empty, then writes the records found by inspecting the
 * filesystem, preceded by a fresh record for the default search tier.
 *
 * Nothing is reported to the caller: a failed lock or a failed write is
 * silently ignored.
 */
static void _activefile_init(const char *mboxname, const char *partition,
                             struct mappedfile *activefile)
{
    const int lockerr = mappedfile_writelock(activefile);
    const char *deftier = config_getstring(IMAPOPT_DEFAULTSEARCHTIER);

    /* failed to lock, doh */
    if (lockerr) return;

    /* did someone beat us to it? */
    if (mappedfile_size(activefile)) {
        mappedfile_unlock(activefile);
        return;
    }

    strarray_t *records = strarray_new();
    inspect_filesystem(mboxname, partition, records, NULL);

    /* always put the next item on the front so we don't write to any
     * existing databases */
    char *fresh = activefile_nextname(records, deftier);
    strarray_unshiftm(records, fresh);

    activefile_write(activefile, records);

    strarray_free(records);
}
/*
 * Open the activefile of the user owning mboxname, initialise it if it
 * is empty, take the requested lock and read its records.
 *
 * On success returns 0, with *activefile open and locked as requested
 * by type and *ret set to a newly allocated array of records.
 *
 * Returns IMAP_MAILBOX_NONEXISTENT if no activefile name can be derived
 * from mboxname, the mappedfile_open() error if the file can't be
 * opened, or IMAP_MAILBOX_LOCKED if the lock can't be taken.
 */
static int activefile_open(const char *mboxname, const char *partition,
                           struct mappedfile **activefile, enum LockType type,
                           strarray_t **ret)
{
    char *fname = activefile_fname(mboxname);
    int r;

    if (!fname) return IMAP_MAILBOX_NONEXISTENT;

    /* try to open the file, and populate with initial values if it's empty */
    r = mappedfile_open(activefile, fname, MAPPEDFILE_CREATE|MAPPEDFILE_RW);
    if (r) {
        xsyslog(LOG_ERR, "mappedfile_open failed",
                         "fname=<%s> error=<%s>",
                         fname, error_message(r));
        goto done;
    }
    if (!mappedfile_size(*activefile))
        _activefile_init(mboxname, partition, *activefile);

    /* take the requested lock (a better helper API would allow this to be
     * specified as part of the open call, but here's where we are) */
    if (type == AF_LOCK_WRITE) r = mappedfile_writelock(*activefile);
    else r = mappedfile_readlock(*activefile);
    if (r) {
        xsyslog(LOG_ERR, "mappedfile_readlock failed",
                         "fname=<%s> error=<%s>",
                         fname, error_message(r));
        r = IMAP_MAILBOX_LOCKED;
        goto done;
    }

    /* finally, read the contents */
    *ret = activefile_read(*activefile);

done:
    /* fname is still needed by the log messages above, so it must not
     * be released before this point */
    free(fname);
    return r;
}
/*
 * Check whether path holds a Xapian database by looking for its format
 * marker file: first "iamglass", then the old "iamchert".
 *
 * Returns 0 as soon as a non-empty marker file is found. Otherwise
 * returns the non-zero result of the last check with errno set; a zero
 * byte marker is the same as no database and yields -1 with ENOENT.
 */
static int xapstat(const char *path)
{
    static const char *const markers[] = { "/iamglass", "/iamchert" };
    const size_t nmarkers = sizeof(markers) / sizeof(markers[0]);
    int r = -1;
    size_t i;

    for (i = 0; i < nmarkers; i++) {
        struct stat sbuf;
        char *marker = strconcat(path, markers[i], (char *)NULL);

        r = stat(marker, &sbuf);
        free(marker);

        /* zero byte file is the same as no database */
        if (r == 0 && sbuf.st_size == 0) {
            r = -1;
            errno = ENOENT;
        }

        if (r == 0) return 0;
    }

    return r;
}
/* given an item from the activefile file, and the mboxname and partition
 * to calculate the user, find the path. If dostat is true, also stat the
 * path and return NULL if it doesn't exist (used for filtering databases
 * to actually search in).
 *
 * Returns a newly allocated path, or NULL if the item can't be parsed,
 * no base directory is known for its tier, or (with dostat) no database
 * exists at the path. Stat failures other than ENOENT are logged. */
static char *activefile_path(const char *mboxname, const char *part, const char *item, int dostat)
{
    char *basedir = NULL;
    struct buf buf = BUF_INITIALIZER;
    char *dest = NULL;
    struct activeitem *ai = activeitem_parse(item);

    /* a record without a ':' separator doesn't parse and so can't name
     * a database; bail out rather than dereferencing NULL */
    if (!ai) goto out;

    xapian_basedir(ai->tier, mboxname, part, NULL, &basedir);
    if (!basedir) goto out;

    buf_printf(&buf, "%s%s", basedir, XAPIAN_DIRNAME);
    free(basedir);

    /* generation zero has no suffix, any other is "xapian.<generation>" */
    if (ai->generation)
        buf_printf(&buf, ".%d", ai->generation);

    dest = buf_release(&buf);

    if (dostat) {
        if (xapstat(dest)) {
            if (errno != ENOENT)
                syslog(LOG_ERR, "IOERROR: can't read %s for search, check permissions: %m", dest);
            free(dest);
            dest = NULL;
        }
    }

out:
    buf_free(&buf);
    activeitem_free(ai);
    return dest;
}
/*
 * Convert an array of activefile items to an array of database paths.
 *
 * dostat controls which paths are checked for existence: 1 checks every
 * item, 2 checks every item but the first, any other value checks none.
 * Items whose path can't be determined or fails the check are dropped.
 *
 * If itemsptr is not NULL, *itemsptr receives a new array holding the
 * unparsed items that were kept, in the same order and cardinality as
 * the returned path array.
 */
static strarray_t *activefile_resolve(const char *mboxname, const char *part,
                                      const strarray_t *items, int dostat,
                                      strarray_t **itemsptr)
{
    strarray_t *paths = strarray_new();
    int idx;

    if (itemsptr) *itemsptr = strarray_new();

    for (idx = 0; idx < items->count; idx++) {
        int statthis = 0;
        if (dostat == 1)
            statthis = 1;
        else if (dostat == 2 && idx)
            statthis = 1;

        const char *item = strarray_nth(items, idx);
        char *dir = activefile_path(mboxname, part, item, statthis);
        if (!dir) continue;

        /* the path array takes ownership of dir */
        strarray_appendm(paths, dir);
        if (itemsptr) strarray_append(*itemsptr, item);
    }

    return paths;
}
/* ====================================================================== */
/* the filesystem layout is inspectable - this is useful for a couple of
* purposes - both rebuilding the activefile if it's lost, and also finding
* stale "missing" directories after a successful rebuild */
/* Callback state for inspect_check(), set up by inspect_filesystem() */
struct inspectrock {
const char *mboxname; /* mailbox name used to locate the base directory */
const char *partition; /* partition used to locate the base directory */
strarray_t *found; /* receives "tier:generation" records of found dirs */
strarray_t *bogus; /* if not NULL, receives paths of unexpected entries */
};
/*
* Callback for config_foreachoverflowstring(): for a config key that
* contains "searchpartition-", treat the text before that substring as
* a tier name and scan the tier's base directory for this mailbox.
*
* Each entry named "xapian" or "xapian.<digits>" is appended to
* ir->found as a "tier:generation" record. Any other entry not starting
* with '.' is appended to ir->bogus as a full path, if ir->bogus is set.
*/
static void inspect_check(const char *key, const char *val __attribute__((unused)), void *rock)
{
struct inspectrock *ir = (struct inspectrock *)rock;
const char *match = strstr(key, "searchpartition-");
char *basedir = NULL;
char *tier = NULL;
char *fname = NULL;
DIR *dirh = NULL;
struct dirent *de;
bit64 generation;
const char *rest;
/* not a search partition key */
if (!match) goto out;
tier = xstrndup(key, match - key);
if (xapian_basedir(tier, ir->mboxname, ir->partition, NULL, &basedir))
goto out;
dirh = opendir(basedir);
if (!dirh) goto out;
while ((de = readdir(dirh))) {
generation = 0;
/* skip ".", ".." and any other dot entry */
if (de->d_name[0] == '.') continue;
/* fname from the previous iteration is only still set here if it
* was not handed over to ir->bogus */
free(fname);
fname = strconcat(basedir, "/", de->d_name, (char *)NULL);
/* only 'xapian' directories allowed */
if (strncmp(de->d_name, "xapian", 6)) goto bogus;
/* xapian by itself is generation zero */
if (de->d_name[6]) {
/* otherwise it's xapian.generation */
if (de->d_name[6] != '.') goto bogus;
/* unless it exactly matches digits, it's either got .NEW on the end or is
* likewise bogus, track it */
if (parsenum(de->d_name + 7, &rest, strlen(de->d_name)-7, &generation) || rest[0])
goto bogus;
}
/* found one! */
strarray_appendm(ir->found, activeitem_generate(tier, (int)generation));
continue;
bogus:
if (ir->bogus) {
/* the array takes ownership of fname */
strarray_appendm(ir->bogus, fname);
fname = NULL;
}
}
out:
if (dirh) closedir(dirh);
free(fname);
free(basedir);
free(tier);
}
/*
 * Scan the search directories of all configured search partitions for
 * mboxname on partition. Records of database directories are appended
 * to found; paths of unexpected entries are appended to bogus, unless
 * bogus is NULL.
 */
static void inspect_filesystem(const char *mboxname, const char *partition,
                               strarray_t *found, strarray_t *bogus)
{
    struct inspectrock rock = {
        .mboxname = mboxname,
        .partition = partition,
        .found = found,
        .bogus = bogus,
    };

    config_foreachoverflowstring(inspect_check, &rock);
}
/* ====================================================================== */
/* The "indexed database" contains information about which cyrus messages
* are indexed in this Xapian directory. The keys are mailbox.uidvalidity
* and the values are "version sequence", where sequence is an IMAP-style
* sequence of UIDs. This allows squatter to quickly determine which
* messages are not yet indexed in any active database. */
/*
 * Parse a mapped database value of the form "<version> <payload>" into
 * a sequence set. Both formats are understood:
 *   version 1: the payload is a max UID, parsed as the range "1:<max>"
 *   version 2: the payload is an IMAP-style sequence of UIDs
 * Returns NULL if the version number can't be parsed, isn't followed by
 * a space, or is unknown; otherwise returns the seqset_parse() result.
 */
static struct seqset *parse_indexed(const char *data, size_t datalen)
{
    const char *rest;
    bit64 version;

    if (parsenum(data, &rest, datalen, &version)) return NULL;

    /* exactly one space separates the version from the payload */
    if (*rest != ' ') return NULL;
    rest++;

    const size_t payload_len = datalen - (rest - data);

    if (version == 1) {
        char range[20];
        snprintf(range, 20, "1:%.*s", (int)payload_len, rest);
        return seqset_parse(range, NULL, 0);
    }

    if (version == 2) {
        /* the mapped value isn't NUL terminated, so parse a copy */
        char *payload = xstrndup(rest, payload_len);
        struct seqset *seq = seqset_parse(payload, NULL, 0);
        free(payload);
        return seq;
    }

    return NULL;
}
/*
 * cyrusdb_foreach() callback that stops the iteration at the very first
 * entry by returning CYRUSDB_DONE. The rock points to an int verbosity
 * level; above 1 the key that was found is logged.
 */
static int tierexists_cb(void *rock, const char *key, size_t keylen,
                         const char *data __attribute__((unused)),
                         size_t datalen __attribute__((unused)))
{
    const int verbosity = *(const int *) rock;

    if (verbosity > 1) {
        syslog(LOG_INFO, "tierexists_cb: found tier key %.*s", (int) keylen, key);
    }

    return CYRUSDB_DONE;
}
/* Callback state for cachetier_cb(). Initialised positionally in
* cache_indexed(), so the field order matters. */
struct cachetier_rock {
struct buf *buf; /* holds the key prefix; keys are appended temporarily */
struct db *dst_db; /* database the prefixed entries are stored into */
struct txn *txn; /* transaction handle passed to each cyrusdb_store() */
};
/*
 * cyrusdb_foreach() callback that copies one entry into the destination
 * database of the rock, with the rock's buffer content prepended to the
 * key. Entries whose key starts with '#' are skipped. The buffer is
 * restored to the bare prefix before returning.
 *
 * Returns 0 on success or skip, else the cyrusdb_store() error.
 */
static int cachetier_cb(void *rock, const char *key, size_t keylen,
                        const char *data, size_t datalen)
{
    /* Ignore cache entries */
    if (key[0] == '#') return 0;

    struct cachetier_rock *ctx = rock;
    const size_t saved_len = buf_len(ctx->buf);

    buf_appendmap(ctx->buf, key, keylen);
    int r = cyrusdb_store(ctx->dst_db, buf_base(ctx->buf), buf_len(ctx->buf),
                          data, datalen, &ctx->txn);
    buf_truncate(ctx->buf, saved_len);

    if (r) {
        syslog(LOG_ERR, "cachetier_cb: could not save key %.*s for tier %s: %s",
               (int) keylen, key, buf_cstring(ctx->buf), cyrusdb_strerror(r));
    }

    return r;
}
/*
* Merge the indexed.db of all search tiers activetiers[1..n] into the
* indexed.db of the top tier.
*
* Any entries in indexed.dbs located at activedirs[1..n] are cached into
* the indexed.db located at activedirs[0] (created if not exists), using
* a special prefix:
*
* The keys from merged entries are formatted as
*
* '#c'.<tiername:tiergen>'#'<key>
*
* and any keys starting with '#' are ignored during the merge.
*
* activedirs and activetiers must have the same number of entries.
* verbose controls informational logging.
*
* Returns 0 on success or a cyrusdb error code.
*/
static int cache_indexed(const strarray_t *activedirs,
const strarray_t *activetiers,
int verbose)
{
struct db *src_db = NULL;
struct db *dst_db = NULL;
struct buf path = BUF_INITIALIZER;
struct buf key = BUF_INITIALIZER;
int r = 0;
int i;
assert(activedirs->count == activetiers->count);
if (!activedirs->count) {
return 0;
}
/* Open destination database */
buf_printf(&path, "%s%s", strarray_nth(activedirs, 0), INDEXEDDB_FNAME);
r = cyrusdb_open(config_getstring(IMAPOPT_SEARCH_INDEXED_DB),
buf_cstring(&path), CYRUSDB_CREATE, &dst_db);
if (r) {
syslog(LOG_ERR, "cache_indexed: can't open destination db %s: %s",
buf_cstring(&path), cyrusdb_strerror(r));
goto out;
}
for (i = 1; i < activedirs->count; i++) {
/* Reset state */
if (src_db) {
cyrusdb_close(src_db);
}
src_db = NULL;
buf_reset(&path);
buf_reset(&key);
/* Check if the tier is already merged. We assume a tier is merged
* if at least one entry with its tier prefix exists. */
buf_printf(&key, "#c.%s#", strarray_nth(activetiers, i));
/* tierexists_cb returns CYRUSDB_DONE on the first entry it sees */
r = cyrusdb_foreach(dst_db, buf_base(&key), buf_len(&key),
NULL, tierexists_cb, &verbose, NULL);
if (r == CYRUSDB_DONE) {
if (verbose) {
syslog(LOG_INFO, "cache_indexed: tier %s is already merged",
strarray_nth(activetiers, i));
}
r = 0;
continue;
}
else if (r) goto out;
/* Open source database */
buf_printf(&path, "%s%s", strarray_nth(activedirs, i), INDEXEDDB_FNAME);
r = cyrusdb_open(config_getstring(IMAPOPT_SEARCH_INDEXED_DB),
buf_cstring(&path), 0, &src_db);
if (r == CYRUSDB_NOTFOUND) {
/* a tier without an indexed.db has nothing to merge */
if (verbose) {
syslog(LOG_INFO, "cache_indexed: no db found at %s",
buf_cstring(&path));
}
r = 0;
continue;
}
else if (r) goto out;
/* Merge the entries from source into destination. The first
* store in the callback will create a write transaction. */
/* key still holds the "#c.<tier>#" prefix built above; the callback
* appends each source key to it */
struct cachetier_rock rock = { &key, dst_db, NULL };
r = cyrusdb_foreach(src_db, NULL, 0, NULL, cachetier_cb, &rock, NULL);
cyrusdb_close(src_db);
src_db = NULL;
if (r) {
cyrusdb_abort(dst_db, rock.txn);
goto out;
}
/* NOTE(review): rock.txn is still NULL here if the source had no
* entries to copy -- confirm cyrusdb_commit accepts that */
cyrusdb_commit(dst_db, rock.txn);
}
out:
if (dst_db) {
cyrusdb_close(dst_db);
}
if (src_db) {
cyrusdb_close(src_db);
}
buf_free(&key);
buf_free(&path);
return r;
}
/*
* Read the indexed UIDs sequence for mailbox mboxname
* from the activetiers located at activedirs and join
* them into a single result res.
*
* If do_cache is true, any activetiers[1..n] that are not
* already cached in the top tier (activetiers[0]) are
* cached before looking up their sequence sets in the
* cache. Caller must guarantee an exclusive write lock on
* activetier[0].
*
* If do_cache is zero, the sequence sets are constructed
* by looking up first any already cached indexes in the
* top tier, followed by looking up entries in any non-
* cached activetiers[1..n]. Since no writes are done,
* this operation is safe without exclusively locking
* the top tier.
*
* Returns 0 on success or a cyrusdb error code.
*/
static int read_indexed(const strarray_t *activedirs,
const strarray_t *activetiers,
const char *mboxname,
uint32_t uidvalidity,
struct seqset *res,
int do_cache,
int verbose)
{
struct db *db = NULL;
struct db *srcdb = NULL;
struct buf path = BUF_INITIALIZER;
struct buf key = BUF_INITIALIZER;
const char *data = NULL;
size_t datalen = 0;
int r = 0;
int i;
assert(activedirs->count == activetiers->count);
if (!activedirs->count) {
return 0;
}
if (do_cache) {
/* Merge search tiers first */
r = cache_indexed(activedirs, activetiers, verbose);
if (r) return r;
}
/* Open database */
buf_printf(&path, "%s%s", strarray_nth(activedirs, 0), INDEXEDDB_FNAME);
r = cyrusdb_open(config_getstring(IMAPOPT_SEARCH_INDEXED_DB),
buf_cstring(&path), CYRUSDB_CREATE, &db);
if (r) {
syslog(LOG_ERR, "read_indexed: can't open db %s: %s",
buf_cstring(&path), cyrusdb_strerror(r));
goto out;
}
/* Lookup entry in top tier */
buf_printf(&key, "%s.%u", mboxname, uidvalidity);
r = cyrusdb_fetch(db, key.s, key.len, &data, &datalen, (struct txn **)NULL);
if (r && r != CYRUSDB_NOTFOUND) {
goto out;
}
else if (!r) {
/* an unparsable value is silently skipped */
struct seqset *seq = parse_indexed(data, datalen);
if (seq) {
seqset_join(res, seq);
seqset_free(seq);
if (verbose > 1) {
syslog(LOG_INFO, "read_indexed: top tier seq=%.*s", (int)datalen, data);
}
}
}
/* a missing top tier entry is not an error */
r = 0;
/* Lookup entries from lower tiers */
for (i = 1; i < activedirs->count; i++) {
buf_reset(&key);
if (srcdb) {
cyrusdb_close(srcdb);
srcdb = NULL;
}
/* First look in the cached tiers in the top tier database. */
buf_printf(&key, "#c.%s#%s.%u", strarray_nth(activetiers, i), mboxname, uidvalidity);
r = cyrusdb_fetch(db, key.s, key.len, &data, &datalen, (struct txn **)NULL);
/* Fall back to the lower tiers if we haven't merged all tiers. */
if (r == CYRUSDB_NOTFOUND && !do_cache) {
buf_reset(&path);
buf_printf(&path, "%s%s", strarray_nth(activedirs, i), INDEXEDDB_FNAME);
r = cyrusdb_open(config_getstring(IMAPOPT_SEARCH_INDEXED_DB),
buf_cstring(&path), 0, &srcdb);
if (r) {
/* NOTE(review): unlike cache_indexed, a missing lower tier db
* (CYRUSDB_NOTFOUND) is treated as an error here */
syslog(LOG_ERR, "read_indexed: can't open db %s: %s",
buf_cstring(&path), cyrusdb_strerror(r));
goto out;
}
/* the lower tier stores the entry under its unprefixed key */
buf_reset(&key);
buf_printf(&key, "%s.%u", mboxname, uidvalidity);
r = cyrusdb_fetch(srcdb, key.s, key.len, &data, &datalen, (struct txn **)NULL);
}
if (r && r != CYRUSDB_NOTFOUND) {
goto out;
}
/* No entry found */
if (r == CYRUSDB_NOTFOUND) {
r = 0;
continue;
}
/* Parse and join the sequence sets */
struct seqset *seq = parse_indexed(data, datalen);
if (seq) {
seqset_join(res, seq);
seqset_free(seq);
if (verbose > 1) {
syslog(LOG_INFO, "read_indexed: tier %s seq=%.*s",
strarray_nth(activetiers, i), (int)datalen, data);
}
}
}
out:
if (db) {
cyrusdb_close(db);
}
if (srcdb) {
cyrusdb_close(srcdb);
}
buf_free(&key);
buf_free(&path);
return r;
}
/* store the given sequence into the already opened cyrus db
* with the given key. If there is an existing sequence in
* the DB, then join this sequence to it, so incremental
* indexing does what you would expect. */
static int store_indexed(struct db *db, struct txn **tid,
const char *key, size_t keylen,
const struct seqset *val)
{
struct buf data = BUF_INITIALIZER;
char *str = NULL;
int r;
const char *olddata = NULL;
size_t oldlen = 0;
r = cyrusdb_fetch(db, key, keylen, &olddata, &oldlen, tid);
if (r == CYRUSDB_NOTFOUND) {
str = seqset_cstring(val);
}
else if (r) return r;
else {
struct seqset *seq = parse_indexed(olddata, oldlen);
if (seq) {
seqset_join(seq, val);
str = seqset_cstring(seq);
seqset_free(seq);
}
else {
str = seqset_cstring(val);
}
}
if (!str) return 0;
buf_printf(&data, "%u %s", INDEXEDDB_VERSION, str);
r = cyrusdb_store(db, key, keylen, data.s, data.len, tid);
buf_free(&data);
free(str);
return r;
}
/* Given the directory of a xapian database which has just had
* messages indexed into it, add the sequence of UIDs to the
* record for the given mailbox and uidvalidity */
static int write_indexed(const char *dir,
const char *mboxname,
uint32_t uidvalidity,
struct seqset *seq,
int verbose)
{
struct buf path = BUF_INITIALIZER;
struct buf key = BUF_INITIALIZER;
struct db *db = NULL;
struct txn *txn = NULL;
int r = 0;
buf_reset(&path);
buf_printf(&path, "%s%s", dir, INDEXEDDB_FNAME);
if (verbose) {
char *str = seqset_cstring(seq);
syslog(LOG_INFO, "write_indexed db=%s mailbox=%s uidvalidity=%u uids=%s",
buf_cstring(&path), mboxname, uidvalidity, str);
free(str);
}
buf_printf(&key, "%s.%u", mboxname, uidvalidity);
r = cyrusdb_open(config_getstring(IMAPOPT_SEARCH_INDEXED_DB),
buf_cstring(&path), CYRUSDB_CREATE, &db);
if (r) goto out;
r = store_indexed(db, &txn, key.s, key.len, seq);
if (!r)
r = cyrusdb_commit(db, txn);
else
cyrusdb_abort(db, txn);
out:
if (db) cyrusdb_close(db);
buf_free(&path);
buf_free(&key);
return r;
}
/* ====================================================================== */
/*
 * Copy the content of fromdir into todir by running RSYNC_BIN with the
 * "-a" option. A '/' is appended to the source directory before it is
 * passed on. Returns the result of run_command().
 */
static int copy_files(const char *fromdir, const char *todir)
{
    char *source = strconcat(fromdir, "/", (char *)NULL);
    const int rc = run_command(RSYNC_BIN, "-a", source, todir, (char *)NULL);

    free(source);

    return rc;
}
/* ====================================================================== */
/* shared lock for xapian dbs: everything xapiandb_lock_open() acquires
* and xapiandb_lock_release() gives back */
struct xapiandb_lock {
struct mappedfile *activefile; /* the opened, read-locked activefile */
struct mboxlock *namelock; /* shared per-user name lock */
strarray_t *activedirs; /* paths of the existing active databases */
strarray_t *activetiers; /* activefile records matching activedirs */
xapian_db_t *db; /* handle on the opened databases, if any */
};
/* positional initializer: must list one NULL per field above */
#define XAPIANDB_LOCK_INITIALIZER { NULL, NULL, NULL, NULL, NULL }
/*
 * Give back everything held by lock and zero the structure, so that it
 * can be passed to xapiandb_lock_open() again. Members that were never
 * acquired (still NULL) are skipped.
 */
static void xapiandb_lock_release(struct xapiandb_lock *lock)
{
    if (lock->db != NULL) {
        xapian_db_close(lock->db);
    }

    /* now that the databases are closed, it's safe to unlock
     * the active file */
    if (lock->activefile != NULL) {
        mappedfile_unlock(lock->activefile);
        mappedfile_close(&lock->activefile);
    }

    if (lock->namelock != NULL) {
        mboxname_release(&lock->namelock);
    }

    strarray_free(lock->activedirs);
    strarray_free(lock->activetiers);

    memset(lock, 0, sizeof(*lock));
}
/*
* This function builds a lockfilename of the format:
* $XAPIAN$<userid>
* example:
* If the userid is `foo@bar.com` then the lockfilename is
* $XAPIAN$foo@bar^com
*
* It replaces '.' in a string with a '^' into a struct buf
*/
static char *xapiandb_namelock_fname_from_userid(const char *userid)
{
const char *p;
struct buf buf = BUF_INITIALIZER;
buf_setcstr(&buf, XAPIAN_NAME_LOCK_PREFIX);
for (p = userid; *p; p++) {
switch(*p) {
case '.':
buf_putc(&buf, '^');
break;
default:
buf_putc(&buf, *p);
break;
}
}
return buf_release(&buf);
}
/*
* Acquire the shared Xapian lock state for the user owning mailbox:
* the shared name lock, a read lock on the activefile, the list of
* existing database directories with their activefile records, and an
* open handle on those databases.
*
* lock must be empty on entry (all members asserted NULL below). On
* error, anything acquired so far is released again and the error is
* returned. If the mailbox has no userid, nothing is acquired and 0 is
* returned. lock->db also stays NULL if no database directory exists.
*/
static int xapiandb_lock_open(struct mailbox *mailbox, struct xapiandb_lock *lock)
{
strarray_t *active = NULL;
char *namelock_fname = NULL;
char *userid = NULL;
int r = 0;
assert(lock->namelock == NULL);
assert(lock->activefile == NULL);
assert(lock->activedirs == NULL);
assert(lock->activetiers == NULL);
/* Do nothing if there is no userid */
userid = mboxname_to_userid(mailbox->name);
if (!userid) goto out;
namelock_fname = xapiandb_namelock_fname_from_userid(userid);
/* Get a shared lock */
r = mboxname_lock(namelock_fname, &lock->namelock, LOCK_SHARED);
if (r) {
syslog(LOG_ERR, "Could not acquire shared namelock on %s\n",
namelock_fname);
goto out;
}
/* need to hold a read-only lock on the activefile file
* to ensure no databases are deleted out from under us */
r = activefile_open(mailbox->name, mailbox->part, &lock->activefile, AF_LOCK_READ, &active);
if (r) goto out;
/* only try to open directories with databases in them */
lock->activedirs = activefile_resolve(mailbox->name, mailbox->part, active,
/*dostat*/1, &lock->activetiers);
/* open the databases */
if (lock->activedirs->count) {
/* NOTE(review): passes the string array's raw data as the path list;
* presumably xapian_db_open expects it NULL-terminated -- confirm */
const char **paths = (const char **) lock->activedirs->data;
r = xapian_db_open(paths, &lock->db);
if (r) goto out;
}
out:
/* on failure hand back whatever was acquired so far */
if (r) xapiandb_lock_release(lock);
strarray_free(active);
free(namelock_fname);
free(userid);
return r;
}
/* ====================================================================== */
/* operator value private to this file, used in opnode.op next to the
* SEARCH_OP_* and SEARCH_PART_* values */
#define XAPIAN_SEARCH_OP_DOCTYPE 1025
/* A node of the query tree. Siblings are chained through 'next', the
* first child of a node hangs off 'children'. */
struct opnode
{
int op; /* SEARCH_OP_* or SEARCH_PART_* or XAPIAN_SEARCH_OP_* */
strarray_t *items; /* owned copy of the node's string arguments */
struct opnode *next; /* next sibling, or NULL */
struct opnode *children; /* first child, or NULL */
};
/* State of a Xapian search builder.
* NOTE(review): the code using this struct is outside this view; the
* field notes below only restate what the declarations show. */
typedef struct xapian_builder xapian_builder_t;
struct xapian_builder {
search_builder_t super; /* first member; presumably the generic builder interface */
struct xapiandb_lock lock; /* shared lock state on the user's databases */
struct seqset *indexed;
struct mailbox *mailbox;
int opts;
struct opnode *root; /* root of the query tree */
ptrarray_t stack; /* points to opnode* */
/* result callbacks and the rock passed along -- semantics TODO confirm */
int (*proc)(const char *, uint32_t, uint32_t, const strarray_t *, void *);
int (*proc_guidsearch)(const conv_guidrec_t*,size_t,void*);
void *rock;
};
/*
 * Allocate a query tree node for operator op. The node stores the
 * result of strarray_dup(arg) as its items and starts out without
 * siblings or children.
 */
static struct opnode *opnode_new(int op, const strarray_t *arg)
{
    struct opnode *node = xzmalloc(sizeof(struct opnode));

    node->items = strarray_dup(arg);
    node->op = op;

    return node;
}
/*
 * Free the node on together with all of its descendants. Siblings of
 * on itself are not touched.
 */
static void opnode_delete(struct opnode *on)
{
    struct opnode *child = on->children;

    while (child != NULL) {
        /* remember the sibling before the child is freed */
        struct opnode *sibling = child->next;
        opnode_delete(child);
        child = sibling;
    }

    strarray_free(on->items);
    free(on);
}
/* Unlink `child` from `parent`'s child list without freeing it.
 * Does nothing if `child` is not in the list. */
static void opnode_detach_child(struct opnode *parent, struct opnode *child)
{
    struct opnode **link = &parent->children;

    /* walk the links until one points at child or the list ends */
    while (*link && *link != child)
        link = &(*link)->next;

    if (!*link) return;

    *link = child->next;
    child->next = NULL;
}
/* Append `child` as the last child of `parent`. */
static void opnode_append_child(struct opnode *parent, struct opnode *child)
{
    struct opnode **link = &parent->children;

    /* find the terminating NULL link of the child list */
    while (*link)
        link = &(*link)->next;

    *link = child;
    child->next = NULL;
}
/* Link `child` into a sibling list directly behind `after`.
 * `parent` is accepted for symmetry but not used. */
static void opnode_insert_child(struct opnode *parent __attribute__((unused)),
                                struct opnode *after,
                                struct opnode *child)
{
    struct opnode **slot = &after->next;

    child->next = *slot;
    *slot = child;
}
/* Return a newly allocated copy of the tree rooted at `on`
 * (NULL for a NULL input). The siblings of `on` are not copied. */
static struct opnode *opnode_deep_copy(const struct opnode *on)
{
    struct opnode *copy;
    const struct opnode *kid;

    if (on == NULL) return NULL;

    copy = opnode_new(on->op, on->items);

    kid = on->children;
    while (kid) {
        opnode_append_child(copy, opnode_deep_copy(kid));
        kid = kid->next;
    }

    return copy;
}
/* Append a parenthesised textual form of the tree rooted at `on` to
 * `buf` and return the buffer contents as a C string. A NULL tree
 * leaves `buf` untouched and returns "". */
static const char *opnode_serialise(struct buf *buf, const struct opnode *on)
{
    const struct opnode *kid;
    int idx;

    if (on == NULL) return "";

    buf_putc(buf, '(');

    /* operator name */
    if (on->op < SEARCH_NUM_PARTS) {
        buf_appendcstr(buf, "MATCH");
        buf_putc(buf, ' ');
        const char *partname = search_part_as_string(on->op);
        buf_appendcstr(buf, partname ? partname : "ANY");
    }
    else {
        const char *label;

        if (on->op == SEARCH_OP_AND) label = "AND";
        else if (on->op == SEARCH_OP_OR) label = "OR";
        else if (on->op == SEARCH_OP_NOT) label = "NOT";
        else if (on->op == SEARCH_OP_TRUE) label = "TRUE";
        else if (on->op == SEARCH_OP_FALSE) label = "FALSE";
        else if (on->op == XAPIAN_SEARCH_OP_DOCTYPE) label = "DOCTYPE";
        else label = "UNKNOWN";

        buf_appendcstr(buf, label);
    }

    /* quoted, space-separated item list */
    if (on->items) {
        buf_putc(buf, ' ');
        buf_putc(buf, '(');
        for (idx = 0; idx < strarray_size(on->items); idx++) {
            if (idx > 0) buf_putc(buf, ' ');
            buf_putc(buf, '"');
            buf_appendcstr(buf, strarray_nth(on->items, idx));
            buf_putc(buf, '"');
        }
        buf_putc(buf, ')');
    }

    /* children, each serialised recursively */
    if (on->children) {
        buf_putc(buf, ' ');
        kid = on->children;
        while (kid) {
            opnode_serialise(buf, kid);
            kid = kid->next;
        }
    }

    buf_putc(buf, ')');

    return buf_cstring(buf);
}
/* Simplify the boolean structure below `on`, bottom-up:
 *  - an AND/OR/NOT node left without children is removed from `parent`;
 *  - an AND/OR node with exactly one child is replaced by that child.
 * Nothing is removed or replaced when `parent` is NULL (the root). */
static void optimise_nodes(struct opnode *parent, struct opnode *on)
{
    struct opnode *kid;

    if (on->op != SEARCH_OP_NOT &&
        on->op != SEARCH_OP_OR &&
        on->op != SEARCH_OP_AND) {
        return;
    }

    /* children first; the sibling is captured up front because the
     * recursive call may unlink or free the current child */
    kid = on->children;
    while (kid) {
        struct opnode *sibling = kid->next;
        optimise_nodes(on, kid);
        kid = sibling;
    }

    if (!parent) return;

    if (!on->children) {
        /* empty node - remove it */
        opnode_detach_child(parent, on);
        opnode_delete(on);
        return;
    }

    if (on->op == SEARCH_OP_NOT || on->children->next) return;

    /* logical AND or OR with only one child - replace
     * the node with its child */
    struct opnode *only = on->children;
    opnode_detach_child(on, only);
    opnode_insert_child(parent, on, only);
    opnode_detach_child(parent, on);
    opnode_delete(on);
}
/* Translate the expression tree rooted at `on` into a Xapian query for
 * `db`. A NULL tree yields a match-all query. The result can be NULL,
 * e.g. for a NOT node without children or an AND/OR node for which no
 * child produced a query. `opts` is consulted for
 * SEARCH_ATTACHMENTS_IN_ANY when expanding SEARCH_PART_ANY. */
static xapian_query_t *opnode_to_query(const xapian_db_t *db, struct opnode *on, int opts)
{
struct opnode *child;
xapian_query_t *qq = NULL;
int i, j;
/* scratch list of sub-queries handed to xapian_query_new_compound();
 * only the array itself is released at the end of this function */
ptrarray_t childqueries = PTRARRAY_INITIALIZER;
if (!on) return xapian_query_new_matchall(db);
switch (on->op) {
case SEARCH_OP_TRUE:
qq = xapian_query_new_matchall(db);
break;
case SEARCH_OP_FALSE:
/* FALSE is expressed as NOT(match-all) */
qq = xapian_query_new_not(db, xapian_query_new_matchall(db));
break;
case SEARCH_OP_NOT:
/* only the first child is translated here */
if (on->children)
qq = xapian_query_new_not(db, opnode_to_query(db, on->children, opts));
break;
case SEARCH_OP_OR:
case SEARCH_OP_AND:
for (child = on->children ; child ; child = child->next) {
qq = opnode_to_query(db, child, opts);
if (qq) ptrarray_push(&childqueries, qq);
}
/* qq was only a loop temporary; the result stays NULL if no child
 * produced a query */
qq = NULL;
if (childqueries.count)
qq = xapian_query_new_compound(db, (on->op == SEARCH_OP_OR),
(xapian_query_t **)childqueries.data,
childqueries.count);
break;
case SEARCH_PART_ANY:
/* Xapian does not have a convenient way of search for "any
* field"; instead we fake it by explicitly searching for
* all of the available prefixes */
for (i = 0 ; i < SEARCH_NUM_PARTS ; i++) {
switch (i) {
/* these parts are never included in an "any" search */
case SEARCH_PART_LISTID:
case SEARCH_PART_TYPE:
case SEARCH_PART_LANGUAGE:
case SEARCH_PART_PRIORITY:
continue;
case SEARCH_PART_ATTACHMENTBODY:
/* attachment bodies are included only when requested in opts */
if (!(opts & SEARCH_ATTACHMENTS_IN_ANY)) {
continue;
}
// fallthrough
}
for (j = 0; j < strarray_size(on->items); j++) {
void *q = xapian_query_new_match(db, i, strarray_nth(on->items, j));
if (q) ptrarray_push(&childqueries, q);
}
}
qq = xapian_query_new_compound(db, /*is_or*/1,
(xapian_query_t **)childqueries.data,
childqueries.count);
break;
case XAPIAN_SEARCH_OP_DOCTYPE:
assert(on->items != NULL && strarray_size(on->items));
/* the doctype is the first character of the first item */
const char *val = strarray_nth(on->items, 0);
qq = xapian_query_new_has_doctype(db, val[0], NULL);
break;
default:
/* any other op is treated as a search part with leaf values */
assert(on->items != NULL);
assert(on->children == NULL);
if (strarray_size(on->items) > 1) {
/* several values for the same part are OR-ed together */
for (j = 0; j < strarray_size(on->items); j++) {
void *q = xapian_query_new_match(db, on->op, strarray_nth(on->items, j));
if (q) ptrarray_push(&childqueries, q);
}
qq = xapian_query_new_compound(db, /*is_or*/1,
(xapian_query_t **)childqueries.data,
childqueries.count);
}
else {
qq = xapian_query_new_match(db, on->op, strarray_nth(on->items, 0));
}
break;
}
ptrarray_fini(&childqueries);
return qq;
}
/* Return 1 if `on` has one of the shapes treated as a DNF clause:
 * a MATCH, TRUE or FALSE node; a NOT whose children are all MATCH nodes;
 * or an AND whose children are MATCH/TRUE/FALSE nodes or NOT nodes over
 * MATCH nodes or ops at or above XAPIAN_SEARCH_OP_DOCTYPE.
 * Return 0 otherwise. */
static int is_dnfclause(const struct opnode *on)
{
    const struct opnode *kid;
    const struct opnode *grandkid;

    /* a lone MATCH or constant is trivially a clause */
    if (on->op < SEARCH_NUM_PARTS ||
        on->op == SEARCH_OP_TRUE ||
        on->op == SEARCH_OP_FALSE) {
        return 1;
    }

    switch (on->op) {
    case SEARCH_OP_NOT:
        for (kid = on->children; kid; kid = kid->next) {
            if (kid->op >= SEARCH_NUM_PARTS) return 0;
        }
        return 1;

    case SEARCH_OP_AND:
        for (kid = on->children; kid; kid = kid->next) {
            if (kid->op < SEARCH_NUM_PARTS ||
                kid->op == SEARCH_OP_TRUE ||
                kid->op == SEARCH_OP_FALSE) {
                continue;
            }
            if (kid->op != SEARCH_OP_NOT) return 0;

            /* NOT inside AND: reject boolean operators below it */
            for (grandkid = kid->children; grandkid; grandkid = grandkid->next) {
                if (grandkid->op >= SEARCH_NUM_PARTS &&
                    grandkid->op < XAPIAN_SEARCH_OP_DOCTYPE) {
                    return 0;
                }
            }
        }
        return 1;

    default:
        return 0;
    }
}
/* Return 1 if `on` is an OR node whose descendants are only MATCH nodes,
 * ops at or above XAPIAN_SEARCH_OP_DOCTYPE, or nested OR nodes of the
 * same kind; return 0 otherwise. */
static int is_orclause(const struct opnode *on)
{
    const struct opnode *kid;

    if (on->op != SEARCH_OP_OR) return 0;

    for (kid = on->children; kid; kid = kid->next) {
        // A MATCH or our own extensions are OK.
        int is_leaf = kid->op < SEARCH_NUM_PARTS ||
                      kid->op >= XAPIAN_SEARCH_OP_DOCTYPE;
        if (is_leaf) continue;

        // Anything else must itself be a pure OR subclause; the
        // recursive call rejects non-OR operators.
        if (!is_orclause(kid)) return 0;
    }

    return 1;
}
static int normalise_dnfclause(const struct opnode *expr, struct opnode **normalised)
{
/* Normalise DNF clause expr to an AND clause, with each child
* expression being a part MATCH or single-valued NOT. */
struct opnode *root = opnode_deep_copy(expr);
if (root->op == SEARCH_OP_NOT) {
/* Convert NOT(x,y) to AND(NOT(x),NOT(y)) */
struct opnode *newroot = opnode_new(SEARCH_OP_AND, NULL);
while (root->children) {
struct opnode *child = root->children;
opnode_detach_child(root, child);
struct opnode *notchild = opnode_new(SEARCH_OP_NOT, NULL);
opnode_append_child(notchild, child);
opnode_append_child(newroot, notchild);
}
opnode_delete(root);
root = newroot;
}
else if (root->op < SEARCH_NUM_PARTS) {
/* Convert MATCH to AND(MATCH) */
struct opnode *newroot = opnode_new(SEARCH_OP_AND, NULL);
opnode_append_child(newroot, root);
root = newroot;
}
struct opnode *child = root->children;
while (child) {
/* Convert AND(NOT(x,y)) to AND(NOT(x),NOT(y)) */
if (child->op != SEARCH_OP_NOT) {
child = child->next;
continue;
}
if (!child->children || !child->children->next) {
child = child->next;
continue;
}
while (child->children) {
struct opnode *grandchild = child->children;
opnode_detach_child(child, grandchild);
struct opnode *notgrandchild = opnode_new(SEARCH_OP_NOT, NULL);
opnode_append_child(notgrandchild, grandchild);
opnode_append_child(root, notgrandchild);
}
struct opnode *next = child->next;
opnode_detach_child(root, child);
opnode_delete(child);
child = next;
}
*normalised = root;
return 0;
}
-static int split_legacyv4_query(xapian_builder_t *bb,
- const struct opnode *expr,
- ptrarray_t *clauses)
-{
- struct opnode *root = NULL;
- int r = normalise_dnfclause(expr, &root);
- if (r) return r;
-
- assert(root->op == SEARCH_OP_AND);
-
- /* Split clause into subclauses for top-level guid
- * search, body part search and any part. */
-
- struct opnode *qguid = opnode_new(SEARCH_OP_AND, NULL);
- struct opnode *qpart = opnode_new(SEARCH_OP_AND, NULL);
- struct opnode *qany = opnode_new(SEARCH_OP_AND, NULL);
-
- struct opnode *child = root->children;
- while (child) {
- struct opnode *next = child->next;
- opnode_detach_child(root, child);
-
- int part = child->op == SEARCH_OP_NOT ? child->children->op : child->op;
-
- if (child->op == SEARCH_OP_NOT && part == SEARCH_PART_ANY) {
- struct opnode *clone = opnode_deep_copy(child);
- opnode_append_child(qguid, child);
- opnode_append_child(qpart, clone);
- }
- else {
- switch (part) {
- /* Map search parts to their document type queries.
- * It's annoying that this mapping must be kept in
- * sync with the search part indexing in index.c */
- case SEARCH_PART_ANY:
- opnode_append_child(qany, child);
- break;
- case SEARCH_PART_FROM:
- case SEARCH_PART_TO:
- case SEARCH_PART_CC:
- case SEARCH_PART_BCC:
- case SEARCH_PART_SUBJECT:
- case SEARCH_PART_LISTID:
- case SEARCH_PART_TYPE:
- case SEARCH_PART_HEADERS:
- case SEARCH_PART_ATTACHMENTNAME:
- opnode_append_child(qguid, child);
- break;
- case SEARCH_PART_BODY:
- case SEARCH_PART_LOCATION:
- case SEARCH_PART_ATTACHMENTBODY:
- opnode_append_child(qpart, child);
- break;
- default:
- syslog(LOG_ERR, "search_xapian: ignoring unexpected part: %s",
- search_part_as_string(part));
- opnode_delete(child);
- child = NULL;
- }
- }
- child = next;
- }
-
- /* Create Xapian queries for each subquery */
-
- if (qguid->children) {
- optimise_nodes(NULL, qguid);
- xapian_query_t *xq = opnode_to_query(bb->lock.db, qguid, bb->opts);
- if (xq) xq = xapian_query_new_has_doctype(bb->lock.db, XAPIAN_WRAP_DOCTYPE_MSG, xq);
- if (xq) ptrarray_append(clauses, xq);
- }
- if (qpart->children) {
- optimise_nodes(NULL, qpart);
- xapian_query_t *xq = opnode_to_query(bb->lock.db, qpart, bb->opts);
- if (xq) xq = xapian_query_new_has_doctype(bb->lock.db, XAPIAN_WRAP_DOCTYPE_PART, xq);
- if (xq) ptrarray_append(clauses, xq);
- }
-
- for (child = qany->children; child; child = child->next) {
- optimise_nodes(NULL, child);
- xapian_query_t *xq = opnode_to_query(bb->lock.db, child, bb->opts);
- if (xq) ptrarray_append(clauses, xq);
- }
-
- /* Clean up */
-
- opnode_delete(qguid);
- opnode_delete(qpart);
- opnode_delete(qany);
- opnode_delete(root);
- return 0;
-}
-
/* Search hits collected for one folder; xapian_run_guid_cb() indexes an
 * array of these by conv_guidrec_t.foldernum. */
struct xapian_match {
bitvector_t uids; /* one bit set per matching UID */
hashu64_table partids_by_uid; /* UID -> strarray_t* of matching part ids; built on first use */
};
static void xapian_match_free_partids(uint64_t key __attribute__((unused)),
void *data,
void *rock __attribute__((unused)))
{
strarray_free((strarray_t*)data);
}
static void xapian_match_reset(struct xapian_match *match)
{
bv_fini(&match->uids);
if (match->partids_by_uid.size) {
hashu64_enumerate(&match->partids_by_uid, xapian_match_free_partids, NULL);
}
free_hashu64_table(&match->partids_by_uid, NULL);
}
/* Rock passed to xapian_run_cb() and on to xapian_run_guid_cb(). */
struct xapian_run_rock {
xapian_builder_t *bb; /* builder running the search */
struct xapian_match *matches; /* array of per-folder results, indexed by folder number */
};
/* Per-record callback for conversations_iterate_searchset(): record the
 * UID (and part id, if any) of `rec` in the result slot of its folder.
 * Unless SEARCH_MULTIPLE is set, records from mailboxes other than the
 * builder's are ignored. Always returns 0. */
static int xapian_run_guid_cb(const conv_guidrec_t *rec, void *rock)
{
    struct xapian_run_rock *xrock = rock;
    xapian_builder_t *bb = xrock->bb;
    struct xapian_match *slot;
    strarray_t *partids;

    if (!(bb->opts & SEARCH_MULTIPLE) &&
        strcmp(rec->mboxname, bb->mailbox->name) != 0) {
        return 0;
    }

    slot = &xrock->matches[rec->foldernum];
    bv_set(&slot->uids, rec->uid);

    if (!rec->part) return 0;

    /* the part id table is created on first use */
    if (slot->partids_by_uid.size == 0)
        construct_hashu64_table(&slot->partids_by_uid, 1024, 0);

    partids = hashu64_lookup(rec->uid, &slot->partids_by_uid);
    if (partids == NULL) {
        partids = strarray_new();
        hashu64_insert(rec->uid, partids, &slot->partids_by_uid);
    }
    strarray_add(partids, rec->part);

    return 0;
}
/* qsort() comparator ordering records by their first 40 bytes. */
static int memcmp40(const void *a, const void *b)
{
    enum { KEY_LEN = 40 };

    return memcmp(a, b, KEY_LEN);
}
/* Result callback for the UID search: sort the `nmemb` 41-byte records
 * in `data` and feed them to conversations_iterate_searchset(), which
 * reports each record to xapian_run_guid_cb(). Returns the
 * cmd_cancelled() code if the command was cancelled, or IMAP_NOTFOUND
 * if the mailbox has no conversations state. */
static int xapian_run_cb(void *data, size_t nmemb, void *rock)
{
    struct xapian_run_rock *xrock = rock;
    struct conversations_state *cstate;
    int cancelled;

    cancelled = cmd_cancelled(/*insearch*/1);
    if (cancelled) return cancelled;

    cstate = mailbox_get_cstate(xrock->bb->mailbox);
    if (cstate == NULL) return IMAP_NOTFOUND;

    qsort(data, nmemb, 41, memcmp40); // byte 41 is always zero

    return conversations_iterate_searchset(cstate, data, nmemb,
                                           xapian_run_guid_cb, xrock);
}
/* Rock passed from xapian_run_guidsearch_cb() to
 * xapian_run_guidsearch_guid_cb(). */
struct xapian_run_guidsearch_rock {
xapian_builder_t *bb; /* builder running the search */
size_t total; /* number of records in the batch being iterated */
};
/* Per-record callback for GUID search: forward `rec` and the batch
 * total to the caller-supplied proc_guidsearch. */
static int xapian_run_guidsearch_guid_cb(const conv_guidrec_t *rec, void *rock)
{
    struct xapian_run_guidsearch_rock *xrock = rock;
    xapian_builder_t *builder = xrock->bb;
    size_t total = xrock->total;

    return builder->proc_guidsearch(rec, total, builder->rock);
}
/* Result callback for the GUID search: sort the `nmemb` 41-byte records
 * in `data` and iterate them through conversations, reporting each one
 * via xapian_run_guidsearch_guid_cb(). Returns the cmd_cancelled() code
 * if the command was cancelled, or IMAP_NOTFOUND if the mailbox has no
 * conversations state. */
static int xapian_run_guidsearch_cb(void *data, size_t nmemb, void *rock)
{
    xapian_builder_t *bb = rock;
    struct xapian_run_guidsearch_rock xrock;
    struct conversations_state *cstate;
    int cancelled;

    cancelled = cmd_cancelled(/*insearch*/1);
    if (cancelled) return cancelled;

    cstate = mailbox_get_cstate(bb->mailbox);
    if (cstate == NULL) return IMAP_NOTFOUND;

    qsort(data, nmemb, 41, memcmp40); // byte 41 is always zero

    xrock.bb = bb;
    xrock.total = nmemb;

    return conversations_iterate_searchset(cstate, data, nmemb,
                                           xapian_run_guidsearch_guid_cb,
                                           &xrock);
}
-static int run_legacy_v4_query(xapian_builder_t *bb)
-{
- int i, r = 0;
- struct conversations_state *cstate = mailbox_get_cstate(bb->mailbox);
- if (!cstate) {
- syslog(LOG_INFO, "search_xapian: can't open conversations for %s",
- bb->mailbox->name);
- return IMAP_NOTFOUND;
- }
- uint32_t num_folders = conversations_num_folders(cstate);
- struct xapian_match *result = NULL;
- ptrarray_t clauses = PTRARRAY_INITIALIZER;
-
- if (is_dnfclause(bb->root)) {
- /* Split into sub queries for each document type */
- r = split_legacyv4_query(bb, bb->root, &clauses);
- if (r) goto out;
- }
- else if (is_orclause(bb->root)) {
- xapian_query_t *xq = opnode_to_query(bb->lock.db, bb->root, bb->opts);
- ptrarray_append(&clauses, xq);
- }
- else {
- struct buf buf = BUF_INITIALIZER;
- opnode_serialise(&buf, bb->root);
- syslog(LOG_ERR, "search_xapian: expected DNF or OR clause, got %s",
- buf_cstring(&buf));
- buf_free(&buf);
- r = IMAP_INTERNAL;
- goto out;
- }
-
- /* Run clauses and intersect results */
- for (i = 0; i < ptrarray_size(&clauses); i++) {
- struct xapian_match *matches = xzmalloc(sizeof(struct xapian_match) * num_folders);
-
- struct xapian_run_rock xrock = { bb, matches };
- xapian_query_t *xq = ptrarray_nth(&clauses, i);
- // sort the response by GUID for more efficient later handling
- r = xapian_query_run(bb->lock.db, xq, /*is_legacy*/1,
- xapian_run_cb, &xrock);
- if (r) {
- uint32_t j;
- for (j = 0; j < num_folders; j++) {
- xapian_match_reset(matches + j);
- }
- free(matches);
- goto out;
- }
-
- /* First clause, use as initial result */
- if (result == NULL) {
- result = matches;
- continue;
- }
-
- /* Intersect mailbox names */
- uint32_t j;
- for (j = 0; j < num_folders; j++) {
- if (!bv_count(&matches[j].uids)) {
- xapian_match_reset(result + j);
- }
- }
-
- /* Intersect UIDs */
- for (j = 0; j < num_folders; j++) {
- struct xapian_match *result_match = result + j;
- struct xapian_match *clause_match = matches + j;
- if (bv_count(&result_match->uids)) {
- bv_andeq(&result_match->uids, &clause_match->uids);
- if (result_match->partids_by_uid.size && clause_match->partids_by_uid.size) {
- /* Intersect partids */
- int uid;
- for (uid = bv_next_set(&result_match->uids, 0); uid != -1;
- uid = bv_next_set(&result_match->uids, uid+1)) {
- strarray_t *result_partids = hashu64_lookup(uid, &result_match->partids_by_uid);
- strarray_t *clause_partids = hashu64_lookup(uid, &clause_match->partids_by_uid);
- if (result_partids && clause_partids) {
- int i = 0, newlen = 0;
- for (i = 0; i < strarray_size(result_partids); i++) {
- const char *partid = strarray_nth(result_partids, i);
- if (strarray_find(clause_partids, partid, 0) >= 0) {
- strarray_swap(result_partids, i, newlen);
- newlen++;
- }
- }
- if (newlen) {
- strarray_truncate(result_partids, newlen);
- }
- else {
- hashu64_del(uid, &result_match->partids_by_uid);
- strarray_free(result_partids);
- }
- }
- }
- }
- }
- xapian_match_reset(clause_match);
- }
- free(matches);
- }
-
- /* Return results */
- if (result) {
- r = 0;
- uint32_t j;
- for (j = 0; j < num_folders; j++) {
- struct xapian_match *match = result + j;
- if (bv_count(&match->uids)) {
- const char *mboxname = conversations_folder_name(cstate, j);
- int uid;
- for (uid = bv_next_set(&match->uids, 0); uid != -1;
- uid = bv_next_set(&match->uids, uid+1)) {
- strarray_t *partids = NULL;
- if (match->partids_by_uid.size) {
- partids = hashu64_lookup(uid, &match->partids_by_uid);
- }
- r = bb->proc(mboxname, /*uidvalidity*/0, uid, partids, bb->rock);
- if (r) break;
- }
- }
- }
- if (r) goto out;
- }
-
-out:
- if (result) {
- uint32_t j;
- for (j = 0; j < num_folders; j++) {
- xapian_match_reset(result + j);
- }
- free(result);
- }
- if (ptrarray_size(&clauses)) {
- xapian_query_t *xq;
- while ((xq = ptrarray_pop(&clauses))) xapian_query_free(xq);
- }
- ptrarray_fini(&clauses);
- return r;
-}
-
/* Walk the tree rooted at `on` and return the first non-zero result
 * from any subtree. No per-node check is performed here, so the
 * result is currently always 0. */
static int validate_query(xapian_db_t *db, struct opnode *on)
{
    struct opnode *kid;
    int r = 0;

    if (on == NULL) return 0;

    for (kid = on->children; kid && !r; kid = kid->next)
        r = validate_query(db, kid);

    return r;
}
/* Run the builder's expression against the opened Xapian database.
 *
 * With proc_guidsearch set, the tree is translated as-is and every hit
 * is reported through xapian_run_guidsearch_cb(). Otherwise (UID search)
 * the tree must be a DNF clause or a pure OR clause; hits are collected
 * per folder and reported through bb->proc.
 *
 * Returns 0 or an IMAP_* error code. */
static int run_query(xapian_builder_t *bb)
{
/* tree actually translated for UID search; may alias bb->root */
struct opnode *root = NULL;
xapian_query_t *xq = NULL;
int r = 0;
/* Validate query for this db */
r = validate_query(bb->lock.db, bb->root);
if (r) return r;
if (bb->proc_guidsearch) {
xq = opnode_to_query(bb->lock.db, bb->root, bb->opts);
if (!xq) goto out;
- r = xapian_query_run(bb->lock.db, xq, /*is_legacy*/0,
- xapian_run_guidsearch_cb, bb);
+ r = xapian_query_run(bb->lock.db, xq, xapian_run_guidsearch_cb, bb);
goto out;
}
/* Fallback to UID search */
if (bb->root && is_dnfclause(bb->root)) {
struct opnode *norm = NULL;
r = normalise_dnfclause(bb->root, &norm);
if (r) return r;
assert(norm->op == SEARCH_OP_AND);
/* Exclude P doctypes from matches for headers or ANY */
root = opnode_new(SEARCH_OP_AND, NULL);
/* move the children of norm over to root one by one */
while (norm->children) {
struct opnode *child = norm->children;
opnode_detach_child(norm, child);
if (child->op != SEARCH_OP_NOT) {
opnode_append_child(root, child);
continue;
}
/* child is a NOT node; expr is its first operand */
struct opnode *expr = child->children;
if (expr->op >= SEARCH_NUM_PARTS) {
opnode_append_child(root, child);
continue;
}
/* NOTE(review): when expr is a body part other than ANY, child is
 * detached from norm but neither appended to root nor freed here, so
 * that NOT term appears to be dropped from the query and leaked -
 * confirm whether an else branch is missing. */
if (!search_part_is_body(expr->op) || expr->op == SEARCH_PART_ANY) {
/* Transform NOT(MATCH) to AND(NOT(MATCH),NOT(DOCTYPE==P)) */
struct opnode *notdp = opnode_new(SEARCH_OP_NOT, NULL);
strarray_t ar = STRARRAY_INITIALIZER;
strarray_append(&ar, "P");
opnode_append_child(notdp, opnode_new(XAPIAN_SEARCH_OP_DOCTYPE, &ar));
strarray_fini(&ar);
struct opnode *node = opnode_new(SEARCH_OP_AND, NULL);
opnode_append_child(node, child);
opnode_append_child(node, notdp);
opnode_append_child(root, node);
}
}
opnode_delete(norm);
}
else if (bb->root && is_orclause(bb->root)) {
/* pure OR clauses are translated unchanged */
root = bb->root;
}
else if (bb->root) {
struct buf buf = BUF_INITIALIZER;
opnode_serialise(&buf, bb->root);
syslog(LOG_ERR, "search_xapian: expected DNF or OR clause, got %s",
buf_cstring(&buf));
buf_free(&buf);
r = IMAP_INTERNAL;
goto out;
}
/* root is still NULL when there is no expression; opnode_to_query()
 * turns that into a match-all query */
xq = opnode_to_query(bb->lock.db, root, bb->opts);
if (!xq) goto out;
struct conversations_state *cstate = mailbox_get_cstate(bb->mailbox);
if (!cstate) {
syslog(LOG_INFO, "search_xapian: can't open conversations for %s",
bb->mailbox->name);
r = IMAP_NOTFOUND;
goto out;
}
/* one result slot per conversations folder */
uint32_t num_folders = conversations_num_folders(cstate);
struct xapian_match *result = xzmalloc(sizeof(struct xapian_match) * num_folders);
struct xapian_run_rock xrock = { bb, result };
// sort the response by GUID for more efficient later handling
- r = xapian_query_run(bb->lock.db, xq, /*is_legacy*/0,
- xapian_run_cb, &xrock);
+ r = xapian_query_run(bb->lock.db, xq, xapian_run_cb, &xrock);
/* NOTE(review): r from xapian_query_run() is overwritten with 0 below
 * whenever result is non-NULL - confirm that discarding it is intended. */
if (result) {
r = 0;
uint32_t j;
for (j = 0; j < num_folders; j++) {
struct xapian_match *match = result + j;
if (bv_count(&match->uids)) {
const char *mboxname = conversations_folder_name(cstate, j);
int uid;
for (uid = bv_next_set(&match->uids, 0); uid != -1;
uid = bv_next_set(&match->uids, uid+1)) {
strarray_t *partids = NULL;
if (match->partids_by_uid.size) {
partids = hashu64_lookup(uid, &match->partids_by_uid);
}
r = bb->proc(mboxname, /*uidvalidity*/0, uid, partids, bb->rock);
/* NOTE(review): this only leaves the UID loop; later folders are
 * still reported and may overwrite r. */
if (r) break;
}
}
/* every slot is released, whether or not it was reported */
xapian_match_reset(match);
}
}
free(result);
out:
/* only free the tree if it was built here rather than borrowed */
if (root && root != bb->root) opnode_delete(root);
xapian_query_free(xq);
return r;
}
/* Walk the tree rooted at `on` and register a stemmer on `db` for every
 * item of every SEARCH_PART_LANGUAGE node. */
static void add_stemmers(xapian_db_t *db, struct opnode *on)
{
    struct opnode *kid;
    int idx;

    if (on == NULL) return;

    if (on->op == SEARCH_PART_LANGUAGE) {
        idx = 0;
        while (idx < strarray_size(on->items)) {
            xapian_query_add_stemmer(db, strarray_nth(on->items, idx));
            idx++;
        }
    }

    kid = on->children;
    while (kid) {
        add_stemmers(db, kid);
        kid = kid->next;
    }
}
/* Common entry point for run() and run_guidsearch(): check the
 * configuration, optimise the expression tree, register requested
 * stemmers and execute the query. Returns 0 when the builder has no
 * open database. */
static int run_internal(xapian_builder_t *bb)
{
int r = 0;
/* Sanity check builder */
/* exactly one of the two result callbacks must be set */
assert((bb->proc == NULL) != (bb->proc_guidsearch == NULL));
- if (!bb->lock.db) goto out; // no index for this user
+ if (!bb->lock.db) return 0; // no index for this user
/* Validate config */
r = check_config(NULL);
if (r) return r;
/* optimise_nodes() dereferences its node, hence the guard */
if (bb->root) optimise_nodes(NULL, bb->root);
/* Stem using any languages explicitly requested by the user. */
add_stemmers(bb->lock.db, bb->root);
- /* A database can contain sub-databases of multiple versions */
- if (xapian_db_has_otherthan_v4_index(bb->lock.db)) {
- // all but version 4 databases index header and body together
- r = run_query(bb);
- if (r) goto out;
- }
- if (xapian_db_has_legacy_v4_index(bb->lock.db)) {
- // legacy version 4 databases index headers and body separately
- r = run_legacy_v4_query(bb);
- if (r) goto out;
- }
-
-out:
- return r;
+ return run_query(bb);
}
/* search_builder_t.run implementation: install the per-hit callback
 * and its rock, then execute the search. */
static int run(search_builder_t *bx, search_hit_cb_t proc, void *rock)
{
    xapian_builder_t *builder = (xapian_builder_t *)bx;

    builder->rock = rock;
    builder->proc = proc;

    return run_internal(builder);
}
/* search_builder_t.run_guidsearch implementation: install the GUID
 * callback and its rock, then execute the search. Returns
 * IMAP_SEARCH_NOT_SUPPORTED when the builder has no open database. */
static int run_guidsearch(search_builder_t *bx, search_hitguid_cb_t proc, void *rock)
{
xapian_builder_t *bb = (xapian_builder_t *)bx;
bb->proc_guidsearch = proc;
bb->rock = rock;
- // we can't do GUIDSEARCH on v4 databases
- if (!bb->lock.db || xapian_db_has_legacy_v4_index(bb->lock.db))
- return IMAP_SEARCH_NOT_SUPPORTED;
+ if (!bb->lock.db) return IMAP_SEARCH_NOT_SUPPORTED;
return run_internal(bb);
}
/* Open a boolean node for `op`: attach it below the node on top of the
 * stack (or make it the root when the stack is empty) and push it so
 * that following nodes become its children. */
static void begin_boolean(search_builder_t *bx, int op)
{
    xapian_builder_t *bb = (xapian_builder_t *)bx;
    struct opnode *enclosing = ptrarray_tail(&bb->stack);
    struct opnode *node = opnode_new(op, NULL);

    if (enclosing == NULL)
        bb->root = node;
    else
        opnode_append_child(enclosing, node);

    ptrarray_push(&bb->stack, node);

    if (SEARCH_VERBOSE(bb->opts)) {
        syslog(LOG_INFO, "begin_boolean(op=%s)", search_op_as_string(op));
    }
}
/* Close the boolean node opened last by popping it off the stack. */
static void end_boolean(search_builder_t *bx, int op __attribute__((unused)))
{
    xapian_builder_t *builder = (xapian_builder_t *)bx;

    if (SEARCH_VERBOSE(builder->opts)) {
        syslog(LOG_INFO, "end_boolean");
    }

    ptrarray_pop(&builder->stack);
}
/* Add a MATCH node for search part `part` with the values `vals`,
 * below the node on top of the stack or as the root when the stack is
 * empty. A NULL `vals` is ignored. */
static void matchlist(search_builder_t *bx, int part, const strarray_t *vals)
{
    xapian_builder_t *bb = (xapian_builder_t *)bx;
    struct opnode *enclosing;
    struct opnode *leaf;

    if (vals == NULL) return;

    if (SEARCH_VERBOSE(bb->opts)) {
        char *joined = strarray_join(vals, ",");
        syslog(LOG_INFO, "match(part=%s, str=\"%s\")",
               search_part_as_string(part), joined);
        free(joined);
    }

    leaf = opnode_new(part, vals);

    enclosing = ptrarray_tail(&bb->stack);
    if (enclosing == NULL)
        bb->root = leaf;
    else
        opnode_append_child(enclosing, leaf);
}
/* Single-value convenience wrapper around matchlist(). */
static void match(search_builder_t *bx, int part, const char *val)
{
    strarray_t single = STRARRAY_INITIALIZER;

    strarray_append(&single, val);
    matchlist(bx, part, &single);

    /* matchlist() works on its own copy, so the temporary can go */
    strarray_fini(&single);
}
/* Hand the builder's expression tree over to the caller.
 *
 * The tree is detached from the builder (bb->root becomes NULL),
 * optimised and returned; the caller releases it with
 * free_internalised(). Returns NULL when no expression was built. */
static void *get_internalised(search_builder_t *bx)
{
    xapian_builder_t *bb = (xapian_builder_t *)bx;
    struct opnode *on = bb->root;

    bb->root = NULL;

    /* optimise_nodes() dereferences its node, so skip it for an empty
     * expression - the same guard run_internal() applies */
    if (on) optimise_nodes(NULL, on);

    return on;
}
/* Return a newly allocated, fixed placeholder description; the
 * internalised tree itself is not inspected. */
static char *describe_internalised(void *internalised __attribute__((unused)))
{
    char *description = xstrdup("--xapian query--");

    return description;
}
/* Release a tree obtained from get_internalised(); NULL is accepted. */
static void free_internalised(void *internalised)
{
    struct opnode *tree = internalised;

    if (tree == NULL) return;

    opnode_delete(tree);
}
/* Create a search builder for `mailbox`.
 *
 * Returns NULL only if check_config() fails. Failures while opening the
 * Xapian databases or reading the indexed-message list are not
 * reported: the builder is returned regardless. */
static search_builder_t *begin_search(struct mailbox *mailbox, int opts)
{
    xapian_builder_t *bb;
    int r;

    if (check_config(NULL)) return NULL;

    bb = xzmalloc(sizeof(xapian_builder_t));

    bb->mailbox = mailbox;
    bb->opts = opts;

    bb->super.begin_boolean = begin_boolean;
    bb->super.end_boolean = end_boolean;
    bb->super.match = match;
    bb->super.matchlist = matchlist;
    bb->super.get_internalised = get_internalised;
    bb->super.run = run;
    bb->super.run_guidsearch = run_guidsearch;

    r = xapiandb_lock_open(mailbox, &bb->lock);

    if (!r && bb->lock.activedirs && bb->lock.activedirs->count) {
        /* read the list of all indexed messages to allow (optional) false positives
         * for unindexed messages */
        // TODO also handle for guidsearch
        bb->indexed = seqset_init(0, SEQ_MERGE);
        r = read_indexed(bb->lock.activedirs, bb->lock.activetiers, mailbox->name,
                         mailbox->i.uidvalidity, bb->indexed, /*do_cache*/0, /*verbose*/0);
    }

    /* XXX - error return? */
    (void)r;
    return &bb->super;
}
/* Tear down a builder created by begin_search(): release the indexed
 * set, the node stack, any remaining expression tree and the database
 * lock, then free the builder itself. */
static void end_search(search_builder_t *bx)
{
    xapian_builder_t *builder = (xapian_builder_t *)bx;

    if (builder->indexed != NULL)
        seqset_free(builder->indexed);

    ptrarray_fini(&builder->stack);

    if (builder->root != NULL)
        opnode_delete(builder->root);

    xapiandb_lock_release(&builder->lock);

    free(bx);
}
/* ====================================================================== */
/* base class for both update and snippet receivers */
typedef struct xapian_receiver xapian_receiver_t;
struct xapian_receiver
{
search_text_receiver_t super;
int verbose; /* end_part() logs segment sizes when this is > 1 */
struct mailbox *mailbox;
struct message_guid guid; /* GUID of the current message; set by begin_message() */
uint32_t uid; /* UID of the current message; set by begin_message() */
time_t internaldate; /* set by begin_message() */
int part; /* search part being received; set by begin_part(), zeroed by end_part() */
const struct message_guid *part_guid; /* borrowed from begin_bodypart(); cleared by end_bodypart() */
const char *partid; /* borrowed from begin_bodypart(); cleared by end_bodypart() */
unsigned int parts_total; /* bytes accepted for this message, capped at SEARCH_MAX_PARTS_SIZE */
int truncate_warning; /* non-zero once truncation has been logged for this message */
ptrarray_t segs; /* struct segment* accumulated by append_text() */
};
/* receiver used for updating the index */
typedef struct xapian_update_receiver xapian_update_receiver_t;
struct xapian_update_receiver
{
xapian_receiver_t super;
xapian_dbw_t *dbw; /* writable database handle used by flush() and audit_mailbox() */
struct mappedfile *activefile;
struct mboxlock *xapiandb_namelock;
unsigned int uncommitted; /* updates pending since the last commit; reset by flush() */
unsigned int commits; /* number of commits performed by flush() */
struct seqset *oldindexed; /* audit_mailbox() only audits UIDs that are members of this set */
struct seqset *indexed; /* written out by flush() after a successful commit */
strarray_t *activedirs; /* flush() writes the indexed list to the first entry */
strarray_t *activetiers;
hash_table cached_seqs;
int mode; /* audit_mailbox() requires XAPIAN_DBW_XAPINDEXED */
int flags;
};
/* receiver used for extracting snippets after a search */
typedef struct xapian_snippet_receiver xapian_snippet_receiver_t;
struct xapian_snippet_receiver
{
xapian_receiver_t super;
xapian_snipgen_t *snipgen;
struct opnode *root; /* presumably the internalised query tree - TODO confirm */
search_snippet_cb_t proc;
void *rock; /* presumably passed back to proc - TODO confirm */
struct xapiandb_lock lock;
const search_snippet_markup_t *markup;
};
/* Callback state pairing an update receiver with a document type;
 * presumably the rock for is_indexed_cb() - TODO confirm against its
 * definition. */
struct is_indexed_rock {
xapian_update_receiver_t *tr;
char doctype;
};
/* forward declaration; defined further down in this file */
static int is_indexed_cb(const conv_guidrec_t *rec, void *rock);
/* Look up the configured search partition root for `tier` and
 * `partition` (config key "<tier>searchpartition-<partition>").
 * A NULL `partition` selects the configured default partition.
 * Returns NULL, after logging, if either setting is missing. */
static const char *xapian_rootdir(const char *tier, const char *partition)
{
    const char *root;
    char *confkey;

    if (partition == NULL)
        partition = config_getstring(IMAPOPT_DEFAULTPARTITION);

    if (partition == NULL) {
        syslog(LOG_ERR, "no default partition configured");
        return NULL;
    }

    confkey = strconcat(tier, "searchpartition-", partition, NULL);

    root = config_getoverflowstring(confkey, NULL);
    if (root == NULL)
        syslog(LOG_ERR, "undefined search partition: %s", confkey);

    free(confkey);
    return root;
}
/* Build the per-user base directory below the search partition root.
 *
 * If `root` is NULL it is looked up with xapian_rootdir(tier, partition).
 * On success returns 0 and stores in *basedirp a new string which must
 * be free()d. Returns IMAP_PARTITION_UNKNOWN if no root is available or
 * `mboxname` has no userid. If `basedirp` is NULL the path is built and
 * discarded. */
static int xapian_basedir(const char *tier,
                          const char *mboxname, const char *partition,
                          const char *root, char **basedirp)
{
    char *basedir = NULL;
    mbname_t *mbname = NULL;
    char userhash[2], domainhash[2];
    int r = IMAP_PARTITION_UNKNOWN;

    if (!root)
        root = xapian_rootdir(tier, partition);
    if (!root) goto out;

    mbname = mbname_from_intname(mboxname);
    if (!mbname_userid(mbname)) goto out;

    const char *domain = mbname_domain(mbname);
    const char *localpart = mbname_localpart(mbname);

    if (domain) {
        /* <root><domaindir><hash>/<domain>/<hash><userdir><localpart> */
        basedir = strconcat(root,
                            FNAME_DOMAINDIR,
                            dir_hash_b(domain, config_fulldirhash, domainhash),
                            "/", domain,
                            "/", dir_hash_b(localpart, config_fulldirhash, userhash),
                            FNAME_USERDIR,
                            localpart,
                            (char *)NULL);
    }
    else {
        /* <root>/<hash><userdir><localpart> */
        basedir = strconcat(root,
                            "/", dir_hash_b(localpart, config_fulldirhash, userhash),
                            FNAME_USERDIR,
                            localpart,
                            (char *)NULL);
    }

    r = 0;

out:
    if (r || !basedirp)
        free(basedir);
    else
        *basedirp = basedir;

    mbname_free(&mbname);
    return r;
}
/* Check that `dir` can be stat()ed, optionally creating it.
 *
 * Returns 0 if stat() succeeds (possibly after creation),
 * IMAP_NOTFOUND if it is missing and `create` is zero, and
 * IMAP_IOERROR if stat() fails for any other reason or still fails
 * after the creation attempt. */
static int check_directory(const char *dir, int verbose, int create)
{
    struct stat sb;
    char *dummyfile;
    int r;

    r = stat(dir, &sb);
    if (r >= 0) return r;

    if (errno != ENOENT) {
        /* something went wrong - permissions problem most likely */
        syslog(LOG_ERR, "IOERROR: unable to stat %s: %m", dir);
        return IMAP_IOERROR;
    }

    /* the directory is just missing */
    if (!create) {
        /* caller doesn't care that much */
        return IMAP_NOTFOUND;
    }

    if (verbose)
        syslog(LOG_INFO, "Building directory %s", dir);

    /* cyrus_mkdir() is given the path of a file inside dir */
    dummyfile = strconcat(dir, "/dummy", (char *)NULL);
    cyrus_mkdir(dummyfile, 0700);

    r = stat(dir, &sb);
    if (r < 0) {
        /* something went wrong - permissions problem most likely */
        syslog(LOG_ERR, "IOERROR: unable to stat %s: %m", dir);
        r = IMAP_IOERROR;
    }

    free(dummyfile);
    return r;
}
/* Commit pending index updates, then persist the indexed-message list.
 * Returns 0 on success or the first error from the commit or the
 * write. */
static int flush(search_text_receiver_t *rx)
{
    xapian_update_receiver_t *tr = (xapian_update_receiver_t *)rx;

    if (tr->uncommitted) {
        struct timeval start, end;
        int r;

        assert(tr->dbw);

        gettimeofday(&start, NULL);
        r = xapian_dbw_commit_txn(tr->dbw);
        if (r) return r;
        gettimeofday(&end, NULL);

        syslog(LOG_INFO, "Xapian committed %u updates in %.6f sec",
               tr->uncommitted, timesub(&start, &end));

        tr->uncommitted = 0;
        tr->commits++;
    }

    /* We write out the indexed list for the mailbox only after successfully
     * updating the index, to avoid a future instance not realising that
     * there are unindexed messages should we fail to index */
    if (!tr->indexed) return 0;

    return write_indexed(strarray_nth(tr->activedirs, 0),
                         tr->super.mailbox->name,
                         tr->super.mailbox->i.uidvalidity,
                         tr->indexed, tr->super.verbose);
}
/* Audit the receiver's mailbox against the Xapian index.
 *
 * For every message (unlinked ones are skipped by the iterator) whose
 * UID is a member of tr->oldindexed, ask the database whether its GUID
 * is indexed as a message document; UIDs that are missing or only
 * partially indexed are set in `unindexed`. Messages outside
 * tr->oldindexed are ignored.
 *
 * Returns 0 on success, IMAP_INTERNAL if the receiver is not in
 * XAPIAN_DBW_XAPINDEXED mode, or the error from the message accessors. */
static int audit_mailbox(search_text_receiver_t *rx, bitvector_t *unindexed)
{
    xapian_update_receiver_t *tr = (xapian_update_receiver_t *)rx;
    struct mailbox_iter *iter = NULL;
    const message_t *msg = NULL;
    int r = 0;

    if (tr->mode != XAPIAN_DBW_XAPINDEXED) {
        syslog(LOG_ERR, "search_xapian: require XAPIAN_DBW_XAPINDEXED mode");
        r = IMAP_INTERNAL;
        goto done;
    }

    iter = mailbox_iter_init(tr->super.mailbox, 0, ITER_SKIP_UNLINKED);
    while ((msg = mailbox_iter_step(iter))) {
        uint32_t uid;
        r = message_get_uid((message_t*) msg, &uid);
        if (r) goto done;

        if (!seqset_ismember(tr->oldindexed, uid)) {
            /* uid is unsigned, so it is logged with %u */
            if (tr->super.verbose)
                syslog(LOG_INFO, "search_xapian: ignoring %s:%u during audit",
                       tr->super.mailbox->name, uid);
            continue;
        }

        const struct message_guid *guid;
        r = message_get_guid((message_t*) msg, &guid);
        if (r) goto done;

        uint8_t indexlevel = xapian_dbw_is_indexed(tr->dbw, guid, XAPIAN_WRAP_DOCTYPE_MSG);
        if (indexlevel == 0 || (indexlevel & SEARCH_INDEXLEVEL_PARTIAL)) {
            bv_set(unindexed, uid);
        }
    }

done:
    mailbox_iter_done(&iter);
    return r;
}
/* Free every queued segment (text buffer, part id and the segment
 * itself) and leave the segment list empty. */
static void free_segments(xapian_receiver_t *tr)
{
    int idx = 0;

    while (idx < tr->segs.count) {
        struct segment *seg = (struct segment *)ptrarray_nth(&tr->segs, idx);

        buf_free(&seg->text);
        free(seg->partid);
        free(seg);
        idx++;
    }

    ptrarray_truncate(&tr->segs, 0);
}
/* Start receiving a new message: record its UID, GUID and internaldate
 * and reset the per-message segment and truncation state. Returns the
 * first error from the message accessors, in which case the segment
 * state is left as it was. */
static int begin_message(search_text_receiver_t *rx, message_t *msg)
{
    xapian_update_receiver_t *tr = (xapian_update_receiver_t *)rx;
    const struct message_guid *guid = NULL;
    int r;

    r = message_get_uid(msg, &tr->super.uid);
    if (r) return r;

    r = message_get_guid(msg, &guid);
    if (r) return r;

    r = message_get_internaldate(msg, &tr->super.internaldate);
    if (r) return r;

    message_guid_copy(&tr->super.guid, guid);

    free_segments((xapian_receiver_t *)tr);
    tr->super.parts_total = 0;
    tr->super.truncate_warning = 0;

    return 0;
}
/* Remember the id and content GUID of the body part about to be
 * received. Both pointers are stored as given, not copied.
 * Always returns 0. */
static int begin_bodypart(search_text_receiver_t *rx,
                          const char *partid,
                          const struct message_guid *content_guid,
                          const char *type __attribute__((unused)),
                          const char *subtype __attribute__((unused)))
{
    xapian_receiver_t *receiver = (xapian_receiver_t *)rx;

    receiver->part_guid = content_guid;
    receiver->partid = partid;

    return 0;
}
/* Record which search part the following append_text() calls belong to. */
static void begin_part(search_text_receiver_t *rx, int part)
{
    ((xapian_receiver_t *)rx)->part = part;
}
/*
 * Append a chunk of text to the segment for the current search part.
 *
 * Text is ignored when no part is active.  The total amount of text kept
 * per message is capped at SEARCH_MAX_PARTS_SIZE; the chunk that crosses
 * the cap is truncated and a single warning per message is logged.
 * A new segment is started when there is no segment yet, the last one is
 * finished, or the last one belongs to a different search part.
 */
static void append_text(search_text_receiver_t *rx,
                        const struct buf *text)
{
    xapian_receiver_t *tr = (xapian_receiver_t *)rx;

    /* no active part: nothing to collect */
    if (!tr->part) return;

    unsigned len = text->len;
    if (tr->parts_total + len > SEARCH_MAX_PARTS_SIZE) {
        /* post-increment so only the first overflow is logged */
        if (!tr->truncate_warning++)
            syslog(LOG_ERR, "Xapian: truncating text from "
                            "message mailbox %s uid %u",
                            tr->mailbox->name, tr->uid);
        len = SEARCH_MAX_PARTS_SIZE - tr->parts_total;
    }

    /* the cap was already reached */
    if (!len) return;

    tr->parts_total += len;

    struct segment *seg = (struct segment *)ptrarray_tail(&tr->segs);
    int need_new = !seg || seg->is_finished || seg->part != tr->part;

    if (need_new) {
        seg = (struct segment *)xzmalloc(sizeof(*seg));
        seg->sequence = tr->segs.count;
        seg->part = tr->part;
        seg->partid = xstrdupnull(tr->partid);

        /* body text with a known content GUID is filed under that GUID,
         * everything else under the message GUID */
        if (tr->part_guid && search_part_is_body(tr->part)) {
            message_guid_copy(&seg->guid, tr->part_guid);
            seg->doctype = XAPIAN_WRAP_DOCTYPE_PART;
        }
        else {
            message_guid_copy(&seg->guid, &tr->guid);
            seg->doctype = XAPIAN_WRAP_DOCTYPE_MSG;
        }

        ptrarray_append(&tr->segs, seg);
    }

    buf_appendmap(&seg->text, text->s, len);
}
/*
 * Close the current search part: mark the most recent segment (if any)
 * as finished so later text starts a fresh segment, optionally log its
 * size, and clear the active part.
 */
static void end_part(search_text_receiver_t *rx,
                     int part __attribute__((unused)))
{
    xapian_receiver_t *tr = (xapian_receiver_t *)rx;
    struct segment *tail = (struct segment *)ptrarray_tail(&tr->segs);
    unsigned long long nbytes = 0;

    if (tail) {
        tail->is_finished = 1;
        nbytes = (unsigned long long)tail->text.len;
    }

    if (tr->verbose > 1) {
        syslog(LOG_NOTICE, "Xapian: %llu bytes in part %s",
               nbytes, search_part_as_string(tr->part));
    }

    tr->part = 0;
}
/* Forget the part id and content GUID recorded by begin_bodypart(). */
static void end_bodypart(search_text_receiver_t *rx __attribute__((unused)))
{
    xapian_receiver_t *recv = (xapian_receiver_t *)rx;

    recv->part_guid = NULL;
    recv->partid = NULL;
}
/*
 * Order two document types: XAPIAN_WRAP_DOCTYPE_MSG sorts before every
 * other type; otherwise the types are ordered by their character value.
 * Returns <0, 0 or >0 in the usual comparator fashion.
 */
static int doctype_cmp(char doctype1, char doctype2)
{
    int first_is_msg = (doctype1 == XAPIAN_WRAP_DOCTYPE_MSG);
    int second_is_msg = (doctype2 == XAPIAN_WRAP_DOCTYPE_MSG);

    if (first_is_msg != second_is_msg)
        return first_is_msg ? -1 : 1;

    return doctype1 - doctype2;
}
/*
 * ptrarray_sort() comparator for segments.  Sort keys, in order:
 * document type (message documents first), GUID, part id, search part,
 * and finally the sequence number assigned when the segment was created.
 */
static int compare_segs(const void **v1, const void **v2)
{
    const struct segment *lhs = *(const struct segment **)v1;
    const struct segment *rhs = *(const struct segment **)v2;
    int diff;

    diff = doctype_cmp(lhs->doctype, rhs->doctype);
    if (diff) return diff;

    diff = message_guid_cmp(&lhs->guid, &rhs->guid);
    if (diff) return diff;

    diff = strcmpsafe(lhs->partid, rhs->partid);
    if (diff) return diff;

    diff = lhs->part - rhs->part;
    if (diff) return diff;

    return lhs->sequence - rhs->sequence;
}
/*
 * Tell whether a body part with the given content GUID is already
 * indexed.
 *
 * In XAPIAN_DBW_XAPINDEXED mode the Xapian database is queried directly.
 * Otherwise the conversations GUID records are walked with
 * is_indexed_cb(); a CYRUSDB_DONE result means "indexed" and yields
 * SEARCH_INDEXLEVEL_BASIC.  Returns 0 when the part is not indexed, when
 * the conversations state is unavailable, or on an unexpected error
 * (which is logged).
 */
static int is_indexed_part(xapian_update_receiver_t *tr, const struct message_guid *guid)
{
    if (tr->mode == XAPIAN_DBW_XAPINDEXED)
        return xapian_dbw_is_indexed(tr->dbw, guid, XAPIAN_WRAP_DOCTYPE_PART);

    struct conversations_state *convs = mailbox_get_cstate(tr->super.mailbox);
    if (convs == NULL) {
        xsyslog(LOG_INFO, "can't open conversations", "mailbox=<%s>",
                tr->super.mailbox->name);
        return 0;
    }

    struct is_indexed_rock rock = { tr, XAPIAN_WRAP_DOCTYPE_PART };
    /* private copy of the encoded GUID for the duration of the walk */
    char *encoded = xstrdup(message_guid_encode(guid));
    int level = 0;

    int rc = conversations_guid_foreach(convs, encoded, is_indexed_cb, &rock);
    if (rc == CYRUSDB_DONE) {
        level = SEARCH_INDEXLEVEL_BASIC;
    }
    else if (rc != 0) {
        xsyslog(LOG_ERR, "unexpected return code", "guid=<%s> r=<%d> err=<%s>",
                message_guid_encode(guid), rc, cyrusdb_strerror(rc));
    }

    free(encoded);
    return level;
}
/*
 * Write the segments collected for the current message to the Xapian
 * database.
 *
 * The segments are sorted, then indexed twice: once, all together, as a
 * document keyed by the message GUID (finished with `indexlevel`), and
 * once more per distinct content GUID for the non-message segments
 * (finished with SEARCH_INDEXLEVEL_BASIC).  Body parts that are already
 * indexed are skipped unless SEARCH_UPDATE_ALLOW_DUPPARTS is set.
 * On success the message UID is added to tr->indexed.
 *
 * The per-message state (uid, guid, internaldate) is reset on every
 * exit path.  Returns 0 on success or the first database error.
 */
static int end_message_update(search_text_receiver_t *rx, uint8_t indexlevel)
{
    xapian_update_receiver_t *tr = (xapian_update_receiver_t *)rx;
    ptrarray_t *segs = &tr->super.segs;
    const struct message_guid *open_guid = NULL;
    struct segment *cur;
    int n;
    int r = 0;

    /* no text was collected for this message */
    if (ptrarray_size(segs) == 0) goto out;

    /* open the writable database on first use */
    if (tr->dbw == NULL) {
        r = xapian_dbw_open((const char **)tr->activedirs->data, &tr->dbw, tr->mode, /*nosync*/0);
        if (r) goto out;
    }

    ptrarray_sort(segs, compare_segs);

    /* a transaction is begun only when nothing is pending from earlier
     * messages */
    if (tr->uncommitted == 0) {
        r = xapian_dbw_begin_txn(tr->dbw);
        if (r) goto out;
    }

    /* first pass: headers and body parts, keyed by the message guid */
    r = xapian_dbw_begin_doc(tr->dbw, &tr->super.guid, XAPIAN_WRAP_DOCTYPE_MSG);
    if (r) goto out;

    for (n = 0; n < segs->count; n++) {
        cur = (struct segment *)ptrarray_nth(segs, n);
        r = xapian_dbw_doc_part(tr->dbw, &cur->text, cur->part);
        if (r) goto out;
    }

    r = xapian_dbw_end_doc(tr->dbw, indexlevel);
    if (r) goto out;
    tr->uncommitted++;

    /* second pass: body parts, keyed by their content guid.  The sort
     * above keeps segments with the same guid adjacent. */
    for (n = 0; n < segs->count; n++) {
        cur = (struct segment *)ptrarray_nth(segs, n);
        if (cur->doctype == XAPIAN_WRAP_DOCTYPE_MSG) continue;

        int starts_new_doc = !open_guid || message_guid_cmp(open_guid, &cur->guid);
        if (starts_new_doc) {
            if (open_guid) {
                // finalize indexing of the previous part
                r = xapian_dbw_end_doc(tr->dbw, SEARCH_INDEXLEVEL_BASIC);
                if (r) goto out;
                tr->uncommitted++;
                open_guid = NULL;
            }

            /* open_guid stays NULL here, so the remaining segments of an
             * already indexed part are skipped the same way */
            if (!(tr->flags & SEARCH_UPDATE_ALLOW_DUPPARTS) &&
                is_indexed_part(tr, &cur->guid)) {
                continue;
            }

            open_guid = &cur->guid;
            // TODO which internaldate, if any?
            r = xapian_dbw_begin_doc(tr->dbw, &cur->guid, cur->doctype);
            if (r) goto out;
        }

        r = xapian_dbw_doc_part(tr->dbw, &cur->text, cur->part);
        if (r) goto out;
    }

    if (open_guid) {
        // body parts have no index level
        r = xapian_dbw_end_doc(tr->dbw, SEARCH_INDEXLEVEL_BASIC);
        if (r) goto out;
        tr->uncommitted++;
    }

    /* start the range back at the first unindexed if necessary */
    if (tr->indexed == NULL) {
        tr->indexed = seqset_init(0, SEQ_MERGE);
        /* also claim the gap between the previously indexed range and
         * this first message, so the indexed range isn't gappy */
        seqset_add(tr->indexed, seqset_firstnonmember(tr->oldindexed), 1);
    }
    seqset_add(tr->indexed, tr->super.uid, 1);

out:
    tr->super.uid = 0;
    message_guid_set_null(&tr->super.guid);
    tr->super.internaldate = 0;
    return r;
}
/*
 * Return non-zero when the first entry of the active list belongs to
 * `tier`.  A NULL or empty list never matches.
 */
static int _starts_with_tier(const strarray_t *active, const char *tier)
{
    if (!active) return 0;
    if (!active->count) return 0;

    const char *candidate = strarray_nth(active, 0);
    struct activeitem *item = activeitem_parse(candidate);

    /* NOTE(review): activeitem_parse() is defined outside this view; if it
     * can return NULL for an unparseable entry, treat that as "no match"
     * instead of dereferencing a NULL pointer. */
    if (!item) return 0;

    int res = !strcmp(item->tier, tier);
    activeitem_free(item);
    return res;
}
/*
 * Prepare the update receiver for indexing `mailbox`.
 *
 * Steps, in order:
 *  - take a shared namelock on the user's Xapian databases;
 *  - open the activefile with a write lock and make sure its first entry
 *    is on the default search tier, creating a new tier entry if needed;
 *  - choose the database mode from `flags` (XAPINDEXED when
 *    SEARCH_UPDATE_XAPINDEXED or SEARCH_UPDATE_AUDIT is set, CONVINDEXED
 *    otherwise), resolve the active directories and create the first one;
 *  - in XAPINDEXED mode open the writable database right away;
 *  - for incremental or audit runs, load the already-indexed UID ranges
 *    into tr->oldindexed.
 *
 * Returns 0 on success.  Returns IMAP_MAILBOX_NONEXISTENT when the
 * mailbox has no activefile name, IMAP_IOERROR when it has no userid,
 * or the error of the failing step.  The locks and handles stored in the
 * receiver are not released here on failure.
 */
static int begin_mailbox_update(search_text_receiver_t *rx,
                                struct mailbox *mailbox,
                                int flags)
{
    xapian_update_receiver_t *tr = (xapian_update_receiver_t *)rx;
    char *fname = activefile_fname(mailbox->name);
    strarray_t *active = NULL;
    int r = IMAP_IOERROR;
    char *namelock_fname = NULL;
    char *userid = NULL;

    tr->flags = flags;

    /* not an indexable mailbox, fine - return a code to avoid
     * trying to index each message as well */
    if (!fname) {
        r = IMAP_MAILBOX_NONEXISTENT;
        goto out;
    }

    /* Do nothing if there is no userid */
    userid = mboxname_to_userid(mailbox->name);
    if (!userid) goto out;

    /* Get a shared namelock */
    namelock_fname = xapiandb_namelock_fname_from_userid(userid);
    r = mboxname_lock(namelock_fname, &tr->xapiandb_namelock, LOCK_SHARED);
    if (r) {
        syslog(LOG_ERR, "Could not acquire shared namelock on %s\n",
               namelock_fname);
        goto out;
    }

    /* we're using "not incremental" to mean "check that the GUID of every message
     * in the mailbox is present in an index rather than trusting the UID ranges */

    /* we grab an activefile writelock to index.  Strictly we don't need it, but
     * doing this guarantees we never write under a client which is reading, which
     * avoids this:
     *
     *    IOERROR: Xapian: caught exception: : DatabaseModifiedError: The revision
     *    being read has been discarded - you should call Xapian::Database::reopen()
     *    and retry the operation
     *
     * in theory, this will go away eventually, and we can switch back to write: 0
     * in this code.
     *
     * http://grokbase.com/t/xapian/xapian-discuss/0667ppbks8/#20060608j8x5aeept49dv5fm8d02xkczgr
     *
     * "This is almost invariably caused by updating a database while reading
     *  from it. If two updates are committed before the read completes, you
     *  get this error (it's DatabaseModifiedError). It's a bit of a pain
     *  and will be going away in the future, but it's not too hard to design
     *  to avoid it happening at least."
     */
    const char *deftier = config_getstring(IMAPOPT_DEFAULTSEARCHTIER);
    r = activefile_open(mailbox->name, mailbox->part, &tr->activefile, AF_LOCK_WRITE, &active);
    if (r) {
        syslog(LOG_ERR, "Failed to lock activefile for %s", mailbox->name);
        goto out;
    }
    if (!active) active = strarray_new();

    // make sure we're indexing to the default tier
    while (!_starts_with_tier(active, deftier)) {
        char *newstart = activefile_nextname(active, deftier);
        syslog(LOG_NOTICE, "create new search tier %s for %s", newstart, mailbox->name);
        strarray_unshiftm(active, newstart);

        /* write the new list, then close and re-open so we work from
         * what is actually on disk */
        r = activefile_write(tr->activefile, active);
        mappedfile_close(&tr->activefile);
        strarray_free(active);
        active = NULL;

        /* a failed write must not be ignored: the re-read list would
         * still lack the default tier and this loop would try again */
        if (r) {
            syslog(LOG_ERR, "Failed to write activefile for %s", mailbox->name);
            goto out;
        }

        r = activefile_open(mailbox->name, mailbox->part, &tr->activefile, AF_LOCK_WRITE, &active);
        if (r) {
            syslog(LOG_ERR, "Failed to lock activefile for %s", mailbox->name);
            goto out;
        }
    }

    assert(active->count);

    tr->mode = (flags & (SEARCH_UPDATE_XAPINDEXED|SEARCH_UPDATE_AUDIT)) ?
        XAPIAN_DBW_XAPINDEXED : XAPIAN_DBW_CONVINDEXED;

    /* doesn't matter if the first one doesn't exist yet, we'll create it.
     * Only stat the others if we're going to be opening them */
    int dostat = tr->mode == XAPIAN_DBW_XAPINDEXED ? 2 : 0;
    tr->activedirs = activefile_resolve(mailbox->name, mailbox->part, active, dostat, &tr->activetiers);
    // this should never be able to fail here, because the first item will always exist!
    assert(tr->activedirs && tr->activedirs->count);

    /* create the directory if needed */
    r = check_directory(strarray_nth(tr->activedirs, 0), tr->super.verbose, /*create*/1);
    if (r) goto out;

    if (tr->mode == XAPIAN_DBW_XAPINDEXED) {
        /* open the DB now, we need it to check if messages are indexed */
        r = xapian_dbw_open((const char **)tr->activedirs->data, &tr->dbw, tr->mode, /*nosync*/0);
        if (r) goto out;
    }

    /* read the indexed data from every directory so we know what still
     * needs indexing */
    tr->oldindexed = seqset_init(0, SEQ_MERGE);
    if ((flags & (SEARCH_UPDATE_INCREMENTAL|SEARCH_UPDATE_AUDIT))) {
        r = read_indexed(tr->activedirs, tr->activetiers, mailbox->name, mailbox->i.uidvalidity,
                         tr->oldindexed, /*do_cache*/1, tr->super.verbose);
        if (r) goto out;
    }

    /* purge any stale cache for this mailbox index sequences */
    struct seqset *seq = hash_del(mailbox->name, &tr->cached_seqs);
    if (seq) seqset_free(seq);

    tr->super.mailbox = mailbox;

out:
    free(fname);
    free(userid);
    free(namelock_fname);
    strarray_free(active);
    return r;
}
/* Return the first UID that is not a member of the previously indexed set. */
static uint32_t first_unindexed_uid(search_text_receiver_t *rx)
{
    const xapian_update_receiver_t *upd = (xapian_update_receiver_t *)rx;

    return seqset_firstnonmember(upd->oldindexed);
}
static int is_indexed_cb(const conv_guidrec_t *rec, void *rock)
{
xapian_update_receiver_t *tr = ((struct is_indexed_rock*)rock)->tr;
char doctype = ((struct is_indexed_rock*)rock)->doctype;
if (doctype == XAPIAN_WRAP_DOCTYPE_MSG && rec->part) return 0;
/* Is this a part in the message we are just indexing? */
if (doctype == XAPIAN_WRAP_DOCTYPE_PART && rec->uid == tr->super.uid &&
!strcmp(rec->mboxname, tr->super.mailbox->name)) {
return 0;
}
/* Is this GUID record in the mailbox we are currently indexing? */
if (!strcmp(tr->super.mailbox->name, rec->mboxname)) {
if (seqset_ismember(tr->indexed, rec->uid) ||
seqset_ismember(tr->oldindexed, rec->uid)) {
return CYRUSDB_DONE;
}
return 0;
}
/* Is this GUID record in an already cached sequence set? */
struct seqset *seq = hash_lookup(rec->mboxname, &tr->cached_seqs);
if (seq) {
return seqset_ismember(seq, rec->uid) ? CYRUSDB_DONE : 0;
}
/* Read the index cache for this mailbox */
mbentry_t *mb = NULL;
seq = seqset_init(0, SEQ_MERGE);
int r = 0;
r = mboxlist_lookup(rec->mboxname, &mb, NULL);
if (r) {
syslog(LOG_ERR, "is_indexed_cb: mboxlist_lookup %s failed: %s",
rec->mboxname, error_message(r));
goto out;
}
r = read_indexed(tr->activedirs, tr->activetiers, mb->name, mb->uidvalidity,
seq, /*do_cache*/1, tr->super.verbose);
if (r) {
syslog(LOG_ERR, "is_indexed_cb: read_indexed %s failed: %s",
rec->mboxname, error_message(r));
goto out;
}
hash_insert(rec->mboxname, seq, &tr->cached_seqs);
out:
mboxlist_entry_free(&mb);
if (r) {
seqset_free(seq);
return 0;
}
return seqset_ismember(seq, rec->uid) ? CYRUSDB_DONE : 0;
}
/*
 * Report the index level of `msg`.
 *
 * Returns 1 when the message was already indexed during this run.
 * In XAPIAN_DBW_CONVINDEXED mode the conversations GUID records are
 * consulted and SEARCH_INDEXLEVEL_BASIC is returned when one of them is
 * indexed.  In XAPIAN_DBW_XAPINDEXED mode the value reported by
 * xapian_dbw_is_indexed() is returned.  Returns 0 when the message is
 * not indexed or when that cannot be determined (UID/GUID unavailable,
 * conversations state unavailable, unexpected database error).
 */
static uint8_t is_indexed(search_text_receiver_t *rx, message_t *msg)
{
    xapian_update_receiver_t *tr = (xapian_update_receiver_t *)rx;
    uint32_t uid = 0;
    const struct message_guid *guid = NULL;
    uint8_t ret = 0;

    /* without a UID we cannot say anything: report "not indexed" so the
     * caller's indexing attempt surfaces the underlying error */
    if (message_get_uid(msg, &uid)) return 0;

    /* bail early if we've already indexed this message in THIS run */
    if (seqset_ismember(tr->indexed, uid))
        return 1;

    /* never hand a missing GUID to the lookups below */
    if (message_get_guid(msg, &guid) || !guid) return 0;

    if (tr->mode == XAPIAN_DBW_CONVINDEXED) {
        /* Determine if msg is already indexed */
        struct conversations_state *cstate = mailbox_get_cstate(tr->super.mailbox);
        if (!cstate) {
            syslog(LOG_INFO, "search_xapian: can't open conversations for %s",
                   tr->super.mailbox->name);
            return 0;
        }

        char *guidrep = xstrdup(message_guid_encode(guid));
        struct is_indexed_rock rock = { tr, XAPIAN_WRAP_DOCTYPE_MSG };
        int r = conversations_guid_foreach(cstate, guidrep, is_indexed_cb, &rock);
        if (r == CYRUSDB_DONE) ret = SEARCH_INDEXLEVEL_BASIC;
        else if (r) {
            /* uid is unsigned 32-bit, hence %u */
            syslog(LOG_ERR, "is_indexed %s:%u: unexpected return code: %d (%s)",
                   tr->super.mailbox->name, uid, r, cyrusdb_strerror(r));
        }
        free(guidrep);
    }
    else if (tr->mode == XAPIAN_DBW_XAPINDEXED) {
        // XXX check for all parts of that message?
        ret = xapian_dbw_is_indexed(tr->dbw, guid, XAPIAN_WRAP_DOCTYPE_MSG);
    }

    return ret;
}
/*
 * Finish indexing the current mailbox: flush pending work, then release
 * everything begin_mailbox_update() and the indexing run attached to the
 * receiver (sequence sets, database handle, activefile lock, namelock,
 * directory and tier lists).
 *
 * Returns the result of flush(); the cleanup itself cannot fail.
 */
static int end_mailbox_update(search_text_receiver_t *rx,
                              struct mailbox *mailbox
                                    __attribute__((unused)))
{
    xapian_update_receiver_t *tr = (xapian_update_receiver_t *)rx;

    /* flush before cleaning up, since indexed data is written by flush */
    int r = flush(rx);

    if (tr->indexed != NULL) {
        seqset_free(tr->indexed);
        tr->indexed = NULL;
    }

    if (tr->oldindexed != NULL) {
        seqset_free(tr->oldindexed);
        tr->oldindexed = NULL;
    }

    tr->super.mailbox = NULL;

    if (tr->dbw != NULL) {
        xapian_dbw_close(tr->dbw);
        tr->dbw = NULL;
    }

    /* the activefile lock is only dropped once the DB is closed */
    if (tr->activefile != NULL) {
        mappedfile_unlock(tr->activefile);
        mappedfile_close(&tr->activefile);
        tr->activefile = NULL;
    }

    /* Release xapian db named lock */
    if (tr->xapiandb_namelock != NULL) {
        mboxname_release(&tr->xapiandb_namelock);
        tr->xapiandb_namelock = NULL;
    }

    if (tr->activedirs != NULL) {
        strarray_free(tr->activedirs);
        tr->activedirs = NULL;
    }

    if (tr->activetiers != NULL) {
        strarray_free(tr->activetiers);
        tr->activetiers = NULL;
    }

    tr->flags = 0;

    return r;
}
/*
 * Adjust charset conversion flags for Xapian indexing: always add
 * CHARSET_KEEPCASE and CHARSET_MIME_UTF8, always clear
 * CHARSET_SKIPDIACRIT.
 */
static int xapian_charset_flags(int flags)
{
    int adjusted = flags;

    adjusted |= CHARSET_KEEPCASE | CHARSET_MIME_UTF8;
    adjusted &= ~CHARSET_SKIPDIACRIT;

    return adjusted;
}
/*
 * Message format used for Xapian indexing.  Both arguments are ignored:
 * the answer is always MESSAGE_SNIPPET.
 */
static int xapian_message_format(int format __attribute__((unused)),
                                 int is_snippet __attribute__((unused)))
{
    const int chosen = MESSAGE_SNIPPET;

    return chosen;
}
static search_text_receiver_t *begin_update(int verbose)
{
xapian_update_receiver_t *tr;
if (check_config(NULL)) return NULL;
tr = xzmalloc(sizeof(xapian_update_receiver_t));
tr->super.super.begin_mailbox = begin_mailbox_update;
tr->super.super.first_unindexed_uid = first_unindexed_uid;
tr->super.super.is_indexed = is_indexed;
tr->super.super.begin_message = begin_message;
tr->super.super.begin_bodypart = begin_bodypart;
tr->super.super.begin_part = begin_part;
tr->super.super.append_text = append_text;
tr->super.super.end_part = end_part;
tr->super.super.end_bodypart = end_bodypart;
tr->super.super.end_message = end_message_update;
tr->super.super.end_mailbox = end_mailbox_update;
tr->super.super.flush = flush;
tr->super.super.audit_mailbox = audit_mailbox;
tr->super.super.index_charset_flags = xapian_charset_flags;
tr->super.super.index_message_format = xapian_message_format;
tr->super.verbose = verbose;
construct_hash_table(&tr->cached_seqs, 128, 0);
return &tr->super.super;
}
/*
 * Destroy a receiver: free its segments, finalise the segment array and
 * free the receiver itself.  `tr` must not be used afterwards.
 */
static void free_receiver(xapian_receiver_t *tr)
{
    ptrarray_t *segs = &tr->segs;

    free_segments(tr);
    ptrarray_fini(segs);

    free(tr);
}
/*
 * Dispose of an update receiver: free the cached sequence sets (each
 * value is released with seqset_free) and then the receiver itself.
 * Always returns 0.
 */
static int end_update(search_text_receiver_t *rx)
{
    xapian_update_receiver_t *upd = (xapian_update_receiver_t *)rx;

    free_hash_table(&upd->cached_seqs, (void(*)(void*))seqset_free);
    free_receiver(&upd->super);

    return 0;
}
/*
 * Attach `mailbox` to the snippet receiver and open (with lock) its
 * Xapian databases.  A snippet generator is created only when at least
 * one active directory exists.
 *
 * Returns the result of xapiandb_lock_open().
 */
static int begin_mailbox_snippets(search_text_receiver_t *rx,
                                  struct mailbox *mailbox,
                                  int incremental __attribute__((unused)))
{
    xapian_snippet_receiver_t *tr = (xapian_snippet_receiver_t *)rx;

    tr->super.mailbox = mailbox;

    int r = xapiandb_lock_open(mailbox, &tr->lock);
    if (r) return r;

    if (tr->lock.activedirs && tr->lock.activedirs->count) {
        tr->snipgen = xapian_snipgen_new(tr->lock.db, tr->markup->hi_start,
                                         tr->markup->hi_end, tr->markup->omit);
    }

    return r;
}
/*
 * Walk the query tree rooted at `on` and register with the snippet
 * generator every match term that applies to search part `part`.
 *
 * TRUE/FALSE nodes contribute nothing; NOT/OR/AND nodes are descended
 * into.  A SEARCH_PART_ANY leaf applies to every part except
 * SEARCH_PART_HEADERS; any other SEARCH_PART_* leaf applies only to its
 * own part.
 */
static void generate_snippet_terms(xapian_snipgen_t *snipgen,
                                   int part,
                                   struct opnode *on)
{
    struct opnode *kid;
    int applies = 0;    /* set when this leaf's items match `part` */
    int n;

    switch (on->op) {

    case SEARCH_OP_TRUE:
    case SEARCH_OP_FALSE:
        // nothing to contribute
        break;

    case SEARCH_OP_NOT:
    case SEARCH_OP_OR:
    case SEARCH_OP_AND:
        kid = on->children;
        while (kid) {
            generate_snippet_terms(snipgen, part, kid);
            kid = kid->next;
        }
        break;

    case SEARCH_PART_ANY:
        assert(on->children == NULL);
        applies = (part != SEARCH_PART_HEADERS);
        break;

    default:
        /* other SEARCH_PART_* constants */
        if (on->op >= 0 && on->op < SEARCH_NUM_PARTS) {
            assert(on->children == NULL);
            applies = (part == on->op);
        }
        break;
    }

    if (!applies) return;

    for (n = 0; n < strarray_size(on->items); n++)
        xapian_snipgen_add_match(snipgen, strarray_nth(on->items, n));
}
/*
 * Generate snippets for the segments collected for the current message
 * and hand them to the receiver's callback (tr->proc).
 *
 * Segments are sorted, then grouped by (guid, part id, search part); each
 * group is fed to the snippet generator as its own document, and the
 * resulting snippet, if non-empty, is passed to tr->proc together with
 * the mailbox, UID, search part and part id.
 *
 * Does nothing (returns 0) when there is no query tree or no active
 * directory.  Returns IMAP_INTERNAL when no snippet generator exists.
 * Otherwise returns 0 on success or the first error from the snippet
 * generator or the callback.
 */
static int flush_snippets(search_text_receiver_t *rx)
{
    xapian_snippet_receiver_t *tr = (xapian_snippet_receiver_t *)rx;
    struct buf snippets = BUF_INITIALIZER;
    int i;
    struct segment *seg;
    int last_part = -1;
    int r = 0;

    if (!tr->root) {
        goto out;
    }

    if (!tr->lock.activedirs || !tr->lock.activedirs->count) {
        goto out;
    }

    if (!tr->snipgen) {
        r = IMAP_INTERNAL;  /* need to call begin_mailbox() */
        goto out;
    }

    ptrarray_sort(&tr->super.segs, compare_segs);

    const struct message_guid *last_guid = NULL;
    const char *last_partid = NULL;

    for (i = 0 ; i < tr->super.segs.count ; i++) {
        seg = (struct segment *)ptrarray_nth(&tr->super.segs, i);

        if (!last_guid || message_guid_cmp(last_guid, &seg->guid) ||
            strcmpsafe(seg->partid, last_partid) || seg->part != last_part) {

            if (i) {
                /* In contrast to the update code, we start and end a document
                 * for each search part of the same message. This is due to
                 * the way the snippet callbacks are implemented. */
                r = xapian_snipgen_end_doc(tr->snipgen, &snippets);
                if (!r && snippets.len) {
                    r = tr->proc(tr->super.mailbox, tr->super.uid, last_part,
                                 last_partid, snippets.s, tr->rock);
                }
                if (r) goto out;
            }

            if (search_part_is_body(seg->part)) {
                r = xapian_snipgen_begin_doc(tr->snipgen, &seg->guid,
                                             XAPIAN_WRAP_DOCTYPE_PART);
            }
            else {
                r = xapian_snipgen_begin_doc(tr->snipgen, &tr->super.guid,
                                             XAPIAN_WRAP_DOCTYPE_MSG);
            }
            if (r) break;

            generate_snippet_terms(tr->snipgen, seg->part, tr->root);
            last_guid = &seg->guid;
            last_part = -1;
        }

        r = xapian_snipgen_doc_part(tr->snipgen, &seg->text, seg->part);
        last_partid = seg->partid;
        last_part = seg->part;
        if (r) break;
    }

    if (last_part != -1) {
        /* The generator is still told to end the document, as before, but
         * an error that broke out of the loop above must survive: keep
         * the first failure and do not emit a snippet after one. */
        int end_r = xapian_snipgen_end_doc(tr->snipgen, &snippets);
        if (!r) r = end_r;
        if (!r && snippets.len)
            r = tr->proc(tr->super.mailbox, tr->super.uid, last_part,
                         last_partid, snippets.s, tr->rock);
    }

    free_segments(&tr->super);

out:
    buf_free(&snippets);
    return r;
}
/*
 * End-of-message hook for the snippet receiver: the index level is
 * irrelevant here, the collected segments are simply flushed.
 */
static int end_message_snippets(search_text_receiver_t *rx,
                                uint8_t indexlevel __attribute__((unused)))
{
    int r = flush_snippets(rx);

    return r;
}
/*
 * Detach the snippet receiver from its mailbox: release the database
 * lock, forget the mailbox and free the snippet generator.
 * Always returns 0.
 */
static int end_mailbox_snippets(search_text_receiver_t *rx,
                                struct mailbox *mailbox
                                    __attribute__((unused)))
{
    xapian_snippet_receiver_t *snip = (xapian_snippet_receiver_t *)rx;

    xapiandb_lock_release(&snip->lock);
    snip->super.mailbox = NULL;

    xapian_snipgen_free(snip->snipgen);
    snip->snipgen = NULL;

    return 0;
}
/*
 * Allocate a snippet receiver and wire up its callbacks.
 *
 * `internalised` is stored as the root of the query tree, `m` as the
 * highlighting markup, and `proc`/`rock` as the callback (and its opaque
 * argument) that receives each generated snippet.
 *
 * Returns NULL when check_config() reports a problem; otherwise a
 * pointer to the embedded search_text_receiver_t.
 */
static search_text_receiver_t *begin_snippets(void *internalised,
                                              int verbose,
                                              search_snippet_markup_t *m,
                                              search_snippet_cb_t proc,
                                              void *rock)
{
    if (check_config(NULL)) return NULL;

    xapian_snippet_receiver_t *tr = xzmalloc(sizeof(*tr));
    search_text_receiver_t *ops = &tr->super.super;

    /* mailbox level */
    ops->begin_mailbox = begin_mailbox_snippets;
    ops->end_mailbox = end_mailbox_snippets;
    ops->flush = flush_snippets;

    /* message and part level */
    ops->begin_message = begin_message;
    ops->begin_bodypart = begin_bodypart;
    ops->begin_part = begin_part;
    ops->append_text = append_text;
    ops->end_part = end_part;
    ops->end_bodypart = end_bodypart;
    ops->end_message = end_message_snippets;

    ops->index_charset_flags = xapian_charset_flags;

    tr->super.verbose = verbose;
    tr->root = (struct opnode *)internalised;
    tr->markup = m;
    tr->proc = proc;
    tr->rock = rock;

    return ops;
}
/*
 * Dispose of a snippet receiver: free its snippet generator and then
 * the receiver itself.  Always returns 0.
 */
static int end_snippets(search_text_receiver_t *rx)
{
    xapian_snippet_receiver_t *snip = (xapian_snippet_receiver_t *)rx;

    xapian_snipgen_free(snip->snipgen);
    free_receiver(&snip->super);

    return 0;
}
/*
 * Append to `files` the full path of every regular file found directly
 * inside the user's active Xapian directories (entries whose name starts
 * with '.' are skipped).  Ownership of each appended path passes to
 * `files`.
 *
 * The user's top-level mailbox is looked up, a shared namelock and an
 * activefile read lock are held while the directories are scanned.
 *
 * NOTE(review): this function always returns 0.  Lookup, lock and open
 * failures are logged but not reported to the caller - confirm that
 * callers rely on this before changing it.
 */
static int list_files(const char *userid, strarray_t *files)
{
    char *mboxname = mboxname_user_mbox(userid, NULL);
    struct mboxlist_entry *mbentry = NULL;
    struct mappedfile *activefile = NULL;
    struct mboxlock *xapiandb_namelock = NULL;
    strarray_t *active = NULL;
    strarray_t *dirs = NULL;
    char *namelock_fname = NULL;
    char *fname = NULL;
    DIR *dirh = NULL;
    struct dirent *de;
    struct stat sb;
    int r;
    int i;

    r = mboxlist_lookup(mboxname, &mbentry, NULL);
    if (r) {
        if (r == IMAP_MAILBOX_NONEXISTENT) {
            /* no user, no worries */
            r = 0;
        }
        else {
            syslog(LOG_ERR, "IOERROR: failed to lookup %s", mboxname);
        }
        goto out;
    }

    /* Get a shared namelock */
    namelock_fname = xapiandb_namelock_fname_from_userid(userid);
    r = mboxname_lock(namelock_fname, &xapiandb_namelock, LOCK_SHARED);
    if (r) {
        syslog(LOG_ERR, "Could not acquire shared namelock on %s\n",
               namelock_fname);
        goto out;
    }

    /* Get a readlock on the activefile */
    r = activefile_open(mboxname, mbentry->partition, &activefile, AF_LOCK_READ, &active);
    if (r) {
        syslog(LOG_ERR, "Couldn't open active file: %s", mboxname);
        goto out;
    }
    if (!active) goto out;

    dirs = activefile_resolve(mboxname, mbentry->partition, active, /*dostat*/1, NULL/*resultitems*/);

    for (i = 0; i < dirs->count; i++) {
        const char *basedir = strarray_nth(dirs, i);

        dirh = opendir(basedir);
        if (!dirh) continue;

        for (de = readdir(dirh); de; de = readdir(dirh)) {
            if (de->d_name[0] == '.') continue;

            /* drop the path of a previous entry that was not kept */
            free(fname);
            fname = strconcat(basedir, "/", de->d_name, (char *)NULL);

            if (stat(fname, &sb) != 0 || !S_ISREG(sb.st_mode))
                continue;

            /* the array takes ownership of the path */
            strarray_appendm(files, fname);
            fname = NULL;
        }

        closedir(dirh);
        dirh = NULL;
    }

out:
    if (activefile) {
        mappedfile_unlock(activefile);
        mappedfile_close(&activefile);
    }
    if (xapiandb_namelock) {
        mboxname_release(&xapiandb_namelock);
        xapiandb_namelock = NULL;
    }
    strarray_free(active);
    strarray_free(dirs);
    free(fname);
    free(namelock_fname);
    mboxlist_entry_free(&mbentry);
    free(mboxname);

    return 0;
}
/*
 * Working state shared by the filter and reindex operations.  It is
 * zeroed and populated by create_filter() and torn down by
 * free_mbfilter().
 */
struct mbfilter {
    /* user the operation runs for (borrowed from the caller) */
    const char *userid;
    /* bloom filter; create_filter() fills it from the conversations db
     * when asked to, mbdata_exists_cb() checks ids against it */
    struct bloom bloom;
    /* indexed db opened under the first destination path */
    struct db *indexeddb;
    /* transaction handle passed to store_indexed(); committed by
     * create_filter() and aborted by free_mbfilter() when non-NULL */
    struct txn **tid;
    /* destination directories and tiers (borrowed from the caller) */
    const strarray_t *destpaths;
    const strarray_t *desttiers;
    /* when set, removed from disk and freed by free_mbfilter() */
    char *temp_path;
    /* directories removed from disk by free_mbfilter() */
    strarray_t temptargets;
    /* messages indexed by reindex_mb() since the counter was last reset */
    int numindexed;
    /* flags given to create_filter() */
    int flags;
};
static void free_mbfilter(struct mbfilter *filter)
{
int i;
if (filter->tid) cyrusdb_abort(filter->indexeddb, *filter->tid);
cyrusdb_close(filter->indexeddb);
bloom_free(&filter->bloom);
for (i = 0; i < strarray_size(&filter->temptargets); i++) {
removedir(strarray_nth(&filter->temptargets, i));
}
strarray_fini(&filter->temptargets);
if (filter->temp_path) {
removedir(filter->temp_path);
free(filter->temp_path);
}
}
/*
 * cyrusdb_foreach() callback copying one indexed-db record into the
 * filter's destination db.  Keys starting with '#' (cached entries) are
 * skipped, as are records whose value cannot be parsed.
 * Returns the result of store_indexed(), or 0 when nothing was stored.
 */
static int copyindexed_cb(void *rock,
                          const char *key, size_t keylen,
                          const char *data, size_t datalen)
{
    /* Ignore cached index entries */
    if (key[0] == '#') return 0;

    struct mbfilter *filter = (struct mbfilter *)rock;
    struct seqset *parsed = parse_indexed(data, datalen);

    if (!parsed) return 0;

    /* Copy the record */
    int r = store_indexed(filter->indexeddb, filter->tid, key, keylen, parsed);
    seqset_free(parsed);

    return r;
}
/*
 * Filter callback for xapian_filter(): ids that start with neither
 * "*G*" nor "*P*" yield 0; for the others the 40 characters following
 * the prefix are looked up in the filter's bloom filter and that result
 * is returned.
 */
static int mbdata_exists_cb(const char *cyrusid, void *rock)
{
    struct mbfilter *filter = (struct mbfilter *)rock;
    int has_g_prefix = !strncmp(cyrusid, "*G*", 3);
    int has_p_prefix = !strncmp(cyrusid, "*P*", 3);

    if (!has_g_prefix && !has_p_prefix) return 0;

    return bloom_check(&filter->bloom, cyrusid + 3, 40);
}
/*
 * cyrusdb_foreach() callback adding a key to the bloom filter passed as
 * `rock`: the 40 characters after the key's first character are added.
 * Always returns 0 so the iteration continues.
 */
static int bloomadd_cb(void *rock,
                       const char *key,
                       size_t keylen __attribute__((unused)),
                       const char *data __attribute__((unused)),
                       size_t datalen __attribute__((unused)))
{
    const char *id = key + 1;   /* skip the one-character key prefix */

    bloom_add((struct bloom *)rock, id, 40);

    return 0;
}
static int create_filter(const strarray_t *srcpaths, const strarray_t *destpaths,
const strarray_t *desttiers,
const char *userid, int flags, struct mbfilter *filter,
int bloom)
{
struct buf buf = BUF_INITIALIZER;
int r = 0;
int i;
struct conversations_state *cstate = NULL;
memset(filter, 0, sizeof(struct mbfilter));
filter->destpaths = destpaths;
filter->desttiers = desttiers;
filter->userid = userid;
filter->flags = flags;
/* build the cyrus.indexed.db from the contents of the source dirs */
buf_reset(&buf);
buf_printf(&buf, "%s%s", strarray_nth(destpaths, 0), INDEXEDDB_FNAME);
r = cyrusdb_open(config_getstring(IMAPOPT_SEARCH_INDEXED_DB),
buf_cstring(&buf), CYRUSDB_CREATE, &filter->indexeddb);
if (r) {
printf("ERROR: failed to open indexed %s\n", buf_cstring(&buf));
goto done;
}
for (i = 0; i < srcpaths->count; i++) {
struct db *db = NULL;
buf_reset(&buf);
buf_printf(&buf, "%s%s", strarray_nth(srcpaths, i), INDEXEDDB_FNAME);
r = cyrusdb_open(config_getstring(IMAPOPT_SEARCH_INDEXED_DB),
buf_cstring(&buf), 0, &db);
if (r) {
r = 0;
continue;
}
r = cyrusdb_foreach(db, "", 0, NULL, copyindexed_cb, filter, NULL);
cyrusdb_close(db);
if (r) {
printf("ERROR: failed to process indexed db %s\n", strarray_nth(srcpaths, i));
goto done;
}
}
if (filter->tid) r = cyrusdb_commit(filter->indexeddb, *filter->tid);
if (r) {
printf("ERROR: failed to commit indexed %s\n", strarray_nth(destpaths, 0));
goto done;
}
if (bloom) {
/* assume a 4 million maximum records */
bloom_init(&filter->bloom, 4000000, 0.01);
r = conversations_open_user(userid, 1/*shared*/, &cstate);
if (r) {
printf("ERROR: failed to open conversations for %s\n", userid);
goto done;
}
r = cyrusdb_foreach(cstate->db, "G", 1, NULL, bloomadd_cb, &filter->bloom, NULL);
}
done:
conversations_commit(&cstate);
buf_free(&buf);
return r;
}
/*
 * Filter the source databases into the first destination path, keeping
 * only the documents accepted by mbdata_exists_cb().  Progress is
 * printed on stdout when the flags request verbosity.
 *
 * Returns 0 on success, or the error from create_filter() or
 * xapian_filter().
 */
static int search_filter(const char *userid, const strarray_t *srcpaths,
                         const strarray_t *destpaths, const strarray_t *desttiers, int flags)
{
    struct mbfilter filter;
    const int verbose = SEARCH_VERBOSE(flags);

    int r = create_filter(srcpaths, destpaths, desttiers, userid, flags, &filter, 1);
    if (!r) {
        const char *dest = strarray_nth(destpaths, 0);

        if (verbose)
            printf("Filtering database %s\n", dest);

        r = xapian_filter(dest, (const char **)srcpaths->data,
                          mbdata_exists_cb, &filter);

        if (!r && verbose)
            printf("done %s\n", dest);
    }

    /* the filter is released on success and failure alike */
    free_mbfilter(&filter);
    return r;
}
static int reindex_mb(void *rock,
const char *key, size_t keylen,
const char *data, size_t datalen)
{
struct mbfilter *filter = (struct mbfilter *)rock;
char *mboxname = xstrndup(key, keylen);
struct seqset *seq = parse_indexed(data, datalen);
xapian_update_receiver_t *tr = NULL;
struct mailbox *mailbox = NULL;
struct buf buf = BUF_INITIALIZER;
ptrarray_t batch = PTRARRAY_INITIALIZER;
int verbose = SEARCH_VERBOSE(filter->flags);
strarray_t alldirs = STRARRAY_INITIALIZER;
int r = 0;
int i;
char *dot;
uint32_t uidvalidity;
dot = strrchr(mboxname, '.');
*dot++ = '\0';
uidvalidity = atol(dot);
if (!seq) goto done;
r = mailbox_open_irl(mboxname, &mailbox);
if (r == IMAP_MAILBOX_NONEXISTENT) {
r = 0; /* it's not an error to have a no-longer-exiting mailbox to index */
goto done;
}
if (r) goto done;
if (mailbox->i.uidvalidity != uidvalidity) goto done; /* returns 0, nothing to index */
struct mailbox_iter *iter = mailbox_iter_init(mailbox, 0, ITER_SKIP_UNLINKED);
mailbox_iter_startuid(iter, seqset_first(seq));
const message_t *msg;
while ((msg = mailbox_iter_step(iter))) {
const struct index_record *record = msg_record(msg);
/* it wasn't in the previous index, skip it */
if (!seqset_ismember(seq, record->uid))
continue;
/* we need to create a new message, because the iterator reuses its one */
ptrarray_append(&batch, (void *)message_new_from_record(mailbox, record));
if (record->uid > seqset_last(seq))
break;
}
mailbox_iter_done(&iter);
mailbox_unlock_index(mailbox, NULL);
// nothing to do? Bonus!
if (!ptrarray_size(&batch)) goto done;
/* open the DB */
tr = (xapian_update_receiver_t *)begin_update(verbose);
tr->mode = XAPIAN_DBW_XAPINDEXED; // always use XAPINDEXED for reindex, so we reindex the same emails
tr->super.mailbox = mailbox;
tr->activedirs = strarray_dup(filter->destpaths);
tr->activetiers = strarray_dup(filter->desttiers);
// include all the new databases too
strarray_cat(&alldirs, &filter->temptargets);
// skip the first one, there's no data in there!
for (i = 1; i < strarray_size(filter->destpaths); i++)
strarray_append(&alldirs, strarray_nth(filter->destpaths, i));
r = xapian_dbw_open((const char **)alldirs.data, &tr->dbw, tr->mode, /*nosync*/1);
if (r) goto done;
/* initialise here so it doesn't add firstunindexed
* from oldindexed in is_indexed */
tr->indexed = seqset_init(0, SEQ_MERGE);
int allow_partials = 0;
int getsearchtext_flags = 0;
if (filter->flags & SEARCH_COMPACT_ALLOW_PARTIALS) {
allow_partials = 1;
getsearchtext_flags |= INDEX_GETSEARCHTEXT_PARTIALS;
}
int base = 0;
int batchend = 0;
for (base = 0; base < batch.count; base = batchend) {
/* XXX - errors here could leak... */
/* game on */
batchend = base + 1024;
if (batchend > batch.count) batchend = batch.count;
/* preload */
for (i = base ; i < batchend ; i++) {
message_t *msg = ptrarray_nth(&batch, i);
/* add the record to the list */
uint8_t indexlevel = is_indexed((search_text_receiver_t *)tr, msg);
if (indexlevel == 0 || ((indexlevel & SEARCH_INDEXLEVEL_PARTIAL) && !allow_partials)) {
const char *fname;
r = message_get_fname(msg, &fname);
if (r) goto done;
r = warmup_file(fname, 0, 0);
if (r) goto done; /* means we failed to open a file,
so we'll fail later anyway */
}
else {
// remove it from the list now so we don't try to index it later
message_unref(&msg);
ptrarray_set(&batch, i, NULL);
}
}
/* index the messages */
for (i = base ; i < batchend ; i++) {
message_t *msg = ptrarray_nth(&batch, i);
if (!msg) continue;
r = index_getsearchtext(msg, NULL, &tr->super.super, getsearchtext_flags);
// we must unref the message and then zero out the entry in the ptrarray
// now, because index_getsearchtext will have mapped the file in, and even
// if we decided not to index it, we won't need it again
message_unref(&msg);
ptrarray_set(&batch, i, NULL);
if (r) goto done;
filter->numindexed++;
}
// the next write will start a new transaction if uncommitted == 0
if (tr->uncommitted) {
r = xapian_dbw_commit_txn(tr->dbw);
if (r) goto done;
tr->uncommitted = 0;
tr->commits++;
// we don't want to blow out the temporary space, so let's split every so often!
if (filter->numindexed > XAPIAN_REINDEX_TEMPDIR_COUNT ||
xapian_dbw_total_length(tr->dbw) > XAPIAN_REINDEX_TEMPDIR_SIZE) {
// close the database, move the data, open a new database with a new target! Yikes
xapian_dbw_close(tr->dbw);
// we move the existing temp database to the same partition as the target, then
// start up a brand new temp database at the same path!
// e.g. temp == /tmpfs/cyrus-tempXXX-reindex/xapian
// buf == /mnt/searchdrive/path/to/user/xapian.23.REINDEX.NEW.<num>
buf_reset(&buf);
buf_printf(&buf, "%s.%d", strarray_nth(filter->destpaths, 0), (int)strarray_size(&filter->temptargets));
const char *temp = strarray_nth(&filter->temptargets, 0);
syslog(LOG_DEBUG, "REINDEX: chunking %s to %s", temp, buf_cstring(&buf));
r = copy_files(temp, buf_cstring(&buf));
removedir(temp);
// insert the new path directly after the temporary directory
strarray_insert(&filter->temptargets, 1, buf_cstring(&buf));
// and also open it for the next writes
strarray_insert(&alldirs, 1, buf_cstring(&buf));
// finally, re-open the database on the new empty directory with the extra path added
if (!r) r = xapian_dbw_open((const char **)alldirs.data, &tr->dbw, tr->mode, /*nosync*/1);
if (r) goto done;
filter->numindexed = 0;
}
}
}
done:
if (tr) {
strarray_free(tr->activedirs);
strarray_free(tr->activetiers);
if (tr->indexed) seqset_free(tr->indexed);
if (tr->dbw) xapian_dbw_close(tr->dbw);
free_receiver(&tr->super);
}
mailbox_close(&mailbox);
for (i = 0; i < batch.count; i++) {
message_t *msg = ptrarray_nth(&batch, i);
message_unref(&msg);
}
ptrarray_fini(&batch);
free(mboxname);
seqset_free(seq);
strarray_fini(&alldirs);
buf_free(&buf);
return r;
}
/*
 * Reindex every message recorded in the filter's indexed-db into a
 * temporary Xapian database, then move the result to destpaths[0].
 *
 * reindex_mb() (run once per indexed-db record) may split the work into
 * several chunk databases; their paths accumulate in filter.temptargets.
 * Depending on how many targets remain at the end, the result is either
 * copied into place (one target) or compacted together (several).
 *
 * Returns 0 on success, or the first error reported by create_filter(),
 * cyrusdb_foreach(), copy_files() or xapian_compact_dbs().
 */
static int search_reindex(const char *userid, const strarray_t *srcpaths,
                          const strarray_t *destpaths, const strarray_t *desttiers, int flags)
{
    struct buf buf = BUF_INITIALIZER;
    struct mbfilter filter;
    int verbose = SEARCH_VERBOSE(flags);
    int r;

    r = create_filter(srcpaths, destpaths, desttiers, userid, flags, &filter, 0);
    if (r) goto done;

    if (verbose)
        printf("Reindexing messages for %s\n", userid);

    // set up temporary target
    // NOTE(review): the result of create_tempdir() is used unchecked --
    // confirm it cannot return NULL
    filter.temp_path = create_tempdir(config_getstring(IMAPOPT_TEMP_PATH), "reindex");
    buf_printf(&buf, "%s/xapian", filter.temp_path);
    strarray_append(&filter.temptargets, buf_cstring(&buf));

    // do the indexing
    r = cyrusdb_foreach(filter.indexeddb, "", 0, NULL, reindex_mb, &filter, NULL);
    if (r) {
        printf("ERROR: failed to reindex to %s\n", strarray_nth(destpaths, 0));
        goto done;
    }

    // we exactly managed to split at the end, or there was nothing to process!
    if (!filter.numindexed)
        free(strarray_shift(&filter.temptargets)); // removes temp_path

    // put all the indexes into the destination path
    if (strarray_size(&filter.temptargets) == 0) {
        // nothing to copy!
    }
    else if (strarray_size(&filter.temptargets) == 1) {
        // copy into place. Strictly this is a waste, we could just compact directly from here
        r = copy_files(strarray_nth(&filter.temptargets, 0), strarray_nth(destpaths, 0));
    }
    else {
        // we're going to double-compact, but that's OK
        r = xapian_compact_dbs(strarray_nth(destpaths, 0), (const char **)filter.temptargets.data);
    }

    // report a failed final copy/compact the same way as a failed indexing
    // pass, instead of announcing "done"
    if (r) {
        printf("ERROR: failed to reindex to %s\n", strarray_nth(destpaths, 0));
        goto done;
    }

    if (verbose)
        printf("done %s\n", strarray_nth(destpaths, 0));

done:
    free_mbfilter(&filter);
    buf_free(&buf);
    return r;
}
/*
 * Compact all databases listed in srcpaths into destpaths[0].
 *
 * A filter is still created (and freed) around the compaction, exactly as
 * before; only the compaction result decides the return value once the
 * filter has been set up.
 *
 * Returns 0 on success, or the error from create_filter() or
 * xapian_compact_dbs().
 */
static int search_compress(const char *userid, const strarray_t *srcpaths,
                           const strarray_t *destpaths, const strarray_t *desttiers, int flags)
{
    struct mbfilter filter;
    int verbose = SEARCH_VERBOSE(flags);
    int r;

    r = create_filter(srcpaths, destpaths, desttiers, userid, flags, &filter, 0);
    if (r) goto done;

    if (verbose)
        printf("Compressing messages for %s\n", userid);

    r = xapian_compact_dbs(strarray_nth(destpaths, 0), (const char **)srcpaths->data);
    if (r) {
        printf("ERROR: failed to compress to %s\n", strarray_nth(destpaths, 0));
        goto done;
    }

    if (verbose)
        printf("done %s\n", strarray_nth(destpaths, 0));

done:
    free_mbfilter(&filter);
    return r;
}
/*
 * Remove search directories that the active list no longer references.
 *
 * inspect_filesystem() fills two lists: items it found and paths it
 * considers bogus.  Every found item that is not named in `active` is
 * resolved to a path and removed; every bogus path is removed as-is.
 * With `verbose` set, each removal is announced on stdout.
 */
static void cleanup_xapiandirs(const char *mboxname, const char *partition, strarray_t *active, int verbose)
{
    strarray_t ondisk = STRARRAY_INITIALIZER;
    strarray_t junk = STRARRAY_INITIALIZER;
    int n;

    inspect_filesystem(mboxname, partition, &ondisk, &junk);

    /* whatever is still active must survive */
    n = 0;
    while (n < strarray_size(active)) {
        strarray_remove_all(&ondisk, strarray_nth(active, n));
        n++;
    }

    /* the remainder is unreferenced */
    n = 0;
    while (n < strarray_size(&ondisk)) {
        const char *name = strarray_nth(&ondisk, n);
        char *dir = activefile_path(mboxname, partition, name, /*dostat*/0);
        if (verbose) {
            printf("Removing unreferenced item %s (%s)\n", name, dir);
        }
        removedir(dir);
        free(dir);
        n++;
    }

    /* bogus entries are already full paths */
    n = 0;
    while (n < strarray_size(&junk)) {
        const char *dir = strarray_nth(&junk, n);
        if (verbose) {
            printf("Removing bogus path %s\n", dir);
        }
        removedir(dir);
        n++;
    }

    strarray_fini(&ondisk);
    strarray_fini(&junk);
}
/*
 * Compact (and optionally reindex or filter) a user's search databases
 * from the tiers in `srctiers` into one new database on `desttier`.
 *
 * Outline:
 *   1. Under an exclusive namelock and a write lock on the activefile,
 *      pick the items to change, register the new destination name (and,
 *      if the first active item is affected, a fresh first item) and
 *      write the activefile.
 *   2. Under a shared namelock, build the new database in
 *      "<destdir>.NEW", reindexing first into "<destdir>.NEW.REINDEX"
 *      where required.
 *   3. Under an exclusive namelock again, verify the activefile is
 *      unchanged, rename the result into place, drop the source items from
 *      the activefile and delete their directories.
 *
 * Returns 0 on success and also in the "nothing to do" cases visible below
 * (user does not exist, non-blocking lock is busy, nothing matched, lost
 * the race against another writer); otherwise the error of the step that
 * failed.
 */
static int compact_dbs(const char *userid, const strarray_t *reindextiers,
                       const strarray_t *srctiers, const char *desttier,
                       int flags)
{
    char *mboxname = mboxname_user_mbox(userid, NULL);
    struct mboxlist_entry *mbentry = NULL;
    struct mappedfile *activefile = NULL;
    struct mboxlock *xapiandb_namelock = NULL;
    strarray_t *srcdirs = NULL;
    strarray_t *newdirs = NULL;
    strarray_t *active = NULL;
    strarray_t *tochange = NULL;
    strarray_t *reindexitems = NULL;
    strarray_t *orig = NULL;
    strarray_t *toreindex = NULL;
    strarray_t *tocompact = NULL;
    char *newdest = NULL;
    char *destdir = NULL;
    char *tempdestdir = NULL;
    char *tempreindexdir = NULL;
    strarray_t *newtiers = NULL;
    char *namelock_fname = NULL;
    int verbose = SEARCH_VERBOSE(flags);
    int created_something = 0;
    int r = 0;
    int i;

    r = mboxlist_lookup(mboxname, &mbentry, NULL);
    if (r == IMAP_MAILBOX_NONEXISTENT) {
        /* no user, no worries */
        r = 0;
        goto out;
    }
    if (r) {
        syslog(LOG_ERR, "IOERROR: failed to lookup %s", mboxname);
        goto out;
    }

    r = check_config(NULL);
    if (r) goto out;

    /* NOTE(review): an unknown tier leaves r == 0, so the caller sees
     * success -- confirm that this is intended */
    if (!xapian_rootdir(desttier, mbentry->partition)) {
        if (verbose)
            printf("INVALID: unknown tier %s\n", desttier);
        goto out;
    }

    /* Generate the namelock filename */
    namelock_fname = xapiandb_namelock_fname_from_userid(userid);

    /* Get an exclusive namelock */
    int lockflags = LOCK_EXCLUSIVE;
    if (flags & SEARCH_COMPACT_NONBLOCKING) lockflags |= LOCK_NONBLOCK;
    r = mboxname_lock(namelock_fname, &xapiandb_namelock, lockflags);
    if (r == IMAP_MAILBOX_LOCKED) {
        // that's OK, we asked for it!
        r = 0;
        goto out;
    }
    if (r) {
        syslog(LOG_ERR, "Could not acquire exclusive namelock on %s\n",
               namelock_fname);
        goto out;
    }

    /* take an exclusive lock on the activefile file */
    r = activefile_open(mboxname, mbentry->partition, &activefile, AF_LOCK_WRITE, &active);
    if (r) {
        syslog(LOG_ERR, "Failed to lock activefile for %s", mboxname);
        goto out;
    }
    if (!active || !active->count) goto out;

    /* keep the pre-change list: it is used for the verbose report and to
     * work out which databases are not being touched */
    orig = strarray_dup(active);

    /* read the activefile file, taking down the names of all paths with a
     * level less than or equal to that requested */
    tochange = activefile_filter(active, srctiers, mbentry->partition);
    if (!tochange || !tochange->count) goto out;

    /* also, track which ones to reindex */
    if (reindextiers) {
        reindexitems = activefile_filter(tochange, reindextiers, mbentry->partition);
    }
    else {
        reindexitems = strarray_new();
    }

    /* a single item that would only be copied onto its own tier */
    if (tochange->count == 1 && srctiers->count == 1 &&
        (flags & SEARCH_COMPACT_COPYONE) && !strcmp(desttier, strarray_nth(srctiers, 0))) {
        if (verbose) {
            printf("Skipping %s for %s, only one\n", strarray_nth(tochange, 0), mboxname);
        }
        goto out;
    }

    /* find out which items actually exist from the set to be compressed - first pass */
    srcdirs = activefile_resolve(mboxname, mbentry->partition, tochange, /*dostat*/1, NULL/*resultitems*/);
    if (!srcdirs || !srcdirs->count) goto out;

    /* NOTE: it's safe to keep this list even over the unlock/relock because we
     * always write out a new first item if necessary, so these will never be
     * written to after we release the lock - if they don't have content now,
     * they never will */

    /* register the target name first, and put it at the end of the file */
    newdest = activefile_nextname(active, desttier);
    strarray_push(active, newdest);

    if (verbose) {
        char *target = strarray_join(tochange, ",");
        char *activestr = strarray_join(orig, ",");
        char *reindexstr = strarray_join(reindexitems, ",");
        const char *reindex = (flags & SEARCH_COMPACT_REINDEX)
            ? "ALL" : reindexstr ? reindexstr : "NONE";
        printf("compressing %s to %s for %s (active %s) (reindex %s)\n",
               target, newdest, mboxname, activestr, reindex);
        free(reindexstr);
        free(activestr);
        free(target);
    }

    /* are we going to change the first active? We need to start indexing to
     * a new location! */
    if (strarray_find(tochange, strarray_nth(active, 0), 0) >= 0) {
        /* always recalculate the first name once the destination is chosen,
         * because we may be compressing to the default tier for some reason */
        char *newstart = activefile_nextname(active, config_getstring(IMAPOPT_DEFAULTSEARCHTIER));
        if (verbose) {
            printf("adding new initial search location %s\n", newstart);
        }
        strarray_unshiftm(active, newstart);
    }

    destdir = activefile_path(mboxname, mbentry->partition, newdest, /*dostat*/0);
    tempdestdir = strconcat(destdir, ".NEW", (char *)NULL);

    /* write the new file and release the exclusive lock */
    activefile_write(activefile, active);
    mappedfile_unlock(activefile);

    /* Release the exclusive named lock */
    if (xapiandb_namelock) {
        mboxname_release(&xapiandb_namelock);
        xapiandb_namelock = NULL;
    }

    /* Get a shared name lock */
    r = mboxname_lock(namelock_fname, &xapiandb_namelock, LOCK_SHARED);
    if (r) {
        syslog(LOG_ERR, "Could not acquire shared namelock on %s\n",
               namelock_fname);
        goto out;
    }

    /* take a shared lock */
    mappedfile_readlock(activefile);

    /* reread and ensure our 'directory zero' is still directory zero,
     * otherwise abort now */
    {
        strarray_t *newactive = activefile_read(activefile);
        if (strarray_cmp(active, newactive)) {
            if (verbose) {
                printf("aborting compact of %s, lost the race early\n", mboxname);
            }
            strarray_free(newactive);
            goto out;
        }
        strarray_free(newactive);
    }

    /* release the shared lock on the active file, compacting is safe
     * without locking activefile.
     */
    mappedfile_unlock(activefile);

    /* make sure the destination path exists */
    r = cyrus_mkdir(tempdestdir, 0755);
    if (r) goto out;
    /* and doesn't contain any junk */
    removedir(tempdestdir);
    r = mkdir(tempdestdir, 0755);
    if (r) goto out;

    if (srcdirs->count == 1 && (flags & SEARCH_COMPACT_COPYONE)) {
        if (verbose) {
            printf("only one source, copying directly to %s\n", tempdestdir);
        }
        cyrus_mkdir(tempdestdir, 0755);
        removedir(tempdestdir);
        r = copy_files(strarray_nth(srcdirs, 0), tempdestdir);
        if (r) goto out;
        created_something = 1;
    }
    else if (srcdirs->count) {
        if (verbose) {
            printf("compacting databases\n");
        }

        /* calculate the existing databases that we also need to check for duplicates */
        strarray_t *existing = strarray_dup(orig);
        for (i = 0; i < tochange->count; i++)
            strarray_remove_all(existing, strarray_nth(tochange, i));
        newdirs = activefile_resolve(mboxname, mbentry->partition, existing, /*dostat*/1, &newtiers);
        strarray_free(existing);

        /* we'll be prepending the final target directory to newdirs before compacting,
         * so also add the new tier so the indexes match up */
        strarray_unshift(newtiers, newdest);

        tocompact = strarray_new();

        if ((flags & SEARCH_COMPACT_REINDEX)) {
            /* all databases to be reindexed */
            toreindex = strarray_dup(srcdirs);
        }
        else {
            toreindex = activefile_resolve(mboxname, mbentry->partition, reindexitems, 0, NULL);
            xapian_check_if_needs_reindex(srcdirs, toreindex, flags & SEARCH_COMPACT_ONLYUPGRADE);
            /* everything that is not reindexed gets compacted directly */
            for (i = 0; i < srcdirs->count; i++) {
                const char *thisdir = strarray_nth(srcdirs, i);
                if (strarray_find(toreindex, thisdir, 0) < 0)
                    strarray_append(tocompact, thisdir);
            }
        }

        if (!toreindex->count && (flags & SEARCH_COMPACT_ONLYUPGRADE)) {
            /* nothing to reindex, so bail now. Since we don't set 'r', we will just
             * abort with no change other than a new tmp location which compresses down
             * soon enough */
            goto out;
        }

        // first, we'll reindex anything that needs reindexing to a temporary directory
        if (toreindex->count) {
            tempreindexdir = strconcat(tempdestdir, ".REINDEX", (char *)NULL);
            // add this directory to the repack target as the first entry point
            strarray_unshift(newdirs, tempreindexdir);
            r = search_reindex(userid, toreindex, newdirs, newtiers, flags);
            if (r) {
                printf("ERROR: failed to reindex to %s\n", tempreindexdir);
                removedir(tempreindexdir);
                goto out;
            }
            // remove tempreindexdir from newdirs again, it's going to be compacted instead
            free(strarray_shift(newdirs));
            // and then add the temporary directory to the to-compact list if anything was indexed into it
            if (!xapstat(tempreindexdir))
                strarray_unshift(tocompact, tempreindexdir);
        }

        // then we'll compact together all the source databases
        if (tocompact->count) {
            // and now we're ready to compact to the real tempdir
            strarray_unshift(newdirs, tempdestdir);
            if (flags & SEARCH_COMPACT_FILTER) {
                r = search_filter(userid, tocompact, newdirs, newtiers, flags);
                if (r) {
                    printf("ERROR: failed to filter to %s\n", tempdestdir);
                    goto out;
                }
            }
            else {
                r = search_compress(userid, tocompact, newdirs, newtiers, flags);
                if (r) {
                    printf("ERROR: failed to compact to %s\n", tempdestdir);
                    goto out;
                }
            }
            if (!xapstat(tempdestdir)) {
                created_something = 1;
            }
        }

        /* the reindex staging directory has been folded into the result */
        if (tempreindexdir) {
            removedir(tempreindexdir);
            free(tempreindexdir);
            tempreindexdir = NULL;
        }
    }

    /* Release the shared named lock */
    if (xapiandb_namelock) {
        mboxname_release(&xapiandb_namelock);
        xapiandb_namelock = NULL;
    }

    /* Get an exclusive namelock */
    r = mboxname_lock(namelock_fname, &xapiandb_namelock, LOCK_EXCLUSIVE);
    if (r) {
        syslog(LOG_ERR, "Could not acquire exclusive namelock on %s\n",
               namelock_fname);
        goto out;
    }

    /* check that we still have 'directory zero'. If not, abort.
     * NOTE(review): the temporary directory is not removed on this path */
    {
        strarray_t *newactive = activefile_read(activefile);
        if (strarray_cmp(active, newactive)) {
            if (verbose) {
                printf("aborting compact of %s, lost the race late\n", mboxname);
            }
            strarray_free(newactive);
            goto out;
        }
        strarray_free(newactive);
    }

    if (created_something) {
        /* rename the destination data into place */
        if (verbose) {
            printf("renaming tempdir into place\n");
        }
        removedir(destdir);
        r = rename(tempdestdir, destdir);
        if (r) {
            printf("ERROR: failed to rename into place %s to %s\n", tempdestdir, destdir);
            goto out;
        }
    }
    else {
        if (verbose) {
            printf("nothing compacted, cleaning up %s\n", newdest);
        }
        /* the registered destination never materialised, so drop it from
         * the active list together with the sources */
        strarray_append(tochange, newdest);
    }

    for (i = 0; i < tochange->count; i++)
        strarray_remove_all(active, strarray_nth(tochange, i));

    /* Get an exclusive lock on the activefile */
    mappedfile_writelock(activefile);

    activefile_write(activefile, active);

    /* release the lock */
    mappedfile_unlock(activefile);

    /* And finally remove all directories on disk of the source dbs */
    for (i = 0; i < srcdirs->count; i++)
        removedir(strarray_nth(srcdirs, i));

    /* remove any other files that are still lying around! */
    cleanup_xapiandirs(mboxname, mbentry->partition, active, verbose);

    /* Release the exclusive named lock */
    if (xapiandb_namelock) {
        mboxname_release(&xapiandb_namelock);
        xapiandb_namelock = NULL;
    }

    if (verbose) {
        char *alist = strarray_join(active, ",");
        printf("finished compact of %s (active %s)\n", mboxname, alist);
        free(alist);
    }

out:
    strarray_free(orig);
    strarray_free(active);
    strarray_free(srcdirs);
    strarray_free(newdirs);
    strarray_free(newtiers);
    strarray_free(toreindex);
    strarray_free(tochange);
    strarray_free(tocompact);
    strarray_free(reindexitems);
    free(namelock_fname);
    free(newdest);
    free(destdir);
    free(tempdestdir);
    free(tempreindexdir);
    mappedfile_unlock(activefile);
    mappedfile_close(&activefile);
    if (xapiandb_namelock) {
        mboxname_release(&xapiandb_namelock);
        xapiandb_namelock = NULL;
    }
    mboxlist_entry_free(&mbentry);
    free(mboxname);
    return r;
}
/* cleanup */
/*
 * Callback for config_foreachoverflowstring(): handles keys containing
 * "searchpartition-".  The text before that marker is taken as the tier
 * and the text after it as the partition; the base directory that
 * xapian_basedir() reports for the mailbox name passed in `rock` is
 * removed.  Keys without the marker are ignored.
 */
static void delete_one(const char *key, const char *val __attribute__((unused)), void *rock)
{
    static const char marker[] = "searchpartition-";
    const char *mboxname = (const char *)rock;
    const char *hit = strstr(key, marker);
    char *basedir = NULL;
    char *tier;

    if (!hit) return;

    /* tier name is whatever precedes the marker */
    tier = xstrndup(key, hit - key);

    /* partition name starts right after the marker */
    xapian_basedir(tier, mboxname, hit + (sizeof(marker) - 1), NULL, &basedir);
    if (basedir) {
        removedir(basedir);
    }

    free(basedir);
    free(tier);
}
/*
 * Delete all search data of `userid`: every per-tier/partition base
 * directory found through the configuration (see delete_one) and finally
 * the activefile itself.
 *
 * The work is done while holding an exclusive namelock and a write lock
 * on the activefile.  Returns 0 on success or the error from acquiring
 * either lock / opening the activefile.
 */
static int delete_user(const char *userid)
{
    char *mboxname = mboxname_user_mbox(userid, /*subfolder*/NULL);
    char *activename = activefile_fname(mboxname);
    struct mappedfile *activefile = NULL;
    struct mboxlock *xapiandb_namelock = NULL;
    char *namelock_fname = NULL;
    int r = 0;

    /* Get an exclusive namelock */
    namelock_fname = xapiandb_namelock_fname_from_userid(userid);
    r = mboxname_lock(namelock_fname, &xapiandb_namelock, LOCK_EXCLUSIVE);
    if (r) {
        syslog(LOG_ERR, "Could not acquire exclusive namelock on %s\n",
               namelock_fname);
        goto out;
    }

    /* grab an exclusive lock on activefile: that way we won't delete
     * it out from under something else (such as squatter)
     */
    r = mappedfile_open(&activefile, activename, MAPPEDFILE_RW);
    if (r) goto out;
    r = mappedfile_writelock(activefile);
    if (r) goto out;

    /* remove the data directories first, the activefile last */
    config_foreachoverflowstring(delete_one, mboxname);
    unlink(activename);

out:
    if (activefile) {
        mappedfile_unlock(activefile);
        mappedfile_close(&activefile);
    }
    if (xapiandb_namelock) {
        mboxname_release(&xapiandb_namelock);
        xapiandb_namelock = NULL;
    }
    free(namelock_fname);
    free(activename);
    free(mboxname);
    return r;
}
/*
 * Collect language statistics for `userid`.
 *
 * Opens the user's top-level mailbox, opens the Xapian database lock for
 * it and, if that yields a database, hands `lstats` and `total_docs` to
 * xapian_db_langstats().  Returns 0 when there is no database; otherwise
 * the result of the first failing step or of xapian_db_langstats().
 */
static int langstats(const char *userid, ptrarray_t *lstats, size_t *total_docs)
{
    struct mailbox *mailbox = NULL;
    char *inboxname = mboxname_user_mbox(userid, NULL);
    struct xapiandb_lock lock = XAPIANDB_LOCK_INITIALIZER;
    int r;

    r = mailbox_open_irl(inboxname, &mailbox);
    if (!r) {
        r = xapiandb_lock_open(mailbox, &lock);
    }
    if (!r && lock.db != NULL) {
        r = xapian_db_langstats(lock.db, lstats, total_docs);
    }

    xapiandb_lock_release(&lock);
    mailbox_close(&mailbox);
    free(inboxname);
    return r;
}
/*
 * Tell whether this engine handles the given operation: only fuzzy
 * matches, and only for a real search part.  Returns 1 or 0.
 */
static int can_match(enum search_op matchop, int partnum)
{
    if (matchop != SEOP_FUZZYMATCH) {
        return 0;
    }
    return partnum != SEARCH_PART_NONE;
}
/*
 * Engine descriptor for the Xapian backend: a name, capability flags and
 * the table of entry points defined in this file.
 * The initializer is positional, so the order of the entries must follow
 * the member order of struct search_engine (declared elsewhere).
 */
const struct search_engine xapian_search_engine = {
"Xapian",
SEARCH_FLAG_CAN_BATCH | SEARCH_FLAG_CAN_GUIDSEARCH,
/* search */
begin_search,
end_search,
/* index update */
begin_update,
end_update,
/* snippets */
begin_snippets,
end_snippets,
describe_internalised,
free_internalised,
/* maintenance */
list_files,
compact_dbs,
delete_user, /* XXX: fixme */
check_config,
langstats,
can_match
};
diff --git a/imap/xapian_wrap.cpp b/imap/xapian_wrap.cpp
index 1608a9aca..faafea9d7 100644
--- a/imap/xapian_wrap.cpp
+++ b/imap/xapian_wrap.cpp
@@ -1,2511 +1,2497 @@
#include <errno.h>
#include <config.h>
#include <string.h>
#include <sys/types.h>
#include <syslog.h>
#include <algorithm>
#include <exception>
#include <fstream>
#include <iterator>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
extern "C" {
#include <assert.h>
#include "libconfig.h"
#include "util.h"
#include "search_engines.h"
#include "search_part.h"
#include "xmalloc.h"
#include "xapian_wrap.h"
#include "charset.h"
#include "ptrarray.h"
#include "parseaddr.h"
/* generated headers are not necessarily in current directory */
#include "imap/imap_err.h"
};
#include <unicode/unistr.h>
#include <unicode/locid.h>
#include <xapian.h>
#ifdef HAVE_CLD2
#include <cld2/public/compact_lang_det.h>
#endif
// from global.h
extern int charset_flags;
#define SLOT_CYRUSID 0
#define SLOT_DOCLANGS 1
#define SLOT_INDEXLEVEL 2
#define SLOT_INDEXVERSION 3
static const unsigned XAPIAN_MAX_TERM_LENGTH = 200; /* in UTF-8 bytes */
/* ====================================================================== */
/*
 * Write the cyrusid for `guid` into `dst`, replacing its previous
 * contents: '*', the document type character, '*', then the encoded guid.
 */
static void make_cyrusid(struct buf *dst, const struct message_guid *guid, char doctype)
{
    const char lead[] = { '*', doctype, '*' };

    buf_reset(dst);
    for (char c : lead) {
        buf_putc(dst, c);
    }
    buf_appendcstr(dst, message_guid_encode(guid));
    buf_cstring(dst);
}
/* ====================================================================== */
/*
* A brief history of Xapian db versions:
* Version 0: uses STEM_ALL for all terms, term prefixes don't start with 'X'
* Version 1: term prefixes start with 'X'
* Version 2: uses STEM_SOME for some terms
* Version 3: removes all use of STEM_ALL
* Version 4: indexes headers and bodies in separate documents
* Version 5: indexes headers and bodies together and stems by language
* Version 6: stores all detected languages of a document in slot SLOT_DOCLANGS (deprecated)
* Version 7: indexes new DELIVEREDTO search part
* Version 8: reintroduces language indexing for non-English text
* Version 9: introduces index levels as keys to cyrusid metadata
* Version 10: indexes new PRIORITY search part
* Version 11: indexes LIST-ID as single value
* Version 12: indexes email domains as single values. Supports subdomain search.
* Version 13: indexes content-type and subtype separately
* Version 14: adds SLOT_INDEXVERSION to documents
* Version 15: receives indexed header fields and text in original format (rather than search form)
* Version 16: indexes entire addr-spec as a single value. Prevents cross-matching localparts and domains
*/
#define XAPIAN_DB_CURRENT_VERSION 16
#define XAPIAN_DB_MIN_SUPPORTED_VERSION 5
/*
 * Return the set of index versions recorded in the database metadata.
 *
 * Two keys are consulted: "cyrus.db_version" (comma-separated list; entries
 * that parse to 0 are skipped) and the older "cyrus.stem-version" (a single
 * number).  A malformed "cyrus.stem-version" value is logged and ignored
 * rather than letting std::stoi's exception escape to the caller.
 */
static std::set<int> read_db_versions(const Xapian::Database &database)
{
    std::set<int> versions;

    // db_version is a comma-separated list of version numbers
    std::string val = database.get_metadata("cyrus.db_version");
    if (!val.empty()) {
        strarray_t *vstr = strarray_split(val.c_str(), ",", 0);
        for (int i = 0; i < strarray_size(vstr); i++) {
            int version = std::atoi(strarray_nth(vstr, i));
            if (version) versions.insert(version);
        }
        strarray_free(vstr);
    }

    // Up to version 3 this was named stem version.
    val = database.get_metadata("cyrus.stem-version");
    if (!val.empty()) {
        try {
            versions.insert(std::stoi(val));
        }
        catch (const std::logic_error&) {
            // std::invalid_argument or std::out_of_range
            syslog(LOG_WARNING,
                   "Xapian: ignoring malformed cyrus.stem-version <%s>",
                   val.c_str());
        }
    }

    return versions;
}
/*
 * Store `versions` as a comma-separated list under "cyrus.db_version" and
 * clear the older "cyrus.stem-version" key.
 */
static void write_db_versions(Xapian::WritableDatabase &database, std::set<int> &versions)
{
    std::ostringstream joined;
    const char *sep = "";

    for (int version : versions) {
        joined << sep << version;
        sep = ",";
    }

    database.set_metadata("cyrus.db_version", joined.str());
    database.set_metadata("cyrus.stem-version", "");
}
/* ====================================================================== */
#define XAPIAN_LANG_COUNT_KEYPREFIX "lang.count"
#define XAPIAN_LANG_DOC_KEYPREFIX "lang.doc"
/*
 * Build the term prefix for a language: `prefix` + "XI" + `iso_lang`,
 * upper-cased as a whole.
 */
static std::string lang_prefix(const std::string& iso_lang, const char *prefix)
{
    std::string ustr = std::string(prefix) + "XI" + iso_lang;
    // <ctype.h> functions need an unsigned char value; passing a negative
    // plain char is undefined
    std::transform(ustr.begin(), ustr.end(), ustr.begin(),
                   [](unsigned char c) { return static_cast<char>(::toupper(c)); });
    return ustr;
}
/* Metadata key for the language of one document: "<doc prefix>.<cyrusid>". */
static std::string lang_doc_key(const char *cyrusid)
{
    return std::string(XAPIAN_LANG_DOC_KEYPREFIX ".") + cyrusid;
}
/* Metadata key for the counter of one language: "<count prefix>.<iso_lang>". */
static std::string lang_count_key(const std::string& iso_lang)
{
    return std::string(XAPIAN_LANG_COUNT_KEYPREFIX ".") + iso_lang;
}
/*
 * Add the per-language body part counts of `db` to `lang_counts`.
 *
 * Nothing is counted unless lower_bound(8) is the first element of the
 * version set, i.e. no recorded version is below 8 (this also holds for an
 * empty set).  Body parts are counted from the "cyrusid.*P*" metadata keys;
 * parts that also have a language key are counted under that language, the
 * rest under "en".
 *
 * Returns 0, or IMAP_IOERROR if there are more language keys than body
 * parts.  In that case the "en" counter is left untouched (the unsigned
 * difference would otherwise wrap around).
 */
static int calculate_language_counts(const Xapian::Database& db,
                                     std::map<const std::string, unsigned>& lang_counts)
{
    std::set<int> db_versions = read_db_versions(db);

    if (db_versions.lower_bound(8) == db_versions.begin()) {
        // count all indexed body parts
        size_t nparts = 0;
        for (Xapian::TermIterator it = db.metadata_keys_begin("cyrusid.*P*");
                it != db.metadata_keys_end("cyrusid.*P*"); ++it) {
            nparts++;
        }

        // count body parts with language metadata
        const std::string prefix{XAPIAN_LANG_DOC_KEYPREFIX ".*P*"};
        size_t nlangparts = 0;
        for (Xapian::TermIterator it = db.metadata_keys_begin(prefix);
                it != db.metadata_keys_end(prefix); ++it) {
            lang_counts[db.get_metadata(*it)] += 1;
            nlangparts++;
        }

        // Sanity check data before computing the difference
        if (nparts < nlangparts) {
            return IMAP_IOERROR;
        }

        // English or unknown language body parts have no metadata.
        lang_counts["en"] += nparts - nlangparts;
    }

    return 0;
}
/*
 * Strip legacy language data from a writable database:
 *  - metadata entries under the language-document prefix that match the
 *    legacy test below are cleared;
 *  - documents whose SLOT_DOCLANGS value does not start with a letter get
 *    that slot removed, and the changed document is written back.
 */
static void remove_legacy_metadata(Xapian::WritableDatabase& db)
{
    const std::string prefix{XAPIAN_LANG_DOC_KEYPREFIX "."};
    for (Xapian::TermIterator key = db.metadata_keys_begin(prefix);
            key != db.metadata_keys_end(prefix); ++key) {
        const std::string val = db.get_metadata(*key);
        // Remove legacy keys and values.
        // NOTE(review): every key visited here starts with `prefix`, which
        // itself contains '.', so the first test is always true and all
        // keys under the prefix are cleared -- confirm whether the search
        // was meant to start after the prefix.
        if ((*key).find('.') != std::string::npos ||
                (!val.empty() && !isalpha(static_cast<unsigned char>(val[0])))) {
            db.set_metadata(*key, "");
        }
    }

    for (Xapian::docid docid = 1; docid <= db.get_lastdocid(); ++docid) {
        try {
            Xapian::Document doc = db.get_document(docid);
            const std::string val = doc.get_value(SLOT_DOCLANGS);
            // Remove legacy doclang slot values.
            if (!val.empty() && !isalpha(static_cast<unsigned char>(val[0]))) {
                doc.remove_value(SLOT_DOCLANGS);
                // a Document is an in-memory copy: the change only reaches
                // the database once the document is replaced
                db.replace_document(docid, doc);
            }
        }
        catch (const Xapian::DocNotFoundError&) {
            // ignore: docids need not be contiguous
        }
    }
}
/*
 * Replace the language counters stored in `db`: clear every metadata key
 * that starts with the count prefix, then write one key per entry of
 * `lang_counts`.
 */
static void write_language_counts(Xapian::WritableDatabase& db,
                                  const std::map<const std::string, unsigned>& lang_counts)
{
    for (Xapian::TermIterator it = db.metadata_keys_begin(XAPIAN_LANG_COUNT_KEYPREFIX);
            it != db.metadata_keys_end(XAPIAN_LANG_COUNT_KEYPREFIX); ++it) {
        db.set_metadata(*it, "");
    }

    // bind to the map's own element type; naming std::pair<std::string, unsigned>
    // here would copy each entry into a temporary
    for (const auto& entry : lang_counts) {
        db.set_metadata(lang_count_key(entry.first), std::to_string(entry.second));
    }
}
/*
 * Add the language counters stored in `db` to `lang_counts`.
 *
 * Counters are only read when no recorded db version is below 8 (same
 * test as in calculate_language_counts).  A counter whose value is not a
 * number is logged and skipped instead of throwing out of this function.
 */
static void read_language_counts(const Xapian::Database& db,
                                 std::map<const std::string, unsigned>& lang_counts)
{
    std::set<int> db_versions = read_db_versions(db);

    if (db_versions.lower_bound(8) == db_versions.begin()) {
        const std::string prefix(XAPIAN_LANG_COUNT_KEYPREFIX ".");
        for (Xapian::TermIterator it = db.metadata_keys_begin(prefix);
                it != db.metadata_keys_end(prefix); ++it) {
            // the language code is whatever follows the prefix
            std::string iso_lang = (*it).substr(prefix.length());
            try {
                unsigned count = std::stol(db.get_metadata(*it));
                lang_counts[iso_lang] += count;
            }
            catch (const std::logic_error&) {
                // std::invalid_argument or std::out_of_range from std::stol
                syslog(LOG_WARNING,
                       "Xapian: ignoring malformed language count for <%s>",
                       iso_lang.c_str());
            }
        }
    }
}
/*
 * Split the comma-separated language list `val` and insert each element
 * into `doclangs`.  Values that are empty or do not start with a letter
 * are ignored entirely.
 */
static void parse_doclangs(const std::string& val, std::set<std::string>& doclangs)
{
    // cast: isalpha() is undefined for negative plain char values
    if (val.empty() || !isalpha(static_cast<unsigned char>(val[0]))) return;

    size_t base = 0, pos;
    while ((pos = val.find(',', base)) != std::string::npos) {
        doclangs.insert(val.substr(base, pos - base));
        base = pos + 1;
    }
    // last (or only) element
    doclangs.insert(val.substr(base));
}
/* Join the elements of `doclangs`, in set order, with commas. */
static std::string format_doclangs(const std::set<std::string>& doclangs)
{
    std::ostringstream joined;
    const char *sep = "";

    for (const std::string& lang : doclangs) {
        joined << sep << lang;
        sep = ",";
    }

    return joined.str();
}
/*
 * Normalise a language code: returns `str` lower-cased if it consists of
 * exactly two or three letters, otherwise an empty string.  A NULL `str`
 * also yields an empty string.
 */
static std::string parse_langcode(const char *str)
{
    if (!str) return std::string();

    std::string lstr(str);

    // accept syntax for two and three letter ISO 639 codes
    if (lstr.length() < 2 || lstr.length() > 3) {
        return std::string();
    }

    for (char &c : lstr) {
        // <ctype.h> functions need an unsigned char value
        c = static_cast<char>(::tolower(static_cast<unsigned char>(c)));
        if (!isalpha(static_cast<unsigned char>(c))) {
            return std::string();
        }
    }

    return lstr;
}
// Process-scoped, thread-unsafe cache of stoppers by ISO 639 code.
static std::map<const std::string, std::unique_ptr<Xapian::Stopper>> stoppers;
/*
 * Return the stopper for the language `iso`, or NULL if none is available.
 *
 * Stoppers are built on first use from "<stopword path>/<language>.txt",
 * where <language> is the lower-cased English display name ICU reports for
 * `iso`, and are then kept in the `stoppers` cache.  Failed lookups are not
 * cached.
 */
static const Xapian::Stopper* get_stopper(const std::string& iso)
{
    // Lookup cached entry.
    auto cached = stoppers.find(iso);
    if (cached != stoppers.end()) {
        return cached->second.get();
    }

    // Lookup language name by ISO code.
    icu::Locale loc(iso.c_str());
    if (loc.isBogus()) return NULL;

    std::string lang_name;
    icu::UnicodeString ulang_name;
    loc.getDisplayLanguage(icu::Locale("en"), ulang_name);
    ulang_name.toLower();
    ulang_name.toUTF8String(lang_name);

    // Read stopper file and add to cache.
    const char *swpath = config_getstring(IMAPOPT_SEARCH_STOPWORD_PATH);
    if (!swpath) return NULL;

    // Open stopword file
    // XXX doesn't play nice with WIN32 paths
    std::string fname(std::string(swpath) + "/" + lang_name + ".txt");
    errno = 0;
    std::ifstream inFile (fname);
    if (inFile.fail()) {
        syslog(LOG_DEBUG, "Xapian: could not open stopword file %s: %s",
               fname.c_str(), errno ? strerror(errno) : "unknown error");
        return NULL;
    }

    // Create and store the Xapian stopper: one whitespace-separated word
    // per stream element
    std::unique_ptr<Xapian::Stopper>& slot = stoppers[iso];
    slot.reset(new Xapian::SimpleStopper(
                   std::istream_iterator<std::string>(inFile),
                   std::istream_iterator<std::string>()));
    return slot.get();
}
/*
 * Stemmer that converts a word with charset_convert() (UTF-8 charset,
 * global charset_flags), lower-cases it and applies the English stemmer.
 * If the conversion yields nothing, the word is stemmed as-is.  Results
 * are memoised per instance; stems longer than XAPIAN_MAX_TERM_LENGTH are
 * replaced by an empty string and not memoised.
 */
class CyrusSearchStemmer : public Xapian::StemImplementation
{
    charset_t utf8 {charset_lookupname("utf-8")};
    std::map<const std::string, std::string> cache;
    Xapian::Stem stem {"en"};

public:
    virtual ~CyrusSearchStemmer() { charset_free(&utf8); }

    virtual std::string operator() (const std::string &word) override {
        // Is this word already in the cache?
        auto known = cache.find(word);
        if (known != cache.end()) {
            return known->second;
        }

        // Convert the word to search form
        std::unique_ptr<char, decltype(std::free)*>
            converted {charset_convert(word.c_str(), utf8, charset_flags), std::free};

        std::string stemmed;
        if (converted) {
            stemmed = stem(Xapian::Unicode::tolower(converted.get()));
        }
        else {
            stemmed = stem(word);
        }

        if (stemmed.size() > XAPIAN_MAX_TERM_LENGTH) {
            return std::string{};
        }

        // Store the normalized word in the cache
        cache[word] = stemmed;
        return stemmed;
    }

    virtual std::string get_description () const override {
        return "Cyrus";
    }
};
/*
 * French stemmer that drops an elided article/pronoun before stemming.
 *
 * A word qualifies when it starts with one of c, d, j, l, m, n, s, t, or
 * with "qu" and is longer than three bytes.  If that lead is followed by
 * an apostrophe, only the text after the apostrophe is stemmed.  Accepted
 * apostrophes: U+0027, U+2019 and U+FF07 (the latter two as three-byte
 * UTF-8 sequences).  Anything else is stemmed unchanged.
 */
class FrenchContractionStemmer : public Xapian::StemImplementation
{
    Xapian::Stem stem {"fr"};

public:
    virtual std::string operator() (const std::string &word) override {
        // index of the last byte of the elided lead ("l" -> 0, "qu" -> 1)
        size_t pos = 0;
        bool candidate = false;

        switch (word[0]) {
        case 'c':
        case 'd':
        case 'j':
        case 'l':
        case 'm':
        case 'n':
        case 's':
        case 't':
            candidate = true;
            break;
        case 'q':
            if (word.length() > 3 && word[1] == 'u') {
                pos = 1;
                candidate = true;
            }
            break;
        default:
            break;
        }

        if (candidate) {
            // APOSTROPHE (U+0027)
            if (word.length() > pos + 2 && word[pos+1] == 0x27) {
                return stem(word.substr(pos + 2));
            }
            // RIGHT SINGLE QUOTATION MARK (U+2019)
            const bool right_quote = !word.compare(pos + 1, 3, "\xe2\x80\x99");
            // FULLWIDTH APOSTROPHE (U+FF07)
            if (right_quote || !word.compare(pos + 1, 3, "\xef\xbc\x87")) {
                return stem(word.substr(pos + 4));
            }
        }

        // not a contraction
        return stem(word);
    }

    virtual std::string get_description () const override {
        return "fr-contraction";
    }
};
/*
 * Return the stemmer for `iso_lang`: the contraction-aware stemmer for
 * "fr", Xapian's own stemmer for every other language.
 */
static Xapian::Stem get_stemmer(const std::string& iso_lang)
{
    if (iso_lang == "fr") {
        return Xapian::Stem{new FrenchContractionStemmer};
    }
    return Xapian::Stem{iso_lang};
}
#ifdef HAVE_CLD2
/*
 * Detect the language of `part` with CLD2.
 *
 * Returns a two or three letter lower-case code, or an empty string when
 * detection is not reliable, the language is unknown, or the code CLD2
 * reports cannot be mapped to such a code.
 */
static std::string detect_language(const struct buf *part)
{
    std::string iso_lang;
    bool reliable = false;

    CLD2::Language lang = CLD2::DetectLanguage(part->s, part->len, 1, &reliable);
    if (reliable && lang != CLD2::UNKNOWN_LANGUAGE) {
        std::string code(CLD2::LanguageCode(lang));
        std::transform(code.begin(), code.end(), code.begin(),
                       [](unsigned char c) { return static_cast<char>(::tolower(c)); });

        // Map CLD2 special codes to ISO 639.  The code has been lower-cased
        // above, so the special codes must be compared in lower case too.
        if (!code.compare("zh-hant")) {
            code = "zh";
        }
        else if (!code.compare("sr-me")) {
            code = "sr"; // not a political statement!
        }
        else if (!code.compare("xxx")) {
            code = "";
        }

        iso_lang = parse_langcode(code.c_str());
    }

    return iso_lang;
}
#endif /* HAVE_CLD2 */
/* ====================================================================== */
/*
 * Pick the better of two index levels.  Levels are compared with the
 * SEARCH_INDEXLEVEL_PARTIAL bit masked out; the higher one wins.  On a tie,
 * `levelb` is returned if `levela` carries the partial bit, else `levela`.
 */
static uint8_t better_indexlevel(uint8_t levela, uint8_t levelb)
{
    const uint8_t base_a = levela & ~SEARCH_INDEXLEVEL_PARTIAL;
    const uint8_t base_b = levelb & ~SEARCH_INDEXLEVEL_PARTIAL;

    if (base_a != base_b) {
        return base_a > base_b ? levela : levelb;
    }

    if (levela & SEARCH_INDEXLEVEL_PARTIAL) {
        return levelb;
    }
    return levela;
}
/*
 * Decode an index level stored as hexadecimal text (the form written by
 * format_indexlevel: two hex digits).  Returns 0 for anything that does
 * not decode to exactly one byte.
 */
static uint8_t parse_indexlevel(const std::string& s)
{
    uint8_t level = 0;

    // `level` is a single byte, so never hand more than two hex digits to
    // hex_to_bin.  NOTE(review): assumes hex_to_bin writes one byte per
    // digit pair and returns the byte count -- confirm.
    if (s.length() != 2) {
        return 0;
    }

    if (hex_to_bin(s.c_str(), s.length(), &level) != 1) {
        return 0;
    }
    return level;
}
/* Encode an index level as text: the first two characters bin_to_lchex() produces for the byte. */
static std::string format_indexlevel(uint8_t level)
{
    char digits[4];
    bin_to_lchex(&level, 1, digits);

    std::string out;
    out.assign(digits, 2);
    return out;
}
/* ====================================================================== */
class CyrusMetadataCompactor : public Xapian::Compactor
{
public:
CyrusMetadataCompactor() { }
std::string resolve_duplicate_metadata(const std::string &key,
size_t num_tags,
const std::string tags[])
{
if (key.rfind("cyrusid.", 0) == 0) {
uint8_t indexlevel = parse_indexlevel(tags[0]);
size_t bestpos = 0;
for (size_t i = 1; i < num_tags; i++) {
uint8_t level = parse_indexlevel(tags[i]);
if (better_indexlevel(indexlevel, level) == level) {
indexlevel = level;
bestpos = i;
}
}
return tags[bestpos];
}
return tags[0];
}
};
int xapian_compact_dbs(const char *dest, const char **sources)
{
int r = 0;
Xapian::Database db;
const char *thispath = "(unknown path)";
try {
std::set<int> db_versions;
std::map<const std::string, unsigned> lang_counts;
std::vector<Xapian::Database> subdbs;
while (*sources) {
thispath = *sources;
Xapian::Database subdb(*sources++);
db.add_database(subdb);
subdbs.push_back(subdb);
// Aggregate db versions.
bool need_metadata = false;
for (Xapian::docid docid = 1; docid <= subdb.get_lastdocid(); ++docid) {
try {
Xapian::Document doc = subdb.get_document(docid);
const std::string& val = doc.get_value(SLOT_INDEXVERSION);
if (!val.empty()) {
int version = std::atoi(val.c_str());
if (version) db_versions.insert(version);
}
else need_metadata = true;
}
catch (Xapian::DocNotFoundError e) {
// ignore
}
}
if (need_metadata) {
/* At least one document didn't have its index version set.
* Read the legacy version from the metadata. */
std::set<int> md_versions = read_db_versions(subdb);
db_versions.insert(md_versions.begin(), md_versions.lower_bound(14));
}
// Aggregate language counts.
r = calculate_language_counts(subdb, lang_counts);
if (r) {
xsyslog(LOG_ERR, "IOERROR: corrupt language metadata",
"path=<%s>", thispath);
return r;
}
}
thispath = "(unknown path)";
// Compact database.
static CyrusMetadataCompactor comp;
// FULLER because we never write to compression targets again.
db.compact(dest, Xapian::Compactor::FULLER | Xapian::DBCOMPACT_MULTIPASS, 0, comp);
Xapian::WritableDatabase newdb(dest);
write_db_versions(newdb, db_versions);
// Clean metadata.
remove_legacy_metadata(newdb);
// Reset language counts.
write_language_counts(newdb, lang_counts);
}
catch (const Xapian::Error &err) {
xsyslog(LOG_ERR, "IOERROR: caught exception",
"exception=<%s> path=<%s>",
err.get_description().c_str(), thispath);
r = IMAP_IOERROR;
}
return r;
}
/* ====================================================================== */
/* Return the Xapian term prefix used to index search part `partnum`
 * in a database of version `db_version`.
 *
 * Returns NULL if the search part has no prefix in that version, or
 * if `partnum` is not a valid search part number. The returned string
 * is statically allocated and must not be freed. */
static const char *get_term_prefix(int db_version, int partnum)
{
    /*
     * We use term prefixes to store terms per search part.
     * In addition, each Xapian document contains a "XE"
     * prefix to indicate its document type, listed in
     * the XAPIAN_WRAP_DOCTYPE definitions. The "XE" prefix
     * MUST not be used for any search part.
     *
     */
    static const char * const term_prefixes[SEARCH_NUM_PARTS] = {
        NULL,                /* ANY */
        "XF",                /* FROM */
        "XT",                /* TO */
        "XC",                /* CC */
        "XB",                /* BCC */
        "XS",                /* SUBJECT */
        "XL",                /* LISTID */
        "XY",                /* TYPE */
        "XH",                /* HEADERS */
        "",                  /* BODY */
        "XO",                /* LOCATION */
        "XA",                /* ATTACHMENTNAME */
        "XAB",               /* ATTACHMENTBODY */
        "XDT",               /* DELIVEREDTO */
        "XI",                /* LANGUAGE */
        "XP"                 /* PRIORITY */
    };

    static const char * const term_prefixes_v0[SEARCH_NUM_PARTS] = {
        NULL,                /* ANY */
        "F",                 /* FROM */
        "T",                 /* TO */
        "C",                 /* CC */
        "B",                 /* BCC */
        "S",                 /* SUBJECT */
        "L",                 /* LISTID */
        "Y",                 /* TYPE */
        "H",                 /* HEADERS */
        "D",                 /* BODY */
        "O",                 /* LOCATION */
        "A",                 /* ATTACHMENTNAME */
        "AB",                /* ATTACHMENTBODY */
        "E",                 /* DELIVEREDTO */
        NULL,                /* LANGUAGE */
        NULL                 /* PRIORITY */
    };

    // Never index the tables with an out-of-range part number.
    if (partnum < 0 || partnum >= SEARCH_NUM_PARTS) {
        return NULL;
    }

    return db_version > 0 ? term_prefixes[partnum] : term_prefixes_v0[partnum];
}
/* Return the stemming strategy that a database of version `db_version`
 * uses to index search part `partnum`.
 *
 * Version 0 and version 1 databases have their own tables, any other
 * version uses the current table. An invalid `partnum` yields STEM_NONE
 * rather than reading outside the tables. */
static Xapian::TermGenerator::stem_strategy get_stem_strategy(int db_version, int partnum)
{
    static const Xapian::TermGenerator::stem_strategy stem_strategy[SEARCH_NUM_PARTS] = {
        // Version 2 and higher
        Xapian::TermGenerator::STEM_NONE,  /* ANY */
        Xapian::TermGenerator::STEM_NONE,  /* FROM */
        Xapian::TermGenerator::STEM_NONE,  /* TO */
        Xapian::TermGenerator::STEM_NONE,  /* CC */
        Xapian::TermGenerator::STEM_NONE,  /* BCC */
        Xapian::TermGenerator::STEM_SOME,  /* SUBJECT */
        Xapian::TermGenerator::STEM_NONE,  /* LISTID */
        Xapian::TermGenerator::STEM_NONE,  /* TYPE */
        Xapian::TermGenerator::STEM_NONE,  /* HEADERS */
        Xapian::TermGenerator::STEM_SOME,  /* BODY */
        Xapian::TermGenerator::STEM_SOME,  /* LOCATION */
        Xapian::TermGenerator::STEM_NONE,  /* ATTACHMENTNAME */
        Xapian::TermGenerator::STEM_SOME,  /* ATTACHMENTBODY */
        Xapian::TermGenerator::STEM_NONE,  /* DELIVEREDTO */
        Xapian::TermGenerator::STEM_NONE,  /* LANGUAGE */
        Xapian::TermGenerator::STEM_NONE   /* PRIORITY */
    };

    static const Xapian::TermGenerator::stem_strategy stem_strategy_v1[SEARCH_NUM_PARTS] = {
        // Version 1: Stem bodies using STEM_SOME with stopwords
        Xapian::TermGenerator::STEM_NONE,  /* ANY */
        Xapian::TermGenerator::STEM_ALL,   /* FROM */
        Xapian::TermGenerator::STEM_ALL,   /* TO */
        Xapian::TermGenerator::STEM_ALL,   /* CC */
        Xapian::TermGenerator::STEM_ALL,   /* BCC */
        Xapian::TermGenerator::STEM_ALL,   /* SUBJECT */
        Xapian::TermGenerator::STEM_ALL,   /* LISTID */
        Xapian::TermGenerator::STEM_ALL,   /* TYPE */
        Xapian::TermGenerator::STEM_ALL,   /* HEADERS */
        Xapian::TermGenerator::STEM_SOME,  /* BODY */
        Xapian::TermGenerator::STEM_SOME,  /* LOCATION */
        Xapian::TermGenerator::STEM_NONE,  /* ATTACHMENTNAME */
        Xapian::TermGenerator::STEM_SOME,  /* ATTACHMENTBODY */
        Xapian::TermGenerator::STEM_ALL,   /* DELIVEREDTO */
        Xapian::TermGenerator::STEM_NONE,  /* LANGUAGE */
        Xapian::TermGenerator::STEM_NONE   /* PRIORITY */
    };

    static const Xapian::TermGenerator::stem_strategy stem_strategy_v0[SEARCH_NUM_PARTS] = {
        // Version 0: Initial version
        Xapian::TermGenerator::STEM_NONE,  /* ANY */
        Xapian::TermGenerator::STEM_ALL,   /* FROM */
        Xapian::TermGenerator::STEM_ALL,   /* TO */
        Xapian::TermGenerator::STEM_ALL,   /* CC */
        Xapian::TermGenerator::STEM_ALL,   /* BCC */
        Xapian::TermGenerator::STEM_ALL,   /* SUBJECT */
        Xapian::TermGenerator::STEM_ALL,   /* LISTID */
        Xapian::TermGenerator::STEM_ALL,   /* TYPE */
        Xapian::TermGenerator::STEM_ALL,   /* HEADERS */
        Xapian::TermGenerator::STEM_ALL,   /* BODY */
        Xapian::TermGenerator::STEM_ALL,   /* LOCATION */
        Xapian::TermGenerator::STEM_ALL,   /* ATTACHMENTNAME */
        Xapian::TermGenerator::STEM_ALL,   /* ATTACHMENTBODY */
        Xapian::TermGenerator::STEM_ALL,   /* DELIVEREDTO */
        Xapian::TermGenerator::STEM_NONE,  /* LANGUAGE */
        Xapian::TermGenerator::STEM_NONE   /* PRIORITY */
    };

    // Never index the tables with an out-of-range part number.
    if (partnum < 0 || partnum >= SEARCH_NUM_PARTS) {
        return Xapian::TermGenerator::STEM_NONE;
    }

    switch (db_version) {
        case 0:
            return stem_strategy_v0[partnum];
        case 1:
            return stem_strategy_v1[partnum];
        default:
            return stem_strategy[partnum];
    }
}
/* For all db paths in sources that are not using the latest database
 * version or not readable, report their paths in toreindex.
 *
 * A path is reported if any of its index versions is lower than
 * XAPIAN_DB_MIN_SUPPORTED_VERSION or, if always_upgrade is set,
 * differs from XAPIAN_DB_CURRENT_VERSION. Each path is reported at
 * most once, no matter how many of its versions are outdated. */
void xapian_check_if_needs_reindex(const strarray_t *sources, strarray_t *toreindex, int always_upgrade)
{
    // Check the version of all dbs in sources
    for (int i = 0; i < sources->count; i++) {
        const char *thispath = strarray_nth(sources, i);
        try {
            for (const int& version : read_db_versions(Xapian::Database{thispath})) {
                if (version < XAPIAN_DB_MIN_SUPPORTED_VERSION ||
                        (always_upgrade && (version != XAPIAN_DB_CURRENT_VERSION))) {
                    strarray_add(toreindex, thispath);
                    // One outdated version is enough, don't add the
                    // same path again for the remaining versions.
                    break;
                }
            }
        }
        catch (const Xapian::Error &err) {
            // Unreadable databases need to be reindexed, too.
            strarray_add(toreindex, thispath);
        }
    }
}
/* ====================================================================== */
/* Add `term` as boolean term to `doc`, unless the term is empty or
 * its size is not lower than `n` bytes. */
static inline void add_boolean_nterm(Xapian::Document& doc,
                                     const std::string& term,
                                     size_t n = XAPIAN_MAX_TERM_LENGTH)
{
    const size_t len = term.size();
    if (len == 0 || len >= n) {
        // Silently skip empty and overlong terms.
        return;
    }
    doc.add_boolean_term(term);
}
/* State of a writable Xapian index. It is allocated zero-filled by
 * xapian_dbw_open() and released by xapian_dbw_close(). */
struct xapian_dbw
{
// Database context.
// The database that documents are written to.
Xapian::WritableDatabase *database;
// Xapian::Database pointers of the read-only databases that are
// opened in XAPIAN_DBW_XAPINDEXED mode.
ptrarray_t otherdbs;
// Generates the terms of the document that currently is indexed.
Xapian::TermGenerator *term_generator;
// Stemmer and stopper used for text that gets stemmed without a
// language-specific stemmer. The stopper is not deleted on close.
Xapian::Stem *default_stemmer;
const Xapian::Stopper* default_stopper;
// Document context.
// The document between xapian_dbw_begin_doc and xapian_dbw_end_doc.
Xapian::Document *document;
// Document type and Cyrus id of the current document.
char doctype;
char *cyrusid;
// Languages detected in the body parts of the current document.
std::set<std::string> *doclangs;
// Subject texts kept to index them by language at end of document.
std::vector<std::string> *subjects;
};
/* Set up the indexing helpers of `dbw`: the default stemmer and
 * stopper, the term generator and the per-document containers.
 * Always returns 0. */
static int xapian_dbw_init(xapian_dbw_t *dbw)
{
    dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
    dbw->default_stopper = get_stopper("en");

    Xapian::TermGenerator *termgen = new Xapian::TermGenerator;
    dbw->term_generator = termgen;
    termgen->set_max_word_length(XAPIAN_MAX_TERM_LENGTH);

    /* Always enable CJK word tokenization */
#ifdef USE_XAPIAN_CJK_WORDS
    const int cjk_flag = Xapian::TermGenerator::FLAG_CJK_WORDS;
#else
    const int cjk_flag = Xapian::TermGenerator::FLAG_CJK_NGRAM;
#endif
    termgen->set_flags(cjk_flag, ~cjk_flag);

    dbw->doclangs = new std::set<std::string>;
    dbw->subjects = new std::vector<std::string>;

    return 0;
}
/* Open the Xapian database at the first entry of the NULL-terminated
 * array `paths` for writing, creating it if it can not be opened.
 * If `mode` is XAPIAN_DBW_XAPINDEXED, the remaining paths are opened
 * as read-only databases; failures to open any of these are logged
 * and otherwise ignored.
 *
 * If `nosync` is set, the database is opened with the DB_DANGEROUS
 * and DB_NO_SYNC flags.
 *
 * On success returns 0 and stores the new handle in `dbwp`. Returns
 * IMAP_MAILBOX_LOCKED if the database is locked, or IMAP_IOERROR on
 * any other Xapian error; `dbwp` is left untouched in these cases. */
int xapian_dbw_open(const char **paths, xapian_dbw_t **dbwp, int mode, int nosync)
{
xapian_dbw_t *dbw = (xapian_dbw_t *)xzmalloc(sizeof(xapian_dbw_t));
int r = 0;
const char *thispath = *paths++;
std::set<int> db_versions;
try {
int flags = Xapian::DB_BACKEND_GLASS|Xapian::DB_RETRY_LOCK;
if (nosync) flags |= Xapian::DB_DANGEROUS|Xapian::DB_NO_SYNC;
try {
dbw->database = new Xapian::WritableDatabase{thispath, flags|Xapian::DB_OPEN};
db_versions = read_db_versions(*dbw->database);
} catch (Xapian::DatabaseOpeningError &e) {
/* It's OK not to atomically create or open, since we can assume
* the xapianactive file items to be locked. */
// NOTE(review): if read_db_versions() were to raise this error, the
// database opened above would be overwritten here - confirm it can't.
dbw->database = new Xapian::WritableDatabase{thispath, flags|Xapian::DB_CREATE};
}
// A newly created database has no versions yet, so this also
// stamps it with the current version.
if (db_versions.find(XAPIAN_DB_CURRENT_VERSION) == db_versions.end()) {
// Always index using latest database version.
db_versions.insert(XAPIAN_DB_CURRENT_VERSION);
write_db_versions(*dbw->database, db_versions);
}
r = xapian_dbw_init(dbw);
}
catch (const Xapian::DatabaseLockError &err) {
/* somebody else is already indexing this user. They may be doing a different
* mailbox, so we need to re-insert this mailbox into the queue! */
r = IMAP_MAILBOX_LOCKED;
}
catch (const Xapian::Error &err) {
xsyslog(LOG_ERR, "IOERROR: caught exception",
"exception=<%s> path=<%s>",
err.get_description().c_str(), thispath);
r = IMAP_IOERROR;
}
if (r) {
// Releases whatever got allocated before the error.
xapian_dbw_close(dbw);
return r;
}
/* open the read-only databases */
if (mode == XAPIAN_DBW_XAPINDEXED) {
while (*paths) {
try {
thispath = *paths;
ptrarray_append(&dbw->otherdbs, new Xapian::Database{*paths++});
}
catch (const Xapian::Error &err) {
xsyslog(LOG_ERR, "IOERROR: reading database",
"exception=<%s> path=<%s>",
err.get_description().c_str(), thispath);
}
}
}
*dbwp = dbw;
return 0;
}
/* Release all resources held by `dbw`, including `dbw` itself.
 * Accepts NULL. Xapian errors raised during teardown are logged. */
void xapian_dbw_close(xapian_dbw_t *dbw)
{
    if (dbw == NULL) {
        return;
    }

    try {
        delete dbw->database;
        delete dbw->term_generator;
        delete dbw->document;
        delete dbw->default_stemmer;
        delete dbw->doclangs;
        delete dbw->subjects;

        // The read-only databases are owned by the pointer array.
        const int n_others = dbw->otherdbs.count;
        for (int idx = 0; idx < n_others; idx++) {
            void *other = ptrarray_nth(&dbw->otherdbs, idx);
            delete static_cast<Xapian::Database *>(other);
        }
        ptrarray_fini(&dbw->otherdbs);

        free(dbw->cyrusid);
        free(dbw);
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
    }
}
/* Begin a transaction on the writable database of `dbw`.
 * Returns 0 on success or IMAP_IOERROR if Xapian raised an error. */
int xapian_dbw_begin_txn(xapian_dbw_t *dbw)
{
    try {
        dbw->database->begin_transaction();
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        return IMAP_IOERROR;
    }

    return 0;
}
/* Commit the current transaction on the writable database of `dbw`.
 * Returns 0 on success or IMAP_IOERROR if Xapian raised an error. */
int xapian_dbw_commit_txn(xapian_dbw_t *dbw)
{
    try {
        dbw->database->commit_transaction();
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        return IMAP_IOERROR;
    }

    return 0;
}
/* Cancel the current transaction on the writable database of `dbw`.
 * Returns 0 on success or IMAP_IOERROR if Xapian raised an error. */
int xapian_dbw_cancel_txn(xapian_dbw_t *dbw)
{
    try {
        dbw->database->cancel_transaction();
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        return IMAP_IOERROR;
    }

    return 0;
}
/* Start indexing a new document of type `doctype` for the message or
 * part identified by `guid`. Any document that was begun but not ended
 * is discarded.
 *
 * The new document gets its Cyrus id stored in SLOT_CYRUSID and a
 * boolean "XE" term for its document type.
 *
 * Returns 0 on success or IMAP_IOERROR if Xapian raised an error. */
int xapian_dbw_begin_doc(xapian_dbw_t *dbw, const struct message_guid *guid, char doctype)
{
    int r = 0;

    try {
        delete dbw->document;
        // Don't keep a dangling pointer around if allocation fails.
        dbw->document = NULL;
        dbw->document = new Xapian::Document;
        dbw->doctype = doctype;

        /* Set document id and type */
        struct buf buf = BUF_INITIALIZER;
        make_cyrusid(&buf, guid, doctype);
        // Hand the id over to dbw before calling into Xapian, so that
        // neither the buffer nor the id of a previously begun document
        // leaks if Xapian throws.
        free(dbw->cyrusid);
        dbw->cyrusid = buf_release(&buf);
        dbw->document->add_value(SLOT_CYRUSID, dbw->cyrusid);
        add_boolean_nterm(*dbw->document, std::string("XE") + doctype);

        /* Initialize term generator */
        dbw->term_generator->set_document(*dbw->document);
        dbw->term_generator->set_termpos(1);
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        r = IMAP_IOERROR;
    }

    return r;
}
/* Index the language code in `part` as boolean term of the current
 * document. Values that parse_langcode() rejects are logged and
 * skipped. Always returns 0. */
static int add_language_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
{
    const std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
    const std::string langcode = parse_langcode(buf_cstring(part));

    if (!langcode.empty()) {
        add_boolean_nterm(*dbw->document, prefix + langcode);
    }
    else {
        syslog(LOG_INFO, "Xapian: not a valid ISO 639 code: %s",
                buf_cstring(part));
    }

    return 0;
}
/* Parse `str` as priority. Returns the decimal representation of the
 * priority, or the empty string if `str` is not entirely a non-zero
 * unsigned 32-bit number. */
static std::string parse_priority(const char *str)
{
    const char *rest = NULL;
    uint32_t prio = 0;

    // Only look at `rest` if parsing succeeded.
    const bool valid = parseuint32(str, &rest, &prio) != -1 &&
                       *rest == '\0' &&
                       prio != 0;

    return valid ? std::to_string(prio) : std::string();
}
/* Index the priority in `part` as boolean term of the current
 * document. Empty parts are ignored, values that parse_priority()
 * rejects are logged and skipped. Always returns 0. */
static int add_priority_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
{
    const std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));

    if (buf_len(part) == 0) {
        return 0;
    }

    const std::string priority = parse_priority(buf_cstring(part));
    if (!priority.empty()) {
        add_boolean_nterm(*dbw->document, prefix + priority);
    }
    else {
        syslog(LOG_DEBUG, "Xapian: not a valid priority: %s",
                buf_cstring(part));
    }

    return 0;
}
/* Extract and normalize the list-id from the List-Id header value `str`.
 *
 * Three formats are recognized:
 *  - RFC 2919 style: the text after the last '<', up to an optional
 *    closing '>'
 *  - Groups style: 'list list-id[; contact list-contact]'
 *  - raw value: everything up to the first whitespace
 *
 * The result has all whitespace removed and is lower-cased. Returns
 * the empty string if no list-id could be found. */
static std::string parse_listid(const char *str)
{
    // The <ctype.h> functions have undefined behavior for negative
    // values, which plain char has for bytes of non-ASCII text.
    const auto is_space = [](char c) {
        return isspace(static_cast<unsigned char>(c)) != 0;
    };
    const auto to_lower = [](char c) {
        return static_cast<char>(tolower(static_cast<unsigned char>(c)));
    };

    std::string val;

    /* Extract list-id */
    const char *start = strrchr(str, '<');
    if (start) {
        /* RFC2919 list-id header (with optional closing bracket) */
        const char *end = strchr(++start, '>');
        if (end)
            val = std::string(start, end - start);
        else
            val = std::string(start);
    }
    else {
        /* Groups-style header: 'list list-id[; contact list-contact]'
         * As seen at Google Group, Yahoo, et al. */
        for (start = str; is_space(*start); start++) {}
        if (!strncasecmp("list", start, 4) && is_space(start[4])) {
            for (start = start + 4; is_space(*start); start++) {}
            if (*start) {
                const char *end = strchr(start, ';');
                // An empty list-id before the ';' yields no value.
                if (!end || end - start) {
                    val = end ? std::string(start, end - start) : std::string{start};
                }
            }
        }
        /* just raw value, that's OK too, like sentry creates. Parse up to first whitespace */
        else {
            const char *end;
            for (end = start; *end && !is_space(*end); end++) {}
            val = std::string(start, end - start);
        }
    }

    /* Normalize list-id */
    val.erase(std::remove_if(val.begin(), val.end(), is_space), val.end());
    std::transform(val.begin(), val.end(), val.begin(), to_lower);
    return val;
}
/* Index the list-id found in `part` as boolean term of the current
 * document. Values without a list-id are logged and skipped.
 * Always returns 0. */
static int add_listid_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
{
    std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));

    // parse_listid already strips whitespace and lower-cases its
    // result, there is no need to normalize the value again.
    std::string val = parse_listid(buf_cstring(part));
    if (val.empty()) {
        syslog(LOG_WARNING, "Xapian: not a valid list-id: %s",
                buf_cstring(part));
        return 0;
    }

    add_boolean_nterm(*dbw->document, prefix + val);
    return 0;
}
/* Index the email addresses in `part` for search part `partnum`.
 *
 * The lower-cased header value is parsed into addresses. For each valid
 * address the following terms are added, where <P> is the term prefix
 * of the search part:
 *  - the words of the display name, prefixed both <P>N and <P>
 *  - the mailbox (localpart) without whitespace as boolean term <P>L,
 *    and its words prefixed <P>
 *  - the domain with its labels reversed as boolean term <P>D, and
 *    its words prefixed <P>
 *  - the entire address as boolean term <P>A
 *
 * No stemmer or stopper is applied to any of these. Always returns 0;
 * Xapian errors propagate as exceptions to the caller. */
static int add_email_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
{
    std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
    std::string lpart = Xapian::Unicode::tolower(buf_cstring(part));

    struct address_itr itr;
    address_itr_init(&itr, lpart.c_str(), 0);

    // Finalize the iterator on every exit path: indexing may throw
    // a Xapian error, which would otherwise skip address_itr_fini.
    struct itr_guard {
        struct address_itr *itr;
        ~itr_guard() { address_itr_fini(itr); }
    } guard = { &itr };

    // Index text without stemming and stopwords.
    const auto index_unstemmed = [dbw](const Xapian::Utf8Iterator& text,
                                       const std::string& term_prefix) {
        dbw->term_generator->set_stemmer(Xapian::Stem());
        dbw->term_generator->set_stopper(NULL);
        dbw->term_generator->index_text(text, 1, term_prefix);
    };

    // isspace has undefined behavior for negative char values.
    const auto is_space = [](char c) {
        return isspace(static_cast<unsigned char>(c)) != 0;
    };

    const struct address *addr;
    while ((addr = address_itr_next(&itr))) {
        if (addr->invalid) {
            continue;
        }
        if (addr->name) {
            index_unstemmed(Xapian::Utf8Iterator(addr->name), prefix + 'N');
            index_unstemmed(Xapian::Utf8Iterator(addr->name), prefix);
        }
        if (addr->mailbox) {
            // index mailbox as single value
            std::string val(addr->mailbox);
            // ignore whitespace (as seen in quoted mailboxes)
            val.erase(std::remove_if(val.begin(), val.end(), is_space), val.end());
            add_boolean_nterm(*dbw->document, prefix + 'L' + val);
            // index individual terms
            index_unstemmed(Xapian::Utf8Iterator(val), prefix);
        }
        if (addr->domain && strcmp(addr->domain, "unspecified-domain")) {
            // index reversed domain
            std::string val;
            strarray_t *sa = strarray_split(addr->domain, ".", 0);
            val.reserve(buf_len(part));
            for (int i = strarray_size(sa) - 1; i >= 0; i--) {
                val.append(strarray_nth(sa, i));
                if (i > 0) {
                    val.append(1, '.');
                }
            }
            strarray_free(sa);
            add_boolean_nterm(*dbw->document, prefix + "D" + val);
            // index individual terms
            index_unstemmed(Xapian::Utf8Iterator(addr->domain,
                                                 strlen(addr->domain)),
                            prefix);
        }
        // index entire addr-spec
        char *a = address_get_all(addr, /*canon_domain*/1);
        if (a) {
            // Copy before adding the term so that `a` can't leak.
            const std::string addrspec(a);
            free(a);
            add_boolean_nterm(*dbw->document, prefix + 'A' + addrspec);
        }
    }

    return 0;
}
static std::pair<std::string, std::string> parse_content_type(const char *str)
{
std::pair<std::string, std::string> ret;
struct buf buf = BUF_INITIALIZER;
const char *sep = strchr(str, '/');
if (sep) {
/* type */
buf_setmap(&buf, str, sep - str);
buf_lcase(&buf);
buf_trim(&buf);
ret.first = std::string(buf_cstring(&buf));
/* subtype */
buf_setcstr(&buf, sep + 1);
buf_lcase(&buf);
buf_trim(&buf);
ret.second = std::string(buf_cstring(&buf));
}
else {
/* type or subtype */
buf_setcstr(&buf, str);
buf_lcase(&buf);
buf_trim(&buf);
ret.first = std::string(buf_cstring(&buf));
}
buf_free(&buf);
return ret;
}
/* Index the content type in `part` as boolean terms of the current
 * document: the type prefixed with 'T', the subtype prefixed with 'S'
 * and, if both are set, 'type/subtype'. Always returns 0. */
static int add_type_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
{
    const std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
    const std::pair<std::string, std::string> ct = parse_content_type(buf_cstring(part));

    const std::string& type = ct.first;
    const std::string& subtype = ct.second;
    const bool has_type = !type.empty();
    const bool has_subtype = !subtype.empty();

    if (has_type) {
        add_boolean_nterm(*dbw->document, prefix + "T" + type);
    }
    if (has_subtype) {
        add_boolean_nterm(*dbw->document, prefix + "S" + subtype);
    }
    if (has_type && has_subtype) {
        add_boolean_nterm(*dbw->document, prefix + type + '/' + subtype);
    }

    return 0;
}
/* Index the text in `part` for search part `partnum`, using the stem
 * strategy of the current database version for that part.
 *
 * Parts that are not stemmed are indexed without stemmer and stopper.
 * Stemmed parts are indexed with the default stemmer and stopper. If
 * the search_index_language option is enabled, body parts in a detected
 * language other than English are additionally indexed with that
 * language's stemmer under a language-specific prefix, and subjects
 * are kept for language indexing at the end of the document.
 *
 * Returns 0 on success, or IMAP_IOERROR if language indexing is enabled
 * but the code was built without CLD2. Xapian errors propagate as
 * exceptions to the caller. */
int add_text_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
{
const char *prefix = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum);
int r = 0;
// Index text.
Xapian::TermGenerator::stem_strategy stem_strategy =
get_stem_strategy(XAPIAN_DB_CURRENT_VERSION, partnum);
dbw->term_generator->set_stemming_strategy(stem_strategy);
if (stem_strategy != Xapian::TermGenerator::STEM_NONE) {
if (config_getswitch(IMAPOPT_SEARCH_INDEX_LANGUAGE)){
// Index by language.
#ifndef HAVE_CLD2
// XXX is this really an "IOERROR"?
xsyslog(LOG_ERR, "IOERROR: language indexing requires CLD2 library",
NULL);
return IMAP_IOERROR;
#else
if (search_part_is_body(partnum)) {
const std::string iso_lang = detect_language(part);
if (!iso_lang.empty()) {
// "en" is handled by the default stemmer further below.
if (iso_lang.compare("en")) {
// Stem and index by non-default language.
try {
dbw->term_generator->set_stemmer(get_stemmer(iso_lang));
dbw->term_generator->set_stopper(get_stopper(iso_lang));
dbw->term_generator->index_text(Xapian::Utf8Iterator(part->s, part->len),
1, lang_prefix(iso_lang, prefix));
} catch (const Xapian::InvalidArgumentError &err) {
syslog(LOG_DEBUG, "Xapian: no stemmer for language %s",
iso_lang.c_str());
}
}
if (dbw->doctype == 'P') {
// Keep track of stemmer language.
std::string key = lang_doc_key(dbw->cyrusid);
dbw->database->set_metadata(key, iso_lang);
dbw->document->add_value(SLOT_DOCLANGS, iso_lang);
// Update language counts for body parts.
key = lang_count_key(iso_lang);
const std::string val = dbw->database->get_metadata(key);
// NOTE(review): std::stoi throws std::invalid_argument or
// std::out_of_range for a corrupt count; callers in view only
// catch Xapian::Error - confirm the count is always numeric.
dbw->database->set_metadata(key, val.empty() ?
"1" : std::to_string(std::stoi(val) + 1));
}
// Store detected languages in document.
dbw->doclangs->insert(iso_lang.c_str());
add_boolean_nterm(*dbw->document, std::string("XI") + iso_lang);
}
}
else if (partnum == SEARCH_PART_SUBJECT) {
// Keep subject text to index by language later.
dbw->subjects->push_back(buf_cstring(part));
}
#endif /* HAVE_CLD2 */
}
// Index with default stemmer.
dbw->term_generator->set_stemmer(*dbw->default_stemmer);
dbw->term_generator->set_stopper(dbw->default_stopper);
} else {
// Index with no stemming.
dbw->term_generator->set_stemmer(Xapian::Stem());
dbw->term_generator->set_stopper(NULL);
}
// Index the text with whichever stemmer and stopper got set above.
dbw->term_generator->index_text(Xapian::Utf8Iterator(part->s, part->len), 1, prefix);
return r;
}
/* Index `part` as search part `partnum` of the current document.
 *
 * Returns 0 on success, IMAP_INTERNAL if the search part has no term
 * prefix in the current database version, IMAP_IOERROR if Xapian
 * raised an error, or the error of the part-specific indexer. */
int xapian_dbw_doc_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
{
    if (get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum) == NULL) {
        syslog(LOG_ERR, "xapian_wrapper: no prefix for partnum %d", partnum);
        return IMAP_INTERNAL;
    }

    // Pick the indexer for this search part, plain text is the default.
    int (*add_part)(xapian_dbw_t *, const struct buf *, int) = add_text_part;
    switch (partnum) {
        case SEARCH_PART_PRIORITY:
            add_part = add_priority_part;
            break;
        case SEARCH_PART_LISTID:
            add_part = add_listid_part;
            break;
        case SEARCH_PART_LANGUAGE:
            add_part = add_language_part;
            break;
        case SEARCH_PART_FROM:
        case SEARCH_PART_TO:
        case SEARCH_PART_CC:
        case SEARCH_PART_BCC:
        case SEARCH_PART_DELIVEREDTO:
            add_part = add_email_part;
            break;
        case SEARCH_PART_TYPE:
            add_part = add_type_part;
            break;
        default:
            break;
    }

    int rc = 0;
    try {
        rc = add_part(dbw, part, partnum);
        // Finalize index.
        dbw->term_generator->increase_termpos();
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        rc = IMAP_IOERROR;
    }

    return rc;
}
/* Finish the current document and add it to the database, recording
 * `indexlevel` (which must not be 0) both in the document and in the
 * database metadata. The per-document state of `dbw` is reset.
 *
 * If the search_index_language option is enabled, the subjects kept
 * for this document are indexed for each non-English language that
 * was detected in it.
 *
 * Returns 0 on success or IMAP_IOERROR if Xapian raised an error. */
int xapian_dbw_end_doc(xapian_dbw_t *dbw, uint8_t indexlevel)
{
    assert(indexlevel > 0);

    int rc = 0;
    try {
        if (config_getswitch(IMAPOPT_SEARCH_INDEX_LANGUAGE)){
            // Keep track of languages used in this message.
            if (dbw->doctype == 'G') {
                std::string langs = format_doclangs(*dbw->doclangs);
                dbw->database->set_metadata(lang_doc_key(dbw->cyrusid), langs);
                dbw->document->add_value(SLOT_DOCLANGS, langs);
            }

            // Index subjects by detected document languages.
            for (const std::string& doclang : *dbw->doclangs) {
                std::string iso_lang = doclang;
                if (iso_lang.compare("en") == 0) {
                    continue;
                }
                try {
                    const char *subject_prefix =
                        get_term_prefix(XAPIAN_DB_CURRENT_VERSION, SEARCH_PART_SUBJECT);
                    std::string prefix = lang_prefix(iso_lang, subject_prefix);
                    dbw->term_generator->set_stemmer(get_stemmer(iso_lang));
                    dbw->term_generator->set_stopper(get_stopper(iso_lang));
                    for (const std::string& subject : *dbw->subjects) {
                        dbw->term_generator->index_text(Xapian::Utf8Iterator(subject), 1, prefix);
                    }
                } catch (const Xapian::InvalidArgumentError &err) {
                    // ignore unknown stemmer
                }
            }
        }

        // Store the document.
        Xapian::Document *doc = dbw->document;
        doc->add_value(SLOT_INDEXLEVEL, format_indexlevel(indexlevel));
        doc->add_value(SLOT_INDEXVERSION,
                       std::to_string(XAPIAN_DB_CURRENT_VERSION));
        dbw->database->add_document(*doc);
        dbw->database->set_metadata("cyrusid." + std::string(dbw->cyrusid),
                                    format_indexlevel(indexlevel));

        // Reset the document context.
        delete dbw->document;
        dbw->document = 0;
        dbw->doctype = 0;
        free(dbw->cyrusid);
        dbw->cyrusid = NULL;
        dbw->doclangs->clear();
        dbw->subjects->clear();
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        rc = IMAP_IOERROR;
    }

    return rc;
}
/* Return the total length of the writable database of `dbw`, as
 * reported by Xapian. Returns 0 if Xapian raised an error. */
unsigned long xapian_dbw_total_length(xapian_dbw_t *dbw)
{
    try {
        return dbw->database->get_total_length();
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
    }

    return 0;
}
/* Return the index level recorded for the document of type `doctype`
 * identified by `guid`.
 *
 * The writable database is checked first, then the read-only
 * databases of `dbw`. The lookup stops at the first database that has
 * the document at SEARCH_INDEXLEVEL_BEST or, for part documents, at
 * any non-zero level. Otherwise the better_indexlevel() of all levels
 * found is returned.
 *
 * If Xapian raises an error, it is logged and the level determined up
 * to that point is returned (0 if none was). */
uint8_t xapian_dbw_is_indexed(xapian_dbw_t *dbw, const struct message_guid *guid, char doctype)
{
    struct buf buf = BUF_INITIALIZER;
    make_cyrusid(&buf, guid, doctype);
    std::string key = "cyrusid." + std::string(buf_cstring(&buf));
    buf_free(&buf);

    uint8_t indexlevel = 0;

    // Reading metadata may throw. Like the other entry points of this
    // wrapper, don't let C++ exceptions escape to the caller.
    try {
        /* indexed in the current DB? */
        indexlevel = parse_indexlevel(dbw->database->get_metadata(key));
        if (indexlevel == SEARCH_INDEXLEVEL_BEST ||
                (indexlevel && doctype == XAPIAN_WRAP_DOCTYPE_PART)) {
            return indexlevel;
        }

        /* indexed in other DBs? */
        for (int i = 0; i < dbw->otherdbs.count; i++) {
            Xapian::Database *database = (Xapian::Database *)ptrarray_nth(&dbw->otherdbs, i);
            uint8_t level = parse_indexlevel(database->get_metadata(key));
            if (level == SEARCH_INDEXLEVEL_BEST ||
                    (level && doctype == XAPIAN_WRAP_DOCTYPE_PART)) {
                return level;
            }
            else indexlevel = better_indexlevel(indexlevel, level);
        }
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
    }

    return indexlevel;
}
/* ====================================================================== */
/* State of a Xapian index opened for searching. It is allocated
 * zero-filled by xapian_db_open() or xapian_db_opendbw() and released
 * by xapian_db_close(). */
struct xapian_db
{
// Space-separated paths of the opened databases.
std::string *paths;
Xapian::Database *database; // all but version 4 databases
- Xapian::Database *legacydbv4; // version 4 databases
std::vector<Xapian::Database> *subdbs; // all database subdbs
// Stemmer and stopper for English, set up in xapian_db_init().
Xapian::Stem *default_stemmer;
const Xapian::Stopper* default_stopper;
// Languages other than English that queries are stemmed for.
std::set<std::string> *stem_languages;
Xapian::QueryParser *parser;
// Union of the index versions of all opened databases.
std::set<int> *db_versions;
// Set if opened by xapian_db_opendbw(); `database` then is owned
// by this handle and not deleted on close.
xapian_dbw_t *dbw;
};
/* Set up the query parser, the default stemmer and stopper, and the
 * set of stemmer languages of `db`. Expects db->subdbs to be set.
 *
 * A language other than English becomes a stemmer language if its
 * language count makes up at least 5% of half the number of documents
 * in all subdbs.
 *
 * Returns 0 on success or IMAP_IOERROR if Xapian raised an error. */
static int xapian_db_init(xapian_db_t *db)
{
int r = 0;
try {
db->parser = new Xapian::QueryParser;
db->parser->set_default_op(Xapian::Query::OP_AND);
- db->parser->set_database(db->database ? *db->database : *db->legacydbv4);
db->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
db->default_stopper = get_stopper("en");
// Determine stemmer languages (in addition to English).
db->stem_languages = new std::set<std::string>;
std::map<const std::string, unsigned> lang_counts;
size_t total_doccount = 0;
for (const Xapian::Database& subdb : *db->subdbs) {
read_language_counts(subdb, lang_counts);
total_doccount += subdb.get_doccount();
}
total_doccount /= 2; // Crude estimate.
// NOTE(review): total_doccount is 0 for fewer than two documents,
// which makes the ratio below a floating point division by zero -
// confirm that this is acceptable.
for (std::pair<const std::string, unsigned>& it : lang_counts) {
if (it.first.compare("en") && ((double) it.second / total_doccount) >= 0.05) {
db->stem_languages->insert(it.first);
}
}
}
catch (const Xapian::Error &err) {
xsyslog(LOG_ERR, "IOERROR: caught exception",
"exception=<%s>",
err.get_description().c_str());
r = IMAP_IOERROR;
}
return r;
}
/* Open the Xapian databases at the NULL-terminated array `paths` for
 * searching, combined into a single database.
 *
 * On success returns 0 and stores the new handle in `dbp`. Returns
 * IMAP_INTERNAL if a database has no index version, IMAP_NOTFOUND if
 * no database was opened, IMAP_IOERROR if Xapian raised an error, or
 * the error of xapian_db_init(). On error, everything allocated so
 * far is released and `dbp` is left untouched. */
int xapian_db_open(const char **paths, xapian_db_t **dbp)
{
xapian_db_t *db = (xapian_db_t *)xzmalloc(sizeof(xapian_db_t));
const char *thispath = "(unknown)";
int r = 0;
try {
db->paths = new std::string;
while (paths && *paths) {
thispath = *paths++;
Xapian::Database subdb {thispath};
std::set<int> db_versions = read_db_versions(subdb);
if (db_versions.empty()) {
syslog(LOG_ERR, "xapian_wrapper: invalid db version in %s", thispath);
r = IMAP_INTERNAL;
goto done;
}
// Collect the versions of all databases in the handle.
if (!db->db_versions)
db->db_versions = new std::set<int>;
db->db_versions->insert(db_versions.begin(), db_versions.end());
- // Databases with version 4 split indexing by doctype.
+ // Check for experimental v4 indexes, they were bogus.
if (db_versions.find(4) != db_versions.end()) {
- if (!db->legacydbv4) db->legacydbv4 = new Xapian::Database;
- db->legacydbv4->add_database(subdb);
- }
- // Databases with any but version 4 are regular dbs.
- if (db_versions.size() > 1 || db_versions.find(4) == db_versions.end()) {
- if (!db->database) db->database = new Xapian::Database;
- db->database->add_database(subdb);
+ xsyslog(LOG_WARNING, "deprecated v4 index detected, "
+ "search may return wrong results",
+ "db=<%s>", thispath);
}
+ // Add database.
+ if (!db->database) db->database = new Xapian::Database;
+ db->database->add_database(subdb);
// Xapian database has no API to access subdbs.
if (!db->subdbs) db->subdbs = new std::vector<Xapian::Database>;
db->subdbs->push_back(subdb);
db->paths->append(thispath).push_back(' ');
}
thispath = "(unknown)";
- if (!db->database && !db->legacydbv4) {
+ if (!db->database) {
r = IMAP_NOTFOUND;
goto done;
}
r = xapian_db_init(db);
if (r) goto done;
}
catch (const Xapian::Error &err) {
xsyslog(LOG_ERR, "IOERROR: caught exception",
"exception=<%s> path=<%s>",
err.get_description().c_str(), thispath);
r = IMAP_IOERROR;
}
done:
// Only hand out the handle if it got set up completely.
if (r)
xapian_db_close(db);
else
*dbp = db;
return r;
}
int xapian_db_opendbw(struct xapian_dbw *dbw, xapian_db_t **dbp)
{
xapian_db_t *db = (xapian_db_t *)xzmalloc(sizeof(xapian_db_t));
db->dbw = dbw;
db->database = dbw->database;
db->db_versions = new std::set<int>();
std::set<int> dbw_versions = read_db_versions(*dbw->database);
db->db_versions->insert(dbw_versions.begin(), dbw_versions.end());
db->subdbs = new std::vector<Xapian::Database>;
db->subdbs->push_back(*dbw->database);
int r = xapian_db_init(db);
if (r) {
xapian_db_close(db);
db = NULL;
}
*dbp = db;
return r;
}
/* Release all resources held by `db`, including `db` itself.
 * Accepts NULL. Xapian errors raised during teardown are logged. */
void xapian_db_close(xapian_db_t *db)
{
if (!db) return;
try {
// A database borrowed from a writable handle is not ours to delete.
if (!db->dbw) delete db->database;
- delete db->legacydbv4;
delete db->parser;
delete db->paths;
delete db->db_versions;
delete db->default_stemmer;
delete db->stem_languages;
delete db->subdbs;
free(db);
}
catch (const Xapian::Error &err) {
/* XXX - memory leak? */
xsyslog(LOG_ERR, "IOERROR: caught exception",
"exception=<%s>",
err.get_description().c_str());
}
}
/* Report the language statistics of all subdbs of `db`.
 *
 * For each language with a language count, a newly allocated
 * struct search_langstat is appended to `lstats`; the caller owns
 * these entries. `nolang` is set to the number of body parts that
 * are not covered by any language count.
 *
 * Returns 0 on success. If Xapian raises an error, it is logged,
 * IMAP_IOERROR is returned and neither `lstats` nor `nolang` are
 * changed. */
int xapian_db_langstats(xapian_db_t *db, ptrarray_t* lstats, size_t *nolang)
{
    std::map<const std::string, unsigned> lang_counts;
    size_t total_part = 0;
    size_t total_lang = 0;

    // Reading metadata may throw. Like the other entry points of this
    // wrapper, don't let C++ exceptions escape to the caller.
    try {
        for (const Xapian::Database& subdb : *db->subdbs) {
            // count body parts
            const Xapian::TermIterator end = subdb.metadata_keys_end("cyrusid.*P*");
            for (Xapian::TermIterator it = subdb.metadata_keys_begin("cyrusid.*P*");
                    it != end; ++it) {
                total_part++;
            }
            // accumulate language counts
            read_language_counts(subdb, lang_counts);
        }
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        return IMAP_IOERROR;
    }

    for (const std::pair<const std::string, unsigned>& counts : lang_counts) {
        struct search_langstat *stat = (struct search_langstat*)
                                       xzmalloc(sizeof(struct search_langstat));
        stat->iso_lang = xstrdup(counts.first.c_str());
        stat->count = counts.second;
        ptrarray_append(lstats, stat);
        total_lang += counts.second;
    }

    // The language counts might exceed the number of parts, don't wrap.
    *nolang = total_part > total_lang ? total_part - total_lang : 0;
    return 0;
}
/* Add `iso_lang` to the languages that queries on `db` are stemmed
 * for. English ("en") is never added to that set. */
void xapian_query_add_stemmer(xapian_db_t *db, const char *iso_lang)
{
    if (strcmp(iso_lang, "en") == 0) {
        return;
    }
    db->stem_languages->insert(iso_lang);
}
-int xapian_db_has_otherthan_v4_index(const xapian_db_t *db)
-{
- return db->database != NULL;
-}
-
-int xapian_db_has_legacy_v4_index(const xapian_db_t *db)
-{
- return db->legacydbv4 != NULL;
-}
-
/* Build a query that matches the lower-cased text `match` in terms
 * with prefix `prefix`. Phrases and wildcards are enabled.
 *
 * If `tg_stem_strategy` is STEM_NONE, the text is queried unstemmed
 * only. Otherwise the result is the OR of the unstemmed query, the
 * query stemmed with the default stemmer, and one query per stemmer
 * language of `db` using that language's prefix. Stemmed variants are
 * omitted if their stopper flags `match` as stopword.
 *
 * Returns a newly allocated query that the caller must delete.
 * Note that this reconfigures the shared parser of `db`. */
static Xapian::Query* query_new_textmatch(const xapian_db_t *db,
const char *match,
const char *prefix,
Xapian::TermGenerator::stem_strategy tg_stem_strategy)
{
unsigned flags = Xapian::QueryParser::FLAG_PHRASE |
Xapian::QueryParser::FLAG_WILDCARD;
std::string lmatch = Xapian::Unicode::tolower(match);
if (tg_stem_strategy != Xapian::TermGenerator::STEM_NONE) {
// Query without any stemmer.
db->parser->set_stemmer(Xapian::Stem());
db->parser->set_stopper(NULL);
db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
Xapian::Query q = db->parser->parse_query(lmatch, flags, prefix);
// Query with default stemmer. But don't stem stopwords.
if (!db->default_stopper || !(*db->default_stopper)(lmatch)) {
db->parser->set_stemmer(*db->default_stemmer);
db->parser->set_stopper(db->default_stopper);
db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
q |= db->parser->parse_query(lmatch, flags, prefix);
}
// Stem query for each language detected in the index.
for (const std::string& iso_lang : *db->stem_languages) {
try {
const Xapian::Stopper *stopper = get_stopper(iso_lang);
db->parser->set_stemmer(get_stemmer(iso_lang));
db->parser->set_stopper(stopper);
db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
if (!stopper || !(*stopper)(lmatch)) {
q |= db->parser->parse_query(lmatch, flags, lang_prefix(iso_lang, prefix));
}
} catch (const Xapian::InvalidArgumentError &err) {
// Languages without stemmer are skipped.
syslog(LOG_INFO, "Xapian: no stemmer for language %s", iso_lang.c_str());
}
}
return new Xapian::Query(q);
}
else {
// Unstemmed search parts are queried verbatim.
db->parser->set_stemmer(Xapian::Stem());
db->parser->set_stopper(NULL);
db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
return new Xapian::Query {db->parser->parse_query(lmatch, flags, prefix)};
}
}
/* Build a query for the language code `str` as term with prefix
 * `prefix`. If parse_langcode() rejects `str`, the query matches
 * nothing. Returns a newly allocated query that the caller must
 * delete. */
static Xapian::Query *query_new_language(const xapian_db_t *db __attribute__((unused)),
                                         const char *prefix,
                                         const char *str)
{
    const std::string langcode = parse_langcode(str);

    if (!langcode.empty()) {
        return new Xapian::Query(std::string(prefix) + langcode);
    }

    syslog(LOG_DEBUG, "Xapian: invalid language in query: %s", str);
    return new Xapian::Query(Xapian::Query::MatchNothing);
}
/* Build a query for the priority `str` as term with prefix `prefix`.
 * If parse_priority() rejects `str`, the query matches nothing.
 * Returns a newly allocated query that the caller must delete. */
static Xapian::Query *query_new_priority(const xapian_db_t *db __attribute__((unused)),
                                         const char *prefix,
                                         const char *str)
{
    const std::string priority = parse_priority(str);

    if (!priority.empty()) {
        return new Xapian::Query(std::string(prefix) + priority);
    }

    syslog(LOG_DEBUG, "Xapian: invalid priority in query: %s", str);
    return new Xapian::Query(Xapian::Query::MatchNothing);
}
/* Build a query for the list-id in `str` as term with prefix `prefix`.
 *
 * The list-id is normalized by parse_listid(); if that yields no
 * list-id, the term query matches nothing. If `db` contains an index
 * of a version lower than 11, the result is the OR of the term query
 * and the unstemmed parser query of `str` in the legacy format.
 *
 * Returns a newly allocated query that the caller must delete. */
static Xapian::Query *query_new_listid(const xapian_db_t *db,
                                       const char *prefix,
                                       const char *str)
{
    // Build the query by value and allocate only the final result:
    // combining heap-allocated queries leaked the term query.
    Xapian::Query q = Xapian::Query::MatchNothing;

    std::string val = parse_listid(str);
    if (!val.empty()) {
        q = Xapian::Query(std::string(prefix) + val);
    }
    else {
        syslog(LOG_DEBUG, "Xapian: invalid listid in query: %s", str);
    }

    // Is any version in the set lower than 11?
    if (db->db_versions->lower_bound(11) != db->db_versions->begin()) {
        // query in legacy format
        db->parser->set_stemmer(Xapian::Stem());
        db->parser->set_stopper(NULL);
        db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
        q = Xapian::Query(Xapian::Query::OP_OR, q,
                          db->parser->parse_query(str, 0, prefix));
    }

    return new Xapian::Query(q);
}
/* Build the query for an address-valued search part.
 *
 * _prefix is the term prefix of the search part and str is the user's
 * search text.  The text is lowercased before anything else happens.
 *
 *  - Without an '@' the text is parsed as free text under the prefix.
 *  - With an '@', the display name (prefix + 'N'), the mailbox
 *    (prefix + 'L') and the reversed domain (prefix + 'D', plus the
 *    older prefix + '@' form) are ANDed together.  A trailing '*' on the
 *    mailbox or a leading '*' on the domain yields a wildcard query.
 *  - If the database contains index versions below 12, the legacy
 *    free-text form is ORed in as well.
 *  - For a complete, wildcard-free localpart@domain the result is
 *    replaced by: (index version >= 16 AND 'A' term) OR
 *    (index version <= 15 AND the query built above).
 *
 * Returns a heap-allocated query owned by the caller.  Xapian exceptions
 * are not caught here.
 */
static Xapian::Query *query_new_email(const xapian_db_t *db,
                                      const char *_prefix,
                                      const char *str)
{
    std::string prefix(_prefix);

    unsigned qpflags = Xapian::QueryParser::FLAG_PHRASE |
                       Xapian::QueryParser::FLAG_WILDCARD;

    // Address terms are matched verbatim: no stemming, no stopwords.
    db->parser->set_stemmer(Xapian::Stem());
    db->parser->set_stopper(NULL);
    db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);

    // From here on str points into mystr, which must outlive all uses.
    std::string mystr = Xapian::Unicode::tolower(str);
    str = mystr.c_str();

    const char *atsign = strchr(str, '@');

    if (!atsign) {
        // query free text
        return new Xapian::Query{db->parser->parse_query(str, qpflags, prefix)};
    }

    Xapian::Query q = Xapian::Query::MatchNothing;

    // query name and mailbox (unless just searching for '@domain')
    if (atsign > str) {
        struct address *addr = NULL;

        parseaddr_list(str, &addr);
        if (addr && addr->name) {
            Xapian::Query qq = db->parser->parse_query(addr->name, qpflags, prefix + 'N');
            if (q.get_type() != q.LEAF_MATCH_NOTHING) {
                q &= qq;
            }
            else q = qq;
        }
        if (addr && addr->mailbox) {
            // strip the domain from the mailbox
            std::string mail(addr->mailbox);
            // Go through unsigned char: the lowercased UTF-8 text may hold
            // bytes >= 0x80, which must not reach isspace() as negative values.
            mail.erase(std::remove_if(mail.begin(), mail.end(),
                                      [](unsigned char c) { return isspace(c) != 0; }),
                       mail.end());
            // Only look at the last character if there is one; the mailbox
            // can be empty once whitespace has been removed.
            bool wildcard = !mail.empty() && mail[mail.size()-1] == '*';
            if (wildcard) {
                mail.resize(mail.size()-1);
            }
            if (!mail.empty()) {
                std::string term(prefix + 'L' + mail);
                Xapian::Query qq = wildcard ?
                    Xapian::Query(Xapian::Query::OP_WILDCARD, term) :
                    Xapian::Query(term);
                if (q.get_type() != q.LEAF_MATCH_NOTHING) {
                    q &= qq;
                }
                else q = qq;
            }
        }
        // ignore @domain - it's being handled below
        if (addr) parseaddr_free(addr);
    }

    // query domain
    if (atsign[1]) {
        std::string domain;
        const char *dstart = atsign + 1;
        bool wildcard = *dstart == '*';
        if (wildcard) dstart++;

        // Scan the run of characters accepted as part of a domain.
        const char *dend;
        for (dend = dstart; *dend; dend++) {
            char c = *dend;
            if (Uisalnum(c) || c == '-' || c == '[' || c == ']' || c == ':') {
                continue;
            }
            // Only inspect dend[-2] when it lies inside the domain; at the
            // first two positions it would read the localpart, or memory
            // in front of the string for a bare "@.domain" query.
            else if (c == '.' && (dend - dstart < 2 || dend[-2] != '.')) {
                continue;
            }
            else {
                break;
            }
        }

        if (dend > dstart) {
            // Domain labels are joined in reverse order ("example.com"
            // becomes "com.example").
            strarray_t *sa = strarray_nsplit(dstart, dend - dstart, ".", 0);
            for (int i = strarray_size(sa) - 1; i >= 0; i--) {
                domain.append(strarray_nth(sa, i));
                if (i > 0) {
                    domain.append(1, '.');
                }
            }
            strarray_free(sa);
            // A leading dot in the input becomes a trailing dot on the
            // reversed domain.
            if (*dstart == '.') {
                domain.append(1, '.');
            }
        }

        if (!domain.empty()) {
            std::string term(prefix + 'D' + domain);
            Xapian::Query qq = wildcard ? Xapian::Query(Xapian::Query::OP_WILDCARD, term) :
                                          Xapian::Query(term);
            {
                // FIXME - temporarily also search for '@' prefix
                std::string term2(prefix + '@' + domain);
                Xapian::Query qq2 = wildcard ? Xapian::Query(Xapian::Query::OP_WILDCARD, term2) :
                                               Xapian::Query(term2);
                qq |= qq2;
            }
            if (q.get_type() != q.LEAF_MATCH_NOTHING) {
                q &= qq;
            }
            else q = qq;
        }
    }

    if (q.get_type() == q.LEAF_MATCH_ALL) {
        q = Xapian::Query::MatchNothing;
    }

    // query in legacy format as well!
    if (db->db_versions->lower_bound(12) != db->db_versions->begin()) {
        q |= db->parser->parse_query(str, qpflags, prefix);
    }

    // query localpart@domain (ONLY if no wildcards)
    if ((atsign > str) && atsign[1] && !strchr(str, '*')) {
        struct address *addr = NULL;

        parseaddr_list(str, &addr);
        if (addr) {
            char *a = address_get_all(addr, /*canon_domain*/1);
            if (a) {
                // query 'A' term for index >= 16
                std::string term(prefix + 'A' + std::string(a));
                Xapian::Query qq =
                    Xapian::Query(Xapian::Query::OP_AND,
                                  Xapian::Query(Xapian::Query::OP_VALUE_GE,
                                                Xapian::valueno(SLOT_INDEXVERSION),
                                                std::string("16")),
                                  Xapian::Query(term));

                if (q.get_type() != q.LEAF_MATCH_NOTHING) {
                    // otherwise, query 'L' + 'D' terms (as per above)
                    Xapian::Query qq2 =
                        Xapian::Query(Xapian::Query::OP_AND,
                                      Xapian::Query(Xapian::Query::OP_VALUE_LE,
                                                    Xapian::valueno(SLOT_INDEXVERSION),
                                                    std::string("15")),
                                      q);
                    qq |= qq2;
                }
                q = qq;
            }
            parseaddr_free(addr);
            free(a);
        }
    }

    return new Xapian::Query(q);
}
/* Append to buf every byte of the NUL-terminated string ss for which
 * Uisalnum() is true; all other bytes are skipped. */
static void append_alnum(struct buf *buf, const char *ss)
{
    const unsigned char *cursor = (const unsigned char *)ss;

    while (*cursor) {
        if (Uisalnum(*cursor)) {
            buf_putc(buf, *cursor);
        }
        cursor++;
    }
}
/* Build the query for a content-type search.
 *
 * str is split by parse_content_type() into a (type, subtype) pair:
 *  - type only: match it as either a type ('T' term) or a subtype
 *    ('S' term), unless it is "*";
 *  - "*" on either side: match only the non-wildcard side, if any;
 *  - both given: match the verbatim "type/subtype" term.
 * If the database contains index versions below 13, the alphanumeric-only
 * legacy form is parsed under the same prefix and ORed in.
 *
 * Returns a heap-allocated query (MatchNothing if no case applied).
 */
static Xapian::Query *query_new_type(const xapian_db_t *db __attribute__((unused)),
                                     const char *_prefix,
                                     const char *str)
{
    std::pair<std::string, std::string> ct = parse_content_type(str);
    const std::string& type = ct.first;
    const std::string& subtype = ct.second;
    std::string prefix(_prefix);

    Xapian::Query result = Xapian::Query::MatchNothing;
    const bool query_legacy =
        db->db_versions->lower_bound(13) != db->db_versions->begin();
    struct buf legacy = BUF_INITIALIZER;
    const unsigned qpflags = Xapian::QueryParser::FLAG_PHRASE |
                             Xapian::QueryParser::FLAG_WILDCARD;

    const bool have_type = !type.empty();
    const bool have_subtype = !subtype.empty();

    if (have_type && !have_subtype) {
        /* Match either type or subtype */
        if (type != "*") {
            Xapian::Query as_type(prefix + 'T' + type);
            Xapian::Query as_subtype(prefix + 'S' + type);
            result = Xapian::Query(Xapian::Query::OP_OR, as_type, as_subtype);
            if (query_legacy) {
                append_alnum(&legacy, type.c_str());
                result |= db->parser->parse_query(buf_cstring(&legacy), qpflags, prefix);
            }
        }
    }
    else if (type == "*" || subtype == "*") {
        /* Wildcard query */
        if (have_type && type != "*") {
            /* Match type */
            result = Xapian::Query(prefix + 'T' + type);
            if (query_legacy) {
                append_alnum(&legacy, type.c_str());
                result |= db->parser->parse_query(buf_cstring(&legacy), qpflags, prefix);
            }
        }
        if (have_subtype && subtype != "*") {
            /* Match subtype */
            result = Xapian::Query(prefix + 'S' + subtype);
            if (query_legacy) {
                append_alnum(&legacy, subtype.c_str());
                result |= db->parser->parse_query(buf_cstring(&legacy), qpflags, prefix);
            }
        }
    }
    else if (have_type && have_subtype) {
        /* Verbatim search */
        result = Xapian::Query(prefix + type + '/' + subtype);
        if (query_legacy) {
            /* legacy form is "<type>_<subtype>", alphanumerics only */
            append_alnum(&legacy, type.c_str());
            buf_putc(&legacy, '_');
            append_alnum(&legacy, subtype.c_str());
            result |= db->parser->parse_query(buf_cstring(&legacy), qpflags, prefix);
        }
    }

    buf_free(&legacy);
    return new Xapian::Query(result);
}
/* Build the query matching str in search part partnum.
 *
 * Language, priority, list-id, address and content-type parts are
 * delegated to their dedicated builders.  For all other parts, text
 * containing any byte above 221 is parsed with the CJK flag only;
 * anything else goes through query_new_textmatch() with the part's stem
 * strategy.
 *
 * Returns a heap-allocated query, NULL if the text match produced a
 * match-nothing query, or 0 after logging if Xapian threw.
 */
Xapian::Query*
xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *str)
{
    const char *prefix = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum);

#ifdef USE_XAPIAN_CJK_WORDS
    const unsigned cjk_flags = Xapian::QueryParser::FLAG_CJK_WORDS;
#else
    const unsigned cjk_flags = Xapian::QueryParser::FLAG_CJK_NGRAM;
#endif

    try {
        // Handle special value search parts.
        if (partnum == SEARCH_PART_LANGUAGE)
            return query_new_language(db, prefix, str);

        if (partnum == SEARCH_PART_PRIORITY)
            return query_new_priority(db, prefix, str);

        if (partnum == SEARCH_PART_LISTID)
            return query_new_listid(db, prefix, str);

        if (partnum == SEARCH_PART_FROM ||
            partnum == SEARCH_PART_TO ||
            partnum == SEARCH_PART_CC ||
            partnum == SEARCH_PART_BCC ||
            partnum == SEARCH_PART_DELIVEREDTO)
            return query_new_email(db, prefix, str);

        if (partnum == SEARCH_PART_TYPE)
            return query_new_type(db, prefix, str);

        // Don't stem queries for Thaana codepage (0780) or higher.
        const unsigned char *scan = (const unsigned char *)str;
        while (*scan) {
            if (*scan > 221) { // has highbit
                return new Xapian::Query {
                    db->parser->parse_query(str, cjk_flags, prefix)
                };
            }
            scan++;
        }

        // Stemable codepage.
        Xapian::TermGenerator::stem_strategy stem_strategy =
            get_stem_strategy(XAPIAN_DB_CURRENT_VERSION, partnum);

        Xapian::Query *textq = query_new_textmatch(db, str, prefix, stem_strategy);
        if (textq->get_type() != Xapian::Query::LEAF_MATCH_NOTHING) {
            return textq;
        }

        // A match-nothing query is reported to the caller as "no query".
        delete textq;
        return NULL;

    } catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        return 0;
    }
}
/* Create a query matching str in search part partnum.
 *
 * Returns NULL if db has no sub-databases, if the part has no term
 * prefix, or if no query could be built.  A warning is logged when the
 * lowest index version in db is below XAPIAN_DB_MIN_SUPPORTED_VERSION.
 *
 * If the lowest index version is below 15, str is additionally converted
 * with charset_convert() and the query for the converted text is ORed in.
 *
 * The returned query is heap-allocated and owned by the caller.
 */
xapian_query_t *
xapian_query_new_match(const xapian_db_t *db, int partnum, const char *str)
{
    if (db->subdbs->empty()) {
        // no database to query
        return NULL;
    }

    const char *prefix = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum);
    if (!prefix) {
        return NULL;
    }

    int min_version = *db->db_versions->begin();
    if (min_version < XAPIAN_DB_MIN_SUPPORTED_VERSION) {
        xsyslog(LOG_WARNING,
                "deprecated database version, reindex required",
                "version=<%d> min_supported_version=<%d> paths=<%s>",
                min_version, XAPIAN_DB_MIN_SUPPORTED_VERSION,
                db->paths->c_str());
    }

    Xapian::Query *q = xapian_query_new_match_internal(db, partnum, str);
    if (min_version < 15) {
        /* Older versions indexed header fields in Cyrus search form */
        charset_t utf8 = charset_lookupname("utf-8");
        char *mystr = charset_convert(str, utf8, charset_flags);
        if (mystr) {
            Xapian::Query *qq = xapian_query_new_match_internal(db, partnum, mystr);
            if (qq && q) {
                *q |= *qq;
                /* q now holds its own reference to the combined query;
                 * the temporary wrapper must be released or it leaks. */
                delete qq;
            }
            else if (!q) q = qq;
        }
        free(mystr);
        charset_free(&utf8);
    }

    return (xapian_query_t*) q;
}
/* Combine n child queries into one, with OR if is_or is non-zero and
 * with AND otherwise.
 *
 * Each child is deleted as soon as it has been merged, so the caller
 * must not use or free the children after a successful call.  With n == 0
 * the result is an empty query.
 *
 * Returns the heap-allocated compound query, or 0 after logging if Xapian
 * threw.  On that error path the partially built compound is released;
 * children that had not yet been merged are left untouched.
 */
xapian_query_t *xapian_query_new_compound(const xapian_db_t *db __attribute__((unused)),
                                          int is_or, xapian_query_t **children, int n)
{
    /* Declared outside the try block so the catch handler can free it. */
    Xapian::Query *compound = NULL;

    try {
        // 'compound' owns a refcount on each child.  We need to
        // drop the one we got when we allocated the children
        compound = new Xapian::Query;
        if (is_or) {
            for (int i = 0 ; i < n ; i++) {
                *compound |= *(Xapian::Query*)children[i];
                delete (Xapian::Query*)children[i];
            }
        }
        else {
            for (int i = 0 ; i < n ; i++) {
                // AND-ing into an empty query would not behave like an
                // identity, so the first child is copied instead.
                if (compound->empty())
                    *compound = *(Xapian::Query*)children[i];
                else
                    *compound &= *(Xapian::Query*)children[i];
                delete (Xapian::Query*)children[i];
            }
        }
        return (xapian_query_t *)compound;
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        delete compound;
        return 0;
    }
}
/* Negate a query.  Xapian offers no unary NOT operator, so the negation
 * is expressed as (MatchAll AND_NOT child).
 *
 * The child is deleted once the new query has been built.  Returns the
 * heap-allocated negated query, or 0 after logging if Xapian threw. */
xapian_query_t *xapian_query_new_not(const xapian_db_t *db __attribute__((unused)),
                                     xapian_query_t *child)
{
    Xapian::Query *operand = (Xapian::Query *)child;

    try {
        Xapian::Query *negated =
            new Xapian::Query(Xapian::Query::OP_AND_NOT,
                              Xapian::Query::MatchAll,
                              *operand);
        // The new query holds its own reference to the operand, so the
        // wrapper handed in by the caller is released here.
        delete operand;
        return (xapian_query_t *)negated;
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        return 0;
    }
}
/* Return a new heap-allocated query that matches every document. */
xapian_query_t *xapian_query_new_matchall(const xapian_db_t *db __attribute__((unused)))
{
    Xapian::Query *everything = new Xapian::Query(Xapian::Query::MatchAll);
    return (xapian_query_t *) everything;
}
/* Restrict child to documents carrying the term "XE" + doctype.
 * A NULL child stands for "all documents".
 *
 * The child (if any) is deleted once the new query has been built.
 * Returns the heap-allocated filtered query, or 0 after logging if
 * Xapian threw. */
xapian_query_t *xapian_query_new_has_doctype(const xapian_db_t *db __attribute__((unused)),
                                             char doctype, xapian_query_t *child)
{
    Xapian::Query *inner = (Xapian::Query *)child;

    try {
        Xapian::Query *filtered =
            new Xapian::Query(Xapian::Query::OP_FILTER,
                              inner ? *inner : Xapian::Query::MatchAll,
                              std::string("XE") + doctype);
        // The new query holds its own reference to the child, so the
        // wrapper handed in by the caller is released here.
        delete inner;
        return (xapian_query_t *)filtered;
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        return 0;
    }
}
/* Release a query created by one of the xapian_query_new_* functions.
 * A Xapian exception raised during destruction is logged, not propagated. */
void xapian_query_free(xapian_query_t *qq)
{
    Xapian::Query *query = (Xapian::Query *)qq;

    try {
        delete query;
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
    }
}
/* Run query qq against db->database and hand all results to cb at once.
 *
 * For every match whose SLOT_CYRUSID value is well-formed, the 40
 * characters following its 3-character prefix are copied into a packed
 * array of 41-byte, NUL-terminated entries.  cb is then called once with
 * that array, the number of entries n, and rock.  The array is freed
 * right after cb returns, so cb must not keep the pointer.
 *
 * Returns 0 if db has no database or no entry was collected (cb is not
 * called), IMAP_IOERROR if Xapian threw, and otherwise whatever cb
 * returned. */
-int xapian_query_run(const xapian_db_t *db, const xapian_query_t *qq, int is_legacy,
+int xapian_query_run(const xapian_db_t *db, const xapian_query_t *qq,
int (*cb)(void *data, size_t n, void *rock), void *rock)
{
const Xapian::Query *query = (const Xapian::Query *)qq;
void *data = NULL;
size_t n = 0;
- if ((is_legacy && !db->legacydbv4) || (!is_legacy && !db->database)) return 0;
+ if (!db->database) return 0;
try {
- Xapian::Database *database = is_legacy ? db->legacydbv4 : db->database;
+ Xapian::Database *database = db->database;
Xapian::Enquire enquire(*database);
enquire.set_query(*query);
enquire.set_sort_by_value(0, false); // sort by cyrusid ascending
/* Request every possible match: the upper bound is the document count. */
Xapian::MSet matches = enquire.get_mset(0, database->get_doccount());
size_t size = matches.size();
/* One zero-filled 41-byte slot per match: 40 characters plus NUL. */
if (size) data = xzmalloc(size * 41);
for (Xapian::MSetIterator i = matches.begin() ; i != matches.end() ; ++i) {
const Xapian::Document& d = i.get_document();
const std::string& cyrusid = d.get_value(SLOT_CYRUSID);
/* ignore documents with no cyrusid. Shouldn't happen, but has been seen */
/* NOTE(review): the check rejects any length other than 43, although
 * the log message below only mentions a zero-length cyrusid. */
if (cyrusid.length() != 43) {
xsyslog(LOG_ERR, "IOERROR: skipping document with zero-length cyrusid",
"documentid=<%u> paths=<%s>",
d.get_docid(), db->paths->c_str());
continue;
}
const char *cstr = cyrusid.c_str();
/* A valid cyrusid starts with '*', one letter, '*'. */
if (cstr[0] != '*' || !isalpha(cstr[1]) || cstr[2] != '*') {
xsyslog(LOG_ERR, "IOERROR: skipping document with invalid cyrusid",
"cyrusid=<%s> documentid=<%u> paths=<%s>",
cstr, d.get_docid(), db->paths->c_str());
continue;
}
/* Guard against writing past the slots allocated above. */
if (n >= size) throw Xapian::DatabaseError("Too many records in MSet");
char *entry = (char *) data + (41*n);
/* Copy the 40 characters after the 3-character prefix. */
memcpy(entry, cstr+3, 40);
entry[40] = '\0';
++n;
}
}
catch (const Xapian::Error &err) {
xsyslog(LOG_ERR, "IOERROR: caught exception",
"exception=<%s> query=<%s>",
err.get_description().c_str(),
query ? query->get_description().c_str() : "");
free(data);
return IMAP_IOERROR;
}
/* Nothing usable was collected: cb is not called. */
if (!n) {
free(data);
return 0;
}
int r = cb(data, n, rock);
free(data);
return r;
}
/* ====================================================================== */
/* State of a snippet generator.  It is allocated zero-filled by
 * xapian_snipgen_new() and released by xapian_snipgen_free(). */
struct xapian_snipgen
{
/* Fallback stemmer, created in xapian_snipgen_new() around a
 * CyrusSearchStemmer and used by xapian_snipgen_doc_part(). */
Xapian::Stem *default_stemmer;
/* Database handle passed to xapian_snipgen_new(); stored as given and
 * not released by xapian_snipgen_free(). */
xapian_db_t *db;
/* In-memory database that backs the Enquire used for snippet generation. */
Xapian::Database *memdb;
/* Matches added by xapian_snipgen_add_match() that are neither quoted
 * nor contain '*'.  Allocated on first use, reset by end_doc. */
std::vector<std::string> *loose_terms;
/* Matches added by xapian_snipgen_add_match() that are quoted or contain
 * '*'.  Allocated on first use, reset by end_doc. */
std::vector<std::string> *queries;
/* Id of the current document, set by begin_doc and freed by end_doc. */
char *cyrusid;
/* Document type of the current document, set by begin_doc. */
char doctype;
/* Accumulates the snippet text of the current document. */
struct buf *buf;
/* Highlight start/end markers and omission marker handed to
 * Xapian::MSet::snippet(); the pointers are stored as given. */
const char *hi_start;
const char *hi_end;
const char *omit;
/* Upper bound on the accumulated snippet length, read from
 * IMAPOPT_SEARCH_SNIPPET_LENGTH. */
size_t max_len;
};
/* Allocate a snippet generator for db.
 *
 * hi_start and hi_end delimit highlighted matches and omit marks elided
 * text; the three pointers are stored as given, not copied.  The maximum
 * snippet length is read from IMAPOPT_SEARCH_SNIPPET_LENGTH.
 *
 * Release the result with xapian_snipgen_free(). */
xapian_snipgen_t *xapian_snipgen_new(xapian_db_t *db,
                                     const char *hi_start,
                                     const char *hi_end,
                                     const char *omit)
{
    /* xzmalloc() zero-fills, so every member not set below starts out
     * as NULL / 0. */
    xapian_snipgen_t *sg = (xapian_snipgen_t *)xzmalloc(sizeof(xapian_snipgen_t));

    sg->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
    sg->db = db;
    sg->memdb = new Xapian::WritableDatabase(std::string(), Xapian::DB_BACKEND_INMEMORY);
    sg->buf = buf_new();

    sg->hi_start = hi_start;
    sg->hi_end = hi_end;
    sg->omit = omit;

    sg->max_len = (size_t) config_getint(IMAPOPT_SEARCH_SNIPPET_LENGTH);

    return sg;
}
/* Release a snippet generator and everything it owns.  NULL is accepted. */
void xapian_snipgen_free(xapian_snipgen_t *snipgen)
{
    if (snipgen) {
        /* C++ members */
        delete snipgen->default_stemmer;
        delete snipgen->loose_terms;
        delete snipgen->queries;
        delete snipgen->memdb;

        /* C members */
        free(snipgen->cyrusid);
        buf_destroy(snipgen->buf);

        free(snipgen);
    }
}
/* Build the query used to locate snippet matches.
 *
 * Loose terms are indexed with a term generator using stemmer, and all
 * resulting terms are ORed together.  Every phrase/wildcard query is
 * parsed with a query parser using the same stemmer and ORed in.
 *
 * Returns an empty query if the generator holds no matches. */
static Xapian::Query xapian_snipgen_build_query(xapian_snipgen_t *snipgen, Xapian::Stem& stemmer)
{
    Xapian::Query result;

    if (snipgen->loose_terms) {
        /* Add loose query terms */
        Xapian::TermGenerator termgen;
        termgen.set_stemmer(stemmer);
#ifdef USE_XAPIAN_CJK_WORDS
        termgen.set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
                          ~Xapian::TermGenerator::FLAG_CJK_WORDS);
#else
        termgen.set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
                          ~Xapian::TermGenerator::FLAG_CJK_NGRAM);
#endif

        for (const std::string& loose : *snipgen->loose_terms) {
            termgen.index_text(Xapian::Utf8Iterator(loose));
        }

        const Xapian::Document& indexed = termgen.get_document();
        result = Xapian::Query(Xapian::Query::OP_OR,
                               indexed.termlist_begin(),
                               indexed.termlist_end());
    }

    if (snipgen->queries) {
        /* Add phrase queries */
        unsigned flags = Xapian::QueryParser::FLAG_PHRASE |
                         Xapian::QueryParser::FLAG_WILDCARD;
#ifdef USE_XAPIAN_CJK_WORDS
        flags |= Xapian::QueryParser::FLAG_CJK_WORDS;
#else
        flags |= Xapian::QueryParser::FLAG_CJK_NGRAM;
#endif

        Xapian::QueryParser parser;
        parser.set_stemmer(stemmer);

        for (const std::string& phrase : *snipgen->queries) {
            result |= parser.parse_query(phrase, flags);
        }
    }

    return result;
}
/* Register a search match to be highlighted in generated snippets.
 *
 * A match longer than one character that is enclosed in double quotes or
 * contains '*' is kept as a query; anything else is kept as a loose
 * term.  Always returns 0. */
int xapian_snipgen_add_match(xapian_snipgen_t *snipgen, const char *match)
{
    const size_t len = strlen(match);
    const bool quoted = len > 1 && match[0] == '"' && match[len-1] == '"';
    const bool has_wildcard = len > 1 && strchr(match, '*') != NULL;

    /* Select the list this match belongs to; it is created on first use. */
    std::vector<std::string> *&bucket =
        (quoted || has_wildcard) ? snipgen->queries : snipgen->loose_terms;

    if (!bucket) {
        bucket = new std::vector<std::string>;
    }
    bucket->push_back(match);

    return 0;
}
/* Start generating snippets for the document identified by guid and
 * doctype.
 *
 * The document's cyrusid is built with make_cyrusid() and stored on the
 * generator, and the accumulated snippet buffer is reset.  Always
 * returns 0. */
int xapian_snipgen_begin_doc(xapian_snipgen_t *snipgen,
                             const struct message_guid *guid, char doctype)
{
    struct buf buf = BUF_INITIALIZER;
    make_cyrusid(&buf, guid, doctype);

    /* Drop any id left over from a document that was begun but never
     * ended; overwriting the pointer would leak it.  free(NULL) is a
     * no-op in the usual case. */
    free(snipgen->cyrusid);
    snipgen->cyrusid = buf_release(&buf);
    snipgen->doctype = doctype;

    buf_reset(snipgen->buf);
    return 0;
}
/* Generate a snippet for one document part with the given stemmer and
 * append it to the generator's buffer.
 *
 * The snippet may use at most the space left below max_len.  A non-empty
 * snippet is appended to snipgen->buf, preceded by the omit marker if
 * the buffer already holds text.  Nothing is appended if the query is
 * empty or the snippet comes back empty.
 *
 * Returns 0 on success and IMAP_IOERROR if Xapian threw. */
int xapian_snipgen_make_snippet(xapian_snipgen_t *snipgen,
                                const struct buf *part,
                                Xapian::Stem* stemmer)
{
    try {
        const std::string text(buf_base(part), buf_len(part));
        Xapian::Enquire enquire(*snipgen->memdb);

        Xapian::Query query = xapian_snipgen_build_query(snipgen, *stemmer);
        if (query.empty()) {
            return 0;
        }
        enquire.set_query(query);

        unsigned flags = Xapian::MSet::SNIPPET_EXHAUSTIVE |
                         Xapian::MSet::SNIPPET_EMPTY_WITHOUT_MATCH;
#ifdef USE_XAPIAN_CJK_WORDS
        flags |= Xapian::MSet::SNIPPET_CJK_WORDS;
#endif

        /* Space still available for this document's snippet. */
        const size_t remaining = snipgen->max_len - buf_len(snipgen->buf);

        Xapian::MSet mset = enquire.get_mset(0, 0);
        const std::string snippet = mset.snippet(text,
                                                 remaining,
                                                 *stemmer, flags,
                                                 snipgen->hi_start,
                                                 snipgen->hi_end,
                                                 snipgen->omit);

        if (snippet.empty()) {
            return 0;
        }

        /* Separate this snippet from earlier ones with the omit marker. */
        if (buf_len(snipgen->buf)) {
            buf_appendoverlap(snipgen->buf, snipgen->omit);
        }
        buf_appendcstr(snipgen->buf, snippet.c_str());
        return 0;

    } catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s>",
                         err.get_description().c_str());
        return IMAP_IOERROR;
    }
}
/* Feed one part of the current document to the snippet generator.
 *
 * Nothing happens if no matches were registered or the snippet buffer
 * has already reached max_len.
 *
 * With IMAPOPT_SEARCH_INDEX_LANGUAGE enabled, the languages recorded for
 * the document are looked up in the sub-databases, and a snippet is
 * attempted with each language's stemmer (other than "en"); the first
 * attempt that adds text wins.  Otherwise, or if none added text, the
 * default stemmer is used.
 *
 * Returns 0 on success, or the error of the default-stemmer attempt. */
int xapian_snipgen_doc_part(xapian_snipgen_t *snipgen, const struct buf *part,
                            int partnum __attribute__((unused)))
{
    // Ignore empty queries.
    if (!snipgen->loose_terms && !snipgen->queries) return 0;

    // Don't exceed allowed snippet length.
    if (buf_len(snipgen->buf) >= snipgen->max_len) return 0;

    if (config_getswitch(IMAPOPT_SEARCH_INDEX_LANGUAGE) &&
        snipgen->db->database && snipgen->cyrusid) {
        std::set<std::string> doclangs;

        // Lookup stemmer language for this document part, if any.
        // Stop at the first sub-database that has the metadata.  The
        // break used to be unconditional, so only the first sub-database
        // was ever consulted.
        std::string key = lang_doc_key(snipgen->cyrusid);
        for (const Xapian::Database& subdb : *snipgen->db->subdbs) {
            std::string val = subdb.get_metadata(key);
            if (!val.empty()) {
                parse_doclangs(val, doclangs);
                break;
            }
        }

        // Generate snippets for each detected message language.
        // The first non-empty snippet wins.
        size_t prev_size = buf_len(snipgen->buf);
        for (std::set<std::string>::iterator it = doclangs.begin(); it != doclangs.end(); ++it) {
            const std::string& iso_lang = *it;
            // "en" is not tried here; it is left to the default stemmer
            // at the end of this function.
            if (iso_lang.compare("en")) {
                try {
                    Xapian::Stem stemmer = get_stemmer(iso_lang);
                    int r = xapian_snipgen_make_snippet(snipgen, part, &stemmer);
                    if (!r && prev_size != buf_len(snipgen->buf)) {
                        return 0;
                    }
                } catch (const Xapian::InvalidArgumentError &err) {
                    // ignore unknown stemmer
                }
            }
        }
    }

    /* Using a custom stemmer did not generate a snippet.
     * This could be because the query matched using the
     * default stemmer, so try generating a snippet with
     * that stemmer instead.*/
    return xapian_snipgen_make_snippet(snipgen, part, snipgen->default_stemmer);
}
/* Finish the current document.
 *
 * The accumulated snippet is copied into buf (replacing its previous
 * content and NUL-terminating it).  The generator is then reset: snippet
 * buffer, registered matches, cyrusid and doctype are all cleared.
 * Always returns 0. */
int xapian_snipgen_end_doc(xapian_snipgen_t *snipgen, struct buf *buf)
{
    /* Hand the result to the caller. */
    buf_reset(buf);
    buf_copy(buf, snipgen->buf);
    buf_cstring(buf);

    /* Forget everything that belonged to this document. */
    buf_reset(snipgen->buf);

    delete snipgen->loose_terms;
    delete snipgen->queries;
    snipgen->loose_terms = NULL;
    snipgen->queries = NULL;

    free(snipgen->cyrusid);
    snipgen->cyrusid = NULL;
    snipgen->doctype = 0;

    return 0;
}
/* Copy selected documents from several source databases into a new one.
 *
 * dest is the path of the glass database to create and sources is a
 * NULL-terminated array of source database paths.  cb is called with
 * each cyrusid found in a source and rock; it returns true if the
 * document should be copied, false if not.
 *
 * A document is skipped if its cyrusid is already present in dest or,
 * for message documents, if a later source holds it at a better index
 * level.  Per-document index level and language metadata are carried
 * over, and the set of index versions and the language counts of dest
 * are rewritten at the end.
 *
 * Returns 0 on success, the error from calculate_language_counts(), or
 * IMAP_IOERROR if Xapian threw. */
int xapian_filter(const char *dest, const char **sources,
                  int (*cb)(const char *cyrusid, void *rock),
                  void *rock)
{
    int r = 0;
    const char *thispath = "(unknown path)";

    try {
        /* create a destination database */
        Xapian::WritableDatabase destdb {dest, Xapian::DB_CREATE|Xapian::DB_BACKEND_GLASS};

        /* With multiple databases as above, the docids are interleaved, so it
         * might be worth trying to open each source and copy its documents to
         * destdb in turn for better locality of reference, and so better cache
         * use. -- Olly on the mailing list */

        std::vector<Xapian::Database> srcdbs;

        // Open databases and aggregate database-level metadata.
        while (*sources) {
            thispath = *sources++;
            const Xapian::Database srcdb {thispath};
            srcdbs.push_back(srcdb);
        }

        // Copy all matching documents.
        std::set<int> db_versions;
        for (size_t i = 0; i < srcdbs.size(); ++i) {
            const Xapian::Database& srcdb = srcdbs.at(i);
            bool need_md_versions = false;
            // Read once per source; reused below if needed (this used to
            // be read a second time into a shadowing local).
            std::set<int> md_versions = read_db_versions(srcdb);

            /* copy all matching documents to the new DB */
            for (Xapian::ValueIterator it = srcdb.valuestream_begin(SLOT_CYRUSID);
                    it != srcdb.valuestream_end(SLOT_CYRUSID); ++it) {
                const std::string& cyrusid = *it;
                const std::string idkey {"cyrusid." + cyrusid};

                // check if caller wants this cyrusid
                if (!cb(cyrusid.c_str(), rock)) {
                    continue;
                }

                // is it already indexed?
                if (!destdb.get_metadata(idkey).empty()) {
                    continue;
                }

                // is there a subsequent db with a better index level? (only for G docs)
                uint8_t indexlevel = parse_indexlevel(srcdb.get_metadata(idkey));
                if (cyrusid[1] == XAPIAN_WRAP_DOCTYPE_MSG) {
                    int found_better = 0;
                    for (size_t j = i + 1; !found_better && j < srcdbs.size(); ++j) {
                        uint8_t level = parse_indexlevel(srcdbs[j].get_metadata(idkey));
                        found_better = better_indexlevel(indexlevel, level) != indexlevel;
                    }
                    if (found_better) {
                        continue;
                    }
                }

                Xapian::Document srcdoc = srcdb.get_document(it.get_docid());

                // Drop a language value that does not start with a letter
                // before the document is written.  Editing the Document
                // returned by destdb.get_document() afterwards has no
                // effect unless it is written back with replace_document().
                const std::string langval = srcdoc.get_value(SLOT_DOCLANGS);
                if (!langval.empty() && !isalpha((unsigned char) langval[0])) {
                    srcdoc.remove_value(SLOT_DOCLANGS);
                }

                // add document
                destdb.add_document(srcdoc);
                destdb.set_metadata(idkey, format_indexlevel(indexlevel));

                // copy document language metadata
                const std::string& langkey = lang_doc_key(cyrusid.c_str());
                if (destdb.get_metadata(langkey).empty()) {
                    std::string val = srcdb.get_metadata(langkey);
                    if (!val.empty() && isalpha((unsigned char) val[0])) {
                        destdb.set_metadata(langkey, val);
                    }
                }

                // add document index version
                const std::string& verval = srcdoc.get_value(SLOT_INDEXVERSION);
                if (!verval.empty()) {
                    int version = std::atoi(verval.c_str());
                    if (version) db_versions.insert(version);
                }
                else need_md_versions = true;
            }

            if (need_md_versions) {
                /* At least one added document didn't have its index
                 * version slot set in this subdb. Read legacy versions. */
                db_versions.insert(md_versions.begin(), md_versions.lower_bound(14));
            }
        }

        thispath = "(unknown path)";

        // set the versions
        write_db_versions(destdb, db_versions);

        // recalculate language counts
        std::map<const std::string, unsigned> lang_counts;
        r = calculate_language_counts(destdb, lang_counts);
        if (r) {
            xsyslog(LOG_ERR, "IOERROR: corrupt metadata",
                             "filter=<%s>",
                             dest);
            return r;
        }
        write_language_counts(destdb, lang_counts);

        /* commit all changes explicitly */
        destdb.commit();
    }
    catch (const Xapian::Error &err) {
        xsyslog(LOG_ERR, "IOERROR: caught exception",
                         "exception=<%s> path=<%s>",
                         err.get_description().c_str(), thispath);
        r = IMAP_IOERROR;
    }

    return r;
}
/* Return the version string reported by the Xapian library. */
const char *xapian_version_string()
{
    const char *version = Xapian::version_string();
    return version;
}
/* A standalone document plus the term generator that indexes text into
 * it.  Created by xapian_doc_new(), destroyed by xapian_doc_close(). */
struct xapian_doc {
/* Term generator whose target document is set to *doc. */
Xapian::TermGenerator *termgen;
/* Document that receives the generated terms. */
Xapian::Document *doc;
};
/* Allocate an empty document together with a term generator bound to it.
 * Release the result with xapian_doc_close(). */
xapian_doc_t *xapian_doc_new(void)
{
    xapian_doc_t *wrapper = (xapian_doc_t *) xzmalloc(sizeof(struct xapian_doc));

    wrapper->doc = new Xapian::Document;
    wrapper->termgen = new Xapian::TermGenerator;
    /* Text indexed through the generator lands in this document. */
    wrapper->termgen->set_document(*wrapper->doc);

    return wrapper;
}
/* Index len bytes of UTF-8 text through the document's term generator. */
void xapian_doc_index_text(xapian_doc_t *doc, const char *text, size_t len)
{
    Xapian::Utf8Iterator utf8(text, len);
    doc->termgen->index_text(utf8);
}
/* Return the number of terms in the document's term list. */
size_t xapian_doc_termcount(xapian_doc_t *doc)
{
    const Xapian::Document *document = doc->doc;
    return document->termlist_count();
}
/* Call cb(term, rock) for each term of the document, in term-list order.
 * Iteration stops at the first non-zero return from cb, and that value
 * is returned.  Returns 0 if every call returned 0. */
int xapian_doc_foreach_term(xapian_doc_t *doc, int(*cb)(const char*, void*), void *rock)
{
    Xapian::TermIterator term = doc->doc->termlist_begin();

    while (term != doc->doc->termlist_end()) {
        int rc = cb((*term).c_str(), rock);
        if (rc != 0) {
            return rc;
        }
        ++term;
    }

    return 0;
}
/* Remove all values from the document.  This function makes no other
 * change to the document. */
void xapian_doc_reset(xapian_doc_t *doc)
{
    Xapian::Document *document = doc->doc;
    document->clear_values();
}
extern void xapian_doc_close(xapian_doc_t *termgen);

/* Destroy a document created by xapian_doc_new(), together with its
 * term generator.  doc must not be NULL. */
void xapian_doc_close(xapian_doc_t *doc)
{
    Xapian::TermGenerator *generator = doc->termgen;
    Xapian::Document *document = doc->doc;

    delete generator;
    delete document;
    free(doc);
}
diff --git a/imap/xapian_wrap.h b/imap/xapian_wrap.h
index 5b1f0948d..72d1fc512 100644
--- a/imap/xapian_wrap.h
+++ b/imap/xapian_wrap.h
@@ -1,121 +1,118 @@
/* xapian_wrap.h -- C++ hiding wrapper API for Xapian
*
* Copyright (c) 1994-2012 Carnegie Mellon University. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The name "Carnegie Mellon University" must not be used to
* endorse or promote products derived from this software without
* prior written permission. For permission or any legal
* details, please contact
* Carnegie Mellon University
* Center for Technology Transfer and Enterprise Creation
* 4615 Forbes Avenue
* Suite 302
* Pittsburgh, PA 15213
* (412) 268-7393, fax: (412) 268-7395
* innovation@andrew.cmu.edu
*
* 4. Redistributions of any form whatsoever must retain the following
* acknowledgment:
* "This product includes software developed by Computing Services
* at Carnegie Mellon University (http://www.cmu.edu/computing/)."
*
* CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#ifndef __CYRUS_IMAP_XAPIAN_WRAP__
#define __CYRUS_IMAP_XAPIAN_WRAP__
#include "message_guid.h"
#include "util.h"
#include "strarray.h"
#include "ptrarray.h"
typedef struct xapian_dbw xapian_dbw_t;
typedef struct xapian_db xapian_db_t;
typedef struct xapian_query xapian_query_t;
typedef struct xapian_snipgen xapian_snipgen_t;
typedef struct xapian_doc xapian_doc_t;
/* Document types */
#define XAPIAN_WRAP_DOCTYPE_MSG 'G'
#define XAPIAN_WRAP_DOCTYPE_PART 'P'
/* compaction interface */
extern int xapian_compact_dbs(const char *dest, const char **sources);
extern void xapian_check_if_needs_reindex(const strarray_t *sources, strarray_t *toreindex, int always_upgrade);
/* write-side interface */
#define XAPIAN_DBW_CONVINDEXED 0
#define XAPIAN_DBW_XAPINDEXED 1
extern int xapian_dbw_open(const char **paths, xapian_dbw_t **dbwp, int mode, int nosync);
extern void xapian_dbw_close(xapian_dbw_t *dbw);
extern int xapian_dbw_begin_txn(xapian_dbw_t *dbw);
extern int xapian_dbw_commit_txn(xapian_dbw_t *dbw);
extern int xapian_dbw_cancel_txn(xapian_dbw_t *dbw);
extern int xapian_dbw_begin_doc(xapian_dbw_t *dbw, const struct message_guid *guid, char doctype);
extern int xapian_dbw_doc_part(xapian_dbw_t *dbw, const struct buf *part, int num_part);
extern int xapian_dbw_end_doc(xapian_dbw_t *dbw, uint8_t indexlevel);
extern unsigned long xapian_dbw_total_length(xapian_dbw_t *dbw);
extern uint8_t xapian_dbw_is_indexed(xapian_dbw_t *dbw, const struct message_guid *guid, char doctype);
/* query-side interface */
extern int xapian_db_open(const char **paths, xapian_db_t **dbp);
extern int xapian_db_opendbw(struct xapian_dbw *dbw, xapian_db_t **dbp);
extern void xapian_db_close(xapian_db_t *);
extern void xapian_query_add_stemmer(xapian_db_t *, const char *iso_lang);
/* Returns a new query for 'term' in search part 'num_part', or NULL if
 * no query could be built.  Free it with xapian_query_free() unless it
 * is handed to one of the combining constructors below. */
extern xapian_query_t *xapian_query_new_match(const xapian_db_t *, int num_part, const char *term);
/* Combines 'n' children with OR (is_or != 0) or AND.  The children are
 * deleted as they are merged into the new query. */
extern xapian_query_t *xapian_query_new_compound(const xapian_db_t *, int is_or, xapian_query_t **children, int n);
extern xapian_query_t *xapian_query_new_matchall(const xapian_db_t *);
/* Negates the given query, which is deleted on success. */
extern xapian_query_t *xapian_query_new_not(const xapian_db_t *, xapian_query_t *);
/* Restricts the given query (NULL means "all documents") to documents of
 * 'doctype'; the given query is deleted on success. */
extern xapian_query_t *xapian_query_new_has_doctype(const xapian_db_t *, char doctype, xapian_query_t *);
extern void xapian_query_free(xapian_query_t *);
/* Runs 'query' and calls 'cb' once with 'base' pointing at 'n' packed,
 * NUL-terminated 41-byte entries.  'base' is freed after 'cb' returns. */
-extern int xapian_query_run(const xapian_db_t *, const xapian_query_t *query, int is_legacy,
+extern int xapian_query_run(const xapian_db_t *, const xapian_query_t *query,
int (*cb)(void *base, size_t n, void *rock), void *rock);
/* snippets interface */
extern xapian_snipgen_t *xapian_snipgen_new(xapian_db_t *db, const char *hi_start, const char *hi_end, const char *omit);
extern void xapian_snipgen_free(xapian_snipgen_t *);
extern int xapian_snipgen_add_match(xapian_snipgen_t *snipgen, const char *match);
extern int xapian_snipgen_begin_doc(xapian_snipgen_t *snipgen, const struct message_guid *guid, char doctype);
extern int xapian_snipgen_doc_part(xapian_snipgen_t *snipgen, const struct buf *part, int partnum);
extern int xapian_snipgen_end_doc(xapian_snipgen_t *snipgen, struct buf *);
/* filter interface */
extern int xapian_filter(const char *dest, const char **sources,
int (*cb)(const char *cyrusid, void *rock),
void *rock);
-/* XXX legacy version 4 DB support */
-extern int xapian_db_has_legacy_v4_index(const xapian_db_t *);
-extern int xapian_db_has_otherthan_v4_index(const xapian_db_t *);
/* Language indexing support */
extern int xapian_db_langstats(xapian_db_t*, ptrarray_t*, size_t*);
/* Document interface */
extern xapian_doc_t *xapian_doc_new(void);
extern void xapian_doc_index_text(xapian_doc_t *doc, const char *text, size_t len);
extern size_t xapian_doc_termcount(xapian_doc_t *doc);
extern int xapian_doc_foreach_term(xapian_doc_t *doc, int(*cb)(const char*, void*), void *rock);
extern void xapian_doc_reset(xapian_doc_t *doc);
extern void xapian_doc_close(xapian_doc_t *doc);
extern const char *xapian_version_string();
#endif

File Metadata

Mime Type
text/x-diff
Expires
Fri, Apr 24, 2:27 PM (1 w, 5 d ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
18887868
Default Alt Text
(275 KB)

Event Timeline