public-inbox.git  about / heads / tags
an "archives first" approach to mailing lists
blob 311ca05f5aebc2ceb432370967e33fbfc4ad758f 8743 bytes (raw)
$ git show HEAD:lib/PublicInbox/xh_cidx.h	# shows this blob on the CLI

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
 
// Copyright (C) all contributors <meta@public-inbox.org>
// License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
// This file is only intended to be included by xap_helper.h
// it implements pieces used by CodeSearchIdx.pm

// Splits an optional trailing decimal length off each prefix in req->pfxv.
// Perl equivalent: $len = s/([0-9]+)\z// ? ($1+0) : 0
//
// req->lenv is (re)allocated with one zeroed size_t slot per prefix.
// For every prefix, the first digit position from which the rest of the
// string parses as a decimal number below SIZE_MAX is taken as the start
// of the length: the number is stored in req->lenv[i] and the prefix
// string is truncated in place at that position.  A slot left at zero
// means no length was found for that prefix.
static void term_length_extract(struct req *req)
{
	req->lenv = (size_t *)calloc(req->pfxc, sizeof(size_t));
	if (!req->lenv)
		EABORT("lenv = calloc(%d %zu)", req->pfxc, sizeof(size_t));
	for (int n = 0; n < req->pfxc; n++) {
		char *const pfx = req->pfxv[n];
		for (char *p = pfx; *p; p++) {
			const bool is_digit = *p >= '0' && *p <= '9';
			if (!is_digit)
				continue;
			if (p == pfx) { // a prefix may not start with a digit
				warnx("W: `%s' not a valid prefix", pfx);
				continue;
			}
			char *rest;
			unsigned long long val = strtoull(p, &rest, 10);
			// accept only when the digits run to the end of
			// the string and the value fits below SIZE_MAX
			if (!*rest && val < (unsigned long long)SIZE_MAX) {
				req->lenv[n] = (size_t)val;
				*p = 0; // strip the length from the prefix
				break;
			}
			// digits followed by something else: complain and
			// keep scanning for a later, trailing run
			warnx("W: `%s' not recognized", pfx);
		}
	}
}

static void dump_ibx_term(struct req *req, int p,
			Xapian::Document *doc, const char *ibx_id)
{
	Xapian::TermIterator cur = doc->termlist_begin();
	Xapian::TermIterator end = doc->termlist_end();
	const char *pfx = req->pfxv[p];
	size_t pfx_len = strlen(pfx);
	size_t term_len = req->lenv[p];

	for (cur.skip_to(pfx); cur != end; cur++) {
		std::string tn = *cur;
		if (!starts_with(&tn, pfx, pfx_len)) break;
		if (term_len > 0 && (tn.length() - pfx_len) != term_len)
			continue;
		fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id);
		++req->nr_out;
	}
}

static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id,
				Xapian::MSetIterator *i)
{
	try {
		Xapian::Document doc = i->get_document();
		for (int p = 0; p < req->pfxc; p++)
			dump_ibx_term(req, p, &doc, ibx_id);
	} catch (const Xapian::DatabaseModifiedError & e) {
		req->srch->db->reopen();
		return ITER_RETRY;
	} catch (const Xapian::DocNotFoundError & e) { // oh well...
		warnx("doc not found: %s", e.get_description().c_str());
	}
	return ITER_OK;
}

static bool cmd_dump_ibx(struct req *req)
{
	if ((optind + 1) >= req->argc)
		ABORT("usage: dump_ibx [OPTIONS] IBX_ID QRY_STR");
	if (!req->pfxc)
		ABORT("dump_ibx requires -A PREFIX");

	const char *ibx_id = req->argv[optind];
	if (my_setlinebuf(req->fp[0])) // for sort(1) pipe
		EABORT("setlinebuf(fp[0])"); // WTF?
	req->asc = true;
	req->sort_col = -1;
	term_length_extract(req);
	Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]);

	// @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
	// in case we need to retry on DB reopens
	for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) {
		for (int t = 10; t > 0; --t)
			switch (dump_ibx_iter(req, ibx_id, &i)) {
			case ITER_OK: t = 0; break; // leave inner loop
			case ITER_RETRY: break; // continue for-loop
			case ITER_ABORT: return false; // error
			}
	}
	emit_mset_stats(req, &mset);
	return true;
}

// Per-invocation state of cmd_dump_roots(), released by
// dump_roots_ensure() (see CLEANUP_DUMP_ROOTS).
struct dump_roots_tmp {
	struct stat sb; // fstat(2) result for root2off_fd; st_size sizes the mmap
	void *mm_ptr; // read-only private mmap(2) of the ROOT2ID_FILE
	char **entries; // calloc'ed array filled by split2argv() from mm_ptr
	struct fbuf wbuf; // output lines accumulate here until dump_roots_flush()
	int root2off_fd; // ROOT2ID_FILE descriptor (-1 if unopened); also the flock(2) target
};

#define CLEANUP_DUMP_ROOTS __attribute__((__cleanup__(dump_roots_ensure)))
// Cleanup handler for a `struct dump_roots_tmp' declared with
// CLEANUP_DUMP_ROOTS; @ptr points at that struct.  Closes the
// root2off descriptor if one was opened, destroys the hsearch(3) table,
// unmaps the file mapping if present, frees the entries array and
// calls fbuf_ensure() on the write buffer.
static void dump_roots_ensure(void *ptr)
{
	struct dump_roots_tmp *t = (struct dump_roots_tmp *)ptr;

	if (t->root2off_fd >= 0)
		xclose(t->root2off_fd);
	hdestroy(); // idempotent
	size_t map_len = off2size(t->sb.st_size);
	if (t->mm_ptr) {
		if (munmap(t->mm_ptr, map_len))
			EABORT("BUG: munmap(%p, %zu)", t->mm_ptr, map_len);
	}
	free(t->entries);
	fbuf_ensure(&t->wbuf);
}

static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
{
	Xapian::TermIterator cur = doc->termlist_begin();
	Xapian::TermIterator end = doc->termlist_end();
	ENTRY e, *ep;
	fbuf_init(root_offs);
	for (cur.skip_to("G"); cur != end; cur++) {
		std::string tn = *cur;
		if (!starts_with(&tn, "G", 1)) break;
		union { const char *in; char *out; } u;
		u.in = tn.c_str() + 1;
		e.key = u.out;
		ep = hsearch(e, FIND);
		if (!ep) ABORT("hsearch miss `%s'", e.key);
		// ep->data is a NUL-terminated string matching /[0-9]+/
		fputc(' ', root_offs->fp);
		fputs((const char *)ep->data, root_offs->fp);
	}
	fputc('\n', root_offs->fp);
	ERR_CLOSE(root_offs->fp, EXIT_FAILURE); // ENOMEM
	root_offs->fp = NULL;
	return true;
}

// writes term values matching @pfx for a given @doc, ending the line
// with the contents of @root_offs
static void dump_roots_term(struct req *req, int p,
				struct dump_roots_tmp *drt,
				struct fbuf *root_offs,
				Xapian::Document *doc)
{
	Xapian::TermIterator cur = doc->termlist_begin();
	Xapian::TermIterator end = doc->termlist_end();
	const char *pfx = req->pfxv[p];
	size_t pfx_len = strlen(pfx);
	size_t term_len = req->lenv[p];

	for (cur.skip_to(pfx); cur != end; cur++) {
		std::string tn = *cur;
		if (!starts_with(&tn, pfx, pfx_len)) break;
		if (term_len > 0 && (tn.length() - pfx_len) != term_len)
			continue;
		fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
		fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
		++req->nr_out;
	}
}

// we may have lines which exceed PIPE_BUF, so we do our own
// buffering and rely on flock(2), here
//
// Writes everything accumulated in drt->wbuf to the descriptor behind
// req->fp[0] while holding an exclusive flock(2) on drt->root2off_fd,
// then rewinds wbuf for reuse.  Returns true immediately when wbuf is
// empty; otherwise returns the result of write_all().
static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt)
{
	off_t pending = ftello(drt->wbuf.fp);
	if (pending < 0) EABORT("ftello");
	if (pending == 0)
		return true; // nothing buffered

	ERR_FLUSH(drt->wbuf.fp); // ENOMEM
	const int out_fd = fileno(req->fp[0]);

	for (;;) { // retry when interrupted by a signal
		if (flock(drt->root2off_fd, LOCK_EX) == 0)
			break;
		if (errno != EINTR)
			err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK?
	}
	const bool ok = write_all(out_fd, &drt->wbuf, (size_t)pending);
	for (;;) {
		if (flock(drt->root2off_fd, LOCK_UN) == 0)
			break;
		if (errno != EINTR)
			err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK?
	}
	if (fseeko(drt->wbuf.fp, 0, SEEK_SET)) EABORT("fseeko");
	return ok;
}

static enum exc_iter dump_roots_iter(struct req *req,
				struct dump_roots_tmp *drt,
				Xapian::MSetIterator *i)
{
	CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n"
	try {
		Xapian::Document doc = i->get_document();
		if (!root2offs_str(&root_offs, &doc))
			return ITER_ABORT; // bad request, abort
		for (int p = 0; p < req->pfxc; p++)
			dump_roots_term(req, p, drt, &root_offs, &doc);
	} catch (const Xapian::DatabaseModifiedError & e) {
		req->srch->db->reopen();
		return ITER_RETRY;
	} catch (const Xapian::DocNotFoundError & e) { // oh well...
		warnx("doc not found: %s", e.get_description().c_str());
	}
	return ITER_OK;
}

// Implements the "dump_roots" command:
//	dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR
// At least one -A PREFIX is required.
//
// ROOT2ID_FILE is mapped read-only and split into alternating
// key/value strings which are loaded into the hsearch(3) table.
// Every document matching QRY_STR then gets its prefixed terms written
// to a buffer (see dump_roots_iter), which is flushed to req->fp[0]
// via dump_roots_flush().
//
// Returns true on success; false when ROOT2ID_FILE could not be split
// into key/value pairs, an iteration asked to abort, or a flush failed.
// All temporary resources in `drt' are released by dump_roots_ensure()
// when the function returns.
static bool cmd_dump_roots(struct req *req)
{
	CLEANUP_DUMP_ROOTS struct dump_roots_tmp drt = {};
	drt.root2off_fd = -1;
	if ((optind + 1) >= req->argc)
		ABORT("usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR");
	if (!req->pfxc)
		ABORT("dump_roots requires -A PREFIX");
	const char *root2off_file = req->argv[optind];
	drt.root2off_fd = open(root2off_file, O_RDONLY);
	if (drt.root2off_fd < 0)
		EABORT("open(%s)", root2off_file);
	if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM?
		err(EXIT_FAILURE, "fstat(%s)", root2off_file);
	// each entry is at least 43 bytes ({OIDHEX}\0{INT}\0),
	// so /32 overestimates the number of expected entries by
	// ~25% (as recommended by Linux hcreate(3) manpage)
	size_t size = off2size(drt.sb.st_size);
	size_t est = (size / 32) + 1; //+1 for "\0" termination
	drt.mm_ptr = mmap(NULL, size, PROT_READ,
				MAP_PRIVATE, drt.root2off_fd, 0);
	if (drt.mm_ptr == MAP_FAILED)
		err(EXIT_FAILURE, "mmap(%zu, %s)", size, root2off_file);
	// two strings (key + value) per entry; guard against wraparound
	size_t asize = est * 2;
	if (asize < est) ABORT("too many entries: %zu", est);
	drt.entries = (char **)calloc(asize, sizeof(char *));
	if (!drt.entries)
		err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *));
	size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr, size, asize);
	if (tot <= 0) return false; // split2argv already warned on error
	// The loop below consumes two strings per pass, so an odd count
	// would read an entries[] slot split2argv never filled and store
	// it as the value for the last key.
	if (tot & 1) {
		warnx("W: odd number of entries (%zu) in %s",
			tot, root2off_file);
		return false;
	}
	if (!hcreate(est))
		err(EXIT_FAILURE, "hcreate(%zu)", est);
	for (size_t i = 0; i < tot; ) {
		ENTRY e;
		e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM
		e.data = drt.entries[i++];
		if (!hsearch(e, ENTER))
			err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key,
					(const char *)e.data);
	}
	req->asc = true;
	req->sort_col = -1;
	Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]);
	term_length_extract(req);

	fbuf_init(&drt.wbuf);

	// @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
	// in case we need to retry on DB reopens
	for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) {
		for (int t = 10; t > 0; --t)
			switch (dump_roots_iter(req, &drt, &i)) {
			case ITER_OK: t = 0; break; // leave inner loop
			case ITER_RETRY: break; // continue for-loop
			case ITER_ABORT: return false; // error
			}
		// NOTE(review): nr_out can grow by more than one per
		// document, so this only flushes when the count happens to
		// land on a multiple of 0x4000 -- confirm that is intended
		if (!(req->nr_out & 0x3fff) && !dump_roots_flush(req, &drt))
			return false;
	}
	if (!dump_roots_flush(req, &drt))
		return false;
	emit_mset_stats(req, &mset);
	return true;
}

git clone https://public-inbox.org/public-inbox.git
git clone http://7fh6tueqddpjyxjmgtdiueylzoqt6pt7hec3pukyptlmohoowvhde4yd.onion/public-inbox.git