dumping ground for random patches and texts
 help / color / mirror / Atom feed
* [PATCH 1/7] WIP-cidx-xh-split
@ 2023-11-26 14:19 Eric Wong
  2023-11-26 14:19 ` [PATCH 2/7] WIP-cidx Eric Wong
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Eric Wong @ 2023-11-26 14:19 UTC (permalink / raw)
  To: spew

---
 MANIFEST                        |   1 +
 lib/PublicInbox/XapHelperCxx.pm |  10 +-
 lib/PublicInbox/xap_helper.h    | 270 +-------------------------------
 lib/PublicInbox/xh_cidx.h       | 260 ++++++++++++++++++++++++++++++
 4 files changed, 273 insertions(+), 268 deletions(-)
 create mode 100644 lib/PublicInbox/xh_cidx.h

diff --git a/MANIFEST b/MANIFEST
index 85811133..bbbe0b91 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -378,6 +378,7 @@ lib/PublicInbox/XapHelperCxx.pm
 lib/PublicInbox/Xapcmd.pm
 lib/PublicInbox/gcf2_libgit2.h
 lib/PublicInbox/xap_helper.h
+lib/PublicInbox/xh_cidx.h
 sa_config/Makefile
 sa_config/README
 sa_config/root/etc/spamassassin/public-inbox.pre
diff --git a/lib/PublicInbox/XapHelperCxx.pm b/lib/PublicInbox/XapHelperCxx.pm
index f421c7bc..8a66fdcd 100644
--- a/lib/PublicInbox/XapHelperCxx.pm
+++ b/lib/PublicInbox/XapHelperCxx.pm
@@ -20,7 +20,7 @@ $ENV{PERL_INLINE_DIRECTORY} // die('BUG: PERL_INLINE_DIRECTORY unset');
 substr($dir, 0, 0) = "$ENV{PERL_INLINE_DIRECTORY}/";
 my $bin = "$dir/xap_helper";
 my ($srcpfx) = (__FILE__ =~ m!\A(.+/)[^/]+\z!);
-my @srcs = map { $srcpfx.$_ } qw(xap_helper.h);
+my @srcs = map { $srcpfx.$_ } qw(xap_helper.h xh_cidx.h);
 my @pm_dep = map { $srcpfx.$_ } qw(Search.pm CodeSearch.pm);
 my $ldflags = '-Wl,-O1';
 $ldflags .= ' -Wl,--compress-debug-sections=zlib' if $^O ne 'openbsd';
@@ -61,11 +61,9 @@ sub build () {
 	require PublicInbox::OnDestroy;
 	my ($prog) = ($bin =~ m!/([^/]+)\z!);
 	my $lk = PublicInbox::Lock->new("$dir/$prog.lock")->lock_for_scope;
-	open my $fh, '>', "$dir/$prog.cpp";
-	say $fh qq(# include "$_") for @srcs;
-	print $fh PublicInbox::Search::generate_cxx();
-	print $fh PublicInbox::CodeSearch::generate_cxx();
-	close $fh;
+	write_file '>', "$dir/$prog.cpp", qq{#include "xap_helper.h"\n},
+			PublicInbox::Search::generate_cxx(),
+			PublicInbox::CodeSearch::generate_cxx();
 
 	opendir my $dh, '.';
 	my $restore = PublicInbox::OnDestroy->new(\&chdir, $dh);
diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h
index b6b517d5..0824ce71 100644
--- a/lib/PublicInbox/xap_helper.h
+++ b/lib/PublicInbox/xap_helper.h
@@ -146,6 +146,12 @@ struct worker {
 	unsigned nr;
 };
 
+struct fbuf {
+	FILE *fp;
+	char *ptr;
+	size_t len;
+};
+
 #define SPLIT2ARGV(dst,buf,len) split2argv(dst,buf,len,MY_ARRAY_SIZE(dst))
 static size_t split2argv(char **dst, char *buf, size_t len, size_t limit)
 {
@@ -253,87 +259,11 @@ static bool starts_with(const std::string *s, const char *pfx, size_t pfx_len)
 	return s->size() >= pfx_len && !memcmp(pfx, s->c_str(), pfx_len);
 }
 
-static void dump_ibx_term(struct req *req, const char *pfx,
-			Xapian::Document *doc, const char *ibx_id)
-{
-	Xapian::TermIterator cur = doc->termlist_begin();
-	Xapian::TermIterator end = doc->termlist_end();
-	size_t pfx_len = strlen(pfx);
-
-	for (cur.skip_to(pfx); cur != end; cur++) {
-		std::string tn = *cur;
-
-		if (starts_with(&tn, pfx, pfx_len)) {
-			fprintf(req->fp[0], "%s %s\n",
-				tn.c_str() + pfx_len, ibx_id);
-			++req->nr_out;
-		}
-	}
-}
-
 static int my_setlinebuf(FILE *fp) // glibc setlinebuf(3) can't report errors
 {
 	return setvbuf(fp, NULL, _IOLBF, 0);
 }
 
-static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id,
-				Xapian::MSetIterator *i)
-{
-	try {
-		Xapian::Document doc = i->get_document();
-		for (int p = 0; p < req->pfxc; p++)
-			dump_ibx_term(req, req->pfxv[p], &doc, ibx_id);
-	} catch (const Xapian::DatabaseModifiedError & e) {
-		req->srch->db->reopen();
-		return ITER_RETRY;
-	} catch (const Xapian::DocNotFoundError & e) { // oh well...
-		warnx("doc not found: %s", e.get_description().c_str());
-	}
-	return ITER_OK;
-}
-
-static bool cmd_dump_ibx(struct req *req)
-{
-	if ((optind + 1) >= req->argc)
-		ABORT("usage: dump_ibx [OPTIONS] IBX_ID QRY_STR");
-	if (!req->pfxc)
-		ABORT("dump_ibx requires -A PREFIX");
-
-	const char *ibx_id = req->argv[optind];
-	if (my_setlinebuf(req->fp[0])) // for sort(1) pipe
-		EABORT("setlinebuf(fp[0])"); // WTF?
-	req->asc = true;
-	req->sort_col = -1;
-	Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]);
-
-	// @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
-	// in case we need to retry on DB reopens
-	for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) {
-		for (int t = 10; t > 0; --t)
-			switch (dump_ibx_iter(req, ibx_id, &i)) {
-			case ITER_OK: t = 0; break; // leave inner loop
-			case ITER_RETRY: break; // continue for-loop
-			case ITER_ABORT: return false; // error
-			}
-	}
-	emit_mset_stats(req, &mset);
-	return true;
-}
-
-struct fbuf {
-	FILE *fp;
-	char *ptr;
-	size_t len;
-};
-
-struct dump_roots_tmp {
-	struct stat sb;
-	void *mm_ptr;
-	char **entries;
-	struct fbuf wbuf;
-	int root2off_fd;
-};
-
 // n.b. __cleanup__ works fine with C++ exceptions, but not longjmp
 // Only clang and g++ are supported, as AFAIK there's no other
 // relevant Free(-as-in-speech) C++ compilers.
@@ -360,126 +290,6 @@ static void xclose(int fd)
 		EABORT("BUG: close");
 }
 
-#define CLEANUP_DUMP_ROOTS __attribute__((__cleanup__(dump_roots_ensure)))
-static void dump_roots_ensure(void *ptr)
-{
-	struct dump_roots_tmp *drt = (struct dump_roots_tmp *)ptr;
-	if (drt->root2off_fd >= 0)
-		xclose(drt->root2off_fd);
-	hdestroy(); // idempotent
-	if (drt->mm_ptr && munmap(drt->mm_ptr, drt->sb.st_size))
-		EABORT("BUG: munmap(%p, %zu)", drt->mm_ptr, drt->sb.st_size);
-	free(drt->entries);
-	fbuf_ensure(&drt->wbuf);
-}
-
-static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
-{
-	Xapian::TermIterator cur = doc->termlist_begin();
-	Xapian::TermIterator end = doc->termlist_end();
-	ENTRY e, *ep;
-	fbuf_init(root_offs);
-	for (cur.skip_to("G"); cur != end; cur++) {
-		std::string tn = *cur;
-		if (!starts_with(&tn, "G", 1))
-			continue;
-		union { const char *in; char *out; } u;
-		u.in = tn.c_str() + 1;
-		e.key = u.out;
-		ep = hsearch(e, FIND);
-		if (!ep) ABORT("hsearch miss `%s'", e.key);
-		// ep->data is a NUL-terminated string matching /[0-9]+/
-		fputc(' ', root_offs->fp);
-		fputs((const char *)ep->data, root_offs->fp);
-	}
-	fputc('\n', root_offs->fp);
-	if (ferror(root_offs->fp) | fclose(root_offs->fp))
-		err(EXIT_FAILURE, "ferror|fclose(root_offs)"); // ENOMEM
-	root_offs->fp = NULL;
-	return true;
-}
-
-// writes term values matching @pfx for a given @doc, ending the line
-// with the contents of @root_offs
-static void dump_roots_term(struct req *req, const char *pfx,
-				struct dump_roots_tmp *drt,
-				struct fbuf *root_offs,
-				Xapian::Document *doc)
-{
-	Xapian::TermIterator cur = doc->termlist_begin();
-	Xapian::TermIterator end = doc->termlist_end();
-	size_t pfx_len = strlen(pfx);
-
-	for (cur.skip_to(pfx); cur != end; cur++) {
-		std::string tn = *cur;
-		if (!starts_with(&tn, pfx, pfx_len))
-			continue;
-		fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
-		fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
-		++req->nr_out;
-	}
-}
-
-// we may have lines which exceed PIPE_BUF, so we do our own
-// buffering and rely on flock(2), here
-static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt)
-{
-	char *p;
-	int fd = fileno(req->fp[0]);
-	bool ok = true;
-
-	if (!drt->wbuf.fp) return true;
-	if (fd < 0) EABORT("BUG: fileno");
-	if (ferror(drt->wbuf.fp) | fclose(drt->wbuf.fp)) // ENOMEM?
-		err(EXIT_FAILURE, "ferror|fclose(drt->wbuf.fp)");
-	drt->wbuf.fp = NULL;
-	if (!drt->wbuf.len) goto done_free;
-	while (flock(drt->root2off_fd, LOCK_EX)) {
-		if (errno == EINTR) continue;
-		err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK?
-	}
-	p = drt->wbuf.ptr;
-	do { // write to client FD
-		ssize_t n = write(fd, p, drt->wbuf.len);
-		if (n > 0) {
-			drt->wbuf.len -= n;
-			p += n;
-		} else {
-			perror(n ? "write" : "write (zero bytes)");
-			return false;
-		}
-	} while (drt->wbuf.len);
-	while (flock(drt->root2off_fd, LOCK_UN)) {
-		if (errno == EINTR) continue;
-		err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK?
-	}
-done_free: // OK to skip on errors, dump_roots_ensure calls fbuf_ensure
-	free(drt->wbuf.ptr);
-	drt->wbuf.ptr = NULL;
-	return ok;
-}
-
-static enum exc_iter dump_roots_iter(struct req *req,
-				struct dump_roots_tmp *drt,
-				Xapian::MSetIterator *i)
-{
-	CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n"
-	try {
-		Xapian::Document doc = i->get_document();
-		if (!root2offs_str(&root_offs, &doc))
-			return ITER_ABORT; // bad request, abort
-		for (int p = 0; p < req->pfxc; p++)
-			dump_roots_term(req, req->pfxv[p], drt,
-					&root_offs, &doc);
-	} catch (const Xapian::DatabaseModifiedError & e) {
-		req->srch->db->reopen();
-		return ITER_RETRY;
-	} catch (const Xapian::DocNotFoundError & e) { // oh well...
-		warnx("doc not found: %s", e.get_description().c_str());
-	}
-	return ITER_OK;
-}
-
 static char *hsearch_enter_key(char *s)
 {
 #if defined(__OpenBSD__) || defined(__DragonFly__)
@@ -499,72 +309,6 @@ static char *hsearch_enter_key(char *s)
 	return s;
 }
 
-static bool cmd_dump_roots(struct req *req)
-{
-	CLEANUP_DUMP_ROOTS struct dump_roots_tmp drt = {};
-	drt.root2off_fd = -1;
-	if ((optind + 1) >= req->argc)
-		ABORT("usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR");
-	if (!req->pfxc)
-		ABORT("dump_roots requires -A PREFIX");
-	const char *root2off_file = req->argv[optind];
-	drt.root2off_fd = open(root2off_file, O_RDONLY);
-	if (drt.root2off_fd < 0)
-		EABORT("open(%s)", root2off_file);
-	if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM?
-		err(EXIT_FAILURE, "fstat(%s)", root2off_file);
-	// each entry is at least 43 bytes ({OIDHEX}\0{INT}\0),
-	// so /32 overestimates the number of expected entries by
-	// ~%25 (as recommended by Linux hcreate(3) manpage)
-	size_t est = (drt.sb.st_size / 32) + 1; //+1 for "\0" termination
-	if ((uint64_t)drt.sb.st_size > (uint64_t)SIZE_MAX)
-		err(EXIT_FAILURE, "%s size too big (%lld bytes > %zu)",
-			root2off_file, (long long)drt.sb.st_size, SIZE_MAX);
-	drt.mm_ptr = mmap(NULL, drt.sb.st_size, PROT_READ,
-				MAP_PRIVATE, drt.root2off_fd, 0);
-	if (drt.mm_ptr == MAP_FAILED)
-		err(EXIT_FAILURE, "mmap(%zu, %s)",
-			drt.sb.st_size, root2off_file);
-	drt.entries = (char **)calloc(est * 2, sizeof(char *));
-	if (!drt.entries)
-		err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *));
-	size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr,
-				drt.sb.st_size, est * 2);
-	if (tot <= 0) return false; // split2argv already warned on error
-	if (!hcreate(est))
-		err(EXIT_FAILURE, "hcreate(%zu)", est);
-	for (size_t i = 0; i < tot; ) {
-		ENTRY e;
-		e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM
-		e.data = drt.entries[i++];
-		if (!hsearch(e, ENTER))
-			err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key,
-					(const char *)e.data);
-	}
-	req->asc = true;
-	req->sort_col = -1;
-	Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]);
-
-	// @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
-	// in case we need to retry on DB reopens
-	for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) {
-		if (!drt.wbuf.fp)
-			fbuf_init(&drt.wbuf);
-		for (int t = 10; t > 0; --t)
-			switch (dump_roots_iter(req, &drt, &i)) {
-			case ITER_OK: t = 0; break; // leave inner loop
-			case ITER_RETRY: break; // continue for-loop
-			case ITER_ABORT: return false; // error
-			}
-		if (!(req->nr_out & 0x3fff) && !dump_roots_flush(req, &drt))
-			return false;
-	}
-	if (!dump_roots_flush(req, &drt))
-		return false;
-	emit_mset_stats(req, &mset);
-	return true;
-}
-
 // for test usage only, we need to ensure the compiler supports
 // __cleanup__ when exceptions are thrown
 struct inspect { struct req *req; };
@@ -588,6 +332,8 @@ static bool cmd_test_inspect(struct req *req)
 	return false;
 }
 
+#include "xh_cidx.h" // CodeSearchIdx.pm stuff
+
 #define CMD(n) { .fn_len = sizeof(#n) - 1, .fn_name = #n, .fn = cmd_##n }
 static const struct cmd_entry {
 	size_t fn_len;
diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h
new file mode 100644
index 00000000..8513d88b
--- /dev/null
+++ b/lib/PublicInbox/xh_cidx.h
@@ -0,0 +1,260 @@
+// Copyright (C) all contributors <meta@public-inbox.org>
+// License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+// This file is only intended to be included by xap_helper.h
+// it implements pieces used by CodeSearchIdx.pm
+
+static void dump_ibx_term(struct req *req, const char *pfx,
+			Xapian::Document *doc, const char *ibx_id)
+{
+	Xapian::TermIterator cur = doc->termlist_begin();
+	Xapian::TermIterator end = doc->termlist_end();
+	size_t pfx_len = strlen(pfx);
+
+	for (cur.skip_to(pfx); cur != end; cur++) {
+		std::string tn = *cur;
+
+		if (starts_with(&tn, pfx, pfx_len)) {
+			fprintf(req->fp[0], "%s %s\n",
+				tn.c_str() + pfx_len, ibx_id);
+			++req->nr_out;
+		}
+	}
+}
+
+static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id,
+				Xapian::MSetIterator *i)
+{
+	try {
+		Xapian::Document doc = i->get_document();
+		for (int p = 0; p < req->pfxc; p++)
+			dump_ibx_term(req, req->pfxv[p], &doc, ibx_id);
+	} catch (const Xapian::DatabaseModifiedError & e) {
+		req->srch->db->reopen();
+		return ITER_RETRY;
+	} catch (const Xapian::DocNotFoundError & e) { // oh well...
+		warnx("doc not found: %s", e.get_description().c_str());
+	}
+	return ITER_OK;
+}
+
+static bool cmd_dump_ibx(struct req *req)
+{
+	if ((optind + 1) >= req->argc)
+		ABORT("usage: dump_ibx [OPTIONS] IBX_ID QRY_STR");
+	if (!req->pfxc)
+		ABORT("dump_ibx requires -A PREFIX");
+
+	const char *ibx_id = req->argv[optind];
+	if (my_setlinebuf(req->fp[0])) // for sort(1) pipe
+		EABORT("setlinebuf(fp[0])"); // WTF?
+	req->asc = true;
+	req->sort_col = -1;
+	Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]);
+
+	// @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
+	// in case we need to retry on DB reopens
+	for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) {
+		for (int t = 10; t > 0; --t)
+			switch (dump_ibx_iter(req, ibx_id, &i)) {
+			case ITER_OK: t = 0; break; // leave inner loop
+			case ITER_RETRY: break; // continue for-loop
+			case ITER_ABORT: return false; // error
+			}
+	}
+	emit_mset_stats(req, &mset);
+	return true;
+}
+
+struct dump_roots_tmp {
+	struct stat sb;
+	void *mm_ptr;
+	char **entries;
+	struct fbuf wbuf;
+	int root2off_fd;
+};
+
+#define CLEANUP_DUMP_ROOTS __attribute__((__cleanup__(dump_roots_ensure)))
+static void dump_roots_ensure(void *ptr)
+{
+	struct dump_roots_tmp *drt = (struct dump_roots_tmp *)ptr;
+	if (drt->root2off_fd >= 0)
+		xclose(drt->root2off_fd);
+	hdestroy(); // idempotent
+	if (drt->mm_ptr && munmap(drt->mm_ptr, drt->sb.st_size))
+		EABORT("BUG: munmap(%p, %zu)", drt->mm_ptr, drt->sb.st_size);
+	free(drt->entries);
+	fbuf_ensure(&drt->wbuf);
+}
+
+static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
+{
+	Xapian::TermIterator cur = doc->termlist_begin();
+	Xapian::TermIterator end = doc->termlist_end();
+	ENTRY e, *ep;
+	fbuf_init(root_offs);
+	for (cur.skip_to("G"); cur != end; cur++) {
+		std::string tn = *cur;
+		if (!starts_with(&tn, "G", 1))
+			continue;
+		union { const char *in; char *out; } u;
+		u.in = tn.c_str() + 1;
+		e.key = u.out;
+		ep = hsearch(e, FIND);
+		if (!ep) ABORT("hsearch miss `%s'", e.key);
+		// ep->data is a NUL-terminated string matching /[0-9]+/
+		fputc(' ', root_offs->fp);
+		fputs((const char *)ep->data, root_offs->fp);
+	}
+	fputc('\n', root_offs->fp);
+	if (ferror(root_offs->fp) | fclose(root_offs->fp))
+		err(EXIT_FAILURE, "ferror|fclose(root_offs)"); // ENOMEM
+	root_offs->fp = NULL;
+	return true;
+}
+
+// writes term values matching @pfx for a given @doc, ending the line
+// with the contents of @root_offs
+static void dump_roots_term(struct req *req, const char *pfx,
+				struct dump_roots_tmp *drt,
+				struct fbuf *root_offs,
+				Xapian::Document *doc)
+{
+	Xapian::TermIterator cur = doc->termlist_begin();
+	Xapian::TermIterator end = doc->termlist_end();
+	size_t pfx_len = strlen(pfx);
+
+	for (cur.skip_to(pfx); cur != end; cur++) {
+		std::string tn = *cur;
+		if (!starts_with(&tn, pfx, pfx_len))
+			continue;
+		fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
+		fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
+		++req->nr_out;
+	}
+}
+
+// we may have lines which exceed PIPE_BUF, so we do our own
+// buffering and rely on flock(2), here
+static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt)
+{
+	char *p;
+	int fd = fileno(req->fp[0]);
+	bool ok = true;
+
+	if (!drt->wbuf.fp) return true;
+	if (fd < 0) EABORT("BUG: fileno");
+	if (ferror(drt->wbuf.fp) | fclose(drt->wbuf.fp)) // ENOMEM?
+		err(EXIT_FAILURE, "ferror|fclose(drt->wbuf.fp)");
+	drt->wbuf.fp = NULL;
+	if (!drt->wbuf.len) goto done_free;
+	while (flock(drt->root2off_fd, LOCK_EX)) {
+		if (errno == EINTR) continue;
+		err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK?
+	}
+	p = drt->wbuf.ptr;
+	do { // write to client FD
+		ssize_t n = write(fd, p, drt->wbuf.len);
+		if (n > 0) {
+			drt->wbuf.len -= n;
+			p += n;
+		} else {
+			perror(n ? "write" : "write (zero bytes)");
+			return false;
+		}
+	} while (drt->wbuf.len);
+	while (flock(drt->root2off_fd, LOCK_UN)) {
+		if (errno == EINTR) continue;
+		err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK?
+	}
+done_free: // OK to skip on errors, dump_roots_ensure calls fbuf_ensure
+	free(drt->wbuf.ptr);
+	drt->wbuf.ptr = NULL;
+	return ok;
+}
+
+static enum exc_iter dump_roots_iter(struct req *req,
+				struct dump_roots_tmp *drt,
+				Xapian::MSetIterator *i)
+{
+	CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n"
+	try {
+		Xapian::Document doc = i->get_document();
+		if (!root2offs_str(&root_offs, &doc))
+			return ITER_ABORT; // bad request, abort
+		for (int p = 0; p < req->pfxc; p++)
+			dump_roots_term(req, req->pfxv[p], drt,
+					&root_offs, &doc);
+	} catch (const Xapian::DatabaseModifiedError & e) {
+		req->srch->db->reopen();
+		return ITER_RETRY;
+	} catch (const Xapian::DocNotFoundError & e) { // oh well...
+		warnx("doc not found: %s", e.get_description().c_str());
+	}
+	return ITER_OK;
+}
+
+static bool cmd_dump_roots(struct req *req)
+{
+	CLEANUP_DUMP_ROOTS struct dump_roots_tmp drt = {};
+	drt.root2off_fd = -1;
+	if ((optind + 1) >= req->argc)
+		ABORT("usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR");
+	if (!req->pfxc)
+		ABORT("dump_roots requires -A PREFIX");
+	const char *root2off_file = req->argv[optind];
+	drt.root2off_fd = open(root2off_file, O_RDONLY);
+	if (drt.root2off_fd < 0)
+		EABORT("open(%s)", root2off_file);
+	if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM?
+		err(EXIT_FAILURE, "fstat(%s)", root2off_file);
+	// each entry is at least 43 bytes ({OIDHEX}\0{INT}\0),
+	// so /32 overestimates the number of expected entries by
+	// ~25% (as recommended by Linux hcreate(3) manpage)
+	size_t est = (drt.sb.st_size / 32) + 1; //+1 for "\0" termination
+	if ((uint64_t)drt.sb.st_size > (uint64_t)SIZE_MAX)
+		err(EXIT_FAILURE, "%s size too big (%lld bytes > %zu)",
+			root2off_file, (long long)drt.sb.st_size, SIZE_MAX);
+	drt.mm_ptr = mmap(NULL, drt.sb.st_size, PROT_READ,
+				MAP_PRIVATE, drt.root2off_fd, 0);
+	if (drt.mm_ptr == MAP_FAILED)
+		err(EXIT_FAILURE, "mmap(%zu, %s)",
+			drt.sb.st_size, root2off_file);
+	drt.entries = (char **)calloc(est * 2, sizeof(char *));
+	if (!drt.entries)
+		err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *));
+	size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr,
+				drt.sb.st_size, est * 2);
+	if (tot <= 0) return false; // split2argv already warned on error
+	if (!hcreate(est))
+		err(EXIT_FAILURE, "hcreate(%zu)", est);
+	for (size_t i = 0; i < tot; ) {
+		ENTRY e;
+		e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM
+		e.data = drt.entries[i++];
+		if (!hsearch(e, ENTER))
+			err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key,
+					(const char *)e.data);
+	}
+	req->asc = true;
+	req->sort_col = -1;
+	Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]);
+
+	// @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
+	// in case we need to retry on DB reopens
+	for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) {
+		if (!drt.wbuf.fp)
+			fbuf_init(&drt.wbuf);
+		for (int t = 10; t > 0; --t)
+			switch (dump_roots_iter(req, &drt, &i)) {
+			case ITER_OK: t = 0; break; // leave inner loop
+			case ITER_RETRY: break; // continue for-loop
+			case ITER_ABORT: return false; // error
+			}
+		if (!(req->nr_out & 0x3fff) && !dump_roots_flush(req, &drt))
+			return false;
+	}
+	if (!dump_roots_flush(req, &drt))
+		return false;
+	emit_mset_stats(req, &mset);
+	return true;
+}

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/7] WIP-cidx
  2023-11-26 14:19 [PATCH 1/7] WIP-cidx-xh-split Eric Wong
@ 2023-11-26 14:19 ` Eric Wong
  2023-11-26 14:19 ` [PATCH 3/7] hval: relative paths Eric Wong
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2023-11-26 14:19 UTC (permalink / raw)
  To: spew

more work
---
 MANIFEST                        |   1 +
 Makefile.PL                     |   8 ++-
 lib/PublicInbox/Search.pm       |  25 ++++++++
 lib/PublicInbox/XapHelper.pm    |  51 ++++++++++-----
 lib/PublicInbox/XapHelperCxx.pm |   6 +-
 lib/PublicInbox/xap_helper.h    | 110 ++++++++++++++++++++++++++------
 lib/PublicInbox/xh_cidx.h       |  37 ++++-------
 lib/PublicInbox/xh_mset.h       |  96 ++++++++++++++++++++++++++++
 t/cindex.t                      |  52 ++++++++++++++-
 t/xap_helper.t                  |  49 ++++++++++++--
 10 files changed, 363 insertions(+), 72 deletions(-)
 create mode 100644 lib/PublicInbox/xh_mset.h

diff --git a/MANIFEST b/MANIFEST
index bbbe0b91..7b6178f9 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -379,6 +379,7 @@ lib/PublicInbox/Xapcmd.pm
 lib/PublicInbox/gcf2_libgit2.h
 lib/PublicInbox/xap_helper.h
 lib/PublicInbox/xh_cidx.h
+lib/PublicInbox/xh_mset.h
 sa_config/Makefile
 sa_config/README
 sa_config/root/etc/spamassassin/public-inbox.pre
diff --git a/Makefile.PL b/Makefile.PL
index 38e030f5..28f8263e 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -273,14 +273,16 @@ pm_to_blib : lib/PublicInbox.pm
 lib/PublicInbox.pm : FORCE
 	VERSION=\$(VERSION) \$(PERL) -w ./version-gen.perl
 
+XH_TESTS = t/xap_helper.t t/cindex.t
+
 test-asan : pure_all
-	TEST_XH_CXX_ONLY=1 CXXFLAGS='-O0 -Wall -ggdb3 -fsanitize=address' \\
-		prove -bvw t/xap_helper.t
+	TEST_XH_CXX_ONLY=1 CXXFLAGS='-Wall -ggdb3 -fsanitize=address' \\
+		prove -bvw \$(XH_TESTS)
 
 VG_OPT = -v --trace-children=yes --track-fds=yes
 VG_OPT += --leak-check=yes --track-origins=yes
 test-valgrind : pure_all
 	TEST_XH_CXX_ONLY=1 VALGRIND="valgrind \$(VG_OPT)" \\
-		prove -bvw t/xap_helper.t
+		prove -bvw \$(XH_TESTS)
 EOF
 }
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 477f77dc..6145b027 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -76,6 +76,25 @@ our @MAIL_VMAP = (
 );
 our @MAIL_NRP;
 
+# Getopt::Long spec, only short options for portability in C++ implementation
+our @XH_SPEC = (
+	'a', # ascending sort
+	'c', # code search
+	'd=s@', # shard dirs
+	'g=s', # git dir (with -c)
+	'k=i', # sort column (like sort(1))
+	'm=i', # maximum number of results
+	'o=i', # offset
+	'p', # show percent
+	'r', # 1=relevance then column
+	't', # collapse threads
+	'A=s@', # prefixes
+	'D', # emit docdata
+	'K=i', # timeout kill after i seconds
+	'O=s', # eidx_key
+	'T=i', # threadid
+);
+
 sub load_xapian () {
 	return 1 if defined $Xap;
 	# n.b. PI_XAPIAN is intended for development use only
@@ -247,6 +266,12 @@ sub mdocid {
 	int(($docid - 1) / $nshard) + 1;
 }
 
+sub docids_to_artnums {
+	my $nshard = shift->{nshard};
+	# XXX does array vs arrayref make a difference in modern Perls?
+	map { int(($_ - 1) / $nshard) + 1 } @_;
+}
+
 sub mset_to_artnums {
 	my ($self, $mset) = @_;
 	my $nshard = $self->{nshard};
diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm
index fe831b8f..b21e70a2 100644
--- a/lib/PublicInbox/XapHelper.pm
+++ b/lib/PublicInbox/XapHelper.pm
@@ -21,21 +21,6 @@ my $X = \%PublicInbox::Search::X;
 our (%SRCH, %WORKERS, $nworker, $workerset, $in);
 our $stderr = \*STDERR;
 
-# only short options for portability in C++ implementation
-our @SPEC = (
-	'a', # ascending sort
-	'c', # code search
-	'd=s@', # shard dirs
-	'k=i', # sort column (like sort(1))
-	'm=i', # maximum number of results
-	'o=i', # offset
-	'r', # 1=relevance then column
-	't', # collapse threads
-	'A=s@', # prefixes
-	'O=s', # eidx_key
-	'T=i', # timeout in seconds
-);
-
 sub cmd_test_inspect {
 	my ($req) = @_;
 	print { $req->{0} } "pid=$$ has_threadid=",
@@ -144,10 +129,44 @@ sub cmd_dump_roots {
 	emit_mset_stats($req, $mset);
 }
 
+sub mset_iter ($$) {
+	my ($req, $it) = @_;
+	eval {
+		my $buf = $it->get_docid;
+		$buf .= "\0".$it->get_percent if $req->{p};
+		my $doc = ($req->{A} || $req->{D}) ? $it->get_document : undef;
+		for my $p (@{$req->{A}}) {
+			$buf .= "\0".$p.$_ for xap_terms($p, $doc);
+		}
+		$buf .= "\0".$doc->get_data if $req->{D};
+		say { $req->{0} } $buf;
+	};
+	$@ ? iter_retry_check($req) : 0;
+}
+
+sub cmd_mset { # to be used by WWW + IMAP
+	my ($req, $qry_str) = @_;
+	$qry_str // die 'usage: mset [OPTIONS] QRY_STR';
+	my $opt = { limit => $req->{'m'}, offset => $req->{o} // 0 };
+	$opt->{relevance} = 1 if $req->{r};
+	$opt->{threads} = 1 if defined $req->{t};
+	$opt->{git_dir} = $req->{g} if defined $req->{g};
+	$opt->{eidx_key} = $req->{O} if defined $req->{O};
+	$opt->{threadid} = $req->{T} if defined $req->{T};
+	my $mset = $req->{srch}->mset($qry_str, $opt);
+	say { $req->{0} } 'mset.size=', $mset->size;
+	for my $it ($mset->items) {
+		for (my $t = 10; $t > 0; --$t) {
+			$t = mset_iter($req, $it) // $t;
+		}
+	}
+}
+
 sub dispatch {
 	my ($req, $cmd, @argv) = @_;
 	my $fn = $req->can("cmd_$cmd") or return;
-	$GLP->getoptionsfromarray(\@argv, $req, @SPEC) or return;
+	$GLP->getoptionsfromarray(\@argv, $req, @PublicInbox::Search::XH_SPEC)
+		or return;
 	my $dirs = delete $req->{d} or die 'no -d args';
 	my $key = join("\0", @$dirs);
 	$req->{srch} = $SRCH{$key} //= do {
diff --git a/lib/PublicInbox/XapHelperCxx.pm b/lib/PublicInbox/XapHelperCxx.pm
index 8a66fdcd..1aa75f2a 100644
--- a/lib/PublicInbox/XapHelperCxx.pm
+++ b/lib/PublicInbox/XapHelperCxx.pm
@@ -20,13 +20,15 @@ $ENV{PERL_INLINE_DIRECTORY} // die('BUG: PERL_INLINE_DIRECTORY unset');
 substr($dir, 0, 0) = "$ENV{PERL_INLINE_DIRECTORY}/";
 my $bin = "$dir/xap_helper";
 my ($srcpfx) = (__FILE__ =~ m!\A(.+/)[^/]+\z!);
-my @srcs = map { $srcpfx.$_ } qw(xap_helper.h xh_cidx.h);
+my @srcs = map { $srcpfx.$_ } qw(xh_mset.h xh_cidx.h xap_helper.h);
 my @pm_dep = map { $srcpfx.$_ } qw(Search.pm CodeSearch.pm);
 my $ldflags = '-Wl,-O1';
 $ldflags .= ' -Wl,--compress-debug-sections=zlib' if $^O ne 'openbsd';
 my $xflags = ($ENV{CXXFLAGS} // '-Wall -ggdb3 -pipe') . ' ' .
 	' -DTHREADID=' . PublicInbox::Search::THREADID .
-	' ' . ($ENV{LDFLAGS} // $ldflags);
+	' -DXH_SPEC="'.join('', map { (my $s = $_) =~ s/=.*/:/; $s }
+		@PublicInbox::Search::XH_SPEC) . '" ' . # $s is a copy; XH_SPEC intact
+	($ENV{LDFLAGS} // $ldflags);
 my $xap_modversion;
 
 sub xap_cfg (@) {
diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h
index 0824ce71..02b6a914 100644
--- a/lib/PublicInbox/xap_helper.h
+++ b/lib/PublicInbox/xap_helper.h
@@ -124,10 +124,12 @@ struct req { // argv and pfxv point into global rbuf
 	char *argv[MY_ARG_MAX];
 	char *pfxv[MY_ARG_MAX]; // -A <prefix>
 	struct srch *srch;
+	char *Pgit_dir;
 	char *Oeidx_key;
 	cmd fn;
 	unsigned long long max;
 	unsigned long long off;
+	unsigned long long threadid;
 	unsigned long timeout_sec;
 	size_t nr_out;
 	long sort_col; // value column, negative means BoolWeight
@@ -138,6 +140,8 @@ struct req { // argv and pfxv point into global rbuf
 	bool collapse_threads;
 	bool code_search;
 	bool relevance; // sort by relevance before column
+	bool emit_percent;
+	bool emit_docdata;
 	bool asc; // ascending sort
 };
 
@@ -230,12 +234,53 @@ static Xapian::MSet mail_mset(struct req *req, const char *qry_str)
 	return enquire_mset(req, &enq);
 }
 
+static bool starts_with(const std::string *s, const char *pfx, size_t pfx_len)
+{
+	return s->size() >= pfx_len && !memcmp(pfx, s->c_str(), pfx_len);
+}
+
+static void apply_roots_filter(struct req *req, Xapian::Query *qry)
+{
+	if (!req->Pgit_dir) return;
+	req->Pgit_dir[0] = 'P'; // modifies static rbuf
+	Xapian::Database *xdb = req->srch->db;
+	for (int i = 0; i < 9; i++) {
+		try {
+			std::string P = req->Pgit_dir;
+			Xapian::PostingIterator p = xdb->postlist_begin(P);
+			if (p == xdb->postlist_end(P)) {
+				warnx("W: %s not indexed?", req->Pgit_dir + 1);
+				return;
+			}
+			Xapian::TermIterator cur = xdb->termlist_begin(*p);
+			Xapian::TermIterator end = xdb->termlist_end(*p);
+			cur.skip_to("G");
+			if (cur == end) {
+				warnx("W: %s has no root commits?",
+					req->Pgit_dir + 1);
+				return;
+			}
+			Xapian::Query f = Xapian::Query(*cur);
+			for (++cur; cur != end; ++cur) {
+				std::string tn = *cur;
+				if (!starts_with(&tn, "G", 1))
+					continue;
+				f = Xapian::Query(Xapian::Query::OP_OR, f, tn);
+			}
+			*qry = Xapian::Query(Xapian::Query::OP_FILTER, *qry, f);
+			return;
+		} catch (const Xapian::DatabaseModifiedError & e) {
+			xdb->reopen();
+		}
+	}
+}
+
 // for cindex
 static Xapian::MSet commit_mset(struct req *req, const char *qry_str)
 {
 	struct srch *srch = req->srch;
 	Xapian::Query qry = srch->qp->parse_query(qry_str, srch->qp_flags);
-	// TODO: git_dir + roots_filter
+	apply_roots_filter(req, &qry);
 
 	// we only want commits:
 	qry = Xapian::Query(Xapian::Query::OP_FILTER, qry,
@@ -254,11 +299,6 @@ static void emit_mset_stats(struct req *req, const Xapian::MSet *mset)
 		ABORT("BUG: %s caller only passed 1 FD", req->argv[0]);
 }
 
-static bool starts_with(const std::string *s, const char *pfx, size_t pfx_len)
-{
-	return s->size() >= pfx_len && !memcmp(pfx, s->c_str(), pfx_len);
-}
-
 static int my_setlinebuf(FILE *fp) // glibc setlinebuf(3) can't report errors
 {
 	return setvbuf(fp, NULL, _IOLBF, 0);
@@ -284,6 +324,32 @@ static void fbuf_init(struct fbuf *fbuf)
 	if (!fbuf->fp) err(EXIT_FAILURE, "open_memstream(fbuf)");
 }
 
+static bool write_all(int fd, const struct fbuf *wbuf, size_t len)
+{
+	const char *p = wbuf->ptr;
+	assert(wbuf->len >= len);
+	do { // write to client FD
+		ssize_t n = write(fd, p, len);
+		if (n > 0) {
+			len -= n;
+			p += n;
+		} else {
+			perror(n ? "write" : "write (zero bytes)");
+			return false;
+		}
+	} while (len);
+	return true;
+}
+
+#define ERR_FLUSH(f) do { \
+	if (ferror(f) | fflush(f)) err(EXIT_FAILURE, "ferror|fflush "#f); \
+} while (0)
+
+#define ERR_CLOSE(f, e) do { \
+	if (ferror(f) | fclose(f)) \
+		e ? err(e, "ferror|fclose "#f) : perror("ferror|fclose "#f); \
+} while (0)
+
 static void xclose(int fd)
 {
 	if (close(fd) < 0 && errno != EINTR)
@@ -332,6 +398,7 @@ static bool cmd_test_inspect(struct req *req)
 	return false;
 }
 
+#include "xh_mset.h" // read-only (WWW, IMAP, lei) stuff
 #include "xh_cidx.h" // CodeSearchIdx.pm stuff
 
 #define CMD(n) { .fn_len = sizeof(#n) - 1, .fn_name = #n, .fn = cmd_##n }
@@ -341,6 +408,7 @@ static const struct cmd_entry {
 	cmd fn;
 } cmds[] = { // should be small enough to not need bsearch || gperf
 	// most common commands first
+	CMD(mset), // WWW and IMAP requests
 	CMD(dump_ibx), // many inboxes
 	CMD(dump_roots), // per-cidx shard
 	CMD(test_inspect), // least common commands last
@@ -513,7 +581,7 @@ static void dispatch(struct req *req)
 	char *end;
 	FILE *kfp;
 	struct srch **s;
-	req->fn = NULL;
+	req->threadid = ULLONG_MAX;
 	for (c = 0; c < (int)MY_ARRAY_SIZE(cmds); c++) {
 		if (cmds[c].fn_len == size &&
 			!memcmp(cmds[c].fn_name, req->argv[0], size)) {
@@ -533,12 +601,13 @@ static void dispatch(struct req *req)
 	optarg = NULL;
 	MY_DO_OPTRESET();
 
-	// keep sync with @PublicInbox::XapHelper::SPEC
-	while ((c = getopt(req->argc, req->argv, "acd:k:m:o:rtA:O:T:")) != -1) {
+	// XH_SPEC is generated from @PublicInbox::Search::XH_SPEC
+	while ((c = getopt(req->argc, req->argv, XH_SPEC)) != -1) {
 		switch (c) {
 		case 'a': req->asc = true; break;
 		case 'c': req->code_search = true; break;
 		case 'd': fwrite(optarg, strlen(optarg) + 1, 1, kfp); break;
+		case 'g': req->Pgit_dir = optarg - 1; break; // pad "P" prefix
 		case 'k':
 			req->sort_col = strtol(optarg, &end, 10);
 			if (*end) ABORT("-k %s", optarg);
@@ -556,6 +625,7 @@ static void dispatch(struct req *req)
 			if (*end || req->off == ULLONG_MAX)
 				ABORT("-o %s", optarg);
 			break;
+		case 'p': req->emit_percent = true; break;
 		case 'r': req->relevance = true; break;
 		case 't': req->collapse_threads = true; break;
 		case 'A':
@@ -563,17 +633,22 @@ static void dispatch(struct req *req)
 			if (MY_ARG_MAX == req->pfxc)
 				ABORT("too many -A");
 			break;
-		case 'O': req->Oeidx_key = optarg - 1; break; // pad "O" prefix
-		case 'T':
+		case 'D': req->emit_docdata = true; break;
+		case 'K':
 			req->timeout_sec = strtoul(optarg, &end, 10);
 			if (*end || req->timeout_sec == ULONG_MAX)
+				ABORT("-K %s", optarg);
+			break;
+		case 'O': req->Oeidx_key = optarg - 1; break; // pad "O" prefix
+		case 'T':
+			req->threadid = strtoull(optarg, &end, 10);
+			if (*end || req->threadid == ULLONG_MAX)
 				ABORT("-T %s", optarg);
 			break;
 		default: ABORT("bad switch `-%c'", c);
 		}
 	}
-	if (ferror(kfp) | fclose(kfp)) /* sets kbuf.srch */
-		err(EXIT_FAILURE, "ferror|fclose"); // likely ENOMEM
+	ERR_CLOSE(kfp, EXIT_FAILURE); // may ENOMEM, sets kbuf.srch
 	kbuf.srch->db = NULL;
 	kbuf.srch->qp = NULL;
 	kbuf.srch->paths_len = size - offsetof(struct srch, paths);
@@ -632,8 +707,7 @@ static void stderr_restore(FILE *tmp_err)
 	stderr = orig_err;
 	return;
 #endif
-	if (ferror(stderr) | fflush(stderr))
-		err(EXIT_FAILURE, "ferror|fflush stderr");
+	ERR_FLUSH(stderr); // flush only: FILE *stderr is reused after dup2 below
 	while (dup2(orig_err_fd, STDERR_FILENO) < 0) {
 		if (errno != EINTR)
 			err(EXIT_FAILURE, "dup2(%d => 2)", orig_err_fd);
@@ -663,12 +737,10 @@ static void recv_loop(void) // worker process loop
 			stderr_set(req.fp[1]);
 		req.argc = (int)SPLIT2ARGV(req.argv, rbuf, len);
 		dispatch(&req);
-		if (ferror(req.fp[0]) | fclose(req.fp[0]))
-			perror("ferror|fclose fp[0]");
+		ERR_CLOSE(req.fp[0], 0);
 		if (req.fp[1]) {
 			stderr_restore(req.fp[1]);
-			if (ferror(req.fp[1]) | fclose(req.fp[1]))
-				perror("ferror|fclose fp[1]");
+			ERR_CLOSE(req.fp[1], 0);
 		}
 	}
 }
diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h
index 8513d88b..49190214 100644
--- a/lib/PublicInbox/xh_cidx.h
+++ b/lib/PublicInbox/xh_cidx.h
@@ -106,8 +106,7 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
 		fputs((const char *)ep->data, root_offs->fp);
 	}
 	fputc('\n', root_offs->fp);
-	if (ferror(root_offs->fp) | fclose(root_offs->fp))
-		err(EXIT_FAILURE, "ferror|fclose(root_offs)"); // ENOMEM
+	ERR_CLOSE(root_offs->fp, EXIT_FAILURE); // ENOMEM
 	root_offs->fp = NULL;
 	return true;
 }
@@ -137,38 +136,24 @@ static void dump_roots_term(struct req *req, const char *pfx,
 // buffering and rely on flock(2), here
 static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt)
 {
-	char *p;
-	int fd = fileno(req->fp[0]);
 	bool ok = true;
+	off_t off = ftello(drt->wbuf.fp);
+	if (off < 0) EABORT("ftello");
+	if (!off) return ok;
+
+	ERR_FLUSH(drt->wbuf.fp); // ENOMEM
+	int fd = fileno(req->fp[0]);
 
-	if (!drt->wbuf.fp) return true;
-	if (fd < 0) EABORT("BUG: fileno");
-	if (ferror(drt->wbuf.fp) | fclose(drt->wbuf.fp)) // ENOMEM?
-		err(EXIT_FAILURE, "ferror|fclose(drt->wbuf.fp)");
-	drt->wbuf.fp = NULL;
-	if (!drt->wbuf.len) goto done_free;
 	while (flock(drt->root2off_fd, LOCK_EX)) {
 		if (errno == EINTR) continue;
 		err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK?
 	}
-	p = drt->wbuf.ptr;
-	do { // write to client FD
-		ssize_t n = write(fd, p, drt->wbuf.len);
-		if (n > 0) {
-			drt->wbuf.len -= n;
-			p += n;
-		} else {
-			perror(n ? "write" : "write (zero bytes)");
-			return false;
-		}
-	} while (drt->wbuf.len);
+	ok = write_all(fd, &drt->wbuf, (size_t)off);
 	while (flock(drt->root2off_fd, LOCK_UN)) {
 		if (errno == EINTR) continue;
 		err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK?
 	}
-done_free: // OK to skip on errors, dump_roots_ensure calls fbuf_ensure
-	free(drt->wbuf.ptr);
-	drt->wbuf.ptr = NULL;
+	if (fseeko(drt->wbuf.fp, 0, SEEK_SET)) EABORT("fseeko");
 	return ok;
 }
 
@@ -239,11 +224,11 @@ static bool cmd_dump_roots(struct req *req)
 	req->sort_col = -1;
 	Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]);
 
+	fbuf_init(&drt.wbuf);
+
 	// @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
 	// in case we need to retry on DB reopens
 	for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) {
-		if (!drt.wbuf.fp)
-			fbuf_init(&drt.wbuf);
 		for (int t = 10; t > 0; --t)
 			switch (dump_roots_iter(req, &drt, &i)) {
 			case ITER_OK: t = 0; break; // leave inner loop
diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h
new file mode 100644
index 00000000..056fe22b
--- /dev/null
+++ b/lib/PublicInbox/xh_mset.h
@@ -0,0 +1,96 @@
+// Copyright (C) all contributors <meta@public-inbox.org>
+// License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+// This file is only intended to be included by xap_helper.h
+// it implements pieces used by WWW, IMAP and lei
+
+static void emit_doc_term(FILE *fp, const char *pfx, Xapian::Document *doc)
+{
+	Xapian::TermIterator cur = doc->termlist_begin();
+	Xapian::TermIterator end = doc->termlist_end();
+	size_t pfx_len = strlen(pfx);
+
+	for (cur.skip_to(pfx); cur != end; cur++) {
+		std::string tn = *cur;
+		if (!starts_with(&tn, pfx, pfx_len)) continue;
+		fputc(0, fp);
+		fwrite(tn.data(), tn.size(), 1, fp);
+	}
+}
+
+static enum exc_iter mset_iter(const struct req *req, FILE *fp, off_t off,
+				Xapian::MSetIterator *i)
+{
+	try {
+		fprintf(fp, "%llu", (unsigned long long)(*(*i))); // get_docid
+		if (req->emit_percent)
+			fprintf(fp, "%c%d", 0, i->get_percent());
+		if (req->pfxc || req->emit_docdata) {
+			Xapian::Document doc = i->get_document();
+			for (int p = 0; p < req->pfxc; p++)
+				emit_doc_term(fp, req->pfxv[p], &doc);
+			if (req->emit_docdata) {
+				std::string d = doc.get_data();
+				fputc(0, fp);
+				fwrite(d.data(), d.size(), 1, fp);
+			}
+		}
+		fputc('\n', fp);
+	} catch (const Xapian::DatabaseModifiedError & e) {
+		req->srch->db->reopen();
+		if (fseeko(fp, off, SEEK_SET) < 0) EABORT("fseeko");
+		return ITER_RETRY;
+	} catch (const Xapian::DocNotFoundError & e) { // oh well...
+		warnx("doc not found: %s", e.get_description().c_str());
+		if (fseeko(fp, off, SEEK_SET) < 0) EABORT("fseeko");
+	}
+	return ITER_OK;
+}
+
+#ifndef WBUF_FLUSH_THRESHOLD
+#	define WBUF_FLUSH_THRESHOLD (BUFSIZ - 1000)
+#endif
+#if WBUF_FLUSH_THRESHOLD < 0
+#	undef WBUF_FLUSH_THRESHOLD
+#	define WBUF_FLUSH_THRESHOLD BUFSIZ
+#endif
+
+static bool cmd_mset(struct req *req)
+{
+	if (optind >= req->argc) ABORT("usage: mset [OPTIONS] QRY_STR");
+	if (req->fp[1]) ABORT("mset only accepts 1 FD");
+	const char *qry_str = req->argv[optind];
+	CLEANUP_FBUF struct fbuf wbuf = {};
+	Xapian::MSet mset = req->code_search ? commit_mset(req, qry_str) :
+						mail_mset(req, qry_str);
+	fbuf_init(&wbuf);
+	fprintf(wbuf.fp, "mset.size=%llu\n", (unsigned long long)mset.size());
+	int fd = fileno(req->fp[0]);
+	for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) {
+		off_t off = ftello(wbuf.fp);
+		if (off < 0) EABORT("ftello");
+		/*
+		 * TODO verify our fflush + fseeko use isn't affected by a
+		 * glibc <2.25 bug:
+		 * https://sourceware.org/bugzilla/show_bug.cgi?id=20181
+		 * CentOS 7.x only has glibc 2.17.  In any case, bug #20181
+		 * shouldn't affect us since our use of fseeko is used to
+		 * effectively discard data.
+		 */
+		if (off > WBUF_FLUSH_THRESHOLD) {
+			ERR_FLUSH(wbuf.fp);
+			if (!write_all(fd, &wbuf, (size_t)off)) return false;
+			if (fseeko(wbuf.fp, 0, SEEK_SET)) EABORT("fseeko");
+			off = 0;
+		}
+		for (int t = 10; t > 0; --t)
+			switch (mset_iter(req, wbuf.fp, off, &i)) {
+			case ITER_OK: t = 0; break; // leave inner loop
+			case ITER_RETRY: break; // continue for-loop
+			case ITER_ABORT: return false; // error
+			}
+	}
+	off_t off = ftello(wbuf.fp);
+	if (off < 0) EABORT("ftello");
+	ERR_FLUSH(wbuf.fp);
+	return off > 0 ? write_all(fd, &wbuf, (size_t)off) : true;
+}
diff --git a/t/cindex.t b/t/cindex.t
index 1a9e564a..ac7a6000 100644
--- a/t/cindex.t
+++ b/t/cindex.t
@@ -121,22 +121,70 @@ my $no_metadata_set = sub {
 
 use_ok 'PublicInbox::CodeSearch';
 
+
+my @xh_args;
+my $exp = [ 'initial with NUL character', 'remove NUL character' ];
+my $zp_git = abs_path("$zp/.git");
 if ('multi-repo search') {
 	my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
 	my $mset = $csrch->mset('NUL');
 	is(scalar($mset->items), 2, 'got results');
-	my $exp = [ 'initial with NUL character', 'remove NUL character' ];
 	my @have = sort(map { $_->get_document->get_data } $mset->items);
 	is_xdeeply(\@have, $exp, 'got expected subjects');
 
 	$mset = $csrch->mset('NUL', { git_dir => "$tmp/wt0/.git" });
 	is(scalar($mset->items), 0, 'no results with other GIT_DIR');
 
-	$mset = $csrch->mset('NUL', { git_dir => abs_path("$zp/.git") });
+	$mset = $csrch->mset('NUL', { git_dir => $zp_git });
 	@have = sort(map { $_->get_document->get_data } $mset->items);
 	is_xdeeply(\@have, $exp, 'got expected subjects w/ GIT_DIR filter');
 	my @xdb = $csrch->xdb_shards_flat;
 	$no_metadata_set->(0, ['indexlevel'], \@xdb);
+	@xh_args = $csrch->xh_args;
+}
+
+my $test_xhc = sub {
+	my ($xhc) = @_;
+	my $impl = $xhc->{impl};
+	my ($r, @l);
+	$r = $xhc->mkreq([], qw(mset -D -c -g), $zp_git, @xh_args, 'NUL');
+	chomp(@l = <$r>);
+	is(shift(@l), 'mset.size=2', "got expected header $impl");
+	my %docid2data;
+	my @got = sort map {
+		my @f = split /\0/;
+		is scalar(@f), 2, 'got 2 entries';
+		$docid2data{$f[0]} = $f[1];
+		$f[1];
+	} @l;
+	is_deeply(\@got, $exp, "expected doc_data $impl");
+
+	$r = $xhc->mkreq([], qw(mset -c -g), "$tmp/wt0/.git", @xh_args, 'NUL');
+	chomp(@l = <$r>);
+	is(shift(@l), 'mset.size=0', "got miss in wrong dir $impl");
+	is_deeply(\@l, [], "no extra lines $impl");
+
+	my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+	while (my ($did, $expect) = each %docid2data) {
+		is_deeply($csrch->xdb->get_document($did)->get_data,
+			$expect, "docid=$did data matches");
+	}
+	ok(!$xhc->{io}->close, "$impl close");
+	is($?, 66 << 8, "got EX_NOINPUT from $impl exit");
+};
+
+SKIP: {
+	require_mods('+SCM_RIGHTS', 1);
+	require PublicInbox::XapClient;
+	my $xhc = PublicInbox::XapClient::start_helper('-j0');
+	$test_xhc->($xhc);
+	skip 'PI_NO_CXX set', 1 if $ENV{PI_NO_CXX};
+	$xhc->{impl} =~ /Cxx/ or
+		skip 'C++ compiler or xapian development libs missing', 1;
+	skip 'TEST_XH_CXX_ONLY set', 1 if $ENV{TEST_XH_CXX_ONLY};
+	local $ENV{PI_NO_CXX} = 1; # force XS or SWIG binding test
+	$xhc = PublicInbox::XapClient::start_helper('-j0');
+	$test_xhc->($xhc);
 }
 
 if ('--update') {
diff --git a/t/xap_helper.t b/t/xap_helper.t
index e3abeded..ee25b2dc 100644
--- a/t/xap_helper.t
+++ b/t/xap_helper.t
@@ -40,6 +40,7 @@ my $v2 = create_inbox 'v2', indexlevel => 'medium', version => 2,
 };
 
 my @ibx_idx = glob("$v2->{inboxdir}/xap*/?");
+my @ibx_shard_args = map { ('-d', $_) } @ibx_idx;
 my (@int) = glob("$crepo/public-inbox-cindex/cidx*/?");
 my (@ext) = glob("$crepo/cidx-ext/cidx*/?");
 is(scalar(@ext), 2, 'have 2 external shards') or diag explain(\@ext);
@@ -76,8 +77,7 @@ my $test = sub {
 	is($cinfo{has_threadid}, '0', 'has_threadid false for cindex');
 	is($cinfo{pid}, $info{pid}, 'PID unchanged for cindex');
 
-	my @dump = (qw(dump_ibx -A XDFID), (map { ('-d', $_) } @ibx_idx),
-			qw(13 rt:0..));
+	my @dump = (qw(dump_ibx -A XDFID), @ibx_shard_args, qw(13 rt:0..));
 	$r = $doreq->($s, @dump);
 	my @res;
 	while (sysread($r, my $buf, 512) != 0) { push @res, $buf }
@@ -89,7 +89,8 @@ my $test = sub {
 	my $res = do { local $/; <$r> };
 	is(join('', @res), $res, 'got identical response w/ error pipe');
 	my $stats = do { local $/; <$err_rd> };
-	is($stats, "mset.size=6 nr_out=6\n", 'mset.size reported');
+	is($stats, "mset.size=6 nr_out=6\n", 'mset.size reported') or
+		diag "res=$res";
 
 	return wantarray ? ($ar, $s) : $ar if $cinfo{pid} == $pid;
 
@@ -198,7 +199,47 @@ for my $n (@NO_CXX) {
 	is(scalar(@res), scalar(grep(/\A[0-9a-f]{40,} [0-9]+\n\z/, @res)),
 		'entries match format');
 	$err = do { local $/; <$err_r> };
-	is($err, "mset.size=6 nr_out=5\n", "got expected status ($xhc->{impl})");
+	is $err, "mset.size=6 nr_out=5\n", "got expected status ($xhc->{impl})";
+
+	$r = $xhc->mkreq([], qw(mset -p -A XDFID -A Q), @ibx_shard_args,
+				'dfn:lib/PublicInbox/Search.pm');
+	chomp((my $hdr, @res) = readline($r));
+	is $hdr, 'mset.size=1', "got expected header via mset ($xhc->{impl})";
+	is scalar(@res), 1, 'got one result';
+	@res = split /\0/, $res[0];
+	{
+		my $doc = $v2->search->xdb->get_document($res[0]);
+		my @q = PublicInbox::Search::xap_terms('Q', $doc);
+		is_deeply \@q, [ $mid ], 'docid usable';
+	}
+	ok $res[1] > 0 && $res[1] <= 100, 'pct > 0 && <= 100';
+	is $res[2], 'XDFID'.$dfid, 'XDFID result matches';
+	is $res[3], 'Q'.$mid, 'Q (msgid) mset result matches';
+	is scalar(@res), 4, 'only 4 columns in result';
+
+	$r = $xhc->mkreq([], qw(mset -p -A XDFID -A Q), @ibx_shard_args,
+				'dt:19700101'.'000000..');
+	chomp(($hdr, @res) = readline($r));
+	is $hdr, 'mset.size=6',
+		"got expected header via multi-result mset ($xhc->{impl})";
+	is(scalar(@res), 6, 'got 6 rows');
+	for my $r (@res) {
+		my ($docid, $pct, @rest) = split /\0/, $r;
+		my $doc = $v2->search->xdb->get_document($docid);
+		ok $pct > 0 && $pct <= 100,
+			"pct > 0 && <= 100 #$docid ($xhc->{impl})";
+		my %terms;
+		for (@rest) {
+			s/\A([A-Z]+)// or xbail 'no prefix=', \@rest;
+			push @{$terms{$1}}, $_;
+		}
+		while (my ($pfx, $vals) = each %terms) {
+			@$vals = sort @$vals;
+			my @q = PublicInbox::Search::xap_terms($pfx, $doc);
+			is_deeply $vals, \@q,
+				"#$docid $pfx as expected ($xhc->{impl})";
+		}
+	}
 }
 
 done_testing;

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/7] hval: relative paths
  2023-11-26 14:19 [PATCH 1/7] WIP-cidx-xh-split Eric Wong
  2023-11-26 14:19 ` [PATCH 2/7] WIP-cidx Eric Wong
@ 2023-11-26 14:19 ` Eric Wong
  2023-11-26 14:19 ` [PATCH 4/7] www_coderepo: load and use cindex join data Eric Wong
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2023-11-26 14:19 UTC (permalink / raw)
  To: spew

---
 lib/PublicInbox/Hval.pm | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm
index e9b9ae64..93848d11 100644
--- a/lib/PublicInbox/Hval.pm
+++ b/lib/PublicInbox/Hval.pm
@@ -13,6 +13,7 @@ our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape
 		to_attr prurl mid_href fmt_ts ts2str utf8_maybe/;
 use POSIX qw(strftime);
 my $enc_ascii = find_encoding('us-ascii');
+use File::Spec;
 
 # safe-ish acceptable filename pattern for portability
 our $FN = '[a-zA-Z0-9][a-zA-Z0-9_\-\.]+[a-zA-Z0-9]'; # needs \z anchor
@@ -69,7 +70,14 @@ sub prurl ($$) {
 		$u = $host_match[0] // $u->[0];
 		# fall through to below:
 	}
-	index($u, '//') == 0 ? "$env->{'psgi.url_scheme'}:$u" : $u;
+	my $dslash = index($u, '//');
+	if ($dslash == 0) {
+		"$env->{'psgi.url_scheme'}:$u"
+	} elsif ($dslash < 0 && substr($u, 0, 1) ne '/') {
+		File::Spec->abs2rel("/$u", $env->{PATH_INFO});
+	} else {
+		$u;
+	}
 }
 
 # for misguided people who believe in this stuff, give them a

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 4/7] www_coderepo: load and use cindex join data
  2023-11-26 14:19 [PATCH 1/7] WIP-cidx-xh-split Eric Wong
  2023-11-26 14:19 ` [PATCH 2/7] WIP-cidx Eric Wong
  2023-11-26 14:19 ` [PATCH 3/7] hval: relative paths Eric Wong
@ 2023-11-26 14:19 ` Eric Wong
  2023-11-26 14:19 ` [PATCH 5/7] cindex: require `-g GIT_DIR' or `-r PROJECT_ROOT' Eric Wong
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2023-11-26 14:19 UTC (permalink / raw)
  To: spew

---
 lib/PublicInbox/CodeSearch.pm    |  55 ++++++++++++----
 lib/PublicInbox/CodeSearchIdx.pm |  42 ++++++------
 lib/PublicInbox/Config.pm        |  35 +++++++++-
 lib/PublicInbox/Search.pm        |   9 +++
 lib/PublicInbox/SolverGit.pm     |   6 +-
 lib/PublicInbox/WWW.pm           |   1 +
 lib/PublicInbox/WwwCoderepo.pm   | 108 ++++++++++++++++++++++++++++++-
 t/cindex.t                       |  28 +++++++-
 xt/solver.t                      |   3 +-
 9 files changed, 248 insertions(+), 39 deletions(-)

diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
index 9051d85f..19bcde93 100644
--- a/lib/PublicInbox/CodeSearch.pm
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -21,7 +21,7 @@ use constant {
 our @CODE_NRP;
 our @CODE_VMAP = (
 	[ AT, 'd:' ], # mairix compat
-	[ AT, 'dt:' ], # mail compat
+	[ AT, 'dt:' ], # public-inbox mail compat
 	[ CT, 'ct:' ],
 );
 
@@ -51,7 +51,7 @@ my %prob_prefix = ( # copied from PublicInbox::Search
 sub new {
 	my ($cls, $dir, $cfg) = @_;
 	# can't have a PublicInbox::Config here due to circular refs
-	bless { xpfx => "$dir/cidx".CIDX_SCHEMA_VER,
+	bless { topdir => $dir, xpfx => "$dir/cidx".CIDX_SCHEMA_VER,
 		-cfg_f => $cfg->{-f} }, $cls;
 }
 
@@ -63,7 +63,20 @@ sub join_data {
 	my $cur = $self->xdb->get_metadata($key) or return;
 	$cur = eval { PublicInbox::Config::json()->decode(uncompress($cur)) };
 	warn "E: $@ (corrupt metadata in `$key' key?)" if $@;
-	$cur;
+	my @m = grep { ref($cur->{$_}) ne 'ARRAY' } qw(ekeys roots ibx2root);
+	if (@m) {
+		warn <<EOM;
+W: $self->{topdir} join data for $self->{-cfg_f} missing: @m
+EOM
+		undef;
+	} elsif (@{$cur->{ekeys}} != @{$cur->{ibx2root}}) {
+		warn <<EOM;
+W: $self->{topdir} join data for $self->{-cfg_f} mismatched ekeys and ibx2root
+EOM
+		undef;
+	} else {
+		$cur;
+	}
 }
 
 sub qparse_new ($) {
@@ -191,21 +204,41 @@ sub roots2paths { # for diagnostics
 			}
 			$size = $mset->size;
 		} while ($size);
-		substr($_, 0, 1, '/') for @$dirs; # s!^P!/!
 		@$dirs = sort @$dirs;
 	}
 	\%ret;
 }
 
-sub paths2roots { # for diagnostics
-	my ($self) = @_;
+sub root_oids ($$) {
+	my ($self, $git_dir) = @_;
+	my @ids = $self->docids_by_postlist('P'.$git_dir);
+	@ids or warn <<"";
+BUG? (non-fatal) `$git_dir' not indexed in $self->{topdir}
+
+	warn <<"" if @ids > 1;
+BUG: (non-fatal) $git_dir indexed multiple times in $self->{topdir}
+
+	my %ret;
+	for my $docid (@ids) {
+		my @oids = xap_terms('G', $self->xdb, $docid);
+		@ret{@oids} = @oids;
+	}
+	sort keys %ret;
+}
+
+sub paths2roots {
+	my ($self, $paths) = @_;
 	my %ret;
-	my $tmp = roots2paths($self);
-	for my $root_oidhex (keys %$tmp) {
-		my $paths = delete $tmp->{$root_oidhex};
-		push @{$ret{$_}}, $root_oidhex for @$paths;
+	if ($paths) {
+		for my $p (keys %$paths) { @{$ret{$p}} = root_oids($self, $p) }
+	} else {
+		my $tmp = roots2paths($self);
+		for my $root_oidhex (keys %$tmp) {
+			my $paths = delete $tmp->{$root_oidhex};
+			push @{$ret{$_}}, $root_oidhex for @$paths;
+		}
+		@$_ = sort(@$_) for values %ret;
 	}
-	@$_ = sort(@$_) for values %ret;
 	\%ret;
 }
 
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index bb1d698b..a6cbe0b0 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -172,7 +172,7 @@ sub count_shards { scalar($_[0]->xdb_shards_flat) }
 sub update_commit ($$$) {
 	my ($self, $cmt, $roots) = @_; # fields from @FMT
 	my $x = 'Q'.$cmt->{H};
-	my ($docid, @extra) = sort { $a <=> $b } docids_by_postlist($self, $x);
+	my ($docid, @extra) = sort { $a <=> $b } $self->docids_by_postlist($x);
 	@extra and warn "W: $cmt->{H} indexed multiple times, pruning ",
 			join(', ', map { "#$_" } @extra), "\n";
 	$self->{xdb}->delete_document($_) for @extra;
@@ -377,15 +377,6 @@ sub seen ($$) {
 # used to select the shard for a GIT_DIR
 sub git_dir_hash ($) { hex(substr(sha256_hex($_[0]), 0, 8)) }
 
-sub docids_by_postlist ($$) { # consider moving to PublicInbox::Search
-	my ($self, $q) = @_;
-	my $cur = $self->{xdb}->postlist_begin($q);
-	my $end = $self->{xdb}->postlist_end($q);
-	my @ids;
-	for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) };
-	@ids;
-}
-
 sub _cb { # run_await cb
 	my ($pid, $cmd, undef, $opt, $cb, $self, $git, @arg) = @_;
 	return if $DO_QUIT;
@@ -452,7 +443,7 @@ sub prep_repo ($$) {
 
 sub check_existing { # retry_reopen callback
 	my ($shard, $self, $git) = @_;
-	my @docids = docids_by_postlist($shard, 'P'.$git->{git_dir});
+	my @docids = $shard->docids_by_postlist('P'.$git->{git_dir});
 	my $docid = shift(@docids) // return get_roots($self, $git);
 	my $doc = $shard->get_doc($docid) //
 			die "BUG: no #$docid ($git->{git_dir})";
@@ -778,7 +769,7 @@ sub prune_init { # via wq_io_do in IDX_SHARDS
 
 sub prune_one { # via wq_io_do in IDX_SHARDS
 	my ($self, $term) = @_;
-	my @docids = docids_by_postlist($self, $term);
+	my @docids = $self->docids_by_postlist($term);
 	for (@docids) {
 		$TXN_BYTES -= $self->{xdb}->get_doclength($_) * 42;
 		$self->{xdb}->delete_document($_);
@@ -894,10 +885,9 @@ sub current_join_data ($) {
 sub score_old_join_data ($$$) {
 	my ($self, $score, $ekeys_new) = @_;
 	my $old = ($JOIN{reset} ? undef : current_join_data($self)) or return;
-	my @old = @$old{qw(ekeys roots ibx2root)};
-	@old == 3 or return warn "W: ekeys/roots missing from old JOIN data\n";
 	progress($self, 'merging old join data...');
-	my ($ekeys_old, $roots_old, $ibx2root_old) = @old;
+	my ($ekeys_old, $roots_old, $ibx2root_old) =
+					@$old{qw(ekeys roots ibx2root)};
 	# score: "ibx_off root_off" => nr
 	my $i = -1;
 	my %root2id_new = map { $_ => ++$i } @OFF2ROOT;
@@ -905,16 +895,24 @@ sub score_old_join_data ($$$) {
 	my %ekey2id_new = map { $_ => ++$i } @$ekeys_new;
 	for my $ibx_off_old (0..$#$ibx2root_old) {
 		my $root_offs_old = $ibx2root_old->[$ibx_off_old];
-		my $ekey = $ekeys_old->[$ibx_off_old] //
-			warn "W: no ibx #$ibx_off_old in old JOIN data\n";
-		my $ibx_off_new = $ekey2id_new{$ekey // next} //
+		my $ekey = $ekeys_old->[$ibx_off_old] // do {
+			warn "W: no ibx #$ibx_off_old in old join data\n";
+			next;
+		};
+		my $ibx_off_new = $ekey2id_new{$ekey} // do {
 			warn "W: `$ekey' no longer exists\n";
+			next;
+		};
 		for (@$root_offs_old) {
 			my ($nr, $rid_old) = @$_;
-			my $root_old = $roots_old->[$rid_old] //
-				warn "W: no root #$rid_old in old JOIN data\n";
-			my $rid_new = $root2id_new{$root_old // next} //
+			my $root_old = $roots_old->[$rid_old] // do {
+				warn "W: no root #$rid_old in old data\n";
+				next;
+			};
+			my $rid_new = $root2id_new{$root_old} // do {
 				warn "W: root `$root_old' no longer exists\n";
+				next;
+			};
 			$score->{"$ibx_off_new $rid_new"} += $nr;
 		}
 	}
@@ -963,7 +961,7 @@ sub do_join {
 		progress($self, "$ekey => $root has $nr matches");
 		push @{$new->{ibx2root}->[$ibx_off]}, [ $nr, $root_off ];
 	}
-	for my $ary (values %$new) { # sort by nr
+	for my $ary (values %$new) { # sort by nr (largest first)
 		for (@$ary) { @$_ = sort { $b->[0] <=> $a->[0] } @$_ }
 	}
 	$new->{ekeys} = \@ekeys;
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index 9bee94b8..c8ecc06b 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -412,8 +412,8 @@ sub get_1 {
 
 sub repo_objs {
 	my ($self, $ibxish) = @_;
-	my $ibx_coderepos = $ibxish->{coderepo} // return;
 	$ibxish->{-repo_objs} // do {
+		my $ibx_coderepos = $ibxish->{coderepo} // return;
 		parse_cgitrc($self, undef, 0);
 		my $coderepos = $self->{-coderepos};
 		my @repo_objs;
@@ -568,6 +568,39 @@ sub _fill_ei ($$) {
 	$es;
 }
 
+sub _fill_csrch ($$) {
+	my ($self, $name) = @_; # "" is a valid name for cindex
+	return if $name ne '' && !valid_foo_name($name, 'cindex');
+	eval { require PublicInbox::CodeSearch } or return;
+	my $pfx = "cindex.$name";
+	my $d = $self->{"$pfx.topdir"} // return;
+	-d $d or return;
+	if (index($d, "\n") >= 0) {
+		warn "E: `$d' must not contain `\\n'\n";
+		return;
+	}
+	my $csrch = PublicInbox::CodeSearch->new($d, $self);
+	for my $k (qw(localprefix)) {
+		my $v = $self->{"$pfx.$k"} // next;
+		$csrch->{$k} = _array($v);
+	}
+	$csrch->{name} = $name;
+	$csrch;
+}
+
+sub lookup_cindex {
+	my ($self, $name) = @_;
+	$self->{-csrch_by_name}->{$name} //= _fill_csrch($self, $name);
+}
+
+sub each_cindex {
+	my ($self, $cb, @arg) = @_;
+	for my $s (grep(m!\Acindex\.[^\./]*\z!, @{$self->{-section_order}})) {
+		my $csrch = lookup_cindex($self, substr($s, length('cindex.')));
+		$cb->($csrch, @arg) if $csrch;
+	}
+}
+
 sub config_cmd {
 	my ($self, $env, $opt) = @_;
 	my $f = $self->{-f} // default_file();
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 6145b027..43f7f52f 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -649,4 +649,13 @@ sub xh_args { # prep getopt args to feed to xap_helper.h socket
 	map { ('-d', $_) } shard_dirs($_[0]);
 }
 
+sub docids_by_postlist ($$) { # returns all docids in the postlist of term $q
+	my ($self, $q) = @_;
+	my $xdb = $self->xdb; # fetch once; don't rely on accessor caching {xdb}
+	my ($cur, $end) = ($xdb->postlist_begin($q), $xdb->postlist_end($q));
+	my @ids;
+	for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) };
+	@ids;
+}
+
 1;
diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm
index ba3c94cb..b0e6cc24 100644
--- a/lib/PublicInbox/SolverGit.pm
+++ b/lib/PublicInbox/SolverGit.pm
@@ -640,9 +640,13 @@ sub resolve_patch ($$) {
 # so user_cb never references the SolverGit object
 sub new {
 	my ($class, $ibx, $user_cb, $uarg) = @_;
+	my $gits = $ibx ? $ibx->{-repo_objs} : undef;
+
+	# FIXME: cindex --join= is super-aggressive and may hit too many
+	$gits = [ @$gits[0..2] ] if $gits && @$gits > 3;
 
 	bless { # $ibx is undef if coderepo only (see WwwCoderepo)
-		gits => $ibx ? $ibx->{-repo_objs} : undef,
+		gits => $gits,
 		user_cb => $user_cb,
 		uarg => $uarg,
 		# -cur_di, -qsp_err, -msg => temp fields for Qspawn callbacks
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 6b616bd4..289599b8 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -189,6 +189,7 @@ sub preload {
 		}
 		$pi_cfg->ALL and require PublicInbox::Isearch;
 		$self->cgit;
+		$self->coderepo;
 		$self->stylesheets_prepare($_) for ('', '../', '../../');
 		$self->news_www;
 	}
diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm
index 0eb4a2d6..9012e786 100644
--- a/lib/PublicInbox/WwwCoderepo.pm
+++ b/lib/PublicInbox/WwwCoderepo.pm
@@ -14,12 +14,14 @@ use PublicInbox::ViewVCS;
 use PublicInbox::WwwStatic qw(r);
 use PublicInbox::GitHTTPBackend;
 use PublicInbox::WwwStream;
-use PublicInbox::Hval qw(ascii_html utf8_maybe);
+use PublicInbox::Hval qw(prurl ascii_html utf8_maybe);
 use PublicInbox::ViewDiff qw(uri_escape_path);
 use PublicInbox::RepoSnapshot;
 use PublicInbox::RepoAtom;
 use PublicInbox::RepoTree;
 use PublicInbox::OnDestroy;
+use URI::Escape qw(uri_escape_utf8);
+use File::Spec;
 
 my @EACH_REF = (qw(git for-each-ref --sort=-creatordate),
 		"--format=%(HEAD)%00".join('%00', map { "%($_)" }
@@ -37,6 +39,74 @@ $ git for-each-ref --sort=-creatordate refs/tags \
 my $NO_HEADS = "# no heads (branches), yet...\n";
 my $NO_TAGS = "# no tags, yet...\n";
 
+sub csrch_load_coderepos { # each_cindex callback
+	my ($csrch, $self, $pi_cfg) = @_;
+	my $name = $csrch->{name};
+	my $cfg_f = $pi_cfg->{-f};
+	my $lpfx = $csrch->{localprefix} or return warn <<EOM;
+W: cindex.$name.localprefix unset in $cfg_f, ignoring cindex.$name
+EOM
+	my $lre = join('|', map { $_ .= '/'; tr!/!/!s; quotemeta } @$lpfx);
+	$lre = qr!\A(?:$lre)!;
+	my $coderepos = $pi_cfg->{-coderepos};
+	my $nick_pfx = $name eq '' ? '' : "$name/";
+	my %dir2cr;
+	for my $p ($csrch->all_terms('P')) {
+		my $nick = $p;
+		$nick =~ s!$lre!$nick_pfx!s or next;
+		$dir2cr{$p} = $coderepos->{$nick} //= do {
+			my $git = PublicInbox::Git->new($p);
+			$git->{nick} = $nick; # for git->pub_urls
+			$git;
+		};
+	}
+	my $jd = $csrch->join_data or return warn <<EOM;
+W: cindex.$name.topdir=$csrch->{topdir} has no usable join data for $cfg_f
+EOM
+	my ($ekeys, $roots, $ibx2root) = @$jd{qw(ekeys roots ibx2root)};
+	my $roots2paths = $csrch->roots2paths;
+	for my $root_offs (@$ibx2root) {
+		my $ekey = shift(@$ekeys) // die 'BUG: {ekeys} empty';
+		scalar(@$root_offs) or next;
+		my $ibx = $pi_cfg->lookup_eidx_key($ekey) // do {
+			warn "W: `$ekey' gone from $cfg_f\n";
+			next;
+		};
+		my $gits = $ibx->{-repo_objs} //= [];
+		my %ibx_p2g = map { $_->{git_dir} => $_ } @$gits;
+		for (@$root_offs) { # sorted by $nr descending
+			my ($nr, $root_off) = @$_;
+			my $root_oid = $roots->[$root_off] // do {
+				warn <<EOM;
+BUG: root #$root_off invalid in join data for `$ekey' with $cfg_f
+EOM
+				next;
+			};
+			my $git_dirs = $roots2paths->{$root_oid};
+			@$git_dirs = grep { !$ibx_p2g{$_} } @$git_dirs;
+			# @$git_dirs or warn "W: no matches for $root_oid\n";
+			for (@$git_dirs) {
+				if (my $git = $dir2cr{$_}) {
+					$ibx_p2g{$_} = $git;
+					$ibx->{-hide}->{www} or
+						push @{$git->{ibx_score}},
+							[ $nr, $ibx->{name} ];
+					push @$gits, $git;
+				} else {
+					warn <<EOM;
+W: no coderepo available for $_ (localprefix=@$lpfx)
+EOM
+				}
+			}
+		}
+		delete $ibx->{-repo_objs} if !@$gits;
+	}
+	for my $git (values %dir2cr) {
+		my $s = $git->{ibx_score};
+		@$s = sort { $b->[0] <=> $a->[0] } @$s if $s;
+	}
+}
+
 # shared with PublicInbox::Cgit
 sub prepare_coderepos {
 	my ($self) = @_;
@@ -62,6 +132,7 @@ sub prepare_coderepos {
 		my $eidx = $pi_cfg->lookup_ei($k) // next;
 		$pi_cfg->repo_objs($eidx);
 	}
+	$pi_cfg->each_cindex(\&csrch_load_coderepos, $self, $pi_cfg);
 }
 
 sub new {
@@ -119,6 +190,40 @@ sub _refs_tags_link {
 		"</a>$align ", ascii_html($s), " ($cd)", @snap_fmt, "\n");
 }
 
+sub emit_joined_inboxes ($) {
+	my ($ctx) = @_;
+	my $names = $ctx->{git}->{ibx_names}; # coderepo directives in config
+	my $score = $ctx->{git}->{ibx_score}; # generated w/ cindex --join
+	($names || $score) or return;
+	my $pi_cfg = $ctx->{wcr}->{pi_cfg};
+	my ($u, $h);
+	my $zfh = $ctx->zfh;
+	print $zfh "\n# associated public inboxes:";
+	my @ns = map { [ 0, $_ ] } @$names;
+	for (@ns, @$score) {
+		my ($nr, $name) = @$_;
+		my $ibx = $pi_cfg->lookup_name($name) // do {
+			warn "W: inbox `$name' gone for $ctx->{git}->{git_dir}";
+			say $zfh '# ', ascii_html($name), ' (missing inbox?)';
+			next;
+		};
+		if (scalar(@{$ibx->{url} // []})) {
+			$u = $h = ascii_html(prurl($ctx->{env}, $ibx->{url}));
+		} else {
+			$h = uri_escape_utf8($name);
+			$h = File::Spec->abs2rel("/$h", "/$ctx->{git}->{nick}");
+			$h = ascii_html($h . '/');
+			$u = ascii_html($name);
+		}
+		if ($nr) {
+			printf $zfh "\n% 11u", $nr;
+		} else {
+			print $zfh "\n", ' 'x11;
+		}
+		print $zfh qq{ <a\nhref="$h">$u</a>};
+	}
+}
+
 sub summary_END { # called via OnDestroy
 	my ($ctx) = @_;
 	my $wcb = delete($ctx->{-wcb}) or return; # already done
@@ -174,6 +279,7 @@ EOM
 	for (@r) { print $zfh _refs_tags_link($_, './', $snap_pfx, @snap_fmt) }
 	print $zfh $NO_TAGS if !@r;
 	print $zfh qq(<a href="refs/tags/">...</a>\n) if $last;
+	emit_joined_inboxes $ctx;
 	$wcb->($ctx->html_done('</pre>'));
 }
 
diff --git a/t/cindex.t b/t/cindex.t
index ac7a6000..afcc226e 100644
--- a/t/cindex.t
+++ b/t/cindex.t
@@ -5,7 +5,7 @@ use v5.12;
 use PublicInbox::TestCommon;
 use Cwd qw(getcwd abs_path);
 use List::Util qw(sum);
-use autodie qw(close open rename);
+use autodie qw(close mkdir open rename);
 require_mods(qw(json Xapian));
 use_ok 'PublicInbox::CodeSearchIdx';
 use PublicInbox::Import;
@@ -227,7 +227,7 @@ SKIP: { # --prune
 }
 
 File::Path::remove_tree("$tmp/ext");
-ok(mkdir("$tmp/ext", 0707), 'create $tmp/ext with odd permissions');
+mkdir("$tmp/ext", 0707);
 ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp]),
 	'external on existing dir');
 {
@@ -265,4 +265,28 @@ EOM
 		'non-Xapian-enabled inbox noted');
 }
 
+# we need to support blank sections for top-level repos
+# (e.g. <https://example.com/my-project>)
+# git.kernel.org could use "pub" as section name, though, since all git repos
+# are currently under //git.kernel.org/pub/**/*
+{
+	mkdir(my $d = "$tmp/blanksection");
+	my $cfg = cfg_new($d, <<EOM);
+[cindex ""]
+	topdir = $tmp/ext
+	localprefix = $tmp
+EOM
+	my $csrch = $cfg->lookup_cindex('');
+	is ref($csrch), 'PublicInbox::CodeSearch', 'codesearch w/ blank name';
+	is_deeply $csrch->{localprefix}, [ "$tmp" ], 'localprefix respected';
+	my $nr = 0;
+	$cfg->each_cindex(sub {
+		my ($cs, @rest) = @_;
+		is $cs->{topdir}, $csrch->{topdir}, 'each_cindex works';
+		is_deeply \@rest, [ '.' ], 'got expected arg';
+		++$nr;
+	}, '.');
+	is $nr, 1, 'iterated through cindices';
+}
+
 done_testing;
diff --git a/xt/solver.t b/xt/solver.t
index 51b4144c..372d003b 100644
--- a/xt/solver.t
+++ b/xt/solver.t
@@ -10,6 +10,7 @@ use_ok($_) for @psgi;
 use_ok 'PublicInbox::WWW';
 my $cfg = PublicInbox::Config->new;
 my $www = PublicInbox::WWW->new($cfg);
+$www->preload;
 my $app = sub {
 	my $env = shift;
 	$env->{'psgi.errors'} = \*STDERR;
@@ -63,7 +64,7 @@ while (my ($ibx_name, $urls) = each %$todo) {
 			skip(qq{[publicinbox "$ibx_name"] not configured},
 				scalar(@$urls));
 		}
-		if (!defined($ibx->{coderepo})) {
+		if (!defined($ibx->{-repo_objs})) {
 			push @gone, $ibx_name;
 			skip(qq{publicinbox.$ibx_name.coderepo not configured},
 				scalar(@$urls));

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 5/7] cindex: require `-g GIT_DIR' or `-r PROJECT_ROOT'
  2023-11-26 14:19 [PATCH 1/7] WIP-cidx-xh-split Eric Wong
                   ` (2 preceding siblings ...)
  2023-11-26 14:19 ` [PATCH 4/7] www_coderepo: load and use cindex join data Eric Wong
@ 2023-11-26 14:19 ` Eric Wong
  2023-11-26 14:19 ` [PATCH 6/7] git: speed up ->git_path for non-worktrees Eric Wong
  2023-11-26 14:19 ` [PATCH 7/7] cindex: set -cfg_f field unconditionally for --show Eric Wong
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2023-11-26 14:19 UTC (permalink / raw)
  To: spew

Accepting @ARGV without switches ends up being ambiguous with
optional parameters for --join and --show.  Requiring users to
specify `--join=' or `--show=' is a bit awkward (as it is with
-clone --objstore= and the like, but that may be historical).
---
 Documentation/public-inbox-cindex.pod |  2 +-
 script/public-inbox-cindex            | 34 +++++++++++++++++----------
 t/cindex-join.t                       |  2 +-
 t/cindex.t                            |  9 +++----
 4 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/Documentation/public-inbox-cindex.pod b/Documentation/public-inbox-cindex.pod
index 3ff394be..0c9c4bdb 100644
--- a/Documentation/public-inbox-cindex.pod
+++ b/Documentation/public-inbox-cindex.pod
@@ -4,7 +4,7 @@ public-inbox-cindex - create and update search for code repositories
 
 =head1 SYNOPSIS
 
-public-inbox-cindex [OPTIONS] GIT_DIR...
+public-inbox-cindex [OPTIONS] -g GIT_DIR [-g GIT_DIR]
 
 public-inbox-cindex [OPTIONS] --update
 
diff --git a/script/public-inbox-cindex b/script/public-inbox-cindex
index 97890c1b..d615a8ca 100755
--- a/script/public-inbox-cindex
+++ b/script/public-inbox-cindex
@@ -4,8 +4,8 @@
 use v5.12;
 use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
 my $help = <<EOF; # the following should fit w/o scrolling in 80x24 term:
-usage: public-inbox-cindex [options] GIT_DIR...
-usage: public-inbox-cindex [options] --project-list=FILE PROJECT_ROOT
+usage: public-inbox-cindex [options] -g GIT_DIR...
+usage: public-inbox-cindex [options] --project-list=FILE -r PROJECT_ROOT
 
   Create and update search indices for code repos
 
@@ -29,7 +29,8 @@ GetOptions($opt, qw(quiet|q verbose|v+ reindex jobs|j=i fsync|sync! dangerous
 		indexlevel|index-level|L=s join:s@
 		batch_size|batch-size=s max_size|max-size=s
 		include|I=s@ only=s@ all show:s@
-		project-list=s exclude=s@
+		project-list=s exclude=s@ project-root|r=s
+		git-dir|g=s@
 		sort-parallel=s sort-compress-program=s sort-buffer-size=s
 		d=s update|u scan! prune dry-run|n C=s@ help|h))
 	or die $help;
@@ -50,23 +51,32 @@ PublicInbox::Admin::progress_prepare($opt);
 my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
 %ENV = (%ENV, %$env) if $env;
 
-require PublicInbox::CodeSearchIdx; # unstable internal API
 my @git_dirs;
+require PublicInbox::CodeSearchIdx; # unstable internal API
 if (defined(my $pl = $opt->{'project-list'})) {
-	my $pfx = shift @ARGV // die <<EOM;
+	my $pfx = $opt->{'project-root'} // die <<EOM;
 PROJECTS_ROOT required for --project-list
 EOM
-	@ARGV and die <<EOM;
---project-list does not accept additional directories
-(@ARGV)
-beyond `$pfx'
+	$opt->{'git-dir'} and die <<EOM;
+--project-list does not accept additional --git-dir directories
+(@{$opt->{'git-dir'}})
 EOM
 	open my $fh, '<', $pl or die "open($pl): $!\n";
 	chomp(@git_dirs = <$fh>);
-	$_ = PublicInbox::Admin::resolve_git_dir("$pfx/$_") for @git_dirs;
-} else {
-	@git_dirs = map { PublicInbox::Admin::resolve_git_dir($_) } @ARGV;
+	$pfx .= '/';
+	$pfx =~ tr!/!/!s;
+	substr($_, 0, 0, $pfx) for @git_dirs;
+} elsif (my $gd = $opt->{'git-dir'}) {
+	@git_dirs = @$gd;
+} elsif (@ARGV) {
+	my @g = map { "-g $_" } @ARGV;
+	die <<EOM;
+Specify git directories with `-g' (or --git-dir=): @g
+Or use --project-list=... and --project-root=...
+EOM
 }
+
+$_ = PublicInbox::Admin::resolve_git_dir($_) for @git_dirs;
 if (defined $cidx_dir) { # external index
 	die "`%' is not allowed in $cidx_dir\n" if $cidx_dir =~ /\%/;
 	my $cidx = PublicInbox::CodeSearchIdx->new($cidx_dir, $opt);
diff --git a/t/cindex-join.t b/t/cindex-join.t
index 2836eb6c..8d0b09d2 100644
--- a/t/cindex-join.t
+++ b/t/cindex-join.t
@@ -70,7 +70,7 @@ my $cidxdir = "$tmpdir/cidx";
 my $rdr = { 1 => \my $cout, 2 => \my $cerr };
 ok run_script([qw(-cindex -v --all --show=join_data),
 		'--join=aggressive,dt:..2022-12-01',
-		'-d', $cidxdir, values %code ],
+		'-d', $cidxdir, map { ('-g', $_) } values %code ],
 		$env, $rdr), 'initial join inboxes w/ coderepos';
 my $out = PublicInbox::Config->json->decode($cout);
 is($out->{join_data}->{dt}->[0], '19700101'.'000000',
diff --git a/t/cindex.t b/t/cindex.t
index afcc226e..134df4ea 100644
--- a/t/cindex.t
+++ b/t/cindex.t
@@ -33,7 +33,7 @@ git gc -q
 EOM
 }; # /create_coderepo
 
-ok(run_script([qw(-cindex --dangerous -q), "$tmp/wt0"]), 'cindex internal');
+ok(run_script([qw(-cindex --dangerous -q -g), "$tmp/wt0"]), 'cindex internal');
 {
 	my $exists = -e "$tmp/wt0/.git/public-inbox-cindex/cidx.lock";
 	my @st = stat(_);
@@ -67,13 +67,14 @@ git gc -q
 EOM
 }; # /create_coderepo
 
-ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp, "$tmp/wt0"]),
+ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext",
+		'-g', $zp, '-g', "$tmp/wt0" ]),
 	'cindex external');
 ok(-e "$tmp/ext/cidx.lock", 'external dir created');
 ok(!-d "$zp/.git/public-inbox-cindex", 'no cindex in original coderepo');
 
 ok(run_script([qw(-cindex -L medium --dangerous -q -d),
-	"$tmp/med", $zp, "$tmp/wt0"]), 'cindex external medium');
+	"$tmp/med", '-g', $zp, '-g', "$tmp/wt0"]), 'cindex external medium');
 
 
 SKIP: {
@@ -228,7 +229,7 @@ SKIP: { # --prune
 
 File::Path::remove_tree("$tmp/ext");
 mkdir("$tmp/ext", 0707);
-ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp]),
+ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", '-g', $zp]),
 	'external on existing dir');
 {
 	my @st = stat("$tmp/ext/cidx.lock");

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 6/7] git: speed up ->git_path for non-worktrees
  2023-11-26 14:19 [PATCH 1/7] WIP-cidx-xh-split Eric Wong
                   ` (3 preceding siblings ...)
  2023-11-26 14:19 ` [PATCH 5/7] cindex: require `-g GIT_DIR' or `-r PROJECT_ROOT' Eric Wong
@ 2023-11-26 14:19 ` Eric Wong
  2023-11-26 14:19 ` [PATCH 7/7] cindex: set -cfg_f field unconditionally for --show Eric Wong
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2023-11-26 14:19 UTC (permalink / raw)
  To: spew

Only worktrees need to use `git rev-parse --git-path', so avoid
the spawn overhead of a new process.  With the SolverGit.pm
limit on coderepo scans disabled and scanning over 800 git repos
for git@vger matches, this reduces xt/solver.t times by
roughly 25%.
---
 lib/PublicInbox/Git.pm | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index fe834210..6c7c11b6 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -99,15 +99,15 @@ sub new {
 
 sub git_path ($$) {
 	my ($self, $path) = @_;
-	$self->{-git_path}->{$path} //= do {
+	$self->{-git_path}->{$path} // do {
+		return "$self->{git_dir}/$path" if -d $self->{git_dir};
+
 		local $/ = "\n";
-		chomp(my $str = $self->qx(qw(rev-parse --git-path), $path));
+		chomp(my $s = $self->qx(qw(rev-parse --git-path), $path));
 
 		# git prior to 2.5.0 did not understand --git-path
-		if ($str eq "--git-path\n$path") {
-			$str = "$self->{git_dir}/$path";
-		}
-		$str;
+		$self->{-git_path}->{$path} = $s eq "--git-path\n$path" ?
+						"$self->{git_dir}/$path" : $s;
 	};
 }
 

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 7/7] cindex: set -cfg_f field unconditionally for --show
  2023-11-26 14:19 [PATCH 1/7] WIP-cidx-xh-split Eric Wong
                   ` (4 preceding siblings ...)
  2023-11-26 14:19 ` [PATCH 6/7] git: speed up ->git_path for non-worktrees Eric Wong
@ 2023-11-26 14:19 ` Eric Wong
  5 siblings, 0 replies; 7+ messages in thread
From: Eric Wong @ 2023-11-26 14:19 UTC (permalink / raw)
  To: spew

There's also no need to local-ize the field, since it's
not going to hold other references and the CodeSearch*
objects are tied to the config file anyways.
---
 lib/PublicInbox/CodeSearchIdx.pm | 5 ++---
 t/cindex-join.t                  | 5 +++++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index a6cbe0b0..d49e9a8d 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -1131,8 +1131,6 @@ sub init_join_prefork ($) {
 	} split(/,/, join(',', @$subopt));
 	require PublicInbox::CidxXapHelperAux;
 	require PublicInbox::XapClient;
-	my $cfg = $self->{-opt}->{-pi_cfg} // die 'BUG: -pi_cfg unset';
-	$self->{-cfg_f} = $cfg->{-f} = rel2abs_collapsed($cfg->{-f});
 	my @unknown;
 	my $pfx = $JOIN{prefixes} // 'patchid';
 	for (split /\+/, $pfx) {
@@ -1223,7 +1221,8 @@ sub cidx_run { # main entry point
 				$PublicInbox::SearchIdx::BATCH_BYTES;
 	local $MAX_SIZE = $self->{-opt}->{max_size};
 	local $self->{PENDING} = {}; # used by PublicInbox::CidxXapHelperAux
-	local $self->{-cfg_f};
+	my $cfg = $self->{-opt}->{-pi_cfg} // die 'BUG: -pi_cfg unset';
+	$self->{-cfg_f} = $cfg->{-f} = rel2abs_collapsed($cfg->{-f});
 	if (grep { $_ } @{$self->{-opt}}{qw(prune join)}) {
 		require File::Temp;
 		$TMPDIR = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1);
diff --git a/t/cindex-join.t b/t/cindex-join.t
index 8d0b09d2..323d69c6 100644
--- a/t/cindex-join.t
+++ b/t/cindex-join.t
@@ -79,4 +79,9 @@ is($out->{join_data}->{dt}->[0], '19700101'.'000000',
 ok run_script([qw(-cindex -v --all -u --join --show),
 		'-d', $cidxdir], $env, $rdr), 'incremental --join';
 
+ok run_script([qw(-cindex -v --no-scan --show),
+		'-d', $cidxdir], $env, $rdr), 'show';
+$out = PublicInbox::Config->json->decode($cout);
+is ref($out->{join_data}), 'HASH', 'got hash join data';
+is $cerr, '', 'no warnings or errors in stderr w/ --show';
 done_testing;

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-11-26 14:19 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-26 14:19 [PATCH 1/7] WIP-cidx-xh-split Eric Wong
2023-11-26 14:19 ` [PATCH 2/7] WIP-cidx Eric Wong
2023-11-26 14:19 ` [PATCH 3/7] hval: relative paths Eric Wong
2023-11-26 14:19 ` [PATCH 4/7] www_coderepo: load and use cindex join data Eric Wong
2023-11-26 14:19 ` [PATCH 5/7] cindex: require `-g GIT_DIR' or `-r PROJECT_ROOT' Eric Wong
2023-11-26 14:19 ` [PATCH 6/7] git: speed up ->git_path for non-worktrees Eric Wong
2023-11-26 14:19 ` [PATCH 7/7] cindex: set -cfg_f field unconditionally for --show Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).