dumping ground for random patches and texts
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: spew@80x24.org
Subject: [PATCH 2/2] WIP-reposearchidx
Date: Fri, 17 Feb 2023 04:01:13 +0000	[thread overview]
Message-ID: <20230217040113.111644-2-e@80x24.org> (raw)
In-Reply-To: <20230217040113.111644-1-e@80x24.org>

WIP
---
 MANIFEST                         |   2 +
 lib/PublicInbox/CodeSearch.pm    |  44 ++++++++++
 lib/PublicInbox/CodeSearchIdx.pm | 146 +++++++++++++++++++++++++++++++
 lib/PublicInbox/Search.pm        |  43 +++++----
 lib/PublicInbox/SearchIdx.pm     |  38 ++++----
 script/public-inbox-cindex       |  70 +++++++++++++++
 6 files changed, 307 insertions(+), 36 deletions(-)
 create mode 100644 lib/PublicInbox/CodeSearch.pm
 create mode 100644 lib/PublicInbox/CodeSearchIdx.pm
 create mode 100755 script/public-inbox-cindex

diff --git a/MANIFEST b/MANIFEST
index aba32762..6c068b18 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -161,6 +161,7 @@ lib/PublicInbox/AltId.pm
 lib/PublicInbox/AutoReap.pm
 lib/PublicInbox/Cgit.pm
 lib/PublicInbox/CmdIPC4.pm
+lib/PublicInbox/CodeSearchIdx.pm
 lib/PublicInbox/CompressNoop.pm
 lib/PublicInbox/Config.pm
 lib/PublicInbox/ConfigIter.pm
@@ -363,6 +364,7 @@ sa_config/README
 sa_config/root/etc/spamassassin/public-inbox.pre
 sa_config/user/.spamassassin/user_prefs
 script/lei
+script/public-inbox-cindex
 script/public-inbox-clone
 script/public-inbox-compact
 script/public-inbox-convert
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
new file mode 100644
index 00000000..d7dfdb08
--- /dev/null
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -0,0 +1,44 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# read-only external index for coderepos.
+package PublicInbox::CodeSearch;
+use v5.12;
+use parent qw(PublicInbox::Search);
+use constant {
+	AT => 0, # author time
+	CT => 1, # commit time # should we even care to index this?
+	SCHEMA_VERSION => 1,
+};
+
+# bool_pfx_internal:
+# 	type => 'T', # 'c' - commit, 'r' - repo GIT_DIR
+# 	tags are not indexed, only normal branches (refs/heads/*), not hidden
+
+# note: the non-X term prefix allocations are shared with
+# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
+my %bool_pfx_external = (
+	oid => 'Q', # type:commit - git OID hex (40|64)-byte SHA-(1|256)
+		# type:repo - rel2abs_collapsed(GIT_DIR)
+	%PublicInbox::Search::COMMIT_BOOL_COMMON, # df* and patchid
+	# we use O/eidx_key to store GIT_DIR key
+);
+
+my %prob_prefix = ( # copied from PublicInbox::Search
+	# do we care about committer? or partial commit OID?
+	# o => 'XQ', # 'oid:' (bool) is exact, 'o:' (prob) can do partial
+	%PublicInbox::Search::COMMIT_PROB_COMMON,
+
+	# default:
+	'' => 'S A XQUOT XFN ' . $PublicInbox::Search::NON_QUOTED_BODY
+);
+
+# read-only; WIP: only parses $query_string and returns the parsed query
+sub mset {
+	my ($self, $query_string, $opt) = @_; # $opt is not used yet
+	my $qp = $self->{qp} //= cqparse_new($self); # NOTE(review): undefined here
+	my $query = $qp->parse_query($query_string, $self->{qp_flags});
+}
+
+
+1;
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
new file mode 100644
index 00000000..8ba2214b
--- /dev/null
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -0,0 +1,146 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# indexer for git coderepos, just commits for now
+package PublicInbox::CodeSearchIdx;
+use v5.12;
+use parent qw(PublicInbox::Lock PublicInbox::SearchIdx);
+use Socket qw(AF_UNIX MSG_EOR SOCK_SEQPACKET);
+use constant FROM_DATE => length(" Mon Sep 17 00:00:00 2001\n");
+use PublicInbox::Eml;
+use PublicInbox::DS ();
+use PublicInbox::InboxWritable ();
+use PublicInbox::IPC qw(nproc_shards);
+use PublicInbox::Admin;
+use POSIX qw(WNOHANG);
+
+our $SEEN_MAX = 100000; # w/o reading trees, git walks commits quickly
+my @FMT = qw(H cn ce ct an ae at s b); # no --all below: stdin OIDs only
+my @LOG = (qw(log --no-decorate --no-color --no-notes -p -M
+	--stdin --no-walk=unsorted), '--pretty=format:%x00'.
+	join('%n', map { "%$_" } @FMT).'%x00');
+
+sub new { # args: (class, ignored), $dir, \%opt; blesses into this package
+	my (undef, $dir, $opt) = @_;
+	my $l = $opt->{indexlevel} // 'full';
+	$l !~ $PublicInbox::SearchIdx::INDEXLEVELS and
+		die "invalid indexlevel=$l\n";
+	$l eq 'basic' and die "E: indexlevel=basic not supported\n";
+	my $self = bless { # NOTE(review): nothing visible loads CodeSearch.pm
+		xpfx => "$dir/c".PublicInbox::CodeSearch::SCHEMA_VERSION,
+		cidx_dir => $dir,
+		creat => $opt->{creat},
+		indexlevel => $l,
+		transact_bytes => 0,
+		total_bytes => 0,
+		current_info => '',
+		parallel => 1,
+		lock_path => "$dir/cidx.lock",
+	}, __PACKAGE__;
+	$self->{nshard} = $self->count_shards || # nonzero count wins
+		nproc_shards({nproc => $opt->{jobs}});
+	$self->{-no_fsync} = 1 if !$opt->{fsync};
+	$self->{-dangerous} = 1 if $opt->{dangerous};
+	$self;
+}
+
+# TODO: may be used for reshard/compact
+sub count_shards { scalar($_[0]->xdb_shards_flat) }
+
+# sharded reader: runs `git log --stdin' on $r, parses NUL-delimited records
+sub commit_reader {
+	my ($self, $r, $c) = @_; # $r = OID pipe, $c = SOCK_SEQPACKET consumer
+	my ($H, $ct, $buf);
+
+	# the parent process of this shard process writes directly to
+	# the stdin of `git log', we consume git log's stdout:
+	my $rd = $self->{git}->popen(@LOG, undef, { 0 => $r });
+	close $r or die "close: $!";
+
+	local $/ = "\0";
+	$buf = <$rd> // return; # leading "\0"
+	my (%c, $hdr);
+	while (defined($hdr = <$rd>)) {
+		chomp($hdr);
+		@c{@FMT} = split(/\n/, $hdr, scalar(@FMT)); # %b keeps its LFs
+		chomp($c{patch} = <$rd> // ''); # EOF: last commit has no patch
+		use Data::Dumper; # XXX debug output only, WIP
+		warn Dumper(\%c);
+		# $self->add_xapian($eml, $smsg, [ $H ]);
+	}
+}
+
+# this is a bit of weird pipe+process structure unseen anywhere else in
+# our code base, but maybe LeiToMail can take advantage of it someday
+sub stream_log { # forks one commit_reader per shard, feeds them unseen OIDs
+	my ($self) = @_;
+	my (%pids, @pipes, $err);
+	my $reap = sub { # SIGCHLD handler (non-blocking) or direct call (blocks)
+		my ($sig) = @_;
+		do {
+			my $pid = waitpid(-1, $sig ? WNOHANG : 0) or return;
+			return if $pid < 0;
+			my $j = delete $pids{$pid} // "unknown PID:$pid";
+			if ($?) {
+				warn "$j exited with \$?=$?\n";
+				$err = 1;
+			}
+		} while ($sig);
+	};
+
+	local $SIG{CHLD} = $reap;
+	my $nshard = $self->{nshard};
+	for my $n (0..($nshard - 1)) {
+		pipe(my ($r, $w)) or die "pipe: $!";
+		$w->autoflush(1);
+		push @pipes, $w;
+		my $sigset = PublicInbox::DS::block_signals();
+		my $pid = fork // die "fork: $!";
+		if ($pid == 0) {
+			$0 = "code index [$n]";
+			for (@pipes) { close($_) or die "close: $!" }
+			for (qw(TTOU TTIN TERM QUIT INT CHLD)) {
+				$SIG{$_} = 'DEFAULT';
+			}
+			PublicInbox::DS::sig_setmask($sigset);
+			eval { commit_reader($self, $r) };
+			warn "E: $@" if $@;
+			POSIX::_exit($@ ? 1 : 0);
+		} # child never gets here; record it before unblocking signals
+		$pids{$pid} = "code index [$n]";
+		PublicInbox::DS::sig_setmask($sigset); # was left blocked in parent
+		close($r) or die "close: $!";
+	}
+
+	# children all running, now feed them anything we haven't seen:
+	my $fh = $self->{git}->popen(qw(rev-list --all));
+	my ($H, $n, $seen);
+	while (defined($H = <$fh>)) {
+		chomp $H;
+		if (seen($self, $H)) {
+			last if ++$seen > $SEEN_MAX;
+		} else {
+			$n = hex(substr($H, 0, 4)) % $nshard; # shard by OID prefix
+			say { $pipes[$n] } $H or die "say: $!";
+			$seen = 0;
+		}
+	}
+	close $fh or die "close: $!";
+	for (@pipes) { close($_) or die "close: $!" } # EOF to each `git log'
+	$reap->() while %pids;
+	die "subprocess(es) failed\n" if $err;
+}
+
+sub git { $_[0]->{git} } # for PublicInbox::SearchIdx
+
+sub cidx_sync { # main entry point; WIP stub, nothing is indexed here yet
+	my ($self, $opt) = @_; # $opt is not read yet
+	my $warn_cb = $SIG{__WARN__} || \&CORE::warn; # saved but unused so far
+	local $self->{current_info} = ''; # reset for the duration of the call
+}
+
+# bypass PublicInbox::SearchIdx::with_umask:
+no warnings 'once';
+*with_umask = \&PublicInbox::InboxWritable::with_umask;
+
+1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 86219dfe..c51663ff 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -110,43 +110,50 @@ sub load_xapian () {
 # a prefix common in patch emails
 our $LANG = 'english';
 
+our %COMMIT_BOOL_COMMON = (
+	dfpre => 'XDFPRE',
+	dfpost => 'XDFPOST',
+	dfblob => 'XDFPRE XDFPOST',
+	patchid => 'XDFID',
+);
+
 # note: the non-X term prefix allocations are shared with
 # Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
 my %bool_pfx_external = (
 	mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
 	lid => 'G', # newsGroup (or similar entity), just inside <>
-	dfpre => 'XDFPRE',
-	dfpost => 'XDFPOST',
-	dfblob => 'XDFPRE XDFPOST',
-	patchid => 'XDFID',
+	%COMMIT_BOOL_COMMON
 );
 
-my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
-my %prob_prefix = (
-	# for mairix compatibility
+# for mairix compatibility
+our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
+our %COMMIT_PROB_COMMON = (
 	s => 'S',
-	m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
-	l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
 	f => 'A',
-	t => 'XTO',
-	tc => 'XTO XCC',
-	c => 'XCC',
-	tcf => 'XTO XCC A',
-	a => 'XTO XCC A',
-	b => $non_quoted_body . ' XQUOT',
-	bs => $non_quoted_body . ' XQUOT S',
+	b => $NON_QUOTED_BODY . ' XQUOT',
+	bs => $NON_QUOTED_BODY . ' XQUOT S',
 	n => 'XFN',
 
 	q => 'XQUOT',
-	nq => $non_quoted_body,
+	nq => $NON_QUOTED_BODY,
 	dfn => 'XDFN',
 	dfa => 'XDFA',
 	dfb => 'XDFB',
 	dfhh => 'XDFHH',
 	dfctx => 'XDFCTX',
+);
 
+my %prob_prefix = (
+	m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
+	l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
+	t => 'XTO',
+	tc => 'XTO XCC',
+	c => 'XCC',
+	tcf => 'XTO XCC A',
+	a => 'XTO XCC A',
+	%COMMIT_PROB_COMMON,
 	# default:
-	'' => 'XM S A XQUOT XFN ' . $non_quoted_body,
+	'' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY,
 );
 
 # not documenting m: and mid: for now, the using the URLs works w/o Xapian
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 257b83a5..66f688de 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -134,6 +134,7 @@ sub idx_acquire {
 		load_xapian_writable();
 		$flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN;
 	}
+	my $owner = $self->{ibx} // $self->{eidx} // $self;
 	if ($self->{creat}) {
 		require File::Path;
 		$self->lock_acquire;
@@ -146,13 +147,11 @@ sub idx_acquire {
 			require PublicInbox::Syscall;
 			PublicInbox::Syscall::nodatacow_dir($dir);
 			$self->{-set_has_threadid_once} = 1;
-			if (($self->{ibx} // $self->{eidx})->{-dangerous}) {
-				$flag |= $DB_DANGEROUS;
-			}
+			$flag |= $DB_DANGEROUS if $owner->{-dangerous};
 		}
 	}
 	return unless defined $flag;
-	$flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync};
+	$flag |= $DB_NO_SYNC if $owner->{-no_fsync};
 	my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) };
 	croak "Failed opening $dir: $@" if $@;
 	$self->{xdb} = $xdb;
@@ -376,9 +375,9 @@ sub index_xapian { # msg_iter callback
 		$fh->autoflush(1);
 		print $fh $s or die "print: $!";
 		sysseek($fh, 0, SEEK_SET) or die "sysseek: $!";
-		my $id = ($self->{ibx} // $self->{eidx})->git->qx(
-						[qw(patch-id --stable)],
-						{}, { 0 => $fh, 2 => $eh });
+		my $id = ($self->{git} // ($self->{ibx} // $self->{eidx})->git
+			)->qx([qw(patch-id --stable)], {},
+				{ 0 => $fh, 2 => $eh });
 		$id =~ /\A([a-f0-9]{40,})/ and $doc->add_term('XDFID'.$1);
 		seek($eh, 0, SEEK_SET) or die "seek: $!";
 		while (<$eh>) { warn $_ }
@@ -414,14 +413,16 @@ sub index_list_id ($$$) {
 
 sub index_ids ($$$$) {
 	my ($self, $doc, $hdr, $mids) = @_;
-	for my $mid (@$mids) {
-		index_phrase($self, $mid, 1, 'XM');
+	if (!$self->{-repo_idx}) {
+		for my $mid (@$mids) {
+			index_phrase($self, $mid, 1, 'XM');
 
-		# because too many Message-IDs are prefixed with
-		# "Pine.LNX."...
-		if ($mid =~ /\w{12,}/) {
-			my @long = ($mid =~ /(\w{3,}+)/g);
-			index_phrase($self, join(' ', @long), 1, 'XM');
+			# because too many Message-IDs are prefixed with
+			# "Pine.LNX."...
+			if ($mid =~ /\w{12,}/) {
+				my @long = ($mid =~ /(\w{3,}+)/g);
+				index_phrase($self, join(' ', @long), 1, 'XM');
+			}
 		}
 	}
 	$doc->add_boolean_term('Q' . $_) for @$mids;
@@ -438,10 +439,11 @@ sub eml2doc ($$$;$) {
 	add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
 	my $dt = strftime('%Y%m%d%H%M%S', @ds);
 	add_val($doc, PublicInbox::Search::DT(), $dt);
-	add_val($doc, PublicInbox::Search::BYTES(), $smsg->{bytes});
-	add_val($doc, PublicInbox::Search::UID(), $smsg->{num});
-	add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid});
-
+	if (!$self->{-repo_idx}) {
+		add_val($doc, PublicInbox::Search::BYTES(), $smsg->{bytes});
+		add_val($doc, PublicInbox::Search::UID(), $smsg->{num});
+		add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid});
+	}
 	my $tg = term_generator($self);
 	$tg->set_document($doc);
 	index_headers($self, $smsg);
diff --git a/script/public-inbox-cindex b/script/public-inbox-cindex
new file mode 100755
index 00000000..f6717239
--- /dev/null
+++ b/script/public-inbox-cindex
@@ -0,0 +1,70 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
+my $help = <<EOF; # the following should fit w/o scrolling in 80x24 term:
+usage: public-inbox-cindex [options] CINDEX_DIR [GIT_DIR...]
+
+  Create and update detached search indices for coderepos
+
+  --no-fsync          speed up indexing, risk corruption on power outage
+  -L LEVEL            `medium', or `full' (default: full)
+  --all               index all configured repos
+  --jobs=NUM          set or disable parallelization (NUM=0)
+  --batch-size=BYTES  flush changes to OS after a given number of bytes
+  --max-size=BYTES    do not index messages larger than the given size
+  --gc                perform garbage collection instead of indexing
+  --reindex           index previously indexed repos
+  --verbose | -v      increase verbosity (may be repeated)
+
+BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
+See public-inbox-cindex(1) man page for full documentation.
+EOF
+my $opt = { quiet => -1, compact => 0, fsync => 1, scan => 1 };
+GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i
+		fsync|sync! dangerous
+		indexlevel|index-level|L=s max_size|max-size=s
+		batch_size|batch-size=s
+		dedupe:s@ gc commit-interval=i watch scan! dry-run|n
+		all C=s@ help|h))
+	or die $help;
+if ($opt->{help}) { print $help; exit 0 };
+die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
+require IO::Handle;
+STDOUT->autoflush(1);
+STDERR->autoflush(1);
+local $SIG{USR1} = 'IGNORE'; # to be overridden in cidx_sync
+# require lazily to speed up --help
+require PublicInbox::Admin;
+PublicInbox::Admin::do_chdir(delete $opt->{C});
+my $cfg = PublicInbox::Config->new;
+my $cidx_dir = shift(@ARGV) // die "E: $help";
+if ($opt->{gc}) {
+	die "E: repository paths must not be specified with --gc\n" if @ARGV;
+	for my $sw (qw(watch dry-run)) {
+		die "E: --$sw is not compatible with --gc\n" if $opt->{$sw};
+	}
+}
+PublicInbox::Admin::require_or_die(qw(-search));
+PublicInbox::Admin::progress_prepare($opt);
+my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
+local %ENV = (%ENV, %$env) if $env;
+require PublicInbox::CodeSearchIdx;
+my $cidx = PublicInbox::CodeSearchIdx->new($cidx_dir, $opt);
+if ($opt->{gc}) { # --gc: garbage-collect instead of indexing
+	$cidx->attach_config($cfg);
+	$cidx->cidx_gc($opt);
+} else {
+	if ($opt->{config}) { # NOTE(review): no --config switch; maybe `all'?
+		$cidx->attach_config($cfg);
+	} else {
+		$cidx->attach_config($cfg, \@ibxs); # NOTE(review): @ibxs undeclared
+	}
+	if ($opt->{watch}) {
+		$cfg = undef; # save memory only after SIGHUP
+		$cidx->cidx_watch($opt);
+	} else {
+		$cidx->cidx_sync($opt); # one-shot indexing run
+	}
+}

      reply	other threads:[~2023-02-17  4:01 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-02-17  4:01 [PATCH 1/2] ipc: move nproc_shards from v2writable Eric Wong
2023-02-17  4:01 ` Eric Wong [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230217040113.111644-2-e@80x24.org \
    --to=e@80x24.org \
    --cc=spew@80x24.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).