From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: * X-Spam-ASN: AS48874 86.104.194.0/24 X-Spam-Status: No, score=1.5 required=3.0 tests=AWL,BAYES_00,RCVD_IN_MSPIKE_BL, RCVD_IN_MSPIKE_ZBI,RCVD_IN_SBL_CSS,RCVD_IN_XBL,RDNS_NONE,SPF_FAIL, SPF_HELO_FAIL shortcircuit=no autolearn=no autolearn_force=no version=3.4.6 Received: from 80x24.org (unknown [86.104.194.190]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256) (No client certificate requested) by dcvr.yhbt.net (Postfix) with ESMTPS id DFBCF1F626 for ; Fri, 17 Feb 2023 04:01:44 +0000 (UTC) From: Eric Wong To: spew@80x24.org Subject: [PATCH 2/2] WIP-reposearchidx Date: Fri, 17 Feb 2023 04:01:13 +0000 Message-Id: <20230217040113.111644-2-e@80x24.org> In-Reply-To: <20230217040113.111644-1-e@80x24.org> References: <20230217040113.111644-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: WIP --- MANIFEST | 2 + lib/PublicInbox/CodeSearch.pm | 44 ++++++++++ lib/PublicInbox/CodeSearchIdx.pm | 146 +++++++++++++++++++++++++++++++ lib/PublicInbox/Search.pm | 43 +++++---- lib/PublicInbox/SearchIdx.pm | 38 ++++---- script/public-inbox-cindex | 70 +++++++++++++++ 6 files changed, 307 insertions(+), 36 deletions(-) create mode 100644 lib/PublicInbox/CodeSearch.pm create mode 100644 lib/PublicInbox/CodeSearchIdx.pm create mode 100755 script/public-inbox-cindex diff --git a/MANIFEST b/MANIFEST index aba32762..6c068b18 100644 --- a/MANIFEST +++ b/MANIFEST @@ -161,6 +161,7 @@ lib/PublicInbox/AltId.pm lib/PublicInbox/AutoReap.pm lib/PublicInbox/Cgit.pm lib/PublicInbox/CmdIPC4.pm +lib/PublicInbox/CodeSearchIdx.pm lib/PublicInbox/CompressNoop.pm lib/PublicInbox/Config.pm lib/PublicInbox/ConfigIter.pm @@ -363,6 +364,7 @@ sa_config/README sa_config/root/etc/spamassassin/public-inbox.pre sa_config/user/.spamassassin/user_prefs script/lei +script/public-inbox-cindex script/public-inbox-clone script/public-inbox-compact script/public-inbox-convert diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm new file mode 100644 index 00000000..d7dfdb08 --- /dev/null +++ b/lib/PublicInbox/CodeSearch.pm @@ -0,0 +1,44 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ + +# read-only external index for coderepos. +package PublicInbox::CodeSearch; +use v5.12; +use parent qw(PublicInbox::Search); +use constant { + AT => 0, # author time + CT => 1, # commit time # should we even care to index this? + SCHEMA_VERSION => 1, +}; + +# bool_pfx_internal: +# type => 'T', # 'c' - commit, 'r' - repo GIT_DIR +# tags are not indexed, only normal branches (refs/heads/*), not hidden + +# note: the non-X term prefix allocations are shared with +# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst +my %bool_pfx_external = ( + oid => 'Q', # type:commit - git OID hex (40|64)-byte SHA-(1|256) + # type:repo - rel2abs_collapsed(GIT_DIR) + %COMMIT_BOOL_COMMON + # we use O/eidx_key to store GIT_DIR key +); + +my %prob_prefix = ( # copied from PublicInbox::Search + # do we care about committer? or partial commit OID? + # o => 'XQ', # 'oid:' (bool) is exact, 'o:' (prob) can do partial + %PublicInbox::Search::COMMIT_PROB_COMMON, + + # default: + '' => 'S A XQUOT XFN ' . $PublicInbox::Search::NON_QUOTED_BODY +); + +# read-only +sub mset { + my ($self, $query_string, $opt) = @_; + my $qp = $self->{qp} //= cqparse_new($self); + my $query = $qp->parse_query($query_string, $self->{qp_flags}); +} + + +1; diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm new file mode 100644 index 00000000..8ba2214b --- /dev/null +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -0,0 +1,146 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ +# +# indexer for git coderepos, just commits for now +package PublicInbox::CodeSearchIdx; +use v5.12; +use parent qw(PublicInbox::Lock PublicInbox::SearchIdx); +use Socket qw(AF_UNIX MSG_EOR SOCK_SEQPACKET); +use constant FROM_DATE => length(" Mon Sep 17 00:00:00 2001\n"); +use PublicInbox::Eml; +use PublicInbox::DS (); +use PublicInbox::InboxWritable (); +use PublicInbox::IPC qw(nproc_shards); +use PublicInbox::Admin; +use POSIX qw(WNOHANG); + +our $SEEN_MAX = 100000; # w/o reading trees, git walks commits quickly +my @FMT = qw(H cn ce ct an ae at s b); +my @LOG = (qw(log --all --no-decorate --no-color --no-notes -p -M + --stdin --no-walk=unsorted), '--pretty=format:%x00'. + join('%n', map { "%$_" } @FMT).'%x00'); + +sub new { + my (undef, $dir, $opt) = @_; + my $l = $opt->{indexlevel} // 'full'; + $l !~ $PublicInbox::SearchIdx::INDEXLEVELS and + die "invalid indexlevel=$l\n"; + $l eq 'basic' and die "E: indexlevel=basic not supported\n"; + my $self = bless { + xpfx => "$dir/c".PublicInbox::CodeSearch::SCHEMA_VERSION, + cidx_dir => $dir, + creat => $opt->{creat}, + indexlevel => $l, + transact_bytes => 0, + total_bytes => 0, + current_info => '', + parallel => 1, + lock_path => "$dir/cidx.lock", + }, __PACKAGE__; + $self->{nshard} = $self->count_shards || + nproc_shards({nproc => $opt->{jobs}}); + $self->{-no_fsync} = 1 if !$opt->{fsync}; + $self->{-dangerous} = 1 if $opt->{dangerous}; + $self; +} + +# TODO: may be used for reshard/compact +sub count_shards { scalar($_[0]->xdb_shards_flat) } + +# sharded reader for `git log --pretty=format: --stdin' +sub commit_reader { + my ($self, $r, $c) = @_; # $c = SOCK_SEQPACKET consumer + my ($H, $ct, $buf); + + # the parent process of this shard process writes directly to + # the stdin of `git log', we consume git log's stdout: + my $rd = $self->{git}->popen(@LOG, undef, { 0 => $r }); + close $r or die "close: $!"; + + local $/ = "\0"; + my $buf = <$rd> // return; # leading "\0" + my (%c, $hdr); + while (defined($hdr = <$rd>)) { + chomp($hdr); + @c{@FMT} = split(/\n/, scalar(@FMT)); + chomp($c{patch} = <$rd>); + use Data::Dumper; + warn Dumper(\%c); + # $self->add_xapian($eml, $smsg, [ $H ]); + } +} + +# this is a bit of weird pipe+process structure unseen anywhere else in +# our code base, but maybe LeiToMail can take advantage of it someday +sub stream_log { + my ($self) = @_; + my (%pids, @pipes, $err); + my $reap = sub { + my ($sig) = @_; + do { + my $pid = waitpid(-1, $sig ? WNOHANG : 0) or return; + return if $pid < 0; + my $j = delete $pids{$pid} // "unknown PID:$pid"; + if ($?) { + warn "$j exited with \$?=$?\n"; + $err = 1; + } + } while ($sig); + }; + + local $SIG{CHLD} = $reap; + my $nshard = $self->{nshard}; + for my $n (0..($nshard - 1)) { + pipe(my ($r, $w)) or die "pipe: $!"; + $w->autoflush(1); + push @pipes, $w; + my $sigset = PublicInbox::DS::block_signals(); + my $pid = fork // die "fork: $!"; + if ($pid == 0) { + $0 = "code index [$n]"; + for (@pipes) { close($_) or die "close: $!" } + for (qw(TTOU TTIN TERM QUIT INT CHLD)) { + $SIG{$_} = 'DEFAULT'; + } + PublicInbox::DS::sig_setmask($sigset); + eval { commit_reader($self, $r) }; + warn "E: $@" if $@; + POSIX::_exit($@ ? 1 : 0); + } else { + $pids{$pid} = "code index [$n]"; + close($r) or die "close: $!"; + } + } + + # children all running, now feed them anything we haven't seen: + my $fh = $self->{git}->popen(qw(rev-list --all)); + my ($H, $n, $seen); + while (defined($H = <$fh>)) { + chomp $H; + if (seen($self, $H)) { + last if ++$seen > $SEEN_MAX; + } else { + $n = hex(substr($H, 0, 4)) % $nshard; + say { $pipes[$n] } $H or die "say: $!"; + $seen = 0; + } + } + close $fh or die "close: $!"; + for (@pipes) { close($_) or die "close: $!" } + $reap->() while %pids; + die "subprocess(es) failed\n" if $err; +} + +sub git { $_[0]->{git} } # for PublicInbox::SearchIdx + +sub cidx_sync { # main entry point + my ($self, $opt) = @_; + my $warn_cb = $SIG{__WARN__} || \&CORE::warn; + local $self->{current_info} = ''; +} + +# bypass PublicInbox::SearchIdx::with_umask: +no warnings 'once'; +*with_umask = \&PublicInbox::InboxWritable::with_umask; + +1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 86219dfe..c51663ff 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -110,43 +110,50 @@ sub load_xapian () { # a prefix common in patch emails our $LANG = 'english'; +our %COMMIT_BOOL_COMMON = ( + dfpre => 'XDFPRE', + dfpost => 'XDFPOST', + dfblob => 'XDFPRE XDFPOST', + patchid => 'XDFID', +); + # note: the non-X term prefix allocations are shared with # Xapian omega, see xapian-applications/omega/docs/termprefixes.rst my %bool_pfx_external = ( mid => 'Q', # Message-ID (full/exact), this is mostly uniQue lid => 'G', # newsGroup (or similar entity), just inside <> - dfpre => 'XDFPRE', - dfpost => 'XDFPOST', - dfblob => 'XDFPRE XDFPOST', - patchid => 'XDFID', + %COMMIT_BOOL_COMMON ); -my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID'; -my %prob_prefix = ( - # for mairix compatibility +# for mairix compatibility +our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID'; +our %COMMIT_PROB_COMMON = ( s => 'S', - m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial - l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial f => 'A', - t => 'XTO', - tc => 'XTO XCC', - c => 'XCC', - tcf => 'XTO XCC A', - a => 'XTO XCC A', - b => $non_quoted_body . ' XQUOT', - bs => $non_quoted_body . ' XQUOT S', + b => $NON_QUOTED_BODY . ' XQUOT', + bs => $NON_QUOTED_BODY . ' XQUOT S', n => 'XFN', q => 'XQUOT', - nq => $non_quoted_body, + nq => $NON_QUOTED_BODY, dfn => 'XDFN', dfa => 'XDFA', dfb => 'XDFB', dfhh => 'XDFHH', dfctx => 'XDFCTX', +); +my %prob_prefix = ( + m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial + l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial + t => 'XTO', + tc => 'XTO XCC', + c => 'XCC', + tcf => 'XTO XCC A', + a => 'XTO XCC A', + %COMMIT_PROB_COMMON, # default: - '' => 'XM S A XQUOT XFN ' . $non_quoted_body, + '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY, ); # not documenting m: and mid: for now, the using the URLs works w/o Xapian diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 257b83a5..66f688de 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -134,6 +134,7 @@ sub idx_acquire { load_xapian_writable(); $flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN; } + my $owner = $self->{ibx} // $self->{eidx} // $self; if ($self->{creat}) { require File::Path; $self->lock_acquire; @@ -146,13 +147,11 @@ sub idx_acquire { require PublicInbox::Syscall; PublicInbox::Syscall::nodatacow_dir($dir); $self->{-set_has_threadid_once} = 1; - if (($self->{ibx} // $self->{eidx})->{-dangerous}) { - $flag |= $DB_DANGEROUS; - } + $flag |= $DB_DANGEROUS if $owner->{-dangerous}; } } return unless defined $flag; - $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync}; + $flag |= $DB_NO_SYNC if $owner->{-no_fsync}; my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) }; croak "Failed opening $dir: $@" if $@; $self->{xdb} = $xdb; @@ -376,9 +375,9 @@ sub index_xapian { # msg_iter callback $fh->autoflush(1); print $fh $s or die "print: $!"; sysseek($fh, 0, SEEK_SET) or die "sysseek: $!"; - my $id = ($self->{ibx} // $self->{eidx})->git->qx( - [qw(patch-id --stable)], - {}, { 0 => $fh, 2 => $eh }); + my $id = ($self->{git} // ($self->{ibx} // $self->{eidx})->git + )->qx([qw(patch-id --stable)], {}, + { 0 => $fh, 2 => $eh }); $id =~ /\A([a-f0-9]{40,})/ and $doc->add_term('XDFID'.$1); seek($eh, 0, SEEK_SET) or die "seek: $!"; while (<$eh>) { warn $_ } @@ -414,14 +413,16 @@ sub index_list_id ($$$) { sub index_ids ($$$$) { my ($self, $doc, $hdr, $mids) = @_; - for my $mid (@$mids) { - index_phrase($self, $mid, 1, 'XM'); + if (!$self->{-repo_idx}) { + for my $mid (@$mids) { + index_phrase($self, $mid, 1, 'XM'); - # because too many Message-IDs are prefixed with - # "Pine.LNX."... - if ($mid =~ /\w{12,}/) { - my @long = ($mid =~ /(\w{3,}+)/g); - index_phrase($self, join(' ', @long), 1, 'XM'); + # because too many Message-IDs are prefixed with + # "Pine.LNX."... + if ($mid =~ /\w{12,}/) { + my @long = ($mid =~ /(\w{3,}+)/g); + index_phrase($self, join(' ', @long), 1, 'XM'); + } } } $doc->add_boolean_term('Q' . $_) for @$mids; @@ -438,10 +439,11 @@ sub eml2doc ($$$;$) { add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd); my $dt = strftime('%Y%m%d%H%M%S', @ds); add_val($doc, PublicInbox::Search::DT(), $dt); - add_val($doc, PublicInbox::Search::BYTES(), $smsg->{bytes}); - add_val($doc, PublicInbox::Search::UID(), $smsg->{num}); - add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid}); - + if (!$self->{-repo_idx}) { + add_val($doc, PublicInbox::Search::BYTES(), $smsg->{bytes}); + add_val($doc, PublicInbox::Search::UID(), $smsg->{num}); + add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid}); + } my $tg = term_generator($self); $tg->set_document($doc); index_headers($self, $smsg); diff --git a/script/public-inbox-cindex b/script/public-inbox-cindex new file mode 100755 index 00000000..f6717239 --- /dev/null +++ b/script/public-inbox-cindex @@ -0,0 +1,70 @@ +#!perl -w +# Copyright (C) all contributors +# License: AGPL-3.0+ +use v5.12; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); +my $help = < -1, compact => 0, fsync => 1, scan => 1 }; +GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i + fsync|sync! dangerous + indexlevel|index-level|L=s max_size|max-size=s + batch_size|batch-size=s + dedupe:s@ gc commit-interval=i watch scan! dry-run|n + all C=s@ help|h)) + or die $help; +if ($opt->{help}) { print $help; exit 0 }; +die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; +require IO::Handle; +STDOUT->autoflush(1); +STDERR->autoflush(1); +local $SIG{USR1} = 'IGNORE'; # to be overridden in cidx_sync +# require lazily to speed up --help +require PublicInbox::Admin; +PublicInbox::Admin::do_chdir(delete $opt->{C}); +my $cfg = PublicInbox::Config->new; +my $cidx_dir = shift(@ARGV) // die "E: $help"; +if ($opt->{gc}) { + die "E: repository paths must not be specified with --gc\n" if @ARGV; + for my $sw (qw(watch dry-run)) { + die "E: --$sw is not compatible with --gc\n" if $opt->{$sw}; + } +} +PublicInbox::Admin::require_or_die(qw(-search)); +PublicInbox::Admin::progress_prepare($opt); +my $env = PublicInbox::Admin::index_prepare($opt, $cfg); +local %ENV = (%ENV, %$env) if $env; +require PublicInbox::CodeSearchIdx; +my $cidx = PublicInbox::CodeSearchIdx->new($cidx_dir, $opt); +if ($opt->{gc}) { + $cidx->attach_config($cfg); + $cidx->cidx_gc($opt); +} else { + if ($opt->{config}) { + $cidx->attach_config($cfg); + } else { + $cidx->attach_config($cfg, \@ibxs); + } + if ($opt->{watch}) { + $cfg = undef; # save memory only after SIGHUP + $cidx->cidx_watch($opt); + } else { + $cidx->cidx_sync($opt); + } +}