From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id C645F1F513 for ; Sun, 26 Nov 2023 14:19:33 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1701008373; bh=c0/wIqB/Xm9ECVYMu7Bok0OQ9MYaN9UMdfWFfmstgfM=; h=From:To:Subject:Date:In-Reply-To:References:From; b=paWUph1p72pyAvd22+wsgrkHTsvk7D3Zw+bR9E9Vo+Uy2Ya3lf6NnPvXUMXrVmRPq iX6S02YLPr8Bt7b7dZxcI2Oxv1y5GuG6XiCmIzhkvsCSr3j+kNjLQqX3jWAZxI19jx 2FCEiwOfS2oKvozoIVNLfyQvobIHl8oDe/6opKNA= From: Eric Wong To: spew@80x24.org Subject: [PATCH 4/7] www_coderepo: load and use cindex join data Date: Sun, 26 Nov 2023 14:19:30 +0000 Message-ID: <20231126141933.593525-4-e@80x24.org> In-Reply-To: <20231126141933.593525-1-e@80x24.org> References: <20231126141933.593525-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: --- lib/PublicInbox/CodeSearch.pm | 55 ++++++++++++---- lib/PublicInbox/CodeSearchIdx.pm | 42 ++++++------ lib/PublicInbox/Config.pm | 35 +++++++++- lib/PublicInbox/Search.pm | 9 +++ lib/PublicInbox/SolverGit.pm | 6 +- lib/PublicInbox/WWW.pm | 1 + lib/PublicInbox/WwwCoderepo.pm | 108 ++++++++++++++++++++++++++++++- t/cindex.t | 28 +++++++- xt/solver.t | 3 +- 9 files changed, 248 insertions(+), 39 deletions(-) diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm index 9051d85f..19bcde93 100644 --- a/lib/PublicInbox/CodeSearch.pm +++ b/lib/PublicInbox/CodeSearch.pm @@ -21,7 +21,7 @@ use constant { our @CODE_NRP; our @CODE_VMAP = ( [ AT, 'd:' ], # mairix compat - [ AT, 'dt:' ], # mail compat + [ AT, 'dt:' ], # public-inbox mail compat [ CT, 'ct:' ], ); @@ -51,7 +51,7 @@ my %prob_prefix = ( # copied from PublicInbox::Search sub new { my ($cls, $dir, $cfg) = @_; # can't have a PublicInbox::Config here due to circular refs - bless { xpfx => "$dir/cidx".CIDX_SCHEMA_VER, + bless { topdir => $dir, xpfx => "$dir/cidx".CIDX_SCHEMA_VER, -cfg_f => $cfg->{-f} }, $cls; } @@ -63,7 +63,20 @@ sub join_data { my $cur = $self->xdb->get_metadata($key) or return; $cur = eval { PublicInbox::Config::json()->decode(uncompress($cur)) }; warn "E: $@ (corrupt metadata in `$key' key?)" if $@; - $cur; + my @m = grep { ref($cur->{$_}) ne 'ARRAY' } qw(ekeys roots ibx2root); + if (@m) { + warn <{topdir} join data for $self->{-cfg_f} missing: @m +EOM + undef; + } elsif (@{$cur->{ekeys}} != @{$cur->{ibx2root}}) { + warn <{topdir} join data for $self->{cfg_f} mismatched ekeys and ibx2root +EOM + undef; + } else { + $cur; + } } sub qparse_new ($) { @@ -191,21 +204,41 @@ sub roots2paths { # for diagnostics } $size = $mset->size; } while ($size); - substr($_, 0, 1, '/') for @$dirs; # s!^P!/! @$dirs = sort @$dirs; } \%ret; } -sub paths2roots { # for diagnostics - my ($self) = @_; +sub root_oids ($$) { + my ($self, $git_dir) = @_; + my @ids = $self->docids_by_postlist('P'.$git_dir); + @ids or warn <<""; +BUG? (non-fatal) `$git_dir' not indexed in $self->{topdir} + + warn <<"" if @ids > 1; +BUG: (non-fatal) $git_dir indexed multiple times in $self->{topdir} + + my %ret; + for my $docid (@ids) { + my @oids = xap_terms('G', $self->xdb, $docid); + @ret{@oids} = @oids; + } + sort keys %ret; +} + +sub paths2roots { + my ($self, $paths) = @_; my %ret; - my $tmp = roots2paths($self); - for my $root_oidhex (keys %$tmp) { - my $paths = delete $tmp->{$root_oidhex}; - push @{$ret{$_}}, $root_oidhex for @$paths; + if ($paths) { + for my $p (keys %$paths) { @{$ret{$p}} = root_oids($self, $p) } + } else { + my $tmp = roots2paths($self); + for my $root_oidhex (keys %$tmp) { + my $paths = delete $tmp->{$root_oidhex}; + push @{$ret{$_}}, $root_oidhex for @$paths; + } + @$_ = sort(@$_) for values %ret; } - @$_ = sort(@$_) for values %ret; \%ret; } diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm index bb1d698b..a6cbe0b0 100644 --- a/lib/PublicInbox/CodeSearchIdx.pm +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -172,7 +172,7 @@ sub count_shards { scalar($_[0]->xdb_shards_flat) } sub update_commit ($$$) { my ($self, $cmt, $roots) = @_; # fields from @FMT my $x = 'Q'.$cmt->{H}; - my ($docid, @extra) = sort { $a <=> $b } docids_by_postlist($self, $x); + my ($docid, @extra) = sort { $a <=> $b } $self->docids_by_postlist($x); @extra and warn "W: $cmt->{H} indexed multiple times, pruning ", join(', ', map { "#$_" } @extra), "\n"; $self->{xdb}->delete_document($_) for @extra; @@ -377,15 +377,6 @@ sub seen ($$) { # used to select the shard for a GIT_DIR sub git_dir_hash ($) { hex(substr(sha256_hex($_[0]), 0, 8)) } -sub docids_by_postlist ($$) { # consider moving to PublicInbox::Search - my ($self, $q) = @_; - my $cur = $self->{xdb}->postlist_begin($q); - my $end = $self->{xdb}->postlist_end($q); - my @ids; - for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) }; - @ids; -} - sub _cb { # run_await cb my ($pid, $cmd, undef, $opt, $cb, $self, $git, @arg) = @_; return if $DO_QUIT; @@ -452,7 +443,7 @@ sub prep_repo ($$) { sub check_existing { # retry_reopen callback my ($shard, $self, $git) = @_; - my @docids = docids_by_postlist($shard, 'P'.$git->{git_dir}); + my @docids = $shard->docids_by_postlist('P'.$git->{git_dir}); my $docid = shift(@docids) // return get_roots($self, $git); my $doc = $shard->get_doc($docid) // die "BUG: no #$docid ($git->{git_dir})"; @@ -778,7 +769,7 @@ sub prune_init { # via wq_io_do in IDX_SHARDS sub prune_one { # via wq_io_do in IDX_SHARDS my ($self, $term) = @_; - my @docids = docids_by_postlist($self, $term); + my @docids = $self->docids_by_postlist($term); for (@docids) { $TXN_BYTES -= $self->{xdb}->get_doclength($_) * 42; $self->{xdb}->delete_document($_); @@ -894,10 +885,9 @@ sub current_join_data ($) { sub score_old_join_data ($$$) { my ($self, $score, $ekeys_new) = @_; my $old = ($JOIN{reset} ? undef : current_join_data($self)) or return; - my @old = @$old{qw(ekeys roots ibx2root)}; - @old == 3 or return warn "W: ekeys/roots missing from old JOIN data\n"; progress($self, 'merging old join data...'); - my ($ekeys_old, $roots_old, $ibx2root_old) = @old; + my ($ekeys_old, $roots_old, $ibx2root_old) = + @$old{qw(ekeys roots ibx2root)}; # score: "ibx_off root_off" => nr my $i = -1; my %root2id_new = map { $_ => ++$i } @OFF2ROOT; @@ -905,16 +895,24 @@ sub score_old_join_data ($$$) { my %ekey2id_new = map { $_ => ++$i } @$ekeys_new; for my $ibx_off_old (0..$#$ibx2root_old) { my $root_offs_old = $ibx2root_old->[$ibx_off_old]; - my $ekey = $ekeys_old->[$ibx_off_old] // - warn "W: no ibx #$ibx_off_old in old JOIN data\n"; - my $ibx_off_new = $ekey2id_new{$ekey // next} // + my $ekey = $ekeys_old->[$ibx_off_old] // do { + warn "W: no ibx #$ibx_off_old in old join data\n"; + next; + }; + my $ibx_off_new = $ekey2id_new{$ekey} // do { warn "W: `$ekey' no longer exists\n"; + next; + }; for (@$root_offs_old) { my ($nr, $rid_old) = @$_; - my $root_old = $roots_old->[$rid_old] // - warn "W: no root #$rid_old in old JOIN data\n"; - my $rid_new = $root2id_new{$root_old // next} // + my $root_old = $roots_old->[$rid_old] // do { + warn "W: no root #$rid_old in old data\n"; + next; + }; + my $rid_new = $root2id_new{$root_old} // do { warn "W: root `$root_old' no longer exists\n"; + next; + }; $score->{"$ibx_off_new $rid_new"} += $nr; } } @@ -963,7 +961,7 @@ sub do_join { progress($self, "$ekey => $root has $nr matches"); push @{$new->{ibx2root}->[$ibx_off]}, [ $nr, $root_off ]; } - for my $ary (values %$new) { # sort by nr + for my $ary (values %$new) { # sort by nr (largest first) for (@$ary) { @$_ = sort { $b->[0] <=> $a->[0] } @$_ } } $new->{ekeys} = \@ekeys; diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 9bee94b8..c8ecc06b 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -412,8 +412,8 @@ sub get_1 { sub repo_objs { my ($self, $ibxish) = @_; - my $ibx_coderepos = $ibxish->{coderepo} // return; $ibxish->{-repo_objs} // do { + my $ibx_coderepos = $ibxish->{coderepo} // return; parse_cgitrc($self, undef, 0); my $coderepos = $self->{-coderepos}; my @repo_objs; @@ -568,6 +568,39 @@ sub _fill_ei ($$) { $es; } +sub _fill_csrch ($$) { + my ($self, $name) = @_; # "" is a valid name for cindex + return if $name ne '' && !valid_foo_name($name, 'cindex'); + eval { require PublicInbox::CodeSearch } or return; + my $pfx = "cindex.$name"; + my $d = $self->{"$pfx.topdir"} // return; + -d $d or return; + if (index($d, "\n") >= 0) { + warn "E: `$d' must not contain `\\n'\n"; + return; + } + my $csrch = PublicInbox::CodeSearch->new($d, $self); + for my $k (qw(localprefix)) { + my $v = $self->{"$pfx.$k"} // next; + $csrch->{$k} = _array($v); + } + $csrch->{name} = $name; + $csrch; +} + +sub lookup_cindex { + my ($self, $name) = @_; + $self->{-csrch_by_name}->{$name} //= _fill_csrch($self, $name); +} + +sub each_cindex { + my ($self, $cb, @arg) = @_; + for my $s (grep(m!\Acindex\.[^\./]*\z!, @{$self->{-section_order}})) { + my $csrch = lookup_cindex($self, substr($s, length('cindex.'))); + $cb->($csrch, @arg) if $csrch; + } +} + sub config_cmd { my ($self, $env, $opt) = @_; my $f = $self->{-f} // default_file(); diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 6145b027..43f7f52f 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -649,4 +649,13 @@ sub xh_args { # prep getopt args to feed to xap_helper.h socket map { ('-d', $_) } shard_dirs($_[0]); } +sub docids_by_postlist ($$) { + my ($self, $q) = @_; + my $cur = $self->xdb->postlist_begin($q); + my $end = $self->{xdb}->postlist_end($q); + my @ids; + for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) }; + @ids; +} + 1; diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index ba3c94cb..b0e6cc24 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -640,9 +640,13 @@ sub resolve_patch ($$) { # so user_cb never references the SolverGit object sub new { my ($class, $ibx, $user_cb, $uarg) = @_; + my $gits = $ibx ? $ibx->{-repo_objs} : undef; + + # FIXME: cindex --join= is super-aggressive and may hit too many + $gits = [ @$gits[0..2] ] if $gits && @$gits > 3; bless { # $ibx is undef if coderepo only (see WwwCoderepo) - gits => $ibx ? $ibx->{-repo_objs} : undef, + gits => $gits, user_cb => $user_cb, uarg => $uarg, # -cur_di, -qsp_err, -msg => temp fields for Qspawn callbacks diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 6b616bd4..289599b8 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -189,6 +189,7 @@ sub preload { } $pi_cfg->ALL and require PublicInbox::Isearch; $self->cgit; + $self->coderepo; $self->stylesheets_prepare($_) for ('', '../', '../../'); $self->news_www; } diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm index 0eb4a2d6..9012e786 100644 --- a/lib/PublicInbox/WwwCoderepo.pm +++ b/lib/PublicInbox/WwwCoderepo.pm @@ -14,12 +14,14 @@ use PublicInbox::ViewVCS; use PublicInbox::WwwStatic qw(r); use PublicInbox::GitHTTPBackend; use PublicInbox::WwwStream; -use PublicInbox::Hval qw(ascii_html utf8_maybe); +use PublicInbox::Hval qw(prurl ascii_html utf8_maybe); use PublicInbox::ViewDiff qw(uri_escape_path); use PublicInbox::RepoSnapshot; use PublicInbox::RepoAtom; use PublicInbox::RepoTree; use PublicInbox::OnDestroy; +use URI::Escape qw(uri_escape_utf8); +use File::Spec; my @EACH_REF = (qw(git for-each-ref --sort=-creatordate), "--format=%(HEAD)%00".join('%00', map { "%($_)" } @@ -37,6 +39,74 @@ $ git for-each-ref --sort=-creatordate refs/tags \ my $NO_HEADS = "# no heads (branches), yet...\n"; my $NO_TAGS = "# no tags, yet...\n"; +sub csrch_load_coderepos { # each_cindex callback + my ($csrch, $self, $pi_cfg) = @_; + my $name = $csrch->{name}; + my $cfg_f = $pi_cfg->{-f}; + my $lpfx = $csrch->{localprefix} or return warn <{-coderepos}; + my $nick_pfx = $name eq '' ? '' : "$name/"; + my %dir2cr; + for my $p ($csrch->all_terms('P')) { + my $nick = $p; + $nick =~ s!$lre!$nick_pfx!s or next; + $dir2cr{$p} = $coderepos->{$nick} //= do { + my $git = PublicInbox::Git->new($p); + $git->{nick} = $nick; # for git->pub_urls + $git; + }; + } + my $jd = $csrch->join_data or return warn <{topdir} has no usable join data for $cfg_f +EOM + my ($ekeys, $roots, $ibx2root) = @$jd{qw(ekeys roots ibx2root)}; + my $roots2paths = $csrch->roots2paths; + for my $root_offs (@$ibx2root) { + my $ekey = shift(@$ekeys) // die 'BUG: {ekeys} empty'; + scalar(@$root_offs) or next; + my $ibx = $pi_cfg->lookup_eidx_key($ekey) // do { + warn "W: `$ekey' gone from $cfg_f\n"; + next; + }; + my $gits = $ibx->{-repo_objs} //= []; + my %ibx_p2g = map { $_->{git_dir} => $_ } @$gits; + for (@$root_offs) { # sorted by $nr descending + my ($nr, $root_off) = @$_; + my $root_oid = $roots->[$root_off] // do { + warn <{$root_oid}; + @$git_dirs = grep { !$ibx_p2g{$_} } @$git_dirs; + # @$git_dirs or warn "W: no matches for $root_oid\n"; + for (@$git_dirs) { + if (my $git = $dir2cr{$_}) { + $ibx_p2g{$_} = $git; + $ibx->{-hide}->{www} or + push @{$git->{ibx_score}}, + [ $nr, $ibx->{name} ]; + push @$gits, $git; + } else { + warn <{-repo_objs} if !@$gits; + } + for my $git (values %dir2cr) { + my $s = $git->{ibx_score}; + @$s = sort { $b->[0] <=> $a->[0] } @$s if $s; + } +} + # shared with PublicInbox::Cgit sub prepare_coderepos { my ($self) = @_; @@ -62,6 +132,7 @@ sub prepare_coderepos { my $eidx = $pi_cfg->lookup_ei($k) // next; $pi_cfg->repo_objs($eidx); } + $pi_cfg->each_cindex(\&csrch_load_coderepos, $self, $pi_cfg); } sub new { @@ -119,6 +190,40 @@ sub _refs_tags_link { "$align ", ascii_html($s), " ($cd)", @snap_fmt, "\n"); } +sub emit_joined_inboxes ($) { + my ($ctx) = @_; + my $names = $ctx->{git}->{ibx_names}; # coderepo directives in config + my $score = $ctx->{git}->{ibx_score}; # generated w/ cindex --join + ($names || $score) or return; + my $pi_cfg = $ctx->{wcr}->{pi_cfg}; + my ($u, $h); + my $zfh = $ctx->zfh; + print $zfh "\n# associated public inboxes:"; + my @ns = map { [ 0, $_ ] } @$names; + for (@ns, @$score) { + my ($nr, $name) = @$_; + my $ibx = $pi_cfg->lookup_name($name) // do { + warn "W: inbox `$name' gone for $ctx->{git}->{git_dir}"; + say $zfh '# ', ascii_html($name), ' (missing inbox?)'; + next; + }; + if (scalar(@{$ibx->{url} // []})) { + $u = $h = ascii_html(prurl($ctx->{env}, $ibx->{url})); + } else { + $h = uri_escape_utf8($name); + $h = File::Spec->abs2rel("/$h", "/$ctx->{git}->{nick}"); + $h = ascii_html($h . '/'); + $u = ascii_html($name); + } + if ($nr) { + printf $zfh "\n% 11u", $nr; + } else { + print $zfh "\n", ' 'x11; + } + print $zfh qq{ $u}; + } +} + sub summary_END { # called via OnDestroy my ($ctx) = @_; my $wcb = delete($ctx->{-wcb}) or return; # already done @@ -174,6 +279,7 @@ EOM for (@r) { print $zfh _refs_tags_link($_, './', $snap_pfx, @snap_fmt) } print $zfh $NO_TAGS if !@r; print $zfh qq(...\n) if $last; + emit_joined_inboxes $ctx; $wcb->($ctx->html_done('')); } diff --git a/t/cindex.t b/t/cindex.t index ac7a6000..afcc226e 100644 --- a/t/cindex.t +++ b/t/cindex.t @@ -5,7 +5,7 @@ use v5.12; use PublicInbox::TestCommon; use Cwd qw(getcwd abs_path); use List::Util qw(sum); -use autodie qw(close open rename); +use autodie qw(close mkdir open rename); require_mods(qw(json Xapian)); use_ok 'PublicInbox::CodeSearchIdx'; use PublicInbox::Import; @@ -227,7 +227,7 @@ SKIP: { # --prune } File::Path::remove_tree("$tmp/ext"); -ok(mkdir("$tmp/ext", 0707), 'create $tmp/ext with odd permissions'); +mkdir("$tmp/ext", 0707); ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp]), 'external on existing dir'); { @@ -265,4 +265,28 @@ EOM 'non-Xapian-enabled inbox noted'); } +# we need to support blank sections for a top-level repos +# (e.g. +# git.kernel.org could use "pub" as section name, though, since all git repos +# are currently under //git.kernel.org/pub/**/* +{ + mkdir(my $d = "$tmp/blanksection"); + my $cfg = cfg_new($d, <lookup_cindex(''); + is ref($csrch), 'PublicInbox::CodeSearch', 'codesearch w/ blank name'; + is_deeply $csrch->{localprefix}, [ "$tmp" ], 'localprefix respected'; + my $nr = 0; + $cfg->each_cindex(sub { + my ($cs, @rest) = @_; + is $cs->{topdir}, $csrch->{topdir}, 'each_cindex works'; + is_deeply \@rest, [ '.' ], 'got expected arg'; + ++$nr; + }, '.'); + is $nr, 1, 'iterated through cindices'; +} + done_testing; diff --git a/xt/solver.t b/xt/solver.t index 51b4144c..372d003b 100644 --- a/xt/solver.t +++ b/xt/solver.t @@ -10,6 +10,7 @@ use_ok($_) for @psgi; use_ok 'PublicInbox::WWW'; my $cfg = PublicInbox::Config->new; my $www = PublicInbox::WWW->new($cfg); +$www->preload; my $app = sub { my $env = shift; $env->{'psgi.errors'} = \*STDERR; @@ -63,7 +64,7 @@ while (my ($ibx_name, $urls) = each %$todo) { skip(qq{[publicinbox "$ibx_name"] not configured}, scalar(@$urls)); } - if (!defined($ibx->{coderepo})) { + if (!defined($ibx->{-repo_objs})) { push @gone, $ibx_name; skip(qq{publicinbox.$ibx_name.coderepo not configured}, scalar(@$urls));