# Copyright (C) all contributors
# License: AGPL-3.0+

# read-only external index for coderepos
# currently, it only indexes commits and repository metadata
# (pathname, root commits); not blob contents
#
# NOTE(review): the copy of this file under review had all line breaks
# collapsed and every `<EOM ... >' span removed.  Line structure is
# restored below; any here-doc text or statement that had to be
# reconstructed is flagged with its own NOTE(review) and must be
# confirmed against the authoritative source before merging.
package PublicInbox::CodeSearch;
use v5.12;
use parent qw(PublicInbox::Search);
use PublicInbox::Config;
use PublicInbox::Search qw(retry_reopen int_val xap_terms);
use PublicInbox::Compat qw(uniqstr);
use Compress::Zlib qw(uncompress);
use constant {
	AT => 0, # author time (YYYYMMDDHHMMSS, dt: for mail)
	CT => 1, # commit time (Unix time stamp, like TS/rt: in mail)
	CIDX_SCHEMA_VER => 1, # brand new schema for code search
	# for repos (`Tr'), CT(col=1) is used for the latest tip commit time
	# in refs/{heads,tags}.  AT(col=0) may be used to store disk usage
	# in the future, but disk usage calculation is expensive w/ alternates
};

our @CODE_NRP; # lazily filled by qparse_new from @CODE_VMAP
our @CODE_VMAP = ( # [ value column, query prefix ]
	[ AT, 'd:' ], # mairix compat
	[ AT, 'dt:' ], # public-inbox mail compat
	[ CT, 'ct:' ],
);

# note: the non-X term prefix allocations are shared with Xapian omega,
# see xapian-applications/omega/docs/termprefixes.rst
# bool_pfx_internal:
#	type => 'T', # 'c' - commit, 'r' - repo GIT_DIR
#	tags are not indexed, only normal branches (refs/heads/*), not hidden
#	'P' # (pathname) GIT_DIR # uniq
#	'G' # (group) root commit (may have multiple roots)
my %bool_pfx_external = (
	oid => 'Q', # type:commit - git OID hex (40|64)-byte SHA-(1|256)
		# type:repo - rel2abs_collapsed(GIT_DIR)
	parent => 'XP',
	%PublicInbox::Search::PATCH_BOOL_COMMON,
);

my %prob_prefix = ( # copied from PublicInbox::Search
	# do we care about committer? or partial commit OID via Xapian?
	# o => 'XQ', # 'oid:' (bool) is exact, 'o:' (prob) can do partial
	%PublicInbox::Search::PATCH_PROB_COMMON,

	# default:
	'' => 'S A XQUOT XFN ' . $PublicInbox::Search::NON_QUOTED_BODY
);

# Constructor.  $dir is the top-level directory of the code index; only
# the {-f} field of $cfg is read and kept (as {-cfg_f}).
sub new {
	my ($cls, $dir, $cfg) = @_;
	# can't have a PublicInbox::Config here due to circular refs
	bless {
		topdir => $dir,
		xpfx => "$dir/cidx".CIDX_SCHEMA_VER,
		-cfg_f => $cfg->{-f},
	}, $cls;
}

# metadata key under which join data for this config file is stored
sub join_data_key ($) { "join:$_[0]->{-cfg_f}" }

# Loads and validates the join data stored in the Xapian metadata.
# Returns nothing if the key has no value, undef (after warning) if the
# value cannot be decoded or lacks the expected arrays, and the decoded
# hash ref otherwise.
sub join_data {
	my ($self) = @_;
	my $key = join_data_key($self);
	my $cur = $self->xdb->get_metadata($key) or return;
	$cur = eval { PublicInbox::Config::json()->decode(uncompress($cur)) };
	warn "E: $@ (corrupt metadata in `$key' key?)" if $@;

	# A decoded value which is not a hash ref (or a failed decode) is
	# treated as missing every field; dereferencing it as a hash would
	# throw instead of taking the warn-and-return path below.
	my @want = qw(ekeys roots ibx2root);
	my @m = @want;
	@m = grep { ref($cur->{$_}) ne 'ARRAY' } @want if ref($cur) eq 'HASH';
	if (@m) {
		# NOTE(review): the leading `W: ' of this message is
		# reconstructed; confirm.
		warn <<EOM;
W: $self->{topdir} join data for $self->{-cfg_f} missing: @m
EOM
		undef;
	} elsif (@{$cur->{ekeys}} < @{$cur->{ibx2root}}) {
		# NOTE(review): the leading `W: ' of this message is
		# reconstructed; confirm.
		warn <<EOM;
W: $self->{topdir} join data for $self->{-cfg_f} mismatched ekeys and ibx2root
EOM
		undef;
	} else {
		$cur;
	}
}

# Builds a query parser from qp_init_common and registers the range
# processors from @CODE_VMAP plus every boolean and probabilistic prefix
# declared at the top of this file.
sub qparse_new ($) {
	my ($self) = @_;
	my $qp = $self->qp_init_common;
	my $cb = $qp->can('add_valuerangeprocessor') //
		$qp->can('add_rangeprocessor'); # Xapian 1.5.0+
	if (!@CODE_NRP) {
		@CODE_NRP = map {
			$PublicInbox::Search::NVRP->new(@$_)
		} @CODE_VMAP;
	}
	$cb->($qp, $_) for @CODE_NRP;
	while (my ($name, $pfx) = each %bool_pfx_external) {
		$qp->add_boolean_prefix($name, $_) for split(/ /, $pfx);
	}
	while (my ($name, $pfx) = each %prob_prefix) {
		$qp->add_prefix($name, $_) for split(/ /, $pfx);
	}
	$qp;
}

# Returns a string of C++ source mirroring the range processors and
# prefixes registered by qparse_new.
sub generate_cxx () { # generates snippet for xap_helper.h
	# NOTE(review): both here-docs in this sub and the loop header
	# between them were lost in the copy under review.  Only the tail of
	# the qq{} line (`[0], "$x->[1]");\n}'), `ADD_RP(code_nrp[i]);' and
	# everything after the second EOM survived.  The C++ text below is a
	# reconstruction; confirm every line against upstream.
	my $ret = <<EOM;
# line ${\__LINE__} "${\__FILE__}"
static NRP *code_nrp[${\scalar(@CODE_VMAP)}];
static void code_nrp_init(void)
{
EOM
	for (0..$#CODE_VMAP) {
		my $x = $CODE_VMAP[$_];
		$ret .= qq{\tcode_nrp[$_] = new NRP($x->[0], "$x->[1]");\n}
	}
	$ret .= <<EOM;
}

# line ${\__LINE__} "${\__FILE__}"
static void qp_init_code_search(Xapian::QueryParser *qp)
{
	for (size_t i = 0; i < MY_ARRAY_SIZE(code_nrp); i++)
		qp->ADD_RP(code_nrp[i]);
EOM
	# sorted so the generated text is stable between runs
	for my $name (sort keys %bool_pfx_external) {
		for (split(/ /, $bool_pfx_external{$name})) {
			$ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n}
		}
	}
	for my $name (sort keys %prob_prefix) {
		for (split(/ /, $prob_prefix{$name})) {
			$ret .= qq{\tqp->add_prefix("$name", "$_");\n}
		}
	}
	$ret .= "}\n";
}

# returns a Xapian::Query to filter by roots
# Warns and returns nothing when $git_dir has no `P' posting or the
# first document posted under it carries no `G' terms.
sub roots_filter { # retry_reopen callback
	my ($self, $git_dir) = @_;
	my $xdb = $self->xdb;
	my $P = 'P'.$git_dir;
	my ($cur, $end) = ($xdb->postlist_begin($P), $xdb->postlist_end($P));
	if ($cur == $end) {
		warn "W: $git_dir not indexed?\n";
		return;
	}
	my @roots = xap_terms('G', $xdb, $cur->get_docid);
	if (!@roots) {
		warn "W: $git_dir has no root commits?\n";
		return;
	}
	# OR together one `G' term per root
	my $q = $PublicInbox::Search::X{Query}->new('G'.shift(@roots));
	for my $r (@roots) {
		$q = $PublicInbox::Search::X{Query}->new(
				PublicInbox::Search::OP_OR(),
				$q, 'G'.$r);
	}
	$q;
}

# Parses $qry_str and hands the query to do_enquire with CT as its third
# argument.  The query is always filtered to commit documents (`Tc');
# when $opt->{git_dir} is defined it is additionally filtered by that
# repository's roots, and nothing is returned if roots_filter yields no
# filter.
sub mset {
	my ($self, $qry_str, $opt) = @_;
	my $qp = $self->{qp} //= qparse_new($self);
	my $qry = $qp->parse_query($qry_str, $self->{qp_flags});

	# limit to commits with shared roots
	if (defined(my $git_dir = $opt->{git_dir})) {
		my $rf = retry_reopen($self, \&roots_filter, $git_dir)
			or return;
		$qry = $PublicInbox::Search::X{Query}->new(
				PublicInbox::Search::OP_FILTER(),
				$qry, $rf);
	}

	# we only want commits:
	$qry = $PublicInbox::Search::X{Query}->new(
			PublicInbox::Search::OP_FILTER(),
			$qry, 'T'.'c');
	$self->do_enquire($qry, $opt, CT);
}

# Returns { root OID hex => [ sorted, unique `P' terms ] } built by
# walking every `G' term and querying the repo (`Tr') documents which
# carry it.
sub roots2paths { # for diagnostics
	my ($self) = @_;
	my $cur = $self->xdb->allterms_begin('G');
	my $end = $self->{xdb}->allterms_end('G');
	my $qrepo = $PublicInbox::Search::X{Query}->new('T'.'r');
	my $enq = $PublicInbox::Search::X{Enquire}->new($self->{xdb});
	$enq->set_weighting_scheme($PublicInbox::Search::X{BoolWeight}->new);
	$enq->set_docid_order($PublicInbox::Search::ENQ_ASCENDING);
	my %ret;
	for (; $cur != $end; $cur++) {
		my $G_oidhex = $cur->get_termname;
		my $qry = $PublicInbox::Search::X{Query}->new(
				PublicInbox::Search::OP_FILTER(),
				$qrepo, $G_oidhex);
		$enq->set_query($qry);
		my ($size, $off, $lim) = (0, 0, 100000);
		# key is the term without its leading `G'
		my $dirs = $ret{substr($G_oidhex, 1)} = [];
		do { # page through results until an empty mset comes back
			my $mset = $enq->get_mset($off += $size, $lim);
			for my $x ($mset->items) {
				push @$dirs, xap_terms('P', $x->get_document);
			}
			$size = $mset->size;
		} while ($size);
		# no space before the inner paren: sort must see a function
		# call, not a comparison sub name (see perldoc -f sort)
		@$dirs = sort(uniqstr(@$dirs));
	}
	\%ret;
}

# Returns the docids posted under 'P'.$git_dir, warning if there is
# more than one.
sub docids_of_git_dir ($$) {
	my ($self, $git_dir) = @_;
	my @ids = $self->docids_by_postlist('P'.$git_dir);
	warn <<EOM if @ids > 1;
BUG: (non-fatal) $git_dir indexed multiple times in $self->{topdir}
EOM
	@ids;
}

# Returns the `G' terms of the document(s) for $git_dir, warning if the
# directory has no document.  De-duplicated only when several documents
# were found.
sub root_oids ($$) {
	my ($self, $git_dir) = @_;
	my @ids = docids_of_git_dir($self, $git_dir) or warn <<EOM;
BUG? (non-fatal) `$git_dir' not indexed in $self->{topdir}
EOM
	my @ret = map { xap_terms('G', $self->xdb, $_) } @ids;
	@ret = uniqstr(@ret) if @ids > 1;
	@ret;
}

# Returns { path => [ root OIDs ] }.  With a $paths hash ref only its
# keys are looked up via root_oids; without one, the result of
# roots2paths is inverted and each list of roots is sorted.
sub paths2roots {
	my ($self, $paths) = @_;
	my %ret;
	if ($paths) {
		for my $p (keys %$paths) {
			@{$ret{$p}} = root_oids($self, $p);
		}
	} else {
		my $tmp = roots2paths($self);
		for my $root_oidhex (keys %$tmp) {
			my $dirs = delete $tmp->{$root_oidhex};
			push @{$ret{$_}}, $root_oidhex for @$dirs;
		}
		@$_ = sort(@$_) for values %ret;
	}
	\%ret;
}

# Returns int_val($doc, CT) for the first loadable document of
# $git_dir, or nothing if there is none.
sub load_ct { # retry_reopen cb
	my ($self, $git_dir) = @_;
	my @ids = docids_of_git_dir($self, $git_dir) or return;
	for (@ids) {
		my $doc = $self->get_doc($_) // next;
		return int_val($doc, CT);
	}
	# The value of a sub whose last statement is a loop is unspecified
	# and load_commit_times tests the result with defined(), so say
	# "nothing found" explicitly.
	return;
}

# Fills in $rec->[0] for every record whose git_dir this index knows
# about; records it cannot resolve are left in @$todo.
sub load_commit_times { # each_cindex callback
	my ($self, $todo) = @_; # todo = [ [ time, git ], [ time, git ] ...]
	my (@pending, $rec, $ct);
	while ($rec = shift @$todo) {
		$ct = $self->retry_reopen(\&load_ct, $rec->[1]->{git_dir});
		if (defined $ct) {
			$rec->[0] = $ct;
		} else { # may be in another cindex:
			push @pending, $rec;
		}
	}
	@$todo = @pending;
}

# Associates the coderepos of this index with inboxes of $pi_cfg using
# the stored join data: fills {-repo_objs}, {-cr_score} and {-csrch} on
# inboxes, {ibx_score} on git objects, and {-repo_objs}/{-cr_score} on
# $pi_cfg->ALL when that exists.
sub load_coderepos { # each_cindex callback
	my ($self, $pi_cfg) = @_;
	my $name = $self->{name};
	my $cfg_f = $pi_cfg->{-f};
	# NOTE(review): everything from this here-doc through the
	# assignment of $coderepos was lost in the copy under review; only
	# `{-coderepos};' survived.  The warning text, the construction of
	# $lre (used by the s!$lre!...! below) and the assumption that
	# {localprefix} is an array ref are reconstructions; confirm.
	my $lpfx = $self->{localprefix} or return warn <<EOM;
W: cindex.$name.localprefix not configured in $cfg_f, ignoring cindex.$name
EOM
	my $lre = join('|', map {
			$_ .= '/' if substr($_, -1) ne '/';
			tr!/!/!s; # squeeze redundant slashes
			quotemeta;
		} @$lpfx);
	$lre = qr!\A(?:$lre)!;
	my $coderepos = $pi_cfg->{-coderepos};
	my $nick_pfx = $name eq '' ? '' : "$name/";
	my %dir2cr;
	for my $p ($self->all_terms('P')) {
		my $nick = $p;
		$nick =~ s!$lre!$nick_pfx!s or next;
		$dir2cr{$p} = $coderepos->{$nick} //= do {
			my $git = PublicInbox::Git->new($p);
			my %dedupe = ($nick => undef);
			($git->{nick}) = keys %dedupe; # for git->pub_urls
			$git;
		};
	}
	# NOTE(review): the leading `W: $self->' of this message is
	# reconstructed; confirm.
	my $jd = $self->retry_reopen(\&join_data, $self) or return warn <<EOM;
W: $self->{topdir} has no usable join data for $cfg_f
EOM
	my ($ekeys, $roots, $ibx2root) = @$jd{qw(ekeys roots ibx2root)};
	my $roots2paths = roots2paths($self);
	my %dedupe; # 50x alloc reduction w/ lore + gko mirror (Mar 2024)
	for my $root_offs (@$ibx2root) {
		# shift unconditionally so ekeys stay aligned with ibx2root
		my $ekey = shift(@$ekeys) // die 'BUG: {ekeys} empty';
		scalar(@$root_offs) or next;
		my $ibx = $pi_cfg->lookup_eidx_key($ekey) // do {
			warn "W: `$ekey' gone from $cfg_f\n";
			next;
		};
		my $gits = $ibx->{-repo_objs} //= [];
		my $cr_score = $ibx->{-cr_score} //= {};
		my %ibx_p2g = map { $_->{git_dir} => $_ } @$gits;
		my $ibx2self; # cindex has an association w/ inbox?
		for (@$root_offs) { # sorted by $nr descending
			my ($nr, $root_off) = @$_;
			# NOTE(review): the warning text, the `next' and the
			# lookup into $roots2paths were lost in the copy
			# under review (only `{$root_oid};' survived) and
			# are reconstructed; confirm.
			my $root_oid = $roots->[$root_off] // do {
				warn <<EOM;
BUG: root #$root_off invalid in join data for `$ekey' with $cfg_f
EOM
				next;
			};
			my $git_dirs = $roots2paths->{$root_oid};
			my @gits = map { $dir2cr{$_} // () } @$git_dirs;
			$cr_score->{$_->{nick}} //= $nr for @gits;
			# drop directories this inbox is already joined to
			@$git_dirs = grep { !$ibx_p2g{$_} } @$git_dirs;
			# @$git_dirs or warn "W: no matches for $root_oid\n";
			for (@$git_dirs) {
				if (my $git = $dir2cr{$_}) {
					$ibx_p2g{$_} = $git;
					$ibx2self = 1;
					if (!$ibx->{-hide_www}) {
						# don't stringify $nr directly
						# to avoid long-lived PV
						my $k = ($nr + 0)."\0".
							($ibx + 0);
						my $s = $dedupe{$k} //=
							[ $nr, $ibx->{name} ];
						push @{$git->{ibx_score}}, $s;
					}
					push @$gits, $git;
				} else {
					# NOTE(review): this warning text and
					# the `if (@\$gits)' condition below
					# were lost in the copy under review
					# (only `{-csrch}}, \$self if
					# \$ibx2self;' survived) and are
					# reconstructed; confirm.
					warn <<EOM;
W: no coderepo available for $_ (localprefix=@$lpfx)
EOM
				}
			}
		}
		if (@$gits) {
			push @{$ibx->{-csrch}}, $self if $ibx2self;
		} else {
			delete $ibx->{-repo_objs};
			delete $ibx->{-cr_score};
		}
	}
	for my $git (values %dir2cr) {
		my $s = $git->{ibx_score};
		@$s = sort { $b->[0] <=> $a->[0] } @$s if $s;
	}
	my $ALL = $pi_cfg->ALL or return;
	# repos with the most ibx_score entries first
	my @alls_gits = sort {
		scalar(@{$b->{ibx_score} // []}) <=>
			scalar(@{$a->{ibx_score} // []})
	} values %$coderepos;
	my $gits = $ALL->{-repo_objs} //= [];
	push @$gits, @alls_gits;
	my $cr_score = $ALL->{-cr_score} //= {};
	$cr_score->{$_->{nick}} //= scalar(@{$_->{ibx_score}//[]}) for @$gits;
}

# Takes a config object followed by PublicInbox::Git objects and
# returns [ commit time, git ] records sorted newest first; objects no
# cindex could resolve keep a time of 0.
sub repos_sorted {
	my $pi_cfg = shift;
	my @recs = map { [ 0, $_ ] } @_; # PublicInbox::Git objects
	my @todo = @recs; # shares the inner array refs with @recs
	$pi_cfg->each_cindex(\&load_commit_times, \@todo);
	@recs = sort { $b->[0] <=> $a->[0] } @recs; # sort by commit time
}

1;