From 68b310207929db23667ca5d454a78af9d65589f2 Mon Sep 17 00:00:00 2001
From: Eric Wong
Date: Sat, 4 Feb 2017 02:20:35 +0000
Subject: repobrowse: start wiring up git search

Much more work on this will be needed, but at least explicit
flush points prevent OOMs on my system.
---
 MANIFEST                            |   4 +
 lib/PublicInbox/RepoGitSearch.pm    | 210 ++++++++++++++++
 lib/PublicInbox/RepoGitSearchIdx.pm | 478 ++++++++++++++++++++++++++++++++++++
 script/repobrowse-index             |  73 ++++++
 t/repo_git_search_idx.t             |  30 +++
 5 files changed, 795 insertions(+)
 create mode 100644 lib/PublicInbox/RepoGitSearch.pm
 create mode 100644 lib/PublicInbox/RepoGitSearchIdx.pm
 create mode 100755 script/repobrowse-index
 create mode 100644 t/repo_git_search_idx.t

diff --git a/MANIFEST b/MANIFEST
index 9e4e7cab..f235dc67 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -78,6 +78,8 @@ lib/PublicInbox/NewsWWW.pm
 lib/PublicInbox/ParentPipe.pm
 lib/PublicInbox/ProcessPipe.pm
 lib/PublicInbox/Qspawn.pm
+lib/PublicInbox/RepoGitSearch.pm
+lib/PublicInbox/RepoGitSearchIdx.pm
 lib/PublicInbox/Repobrowse.pm
 lib/PublicInbox/RepobrowseBase.pm
 lib/PublicInbox/RepobrowseConfig.pm
@@ -127,6 +129,7 @@ script/public-inbox-mda
 script/public-inbox-nntpd
 script/public-inbox-watch
 script/public-inbox.cgi
+script/repobrowse-index
 scripts/dc-dlvr
 scripts/dc-dlvr.pre
 scripts/edit-sa-prefs
@@ -179,6 +182,7 @@ t/psgi_attach.t
 t/psgi_mount.t
 t/psgi_text.t
 t/qspawn.t
+t/repo_git_search_idx.t
 t/repobrowse.t
 t/repobrowse_common_git.perl
 t/repobrowse_git.t
diff --git a/lib/PublicInbox/RepoGitSearch.pm b/lib/PublicInbox/RepoGitSearch.pm
new file mode 100644
index 00000000..0c94a7b7
--- /dev/null
+++ b/lib/PublicInbox/RepoGitSearch.pm
@@ -0,0 +1,210 @@
+# Copyright (C) 2017 all contributors
+# License: AGPL-3.0+
+#
+# Read-only search interface for use by the Repobrowse web interface
+# RepoGitSearchIdx builds upon this for writing a Xapian DB.
+package PublicInbox::RepoGitSearch;
+use strict;
+use warnings;
+use Search::Xapian qw/:standard/;
+
+# values for ranges and sorting
+use constant {
+    CD => 0, # commit date stamp (YYYYMMDD)
+    AD => 1, # author date stamp (YYYYMMDD)
+
+    REPO_SCHEMA_VERSION => 1,
+    # n.b. FLAG_PURE_NOT is expensive and not suitable for a public website
+    # as it could become a denial-of-service vector
+    QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD,
+};
+our $LANG = 'english';
+
+# boolean prefix kept for internal use; qp() does not register it
+my %bool_pfx_internal = (
+    type => 'T', # "commit", "tag", or "ref"
+);
+
+# boolean prefixes registered with the query parser by qp()
+my %bool_pfx_external = (
+    ref => 'XREF', # refname (belongs to)
+);
+
+# probabilistic (free-text) prefixes: query prefix => one or more
+# space-separated Xapian term prefixes
+my %prob_prefix = (
+    id => 'Q', # git object ID, partial matches supported
+    p => 'XP', # parent commit (partial)
+    s => 'S', # subject
+    a => 'A', # Author name + email
+    c => 'XC', # Committer name + email
+    ac => 'A XC', # Author and Committer name + email
+    b => 'XBODY', # commit message body
+    bs => 'S XBODY', # commit message (subject + body)
+    diff_fn => 'XDFN', # changed filenames
+    diff_hdr => 'XDHH', # diff hunk header
+    diff_ctx => 'XDCTX', # diff context
+    diff_a => 'XDFA', # diff a/ file (before)
+    diff_b => 'XDFB', # diff b/ file (after)
+    diff => 'XDFN XDHH XDCTX XDFA XDFB', # entire diff
+    preimg => 'XPRE', # blob pre-image (full)
+    postimg => 'XPOST', # blob post-image (full)
+    # default:
+    '' => 'Q XP S A XC XBODY XDFN XDHH XDCTX XDFA XDFB XPRE XPOST',
+);
+
+# flat list of (prefix => description) pairs, returned by help()
+our @HELP = (
+    's:' => 'match within message subject e.g. s:"a quick brown fox"',
+    # NOTE(review): the ad: heredoc and the cd: key were garbled in the
+    # copy of the patch under review; the ad: wording below is a
+    # reconstruction -- confirm it against the upstream commit.
+    'ad:' => <<EOF,
+Author date range as YYYYMMDD  e.g. ad:19931002..20101002
+Open-ended ranges such as ad:19931002.. and ad:..20101002
+are also supported
+EOF
+    'cd:' => 'Committer date range as YYYYMMDD, see ad: above',
+    'b:' => 'match within commit message body',
+    'bs:' => 'match within the commit message subject and body',
+);
+chomp @HELP;
+
+my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix);
+
+# Constructor.  $git_dir is the git repository; the Xapian DB lives in
+# "$repo_dir/xr<REPO_SCHEMA_VERSION>", with $repo_dir defaulting to
+# "$git_dir/public-inbox".  Nothing is opened until xdb() is called.
+sub new {
+    my ($class, $git_dir, $repo_dir) = @_;
+    $repo_dir ||= "$git_dir/public-inbox";
+    my $xdir = "$repo_dir/xr".REPO_SCHEMA_VERSION;
+    bless { git_dir => $git_dir, xdir => $xdir }, $class;
+}
+
+# lazily opens (and caches) the read-only Xapian database;
+# overridden by RepoGitSearchIdx
+sub xdb ($) { $_[0]->{xdb} ||= Search::Xapian::Database->new($_[0]->{xdir}) }
+
+# Runs $cb, reopening the database and retrying (up to three attempts)
+# when Xapian reports that the revision being read was discarded.
+# Any other exception is re-thrown.  Returns the result of $cb.
+sub retry_reopen ($$) {
+    my ($self, $cb) = @_;
+    my $ret;
+    for (1..3) {
+        eval { $ret = $cb->() };
+        return $ret unless $@;
+        # Exception: The revision being read has been discarded -
+        # you should call Xapian::Database::reopen()
+        if (ref($@) eq 'Search::Xapian::DatabaseModifiedError') {
+            $self->{xdb}->reopen;
+        } else {
+            die;
+        }
+    }
+    # every attempt hit DatabaseModifiedError: say so instead of
+    # silently falling off the end of the loop
+    warn "retry_reopen: giving up after 3 reopen attempts\n";
+    return;
+}
+
+# Runs $query once.  Recognized $opts keys: relevance (sort by relevance
+# before the AD value), asc, offset (default 0) and limit (default 50).
+sub _enquire_once ($$$) {
+    my ($self, $query, $opts) = @_;
+    my $enq = $self->{enquire} ||= Search::Xapian::Enquire->new($self->xdb);
+    $enq->set_query($query);
+    $opts ||= {};
+    my $desc = !$opts->{asc};
+    if ($opts->{relevance}) {
+        $enq->set_sort_by_relevance_then_value(AD, $desc);
+    } else {
+        $enq->set_sort_by_value_then_relevance(AD, $desc);
+    }
+    my $offset = $opts->{offset} || 0;
+    my $limit = $opts->{limit} || 50;
+    $enq->get_mset($offset, $limit);
+}
+
+# _enquire_once() wrapped in retry_reopen()
+sub _do_enquire ($$$) {
+    my ($self, $query, $opts) = @_;
+    retry_reopen($self, sub { _enquire_once($self, $query, $opts) });
+}
+
+# returns a new stemmer for $LANG
+sub stemmer () { Search::Xapian::Stem->new($LANG) }
+
+# read-only: builds the QueryParser on first use and caches it
+sub qp ($) {
+    my ($self) = @_;
+
+    my $qp = $self->{query_parser};
+    return $qp if $qp;
+
+    # new parser
+    $qp = Search::Xapian::QueryParser->new;
+    $qp->set_default_op(OP_AND);
+    $qp->set_database($self->xdb);
+    $qp->set_stemmer(stemmer());
+    $qp->set_stemming_strategy(STEM_SOME);
+
+    # "ad:" and "cd:" take ranges over the AD and CD value slots
+    $qp->add_valuerangeprocessor(
+        Search::Xapian::NumberValueRangeProcessor->new(AD, 'ad:'));
+    $qp->add_valuerangeprocessor(
+        Search::Xapian::NumberValueRangeProcessor->new(CD, 'cd:'));
+
+    while (my ($name, $prefix) = each %bool_pfx_external) {
+        $qp->add_boolean_prefix($name, $prefix);
+    }
+
+    while (my ($name, $prefix) = each %prob_prefix) {
+        $qp->add_prefix($name, $_) foreach split(/ /, $prefix);
+    }
+
+    $self->{query_parser} = $qp;
+}
+
+# returns begin and end PostingIterator
+sub find_docids ($$) {
+    my ($self, $termval) = @_;
+    my $db = $self->xdb;
+    ($db->postlist_begin($termval), $db->postlist_end($termval));
+}
+
+# returns the docid of the only document containing $termval, undef
+# if there is none; dies if more than one document has the term
+sub find_unique_docid ($$$) {
+    my ($self, $termval) = @_;
+    my ($begin, $end) = find_docids($self, $termval);
+    return undef if $begin->equal($end); # not found
+    my $rv = $begin->get_docid;
+    # sanity check
+    $begin->inc;
+    $begin->equal($end) or die "Term '$termval' is not unique\n";
+    $rv;
+}
+
+# returns a reference to @HELP
+sub help ($) {
+    my ($self) = @_;
+    \@HELP;
+}
+
+# read-only: parses and runs $query_string, returning the match set
+sub query {
+    my ($self, $query_string, $opts) = @_;
+    my $query;
+
+    $opts ||= {};
+    unless ($query_string eq '') {
+        $query = qp($self)->parse_query($query_string, QP_FLAGS);
+        $opts->{relevance} = 1 unless exists $opts->{relevance};
+    }
+
+    _do_enquire($self, $query, $opts);
+}
+
+1;
diff --git a/lib/PublicInbox/RepoGitSearchIdx.pm b/lib/PublicInbox/RepoGitSearchIdx.pm
new file mode 100644
index 00000000..333558ca
--- /dev/null
+++ b/lib/PublicInbox/RepoGitSearchIdx.pm
@@ -0,0 +1,478 @@
+# Copyright (C) 2017 all contributors
+# License: AGPL-3.0+
+#
+# Qrefs/(tags|heads)/foo => 40-byte SHA1 hex of commit
+# Q$SHA1HEX_OF_COMMIT
+#
+# Indexes any git repository with Xapian; intended for code;
+# see PublicInbox::SearchIdx for a mail-specific indexer
+package PublicInbox::RepoGitSearchIdx;
+use strict;
+use warnings;
+use bytes (); # only for bytes::length
+use base qw(PublicInbox::RepoGitSearch); # base is read-only
+use POSIX qw(strftime);
+use PublicInbox::Git;
+use PublicInbox::GitIdx;
+*xpfx = *PublicInbox::RepoGitSearch::xpfx;
+use constant {
+    Z40 => ('0' x 40),
+    STATE_GPGSIG => -0x80000000,
+    DEBUG => !!$ENV{DEBUG},
+    BATCH_BYTES => 1_000_000,
+};
+
+# Constructor: takes the same arguments as the base class; additionally
+# opens a PublicInbox::Git handle and saves git_umask_for() for xdb()
+sub new {
+    my ($class, $git_dir, $repo_dir) = @_;
+    require Search::Xapian::WritableDatabase;
+    my $self = $class->SUPER::new($git_dir, $repo_dir);
+    my $git = $self->{git} = PublicInbox::Git->new($git_dir);
+    $self->{want_refs_re} = qr!^refs/(?:heads|tags)/!;
+    $self->{'umask'} = git_umask_for($git);
+    $self;
+}
+
+# overrides the read-only xdb(): opens the Xapian DB for writing, creating
+# the directory if needed, inside with_umask() using the value from new()
+sub xdb ($) {
+    my ($self) = @_;
+    $self->{xdb} ||= with_umask($self->{'umask'}, sub {
+        my $xdir = $self->{xdir};
+        unless (-d $xdir) {
+            require File::Path;
+            File::Path::mkpath($xdir);
+        }
+        Search::Xapian::WritableDatabase->new($xdir,
+                Search::Xapian::DB_CREATE_OR_OPEN);
+    });
+}
+
+# returns a new document carrying the "T$type" and $unique_id terms
+sub doc_new ($$) {
+    my ($type, $unique_id) = @_;
+    my $doc = Search::Xapian::Document->new;
+    $doc->add_term('T'.$type);
+    $doc->add_term($unique_id);
+    $doc;
+}
+
+# stores $num in value slot $col via Search::Xapian::sortable_serialise
+sub add_val ($$$) {
+    my ($doc, $col, $num) = @_;
+    $num = Search::Xapian::sortable_serialise($num);
+    $doc->add_value($col, $num);
+}
+
+# Calls $cb for each term of $doc, from $pfx onwards, which matches $re;
+# the matched part is removed before the callback sees the term.  The scan
+# runs to the end of the term list, so $re must anchor on the prefix.
+sub each_term_val ($$$$) {
+    my ($doc, $pfx, $re, $cb) = @_;
+    my $end = $doc->termlist_end;
+    my $i = $doc->termlist_begin;
+    $i->skip_to($pfx);
+    while ($i != $end) {
+        my $val = $i->get_termname;
+        $val =~ s/$re// and $cb->($val);
+        $i->inc;
+    }
+    undef;
+}
+
+# Looks up the document whose unique term is "Q$oid".  Returns the existing
+# document, or a fresh one of $type if none is indexed yet; $$id_ref is set
+# to the docid (undef for a fresh document).
+sub get_doc ($$$$) {
+    my ($self, $id_ref, $type, $oid) = @_;
+    my $doc;
+    my $doc_id = $self->find_unique_docid('Q'.$oid);
+    if (defined $doc_id) {
+        $doc = $self->{xdb}->get_document($doc_id);
+    } else {
+        $doc = doc_new($type, 'Q'.$oid);
+    }
+    $$id_ref = $doc_id;
+    $doc;
+}
+
+# increments and returns update generation counter
+sub update_id ($) {
+    my ($self) = @_;
+    my $db = $self->{xdb};
+    my $update_id = int($db->get_metadata('last_update_id') || 0);
+    $db->set_metadata('last_update_id', ++$update_id);
+    $update_id;
+}
+
+# replaces document $doc_id if defined, else adds $doc; returns the docid
+sub replace_or_add ($$$) {
+    my ($db, $doc_id, $doc) = @_;
+    # update our ref:
+    if (defined $doc_id) {
+        $db->replace_document($doc_id, $doc);
+    } else {
+        $doc_id = $db->add_document($doc);
+    }
+    $doc_id;
+}
+
+# returns a hashref of the refnames recorded in $doc's XREF terms
+sub doc_refnames {
+    my ($doc) = @_;
+    my %cur;
+    each_term_val($doc, 'XREF', qr/^XREF/, sub { $cur{$_[0]} = 1 });
+    \%cur;
+}
+
+# Parses the parenthesized "--decorate" list which follows a commit ID and
+# records every wanted ref not already on $doc in $self->{-active_refs},
+# mapping refname => $oid.
+sub decor_update {
+    my ($self, $doc, $decor, $oid) = @_;
+
+    # load all current refs
+    my $cur = doc_refnames($doc);
+    my $want = $self->{want_refs_re};
+    ($decor) = ($decor =~ m!\((.+)\)!);
+    foreach (split(/, /, $decor)) {
+        my ($sym, $refname, $tag);
+        if (/^(\S+) -> (\S+)\z/) {
+            ($sym, $refname) = ($1, $2);
+        } elsif (s/^tag: //) {
+            $refname = $_;
+            $tag = 1;
+        } else {
+            $refname = $_;
+        }
+        next if $cur->{$refname};
+        if ($refname =~ $want) {
+            $self->{-active_refs}->{$refname} = $oid;
+        }
+        # TODO: handle $sym, and do something with tags
+    }
+}
+
+# adds an XREF term to $doc for each active ref it lacks;
+# returns the number of terms added
+sub update_ref_contains ($$) {
+    my ($self, $doc) = @_;
+    my $cur = doc_refnames($doc);
+    my $n = 0;
+    my @active = keys %{$self->{-active_refs}};
+    for (@active) {
+        next if $cur->{$_};
+        $doc->add_term('XREF'.$_);
+        ++$n;
+    }
+    $n;
+}
+
+# writes $doc back to the DB if it is new or gained XREF terms
+sub commit_doc ($$$) {
+    my ($self, $doc_id, $doc) = @_;
+    my $n = update_ref_contains($self, $doc);
+    if ($n || !defined($doc_id)) {
+        replace_or_add($self->{xdb}, $doc_id, $doc);
+    }
+}
+
+# lazily creates and caches the TermGenerator
+sub term_generator ($) { # write-only
+    my ($self) = @_;
+
+    $self->{term_generator} ||= eval {
+        my $tg = Search::Xapian::TermGenerator->new;
+        $tg->set_stemmer($self->stemmer);
+        $tg;
+    };
+}
+
+# indexes $text under $pfx, then bumps the term position
+sub index_text_inc ($$$) {
+    my ($tg, $text, $pfx) = @_;
+    $tg->index_text($text, 1, $pfx);
+    $tg->increase_termpos;
+}
+
+# indexes a blob ID under $pfx unless it is the all-zero placeholder
+sub index_blob_id ($$$) {
+    my ($tg, $blob_id, $pfx) = @_;
+    index_text_inc($tg, $blob_id, $pfx) if $blob_id ne Z40;
+}
+
+# Feeds `git log -p --raw` output for $range into Xapian, one document per
+# commit.  On reaching a commit which is already indexed it switches to a
+# cheaper decorate-only `git log` that only refreshes ref membership.
+# Returns the first commit ID seen (the tip of $range), or undef if no
+# commit line was read.
+sub each_log_line ($$) {
+    my ($self, $range) = @_;
+    my $log = $self->{git}->popen(qw(log --decorate=full --pretty=raw
+            --no-color --no-abbrev --no-notes
+            -r --raw -p
+            ), $range, '--');
+    my $db = $self->{xdb};
+    my ($doc, $doc_id);
+    my $tg = term_generator($self);
+    my $state = 0; # 1: subject, 2: body, 3: diff, 4: diff -c
+    my $tip;
+    my $hex = '[a-f0-9]+';
+    my ($cc_ins, $cc_del);
+    my $batch = BATCH_BYTES;
+    my $decorate_only;
+
+    local $/ = "\n";
+    while (defined(my $l = <$log>)) {
+        $batch -= bytes::length($l);
+        if ($l =~ /^commit (\S+)(\s+\([^\)]+\))?/) {
+            my ($oid, $decor) = ($1, $2);
+            commit_doc($self, $doc_id, $doc) if $doc;
+            $tip ||= $oid;
+            $state = 0;
+            $cc_ins = $cc_del = undef;
+
+            # prevent OOM
+            if ($batch <= 0) {
+                $db->flush;
+                $batch = BATCH_BYTES;
+            }
+            $doc = get_doc($self, \$doc_id, 'commit', $oid);
+            decor_update($self, $doc, $decor, $oid) if $decor;
+            # old commit
+            if (defined $doc_id) {
+                $decorate_only = $oid;
+                last;
+            }
+
+            # new commit:
+            $tg->set_document($doc);
+            $doc->set_data($oid);
+            $doc->add_term('Q' . $oid);
+            index_text_inc($tg, $oid, 'Q');
+        } elsif ($l =~ /^parent (\S+)/) {
+            my $parent = $1;
+            index_text_inc($tg, $parent, 'XP');
+        } elsif ($l =~ /^author ([^<]*?<[^>]+>) (\d+)/) {
+            my ($au, $at) = ($1, $2);
+            index_text_inc($tg, $au, 'A');
+            add_val($doc, PublicInbox::RepoGitSearch::AD,
+                strftime('%Y%m%d', gmtime($at)));
+        } elsif ($l =~ /^committer ([^<]*?<[^>]+>) (\d+)/) {
+            my ($cu, $ct) = ($1, $2);
+            index_text_inc($tg, $cu, 'XC');
+            add_val($doc, PublicInbox::RepoGitSearch::CD,
+                strftime('%Y%m%d', gmtime($ct)));
+        } elsif ($l =~ /^gpgsig /) {
+            $state = STATE_GPGSIG;
+        } elsif ($l =~ /^mergetag /) {
+            $state = -1;
+        } elsif ($state < 0) { # inside mergetag or gpgsig
+            if ($l eq " \n") { # paragraph
+                $state--;
+                $tg->increase_termpos;
+            } elsif ($l eq "-----BEGIN PGP SIGNATURE-----\n") {
+                # no point in indexing a PGP signature
+                $state = STATE_GPGSIG;
+            } elsif ($state == -2) { # mergetag subject
+                $tg->index_text($l, 1);
+                $tg->increase_termpos;
+            } elsif ($state < -2 && $state > STATE_GPGSIG) {
+                $tg->index_text($l); # mergetag body
+            } elsif ($l eq "\n") {
+                # end of mergetag, onto normal commit message
+                $tg->increase_termpos;
+                $state = 0;
+            } elsif ($l =~ /^ (?:tag|tagger|type) /) {
+                # ignored
+            } elsif (DEBUG) {
+                if ($state <= STATE_GPGSIG) {
+                    # skip
+                } else {
+                    warn "unhandled mergetag: $l";
+                }
+            }
+        } elsif ($state < 3 && $l =~ s/^ //) { # subject and body
+            if ($state > 0) {
+                $l =~ /\S/ ? $tg->index_text($l, 1)
+                        : $tg->increase_termpos;
+                $state = 2;
+            } else {
+                $state = 1;
+                $tg->index_text($l, 1, 'S') if $l ne "\n";
+            }
+        } elsif ($l =~ /^:\d{6} \d{6} ($hex) ($hex) (\S+)\t+(.+)/o) {
+            # --raw output (regular)
+            my ($pre, $post, $chg, $names) = ($1, $2, $3, $4);
+            index_blob_id($tg, $pre, 'XPRE');
+            index_blob_id($tg, $post, 'XPOST');
+        } elsif ($l =~ /^(::+)(?:\d{6} )+ ($hex .+)? (\S+)\t+(.+)/o) {
+            # --raw output (combined)
+            my ($colons, $blobs, $chg, $names) = ($1, $2, $3, $4);
+            my @blobs = split(/ /, $blobs);
+            my $post = pop @blobs;
+            my $n = length($colons);
+            if (scalar(@blobs) != $n) {
+                die "combined raw parsed wrong:\n$l\n//\n";
+            }
+            index_blob_id($tg, $_, 'XPRE') foreach @blobs;
+            index_blob_id($tg, $post, 'XPOST');
+            unless ($cc_ins) {
+                $n--;
+                $cc_ins = qr/^ {0,$n}[\+]\s*(.*)/;
+                $cc_del = qr/^ {0,$n}[\-]\s*(.*)/;
+            }
+        } elsif ($l =~ m!^diff --git (?:"?a/.+?) (?:"?b/.+)!) {
+            # regular diff, filenames handled by --raw
+            $state = 3;
+        } elsif ($l =~ /^diff --(?:cc|combined) (?:.+)/) {
+            # combined diff, filenames handled by --raw
+            $state = 4;
+        } elsif ($l =~ /^@@ (?:\S+) (?:\S+) @@(.*)/) {
+            my $hunk_hdr = $1;
+            # regular hunk header context
+            $hunk_hdr =~ /\S/ and
+                index_text_inc($tg, $hunk_hdr, 'XDHH');
+        # not currently handled:
+        } elsif ($l =~ /^index (?:$hex)\.\.(?:$hex)/o) {
+        } elsif ($l =~ /^index (?:$hex,[^\.]+)\.\.(?:$hex)(.*)$/o) {
+            #--cc
+        } elsif ($l =~ /^(?:@@@+) (?:\S+.*\S+) @@@+\z/) { # --cc
+        } elsif ($l =~ /^(?:old|new) mode/) {
+        } elsif ($l =~ /^(?:deleted|new) file mode/) {
+        } elsif ($l =~ /^tree (?:\S+)/) {
+        } elsif ($l =~ /^(?:copy|rename) (?:from|to) /) {
+        } elsif ($l =~ /^(?:dis)?similarity index /) {
+        } elsif ($l =~ /^\\ No newline at end of file/) {
+        } elsif ($l =~ /^Binary files .* differ/) {
+        } elsif ($l =~ /^--- /) { # preimage filename
+        } elsif ($l =~ /^\+\+\+ /) { # postimage filename
+        } elsif ($state == 3) { # diff --git
+            if ($l =~ s/^\+//) {
+                index_text_inc($tg, $l, 'XDFB');
+            } elsif ($l =~ s/^\-//) {
+                index_text_inc($tg, $l, 'XDFA');
+            } elsif ($l =~ s/^ //) {
+                index_text_inc($tg, $l, 'XDCTX');
+            } elsif (DEBUG) {
+                if ($l eq "\n") {
+                } else {
+                    warn "unhandled diff -u $l";
+                }
+            }
+        } elsif ($state == 4) { # diff --cc/combined
+            if ($l =~ $cc_ins) {
+                index_text_inc($tg, $1, 'XDFB');
+            } elsif ($l =~ $cc_del) {
+                index_text_inc($tg, $1, 'XDFA');
+            } elsif ($l =~ s/^ //) {
+                index_text_inc($tg, $l, 'XDCTX');
+            } elsif (DEBUG) {
+                if ($l eq "\n") {
+                } else {
+                    warn "unhandled diff --cc $l";
+                }
+            }
+        } elsif (DEBUG) {
+            warn "wtf $state $l\n" if $l ne "\n";
+        }
+    }
+
+    # optimization: we go into decorate-only mode once we start
+    # seeing commits we've already seen to save git from having
+    # to generate diffs and us from having to skip lines we
+    # don't care about:
+    if (defined $decorate_only) {
+        $doc = undef;
+        # SIGPIPE existing git log, spawn a new fast one
+        $log = $self->{git}->popen(qw(log --decorate=full
+                        --pretty=format:%H%d),
+                        $decorate_only);
+
+        while (defined(my $l = <$log>)) {
+            $l =~ /^(\S+)(\s+\([^\)]+\))?/ or die "bad line: $l";
+            my ($oid, $decor) = ($1, $2);
+            commit_doc($self, $doc_id, $doc) if $doc;
+
+            # do not buffer more than 100 docs before flush
+            if (($batch -= 10000) <= 0) {
+                $db->flush;
+                $batch = BATCH_BYTES;
+            }
+            $doc = get_doc($self, \$doc_id, 'commit', $oid);
+            decor_update($self, $doc, $decor, $oid) if $decor;
+        }
+    }
+
+    commit_doc($self, $doc_id, $doc) if $doc;
+    $tip;
+}
+
+# Indexes everything reachable from $end (an object ID) on behalf of
+# $refname, resuming from the tip stored in the ref's document when the
+# update is a fast-forward; other refs seen via decorations are updated too.
+sub index_top_ref ($$$) {
+    my ($self, $refname, $end) = @_;
+    my $doc_id;
+    my $db = xdb($self);
+    my $ref_doc = get_doc($self, \$doc_id, 'ref', $refname);
+    my $begin = defined $doc_id ? $ref_doc->get_data : '';
+    my $active = $self->{-active_refs} = { $refname => undef };
+    my $git = $self->{git};
+
+    # check for discontiguous branches (from "push --force")
+    if ($begin ne '') {
+        my $base = $git->qx(qw(merge-base), $begin, $end);
+        chomp $base;
+        if ($base ne $begin) {
+            warn "$refname updated with force\n";
+            # TODO: cleanup_forced_update($self, $refname);
+            $begin = '';
+        }
+    }
+    my $range = $begin eq '' ? $end : "$begin^0..$end^0";
+    my $tip = each_log_line($self, $range);
+    my $progress = $self->{progress};
+    if (defined $tip) {
+        $ref_doc->set_data($tip);
+        print $progress "$refname => $tip\n" if $progress;
+        replace_or_add($db, $doc_id, $ref_doc);
+    }
+
+    # update all decorated refs which got snowballed into this one
+    delete $active->{$refname};
+    foreach my $ref (keys %$active) {
+        $ref_doc = get_doc($self, \$doc_id, 'ref', $ref);
+        $ref_doc->set_data($active->{$ref});
+        if ($progress) {
+            print $progress "$ref => $active->{$ref} ($refname)\n";
+        }
+        replace_or_add($db, $doc_id, $ref_doc);
+    }
+    $db->flush;
+}
+
+# main entry sub: indexes every ref matching want_refs_re, newest first.
+# $opts->{progress}, if set, is a filehandle for progress messages.
+# NOTE(review): $opts->{reindex} is passed by the script but not read here.
+sub index_sync {
+    my ($self, $opts) = @_;
+    $self->{progress} = $opts->{progress};
+    my $db = xdb($self);
+    $self->{-update_id} = update_id($self);
+    # go for most recent refs, first, since that reduces the amount
+    # of work we have to do.
+    my $refs = $self->{git}->popen(qw(for-each-ref --sort=-creatordate));
+    local $/ = "\n";
+    while (defined(my $line = <$refs>)) {
+        chomp $line;
+        my ($oid, $type, $refname) = split(/\s+/, $line);
+        next unless $refname =~ $self->{want_refs_re};
+        next unless $type eq 'commit' || $type eq 'tag';
+        index_top_ref($self, $refname, $oid);
+    }
+    $db->flush;
+}
+
+1;
diff --git a/script/repobrowse-index b/script/repobrowse-index
new file mode 100755
index 00000000..6e939fd5
--- /dev/null
+++ b/script/repobrowse-index
@@ -0,0 +1,73 @@
+#!/usr/bin/perl -w
+# Copyright (C) 2017 all contributors
+# License: AGPL-3.0+
+# Basic tool to create a Xapian search index for any git repository
+# Usage with libeatmydata
+# highly recommended: eatmydata repobrowse-index GIT_DIR
+use strict;
+use warnings;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
+use Cwd 'abs_path';
+my $usage = "repobrowse-index GIT_DIR";
+
+eval { require PublicInbox::RepoGitSearchIdx };
+if ($@) {
+    print STDERR "Search::Xapian required for $0\n";
+    exit 1;
+}
+
+my $reindex;
+my %opts = ( '--reindex' => \$reindex );
+GetOptions(%opts) or die "bad command-line args\n$usage";
+
+my @dirs;
+# Runs "git rev-parse --git-dir" (after chdir to $cd in the child, if
+# given) and returns the absolute path of the git directory.
+sub resolve_git_dir {
+    my ($cd) = @_;
+    my @cmd = qw(git rev-parse --git-dir);
+    my $cmd = join(' ', @cmd);
+    my $pid = open my $fh, '-|';
+    defined $pid or die "forking $cmd failed: $!\n";
+    if ($pid == 0) {
+        if (defined $cd) {
+            chdir $cd or die "chdir $cd failed: $!\n";
+        }
+        exec @cmd;
+        die "Failed to exec $cmd: $!\n";
+    } else {
+        my $dir = eval {
+            local $/;
+            <$fh>;
+        };
+        close $fh or die "error in $cmd: $!\n";
+        chomp $dir;
+        return abs_path($cd) if ($dir eq '.' && defined $cd);
+        # a relative answer (e.g. ".git") is relative to $cd, not our cwd
+        $dir = "$cd/$dir" if defined $cd && $dir !~ m!\A/!;
+        abs_path($dir);
+    }
+}
+
+if (@ARGV) {
+    @dirs = map { resolve_git_dir($_) } @ARGV;
+} else {
+    @dirs = (resolve_git_dir());
+}
+
+sub usage { print STDERR "Usage: $usage\n"; exit 1 }
+usage() unless @dirs;
+
+foreach my $dir (@dirs) {
+    index_dir($dir);
+}
+
+# indexes one git directory, sending progress messages to STDERR
+sub index_dir {
+    my ($git_dir) = @_;
+    if (!ref $git_dir && ! -d $git_dir) {
+        die "$git_dir does not appear to be a git repository\n";
+    }
+    my $s = PublicInbox::RepoGitSearchIdx->new($git_dir);
+    $s->index_sync({ reindex => $reindex, progress => \*STDERR });
+}
diff --git a/t/repo_git_search_idx.t b/t/repo_git_search_idx.t
new file mode 100644
index 00000000..934a4e6f
--- /dev/null
+++ b/t/repo_git_search_idx.t
@@ -0,0 +1,30 @@
+# Copyright (C) 2017 all contributors
+# License: AGPL-3.0+
+use strict;
+use warnings;
+use Test::More;
+use File::Temp qw/tempdir/;
+use_ok 'PublicInbox::RepoGitSearchIdx';
+my $test = require './t/repobrowse_common_git.perl';
+my $git_dir = $test->{git_dir};
+my $xdir = "$git_dir/rg";
+my $idx = PublicInbox::RepoGitSearchIdx->new($git_dir, $xdir);
+ok($idx->xdb && -d $xdir, 'Xapian dir created');
+$idx->index_sync;
+
+# a phrase query with the bs: prefix should find the known commit
+my $mset = $idx->query('bs:"add header"');
+my $doc;
+$doc = $_->get_document foreach $mset->items;
+ok($doc, 'got document');
+is('cb3b92d257e628b512a2eee0861f8935c594cd12', $doc->get_data, 'DATA OK');
+
+# the full object ID and a wildcard abbreviation must both match
+foreach my $q (qw(id:cb3b92d257e628b512a2eee0861f8935c594cd12 id:cb3b92d2*)) {
+    $mset = $idx->query($q);
+    $doc = undef;
+    $doc = $_->get_document foreach $mset->items;
+    ok($doc, "got document for $q");
+}
+
+done_testing();
-- 
cgit v1.2.3-24-ge0c7