about summary refs log tree commit homepage
path: root/lib
diff options
authorEric Wong <e@80x24.org>2017-02-04 02:20:35 +0000
committerEric Wong <e@80x24.org>2017-02-08 23:39:26 +0000
commit68b310207929db23667ca5d454a78af9d65589f2 (patch)
tree1b574620ab8d69eeb911ad2533588923db00f93d /lib
parent984c21dda421c4d2c104d72e4954e8ee44a815f3 (diff)
Much more work on this will be needed, but at least explicit
flush points prevent OOMs on my system.
Diffstat (limited to 'lib')
2 files changed, 627 insertions, 0 deletions
diff --git a/lib/PublicInbox/RepoGitSearch.pm b/lib/PublicInbox/RepoGitSearch.pm
new file mode 100644
index 00000000..0c94a7b7
--- /dev/null
+++ b/lib/PublicInbox/RepoGitSearch.pm
@@ -0,0 +1,183 @@
+# Copyright (C) 2017 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+# Read-only search interface for use by the Repobrowse web interface
+# RepoGitSearchIdx builds upon this for writing a Xapian DB.
+package PublicInbox::RepoGitSearch;
+use strict;
+use warnings;
+use Search::Xapian qw/:standard/;
+# values for ranges and sorting
+use constant {
+        CD => 0, # commit date stamp (YYYYMMDD)
+        AD => 1, # author date stamp (YYYYMMDD)
+        REPO_SCHEMA_VERSION => 1,
+        # n.b. FLAG_PURE_NOT is expensive and not suitable for a public
+        # website as it could become a denial-of-service vector
+our $LANG = 'english';
+my %bool_pfx_internal = (
+        type => 'T', # "commit", "tag", or "ref"
+my %bool_pfx_external = (
+        ref => 'XREF', # refname (belongs to)
+my %prob_prefix = (
+        id => 'Q', # git object ID, partial matches supported
+        p => 'XP', # parent commit (partial)
+        s => 'S', # subject
+        a => 'A', # Author name + email
+        c => 'XC', # Committer name + email
+        ac => 'A XC', # Author and Committer name + email
+        b => 'XBODY', # commit message body
+        bs => 'S XBODY', # commit message (subject + body)
+        diff_fn => 'XDFN', # changed filenames
+        diff_hdr => 'XDHH', # diff hunk header
+        diff_ctx => 'XDCTX', # diff context
+        diff_a => 'XDFA', # diff a/ file (before)
+        diff_b => 'XDFB', # diff b/ file (after)
+        diff => 'XDFN XDHH XDCTX XDFA XDFB', # entire diff
+        preimg => 'XPRE', # blob pre-image (full)
+        postimg => 'XPOST', # blob post-image (full)
+        # default:
+our @HELP = (
+        's:' => 'match within message subject e.g. s:"a quick brown fox"',
+        'ad:' => <<EOF,
+Author date range as YYYYMMDD  e.g. ad:19931002..20101002
+Open-ended ranges such as ad:19931002.. and ad:..20101002
+are also supported
+        'cd:' => 'Committer date range as YYYYMMDD, see ad: above',
+        'b:' => 'match within commit message body',
+        'bs:' => 'match within the commit message subject and body',
+chomp @HELP;
+my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix);
+sub new { # args: class, git dir, optional repo dir; blesses a hash holding git_dir and xdir
+        my ($class, $git_dir, $repo_dir) = @_;
+        $repo_dir ||= "$git_dir/public-inbox"; # default lives inside the git dir
+        my $xdir = "$repo_dir/xr".REPO_SCHEMA_VERSION; # directory name carries the schema version
+        bless { git_dir => $git_dir, xdir => $xdir }, $class;
+# lazily opens the Xapian DB at {xdir} and caches it in {xdb}; overridden by RepoGitSearchIdx
+sub xdb ($) { $_[0]->{xdb} ||= Search::Xapian::Database->new($_[0]->{xdir}) }
+sub retry_reopen ($$) { # runs $cb, reopening {xdb} and retrying when the DB changed underneath us
+        my ($self, $cb) = @_;
+        my $ret;
+        for (1..3) { # at most three attempts
+                eval { $ret = $cb->() };
+                return $ret unless $@; # success: hand back whatever $cb returned
+                # Exception: The revision being read has been discarded -
+                # you should call Xapian::Database::reopen()
+                if (ref($@) eq 'Search::Xapian::DatabaseModifiedError') {
+                        $self->{xdb}->reopen;
+                } else {
+                        die; # any other error is re-raised to the caller
+                }
+        }
+sub _enquire_once ($$$) { # single attempt: runs $query through get_mset(offset, limit)
+        my ($self, $query, $opts) = @_;
+        my $enq = $self->{enquire} ||= Search::Xapian::Enquire->new($self->xdb); # Enquire object is cached
+        $enq->set_query($query);
+        $opts ||= {};
+        my $desc = !$opts->{asc}; # descending order unless the caller sets asc
+        if ($opts->{relevance}) {
+                $enq->set_sort_by_relevance_then_value(AD, $desc);
+        } else {
+                $enq->set_sort_by_value_then_relevance(AD, $desc); # AD is the author date value slot
+        }
+        my $offset = $opts->{offset} || 0;
+        my $limit = $opts->{limit} || 50; # a limit of 0 or undef falls back to 50
+        $enq->get_mset($offset, $limit);
+sub _do_enquire ($$$) { # _enquire_once wrapped in retry_reopen
+        my ($self, $query, $opts) = @_;
+        retry_reopen($self, sub { _enquire_once($self, $query, $opts) });
+sub stemmer () { Search::Xapian::Stem->new($LANG) } # builds a new stemmer for $LANG on every call
+# read-only; the QueryParser is built on first use and cached in {query_parser}
+sub qp ($) {
+        my ($self) = @_;
+        my $qp = $self->{query_parser};
+        return $qp if $qp;
+        # new parser
+        $qp = Search::Xapian::QueryParser->new;
+        $qp->set_default_op(OP_AND); # OP_AND is the default operator between terms
+        $qp->set_database($self->xdb);
+        $qp->set_stemmer(stemmer());
+        $qp->set_stemming_strategy(STEM_SOME);
+        $qp->add_valuerangeprocessor( # ad: ranges are matched against the AD value slot
+                Search::Xapian::NumberValueRangeProcessor->new(AD, 'ad:'));
+        $qp->add_valuerangeprocessor( # cd: ranges are matched against the CD value slot
+                Search::Xapian::NumberValueRangeProcessor->new(CD, 'cd:'));
+        while (my ($name, $prefix) = each %bool_pfx_external) { # %bool_pfx_internal is not registered here
+                $qp->add_boolean_prefix($name, $prefix);
+        }
+        while (my ($name, $prefix) = each %prob_prefix) {
+                $qp->add_prefix($name, $_) foreach split(/ /, $prefix); # one name may map to several term prefixes
+        }
+        $self->{query_parser} = $qp;
+# returns begin and end PostingIterator for the term $termval
+sub find_docids ($$) {
+        my ($self, $termval) = @_;
+        my $db = $self->xdb;
+        ($db->postlist_begin($termval), $db->postlist_end($termval));
+sub find_unique_docid ($$$) { # NOTE(review): prototype declares three args, only two are unpacked
+        my ($self, $termval) = @_;
+        my ($begin, $end) = find_docids($self, $termval);
+        return undef if $begin->equal($end); # not found
+        my $rv = $begin->get_docid;
+        # sanity check
+        $begin->inc;
+        $begin->equal($end) or die "Term '$termval' is not unique\n"; # a second posting means a duplicate
+        $rv;
+sub help ($) { # evaluates to a reference to the package-level @HELP list
+        my ($self) = @_;
+        \@HELP;
+# read-only; parses $query_string and runs it with $opts (see _enquire_once)
+sub query {
+        my ($self, $query_string, $opts) = @_;
+        my $query;
+        $opts ||= {};
+        unless ($query_string eq '') { # NOTE(review): $query stays undef for an empty string
+                $query = qp($self)->parse_query($query_string, QP_FLAGS);
+                $opts->{relevance} = 1 unless exists $opts->{relevance}; # relevance sort unless the caller chose
+        }
+        _do_enquire($self, $query, $opts);
diff --git a/lib/PublicInbox/RepoGitSearchIdx.pm b/lib/PublicInbox/RepoGitSearchIdx.pm
new file mode 100644
index 00000000..333558ca
--- /dev/null
+++ b/lib/PublicInbox/RepoGitSearchIdx.pm
@@ -0,0 +1,444 @@
+# Copyright (C) 2017 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+# Qrefs/(tags|heads)/foo => 40-byte SHA1 hex of commit
+# Indexes any git repository with Xapian; intended for code;
+# see PublicInbox::SearchIdx for a mail-specific indexer
+package PublicInbox::RepoGitSearchIdx;
+use strict;
+use warnings;
+use base qw(PublicInbox::RepoGitSearch); # base is read-only
+use POSIX qw(strftime);
+use PublicInbox::Git;
+use PublicInbox::GitIdx;
+*xpfx = *PublicInbox::RepoGitSearch::xpfx;
+use constant {
+        Z40 => ('0' x 40), # forty zeros; index_blob_id skips blob IDs equal to this
+        STATE_GPGSIG => -0x80000000, # parser state for signature lines; nothing is indexed at or below it
+        DEBUG => !!$ENV{DEBUG}, # enables warnings about unrecognized git-log lines
+        BATCH_BYTES => 1_000_000, # budget of input bytes consumed between explicit flushes
+sub new { # same arguments as the base class; also sets {git}, {want_refs_re} and {umask}
+        my ($class, $git_dir, $repo_dir) = @_;
+        require Search::Xapian::WritableDatabase; # loaded only when an indexer is constructed
+        my $self = $class->SUPER::new($git_dir, $repo_dir);
+        my $git = $self->{git} = PublicInbox::Git->new($git_dir);
+        $self->{want_refs_re} = qr!^refs/(?:heads|tags)/!; # matches refnames under refs/heads/ and refs/tags/
+        $self->{'umask'} = git_umask_for($git); # presumably imported from PublicInbox::GitIdx -- confirm
+        $self;
+sub xdb ($) { # writable counterpart of the base-class xdb; the handle is cached in {xdb}
+        my ($self) = @_;
+        $self->{xdb} ||= with_umask($self->{'umask'}, sub { # with_umask is not defined in this view -- confirm its source
+                my $xdir = $self->{xdir};
+                unless (-d $xdir) { # create the index directory when it does not exist yet
+                        require File::Path;
+                        File::Path::mkpath($xdir);
+                }
+                Search::Xapian::WritableDatabase->new($xdir,
+                                Search::Xapian::DB_CREATE_OR_OPEN);
+        });
+sub doc_new ($$) { # new Document carrying a 'T'.$type term plus its unique ID term
+        my ($type, $unique_id) = @_;
+        my $doc = Search::Xapian::Document->new;
+        $doc->add_term('T'.$type);
+        $doc->add_term($unique_id); # added verbatim; get_doc passes it already prefixed with 'Q'
+        $doc;
+sub add_val ($$$) { # stores $num in value slot $col after sortable_serialise
+        my ($doc, $col, $num) = @_;
+        $num = Search::Xapian::sortable_serialise($num);
+        $doc->add_value($col, $num);
+sub each_term_val ($$$$) { # calls $cb with each term of $doc that matches $re, with the match stripped
+        my ($doc, $pfx, $re, $cb) = @_;
+        my $end = $doc->termlist_end;
+        my $i = $doc->termlist_begin;
+        $i->skip_to($pfx); # jump ahead to $pfx in the term list
+        while ($i != $end) { # runs to the end of the term list, not only while $pfx matches
+                my $val = $i->get_termname;
+                $val =~ s/$re// and $cb->($val); # terms where $re does not match are skipped
+                $i->inc;
+        }
+        undef;
+sub get_doc ($$$$) { # loads the document for term 'Q'.$oid or makes a new one; $$id_ref gets the docid
+        my ($self, $id_ref, $type, $oid) = @_;
+        my $doc;
+        my $doc_id = $self->find_unique_docid('Q'.$oid);
+        if (defined $doc_id) {
+                $doc = $self->{xdb}->get_document($doc_id);
+        } else {
+                $doc = doc_new($type, 'Q'.$oid); # not stored yet; $type is only used on this path
+        }
+        $$id_ref = $doc_id; # undef tells the caller the document is new
+        $doc;
+# increments and returns update generation counter
+sub update_id ($) {
+        my ($self) = @_;
+        my $db = $self->{xdb}; # reads the cached handle directly; index_sync calls xdb() first
+        my $update_id = int($db->get_metadata('last_update_id') || 0); # missing metadata counts as 0
+        $db->set_metadata('last_update_id', ++$update_id);
+        $update_id;
+sub replace_or_add ($$$) { # writes $doc to $db; evaluates to the docid it was stored under
+        my ($db, $doc_id, $doc) = @_;
+        # update our ref:
+        if (defined $doc_id) {
+                $db->replace_document($doc_id, $doc);
+        } else {
+                $doc_id = $db->add_document($doc); # add_document's return value becomes the docid
+        }
+        $doc_id;
+sub doc_refnames { # hashref whose keys are the refnames stored as XREF terms on $doc
+        my ($doc) = @_;
+        my %cur;
+        each_term_val($doc, 'XREF', qr/^XREF/, sub { $cur{$_[0]} = 1 });
+        \%cur;
+sub decor_update { # records wanted refnames from a decoration string in {-active_refs}
+        my ($self, $doc, $decor, $oid) = @_;
+        # load all current refs
+        my $cur = doc_refnames($doc);
+        my $want = $self->{want_refs_re};
+        ($decor) = ($decor =~ m!\((.+)\)!); # keep only the text between the parentheses
+        foreach (split(/, /, $decor)) {
+                my ($sym, $refname, $tag); # $sym and $tag are set below but not used yet (see TODO)
+                if (/^(\S+) -> (\S+)\z/) {
+                        ($sym, $refname) = ($1, $2);
+                } elsif (s/^tag: //) {
+                        $refname = $_;
+                        $tag = 1;
+                } else {
+                        $refname = $_;
+                }
+                next if $cur->{$refname}; # $doc already carries an XREF term for this ref
+                if ($refname =~ $want) {
+                        $self->{-active_refs}->{$refname} = $oid;
+                }
+                # TODO: handle $sym, and do something with tags
+        }
+sub update_ref_contains ($$) { # adds an XREF term for every active ref $doc lacks; counts the additions
+        my ($self, $doc) = @_;
+        my $cur = doc_refnames($doc);
+        my $n = 0;
+        my @active = keys %{$self->{-active_refs}};
+        for (@active) {
+                next if $cur->{$_}; # term already present
+                $doc->add_term('XREF'.$_);
+                ++$n;
+        }
+        $n;
+sub commit_doc ($$$) { # writes $doc only when it is new or has gained XREF terms
+        my ($self, $doc_id, $doc) = @_;
+        my $n = update_ref_contains($self, $doc);
+        if ($n || !defined($doc_id)) { # an existing document with no new refs is not rewritten
+                replace_or_add($self->{xdb}, $doc_id, $doc);
+        }
+sub term_generator ($) { # write-only; cached in {term_generator}
+        my ($self) = @_;
+        $self->{term_generator} ||= eval { # NOTE(review): eval hides a failure here and leaves undef
+                my $tg = Search::Xapian::TermGenerator->new;
+                $tg->set_stemmer($self->stemmer);
+                $tg;
+        };
+sub index_text_inc ($$$) { # indexes $text under prefix $pfx, then bumps the term position
+        my ($tg, $text, $pfx) = @_;
+        $tg->index_text($text, 1, $pfx); # presumably 1 is the wdf increment -- confirm against Search::Xapian
+        $tg->increase_termpos;
+sub index_blob_id ($$$) { # indexes a blob ID under $pfx unless it is the all-zero ID
+        my ($tg, $blob_id, $pfx) = @_;
+        index_text_inc($tg, $blob_id, $pfx) if $blob_id ne Z40;
+sub each_log_line ($$) { # parses "git log" output for $range line by line, indexing one document per commit
+        my ($self, $range) = @_;
+        my $log = $self->{git}->popen(qw(log --decorate=full --pretty=raw
+                        --no-color --no-abbrev --no-notes
+                        -r --raw -p
+                        ), $range, '--');
+        my $db = $self->{xdb};
+        my ($doc, $doc_id); # document being built and its docid (undef while the document is new)
+        my $tg = term_generator($self);
+        my $state = 0; # 1: subject, 2: body, 3: diff, 4: diff -c
+        my $tip; # first commit ID seen in the log output
+        my $hex = '[a-f0-9]+';
+        my ($cc_ins, $cc_del); # per-commit regexps for combined-diff added/removed lines
+        my $batch = BATCH_BYTES; # bytes left before the next explicit flush
+        my $decorate_only; # set to the first already-indexed commit, if one is met
+        local $/ = "\n";
+        while (defined(my $l = <$log>)) {
+                $batch -= bytes::length($l); # NOTE(review): no "use bytes" is visible in this view -- confirm it is loaded
+                if ($l =~ /^commit (\S+)(\s+\([^\)]+\))?/) {
+                        my ($oid, $decor) = ($1, $2);
+                        commit_doc($self, $doc_id, $doc) if $doc; # store the previous commit first
+                        $tip ||= $oid;
+                        $state = 0;
+                        $cc_ins = $cc_del = undef;
+                        # prevent OOM
+                        if ($batch <= 0) {
+                                $db->flush;
+                                $batch = BATCH_BYTES;
+                        }
+                        $doc = get_doc($self, \$doc_id, 'commit', $oid);
+                        decor_update($self, $doc, $decor, $oid) if $decor;
+                        # old commit
+                        if (defined $doc_id) {
+                                $decorate_only = $oid;
+                                last; # stop parsing diffs; the decorate-only pass below takes over
+                        }
+                        # new commit:
+                        $tg->set_document($doc);
+                        $doc->set_data($oid);
+                        $doc->add_term('Q' . $oid);
+                        index_text_inc($tg, $oid, 'Q');
+                } elsif ($l =~ /^parent (\S+)/) {
+                        my $parent = $1;
+                        index_text_inc($tg, $parent, 'XP');
+                } elsif ($l =~ /^author ([^<]*?<[^>]+>) (\d+)/) {
+                        my ($au, $at) = ($1, $2);
+                        index_text_inc($tg, $au, 'A');
+                        add_val($doc, PublicInbox::RepoGitSearch::AD,
+                                strftime('%Y%m%d', gmtime($at))); # date stamp is computed in UTC
+                } elsif ($l =~ /^committer ([^<]*?<[^>]+>) (\d+)/) {
+                        my ($cu, $ct) = ($1, $2);
+                        index_text_inc($tg, $cu, 'XC');
+                        add_val($doc, PublicInbox::RepoGitSearch::CD,
+                                strftime('%Y%m%d', gmtime($ct))); # date stamp is computed in UTC
+                } elsif ($l =~ /^gpgsig /) {
+                        $state = STATE_GPGSIG;
+                } elsif ($l =~ /^mergetag /) {
+                        $state = -1;
+                } elsif ($state < 0) { # inside mergetag or gpgsig
+                        if ($l eq " \n") { # paragraph
+                                $state--;
+                                $tg->increase_termpos;
+                        } elsif ($l eq "-----BEGIN PGP SIGNATURE-----\n") {
+                                # no point in indexing a PGP signature
+                                $state = STATE_GPGSIG;
+                        } elsif ($state == -2) { # mergetag subject
+                                $tg->index_text($l, 1);
+                                $tg->increase_termpos;
+                        } elsif ($state < -2 && $state > STATE_GPGSIG) {
+                                $tg->index_text($l); # mergetag body
+                        } elsif ($l eq "\n") {
+                                # end of mergetag, onto normal commit message
+                                $tg->increase_termpos;
+                                $state = 0;
+                        } elsif ($l =~ /^ (?:tag|tagger|type) /) {
+                                # ignored
+                        } elsif (DEBUG) {
+                                if ($state <= STATE_GPGSIG) {
+                                # skip
+                                } else {
+                                        warn "unhandled mergetag: $l";
+                                }
+                        }
+                } elsif ($state < 3 && $l =~ s/^    //) { # subject and body
+                        if ($state > 0) {
+                                $l =~ /\S/ ? $tg->index_text($l, 1)
+                                                : $tg->increase_termpos;
+                                $state = 2;
+                        } else {
+                                $state = 1;
+                                $tg->index_text($l, 1, 'S') if $l ne "\n";
+                        }
+                } elsif ($l =~ /^:\d{6} \d{6} ($hex) ($hex) (\S+)\t+(.+)/o) {
+                        # --raw output (regular)
+                        my ($pre, $post, $chg, $names) = ($1, $2, $3, $4); # NOTE(review): $chg and $names are captured but unused here
+                        index_blob_id($tg, $pre, 'XPRE');
+                        index_blob_id($tg, $post, 'XPOST');
+                } elsif ($l =~ /^(::+)(?:\d{6} )+ ($hex .+)? (\S+)\t+(.+)/o) {
+                        # --raw output (combined)
+                        my ($colons, $blobs, $chg, $names) = ($1, $2, $3, $4);
+                        my @blobs = split(/ /, $blobs);
+                        my $post = pop @blobs; # the last blob ID is the post-image
+                        my $n = length($colons); # one colon per pre-image blob
+                        if (scalar(@blobs) != $n) {
+                                die "combined raw parsed wrong:\n$l\n//\n";
+                        }
+                        index_blob_id($tg, $_, 'XPRE') foreach @blobs;
+                        index_blob_id($tg, $post, 'XPOST');
+                        unless ($cc_ins) { # built once per commit; reset at each "commit" line
+                                $n--;
+                                $cc_ins = qr/^ {0,$n}[\+]\s*(.*)/;
+                                $cc_del = qr/^ {0,$n}[\-]\s*(.*)/;
+                        }
+                } elsif ($l =~ m!^diff --git (?:"?a/.+?) (?:"?b/.+)!) {
+                        # regular diff, filenames handled by --raw
+                        $state = 3;
+                } elsif ($l =~ /^diff --(?:cc|combined) (?:.+)/) {
+                        # combined diff, filenames handled by --raw
+                        $state = 4;
+                } elsif ($l =~ /^@@ (?:\S+) (?:\S+) @@(.*)/) {
+                        my $hunk_hdr = $1;
+                        # regular hunk header context
+                        $hunk_hdr =~ /\S/ and
+                                        index_text_inc($tg, $hunk_hdr, 'XDHH');
+                # not currently handled:
+                } elsif ($l =~ /^index (?:$hex)\.\.(?:$hex)/o) {
+                } elsif ($l =~ /^index (?:$hex,[^\.]+)\.\.(?:$hex)(.*)$/o) {
+                        #--cc
+                } elsif ($l =~ /^(?:@@@+) (?:\S+.*\S+) @@@+\z/) { # --cc
+                } elsif ($l =~ /^(?:old|new) mode/) {
+                } elsif ($l =~ /^(?:deleted|new) file mode/) {
+                } elsif ($l =~ /^tree (?:\S+)/) {
+                } elsif ($l =~ /^(?:copy|rename) (?:from|to) /) {
+                } elsif ($l =~ /^(?:dis)?similarity index /) {
+                } elsif ($l =~ /^\\ No newline at end of file/) {
+                } elsif ($l =~ /^Binary files .* differ/) {
+                } elsif ($l =~ /^--- /) { # preimage filename
+                } elsif ($l =~ /^\+\+\+ /) { # postimage filename
+                } elsif ($state == 3) { # diff --git
+                        if ($l =~ s/^\+//) {
+                                index_text_inc($tg, $l, 'XDFB');
+                        } elsif ($l =~ s/^\-//) {
+                                index_text_inc($tg, $l, 'XDFA');
+                        } elsif ($l =~ s/^ //) {
+                                index_text_inc($tg, $l, 'XDCTX');
+                        } elsif (DEBUG) {
+                                if ($l eq "\n") {
+                                } else {
+                                        warn "unhandled diff -u $l";
+                                }
+                        }
+                } elsif ($state == 4) { # diff --cc/combined
+                        if ($l =~ $cc_ins) {
+                                index_text_inc($tg, $1, 'XDFB');
+                        } elsif ($l =~ $cc_del) {
+                                index_text_inc($tg, $1, 'XDFA');
+                        } elsif ($l =~ s/^ //) {
+                                index_text_inc($tg, $l, 'XDCTX');
+                        } elsif (DEBUG) {
+                                if ($l eq "\n") {
+                                } else {
+                                        warn "unhandled diff --cc $l";
+                                }
+                        }
+                } elsif (DEBUG) {
+                        warn  "wtf $state $l\n" if $l ne "\n";
+                }
+        }
+        # optimization: we go into decorate-only mode once we start
+        # seeing commits we've already seen to save git from having
+        # to generate diffs and us from having to skip lines we
+        # don't care about:
+        if (defined $decorate_only) {
+                $doc = undef; # the commit that ended the first loop is read again below
+                # SIGPIPE existing git log, spawn a new fast one
+                $log = $self->{git}->popen(qw(log --decorate=full
+                                                --pretty=format:%H%d),
+                                                $decorate_only);
+                while (defined(my $l = <$log>)) {
+                        $l =~ /^(\S+)(\s+\([^\)]+\))?/ or die "bad line: $l";
+                        my ($oid, $decor) = ($1, $2);
+                        commit_doc($self, $doc_id, $doc) if $doc;
+                        # do not buffer more than 100 docs before flush
+                        if (($batch -= 10000) <= 0) { # 10000 per document against a BATCH_BYTES budget
+                                $db->flush;
+                                $batch = BATCH_BYTES;
+                        }
+                        $doc = get_doc($self, \$doc_id, 'commit', $oid);
+                        decor_update($self, $doc, $decor, $oid) if $decor;
+                }
+        }
+        commit_doc($self, $doc_id, $doc) if $doc; # store the last document from whichever loop ran
+        $tip;
+sub index_top_ref ($$$) { # indexes commits reachable from $refname up to $end and records the new tip
+        my ($self, $refname, $end) = @_;
+        my $doc_id;
+        my $db = xdb($self);
+        my $ref_doc = get_doc($self, \$doc_id, 'ref', $refname); # ref documents are keyed by 'Q'.$refname
+        my $begin = defined $doc_id ? $ref_doc->get_data : ''; # last indexed tip, or '' on the first run
+        my $active = $self->{-active_refs} = { $refname => undef }; # reset the set of refs for this pass
+        my $git = $self->{git};
+        # check for discontiguous branches (from "push --force")
+        if ($begin ne '') {
+                my $base = $git->qx(qw(merge-base), $begin, $end);
+                chomp $base;
+                if ($base ne $begin) { # old tip is not an ancestor of the new one
+                        warn "$refname updated with force\n";
+                        # TODO: cleanup_forced_update($self, $refname);
+                        $begin = ''; # fall back to walking from $end with no lower bound
+                }
+        }
+        my $range = $begin eq '' ? $end : "$begin^0..$end^0";
+        my $tip = each_log_line($self, $range);
+        my $progress = $self->{progress};
+        if (defined $tip) { # undef when the log produced no commits
+                $ref_doc->set_data($tip);
+                print $progress "$refname => $tip\n" if $progress;
+                replace_or_add($db, $doc_id, $ref_doc);
+        }
+        # update all decorated refs which got snowballed into this one
+        delete $active->{$refname};
+        foreach my $ref (keys %$active) {
+                $ref_doc = get_doc($self, \$doc_id, 'ref', $ref);
+                $ref_doc->set_data($active->{$ref}); # commit ID recorded by decor_update
+                if ($progress) {
+                        print $progress "$ref => $active->{$ref} ($refname)\n";
+                }
+                replace_or_add($db, $doc_id, $ref_doc);
+        }
+        $db->flush;
+# main entry sub: indexes every wanted ref listed by "git for-each-ref"
+sub index_sync {
+        my ($self, $opts) = @_;
+        $self->{progress} = $opts->{progress}; # optional filehandle for progress lines (see index_top_ref)
+        my $db = xdb($self);
+        $self->{-update_id} = update_id($self);
+        # go for most recent refs, first, since that reduces the amount
+        # of work we have to do.
+        my $refs = $self->{git}->popen(qw(for-each-ref --sort=-creatordate));
+        local $/ = "\n";
+        while (defined(my $line = <$refs>)) {
+                chomp $line;
+                my ($oid, $type, $refname) = split(/\s+/, $line); # each line splits into object ID, type, refname
+                next unless $refname =~ $self->{want_refs_re};
+                next unless $type eq 'commit' || $type eq 'tag';
+                index_top_ref($self, $refname, $oid);
+        }
+        $db->flush;