about summary refs log tree commit homepage
path: root/lib/PublicInbox/SearchIdx.pm
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2023-03-21 23:07:21 +0000
committer Eric Wong <e@80x24.org> 2023-03-25 09:37:45 +0000
commit 32fa6be4222d9af593c22a7dc101d8d5e8835511 (patch)
tree 16bbac338b62675b1214bd1fceea4ca4ab2d40cd /lib/PublicInbox/SearchIdx.pm
parent 72dfac803728571c30e7ab8caf005229bc1f39f8 (diff)
download public-inbox-32fa6be4222d9af593c22a7dc101d8d5e8835511.tar.gz
It seems relying on root commits is a reasonable way to
deduplicate and handle repositories with common history.

I initially wanted to shoehorn this into extindex, but decided a
separate Xapian index layout capable of being EITHER external to
handle many forks or internal (in $GIT_DIR/public-inbox-cindex)
for small projects is the right way to go.

Unlike most existing parts of public-inbox, this relies on
absolute paths of $GIT_DIR stored in the Xapian DB and does not
rely on the config file.  We'll be relying on the config file to
map absolute paths to public URL paths for WWW.
Diffstat (limited to 'lib/PublicInbox/SearchIdx.pm')
-rw-r--r-- lib/PublicInbox/SearchIdx.pm 88
1 files changed, 48 insertions, 40 deletions
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index fc464383..3baeaa9c 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -134,6 +134,7 @@ sub idx_acquire {
                 load_xapian_writable();
                 $flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN;
         }
+        my $owner = $self->{ibx} // $self->{eidx} // $self;
         if ($self->{creat}) {
                 require File::Path;
                 $self->lock_acquire;
@@ -145,14 +146,13 @@ sub idx_acquire {
                         File::Path::mkpath($dir);
                         require PublicInbox::Syscall;
                         PublicInbox::Syscall::nodatacow_dir($dir);
-                        $self->{-set_has_threadid_once} = 1;
-                        if (($self->{ibx} // $self->{eidx})->{-dangerous}) {
-                                $flag |= $DB_DANGEROUS;
-                        }
+                        # owner == self for CodeSearchIdx
+                        $self->{-set_has_threadid_once} = 1 if $owner != $self;
+                        $flag |= $DB_DANGEROUS if $owner->{-dangerous};
                 }
         }
         return unless defined $flag;
-        $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync};
+        $flag |= $DB_NO_SYNC if $owner->{-no_fsync};
         my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) };
         croak "Failed opening $dir: $@" if $@;
         $self->{xdb} = $xdb;
@@ -350,43 +350,30 @@ sub index_diff ($$$) {
         index_text($self, join("\n", @$xnq), 1, 'XNQ');
 }
 
-sub index_xapian { # msg_iter callback
-        my $part = $_[0]->[0]; # ignore $depth and $idx
-        my ($self, $doc) = @{$_[1]};
-        my $ct = $part->content_type || 'text/plain';
-        my $fn = $part->filename;
-        if (defined $fn && $fn ne '') {
-                index_phrase($self, $fn, 1, 'XFN');
-        }
-        if ($part->{is_submsg}) {
-                my $mids = mids_for_index($part);
-                index_ids($self, $doc, $part, $mids);
-                my $smsg = bless {}, 'PublicInbox::Smsg';
-                $smsg->populate($part);
-                index_headers($self, $smsg);
-        }
-
-        my ($s, undef) = msg_part_text($part, $ct);
-        defined $s or return;
-        $_[0]->[0] = $part = undef; # free memory
+sub patch_id {
+        my ($self) = @_; # $_[1] is the diff (may be huge)
+        open(my $fh, '+>:utf8', undef) or die "open: $!";
+        open(my $eh, '+>', undef) or die "open: $!";
+        $fh->autoflush(1);
+        print $fh $_[1] or die "print: $!";
+        sysseek($fh, 0, SEEK_SET) or die "sysseek: $!";
+        my $id = ($self->{ibx} // $self->{eidx} // $self)->git->qx(
+                        [qw(patch-id --stable)], {}, { 0 => $fh, 2 => $eh });
+        seek($eh, 0, SEEK_SET) or die "seek: $!";
+        while (<$eh>) { warn $_ }
+        $id =~ /\A([a-f0-9]{40,})/ ? $1 : undef;
+}
 
-        if ($s =~ /^(?:diff|---|\+\+\+) /ms) {
-                open(my $fh, '+>:utf8', undef) or die "open: $!";
-                open(my $eh, '+>', undef) or die "open: $!";
-                $fh->autoflush(1);
-                print $fh $s or die "print: $!";
-                sysseek($fh, 0, SEEK_SET) or die "sysseek: $!";
-                my $id = ($self->{ibx} // $self->{eidx})->git->qx(
-                                                [qw(patch-id --stable)],
-                                                {}, { 0 => $fh, 2 => $eh });
-                $id =~ /\A([a-f0-9]{40,})/ and $doc->add_term('XDFID'.$1);
-                seek($eh, 0, SEEK_SET) or die "seek: $!";
-                while (<$eh>) { warn $_ }
+sub index_body_text {
+        my ($self, $doc, $sref) = @_;
+        if ($$sref =~ /^(?:diff|---|\+\+\+) /ms) {
+                my $id = patch_id($self, $$sref);
+                $doc->add_term('XDFID'.$id) if defined($id);
         }
 
         # split off quoted and unquoted blocks:
-        my @sections = PublicInbox::MsgIter::split_quotes($s);
-        undef $s; # free memory
+        my @sections = PublicInbox::MsgIter::split_quotes($$sref);
+        undef $$sref; # free memory
         for my $txt (@sections) {
                 if ($txt =~ /\A>/) {
                         if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) {
@@ -396,8 +383,7 @@ sub index_xapian { # msg_iter callback
                                         (?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx;
                         }
                         index_text($self, $txt, 0, 'XQUOT');
-                } else {
-                        # does it look like a diff?
+                } else { # does it look like a diff?
                         if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
                                 index_diff($self, \$txt, $doc);
                         } else {
@@ -408,6 +394,28 @@ sub index_xapian { # msg_iter callback
         }
 }
 
+sub index_xapian { # msg_iter callback
+        my $part = $_[0]->[0]; # ignore $depth and $idx
+        my ($self, $doc) = @{$_[1]};
+        my $ct = $part->content_type || 'text/plain';
+        my $fn = $part->filename;
+        if (defined $fn && $fn ne '') {
+                index_phrase($self, $fn, 1, 'XFN');
+        }
+        if ($part->{is_submsg}) {
+                my $mids = mids_for_index($part);
+                index_ids($self, $doc, $part, $mids);
+                my $smsg = bless {}, 'PublicInbox::Smsg';
+                $smsg->populate($part);
+                index_headers($self, $smsg);
+        }
+
+        my ($s, undef) = msg_part_text($part, $ct);
+        defined $s or return;
+        $_[0]->[0] = $part = undef; # free memory
+        index_body_text($self, $doc, \$s);
+}
+
 sub index_list_id ($$$) {
         my ($self, $doc, $hdr) = @_;
         for my $l ($hdr->header_raw('List-Id')) {