diff options
author | Eric Wong <e@80x24.org> | 2023-03-21 23:07:21 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2023-03-25 09:37:45 +0000 |
commit | 32fa6be4222d9af593c22a7dc101d8d5e8835511 (patch) | |
tree | 16bbac338b62675b1214bd1fceea4ca4ab2d40cd /lib/PublicInbox/SearchIdx.pm | |
parent | 72dfac803728571c30e7ab8caf005229bc1f39f8 (diff) | |
download | public-inbox-32fa6be4222d9af593c22a7dc101d8d5e8835511.tar.gz |
Relying on root commits seems to be a reasonable way to deduplicate and handle repositories with common history. I initially wanted to shoehorn this into extindex, but decided that a separate Xapian index layout is the right way to go: one capable of being EITHER external (to handle many forks) or internal (in $GIT_DIR/public-inbox-cindex, for small projects). Unlike most existing parts of public-inbox, this relies on absolute paths of $GIT_DIR stored in the Xapian DB and does not rely on the config file. We will, however, be relying on the config file to map absolute paths to public URL paths for WWW.
Diffstat (limited to 'lib/PublicInbox/SearchIdx.pm')
-rw-r--r-- | lib/PublicInbox/SearchIdx.pm | 88 |
1 files changed, 48 insertions, 40 deletions
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index fc464383..3baeaa9c 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -134,6 +134,7 @@ sub idx_acquire { load_xapian_writable(); $flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN; } + my $owner = $self->{ibx} // $self->{eidx} // $self; if ($self->{creat}) { require File::Path; $self->lock_acquire; @@ -145,14 +146,13 @@ sub idx_acquire { File::Path::mkpath($dir); require PublicInbox::Syscall; PublicInbox::Syscall::nodatacow_dir($dir); - $self->{-set_has_threadid_once} = 1; - if (($self->{ibx} // $self->{eidx})->{-dangerous}) { - $flag |= $DB_DANGEROUS; - } + # owner == self for CodeSearchIdx + $self->{-set_has_threadid_once} = 1 if $owner != $self; + $flag |= $DB_DANGEROUS if $owner->{-dangerous}; } } return unless defined $flag; - $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync}; + $flag |= $DB_NO_SYNC if $owner->{-no_fsync}; my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) }; croak "Failed opening $dir: $@" if $@; $self->{xdb} = $xdb; @@ -350,43 +350,30 @@ sub index_diff ($$$) { index_text($self, join("\n", @$xnq), 1, 'XNQ'); } -sub index_xapian { # msg_iter callback - my $part = $_[0]->[0]; # ignore $depth and $idx - my ($self, $doc) = @{$_[1]}; - my $ct = $part->content_type || 'text/plain'; - my $fn = $part->filename; - if (defined $fn && $fn ne '') { - index_phrase($self, $fn, 1, 'XFN'); - } - if ($part->{is_submsg}) { - my $mids = mids_for_index($part); - index_ids($self, $doc, $part, $mids); - my $smsg = bless {}, 'PublicInbox::Smsg'; - $smsg->populate($part); - index_headers($self, $smsg); - } - - my ($s, undef) = msg_part_text($part, $ct); - defined $s or return; - $_[0]->[0] = $part = undef; # free memory +sub patch_id { + my ($self) = @_; # $_[1] is the diff (may be huge) + open(my $fh, '+>:utf8', undef) or die "open: $!"; + open(my $eh, '+>', undef) or die "open: $!"; + $fh->autoflush(1); + print $fh $_[1] or 
die "print: $!"; + sysseek($fh, 0, SEEK_SET) or die "sysseek: $!"; + my $id = ($self->{ibx} // $self->{eidx} // $self)->git->qx( + [qw(patch-id --stable)], {}, { 0 => $fh, 2 => $eh }); + seek($eh, 0, SEEK_SET) or die "seek: $!"; + while (<$eh>) { warn $_ } + $id =~ /\A([a-f0-9]{40,})/ ? $1 : undef; +} - if ($s =~ /^(?:diff|---|\+\+\+) /ms) { - open(my $fh, '+>:utf8', undef) or die "open: $!"; - open(my $eh, '+>', undef) or die "open: $!"; - $fh->autoflush(1); - print $fh $s or die "print: $!"; - sysseek($fh, 0, SEEK_SET) or die "sysseek: $!"; - my $id = ($self->{ibx} // $self->{eidx})->git->qx( - [qw(patch-id --stable)], - {}, { 0 => $fh, 2 => $eh }); - $id =~ /\A([a-f0-9]{40,})/ and $doc->add_term('XDFID'.$1); - seek($eh, 0, SEEK_SET) or die "seek: $!"; - while (<$eh>) { warn $_ } +sub index_body_text { + my ($self, $doc, $sref) = @_; + if ($$sref =~ /^(?:diff|---|\+\+\+) /ms) { + my $id = patch_id($self, $$sref); + $doc->add_term('XDFID'.$id) if defined($id); } # split off quoted and unquoted blocks: - my @sections = PublicInbox::MsgIter::split_quotes($s); - undef $s; # free memory + my @sections = PublicInbox::MsgIter::split_quotes($$sref); + undef $$sref; # free memory for my $txt (@sections) { if ($txt =~ /\A>/) { if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) { @@ -396,8 +383,7 @@ sub index_xapian { # msg_iter callback (?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx; } index_text($self, $txt, 0, 'XQUOT'); - } else { - # does it look like a diff? + } else { # does it look like a diff? 
if ($txt =~ /^(?:diff|---|\+\+\+) /ms) { index_diff($self, \$txt, $doc); } else { @@ -408,6 +394,28 @@ sub index_xapian { # msg_iter callback } } +sub index_xapian { # msg_iter callback + my $part = $_[0]->[0]; # ignore $depth and $idx + my ($self, $doc) = @{$_[1]}; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + if (defined $fn && $fn ne '') { + index_phrase($self, $fn, 1, 'XFN'); + } + if ($part->{is_submsg}) { + my $mids = mids_for_index($part); + index_ids($self, $doc, $part, $mids); + my $smsg = bless {}, 'PublicInbox::Smsg'; + $smsg->populate($part); + index_headers($self, $smsg); + } + + my ($s, undef) = msg_part_text($part, $ct); + defined $s or return; + $_[0]->[0] = $part = undef; # free memory + index_body_text($self, $doc, \$s); +} + sub index_list_id ($$$) { my ($self, $doc, $hdr) = @_; for my $l ($hdr->header_raw('List-Id')) { |