about summary refs log tree commit homepage
path: root/lib/PublicInbox/LeiSearch.pm
diff options
context:
space:
mode:
author: Eric Wong <e@80x24.org> 2021-03-14 13:12:00 +0200
committer: Eric Wong <e@80x24.org> 2021-03-15 08:04:44 +0000
commit: 42fc590f8cabd23455949d002e2ddf28bbec6d1e (patch)
tree: cf521b1325e9d74d60ad5f75d0df1d12cf277cb0 /lib/PublicInbox/LeiSearch.pm
parent: 64b557420689476493d752968d99ab8ae62bad9a (diff)
download: public-inbox-42fc590f8cabd23455949d002e2ddf28bbec6d1e.tar.gz
We only want to auto-import messages that are exclusively in
remote externals.  Messages in local externals are not
auto-imported, to save space and reduce wear on storage devices.
Diffstat (limited to 'lib/PublicInbox/LeiSearch.pm')
-rw-r--r-- lib/PublicInbox/LeiSearch.pm 37
1 files changed, 24 insertions, 13 deletions
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index ceb3624b..2e3f10fd 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -44,29 +44,40 @@ sub content_key ($) {
 
 sub _cmp_1st { # git->cat_async callback
         my ($bref, $oid, $type, $size, $cmp) = @_; # cmp: [chash, found, smsg]
-        return if defined($cmp->[1]->[0]); # $found->[0]
         if (content_hash(PublicInbox::Eml->new($bref)) eq $cmp->[0]) {
-                push @{$cmp->[1]}, $cmp->[2]->{num};
+                $cmp->[1]->{$oid} = $cmp->[2]->{num};
         }
 }
 
-# returns true if $eml is indexed by lei/store and keywords don't match
-sub kw_changed {
-        my ($self, $eml, $new_kw_sorted) = @_;
+sub xids_for { # returns { OID => docid } mapping for $eml matches
+        my ($self, $eml, $min) = @_;
         my ($chash, $mids) = content_key($eml);
-        my $over = $self->over;
+        my @overs = ($self->over // $self->overs_all);
         my $git = $self->git;
-        my $found = [];
+        my $found = {};
         for my $mid (@$mids) {
-                my ($id, $prev);
-                while (my $cur = $over->next_by_mid($mid, \$id, \$prev)) {
-                        $git->cat_async($cur->{blob}, \&_cmp_1st,
-                                        [ $chash, $found, $cur ]);
-                        last if scalar(@$found);
+                for my $o (@overs) {
+                        my ($id, $prev);
+                        while (my $cur = $o->next_by_mid($mid, \$id, \$prev)) {
+                                next if $found->{$cur->{blob}};
+                                $git->cat_async($cur->{blob}, \&_cmp_1st,
+                                                [ $chash, $found, $cur ]);
+                                if ($min && scalar(keys %$found) >= $min) {
+                                        $git->cat_async_wait;
+                                        return $found;
+                                }
+                        }
                 }
         }
         $git->cat_async_wait;
-        my $num = $found->[0] // return;
+        scalar(keys %$found) ? $found : undef;
+}
+
+# returns true if $eml is indexed by lei/store and keywords don't match
+sub kw_changed {
+        my ($self, $eml, $new_kw_sorted) = @_;
+        my $found = xids_for($self, $eml, 1) // return;
+        my ($num) = values %$found;
         my @cur_kw = msg_keywords($self, $num);
         join("\0", @$new_kw_sorted) eq join("\0", @cur_kw) ? 0 : 1;
 }