about summary refs log tree commit homepage
path: root/lib/PublicInbox/SearchIdx.pm
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2023-02-20 09:21:50 +0000
committerEric Wong <e@80x24.org>2023-02-20 17:20:59 +0000
commitb0468d20f37b7987e08faad33d3115c9fb8e0559 (patch)
tree8ceb5aa7c8822d0e80aeaae2a316035381c16b46 /lib/PublicInbox/SearchIdx.pm
parent2e80069c1de234be0ee569b4edb498f27a909c42 (diff)
downloadpublic-inbox-b0468d20f37b7987e08faad33d3115c9fb8e0559.tar.gz
Base-85 binary patches were a source of false positives in search
results, and we've filtered them out of non-quoted text since July 2022.
Unfortunately, people were quoting binary patch contents
in replies (*sigh*) and triggering false positives in search
results.  So we must filter out base-85-looking contents from
quoted text, too.

Followup-to: 8fda04081acde705 (search: do not index base-85 binary patches, 2022-06-20)
Followup-to: 840785917bc74c8e (searchidx: skip "delta $N" sections for base-85, 2022-07-19)
Diffstat (limited to 'lib/PublicInbox/SearchIdx.pm')
-rw-r--r--lib/PublicInbox/SearchIdx.pm10
1 files changed, 8 insertions, 2 deletions
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 257b83a5..fc464383 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -37,7 +37,7 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff :
         # typical 32-bit system:
         (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024);
 use constant DEBUG => !!$ENV{DEBUG};
-my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/;
+my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
@@ -270,7 +270,7 @@ sub index_diff ($$$) {
                                 push @$xnq, shift(@l);
 
                                 # skip base85 and empty lines
-                                while (@l && ($l[0] =~ /$BASE85/o ||
+                                while (@l && ($l[0] =~ /\A$BASE85\h*\z/o ||
                                                 $l[0] !~ /\S/)) {
                                         shift @l;
                                 }
@@ -389,6 +389,12 @@ sub index_xapian { # msg_iter callback
         undef $s; # free memory
         for my $txt (@sections) {
                 if ($txt =~ /\A>/) {
+                        if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) {
+                                # get rid of Base-85 noise
+                                $txt =~ s/^([>\h]+(?:literal|delta)
+                                                \x20[0-9]+\r?\n)
+                                        (?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx;
+                        }
                         index_text($self, $txt, 0, 'XQUOT');
                 } else {
                         # does it look like a diff?