about summary refs log tree commit homepage
path: root/lib/PublicInbox/LeiDedupe.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/PublicInbox/LeiDedupe.pm')
-rw-r--r-- lib/PublicInbox/LeiDedupe.pm 20
1 files changed, 11 insertions, 9 deletions
diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm
index 32f99cd0..eda54d79 100644
--- a/lib/PublicInbox/LeiDedupe.pm
+++ b/lib/PublicInbox/LeiDedupe.pm
@@ -1,10 +1,9 @@
-# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 package PublicInbox::LeiDedupe;
-use strict;
-use v5.10.1;
-use PublicInbox::ContentHash qw(content_hash git_sha);
-use Digest::SHA ();
+use v5.12;
+use PublicInbox::ContentHash qw(content_hash content_digest git_sha);
+use PublicInbox::SHA qw(sha256);
 
 # n.b. mutt sets most of these headers not sure about Bytes
 our @OID_IGNORE = qw(Status X-Status Content-Length Lines Bytes);
@@ -30,11 +29,9 @@ sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef }
 
 sub smsg_hash ($) {
         my ($smsg) = @_;
-        my $dig = Digest::SHA->new(256);
         my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)});
         utf8::encode($x);
-        $dig->add($x);
-        $dig->digest;
+        sha256($x);
 }
 
 # the paranoid option
@@ -72,7 +69,12 @@ sub dedupe_content ($) {
         my ($skv) = @_;
         (sub { # may be called in a child process
                 my ($eml) = @_; # $oidhex = $_[1], ignored
-                $skv->set_maybe(content_hash($eml), '');
+
+                # we must account for Message-ID via hash_mids, since
+                # (unlike v2 dedupe) Message-ID is not accounted for elsewhere:
+                $skv->set_maybe(content_digest($eml, PublicInbox::SHA->new(256),
+                                1 # hash_mids
+                                )->digest, '');
         }, sub {
                 my ($smsg) = @_;
                 $skv->set_maybe(smsg_hash($smsg), '');