diff options
Diffstat (limited to 'lib/PublicInbox/LeiDedupe.pm')
-rw-r--r-- | lib/PublicInbox/LeiDedupe.pm | 20 |
1 files changed, 11 insertions, 9 deletions
diff --git a/lib/PublicInbox/LeiDedupe.pm b/lib/PublicInbox/LeiDedupe.pm index 32f99cd0..eda54d79 100644 --- a/lib/PublicInbox/LeiDedupe.pm +++ b/lib/PublicInbox/LeiDedupe.pm @@ -1,10 +1,9 @@ -# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org> +# Copyright (C) all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> package PublicInbox::LeiDedupe; -use strict; -use v5.10.1; -use PublicInbox::ContentHash qw(content_hash git_sha); -use Digest::SHA (); +use v5.12; +use PublicInbox::ContentHash qw(content_hash content_digest git_sha); +use PublicInbox::SHA qw(sha256); # n.b. mutt sets most of these headers not sure about Bytes our @OID_IGNORE = qw(Status X-Status Content-Length Lines Bytes); @@ -30,11 +29,9 @@ sub _oidbin ($) { defined($_[0]) ? pack('H*', $_[0]) : undef } sub smsg_hash ($) { my ($smsg) = @_; - my $dig = Digest::SHA->new(256); my $x = join("\0", @$smsg{qw(from to cc ds subject references mid)}); utf8::encode($x); - $dig->add($x); - $dig->digest; + sha256($x); } # the paranoid option @@ -72,7 +69,12 @@ sub dedupe_content ($) { my ($skv) = @_; (sub { # may be called in a child process my ($eml) = @_; # $oidhex = $_[1], ignored - $skv->set_maybe(content_hash($eml), ''); + + # we must account for Message-ID via hash_mids, since + # (unlike v2 dedupe) Message-ID is not accounted for elsewhere: + $skv->set_maybe(content_digest($eml, PublicInbox::SHA->new(256), + 1 # hash_mids + )->digest, ''); }, sub { my ($smsg) = @_; $skv->set_maybe(smsg_hash($smsg), ''); |