about summary refs log tree commit homepage
path: root/lib/PublicInbox/ContentHash.pm
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2022-11-27 09:15:47 +0000
committer Eric Wong <e@80x24.org> 2022-11-27 09:19:13 +0000
commit 9174f534ccc51054895bdb198c8bc1a765abd9e9 (patch)
tree ed68c3770eab136f88d5ad745dc5ab361d046a70 /lib/PublicInbox/ContentHash.pm
parent 86cb9010c49523b1968c29ef592bc1afacc77894 (diff)
download public-inbox-9174f534ccc51054895bdb198c8bc1a765abd9e9.tar.gz
The alsa-devel archives on lore have some UTF-8 References:
headers, so we need to treat them as octets again; otherwise,
(re)indexing triggers cascading failures.

Fixes: 5198c976ce8b "eml: header_raw converts octets to Perl UTF-8"
Diffstat (limited to 'lib/PublicInbox/ContentHash.pm')
-rw-r--r-- lib/PublicInbox/ContentHash.pm 7
1 file changed, 4 insertions, 3 deletions
diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm
index bacc9cdd..1afbb413 100644
--- a/lib/PublicInbox/ContentHash.pm
+++ b/lib/PublicInbox/ContentHash.pm
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 
 # Unstable internal API.
@@ -63,8 +63,9 @@ sub content_digest ($;$) {
         # do NOT consider the Message-ID as part of the content_hash
         # if we got here, we've already got Message-ID reuse
         my %seen = map { $_ => 1 } @{mids($eml)};
-        foreach my $mid (@{references($eml)}) {
-                $dig->add("ref\0$mid\0") unless $seen{$mid}++;
+        for (grep { !$seen{$_}++ } @{references($eml)}) {
+                utf8::encode($_);
+                $dig->add("ref\0$_\0");
         }
 
         # Only use Sender: if From is not present