From 9174f534ccc51054895bdb198c8bc1a765abd9e9 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 27 Nov 2022 09:15:47 +0000 Subject: content_hash: handle References as octets The alsa-devel archives on lore has some UTF-8 References: headers, so we need to treat them as octets, again, otherwise (re)indexing triggers cascading failures. Fixes: 5198c976ce8b "eml: header_raw converts octets to Perl UTF-8" --- lib/PublicInbox/ContentHash.pm | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'lib/PublicInbox/ContentHash.pm') diff --git a/lib/PublicInbox/ContentHash.pm b/lib/PublicInbox/ContentHash.pm index bacc9cdd..1afbb413 100644 --- a/lib/PublicInbox/ContentHash.pm +++ b/lib/PublicInbox/ContentHash.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # Unstable internal API. @@ -63,8 +63,9 @@ sub content_digest ($;$) { # do NOT consider the Message-ID as part of the content_hash # if we got here, we've already got Message-ID reuse my %seen = map { $_ => 1 } @{mids($eml)}; - foreach my $mid (@{references($eml)}) { - $dig->add("ref\0$mid\0") unless $seen{$mid}++; + for (grep { !$seen{$_}++ } @{references($eml)}) { + utf8::encode($_); + $dig->add("ref\0$_\0"); } # Only use Sender: if From is not present -- cgit v1.2.3-24-ge0c7