From ccb66f35d293d3d0e3f9b119f7bd1fa019b93c75 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 11 Feb 2017 23:54:48 +0000 Subject: handle repeated References and In-Reply-To headers It seems possible for git-send-email(1) to generate repeated repeated instances of References and In-Reply-To headers, as evidenced in: https://public-inbox.org/git/20161111124541.8216-17-vascomalmeida@sapo.pt/raw This causes a mismatch between how our search indexer threads and how our HTML view handles threading. In the future, View.pm will use the smsg-parsed {references} field and avoid redoing Email::MIME header parsing. We will still need to figure out a way to deal with messages with repeated Message-IDs, at some point, too. --- lib/PublicInbox/SearchIdx.pm | 30 ++++++------------------------ lib/PublicInbox/SearchThread.pm | 2 +- lib/PublicInbox/View.pm | 19 +++++++++++-------- 3 files changed, 18 insertions(+), 33 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 1142ca7a..8a529c66 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -291,17 +291,12 @@ sub link_message { my $mid = $smsg->mid; my $mime = $smsg->{mime}; my $hdr = $mime->header_obj; - my $refs = $hdr->header_raw('References'); - my @refs = defined $refs ? ($refs =~ /<([^>]+)>/g) : (); - my $irt = $hdr->header_raw('In-Reply-To'); - if (defined $irt) { - if ($irt eq '') { - $irt = undef; - } else { - $irt = mid_clean($irt); - $irt = undef if $mid eq $irt; - } - } + + # last References should be IRT, but some mail clients do things + # out of order, so trust IRT over References iff IRT exists + my @refs = ($hdr->header_raw('References'), + $hdr->header_raw('In-Reply-To')); + @refs = ((join(' ', @refs)) =~ /<([^>]+)>/g); my $tid; if (@refs) { @@ -309,15 +304,6 @@ sub link_message { my @orig_refs = @refs; @refs = (); - if (defined $irt) { - # to check MAX_MID_SIZE - push @orig_refs, $irt; - - # below, we will ensure IRT (if specified) - # is the last References - $uniq{$irt} = 1; - } - # prevent circular references via References: here: foreach my $ref (@orig_refs) { if (length($ref) > MAX_MID_SIZE) { @@ -329,10 +315,6 @@ sub link_message { } } - # last References should be IRT, but some mail clients do things - # out of order, so trust IRT over References iff IRT exists - push @refs, $irt if defined $irt; - if (@refs) { $smsg->{references} = '<'.join('> <', @refs).'>'; diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm index 2cd066db..2966907a 100644 --- a/lib/PublicInbox/SearchThread.pm +++ b/lib/PublicInbox/SearchThread.pm @@ -7,7 +7,7 @@ # Mail::Thread is unmaintained and unavailable on some distros. # We also do not want pruning or subject grouping, since we want # to encourage strict threading and hopefully encourage people -# to use proper In-Reply-To. +# to use proper In-Reply-To/References. # # This includes fixes from several open bugs for Mail::Thread # diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 2c37cd42..0b1ec75b 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -92,13 +92,13 @@ EOF sub in_reply_to { my ($hdr) = @_; - my $irt = $hdr->header_raw('In-Reply-To'); - - return mid_clean($irt) if defined $irt && $irt ne ''; - - my $refs = $hdr->header_raw('References'); - if ($refs && $refs =~ /<([^>]+)>\s*\z/s) { - return $1; + my %mid = map { $_ => 1 } $hdr->header_raw('Message-ID'); + my @refs = ($hdr->header_raw('References'), + $hdr->header_raw('In-Reply-To')); + @refs = ((join(' ', @refs)) =~ /<([^>]+)>/g); + while (defined(my $irt = pop @refs)) { + next if $mid{"<$irt>"}; + return $irt; } undef; } @@ -201,7 +201,10 @@ sub _th_index_lite { my $rv = ''; my $mapping = $ctx->{mapping} or return $rv; my $pad = ' '; - my ($attr, $node, $idx, $level) = @{$mapping->{$mid_raw}}; + my $mid_map = $mapping->{$mid_raw}; + defined $mid_map or + return 'public-inbox BUG: '.ascii_html($mid_raw).' not mapped'; + my ($attr, $node, $idx, $level) = @$mid_map; my $children = $node->{children}; my $nr_c = scalar @$children; my $nr_s = 0; -- cgit v1.2.3-24-ge0c7 From cb8fc8c39d0a820b35ed3384c35122aaa66f9a6f Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sun, 12 Feb 2017 02:41:22 +0000 Subject: t/mime: quiet warnings for old versions of Email::Simple This is fixed in the newest versions of Email::Simple, but not the version in Debian jessie (2.203) --- t/mime.t | 1 + 1 file changed, 1 insertion(+) diff --git a/t/mime.t b/t/mime.t index c4bdcf0d..b0e2290e 100644 --- a/t/mime.t +++ b/t/mime.t @@ -8,6 +8,7 @@ use Test::More; use_ok 'PublicInbox::MIME'; use PublicInbox::MsgIter; +local $SIG{__WARN__} = sub {}; my $msg = PublicInbox::MIME->new( 'From: Richard Hansen To: git@vger.kernel.org -- cgit v1.2.3-24-ge0c7 From 364de65f8a6b5729027cb70228312a141430122f Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 14 Feb 2017 22:45:15 +0000 Subject: www: do not unescape PATH_INFO twice PSGI specs already require PATH_INFO to be unescaped; so our tests were wrong, too. --- lib/PublicInbox/WWW.pm | 2 +- t/cgi.t | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 430e6b19..62e4ca43 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -165,7 +165,7 @@ sub invalid_inbox_mid { my $ret = invalid_inbox($ctx, $inbox); return $ret if $ret; - $ctx->{mid} = $mid = uri_unescape($mid); + $ctx->{mid} = $mid; if ($mid =~ /\A[a-f0-9]{40}\z/) { # this is horiffically wasteful for legacy URLs: if ($mid = mid2blob($ctx)) { diff --git a/t/cgi.t b/t/cgi.t index 092ad8c7..77409660 100644 --- a/t/cgi.t +++ b/t/cgi.t @@ -148,7 +148,7 @@ EOF $im->add($reply); $im->done; - my $res = cgi_run("/test/slashy%2fasdf\@example.com/raw"); + my $res = cgi_run("/test/slashy/asdf\@example.com/raw"); like($res->{body}, qr/Message-Id: <\Q$slashy_mid\E>/, "slashy mid raw hit"); -- cgit v1.2.3-24-ge0c7