diff options
author | Eric Wong <e@80x24.org> | 2017-02-09 00:43:02 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2017-02-09 00:43:02 +0000 |
commit | fb9ed5324ec7de9420956840ba9a6585b81e8231 (patch) | |
tree | ab6b34f8459a585205a25d1f3d18d31437b89052 /lib/PublicInbox | |
parent | 1ab8dabe04ebba61fd8761dca3d569947cbe20be (diff) | |
parent | ba4c50c20b95679580beba1ef290a4281d5285b7 (diff) | |
download | public-inbox-fb9ed5324ec7de9420956840ba9a6585b81e8231.tar.gz |
* origin/master: config: do not slurp lines into memory TODO: several updates search: schema version bump for empty References/In-Reply-To Revert "searchidx: reindex clobbers old thread IDs" searchidx: reindex clobbers old thread IDs searchidx: deal with empty In-Reply-To and References headers searchview: increase limit for displaying search results searchview: clarify numeric summary at bottom add filter for Subject: tags watchmaildir: allow arguments for filters watchmaildir: limit live importer processes learn: implement "rm" only functionality mime: avoid SUPER usage in Email::MIME subclass inbox: reinstate periodic cleanup of Xapian and SQLite objects introduce PublicInbox::MIME wrapper class
Diffstat (limited to 'lib/PublicInbox')
-rw-r--r-- | lib/PublicInbox/Config.pm | 2 | ||||
-rw-r--r-- | lib/PublicInbox/Feed.pm | 6 | ||||
-rw-r--r-- | lib/PublicInbox/Filter/SubjectTag.pm | 33 | ||||
-rw-r--r-- | lib/PublicInbox/Filter/Vger.pm | 2 | ||||
-rw-r--r-- | lib/PublicInbox/Import.pm | 2 | ||||
-rw-r--r-- | lib/PublicInbox/Inbox.pm | 22 | ||||
-rw-r--r-- | lib/PublicInbox/MIME.pm | 102 | ||||
-rw-r--r-- | lib/PublicInbox/MsgIter.pm | 22 | ||||
-rw-r--r-- | lib/PublicInbox/Search.pm | 6 | ||||
-rw-r--r-- | lib/PublicInbox/SearchIdx.pm | 14 | ||||
-rw-r--r-- | lib/PublicInbox/SearchView.pm | 20 | ||||
-rw-r--r-- | lib/PublicInbox/View.pm | 8 | ||||
-rw-r--r-- | lib/PublicInbox/WWW.pm | 6 | ||||
-rw-r--r-- | lib/PublicInbox/WatchMaildir.pm | 23 | ||||
-rw-r--r-- | lib/PublicInbox/WwwAttach.pm | 4 |
15 files changed, 213 insertions, 59 deletions
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 28b5bdb5..f6275cdd 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -111,7 +111,7 @@ sub git_config_dump { my $fh = popen_rd(\@cmd) or die "popen_rd failed for $file: $!\n"; my %rv; local $/ = "\n"; - foreach my $line (<$fh>) { + while (defined(my $line = <$fh>)) { chomp $line; my ($k, $v) = split(/=/, $line, 2); my $cur = $rv{$k}; diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 2a33fd29..e5d57550 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -5,7 +5,7 @@ package PublicInbox::Feed; use strict; use warnings; -use Email::MIME; +use PublicInbox::MIME; use PublicInbox::View; use PublicInbox::WwwAtomStream; @@ -39,7 +39,7 @@ sub generate_thread_atom { PublicInbox::WwwAtomStream->response($ctx, 200, sub { while (my $msg = shift @$msgs) { $msg = $ibx->msg_by_smsg($msg) and - return Email::MIME->new($msg); + return PublicInbox::MIME->new($msg); } }); } @@ -175,7 +175,7 @@ sub each_recent_blob { sub do_cat_mail { my ($ibx, $path) = @_; my $mime = eval { $ibx->msg_by_path($path) } or return; - Email::MIME->new($mime); + PublicInbox::MIME->new($mime); } 1; diff --git a/lib/PublicInbox/Filter/SubjectTag.pm b/lib/PublicInbox/Filter/SubjectTag.pm new file mode 100644 index 00000000..1d281425 --- /dev/null +++ b/lib/PublicInbox/Filter/SubjectTag.pm @@ -0,0 +1,33 @@ +# Copyright (C) 2017 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> + +# Filter for various [tags] in subjects +package PublicInbox::Filter::SubjectTag; +use strict; +use warnings; +use base qw(PublicInbox::Filter::Base); + +sub new { + my ($class, %opts) = @_; + my $tag = delete $opts{-tag}; + die "tag not defined!\n" unless defined $tag && $tag ne ''; + my $self = $class->SUPER::new(%opts); + $self->{tag_re} = qr/\A\s*(re:\s+|)\Q$tag\E\s*/i; + $self; +} + +sub scrub { + my ($self, $mime) = @_; + my $subj = $mime->header('Subject'); + $subj =~ s/$self->{tag_re}/$1/; # $1 is "Re: " + $mime->header_str_set('Subject', $subj); + $self->ACCEPT($mime); +} + +# no suffix/article rejection for mirrors +sub delivery { + my ($self, $mime) = @_; + $self->scrub($mime); +} + +1; diff --git a/lib/PublicInbox/Filter/Vger.pm b/lib/PublicInbox/Filter/Vger.pm index 2ffed184..905f28d7 100644 --- a/lib/PublicInbox/Filter/Vger.pm +++ b/lib/PublicInbox/Filter/Vger.pm @@ -25,7 +25,7 @@ sub scrub { # so in multipart (e.g. GPG-signed) messages, the list trailer # becomes invisible to MIME-aware email clients. if ($s =~ s/$l0\n$l1\n$l2\n$l3\n($l4\n)?\z//os) { - $mime = Email::MIME->new(\$s); + $mime = PublicInbox::MIME->new(\$s); } $self->ACCEPT($mime); } diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index 1ac112b8..13671a4f 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -119,7 +119,7 @@ sub remove { $n = read($r, my $lf, 1); defined($n) or die "read final byte of cat-blob failed: $!"; die "bad read on final byte: <$lf>" if $lf ne "\n"; - my $cur = Email::MIME->new($buf); + my $cur = PublicInbox::MIME->new($buf); my $cur_s = $cur->header('Subject'); $cur_s = '' unless defined $cur_s; my $cur_m = $mime->header('Subject'); diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 1a844e1c..999f813b 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -7,6 +7,7 @@ use strict; use warnings; use PublicInbox::Git; use PublicInbox::MID qw(mid2path); +use Devel::Peek qw(SvREFCNT); my $cleanup_timer; eval { @@ -18,10 +19,20 @@ eval { my $CLEANUP = {}; # string(inbox) -> inbox sub cleanup_task () { $cleanup_timer = undef; - delete $_->{git} for values %$CLEANUP; + for my $ibx (values %$CLEANUP) { + foreach my $f (qw(git mm search)) { + delete $ibx->{$f} if SvREFCNT($ibx->{$f}) == 1; + } + } $CLEANUP = {}; } +sub _cleanup_later ($) { + my ($self) = @_; + $cleanup_timer ||= PublicInbox::EvCleanup::later(*cleanup_task); + $CLEANUP->{"$self"} = $self; +} + sub _set_uint ($$$) { my ($opts, $field, $default) = @_; my $val = $opts->{$field}; @@ -70,20 +81,23 @@ sub git { $self->{git} ||= eval { my $g = PublicInbox::Git->new($self->{mainrepo}); $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; - $cleanup_timer ||= PublicInbox::EvCleanup::later(*cleanup_task); - $CLEANUP->{"$self"} = $self; + _cleanup_later($self); $g; }; } sub mm { my ($self) = @_; - $self->{mm} ||= eval { PublicInbox::Msgmap->new($self->{mainrepo}) }; + $self->{mm} ||= eval { + _cleanup_later($self); + PublicInbox::Msgmap->new($self->{mainrepo}); + }; } sub search { my ($self) = @_; $self->{search} ||= eval { + _cleanup_later($self); PublicInbox::Search->new($self->{mainrepo}, $self->{altid}); }; } diff --git a/lib/PublicInbox/MIME.pm b/lib/PublicInbox/MIME.pm new file mode 100644 index 00000000..54925a85 --- /dev/null +++ b/lib/PublicInbox/MIME.pm @@ -0,0 +1,102 @@ +# This library is free software; you can redistribute it and/or modify +# it under the same terms as Perl itself. +# +# The license for this file differs from the rest of public-inbox. +# +# It monkey patches the "parts_multipart" subroutine with patches +# from Matthew Horsfall <wolfsage@gmail.com> at: +# +# git clone --mirror https://github.com/rjbs/Email-MIME.git refs/pull/28/head +# +# commit fe0eb870ab732507aa39a1070a2fd9435c7e4877 +# ("Make sure we don't modify the body of a message when injecting a header.") +# commit 981d8201a7239b02114489529fd366c4c576a146 +# ("GH #14 - Handle CRLF emails properly.") +# commit 2338d93598b5e8432df24bda8dfdc231bdeb666e +# ("GH #14 - Support multipart messages without content-type in subparts.") +# +# For Email::MIME >= 1.923 && < 1.935, +# commit dcef9be66c49ae89c7a5027a789bbbac544499ce +# ("removing all trailing newlines was too much") +# is also included +package PublicInbox::MIME; +use strict; +use warnings; +use base qw(Email::MIME); + +if ($Email::MIME::VERSION <= 1.937) { +sub parts_multipart { + my $self = shift; + my $boundary = $self->{ct}->{attributes}->{boundary}; + + # Take a message, join all its lines together. Now try to Email::MIME->new + # it with 1.861 or earlier. Death! It tries to recurse endlessly on the + # body, because every time it splits on boundary it gets itself. Obviously + # that means it's a bogus message, but a mangled result (or exception) is + # better than endless recursion. -- rjbs, 2008-01-07 + return $self->parts_single_part + unless $boundary and $self->body_raw =~ /^--\Q$boundary\E\s*$/sm; + + $self->{body_raw} = Email::Simple::body($self); + + # rfc1521 7.2.1 + my ($body, $epilogue) = split /^--\Q$boundary\E--\s*$/sm, $self->body_raw, 2; + + # Split on boundaries, but keep blank lines after them intact + my @bits = split /^--\Q$boundary\E\s*?(?=$self->{mycrlf})/m, ($body || ''); + + Email::Simple::body_set($self, undef); + + # If there are no headers in the potential MIME part, it's just part of the + # body. This is a horrible hack, although it's debatable whether it was + # better or worse when it was $self->{body} = shift @bits ... -- rjbs, + # 2006-11-27 + Email::Simple::body_set($self, shift @bits) if ($bits[0] || '') !~ /.*:.*/; + + my $bits = @bits; + + my @parts; + for my $bit (@bits) { + # Parts don't need headers. If they don't have them, they look like this: + # + # --90e6ba6e8d06f1723604fc1b809a + # + # Part 2 + # + # Part 2a + # + # $bit will contain two new lines before Part 2. + # + # Anything with headers will only have one new line. + # + # RFC 1341 Section 7.2 says parts without headers are to be considered + # plain US-ASCII text. -- alh + # 2016-08-01 + my $added_header; + + if ($bit =~ /^(?:$self->{mycrlf}){2}/) { + $bit = "Content-type: text/plain; charset=us-ascii" . $bit; + + $added_header = 1; + } + + $bit =~ s/\A[\n\r]+//smg; + $bit =~ s/(?<!\x0d)$self->{mycrlf}\Z//sm; + + my $email = (ref $self)->new($bit); + + if ($added_header) { + # Remove our changes so we don't change the raw email content + $email->header_str_set('Content-Type'); + } + + push @parts, $email; + } + + $self->{parts} = \@parts; + + return @{ $self->{parts} }; +} +} + +1; diff --git a/lib/PublicInbox/MsgIter.pm b/lib/PublicInbox/MsgIter.pm index ef0d209f..5be06a44 100644 --- a/lib/PublicInbox/MsgIter.pm +++ b/lib/PublicInbox/MsgIter.pm @@ -6,17 +6,7 @@ use strict; use warnings; use base qw(Exporter); our @EXPORT = qw(msg_iter); -use Email::MIME; -use Scalar::Util qw(readonly); - -# Workaround Email::MIME versions without -# commit dcef9be66c49ae89c7a5027a789bbbac544499ce -# ("removing all trailing newlines was too much") -# This is necessary for Debian jessie -my $bad = 1.923; -my $good = 1.935; -my $ver = $Email::MIME::VERSION; -my $extra_nl = 1 if ($ver >= $bad && $ver < $good); +use PublicInbox::MIME; # Like Email::MIME::walk_parts, but this is: # * non-recursive @@ -36,16 +26,6 @@ sub msg_iter ($$) { @sub = map { [ $_, $depth, @idx, ++$i ] } @sub; @parts = (@sub, @parts); } else { - if ($extra_nl) { - my $lf = $part->{mycrlf}; - my $bref = $part->{body}; - if (readonly($$bref)) { - my $s = $$bref . $lf; - $part->{body} = \$s; - } else { - $$bref .= $lf; - } - } $cb->($p); } } diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index a1bae419..8c72fa17 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -16,7 +16,7 @@ use constant YYYYMMDD => 4; # for searching in the WWW UI use Search::Xapian qw/:standard/; use PublicInbox::SearchMsg; -use Email::MIME; +use PublicInbox::MIME; use PublicInbox::MID qw/mid_clean id_compress/; # This is English-only, everything else is non-standard and may be confused as @@ -39,7 +39,9 @@ use constant { # 10 - optimize doc for NNTP overviews # 11 - merge threads when vivifying ghosts # 12 - change YYYYMMDD value column to numeric - SCHEMA_VERSION => 12, + # 13 - fix threading for empty References/In-Reply-To + # (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0) + SCHEMA_VERSION => 13, # n.b. FLAG_PURE_NOT is expensive not suitable for a public website # as it could become a denial-of-service vector diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 5adc17d3..c0ea3c1e 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -10,7 +10,7 @@ package PublicInbox::SearchIdx; use strict; use warnings; use Fcntl qw(:flock :DEFAULT); -use Email::MIME; +use PublicInbox::MIME; use Email::MIME::ContentType; $Email::MIME::ContentType::STRICT_PARAMS = 0; use base qw(PublicInbox::Search); @@ -285,11 +285,15 @@ sub link_message { my $mime = $smsg->{mime}; my $hdr = $mime->header_obj; my $refs = $hdr->header_raw('References'); - my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : (); + my @refs = defined $refs ? ($refs =~ /<([^>]+)>/g) : (); my $irt = $hdr->header_raw('In-Reply-To'); if (defined $irt) { - $irt = mid_clean($irt); - $irt = undef if $mid eq $irt; + if ($irt eq '') { + $irt = undef; + } else { + $irt = mid_clean($irt); + $irt = undef if $mid eq $irt; + } } my $tid; @@ -393,7 +397,7 @@ sub do_cat_mail { my $str = $git->cat_file($blob, $sizeref); # fixup bugs from import: $$str =~ s/\A[\r\n]*From [^\r\n]*\r?\n//s; - Email::MIME->new($str); + PublicInbox::MIME->new($str); }; $@ ? undef : $mime; } diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index bd634d8d..f1c4b6a0 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -10,10 +10,10 @@ use PublicInbox::Hval qw/ascii_html/; use PublicInbox::View; use PublicInbox::WwwAtomStream; use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape); -use Email::MIME; +use PublicInbox::MIME; require PublicInbox::Git; require PublicInbox::SearchThread; -our $LIM = 50; +our $LIM = 200; sub noop {} @@ -145,15 +145,21 @@ sub search_nav_bot { my $o = $q->{o}; my $end = $o + $nr; my $beg = $o + 1; - my $rv = "</pre><hr><pre>Results $beg-$end of $total"; + my $rv = '</pre><hr><pre>'; + if ($beg <= $end) { + $rv .= "Results $beg-$end of $total"; + $rv .= ' (estimated)' if $end != $total; + } else { + $rv .= "No more results, only $total"; + } my $n = $o + $LIM; if ($n < $total) { my $qs = $q->qs_html(o => $n); - $rv .= qq{, <a\nhref="?$qs"\nrel=next>next</a>} + $rv .= qq{ <a\nhref="?$qs"\nrel=next>next</a>} } if ($o > 0) { - $rv .= $n < $total ? '/' : ', '; + $rv .= $n < $total ? '/' : ' '; my $p = $o - $LIM; my $qs = $q->qs_html(o => ($p > 0 ? $p : 0)); $rv .= qq{<a\nhref="?$qs"\nrel=prev>prev</a>}; @@ -205,7 +211,7 @@ sub mset_thread { $mime = $inbox->msg_by_smsg($mime) and last; } if ($mime) { - $mime = Email::MIME->new($mime); + $mime = PublicInbox::MIME->new($mime); return PublicInbox::View::index_entry($mime, $ctx, scalar @$msgs); } @@ -239,7 +245,7 @@ sub adump { while (my $x = shift @items) { $x = load_doc_retry($srch, $x); $x = $ibx->msg_by_smsg($x) and - return Email::MIME->new($x); + return PublicInbox::MIME->new($x); } return undef; }); diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index e4e9d7d2..2c37cd42 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -299,7 +299,7 @@ sub stream_thread ($$) { } return missing_thread($ctx) unless $mime; - $mime = Email::MIME->new($mime); + $mime = PublicInbox::MIME->new($mime); $ctx->{-title_html} = ascii_html($mime->header('Subject')); $ctx->{-html_tip} = thread_index_entry($ctx, $level, $mime); PublicInbox::WwwStream->response($ctx, 200, sub { @@ -311,7 +311,7 @@ sub stream_thread ($$) { unshift @q, map { ($cl, $_) } @{$node->{children}}; my $mid = $node->{id}; if ($mime = $inbox->msg_by_smsg($node->{smsg})) { - $mime = Email::MIME->new($mime); + $mime = PublicInbox::MIME->new($mime); return thread_index_entry($ctx, $level, $mime); } else { return ghost_index_entry($ctx, $level, $node); @@ -362,7 +362,7 @@ sub thread_html { $mime = $inbox->msg_by_smsg($mime) and last; } return missing_thread($ctx) unless $mime; - $mime = Email::MIME->new($mime); + $mime = PublicInbox::MIME->new($mime); $ctx->{-title_html} = ascii_html($mime->header('Subject')); $ctx->{-html_tip} = '<pre>'.index_entry($mime, $ctx, scalar @$msgs); $mime = undef; @@ -372,7 +372,7 @@ sub thread_html { $mime = $inbox->msg_by_smsg($mime) and last; } if ($mime) { - $mime = Email::MIME->new($mime); + $mime = PublicInbox::MIME->new($mime); return index_entry($mime, $ctx, scalar @$msgs); } $msgs = undef; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 11fc92e9..430e6b19 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -113,7 +113,7 @@ sub preload { require PublicInbox::Feed; require PublicInbox::View; require PublicInbox::SearchThread; - require Email::MIME; + require PublicInbox::MIME; require Digest::SHA; require POSIX; @@ -225,8 +225,8 @@ sub get_mid_html { my $x = mid2blob($ctx) or return r404($ctx); require PublicInbox::View; - require Email::MIME; - my $mime = Email::MIME->new($x); + require PublicInbox::MIME; + my $mime = PublicInbox::MIME->new($x); searcher($ctx); PublicInbox::View::msg_html($ctx, $mime); } diff --git a/lib/PublicInbox/WatchMaildir.pm b/lib/PublicInbox/WatchMaildir.pm index b7c2d17a..1823c248 100644 --- a/lib/PublicInbox/WatchMaildir.pm +++ b/lib/PublicInbox/WatchMaildir.pm @@ -6,7 +6,7 @@ package PublicInbox::WatchMaildir; use strict; use warnings; -use Email::MIME; +use PublicInbox::MIME; use Email::MIME::ContentType; $Email::MIME::ContentType::STRICT_PARAMS = 0; # user input is imperfect use PublicInbox::Git; @@ -207,7 +207,7 @@ sub _path_to_mime { local $/; my $str = <$fh>; $str or return; - return Email::MIME->new(\$str); + return PublicInbox::MIME->new(\$str); } elsif ($!{ENOENT}) { return; } else { @@ -224,18 +224,31 @@ sub _importer_for { my $addr = $inbox->{-primary_address}; PublicInbox::Import->new($git, $name, $addr, $inbox); }; - $self->{importers}->{"$im"} = $im; + + my $importers = $self->{importers}; + if (scalar(keys(%$importers)) > 2) { + delete $importers->{"$im"}; + _done_for_now($self); + } + + $importers->{"$im"} = $im; } sub _scrubber_for { my ($inbox) = @_; my $f = $inbox->{filter}; if ($f && $f =~ /::/) { + my @args; + # basic line splitting, only + # Perhaps we can have proper quote splitting one day... + ($f, @args) = split(/\s+/, $f) if $f =~ /\s+/; + eval "require $f"; if ($@) { warn $@; } else { - return $f->new; + # e.g: PublicInbox::Filter::Vger->new(@args) + return $f->new(@args); } } undef; @@ -247,7 +260,7 @@ sub _spamcheck_cb { my ($mime) = @_; my $tmp = ''; if ($sc->spamcheck($mime, \$tmp)) { - return Email::MIME->new(\$tmp); + return PublicInbox::MIME->new(\$tmp); } warn $mime->header('Message-ID')." failed spam check\n"; undef; diff --git a/lib/PublicInbox/WwwAttach.pm b/lib/PublicInbox/WwwAttach.pm index 33bfce27..a5ba5b2b 100644 --- a/lib/PublicInbox/WwwAttach.pm +++ b/lib/PublicInbox/WwwAttach.pm @@ -5,7 +5,7 @@ package PublicInbox::WwwAttach; # internal package use strict; use warnings; -use Email::MIME; +use PublicInbox::MIME; use Email::MIME::ContentType qw(parse_content_type); $Email::MIME::ContentType::STRICT_PARAMS = 0; use PublicInbox::MsgIter; @@ -15,7 +15,7 @@ sub get_attach ($$$) { my ($ctx, $idx, $fn) = @_; my $res = [ 404, [ 'Content-Type', 'text/plain' ], [ "Not found\n" ] ]; my $mime = $ctx->{-inbox}->msg_by_mid($ctx->{mid}) or return $res; - $mime = Email::MIME->new($mime); + $mime = PublicInbox::MIME->new($mime); msg_iter($mime, sub { my ($part, $depth, @idx) = @{$_[0]}; return if join('.', @idx) ne $idx; |