diff options
author | Eric Wong <e@80x24.org> | 2016-12-26 05:25:36 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2016-12-26 05:25:36 +0000 |
commit | e36899b149ecb7cc56f88a6078b18b211ac3c793 (patch) | |
tree | da8a6e07234a815b3782f1a659f6dee1122959d1 | |
parent | b388dfdd96804f898fbf72baf2a32e0c9f0fb3f1 (diff) | |
parent | 427245acacaf04a882d5524e662075909b96905b (diff) | |
download | public-inbox-e36899b149ecb7cc56f88a6078b18b211ac3c793.tar.gz |
* origin/master: (25 commits) evcleanup: ensure deferred close from timers are handled ASAP httpd/async: improve variable naming githttpbackend: minor cleanups to improve readability githttpbackend: simplify compatibility code githttpbackend: minor readability improvement http: fix clobbering of $null_io linkify: modify argument in place view: do not modify array during iteration view: stop chomping off whitespace at ends of messages view: remove unused parameter search: lookup_mail handles modified DBs doc: various comments on async handling searchthread: simplify API and remove needless OO searchthread: update comment about loop prevention searchmsg: remove ensure_metadata tests: add thread-all testing for benchmarking searchmsg: do not memoize {date} field searchmsg: remove locale-dependency for ->date t/config.t: fix feedmax default wwwtext: link to RFC4685 (Atom Threading) ...
-rw-r--r-- | Documentation/public-inbox-config.pod | 8 | ||||
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | TODO | 3 | ||||
-rw-r--r-- | examples/public-inbox.psgi | 1 | ||||
-rw-r--r-- | lib/PublicInbox/Config.pm | 3 | ||||
-rw-r--r-- | lib/PublicInbox/EvCleanup.pm | 14 | ||||
-rw-r--r-- | lib/PublicInbox/Feed.pm | 5 | ||||
-rw-r--r-- | lib/PublicInbox/GitHTTPBackend.pm | 43 | ||||
-rw-r--r-- | lib/PublicInbox/HTTP.pm | 4 | ||||
-rw-r--r-- | lib/PublicInbox/HTTPD.pm | 2 | ||||
-rw-r--r-- | lib/PublicInbox/HTTPD/Async.pm | 14 | ||||
-rw-r--r-- | lib/PublicInbox/Inbox.pm | 11 | ||||
-rw-r--r-- | lib/PublicInbox/Linkify.pm | 17 | ||||
-rw-r--r-- | lib/PublicInbox/Listener.pm | 6 | ||||
-rw-r--r-- | lib/PublicInbox/Search.pm | 12 | ||||
-rw-r--r-- | lib/PublicInbox/SearchMsg.pm | 52 | ||||
-rw-r--r-- | lib/PublicInbox/SearchThread.pm | 39 | ||||
-rw-r--r-- | lib/PublicInbox/SearchView.pm | 25 | ||||
-rw-r--r-- | lib/PublicInbox/View.pm | 42 | ||||
-rw-r--r-- | lib/PublicInbox/WwwAtomStream.pm | 40 | ||||
-rw-r--r-- | lib/PublicInbox/WwwText.pm | 7 | ||||
-rw-r--r-- | t/config.t | 2 | ||||
-rw-r--r-- | t/feed.t | 13 | ||||
-rw-r--r-- | t/search.t | 2 | ||||
-rw-r--r-- | t/thread-all.t | 38 | ||||
-rw-r--r-- | t/thread-cycle.t | 9 |
26 files changed, 227 insertions, 186 deletions
diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod index 00376457..cfd6c93d 100644 --- a/Documentation/public-inbox-config.pod +++ b/Documentation/public-inbox-config.pod @@ -120,6 +120,14 @@ addresses or mirrors. Default: none +=item publicinbox.<name>.feedmax + +The size of an Atom feed for the inbox. If specified more than +once, only the last value is used. Invalid values (<= 0) will +be treated as the default value. + +Default: 25 + =back =head1 ENVIRONMENT @@ -186,6 +186,7 @@ t/repobrowse_git_tree.t t/search.t t/spamcheck_spamc.t t/spawn.t +t/thread-all.t t/thread-cycle.t t/utf8.mbox t/view.t @@ -34,8 +34,6 @@ all need to be considered for everything we introduce) the links should point to an anchor tag within the same page, instead; giving the user options. -* implement RFC 4685 (Atom message threading) - * configurable constants (index limits, search results) * handle messages with multiple Message-IDs @@ -47,6 +45,7 @@ all need to be considered for everything we introduce) * portability to FreeBSD (and other Free Software *BSDs) ugh... https://rt.cpan.org/Ticket/Display.html?id=116615 + (IO::KQueue is broken with Danga::Socket) * improve documentation diff --git a/examples/public-inbox.psgi b/examples/public-inbox.psgi index e97f917f..98cde92d 100644 --- a/examples/public-inbox.psgi +++ b/examples/public-inbox.psgi @@ -14,6 +14,7 @@ my $www = PublicInbox::WWW->new; # share the public-inbox code itself: my $src = $ENV{SRC_GIT_DIR}; # '/path/to/public-inbox.git' +$src = PublicInbox::Git->new($src) if defined $src; builder { eval { diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 8d66cf8c..6e31df72 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -145,7 +145,8 @@ sub _fill { my $rv = {}; foreach my $k (qw(mainrepo address filter url newsgroup - infourl watch watchheader httpbackendmax)) { + infourl watch watchheader httpbackendmax + feedmax)) { my $v = $self->{"$pfx.$k"}; $rv->{$k} = $v if defined $v; } diff --git a/lib/PublicInbox/EvCleanup.pm b/lib/PublicInbox/EvCleanup.pm index 2b77c617..b9fe843b 100644 --- a/lib/PublicInbox/EvCleanup.pm +++ b/lib/PublicInbox/EvCleanup.pm @@ -30,9 +30,19 @@ sub _run_all ($) { $_->() foreach @$run; } +# ensure Danga::Socket::ToClose fires after timers fire +sub _asap_close () { $asapq->[1] ||= _asap_timer() } + sub _run_asap () { _run_all($asapq) } -sub _run_next () { _run_all($nextq) } -sub _run_later () { _run_all($laterq) } +sub _run_next () { + _run_all($nextq); + _asap_close(); +} + +sub _run_later () { + _run_all($laterq); + _asap_close(); +} # Called by Danga::Socket sub event_write { diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 31d82adb..2a33fd29 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -8,9 +8,6 @@ use warnings; use Email::MIME; use PublicInbox::View; use PublicInbox::WwwAtomStream; -use constant { - MAX_PER_PAGE => 25, # this needs to be tunable -}; # main function sub generate { @@ -114,7 +111,7 @@ sub new_html_footer { sub each_recent_blob { my ($ctx, $cb) = @_; - my $max = $ctx->{max} || MAX_PER_PAGE; + my $max = $ctx->{-inbox}->{feedmax}; my $hex = '[a-f0-9]'; my $addmsg = qr!^:000000 100644 \S+ \S+ A\t(${hex}{2}/${hex}{38})$!; my $delmsg = qr!^:100644 000000 \S+ \S+ D\t(${hex}{2}/${hex}{38})$!; diff --git a/lib/PublicInbox/GitHTTPBackend.pm b/lib/PublicInbox/GitHTTPBackend.pm index 0275a2a0..c8f4706c 100644 --- a/lib/PublicInbox/GitHTTPBackend.pm +++ b/lib/PublicInbox/GitHTTPBackend.pm @@ -46,6 +46,9 @@ sub r ($;$) { sub serve { my ($env, $git, $path) = @_; + # XXX compatibility... ugh, can we stop supporting this? + $git = PublicInbox::Git->new($git) unless ref($git); + # Documentation/technical/http-protocol.txt in git.git # requires one and exactly one query parameter: if ($env->{QUERY_STRING} =~ /\Aservice=git-\w+-pack\z/ || @@ -98,7 +101,7 @@ sub serve_dumb { return r(404); } - my $f = (ref $git ? $git->{git_dir} : $git) . '/' . $path; + my $f = $git->{git_dir} . '/' . $path; return r(404) unless -f $f && -r _; # just in case it's a FIFO :P my $size = -s _; @@ -196,21 +199,15 @@ sub serve_smart { my $val = $env->{$name}; $env{$name} = $val if defined $val; } - my ($git_dir, $limiter); - if (ref $git) { - $limiter = $git->{-httpbackend_limiter} || $default_limiter; - $git_dir = $git->{git_dir}; - } else { - $limiter = $default_limiter; - $git_dir = $git; - } + my $limiter = $git->{-httpbackend_limiter} || $default_limiter; + my $git_dir = $git->{git_dir}; $env{GIT_HTTP_EXPORT_ALL} = '1'; $env{PATH_TRANSLATED} = "$git_dir/$path"; - my %rdr = ( 0 => fileno($in) ); - my $x = PublicInbox::Qspawn->new([qw(git http-backend)], \%env, \%rdr); + my $rdr = { 0 => fileno($in) }; + my $qsp = PublicInbox::Qspawn->new([qw(git http-backend)], \%env, $rdr); my ($fh, $rpipe); my $end = sub { - if (my $err = $x->finish) { + if (my $err = $qsp->finish) { err($env, "git http-backend ($git_dir): $err"); } $fh->close if $fh; # async-only @@ -227,8 +224,7 @@ sub serve_smart { $r->[0] == 403 ? serve_dumb($env, $git, $path) : $r; }; my $res; - my $async = $env->{'pi-httpd.async'}; - my $io = $env->{'psgix.io'}; + my $async = $env->{'pi-httpd.async'}; # XXX unstable API my $cb = sub { my $r = $rd_hdr->() or return; $rd_hdr = undef; @@ -239,17 +235,16 @@ sub serve_smart { $rpipe->close; $end->(); } - return $res->($r); - } - if ($async) { + $res->($r); + } elsif ($async) { $fh = $res->($r); - return $async->async_pass($io, $fh, \$buf); + $async->async_pass($env->{'psgix.io'}, $fh, \$buf); + } else { # for synchronous PSGI servers + require PublicInbox::GetlineBody; + $r->[2] = PublicInbox::GetlineBody->new($rpipe, $end, + $buf); + $res->($r); } - - # for synchronous PSGI servers - require PublicInbox::GetlineBody; - $r->[2] = PublicInbox::GetlineBody->new($rpipe, $end, $buf); - $res->($r); }; sub { ($res) = @_; @@ -258,7 +253,7 @@ sub serve_smart { # holding the input here is a waste of FDs and memory $env->{'psgi.input'} = undef; - $x->start($limiter, sub { # may run later, much later... + $qsp->start($limiter, sub { # may run later, much later... ($rpipe) = @_; $in = undef; if ($async) { diff --git a/lib/PublicInbox/HTTP.pm b/lib/PublicInbox/HTTP.pm index cac14be3..c4b74b45 100644 --- a/lib/PublicInbox/HTTP.pm +++ b/lib/PublicInbox/HTTP.pm @@ -328,7 +328,7 @@ sub more ($$) { sub input_prepare { my ($self, $env) = @_; - my $input = $null_io; + my $input; my $len = $env->{CONTENT_LENGTH}; if ($len) { if ($len > $MAX_REQUEST_BUFFER) { @@ -339,6 +339,8 @@ sub input_prepare { } elsif (env_chunked($env)) { $len = CHUNK_START; open($input, '+>', undef); + } else { + $input = $null_io; } # TODO: expire idle clients on ENFILE / EMFILE diff --git a/lib/PublicInbox/HTTPD.pm b/lib/PublicInbox/HTTPD.pm index 433d6da7..fb476624 100644 --- a/lib/PublicInbox/HTTPD.pm +++ b/lib/PublicInbox/HTTPD.pm @@ -29,6 +29,8 @@ sub new { 'psgi.multiprocess' => Plack::Util::TRUE, 'psgix.harakiri'=> Plack::Util::FALSE, 'psgix.input.buffered' => Plack::Util::TRUE, + + # XXX unstable API! 'pi-httpd.async' => do { no warnings 'once'; *pi_httpd_async diff --git a/lib/PublicInbox/HTTPD/Async.pm b/lib/PublicInbox/HTTPD/Async.pm index 68514f5a..79951ca6 100644 --- a/lib/PublicInbox/HTTPD/Async.pm +++ b/lib/PublicInbox/HTTPD/Async.pm @@ -30,10 +30,10 @@ sub restart_read_cb ($) { } sub async_pass { - my ($self, $io, $fh, $bref) = @_; - # In case the client HTTP connection ($io) dies, it + my ($self, $http, $fh, $bref) = @_; + # In case the client HTTP connection ($http) dies, it # will automatically close this ($self) object. - $io->{forward} = $self; + $http->{forward} = $self; $fh->write($$bref); my $restart_read = restart_read_cb($self); weaken($self); @@ -41,10 +41,10 @@ sub async_pass { my $r = sysread($self->{sock}, $$bref, 8192); if ($r) { $fh->write($$bref); - return if $io->{closed}; - if ($io->{write_buf_size}) { + return if $http->{closed}; + if ($http->{write_buf_size}) { $self->watch_read(0); - $io->write($restart_read); # D::S::write + $http->write($restart_read); # D::S::write } # stay in watch_read, but let other clients # get some work done, too. @@ -55,7 +55,7 @@ sub async_pass { # Done! Error handling will happen in $fh->close # called by the {cleanup} handler - $io->{forward} = undef; + $http->{forward} = undef; $self->close; } } diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 8c639082..5503980f 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -29,11 +29,22 @@ sub _weaken_later ($) { $WEAKEN->{"$self"} = $self; } +sub _set_uint ($$$) { + my ($opts, $field, $default) = @_; + my $val = $opts->{$field}; + if (defined $val) { + $val = $val->[-1] if ref($val) eq 'ARRAY'; + $val = undef if $val !~ /\A\d+\z/; + } + $opts->{$field} = $val || $default; +} + sub new { my ($class, $opts) = @_; my $v = $opts->{address} ||= 'public-inbox@example.com'; my $p = $opts->{-primary_address} = ref($v) eq 'ARRAY' ? $v->[0] : $v; $opts->{domain} = ($p =~ /\@(\S+)\z/) ? $1 : 'localhost'; + _set_uint($opts, 'feedmax', 25); weaken($opts->{-pi_config}); bless $opts, $class; } diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm index acd2a47e..8e1728c7 100644 --- a/lib/PublicInbox/Linkify.pm +++ b/lib/PublicInbox/Linkify.pm @@ -22,11 +22,10 @@ my $LINK_RE = qr{(\()?\b((?:ftps?|https?|nntps?|gopher):// (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)? )}xi; -sub new { bless {}, shift } +sub new { bless {}, $_[0] } sub linkify_1 { - my ($self, $s) = @_; - $s =~ s!$LINK_RE! + $_[1] =~ s!$LINK_RE! my $beg = $1 || ''; my $url = $2; my $end = ''; @@ -50,19 +49,17 @@ sub linkify_1 { # only escape ampersands, others do not match LINK_RE $url =~ s/&/&/g; - $self->{$key} = $url; + $_[0]->{$key} = $url; $beg . 'PI-LINK-'. $key . $end; !ge; - $s; + $_[1]; } sub linkify_2 { - my ($self, $s) = @_; - # Added "PI-LINK-" prefix to avoid false-positives on git commits - $s =~ s!\bPI-LINK-([a-f0-9]{40})\b! + $_[1] =~ s!\bPI-LINK-([a-f0-9]{40})\b! my $key = $1; - my $url = $self->{$key}; + my $url = $_[0]->{$key}; if (defined $url) { "<a\nhref=\"$url\">$url</a>"; } else { @@ -70,7 +67,7 @@ sub linkify_2 { $key; } !ge; - $s; + $_[1]; } 1; diff --git a/lib/PublicInbox/Listener.pm b/lib/PublicInbox/Listener.pm index 5f351a77..a3a2015b 100644 --- a/lib/PublicInbox/Listener.pm +++ b/lib/PublicInbox/Listener.pm @@ -13,7 +13,7 @@ require IO::Handle; sub new ($$$) { my ($class, $s, $cb) = @_; setsockopt($s, SOL_SOCKET, SO_KEEPALIVE, 1); - setsockopt($s, IPPROTO_TCP, TCP_NODELAY, 1); + setsockopt($s, IPPROTO_TCP, TCP_NODELAY, 1); # ignore errors on non-TCP listen($s, 1024); IO::Handle::blocking($s, 0); my $self = fields::new($class); @@ -26,8 +26,12 @@ sub new ($$$) { sub event_read { my ($self) = @_; my $sock = $self->{sock}; + # no loop here, we want to fairly distribute clients # between multiple processes sharing the same socket + # XXX our event loop needs better granularity for + # a single accept() here to be, umm..., acceptable + # on high-traffic sites. if (my $addr = accept(my $c, $sock)) { IO::Handle::blocking($c, 0); # no accept4 :< $self->{post_accept}->($c, $addr, $sock); diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 24cb2667..b59430d8 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -108,12 +108,6 @@ my %all_pfx = (%bool_pfx_internal, %bool_pfx_external, %prob_prefix); sub xpfx { $all_pfx{$_[0]} } -our %PFX2TERM_RMAP; -my %meta_pfx = (mid => 1, thread => 1, path => 1); -while (my ($k, $v) = each %all_pfx) { - $PFX2TERM_RMAP{$v} = $k if $meta_pfx{$k}; -} - my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail'); sub xdir { @@ -295,8 +289,10 @@ sub lookup_message { sub lookup_mail { # no ghosts! my ($self, $mid) = @_; - my $smsg = lookup_message($self, $mid) or return; - PublicInbox::SearchMsg->load_doc($smsg->{doc}); + retry_reopen($self, sub { + my $smsg = lookup_message($self, $mid) or return; + PublicInbox::SearchMsg->load_doc($smsg->{doc}); + }); } sub find_unique_doc_id { diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 5779d1e2..96406c6f 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -7,12 +7,9 @@ package PublicInbox::SearchMsg; use strict; use warnings; use Search::Xapian; -use POSIX qw//; use Date::Parse qw/str2time/; use PublicInbox::MID qw/mid_clean/; use PublicInbox::Address; -our $PFX2TERM_RE = undef; -use POSIX qw(strftime); sub new { my ($class, $mime) = @_; @@ -72,13 +69,18 @@ sub subject ($) { __hdr($_[0], 'subject') } sub to ($) { __hdr($_[0], 'to') } sub cc ($) { __hdr($_[0], 'cc') } +# no strftime, that is locale-dependent +my @DoW = qw(Sun Mon Tue Wed Thu Fri Sat); +my @MoY = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); + sub date ($) { my ($self) = @_; - my $date = __hdr($self, 'date'); - return $date if defined $date; my $ts = $self->{ts}; return unless defined $ts; - $self->{date} = strftime('%a, %d %b %Y %T +0000', gmtime($ts)); + my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($ts); + "$DoW[$wday], " . sprintf("%02d $MoY[$mon] %04d %02d:%02d:%02d +0000", + $mday, $year+1900, $hour, $min, $sec); + } sub from ($) { @@ -118,29 +120,17 @@ sub references { defined $x ? $x : ''; } -sub ensure_metadata { - my ($self) = @_; +sub _get_term_val ($$$) { + my ($self, $pfx, $re) = @_; my $doc = $self->{doc}; my $end = $doc->termlist_end; - - unless (defined $PFX2TERM_RE) { - my $or = join('|', keys %PublicInbox::Search::PFX2TERM_RMAP); - $PFX2TERM_RE = qr/\A($or)/; - } - - while (my ($pfx, $field) = each %PublicInbox::Search::PFX2TERM_RMAP) { - # ideally we'd move this out of the loop: - my $i = $doc->termlist_begin; - - $i->skip_to($pfx); - if ($i != $end) { - my $val = $i->get_termname; - - if ($val =~ s/$PFX2TERM_RE//o) { - $self->{$field} = $val; - } - } + my $i = $doc->termlist_begin; + $i->skip_to($pfx); + if ($i != $end) { + my $val = $i->get_termname; + $val =~ s/$re// and return $val; } + undef; } sub mid ($;$) { @@ -151,8 +141,8 @@ sub mid ($;$) { } elsif (my $rv = $self->{mid}) { $rv; } else { - $self->ensure_metadata; # needed for ghosts - $self->{mid} ||= $self->_extract_mid; + $self->{mid} = _get_term_val($self, 'Q', qr/\AQ/) || + $self->_extract_mid; } } @@ -191,16 +181,14 @@ sub thread_id { my ($self) = @_; my $tid = $self->{thread}; return $tid if defined $tid; - $self->ensure_metadata; - $self->{thread}; + $self->{thread} = _get_term_val($self, 'G', qr/\AG/); # *G*roup } sub path { my ($self) = @_; my $path = $self->{path}; return $path if defined $path; - $self->ensure_metadata; - $self->{path}; + $self->{path} = _get_term_val($self, 'XPATH', qr/\AXPATH/); # path } 1; diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm index 601a84b0..2cd066db 100644 --- a/lib/PublicInbox/SearchThread.pm +++ b/lib/PublicInbox/SearchThread.pm @@ -21,32 +21,28 @@ package PublicInbox::SearchThread; use strict; use warnings; -sub new { - return bless { - messages => $_[1], - id_table => {}, - rootset => [] - }, $_[0]; -} - sub thread { - my $self = shift; - _add_message($self, $_) foreach @{$self->{messages}}; - my $id_table = delete $self->{id_table}; - $self->{rootset} = [ grep { + my ($messages, $ordersub) = @_; + my $id_table = {}; + _add_message($id_table, $_) foreach @$messages; + my $rootset = [ grep { !delete($_->{parent}) && $_->visible } values %$id_table ]; + $id_table = undef; + $rootset = $ordersub->($rootset); + $_->order_children($ordersub) for @$rootset; + $rootset; } sub _get_cont_for_id ($$) { - my ($self, $mid) = @_; - $self->{id_table}{$mid} ||= PublicInbox::SearchThread::Msg->new($mid); + my ($id_table, $mid) = @_; + $id_table->{$mid} ||= PublicInbox::SearchThread::Msg->new($mid); } sub _add_message ($$) { - my ($self, $smsg) = @_; + my ($id_table, $smsg) = @_; # A. if id_table... - my $this = _get_cont_for_id($self, $smsg->{mid}); + my $this = _get_cont_for_id($id_table, $smsg->{mid}); $this->{smsg} = $smsg; # B. For each element in the message's References field: @@ -59,7 +55,7 @@ sub _add_message ($$) { my $prev; foreach my $ref ($refs =~ m/<([^>]+)>/g) { # Find a Container object for the given Message-ID - my $cont = _get_cont_for_id($self, $ref); + my $cont = _get_cont_for_id($id_table, $ref); # Link the References field's Containers together in # the order implied by the References header @@ -82,13 +78,6 @@ sub _add_message ($$) { $prev->add_child($this) if defined $prev; } -sub order { - my ($self, $ordersub) = @_; - my $rootset = $ordersub->($self->{rootset}); - $self->{rootset} = $rootset; - $_->order_children($ordersub) for @$rootset; -} - package PublicInbox::SearchThread::Msg; use strict; use warnings; @@ -129,7 +118,7 @@ sub add_child { sub has_descendent { my ($self, $child) = @_; - my %seen; # loop prevention XXX may not be necessary + my %seen; # loop prevention while ($child) { return 1 if $self == $child || $seen{$child}++; $child = $child->{parent}; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index 50a2c01c..bd634d8d 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -161,6 +161,15 @@ sub search_nav_bot { $rv .= '</pre>'; } +sub sort_relevance { + my ($pct) = @_; + sub { + [ sort { (eval { $pct->{$b->topmost->{id}} } || 0) + <=> + (eval { $pct->{$a->topmost->{id}} } || 0) + } @{$_[0]} ] }; +} + sub mset_thread { my ($ctx, $mset, $q) = @_; my %pct; @@ -171,18 +180,8 @@ sub mset_thread { $smsg; } ($mset->items) ]}); - my $th = PublicInbox::SearchThread->new($msgs); - $th->thread; - if ($q->{r}) { # order by relevance - $th->order(sub { - [ sort { (eval { $pct{$b->topmost->{id}} } || 0) - <=> - (eval { $pct{$a->topmost->{id}} } || 0) - } @{$_[0]} ]; - }); - } else { # order by time (default for threaded view) - $th->order(*PublicInbox::View::sort_ts); - } + my $rootset = PublicInbox::SearchThread::thread($msgs, + $q->{r} ? sort_relevance(\%pct) : *PublicInbox::View::sort_ts); my $skel = search_nav_bot($mset, $q). "<pre>"; my $inbox = $ctx->{-inbox}; $ctx->{-upfx} = ''; @@ -196,7 +195,7 @@ sub mset_thread { $ctx->{seen} = {}; $ctx->{s_nr} = scalar(@$msgs).'+ results'; - PublicInbox::View::walk_thread($th, $ctx, + PublicInbox::View::walk_thread($rootset, $ctx, *PublicInbox::View::pre_thread); my $mime; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index fa47a16a..e4e9d7d2 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -260,8 +260,8 @@ sub _th_index_lite { } sub walk_thread { - my ($th, $ctx, $cb) = @_; - my @q = map { (0, $_, -1) } @{$th->{rootset}}; + my ($rootset, $ctx, $cb) = @_; + my @q = map { (0, $_, -1) } @$rootset; while (@q) { my ($level, $node, $i) = splice(@q, 0, 3); defined $node or next; @@ -285,10 +285,10 @@ sub thread_index_entry { } sub stream_thread ($$) { - my ($th, $ctx) = @_; + my ($rootset, $ctx) = @_; my $inbox = $ctx->{-inbox}; my $mime; - my @q = map { (0, $_) } @{$th->{rootset}}; + my @q = map { (0, $_) } @$rootset; my $level; while (@q) { $level = shift @q; @@ -350,10 +350,10 @@ sub thread_html { $ctx->{mapping} = {}; $ctx->{s_nr} = "$nr+ messages in thread"; - my $th = thread_results($msgs); - walk_thread($th, $ctx, *pre_thread); + my $rootset = thread_results($msgs); + walk_thread($rootset, $ctx, *pre_thread); $skel .= '</pre>'; - return stream_thread($th, $ctx) unless $ctx->{flat}; + return stream_thread($rootset, $ctx) unless $ctx->{flat}; # flat display: lazy load the full message from smsg my $inbox = $ctx->{-inbox}; @@ -441,7 +441,7 @@ sub attach_link ($$$$;$) { sub add_text_body { my ($upfx, $p) = @_; # from msg_iter: [ Email::MIME, depth, @idx ] - my ($part, $depth, @idx) = @$p; + my ($part, $depth) = @$p; # attachment @idx is unused my $ct = $part->content_type || 'text/plain'; my $fn = $part->filename; @@ -476,29 +476,26 @@ sub add_text_body { } my @quot; my $l = PublicInbox::Linkify->new; - while (defined(my $cur = shift @lines)) { + foreach my $cur (@lines) { if ($cur !~ /^>/) { # show the previously buffered quote inline flush_quote(\$s, $l, \@quot) if @quot; # regular line, OK - $cur = $l->linkify_1($cur); - $cur = ascii_html($cur); - $s .= $l->linkify_2($cur); + $l->linkify_1($cur); + $s .= $l->linkify_2(ascii_html($cur)); } else { push @quot, $cur; } } - my $end = "\n"; - if (@quot) { - $end = ''; + if (@quot) { # ugh, top posted flush_quote(\$s, $l, \@quot); + } elsif ($s =~ /\n\z/s) { # common, last line ends with a newline + $s; + } else { # some editors don't do newlines... + $s .= "\n"; } - $s =~ s/[ \t]+$//sgm; # kill per-line trailing whitespace - $s =~ s/\A\n+//s; # kill leading blank lines - $s =~ s/\s+\z//s; # kill all trailing spaces - $s .= $end; } sub _msg_html_prepare { @@ -737,7 +734,7 @@ sub indent_for { sub load_results { my ($srch, $sres) = @_; my $msgs = delete $sres->{msgs}; - $srch->retry_reopen(sub { [ map { $_->ensure_metadata; $_ } @$msgs ] }); + $srch->retry_reopen(sub { [ map { $_->mid; $_ } @$msgs ] }); } sub msg_timestamp { @@ -749,10 +746,7 @@ sub msg_timestamp { sub thread_results { my ($msgs) = @_; require PublicInbox::SearchThread; - my $th = PublicInbox::SearchThread->new($msgs); - $th->thread; - $th->order(*sort_ts); - $th + PublicInbox::SearchThread::thread($msgs, *sort_ts); } sub missing_thread { diff --git a/lib/PublicInbox/WwwAtomStream.pm b/lib/PublicInbox/WwwAtomStream.pm index 5720384c..a6817b31 100644 --- a/lib/PublicInbox/WwwAtomStream.pm +++ b/lib/PublicInbox/WwwAtomStream.pm @@ -9,10 +9,10 @@ use warnings; # FIXME: locale-independence: use POSIX qw(strftime); use Date::Parse qw(strptime); - +use Digest::SHA qw(sha1_hex); use PublicInbox::Address; use PublicInbox::Hval qw(ascii_html); -use PublicInbox::MID qw/mid_clean mid2path mid_escape/; +use PublicInbox::MID qw/mid_clean mid_escape/; # called by PSGI server after getline: sub close {} @@ -72,7 +72,8 @@ sub atom_header { my $mtime = (stat($ibx->{mainrepo}))[9] || time; qq(<?xml version="1.0" encoding="us-ascii"?>\n) . - qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} . + qq(<feed\nxmlns="http://www.w3.org/2005/Atom"\n) . + qq(xmlns:thr="http://purl.org/syndication/thread/1.0">) . qq{$title} . qq(<link\nrel="alternate"\ntype="text/html") . qq(\nhref="$base_url"/>) . @@ -81,22 +82,33 @@ sub atom_header { feed_updated(gmtime($mtime)); } +sub mid2uuid ($) { + my ($mid) = @_; + utf8::encode($mid); # really screwed up In-Reply-To fields exist + $mid = sha1_hex($mid); + my $h = '[a-f0-9]'; + my (@uuid5) = ($mid =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o); + 'urn:uuid:' . join('-', @uuid5); +} + # returns undef or string sub feed_entry { my ($self, $mime) = @_; my $ctx = $self->{ctx}; my $hdr = $mime->header_obj; my $mid = mid_clean($hdr->header_raw('Message-ID')); - - my $uuid = mid2path($mid); - $uuid =~ tr!/!!d; - my $h = '[a-f0-9]'; - my (@uuid5) = ($uuid =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o); - $uuid = 'urn:uuid:' . join('-', @uuid5); - - $mid = PublicInbox::Hval->new_msgid($mid); - my $href = $ctx->{feed_base_url} . $mid->{href}. '/'; - + my $irt = PublicInbox::View::in_reply_to($hdr); + my $uuid = mid2uuid($mid); + my $base = $ctx->{feed_base_url}; + if (defined $irt) { + my $irt_uuid = mid2uuid($irt); + $irt = mid_escape($irt); + $irt = qq(<thr:in-reply-to\nref="$irt_uuid"\n). + qq(href="$base$irt/"/>); + } else { + $irt = ''; + } + my $href = $base . mid_escape($mid) . '/'; my $date = $hdr->header('Date'); my @t = eval { strptime($date) } if defined $date; @t = gmtime(time) unless scalar @t; @@ -124,7 +136,7 @@ sub feed_entry { PublicInbox::View::multipart_text_as_html($mime, $href) . '</pre>' . qq!</div></content><link\nhref="$href"/>!. - "<id>$uuid</id></entry>"; + "<id>$uuid</id>$irt</entry>"; } sub feed_updated { diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm index b0f262cd..449cb499 100644 --- a/lib/PublicInbox/WwwText.pm +++ b/lib/PublicInbox/WwwText.pm @@ -1,7 +1,6 @@ # Copyright (C) 2016 all contributors <meta@public-inbox.org> # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> -# -# serves the /$INBOX/_/* endpoints from :text/* of the git tree + package PublicInbox::WwwText; use strict; use warnings; @@ -182,6 +181,10 @@ message threading $WIKI_URL/Atom_(standard) https://tools.ietf.org/html/rfc4287 + Atom Threading Extensions (RFC4685) is supported: + + https://tools.ietf.org/html/rfc4685 + Finally, the gzipped mbox for a thread is available for downloading and importing into your favorite mail client: @@ -30,6 +30,7 @@ my $tmpdir = tempdir('pi-config-XXXXXX', TMPDIR => 1, CLEANUP => 1); 'url' => 'http://example.com/meta', -primary_address => 'meta@public-inbox.org', 'name' => 'meta', + feedmax => 25, -pi_config => $cfg, }, "lookup matches expected output"); @@ -45,6 +46,7 @@ my $tmpdir = tempdir('pi-config-XXXXXX', TMPDIR => 1, CLEANUP => 1); 'mainrepo' => '/home/pi/test-main.git', 'domain' => 'public-inbox.org', 'name' => 'test', + feedmax => 25, 'url' => 'http://example.com/test', -pi_config => $cfg, }, "lookup matches expected output for test"); @@ -46,6 +46,7 @@ my $ibx = PublicInbox::Inbox->new({ name => 'testbox', mainrepo => $git_dir, url => 'http://example.com/test', + feedmax => 3, }); my $git = $ibx->git; my $im = PublicInbox::Import->new($git, $ibx->{name}, 'test@example'); @@ -101,10 +102,7 @@ EOF { # check initial feed { - my $feed = string_feed({ - -inbox => $ibx, - max => 3 - }); + my $feed = string_feed({ -inbox => $ibx }); SKIP: { skip 'XML::Feed missing', 2 unless $have_xml_feed; my $p = XML::Feed->parse(\$feed); @@ -142,10 +140,7 @@ EOF # check spam shows up { - my $spammy_feed = string_feed({ - -inbox => $ibx, - max => 3 - }); + my $spammy_feed = string_feed({ -inbox => $ibx }); SKIP: { skip 'XML::Feed missing', 2 unless $have_xml_feed; my $p = XML::Feed->parse(\$spammy_feed); @@ -167,7 +162,7 @@ EOF # spam no longer shows up { - my $feed = string_feed({ -inbox => $ibx, max => 3 }); + my $feed = string_feed({ -inbox => $ibx }); SKIP: { skip 'XML::Feed missing', 2 unless $have_xml_feed; my $p = XML::Feed->parse(\$feed); @@ -109,7 +109,6 @@ sub filter_mids { my $found = $ro->lookup_message('<root@s>'); ok($found, "message found"); is($root_id, $found->{doc_id}, 'doc_id set correctly'); - $found->ensure_metadata; is($found->mid, 'root@s', 'mid set correctly'); ok(int($found->thread_id) > 0, 'thread_id is an integer'); @@ -290,7 +289,6 @@ sub filter_mids { body => "LOOP!\n")); ok($doc_id > 0, "doc_id defined with circular reference"); my $smsg = $rw->lookup_message('circle@a'); - $smsg->ensure_metadata; is($smsg->references, '', "no references created"); my $msg = PublicInbox::SearchMsg->load_doc($smsg->{doc}); is($s, $msg->subject, 'long subject not rewritten'); diff --git a/t/thread-all.t b/t/thread-all.t new file mode 100644 index 00000000..8ccf4f8c --- /dev/null +++ b/t/thread-all.t @@ -0,0 +1,38 @@ +# Copyright (C) 2016 all contributors <meta@public-inbox.org> +# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> +# +# real-world testing of search threading +use strict; +use warnings; +use Test::More; +use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); +my $pi_dir = $ENV{GIANT_PI_DIR}; +plan skip_all => "GIANT_PI_DIR not defined for $0" unless $pi_dir; +eval { require PublicInbox::Search; }; +plan skip_all => "Xapian missing for $0" if $@; +my $srch = eval { PublicInbox::Search->new($pi_dir) }; +plan skip_all => "$pi_dir not initialized for $0" if $@; + +require PublicInbox::View; +require PublicInbox::SearchThread; + +my $pfx = PublicInbox::Search::xpfx('thread'); +my $opts = { limit => 1000000, asc => 1 }; +my $t0 = clock_gettime(CLOCK_MONOTONIC); +my $elapsed; + +my $sres = $srch->_do_enquire(undef, $opts); +$elapsed = clock_gettime(CLOCK_MONOTONIC) - $t0; +diag "enquire: $elapsed"; + +$t0 = clock_gettime(CLOCK_MONOTONIC); +my $msgs = PublicInbox::View::load_results($srch, $sres); +$elapsed = clock_gettime(CLOCK_MONOTONIC) - $t0; +diag "load_results $elapsed"; + +$t0 = clock_gettime(CLOCK_MONOTONIC); +PublicInbox::View::thread_results($msgs); +$elapsed = clock_gettime(CLOCK_MONOTONIC) - $t0; +diag "thread_results $elapsed"; + +done_testing(); diff --git a/t/thread-cycle.t b/t/thread-cycle.t index b0844490..16ceee71 100644 --- a/t/thread-cycle.t +++ b/t/thread-cycle.t @@ -3,7 +3,6 @@ use strict; use warnings; use Test::More; -use_ok('PublicInbox::SearchMsg'); use_ok('PublicInbox::SearchThread'); use Email::Simple; my $mt = eval { @@ -73,11 +72,11 @@ SKIP: { done_testing(); sub thread_to_s { - my $th = PublicInbox::SearchThread->new(shift); - $th->thread; - $th->order(sub { [ sort { $a->{id} cmp $b->{id} } @{$_[0]} ] }); + my ($msgs) = @_; + my $rootset = PublicInbox::SearchThread::thread($msgs, sub { + [ sort { $a->{id} cmp $b->{id} } @{$_[0]} ] }); my $st = ''; - my @q = map { (0, $_) } @{$th->{rootset}}; + my @q = map { (0, $_) } @$rootset; while (@q) { my $level = shift @q; my $node = shift @q or next; |