diff options
Diffstat (limited to 'lib')
39 files changed, 1247 insertions, 515 deletions
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index a1b1fc07..bb5d3653 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -11,6 +11,7 @@ use PublicInbox::Config; use PublicInbox::Inbox; use PublicInbox::Spawn qw(run_qx); use PublicInbox::Eml; +use PublicInbox::Git qw(git_exe); *rel2abs_collapsed = \&PublicInbox::Config::rel2abs_collapsed; sub setup_signals { @@ -77,7 +78,7 @@ sub resolve_git_dir { my $env; defined($pwd) && substr($cd // '/', 0, 1) ne '/' and $env->{PWD} = "$pwd/$cd"; - my $cmd = [ qw(git rev-parse --git-dir) ]; + my $cmd = [ git_exe, qw(rev-parse --git-dir) ]; my $dir = run_qx($cmd, $env, { -C => $cd }); die "error in @$cmd (cwd:${\($cd // '.')}): $?\n" if $?; chomp $dir; @@ -317,7 +318,7 @@ sub progress_prepare ($;$) { $opt->{1} = $null; # suitable for spawn() redirect } else { $opt->{verbose} ||= 1; - $dst //= *STDERR{GLOB}; + $dst //= \*STDERR; $opt->{-progress} = sub { print $dst '# ', @_ }; } } diff --git a/lib/PublicInbox/Compat.pm b/lib/PublicInbox/Compat.pm index 78cba90e..8ed2d7a1 100644 --- a/lib/PublicInbox/Compat.pm +++ b/lib/PublicInbox/Compat.pm @@ -8,7 +8,7 @@ use v5.12; use parent qw(Exporter); require List::Util; -our @EXPORT_OK = qw(uniqstr); +our @EXPORT_OK = qw(uniqstr sum0); # uniqstr is in List::Util 1.45+, which means Perl 5.26+; # so maybe 2030 for us since we need to support enterprise distros. @@ -21,4 +21,6 @@ no warnings 'once'; grep { !$seen{$_}++ } @_; }; +*sum0 = List::Util->can('sum0') // sub (@) { List::Util::sum(@_) // 0 }; + 1; diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 49659a2e..998fc25e 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -517,7 +517,9 @@ sub _fill_ibx { delete $ibx->{newsgroup}; warn "newsgroup name invalid: `$ngname'\n"; } else { - my $lc = $ibx->{newsgroup} = lc $ngname; + %dedupe = (lc($ngname) => undef); + my ($lc) = keys %dedupe; + $ibx->{newsgroup} = $lc; warn <<EOM if $lc ne $ngname; W: newsgroup=`$ngname' lowercased to `$lc' EOM @@ -630,7 +632,7 @@ sub urlmatch { } elsif (($? >> 8) != 1) { $urlmatch_broken = 1; } elsif ($try_git) { # n.b. this takes cwd into account - $val = run_qx([qw(git config), @bool, + $val = run_qx([$cmd->[0], 'config', @bool, qw(-z --get-urlmatch), $key, $url]); undef $val if $?; } diff --git a/lib/PublicInbox/DS.pm b/lib/PublicInbox/DS.pm index a6fec954..f807c626 100644 --- a/lib/PublicInbox/DS.pm +++ b/lib/PublicInbox/DS.pm @@ -18,14 +18,14 @@ # sock: underlying socket # rbuf: scalarref, usually undef # wbuf: arrayref of coderefs or tmpio (autovivified)) -# (tmpio = [ GLOB, offset, [ length ] ]) +# (tmpio = [ GLOB, offset, [ length ] ]) package PublicInbox::DS; use strict; use v5.10.1; use parent qw(Exporter); use bytes qw(length substr); # FIXME(?): needed for PublicInbox::NNTP use POSIX qw(WNOHANG sigprocmask SIG_SETMASK SIG_UNBLOCK); -use Fcntl qw(SEEK_SET :DEFAULT O_APPEND); +use Fcntl qw(SEEK_SET :DEFAULT); use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); use Scalar::Util qw(blessed); use PublicInbox::Syscall qw(%SIGNUM @@ -35,23 +35,24 @@ use PublicInbox::Select; use PublicInbox::OnDestroy; use Errno qw(EAGAIN EINVAL ECHILD); use Carp qw(carp croak); +use List::Util qw(sum); our @EXPORT_OK = qw(now msg_more awaitpid add_timer add_uniq_timer); +my $sendmsg_more = PublicInbox::Syscall->can('sendmsg_more'); +my $writev = PublicInbox::Syscall->can('writev'); my $nextq; # queue for next_tick my $reap_armed; my @active; # FDs (or objs) returned by epoll/kqueue our (%AWAIT_PIDS, # pid => [ $callback, @args ] $cur_runq, # only set inside next_tick - @FD_MAP, # fd (num) -> PublicInbox::DS object - $Poller, # global Select, Epoll, DSPoll, or DSKQXS ref - - @post_loop_do, # subref + args to call at the end of each loop - - $loop_timeout, # timeout of event loop in milliseconds - @Timers, # timers - %UniqTimer, - $in_loop, - ); + @FD_MAP, # fd (num) -> PublicInbox::DS object + $Poller, # global Select, Epoll, DSPoll, or DSKQXS ref + @post_loop_do, # subref + args to call at the end of each loop + $loop_timeout, # timeout of event loop in milliseconds + @Timers, + %UniqTimer, + $in_loop, +); Reset(); @@ -318,23 +319,22 @@ This is normally (always?) called from your subclass via: =cut sub new { - my ($self, $sock, $ev) = @_; - $self->{sock} = $sock; - my $fd = fileno($sock); - - $Poller //= _InitPoller(); + my ($self, $sock, $ev) = @_; + $self->{sock} = $sock; + my $fd = fileno($sock); + $Poller //= _InitPoller(); retry: - if ($Poller->ep_add($sock, $ev)) { - if ($! == EINVAL && ($ev & EPOLLEXCLUSIVE)) { - $ev &= ~EPOLLEXCLUSIVE; - goto retry; - } - die "EPOLL_CTL_ADD $self/$sock/$fd: $!"; - } - defined($FD_MAP[$fd]) and + if ($Poller->ep_add($sock, $ev)) { + if ($! == EINVAL && ($ev & EPOLLEXCLUSIVE)) { + $ev &= ~EPOLLEXCLUSIVE; + goto retry; + } + die "EPOLL_CTL_ADD $self/$sock/$fd: $!"; + } + defined($FD_MAP[$fd]) and croak("BUG: FD:$fd in use by $FD_MAP[$fd] (for $self/$sock)"); - $FD_MAP[$fd] = $self; + $FD_MAP[$fd] = $self; } # for IMAP, NNTP, and POP3 which greet clients upon connect @@ -374,75 +374,80 @@ sub close { # portable, non-thread-safe sendfile emulation (no pread, yet) sub send_tmpio ($$) { - my ($sock, $tmpio) = @_; - - sysseek($tmpio->[0], $tmpio->[1], SEEK_SET) or return; - my $n = $tmpio->[2] // 65536; - $n = 65536 if $n > 65536; - defined(my $to_write = sysread($tmpio->[0], my $buf, $n)) or return; - my $written = 0; - while ($to_write > 0) { - if (defined(my $w = syswrite($sock, $buf, $to_write, $written))) { - $written += $w; - $to_write -= $w; - } else { - return if $written == 0; - last; - } - } - $tmpio->[1] += $written; # offset - $tmpio->[2] -= $written if defined($tmpio->[2]); # length - $written; + my ($sock, $tmpio) = @_; + + sysseek($tmpio->[0], $tmpio->[1], SEEK_SET) or return; + my $n = $tmpio->[2] // 65536; + $n = 65536 if $n > 65536; + my $to_write = sysread($tmpio->[0], my $buf, $n) // return; + my $total = 0; + while ($to_write > 0) { + if (defined(my $w = syswrite($sock, $buf, $to_write, $total))) { + $total += $w; + $to_write -= $w; + } else { + $total ? last : return; + } + } + $tmpio->[1] += $total; # offset + $tmpio->[2] -= $total if defined($tmpio->[2]); # length + $total; } sub epbit ($$) { # (sock, default) $_[0]->can('stop_SSL') ? PublicInbox::TLS::epollbit() : $_[1]; } +sub epwait ($$) { + my ($io, $ev) = @_; + $Poller->ep_mod($io, $ev) and croak("EPOLL_CTL_MOD($io): $!"); +} + # returns 1 if done, 0 if incomplete -sub flush_write ($) { - my ($self) = @_; - my $sock = $self->{sock} or return; - my $wbuf = $self->{wbuf} or return 1; +sub flush_write { + my ($self) = @_; + my $sock = $self->{sock} or return; + my $wbuf = $self->{wbuf} or return 1; next_buf: - while (my $bref = $wbuf->[0]) { - if (ref($bref) ne 'CODE') { - while ($sock) { - my $w = send_tmpio($sock, $bref); # bref is tmpio - if (defined $w) { - if ($w == 0) { - shift @$wbuf; - goto next_buf; - } - } elsif ($! == EAGAIN && (my $ev = epbit($sock, EPOLLOUT))) { - epwait($sock, $ev | EPOLLONESHOT); - return 0; - } else { - return $self->close; - } - } - } else { #(ref($bref) eq 'CODE') { - shift @$wbuf; - my $before = scalar(@$wbuf); - $bref->($self); - - # bref may be enqueueing more CODE to call (see accept_tls_step) - return 0 if (scalar(@$wbuf) > $before); - } - } # while @$wbuf - - delete $self->{wbuf}; - 1; # all done + while (my $bref = $wbuf->[0]) { + if (ref($bref) ne 'CODE') { # bref is tmpio + while ($sock) { + my $w = send_tmpio $sock, $bref; + if (defined $w) { + if ($w == 0) { + shift @$wbuf; + goto next_buf; + } + } elsif ($! == EAGAIN && (my $ev = epbit + $sock, EPOLLOUT)) { + epwait $sock, $ev | EPOLLONESHOT; + return 0; + } else { + return $self->close; + } + } + } else { #(ref($bref) eq 'CODE') { + shift @$wbuf; + my $before = scalar(@$wbuf); + $bref->($self); + # bref may be enqueueing more CODE to call + # (see accept_tls_step) + return 0 if (scalar(@$wbuf) > $before); + } + } # while @$wbuf + + delete $self->{wbuf}; + 1; # all done } sub rbuf_idle ($$) { - my ($self, $rbuf) = @_; - if ($$rbuf eq '') { # who knows how long till we can read again - delete $self->{rbuf}; - } else { - $self->{rbuf} = $rbuf; - } + my ($self, $rbuf) = @_; + if ($$rbuf eq '') { # who knows how long till we can read again + delete $self->{rbuf}; + } else { + $self->{rbuf} = $rbuf; + } } # returns true if bytes are read, false otherwise @@ -451,8 +456,8 @@ sub do_read ($$$;$) { my ($ev, $r, $s); $r = sysread($s = $self->{sock}, $$rbuf, $len, $off // 0) and return $r; - if (!defined($r) && $! == EAGAIN && ($ev = epbit($s, EPOLLIN))) { - epwait($s, $ev | EPOLLONESHOT); + if (!defined($r) && $! == EAGAIN && ($ev = epbit $s, EPOLLIN)) { + epwait $s, $ev | EPOLLONESHOT; rbuf_idle($self, $rbuf); } else { $self->close; @@ -462,22 +467,23 @@ sub do_read ($$$;$) { # drop the socket if we hit unrecoverable errors on our system which # require BOFH attention: ENOSPC, EFBIG, EIO, EMFILE, ENFILE... -sub drop { +sub drop ($@) { my $self = shift; carp(@_); $self->close; undef; } -sub tmpio ($$$) { - my ($self, $bref, $off) = @_; - my $fh = tmpfile('wbuf', $self->{sock}, O_APPEND) or - return drop($self, "tmpfile $!"); +sub tmpio ($$$;@) { + my ($self, $bref, $off, @rest) = @_; + my $fh = tmpfile 'wbuf', $self->{sock}, 1 or + return drop $self, "tmpfile $!"; $fh->autoflush(1); my $len = length($$bref) - $off; my $n = syswrite($fh, $$bref, $len, $off) // - return drop($self, "write ($len): $!"); - $n == $len or return drop($self, "wrote $n < $len bytes"); + return drop $self, "write ($len): $!"; + $n == $len or return drop $self, "wrote $n < $len bytes"; + @rest and (print $fh @rest or return drop $self, "print rest: $!"); [ $fh, 0 ] # [1] = offset, [2] = length, not set by us } @@ -490,112 +496,135 @@ it returns 1, caller should stop waiting for 'writable' events) =cut sub write { - my ($self, $data) = @_; - - # nobody should be writing to closed sockets, but caller code can - # do two writes within an event, have the first fail and - # disconnect the other side (whose destructor then closes the - # calling object, but it's still in a method), and then the - # now-dead object does its second write. that is this case. we - # just lie and say it worked. it'll be dead soon and won't be - # hurt by this lie. - my $sock = $self->{sock} or return 1; - my $ref = ref $data; - my $bref = $ref ? $data : \$data; - my $wbuf = $self->{wbuf}; - if ($wbuf && scalar(@$wbuf)) { # already buffering, can't write more... - if ($ref eq 'CODE') { - push @$wbuf, $bref; - } else { - my $tmpio = $wbuf->[-1]; - if (ref($tmpio) eq 'ARRAY' && !defined($tmpio->[2])) { - # append to tmp file buffer - $tmpio->[0]->print($$bref) or return drop($self, "print: $!"); - } else { - my $tmpio = tmpio($self, $bref, 0) or return 0; - push @$wbuf, $tmpio; - } - } - return 0; - } elsif ($ref eq 'CODE') { - $bref->($self); - return 1; - } else { - my $to_write = length($$bref); - my $written = syswrite($sock, $$bref, $to_write); - - if (defined $written) { - return 1 if $written == $to_write; - requeue($self); # runs: event_step -> flush_write - } elsif ($! == EAGAIN) { - my $ev = epbit($sock, EPOLLOUT) or return $self->close; - epwait($sock, $ev | EPOLLONESHOT); - $written = 0; - } else { - return $self->close; - } - - # deal with EAGAIN or partial write: - my $tmpio = tmpio($self, $bref, $written) or return 0; - - # wbuf may be an empty array if we're being called inside - # ->flush_write via CODE bref: - push @{$self->{wbuf}}, $tmpio; # autovivifies - return 0; - } -} - -use constant MSG_MORE => ($^O eq 'linux') ? 0x8000 : 0; - -sub msg_more ($$) { - my $self = $_[0]; - my $sock = $self->{sock} or return 1; - my $wbuf = $self->{wbuf}; - - if (MSG_MORE && (!defined($wbuf) || !scalar(@$wbuf)) && - !$sock->can('stop_SSL')) { - my $n = send($sock, $_[1], MSG_MORE); - if (defined $n) { - my $nlen = length($_[1]) - $n; - return 1 if $nlen == 0; # all done! - # queue up the unwritten substring: - my $tmpio = tmpio($self, \($_[1]), $n) or return 0; - push @{$self->{wbuf}}, $tmpio; # autovivifies - epwait($sock, EPOLLOUT|EPOLLONESHOT); - return 0; - } - } - - # don't redispatch into NNTPdeflate::write - PublicInbox::DS::write($self, \($_[1])); + my ($self, $data) = @_; + + # nobody should be writing to closed sockets, but caller code can + # do two writes within an event, have the first fail and + # disconnect the other side (whose destructor then closes the + # calling object, but it's still in a method), and then the + # now-dead object does its second write. that is this case. we + # just lie and say it worked. it'll be dead soon and won't be + # hurt by this lie. + my $sock = $self->{sock} or return 1; + my $ref = ref $data; + my $bref = $ref ? $data : \$data; + my $wbuf = $self->{wbuf}; + if ($wbuf && scalar(@$wbuf)) { # already buffering, can't write more... + if ($ref eq 'CODE') { + push @$wbuf, $bref; + } else { + my $tmpio = $wbuf->[-1]; + if (ref($tmpio) eq 'ARRAY' && !defined($tmpio->[2])) { + # append to tmp file buffer + $tmpio->[0]->print($$bref) or + return drop($self, "print: $!"); + } else { + $tmpio = tmpio $self, $bref, 0 or return 0; + push @$wbuf, $tmpio; + } + } + 0; + } elsif ($ref eq 'CODE') { + $bref->($self); + 1; + } else { + my $to_write = length $$bref; + my $w = syswrite $sock, $$bref, $to_write; + + if (defined $w) { + return 1 if $w == $to_write; + requeue $self; # runs: event_step -> flush_write + } elsif ($! == EAGAIN) { + my $ev = epbit $sock, EPOLLOUT or return $self->close; + epwait $sock, $ev | EPOLLONESHOT; + $w = 0; + } else { + return $self->close; + } + + # deal with EAGAIN or partial write: + my $tmpio = tmpio $self, $bref, $w or return 0; + + # wbuf may be an empty array if we're being called inside + # ->flush_write via CODE bref: + push @{$self->{wbuf}}, $tmpio; # autovivifies + 0; + } } -sub epwait ($$) { - my ($io, $ev) = @_; - $Poller->ep_mod($io, $ev) and croak("EPOLL_CTL_MOD($io): $!"); +sub _iov_write ($$@) { + my ($self, $cb) = (shift, shift); + my ($tip, $tmpio, $s, $exp); + $s = $cb->($self->{sock}, @_); + if (defined $s) { + $exp = sum(map length, @_); + return 1 if $s == $exp; + while (@_) { + $tip = shift; + if ($s >= length($tip)) { # fully written + $s -= length($tip); + } else { # first partial write + $tmpio = tmpio $self, \$tip, $s, @_ or return 0; + last; + } + } + $tmpio // return drop $self, "BUG: tmpio on $s != $exp"; + } elsif ($! == EAGAIN) { + $tip = shift; + $tmpio = tmpio $self, \$tip, 0, @_ or return 0; + } else { # client disconnected + return $self->close; + } + push @{$self->{wbuf}}, $tmpio; # autovivifies + epwait $self->{sock}, EPOLLOUT|EPOLLONESHOT; + 0; +} + +sub msg_more ($@) { + my $self = shift; + my $sock = $self->{sock} or return 1; + my $wbuf = $self->{wbuf}; + if ($sendmsg_more && (!defined($wbuf) || !scalar(@$wbuf)) && + !$sock->can('stop_SSL')) { + _iov_write $self, $sendmsg_more, @_; + } else { # don't redispatch into NNTPdeflate::write + PublicInbox::DS::write($self, join('', @_)); + } +} + +sub writev ($@) { + my $self = shift; + my $sock = $self->{sock} or return 1; + my $wbuf = $self->{wbuf}; + if ($writev && (!defined($wbuf) || !scalar(@$wbuf)) && + !$sock->can('stop_SSL')) { + _iov_write $self, $writev, @_; + } else { # don't redispatch into NNTPdeflate::write + PublicInbox::DS::write($self, join('', @_)); + } } # return true if complete, false if incomplete (or failure) sub accept_tls_step ($) { - my ($self) = @_; - my $sock = $self->{sock} or return; - return 1 if $sock->accept_SSL; - return $self->close if $! != EAGAIN; - my $ev = PublicInbox::TLS::epollbit() or return $self->close; - epwait($sock, $ev | EPOLLONESHOT); - unshift(@{$self->{wbuf}}, \&accept_tls_step); # autovivifies - 0; + my ($self) = @_; + my $sock = $self->{sock} or return; + return 1 if $sock->accept_SSL; + return $self->close if $! != EAGAIN; + my $ev = PublicInbox::TLS::epollbit() or return $self->close; + epwait $sock, $ev | EPOLLONESHOT; + unshift @{$self->{wbuf}}, \&accept_tls_step; # autovivifies + 0; } # return value irrelevant sub shutdn_tls_step ($) { - my ($self) = @_; - my $sock = $self->{sock} or return; - return $self->close if $sock->stop_SSL(SSL_fast_shutdown => 1); - return $self->close if $! != EAGAIN; - my $ev = PublicInbox::TLS::epollbit() or return $self->close; - epwait($sock, $ev | EPOLLONESHOT); - unshift(@{$self->{wbuf}}, \&shutdn_tls_step); # autovivifies + my ($self) = @_; + my $sock = $self->{sock} or return; + return $self->close if $sock->stop_SSL(SSL_fast_shutdown => 1); + return $self->close if $! != EAGAIN; + my $ev = PublicInbox::TLS::epollbit() or return $self->close; + epwait $sock, $ev | EPOLLONESHOT; + unshift @{$self->{wbuf}}, \&shutdn_tls_step; # autovivifies } # don't bother with shutdown($sock, 2), we don't fork+exec w/o CLOEXEC @@ -603,7 +632,7 @@ sub shutdn_tls_step ($) { sub shutdn ($) { my ($self) = @_; my $sock = $self->{sock} or return; - $sock->can('stop_SSL') ? shutdn_tls_step($self) : $self->close; + $sock->can('stop_SSL') ? shutdn_tls_step $self : $self->close; } sub dflush {} # overridden by DSdeflate diff --git a/lib/PublicInbox/DSdeflate.pm b/lib/PublicInbox/DSdeflate.pm index 539adf0f..a9f9693b 100644 --- a/lib/PublicInbox/DSdeflate.pm +++ b/lib/PublicInbox/DSdeflate.pm @@ -91,12 +91,15 @@ sub do_read ($$$$) { } # override PublicInbox::DS::msg_more -sub msg_more ($$) { - my $self = $_[0]; +sub msg_more ($@) { + my $self = shift; # $_[1] may be a reference or not for ->deflate - my $err = $zout->deflate($_[1], $zbuf); - $err == Z_OK or die "->deflate failed $err"; + my $err; + for (@_) { + $err = $zout->deflate($_, $zbuf); + $err == Z_OK or die "->deflate failed $err"; + } 1; } diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 883dbea3..934197c0 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -543,13 +543,7 @@ sub _ibx_for ($$$) { sub _fd_constrained ($) { my ($self) = @_; $self->{-fd_constrained} //= do { - my $soft; - if (eval { require BSD::Resource; 1 }) { - my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); - ($soft, undef) = BSD::Resource::getrlimit($NOFILE); - } else { - chomp($soft = `sh -c 'ulimit -n'`); - } + my $soft = PublicInbox::Search::ulimit_n; if (defined($soft)) { # $want is an estimate my $want = scalar(@{$self->{ibx_active}}) + 64; diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm index 78392990..acc2091c 100644 --- a/lib/PublicInbox/Gcf2.pm +++ b/lib/PublicInbox/Gcf2.pm @@ -11,7 +11,7 @@ use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); use IO::Handle; # autoflush use PublicInbox::Git qw($ck_unlinked_packs); use PublicInbox::Lock; -use autodie qw(close open seek truncate); +use autodie qw(open seek truncate); BEGIN { my (%CFG, $c_src); diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index a9a821ad..32c11a59 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -61,7 +61,10 @@ sub git_exe () { return $GIT_EXE if $now < $next_check; $next_check = $now + 10; $GIT_EXE = which('git') // die "git not found in $ENV{PATH}"; - my @st = stat(_) or die "stat($GIT_EXE): $!"; # can't do HiRes w/ _ +} + +sub git_version () { + my @st = stat(git_exe) or die "stat($GIT_EXE): $!"; my $st = pack('dd', $st[0], $st[1]); if ($st ne $EXE_ST) { my $v = run_qx([ $GIT_EXE, '--version' ]); @@ -71,11 +74,6 @@ sub git_exe () { $GIT_VER = eval("v$1") // die "BUG: bad vstring: $1 ($v)"; $EXE_ST = $st; } - $GIT_EXE; -} - -sub git_version () { - git_exe; $GIT_VER; } @@ -108,9 +106,10 @@ sub new { sub git_path ($$) { my ($self, $path) = @_; $self->{-git_path}->{$path} //= do { - my $d = "$self->{git_dir}/$path"; - if (-e $d) { - $d; + my $d = $self->{git_dir}; + my $f = "$d/$path"; + if (-d "$d/objects") { + $f; } else { local $/ = "\n"; my $rdr = { 2 => \my $err }; @@ -119,7 +118,7 @@ sub git_path ($$) { chomp $s; # git prior to 2.5.0 did not understand --git-path - $s eq "--git-path\n$path" ? $d : $s; + $s eq "--git-path\n$path" ? $f : $s; } }; } @@ -208,7 +207,7 @@ sub cat_async_retry ($$) { $oid = \$oid if !@$new_inflight; # to indicate oid retried push @$new_inflight, $oid, $cb, $arg; } - $sock->close if $sock; # only safe once old_inflight is empty + undef $sock; # gcf_drain may run from PublicInbox::IO::DESTROY cat_async_step($self, $new_inflight); # take one step } @@ -639,7 +638,8 @@ sub event_step { my ($self) = @_; my $inflight = gcf_inflight($self); if ($inflight && @$inflight) { - $self->cat_async_step($inflight); + eval { $self->cat_async_step($inflight) }; + warn "E: $self->{git_dir}: $@" if $@; return $self->close unless $self->{sock}; # don't loop here to keep things fair, but we must requeue # if there's already-read data in pi_io_rbuf @@ -664,10 +664,9 @@ sub watch_async ($) { sub close { my ($self) = @_; - my $sock = $self->{sock}; delete @$self{qw(-bc err_c inflight)}; delete($self->{epwatch}) ? $self->SUPER::close : delete($self->{sock}); - $sock->close if $sock; # calls gcf_drain via awaitpid + # gcf_drain may run from PublicInbox::IO::DESTROY } package PublicInbox::GitCheck; # only for git <2.36 diff --git a/lib/PublicInbox/GitCredential.pm b/lib/PublicInbox/GitCredential.pm index bb225ff3..cf5a2213 100644 --- a/lib/PublicInbox/GitCredential.pm +++ b/lib/PublicInbox/GitCredential.pm @@ -4,13 +4,14 @@ # git-credential wrapper with built-in .netrc fallback package PublicInbox::GitCredential; use v5.12; +use PublicInbox::Git qw(git_exe); use PublicInbox::Spawn qw(popen_rd); use autodie qw(close pipe); sub run ($$;$) { my ($self, $op, $lei) = @_; my ($in_r, $in_w, $out_r); - my $cmd = [ qw(git credential), $op ]; + my $cmd = [ git_exe, 'credential', $op ]; pipe($in_r, $in_w); if ($lei) { # we'll die if disconnected: pipe($out_r, my $out_w); diff --git a/lib/PublicInbox/GitHTTPBackend.pm b/lib/PublicInbox/GitHTTPBackend.pm index 396aa783..ac610d4b 100644 --- a/lib/PublicInbox/GitHTTPBackend.pm +++ b/lib/PublicInbox/GitHTTPBackend.pm @@ -106,7 +106,9 @@ sub serve_smart { $env{PATH_TRANSLATED} = "$git->{git_dir}/$path"; my $rdr = input_prepare($env) or return r(500); $rdr->{quiet} = 1; - my $qsp = PublicInbox::Qspawn->new([qw(git http-backend)], \%env, $rdr); + my $cmd = $git->cmd('http-backend'); + splice @$cmd, 1, 0, '-c', 'safe.directory=*'; + my $qsp = PublicInbox::Qspawn->new($cmd, \%env, $rdr); $qsp->psgi_yield($env, $limiter, \&ghb_parse_hdr, $env, $git, $path); } diff --git a/lib/PublicInbox/GzipFilter.pm b/lib/PublicInbox/GzipFilter.pm index 8b630f25..a2e82a2d 100644 --- a/lib/PublicInbox/GzipFilter.pm +++ b/lib/PublicInbox/GzipFilter.pm @@ -11,6 +11,8 @@ # async_eml callbacks only run when a blob arrives from git. # # We continue to support getline+close for generic PSGI servers. +# Note: we favor gzip in Perl (as opposed to nginx || varnish) to +# reduce IPC memory traffic package PublicInbox::GzipFilter; use strict; use parent qw(Exporter); @@ -21,7 +23,9 @@ use PublicInbox::GitAsyncCat; use Carp qw(carp); our @EXPORT_OK = qw(gzf_maybe); -my %OPT = (-WindowBits => 15 + 16, -AppendOutput => 1); +# Compress::Raw::Zlib uses MAX_MEM_LEVEL (9) while zlib DEF_MEM_LEVEL is 8; +# choose the zlib default because C:R:Z is excessive. +my %OPT = (-WindowBits => 15 + 16, -AppendOutput => 1, -MemLevel => 8); my @GZIP_HDRS = qw(Vary Accept-Encoding Content-Encoding gzip); sub new { bless {}, shift } # qspawn filter diff --git a/lib/PublicInbox/HTTP.pm b/lib/PublicInbox/HTTP.pm index 7162732e..73632aee 100644 --- a/lib/PublicInbox/HTTP.pm +++ b/lib/PublicInbox/HTTP.pm @@ -21,6 +21,7 @@ package PublicInbox::HTTP; use strict; use parent qw(PublicInbox::DS); +use bytes qw(length); use Fcntl qw(:seek); use Plack::HTTPParser qw(parse_http_request); # XS or pure Perl use Plack::Util; @@ -36,6 +37,7 @@ use constant { CHUNK_MAX_HDR => 256, }; use Errno qw(EAGAIN); +use PublicInbox::Compat qw(sum0); # Use the same configuration parameter as git since this is primarily # a slow-client sponge for git-http-backend @@ -47,7 +49,7 @@ open(my $null_io, '<', '/dev/null') or die "open /dev/null: $!"; { my @n = stat($null_io) or die "stat(/dev/null): $!"; my @i = stat(STDIN) or die "stat(STDIN): $!"; - $null_io = *STDIN{IO} if "@n[0, 1]" eq "@i[0, 1]"; + $null_io = \*STDIN if "@n[0, 1]" eq "@i[0, 1]"; } my $http_date; @@ -165,7 +167,7 @@ sub app_dispatch { } } -sub response_header_write { +sub response_header_write ($$$) { my ($self, $env, $res) = @_; my $proto = $env->{SERVER_PROTOCOL} or return; # HTTP/0.9 :P my $status = $res->[0]; @@ -188,7 +190,13 @@ sub response_header_write { my $conn = $env->{HTTP_CONNECTION} || ''; my $term = defined($len) || $chunked; my $prot_persist = ($proto eq 'HTTP/1.1') && ($conn !~ /\bclose\b/i); - my $alive; + my ($alive, $res_body); + if (!$term && ref($res->[2]) eq 'ARRAY') { + ($res_body, $res->[2]) = ($res->[2], []); + $len = sum0(map length, @$res_body); + $h .= "Content-Length: $len\r\n"; + $term = 1; + } if (!$term && $prot_persist) { # auto-chunk $chunked = $alive = 2; $alive = 3 if $env->{REQUEST_METHOD} eq 'HEAD'; @@ -203,7 +211,9 @@ sub response_header_write { } $h .= 'Date: ' . http_date() . "\r\n\r\n"; - if (($len || $chunked) && $env->{REQUEST_METHOD} ne 'HEAD') { + if ($res_body) { + $self->writev($h, @$res_body); + } elsif (($len || $chunked) && $env->{REQUEST_METHOD} ne 'HEAD') { msg_more($self, $h); } else { $self->write(\$h); @@ -213,14 +223,9 @@ sub response_header_write { # middlewares such as Deflater may write empty strings sub chunked_write ($$) { - my $self = $_[0]; - return if $_[1] eq ''; - msg_more($self, sprintf("%x\r\n", length($_[1]))); - msg_more($self, $_[1]); - - # use $self->write(\"\n\n") if you care about real-time - # streaming responses, public-inbox WWW does not. - msg_more($self, "\r\n"); + my ($self, $buf) = @_; + $buf eq '' or + msg_more $self, sprintf("%x\r\n", length($buf)), $buf, "\r\n"; } sub identity_write ($$) { @@ -459,7 +464,7 @@ use v5.12; sub write { # ([$http], $buf) = @_; PublicInbox::HTTP::chunked_write($_[0]->[0], $_[1]); - $_[0]->[0]->{sock} ? length($_[1]) : undef; + $_[0]->[0]->{sock} ? bytes::length($_[1]) : undef; } sub close { @@ -474,7 +479,7 @@ our @ISA = qw(PublicInbox::HTTP::Chunked); sub write { # ([$http], $buf) = @_; PublicInbox::HTTP::identity_write($_[0]->[0], $_[1]); - $_[0]->[0]->{sock} ? length($_[1]) : undef; + $_[0]->[0]->{sock} ? bytes::length($_[1]) : undef; } 1; diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm index b12533cb..0eb21b6a 100644 --- a/lib/PublicInbox/IMAP.pm +++ b/lib/PublicInbox/IMAP.pm @@ -605,8 +605,7 @@ sub fetch_blob_cb { # called by git->cat_async via ibx_async_cat sub emit_rfc822 { my ($self, $k, undef, $bref) = @_; - $self->msg_more(" $k {" . length($$bref)."}\r\n"); - $self->msg_more($$bref); + $self->msg_more(" $k {" . length($$bref)."}\r\n", $$bref); } # Mail::IMAPClient::message_string cares about this by default, @@ -626,20 +625,18 @@ sub emit_flags { $_[0]->msg_more(' FLAGS ()') } sub emit_envelope { my ($self, undef, undef, undef, $eml) = @_; - $self->msg_more(' ENVELOPE '.eml_envelope($eml)); + $self->msg_more(' ENVELOPE ', eml_envelope($eml)); } sub emit_rfc822_header { my ($self, $k, undef, undef, $eml) = @_; - $self->msg_more(" $k {".length(${$eml->{hdr}})."}\r\n"); - $self->msg_more(${$eml->{hdr}}); + $self->msg_more(" $k {".length(${$eml->{hdr}})."}\r\n", ${$eml->{hdr}}); } # n.b. this is sorted to be after any emit_eml_new ops sub emit_rfc822_text { my ($self, $k, undef, $bref) = @_; - $self->msg_more(" $k {".length($$bref)."}\r\n"); - $self->msg_more($$bref); + $self->msg_more(" $k {".length($$bref)."}\r\n", $$bref); } sub emit_bodystructure { @@ -970,8 +967,7 @@ sub partial_emit ($$$) { } else { $len = length($str); } - $self->msg_more(" $k {$len}\r\n"); - $self->msg_more($str); + $self->msg_more(" $k {$len}\r\n", $str); } } diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index fefc282a..2e193d46 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -8,6 +8,7 @@ package PublicInbox::Import; use v5.12; use parent qw(PublicInbox::Lock); +use PublicInbox::Git qw(git_exe); use PublicInbox::Spawn qw(run_die run_qx spawn); use PublicInbox::MID qw(mids mid2path); use PublicInbox::Address; @@ -24,7 +25,7 @@ use PublicInbox::IO qw(read_all); sub default_branch () { state $default_branch = do { - my $h = run_qx([qw(git config --global init.defaultBranch)], + my $h = run_qx([git_exe,qw(config --global init.defaultBranch)], { GIT_CONFIG => undef }); chomp $h; $h eq '' ? 'refs/heads/master' : "refs/heads/$h"; diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm index e9a0de6c..c5146428 100644 --- a/lib/PublicInbox/LEI.pm +++ b/lib/PublicInbox/LEI.pm @@ -22,6 +22,7 @@ use PublicInbox::Syscall qw(EPOLLIN); use PublicInbox::Spawn qw(run_wait popen_rd run_qx); use PublicInbox::Lock; use PublicInbox::Eml; +use PublicInbox::Git qw(git_exe); use PublicInbox::Import; use PublicInbox::ContentHash qw(git_sha); use PublicInbox::OnDestroy; @@ -506,12 +507,12 @@ sub x_it ($$) { sub err ($;@) { my $self = shift; - my $err = $self->{2} // ($self->{pgr} // [])->[2] // *STDERR{GLOB}; + my $err = $self->{2} // ($self->{pgr} // [])->[2] // \*STDERR; my @eor = (substr($_[-1]//'', -1, 1) eq "\n" ? () : ("\n")); print $err @_, @eor and return; my $old_err = delete $self->{2}; $old_err->close if $! == EPIPE && $old_err; - $err = $self->{2} = ($self->{pgr} // [])->[2] // *STDERR{GLOB}; + $err = $self->{2} = ($self->{pgr} // [])->[2] // \*STDERR; print $err @_, @eor or print STDERR @_, @eor; } @@ -1098,7 +1099,7 @@ sub path_to_fd { # caller needs to "-t $self->{1}" to check if tty sub start_pager { my ($self, $new_env) = @_; - chomp(my $pager = run_qx([qw(git var GIT_PAGER)])); + chomp(my $pager = run_qx([git_exe, qw(var GIT_PAGER)])); warn "`git var PAGER' error: \$?=$?" if $?; return if $pager eq 'cat' || $pager eq ''; $new_env //= {}; @@ -1556,7 +1557,7 @@ sub sto_barrier_request { eval { $lei->{sto}->wq_do('schedule_commit', $n) }; } else { my $s = ($wq ? $wq->{lei_sock} : undef) // $lei->{sock}; - my $errfh = $lei->{2} // *STDERR{GLOB}; + my $errfh = $lei->{2} // \*STDERR; my @io = $s ? ($errfh, $s) : ($errfh); eval { $lei->{sto}->wq_io_do('barrier', \@io, 1) }; } diff --git a/lib/PublicInbox/LeiBlob.pm b/lib/PublicInbox/LeiBlob.pm index 7b2ea434..31936c36 100644 --- a/lib/PublicInbox/LeiBlob.pm +++ b/lib/PublicInbox/LeiBlob.pm @@ -10,14 +10,14 @@ use parent qw(PublicInbox::IPC); use PublicInbox::Spawn qw(run_wait run_qx which); use PublicInbox::DS; use PublicInbox::Eml; -use PublicInbox::Git; +use PublicInbox::Git qw(git_exe); use PublicInbox::IO qw(read_all); sub get_git_dir ($$) { my ($lei, $d) = @_; return $d if -d "$d/objects" && -d "$d/refs" && -e "$d/HEAD"; - my $cmd = [ qw(git rev-parse --git-dir) ]; + my $cmd = [ git_exe, qw(rev-parse --git-dir) ]; my $opt = { '-C' => $d }; if (defined($lei->{opt}->{cwd})) { # --cwd used, report errors $opt->{2} = $lei->{2}; diff --git a/lib/PublicInbox/LeiConfig.pm b/lib/PublicInbox/LeiConfig.pm index a50ff2b6..ae12249f 100644 --- a/lib/PublicInbox/LeiConfig.pm +++ b/lib/PublicInbox/LeiConfig.pm @@ -3,6 +3,7 @@ package PublicInbox::LeiConfig; # subclassed by LeiEditSearch use v5.12; use PublicInbox::PktOp; +use PublicInbox::Git qw(git_exe); use Fcntl qw(SEEK_SET); use autodie qw(open seek); use PublicInbox::IO qw(read_all); @@ -11,7 +12,7 @@ sub cfg_do_edit ($;$) { my ($self, $reason) = @_; my $lei = $self->{lei}; $lei->pgr_err($reason) if defined $reason; - my $cmd = [ qw(git config --edit -f), $self->{-f} ]; + my $cmd = [ git_exe, qw(config --edit -f), $self->{-f} ]; my $env = { GIT_CONFIG => $self->{-f} }; $self->cfg_edit_begin if $self->can('cfg_edit_begin'); # run in script/lei foreground diff --git a/lib/PublicInbox/LeiInput.pm b/lib/PublicInbox/LeiInput.pm index c388f7dc..0a6aba82 100644 --- a/lib/PublicInbox/LeiInput.pm +++ b/lib/PublicInbox/LeiInput.pm @@ -5,6 +5,7 @@ package PublicInbox::LeiInput; use v5.12; use PublicInbox::DS; +use PublicInbox::Git qw(git_exe); use PublicInbox::Spawn qw(which popen_rd); use PublicInbox::InboxWritable qw(eml_from_path); @@ -252,7 +253,8 @@ sub input_path_url { each_ibx_eml($self, $esrch, @args); } elsif ($self->{missing_ok} && !-e $input) { # don't ->fail if ($lei->{cmd} eq 'p2q') { - my $fp = [ qw(git format-patch --stdout -1), $input ]; + my $fp = [ git_exe, qw(format-patch --stdout -1), + $input ]; my $rdr = { 2 => $lei->{2} }; my $fh = popen_rd($fp, undef, $rdr); eval { $self->input_fh('eml', $fh, $input, @args) }; diff --git a/lib/PublicInbox/LeiMailDiff.pm b/lib/PublicInbox/LeiMailDiff.pm index af6ecf82..270de507 100644 --- a/lib/PublicInbox/LeiMailDiff.pm +++ b/lib/PublicInbox/LeiMailDiff.pm @@ -7,13 +7,14 @@ package PublicInbox::LeiMailDiff; use v5.12; use parent qw(PublicInbox::IPC PublicInbox::LeiInput PublicInbox::MailDiff); use PublicInbox::Spawn qw(run_wait); +use PublicInbox::Git qw(git_exe); require PublicInbox::LeiRediff; sub diff_a ($$) { my ($self, $eml) = @_; my $dir = "$self->{tmp}/N".(++$self->{nr}); $self->dump_eml($dir, $eml); - my $cmd = [ qw(git diff --no-index) ]; + my $cmd = [ git_exe, qw(diff --no-index) ]; my $lei = $self->{lei}; PublicInbox::LeiRediff::_lei_diff_prepare($lei, $cmd); push @$cmd, qw(-- a), "N$self->{nr}"; diff --git a/lib/PublicInbox/LeiSavedSearch.pm b/lib/PublicInbox/LeiSavedSearch.pm index 9ae9dcdb..c27b6f86 100644 --- a/lib/PublicInbox/LeiSavedSearch.pm +++ b/lib/PublicInbox/LeiSavedSearch.pm @@ -5,7 +5,7 @@ package PublicInbox::LeiSavedSearch; use v5.12; use parent qw(PublicInbox::Lock); -use PublicInbox::Git; +use PublicInbox::Git qw(git_exe); use PublicInbox::OverIdx; use PublicInbox::LeiSearch; use PublicInbox::Config; @@ -176,7 +176,7 @@ sub description { $_[0]->{qstr} } # for WWW sub cfg_set { # called by LeiXSearch my ($self, @args) = @_; my $lk = $self->lock_for_scope; # git-config doesn't wait - run_die([qw(git config -f), $self->{'-f'}, @args]); + run_die([git_exe, qw(config -f), $self->{'-f'}, @args]); } # drop-in for LeiDedupe API @@ -263,8 +263,6 @@ sub reset_dedupe { sub mm { undef } -sub altid_map { {} } - sub cloneurl { [] } # find existing directory containing a `lei.saved-search' file based on diff --git a/lib/PublicInbox/LeiViewText.pm b/lib/PublicInbox/LeiViewText.pm index c7d72c71..6510b19e 100644 --- a/lib/PublicInbox/LeiViewText.pm +++ b/lib/PublicInbox/LeiViewText.pm @@ -12,6 +12,7 @@ use PublicInbox::View; use PublicInbox::Hval; use PublicInbox::ViewDiff; use PublicInbox::Spawn qw(popen_rd); +use PublicInbox::Git qw(git_exe); use Term::ANSIColor; use POSIX (); use PublicInbox::Address; @@ -72,7 +73,7 @@ sub new { my $self = bless { %{$lei->{opt}}, -colored => \&uncolored }, $cls; $self->{-quote_reply} = 1 if $fmt eq 'reply'; return $self unless $self->{color} //= -t $lei->{1}; - my @cmd = qw(git config -z --includes -l); # reuse normal git config + my @cmd = (git_exe, qw(config -z --includes -l)); # reuse normal git cfg my $r = popen_rd(\@cmd, undef, { 2 => $lei->{2} }); my $cfg = PublicInbox::Config::config_fh_parse($r, "\0", "\n"); if (!$r->close) { diff --git a/lib/PublicInbox/MailDiff.pm b/lib/PublicInbox/MailDiff.pm index 125360fe..ce268bfb 100644 --- a/lib/PublicInbox/MailDiff.pm +++ b/lib/PublicInbox/MailDiff.pm @@ -7,6 +7,7 @@ use PublicInbox::ContentHash qw(content_digest); use PublicInbox::MsgIter qw(msg_part_text); use PublicInbox::ViewDiff qw(flush_diff); use PublicInbox::GitAsyncCat; +use PublicInbox::Git qw(git_exe); use PublicInbox::ContentDigestDbg; use PublicInbox::Qspawn; use PublicInbox::IO qw(write_file); @@ -81,7 +82,7 @@ sub do_diff { my $n = 'N'.(++$self->{nr}); my $dir = "$self->{tmp}/$n"; $self->dump_eml($dir, $eml); - my $cmd = [ qw(git diff --no-index --no-color -- a), $n ]; + my $cmd = [ git_exe, qw(diff --no-index --no-color -- a), $n ]; my $opt = { -C => "$self->{tmp}", quiet => 1 }; my $qsp = PublicInbox::Qspawn->new($cmd, undef, $opt); $qsp->psgi_qx($self->{ctx}->{env}, undef, \&emit_msg_diff, $self); diff --git a/lib/PublicInbox/MultiGit.pm b/lib/PublicInbox/MultiGit.pm index b7691806..32bb3588 100644 --- a/lib/PublicInbox/MultiGit.pm +++ b/lib/PublicInbox/MultiGit.pm @@ -7,6 +7,7 @@ use strict; use v5.10.1; use PublicInbox::Spawn qw(run_die run_qx); use PublicInbox::Import; +use PublicInbox::Git qw(git_exe); use File::Temp 0.19; use List::Util qw(max); use PublicInbox::IO qw(read_all); @@ -110,11 +111,12 @@ sub epoch_cfg_set { my ($self, $epoch_nr) = @_; my $f = epoch_dir($self)."/$epoch_nr.git/config"; my $v = "../../$self->{all}/config"; + my @cmd = (git_exe, qw(config -f), $f, 'include.path'); if (-r $f) { - chomp(my $x = run_qx([qw(git config -f), $f, 'include.path'])); + chomp(my $x = run_qx(\@cmd)); return if $x eq $v; } - run_die([qw(git config -f), $f, 'include.path', $v ]); + run_die [@cmd, $v]; } sub add_epoch { diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index e5c5d6ab..eb5e67ba 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -54,6 +54,9 @@ use constant { # # v1.6.0 adds BYTES, UID and THREADID values SCHEMA_VERSION => 15, + + # we may have up to 8 FDs per shard (depends on Xapian *shrug*) + SHARD_COST => 8, }; use PublicInbox::Smsg; @@ -477,7 +480,7 @@ sub async_mset { my ($self, $qry_str, $opt, $cb, @args) = @_; if ($XHC) { # unconditionally retrieving pct + rank for now xdb($self); # populate {nshards} - my @margs = ($self->xh_args, xh_opt($self, $opt)); + my @margs = ($self->xh_args, xh_opt($self, $opt), '--'); my $ret = eval { my $rd = $XHC->mkreq(undef, 'mset', @margs, $qry_str); PublicInbox::XhcMset->maybe_new($rd, $self, $cb, @args); @@ -729,4 +732,17 @@ sub get_doc ($$) { } } +# not sure where best to put this... +sub ulimit_n () { + my $n; + if (eval { require BSD::Resource; 1 }) { + my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); + ($n, undef) = BSD::Resource::getrlimit($NOFILE); + } else { + require PublicInbox::Spawn; + $n = PublicInbox::Spawn::run_qx([qw(/bin/sh -c), 'ulimit -n']); + } + $n; +} + 1; diff --git a/lib/PublicInbox/SearchThread.pm b/lib/PublicInbox/SearchThread.pm index 00ae9fac..672c53ad 100644 --- a/lib/PublicInbox/SearchThread.pm +++ b/lib/PublicInbox/SearchThread.pm @@ -33,19 +33,24 @@ sub thread { # can be shakier if somebody used In-Reply-To with multiple, disparate # messages. So, take the client Date: into account since we can't # always determine ordering when somebody uses multiple In-Reply-To. + my (%dedupe, $mid); my @kids = sort { $a->{ds} <=> $b->{ds} } grep { # this delete saves around 4K across 1K messages # TODO: move this to a more appropriate place, breaks tests # if we do it during psgi_cull delete $_->{num}; bless $_, 'PublicInbox::SearchThread::Msg'; - if (exists $id_table{$_->{mid}}) { + $mid = $_->{mid}; + if (exists $id_table{$mid}) { $_->{children} = []; push @imposters, $_; # we'll deal with them later undef; } else { $_->{children} = {}; # will become arrayref later - $id_table{$_->{mid}} = $_; + %dedupe = ($mid => undef); + ($mid) = keys %dedupe; + $_->{mid} = $mid; + $id_table{$mid} = $_; defined($_->{references}); } } @$msgs; diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm index 9919e25c..f056dddf 100644 --- a/lib/PublicInbox/SearchView.pm +++ b/lib/PublicInbox/SearchView.pm @@ -154,9 +154,15 @@ sub path2inc ($) { if (my $short = $rmap_inc{$full}) { return $short; } elsif (!scalar(keys %rmap_inc) && -e $full) { - %rmap_inc = map {; "$INC{$_}" => $_ } keys %INC; + # n.b. $INC{'PublicInbox::Gcf2'} is undef if libgit2-dev + # doesn't exist + my $f; + %rmap_inc = map {; + $f = $INC{$_}; + defined $f ? ($f, $_) : (); + } keys %INC; # fall back to basename as last resort - $rmap_inc{$full} // (split('/', $full))[-1]; + $rmap_inc{$full} // (split(m'/', $full))[-1]; } else { $full; } @@ -310,13 +316,12 @@ sub mset_thread { my $rootset = PublicInbox::SearchThread::thread($msgs, $r ? \&sort_relevance : \&PublicInbox::View::sort_ds, $ctx); - my $skel = search_nav_bot($ctx, $mset, $q).'<pre>'. <<EOM; + $ctx->{skel} = [ search_nav_bot($ctx, $mset, $q).'<pre>'. <<EOM ]; -- pct% links below jump to the message on this page, permalinks otherwise -- EOM $ctx->{-upfx} = ''; $ctx->{anchor_idx} = 1; $ctx->{cur_level} = 0; - $ctx->{skel} = \$skel; $ctx->{mapping} = {}; $ctx->{searchview} = 1; $ctx->{prev_attr} = ''; @@ -326,7 +331,7 @@ EOM # reduce hash lookups in skel_dump $ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef; PublicInbox::View::walk_thread($rootset, $ctx, - \&PublicInbox::View::pre_thread); + \&PublicInbox::View::pre_thread); # pushes to ctx->{skel} # link $INBOX_DIR/description text to "recent" view around # the newest message in this result set: @@ -343,7 +348,7 @@ sub mset_thread_i { print { $ctx->zfh } $ctx->html_top if exists $ctx->{-html_tip}; $eml and return PublicInbox::View::eml_entry($ctx, $eml); my $smsg = shift @{$ctx->{msgs}} or - print { $ctx->zfh } ${delete($ctx->{skel})}; + print { $ctx->zfh } @{delete($ctx->{skel})}; $smsg; } diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index 7984af43..898ca72d 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -13,7 +13,7 @@ use v5.10.1; use File::Temp 0.19 (); # 0.19 for ->newdir use autodie qw(mkdir); use Fcntl qw(SEEK_SET); -use PublicInbox::Git qw(git_unquote git_quote); +use PublicInbox::Git qw(git_unquote git_quote git_exe); use PublicInbox::IO qw(write_file); use PublicInbox::MsgIter qw(msg_part_text); use PublicInbox::Qspawn; @@ -176,7 +176,7 @@ sub extract_diff ($$) { (?:^---\x20$FN$LF) # "+++ b/foo.c" sets post-filename ($11) in case - # $3 is missing + # $3 is missing or truncated (?:^\+{3}\x20$FN$LF) # the meat of the diff, including "^\\No newline ..." @@ -193,7 +193,8 @@ sub extract_diff ($$) { mode_a => $5 // $8 // $4, # new (file) // unchanged // old }; my $path_a = $2 // $10; - my $path_b = $3 // $11; + my $path_b = defined $11 && defined $3 && length $11 > length $3 ? + $11 // $3 : $3 // $11; my $patch = $9; # don't care for leading 'a/' and 'b/' @@ -293,7 +294,7 @@ sub prepare_index ($) { dbg($self, 'preparing index'); my $rdr = { 0 => $in }; - my $cmd = [ qw(git update-index -z --index-info) ]; + my $cmd = [ git_exe, qw(update-index -z --index-info) ]; my $qsp = PublicInbox::Qspawn->new($cmd, $self->{git_env}, $rdr); $path_a = git_quote($path_a); $self->{-msg} = "index prepared:\n$mode_a $oid_full\t$path_a"; @@ -473,7 +474,7 @@ sub apply_result ($$) { # qx_cb skip_identical($self, $patches, $di->{oid_b}); } - my @cmd = qw(git ls-files -s -z); + my @cmd = (git_exe, qw(ls-files -s -z)); my $qsp = PublicInbox::Qspawn->new(\@cmd, $self->{git_env}); $self->{-cur_di} = $di; qsp_qx $self, $qsp, \&ls_files_result; @@ -484,7 +485,7 @@ sub do_git_apply ($) { my $patches = $self->{patches}; # we need --ignore-whitespace because some patches are CRLF - my @cmd = (qw(git apply --cached --ignore-whitespace + my @cmd = (git_exe, qw(apply --cached --ignore-whitespace --unidiff-zero --whitespace=warn --verbose)); my $len = length(join(' ', @cmd)); my $di; # keep track of the last one for "git ls-files" diff --git a/lib/PublicInbox/Syscall.pm b/lib/PublicInbox/Syscall.pm index 4cbe9623..ebcedb89 100644 --- a/lib/PublicInbox/Syscall.pm +++ b/lib/PublicInbox/Syscall.pm @@ -22,7 +22,8 @@ use POSIX qw(ENOENT ENOSYS EINVAL O_NONBLOCK); use Socket qw(SOL_SOCKET SCM_RIGHTS); use Config; our %SIGNUM = (WINCH => 28); # most Linux, {Free,Net,Open}BSD, *Darwin -our ($INOTIFY, %PACK); +our ($INOTIFY, %CONST); +use List::Util qw(sum); # $VERSION = '0.25'; # Sys::Syscall version our @EXPORT_OK = qw(epoll_ctl epoll_create epoll_wait @@ -60,7 +61,7 @@ our ($SYS_epoll_create, $SYS_recvmsg); my $SYS_fstatfs; # don't need fstatfs64, just statfs.f_type -my ($FS_IOC_GETFLAGS, $FS_IOC_SETFLAGS); +my ($FS_IOC_GETFLAGS, $FS_IOC_SETFLAGS, $SYS_writev); my $SFD_CLOEXEC = 02000000; # Perl does not expose O_CLOEXEC our $no_deprecated = 0; @@ -94,6 +95,7 @@ if ($^O eq "linux") { $SYS_fstatfs = 100; $SYS_sendmsg = 370; $SYS_recvmsg = 372; + $SYS_writev = 146; $INOTIFY = { # usage: `use constant $PublicInbox::Syscall::INOTIFY' SYS_inotify_init1 => 332, SYS_inotify_add_watch => 292, @@ -110,6 +112,7 @@ if ($^O eq "linux") { $SYS_fstatfs = 138; $SYS_sendmsg = 46; $SYS_recvmsg = 47; + $SYS_writev = 20; $INOTIFY = { SYS_inotify_init1 => 294, SYS_inotify_add_watch => 254, @@ -126,6 +129,7 @@ if ($^O eq "linux") { $SYS_fstatfs = 138; $SYS_sendmsg = 0x40000206; $SYS_recvmsg = 0x40000207; + $SYS_writev = 0x40000204; $FS_IOC_GETFLAGS = 0x80046601; $FS_IOC_SETFLAGS = 0x40046602; $INOTIFY = { @@ -163,6 +167,7 @@ if ($^O eq "linux") { $SYS_fstatfs = 100; $SYS_sendmsg = 341; $SYS_recvmsg = 342; + $SYS_writev = 146; $FS_IOC_GETFLAGS = 0x40086601; $FS_IOC_SETFLAGS = 0x80086602; $INOTIFY = { @@ -178,6 +183,7 @@ if ($^O eq "linux") { $SYS_signalfd4 = 313; $SYS_renameat2 //= 357; $SYS_fstatfs = 100; + $SYS_writev = 146; $FS_IOC_GETFLAGS = 0x40086601; $FS_IOC_SETFLAGS = 0x80086602; } elsif ($machine =~ m/^s390/) { # untested, no machine on cfarm @@ -190,6 +196,7 @@ if ($^O eq "linux") { $SYS_fstatfs = 100; $SYS_sendmsg = 370; $SYS_recvmsg = 372; + $SYS_writev = 146; } elsif ($machine eq 'ia64') { # untested, no machine on cfarm $SYS_epoll_create = 1243; $SYS_epoll_ctl = 1244; @@ -215,6 +222,7 @@ if ($^O eq "linux") { $SYS_fstatfs = 44; $SYS_sendmsg = 211; $SYS_recvmsg = 212; + $SYS_writev = 66; $INOTIFY = { SYS_inotify_init1 => 26, SYS_inotify_add_watch => 27, @@ -232,6 +240,7 @@ if ($^O eq "linux") { $SYS_fstatfs = 100; $SYS_sendmsg = 296; $SYS_recvmsg = 297; + $SYS_writev = 146; } elsif ($machine =~ m/^mips64/) { # cfarm only has 32-bit userspace $SYS_epoll_create = 5207; $SYS_epoll_ctl = 5208; @@ -242,6 +251,7 @@ if ($^O eq "linux") { $SYS_fstatfs = 5135; $SYS_sendmsg = 5045; $SYS_recvmsg = 5046; + $SYS_writev = 5019; $FS_IOC_GETFLAGS = 0x40046601; $FS_IOC_SETFLAGS = 0x80046602; } elsif ($machine =~ m/^mips/) { # 32-bit, tested on mips64 cfarm host @@ -254,6 +264,7 @@ if ($^O eq "linux") { $SYS_fstatfs = 4100; $SYS_sendmsg = 4179; $SYS_recvmsg = 4177; + $SYS_writev = 4146; $FS_IOC_GETFLAGS = 0x40046601; $FS_IOC_SETFLAGS = 0x80046602; $SIGNUM{WINCH} = 20; @@ -286,11 +297,13 @@ EOM # (I'm assuming Dragonfly copies FreeBSD, here, too) $SYS_recvmsg = 27; $SYS_sendmsg = 28; + $SYS_writev = 121; } BEGIN { if ($^O eq 'linux') { - %PACK = ( + %CONST = ( + MSG_MORE => 0x8000, TMPL_cmsg_len => TMPL_size_t, # cmsg_len, cmsg_level, cmsg_type SIZEOF_cmsghdr => SIZEOF_int * 2 + SIZEOF_size_t, @@ -303,7 +316,7 @@ BEGIN { 'i', # msg_flags ); } elsif ($^O =~ /\A(?:freebsd|openbsd|netbsd|dragonfly)\z/) { - %PACK = ( + %CONST = ( TMPL_cmsg_len => 'L', # socklen_t SIZEOF_cmsghdr => SIZEOF_int * 3, CMSG_DATA_off => SIZEOF_ptr == 8 ? '@16' : '', @@ -316,11 +329,12 @@ BEGIN { ) } - $PACK{CMSG_ALIGN_size} = SIZEOF_size_t; - $PACK{SIZEOF_cmsghdr} //= 0; - $PACK{TMPL_cmsg_len} //= undef; - $PACK{CMSG_DATA_off} //= undef; - $PACK{TMPL_msghdr} //= undef; + $CONST{CMSG_ALIGN_size} = SIZEOF_size_t; + $CONST{SIZEOF_cmsghdr} //= 0; + $CONST{TMPL_cmsg_len} //= undef; + $CONST{CMSG_DATA_off} //= undef; + $CONST{TMPL_msghdr} //= undef; + $CONST{MSG_MORE} //= 0; } # SFD_CLOEXEC is arch-dependent, so IN_CLOEXEC may be, too @@ -455,7 +469,7 @@ sub nodatacow_dir { if (open my $fh, '<', $_[0]) { nodatacow_fh($fh) } } -use constant \%PACK; +use constant \%CONST; sub CMSG_ALIGN ($) { ($_[0] + CMSG_ALIGN_size - 1) & ~(CMSG_ALIGN_size - 1) } use constant CMSG_ALIGN_SIZEOF_cmsghdr => CMSG_ALIGN(SIZEOF_cmsghdr); sub CMSG_SPACE ($) { CMSG_ALIGN($_[0]) + CMSG_ALIGN_SIZEOF_cmsghdr } @@ -463,8 +477,9 @@ sub CMSG_LEN ($) { CMSG_ALIGN_SIZEOF_cmsghdr + $_[0] } use constant msg_controllen_max => CMSG_SPACE(10 * SIZEOF_int) + SIZEOF_cmsghdr; # space for 10 FDs -if (defined($SYS_sendmsg) && defined($SYS_recvmsg)) { no warnings 'once'; + +if (defined($SYS_sendmsg) && defined($SYS_recvmsg)) { require PublicInbox::CmdIPC4; *send_cmd4 = sub ($$$$;$) { @@ -527,6 +542,35 @@ require PublicInbox::CmdIPC4; } @ret; }; + +*sendmsg_more = sub ($@) { + use bytes qw(length substr); + my $sock = shift; + my $iov = join('', map { pack 'P'.TMPL_size_t, $_, length } @_); + my $mh = pack(TMPL_msghdr, + undef, 0, # msg_name, msg_namelen (unused) + $iov, scalar(@_), # msg_iov, msg_iovlen + undef, 0, # msg_control, msg_controllen (unused), + 0); # msg_flags (unused) + my $s; + do { + $s = syscall($SYS_sendmsg, fileno($sock), $mh, MSG_MORE); + } while ($s < 0 && $!{EINTR}); + $s < 0 ? undef : $s; +}; +} + +if (defined($SYS_writev)) { +*writev = sub { + my $fh = shift; + use bytes qw(length substr); + my $iov = join('', map { pack 'P'.TMPL_size_t, $_, length } @_); + my $w; + do { + $w = syscall($SYS_writev, fileno($fh), $iov, scalar(@_)); + } while ($w < 0 && $!{EINTR}); + $w < 0 ? undef : $w; +}; } 1; diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm index 3a67ab54..00e96aee 100644 --- a/lib/PublicInbox/TestCommon.pm +++ b/lib/PublicInbox/TestCommon.pm @@ -167,11 +167,8 @@ sub require_git ($;$) { sub require_git_http_backend (;$) { my ($nr) = @_; state $ok = do { - require PublicInbox::Git; - my $git = PublicInbox::Git::git_exe() or plan - skip_all => 'nothing in public-inbox works w/o git'; my $rdr = { 1 => \my $out, 2 => \my $err }; - xsys([$git, qw(http-backend)], undef, $rdr); + xsys([qw(git http-backend)], undef, $rdr); $out =~ /^Status:/ism; }; if (!$ok) { @@ -274,15 +271,16 @@ sub require_mods { sub key2script ($) { my ($key) = @_; - return $key if ($key eq 'git' || index($key, '/') >= 0); + require PublicInbox::Git; + return PublicInbox::Git::git_exe() if $key eq 'git'; + return $key if index($key, '/') >= 0; # n.b. we may have scripts which don't start with "public-inbox" in # the future: $key =~ s/\A([-\.])/public-inbox$1/; 'blib/script/'.$key; } -my @io_mode = ([ *STDIN{IO}, '+<&' ], [ *STDOUT{IO}, '+>&' ], - [ *STDERR{IO}, '+>&' ]); +my @io_mode = ([ \*STDIN, '+<&' ], [ \*STDOUT, '+>&' ], [ \*STDERR, '+>&' ]); sub _prepare_redirects ($) { my ($fhref) = @_; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 44e1f2a8..9a90f939 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -74,9 +74,13 @@ sub msg_page { my ($id, $prev); my $next_arg = $ctx->{next_arg} = [ $ctx->{mid}, \$id, \$prev ]; - my $smsg = $ctx->{smsg} = $over->next_by_mid(@$next_arg) or - return; # undef == 404 - + my $smsg = $ctx->{smsg} = $over->next_by_mid(@$next_arg); + if (!$smsg && $ctx->{mid} =~ /\A\<(.+)\>\z/ and + ($next_arg->[0] = $1) and + ($over->next_by_mid(@$next_arg))) { + return PublicInbox::WWW::r301($ctx, undef, $next_arg->[0]); + } + $smsg or return; # undef=404 # allow user to easily browse the range around this message if # they have ->over $ctx->{-t_max} = $smsg->{ts}; @@ -307,7 +311,8 @@ sub eml_entry { " <a\nhref=\"${mhref}raw\">raw</a>" . " <a\nhref=\"${mhref}#R\">reply</a>"; - delete($ctx->{-qry}) and + # points to permalink + delete($ctx->{-qry_dfblob}) and $rv .= qq[ <a\nhref="${mhref}#related">related</a>]; my $hr; @@ -432,6 +437,7 @@ sub walk_thread ($$$) { sub pre_thread { # walk_thread callback my ($ctx, $level, $node, $idx) = @_; + # node->{mid} is deduplicated in PublicInbox::SearchThread::thread $ctx->{mapping}->{$node->{mid}} = [ '', $node, $idx, $level ]; skel_dump($ctx, $level, $node); } @@ -476,7 +482,7 @@ sub stream_thread_i { # PublicInbox::WwwStream::getline callback print { $ctx->zfh } ghost_index_entry($ctx, $lvl, $smsg) } else { # all done print { $ctx->zfh } thread_adj_level($ctx, 0), - ${delete($ctx->{skel})}; + @{delete($ctx->{skel})}; return; } } @@ -513,11 +519,13 @@ href="../../">newest</a>] EOF $skel .= "<b\nid=t>Thread overview:</b> "; $skel .= $nr == 1 ? '(only message)' : "$nr+ messages"; - $skel .= " (download: <a\nhref=\"../t.mbox.gz\">mbox.gz</a>"; - $skel .= " / follow: <a\nhref=\"../t.atom\">Atom feed</a>)\n"; - $skel .= "-- links below jump to the message on this page --\n"; + $skel .= <<EOM; + (download: <a\nhref="../t.mbox.gz">mbox.gz</a> follow: <a +href=\"../t.atom\">Atom feed</a> +-- links below jump to the message on this page -- +EOM $ctx->{cur_level} = 0; - $ctx->{skel} = \$skel; + $ctx->{skel} = [ $skel ]; $ctx->{prev_attr} = ''; $ctx->{prev_level} = 0; $ctx->{root_anchor} = 'm' . id_compress($mid, 1); @@ -529,9 +537,9 @@ EOF # reduce hash lookups in pre_thread->skel_dump $ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef; - walk_thread($rootset, $ctx, \&pre_thread); + walk_thread($rootset, $ctx, \&pre_thread); # pushes to ctx->{skel} - $skel .= '</pre>'; + push @{$ctx->{skel}}, '</pre>'; return stream_thread($rootset, $ctx) unless $ctx->{flat}; # flat display: lazy load the full message from smsg @@ -553,8 +561,7 @@ sub thread_html_i { # PublicInbox::WwwStream::getline callback while (my $smsg = shift @{$ctx->{msgs}}) { return $smsg if exists($smsg->{blob}); } - my $skel = delete($ctx->{skel}) or return; # all done - print { $ctx->zfh } $$skel; + print { $ctx->zfh } @{delete $ctx->{skel} // []}; undef; } } @@ -698,6 +705,7 @@ href="d/">diff</a>)</pre><pre>]; } } my @subj = $eml->header('Subject'); + $ctx->{subj_raw} = $subj[0]; $hbuf .= "Subject: $_\n" for @subj; $title[0] = $subj[0] // '(no subject)'; $hbuf .= "Date: $_\n" for $eml->header('Date'); @@ -778,13 +786,13 @@ sub thread_skel ($$$) { my $ibx = $ctx->{ibx}; my ($nr, $msgs) = $ibx->over->get_thread($mid); my $parent = in_reply_to($hdr); - $$skel .= "\n<b>Thread overview: </b>"; + $skel->[-1] .= "\n<b>Thread overview: </b>"; if ($nr <= 1) { if (defined $parent) { - $$skel .= SKEL_EXPAND."\n "; - $$skel .= ghost_parent('../', $parent) . "\n"; + $skel->[-1] .= SKEL_EXPAND."\n "; + $skel->[-1] .= ghost_parent('../', $parent) . "\n"; } else { - $$skel .= "<a\nid=r>[no followups]</a> ". + $skel->[-1] .= "<a\nid=r>[no followups]</a> ". SKEL_EXPAND."\n"; } $ctx->{next_msg} = undef; @@ -792,8 +800,9 @@ sub thread_skel ($$$) { return; } - $$skel .= $nr; - $$skel .= '+ messages / '.SKEL_EXPAND.qq! <a\nhref="#b">top</a>\n!; + $skel->[-1] .= $nr; + $skel->[-1] .= '+ messages / '.SKEL_EXPAND. + qq! <a\nhref="#b">top</a>\n!; # nb: mutt only shows the first Subject in the index pane # when multiple Subject: headers are present, so we follow suit: @@ -811,25 +820,38 @@ sub thread_skel ($$$) { $ctx->{parent_msg} = $parent; } +sub dfqry_text ($$) { + my ($ctx, $subj) = @_; + my $qry_dfblob = delete $ctx->{-qry_dfblob} or return (undef); + my @bs = split /["\x{201c}\x{201d}]+/, $subj; + my $q = join ' ', (@bs ? ('(') : ()), map { + chop if length > 7; # include 1 abbrev "older" patches + "dfblob:$_"; + } @$qry_dfblob; + local $Text::Wrap::columns = COLS; + local $Text::Wrap::huge = 'overflow'; + $subj //= ''; + $subj =~ s/\A\s*(?:amend|fixup|squash)\!\s*//; # --autosquash commands + # split on double-quotes for phrases + $q = wrap('', '', $q); + if (@bs) { + $q .= " )\n OR ("; + $q .= qq[\nbs:"$_"] for @bs; + $q .= ' )'; + } + my $rows = ($q =~ tr/\n/\n/) + 1; + ($rows, ascii_html($q)); +} + # writes to zbuf sub html_footer { my ($ctx, $hdr) = @_; my $upfx = '../'; - my (@related, $skel); + my (@related, @skel); my $foot = '<pre>'; - my $qry = delete $ctx->{-qry}; - if ($qry && $ctx->{ibx}->isrch) { - my $q = ''; # search for either ancestor or descendent patches - for (@{$qry->{dfpre}}, @{$qry->{dfpost}}) { - chop if length > 7; # include 1 abbrev "older" patches - $q .= "dfblob:$_ "; - } - chop $q; # omit trailing SP - local $Text::Wrap::columns = COLS; - local $Text::Wrap::huge = 'overflow'; - $q = wrap('', '', $q); - my $rows = ($q =~ tr/\n/\n/) + 1; - $q = ascii_html($q); + my ($rows, $q) = dfqry_text $ctx, + delete($ctx->{-qry_subj}) // $ctx->{subj_raw}; + if ($rows && $ctx->{ibx}->isrch) { $related[0] = <<EOM; <form id=related action=$upfx @@ -847,12 +869,12 @@ EOM my $t = ts2str($ctx->{-t_max}); my $t_fmt = fmt_ts($ctx->{-t_max}); my $fallback = @related ? "\t" : "<a id=related>\t</a>"; - $skel = <<EOF; + $skel[0] = <<EOF; ${fallback}other threads:[<a href="$upfx?t=$t">~$t_fmt UTC</a>|<a href="$upfx">newest</a>] EOF - thread_skel(\$skel, $ctx, $hdr); + thread_skel(\@skel, $ctx, $hdr); my ($next, $prev); my $parent = ' '; $next = $prev = ' '; @@ -879,11 +901,11 @@ EOF } $foot .= "$next $prev$parent "; } else { # unindexed inboxes w/o over - $skel = qq( <a\nhref="$upfx">latest</a>); + $skel[0] = qq( <a\nhref="$upfx">latest</a>); } - # $skel may be big for big threads, don't append it to $foot + # @skel may be big for big threads, don't push to it print { $ctx->zfh } $foot, qq(<a\nhref="#R">reply</a>), - $skel, '</pre>', @related, + @skel, '</pre>', @related, msg_reply($ctx, $hdr); } @@ -985,7 +1007,8 @@ sub skel_dump { # walk_thread callback my $mid = $smsg->{mid}; if ($level == 0 && $ctx->{skel_dump_roots}++) { - $$skel .= delete($ctx->{sl_note}) || ''; + my $note = delete $ctx->{sl_note}; + push @$skel, $note if $note; } my $f = ascii_html(delete $smsg->{from_name}); @@ -1014,7 +1037,7 @@ sub skel_dump { # walk_thread callback if ($cur) { if ($cur eq $mid) { delete $ctx->{cur}; - $$skel .= "<b>$d<a\nid=r\nhref=\"#t\">". + push @$skel, "<b>$d<a\nid=r\nhref=\"#t\">". "$attr [this message]</a></b>\n"; return 1; } else { @@ -1054,8 +1077,7 @@ sub skel_dump { # walk_thread callback } else { $m = $ctx->{-upfx}.mid_href($mid).'/'; } - $$skel .= $d . "<a\nhref=\"$m\"$id>" . $end; - 1; + push @$skel, qq($d<a\nhref="$m"$id>$end); } sub _skel_ghost { @@ -1078,8 +1100,7 @@ sub _skel_ghost { } else { $d .= qq{<<a\nhref="$href">$html</a>>\n}; } - ${$ctx->{skel}} .= $d; - 1; + push @{$ctx->{skel}}, $d; } # note: we favor Date: here because git-send-email increments it diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm index d078c5f9..7a6d9a2b 100644 --- a/lib/PublicInbox/ViewDiff.pm +++ b/lib/PublicInbox/ViewDiff.pm @@ -135,15 +135,14 @@ sub diff_header ($$$) { # no need to capture oid_a and oid_b on add/delete, # we just linkify OIDs directly via s///e in conditional if ($$x =~ s/$NULL_TO_BLOB/$1 . oid($dctx, $spfx, $2)/e) { - push @{$ctx->{-qry}->{dfpost}}, $2; + push @{$ctx->{-qry_dfblob}}, $2; } elsif ($$x =~ s/$BLOB_TO_NULL/'index '.oid($dctx, $spfx, $1).$2/e) { - push @{$ctx->{-qry}->{dfpre}}, $1; + push @{$ctx->{-qry_dfblob}}, $1; } elsif ($$x =~ $BLOB_TO_BLOB) { # modification-only, not add/delete: # linkify hunk headers later using oid_a and oid_b @$dctx{qw(oid_a oid_b)} = ($1, $2); - push @{$ctx->{-qry}->{dfpre}}, $1; - push @{$ctx->{-qry}->{dfpost}}, $2; + push @{$ctx->{-qry_dfblob}}, $1, $2; } else { warn "BUG? <$$x> had no ^index line"; } @@ -183,9 +182,10 @@ sub diff_before_or_after ($$) { sub flush_diff ($$) { my ($ctx, $cur) = @_; + my ($subj) = ($$cur =~ /^Subject:\s*\[[^\]]+\]\s*(.+?)$/sm); my @top = split($EXTRACT_DIFFS, $$cur); undef $$cur; # free memory - + $ctx->{-qry_subj} = $subj if $subj; my $lnk = $ctx->{-linkify}; my $dctx; # {}, keys: Q, oid_a, oid_b my $zfh = $ctx->zfh; diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm index 83a83698..b69214bc 100644 --- a/lib/PublicInbox/ViewVCS.pm +++ b/lib/PublicInbox/ViewVCS.pm @@ -251,7 +251,7 @@ href="$ibx_url?t=$t" title="list contemporary emails">$2</a>) !e; - $ctx->{-title_html} = $s = $ctx->{-linkify}->to_html($s); + my $title_html = $ctx->{-title_html} = $ctx->{-linkify}->to_html($s); my ($P, $p, $pt) = delete @$ctx{qw(-cmt_P -cmt_p -cmt_pt)}; $_ = qq(<a href="$upfx$_/s/">).shift(@$p).'</a> '.shift(@$pt) for @$P; if (@$P == 1) { @@ -273,7 +273,7 @@ href="$f.patch">patch</a>)\n <a href=#parent>parent</a> $P->[0]}; author $au committer $co -<b>$s</b> +<b>$title_html</b> EOM print $zfh "\n", $ctx->{-linkify}->to_html($bdy) if length($bdy); undef $bdy; # free memory @@ -291,20 +291,8 @@ EOM # TODO: should there be another textarea which attempts to # search for the exact email which was applied to make this # commit? - if (my $qry = delete $ctx->{-qry}) { - my $q = ''; - for (@{$qry->{dfpost}}, @{$qry->{dfpre}}) { - # keep blobs as short as reasonable, emails - # are going to be older than what's in git - substr($_, 7, 64, ''); - $q .= "dfblob:$_ "; - } - chop $q; # no trailing SP - local $Text::Wrap::columns = PublicInbox::View::COLS; - local $Text::Wrap::huge = 'overflow'; - $q = wrap('', '', $q); - my $rows = ($q =~ tr/\n/\n/) + 1; - $q = ascii_html($q); + my ($rows, $q) = PublicInbox::View::dfqry_text $ctx, $s; + if ($rows) { my $ibx_url = ibx_url_for($ctx); my $alt; if (defined $ibx_url) { diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm index a5e2dc4a..5e086fee 100644 --- a/lib/PublicInbox/WwwCoderepo.pm +++ b/lib/PublicInbox/WwwCoderepo.pm @@ -24,8 +24,9 @@ use PublicInbox::OnDestroy; use URI::Escape qw(uri_escape_utf8); use File::Spec; use autodie qw(fcntl open); +use PublicInbox::Git qw(git_exe); -my @EACH_REF = (qw(git for-each-ref --sort=-creatordate), +my @EACH_REF = (git_exe, qw(for-each-ref --sort=-creatordate), "--format=%(HEAD)%00".join('%00', map { "%($_)" } qw(objectname refname:short subject creatordate:short))); my $HEADS_CMD = <<''; @@ -87,7 +88,7 @@ sub new { my @s = stat(STDIN) or die "stat(STDIN): $!"; if ("@l[0, 1]" eq "@s[0, 1]") { my $f = fcntl(STDIN, F_GETFL, 0); - $self->{log_fh} = *STDIN{IO} if $f & O_RDWR; + $self->{log_fh} = \*STDIN if $f & O_RDWR; } $self; } @@ -186,7 +187,7 @@ EOM print $zfh "...\n" if $last; # README - my ($bref, $oid, $ref_path) = @{delete $ctx->{qx_res}->{readme}}; + my ($bref, $oid, $ref_path) = @{delete $ctx->{qx_res}->{readme} // []}; if ($bref) { my $l = PublicInbox::Linkify->new; $$bref =~ s/\s*\z//sm; @@ -249,7 +250,7 @@ sub summary ($$) { my $qsp_err = \($ctx->{-qsp_err} = ''); my %opt = (quiet => 1, 2 => $ctx->{wcr}->{log_fh}); my %env = (GIT_DIR => $ctx->{git}->{git_dir}); - my @log = (qw(git log), "-$nl", '--pretty=format:%d %H %h %cs %s'); + my @log = (git_exe, 'log', "-$nl", '--pretty=format:%d %H %h %cs %s'); push(@log, $tip) if defined $tip; # limit scope for MockHTTP test (t/solver_git.t) diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm index 5e23005e..8279591a 100644 --- a/lib/PublicInbox/WwwText.pm +++ b/lib/PublicInbox/WwwText.pm @@ -37,7 +37,7 @@ sub get_text { } my $env = $ctx->{env}; if ($raw) { - my $h = delete $ctx->{-res_hdr}; + my $h = delete $ctx->{-res_hdr} // []; $txt = gzf_maybe($h, $env)->zflush($txt) if $code == 200; push @$h, 'Content-Type', 'text/plain', 'Content-Length', length($txt); diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm index c9957f64..ba41b5d2 100644 --- a/lib/PublicInbox/XapHelper.pm +++ b/lib/PublicInbox/XapHelper.pm @@ -18,7 +18,7 @@ use POSIX qw(:signal_h); use Fcntl qw(LOCK_UN LOCK_EX); use Carp qw(croak); my $X = \%PublicInbox::Search::X; -our (%SRCH, %WORKERS, $nworker, $workerset, $in); +our (%SRCH, %WORKERS, $nworker, $workerset, $in, $SHARD_NFD, $MY_FD_MAX); our $stderr = \*STDERR; sub cmd_test_inspect { @@ -190,23 +190,43 @@ sub dispatch { $GLP->getoptionsfromarray(\@argv, $req, @PublicInbox::Search::XH_SPEC) or return; my $dirs = delete $req->{d} or die 'no -d args'; - my $key = join("\0", @$dirs); + my $key = "-d\0".join("\0-d\0", @$dirs); + $key .= "\0".join("\0", map { ('-Q', $_) } @{$req->{Q}}) if $req->{Q}; my $new; - $req->{srch} = $SRCH{$key} //= do { + $req->{srch} = $SRCH{$key} // do { $new = { qp_flags => $PublicInbox::Search::QP_FLAGS }; + my $nfd = scalar(@$dirs) * PublicInbox::Search::SHARD_COST; + $SHARD_NFD += $nfd; + if ($SHARD_NFD > $MY_FD_MAX) { + $SHARD_NFD = $nfd; + %SRCH = (); + } my $first = shift @$dirs; - my $slow_phrase = -f "$first/iamchert"; - $new->{xdb} = $X->{Database}->new($first); - for (@$dirs) { - $slow_phrase ||= -f "$_/iamchert"; - $new->{xdb}->add_database($X->{Database}->new($_)); + for my $retried (0, 1) { + my $slow_phrase = -f "$first/iamchert"; + eval { + $new->{xdb} = $X->{Database}->new($first); + for (@$dirs) { + $slow_phrase ||= -f "$_/iamchert"; + $new->{xdb}->add_database( + $X->{Database}->new($_)) + } + }; + last unless $@; + if ($retried) { + die "E: $@\n"; + } else { # may be EMFILE/ENFILE/ENOMEM.... + warn "W: $@, retrying...\n"; + %SRCH = (); + $SHARD_NFD = $nfd; + } + $slow_phrase or $new->{qp_flags} + |= PublicInbox::Search::FLAG_PHRASE(); } - $slow_phrase or - $new->{qp_flags} |= PublicInbox::Search::FLAG_PHRASE(); bless $new, $req->{c} ? 'PublicInbox::CodeSearch' : 'PublicInbox::Search'; $new->{qp} = $new->qparse_new; - $new; + $SRCH{$key} = $new; }; $req->{srch}->{xdb}->reopen unless $new; $req->{Q} && !$req->{srch}->{qp_extra_done} and @@ -304,7 +324,7 @@ sub start (@) { my $c = getsockopt(local $in = \*STDIN, SOL_SOCKET, SO_TYPE); unpack('i', $c) == SOCK_SEQPACKET or die 'stdin is not SOCK_SEQPACKET'; - local (%SRCH, %WORKERS); + local (%SRCH, %WORKERS, $SHARD_NFD, $MY_FD_MAX); PublicInbox::Search::load_xapian(); $GLP->getoptionsfromarray(\@argv, my $opt = { j => 1 }, 'j=i') or die 'bad args'; @@ -313,6 +333,10 @@ sub start (@) { for (@PublicInbox::DS::UNBLOCKABLE, POSIX::SIGUSR1) { $workerset->delset($_) or die "delset($_): $!"; } + $MY_FD_MAX = PublicInbox::Search::ulimit_n // + die "E: unable to get RLIMIT_NOFILE: $!"; + warn "W: RLIMIT_NOFILE=$MY_FD_MAX too low\n" if $MY_FD_MAX < 72; + $MY_FD_MAX -= 64; local $nworker = $opt->{j}; return recv_loop() if $nworker == 0; diff --git a/lib/PublicInbox/XapHelperCxx.pm b/lib/PublicInbox/XapHelperCxx.pm index 74852ad1..922bd583 100644 --- a/lib/PublicInbox/XapHelperCxx.pm +++ b/lib/PublicInbox/XapHelperCxx.pm @@ -34,6 +34,7 @@ my $ldflags = '-Wl,-O1'; $ldflags .= ' -Wl,--compress-debug-sections=zlib' if $^O ne 'openbsd'; my $xflags = ($ENV{CXXFLAGS} // '-Wall -ggdb3 -pipe') . ' ' . ' -DTHREADID=' . PublicInbox::Search::THREADID . + ' -DSHARD_COST=' . PublicInbox::Search::SHARD_COST . ' -DXH_SPEC="'.join('', map { s/=.*/:/; $_ } @PublicInbox::Search::XH_SPEC) . '" ' . ($ENV{LDFLAGS} // $ldflags); diff --git a/lib/PublicInbox/khashl.h b/lib/PublicInbox/khashl.h new file mode 100644 index 00000000..170b81ff --- /dev/null +++ b/lib/PublicInbox/khashl.h @@ -0,0 +1,502 @@ +/* The MIT License + + Copyright (c) 2019-2023 by Attractive Chaos <attractor@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef __AC_KHASHL_H +#define __AC_KHASHL_H + +#define AC_VERSION_KHASHL_H "0.2" + +typedef uint32_t khint32_t; +typedef uint64_t khint64_t; + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define kh_inline inline +#define KH_LOCAL static kh_inline + +#ifndef kcalloc +#define kcalloc(N,Z) xcalloc(N,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +/**************************** + * Simple private functions * + ****************************/ + +#define __kh_used(flag, i) (flag[i>>5] >> (i&0x1fU) & 1U) +#define __kh_set_used(flag, i) (flag[i>>5] |= 1U<<(i&0x1fU)) +#define __kh_set_unused(flag, i) (flag[i>>5] &= ~(1U<<(i&0x1fU))) + +#define __kh_fsize(m) ((m) < 32? 1 : (m)>>5) + +static kh_inline khint_t __kh_h2b(khint_t hash, khint_t bits) { return hash * 2654435769U >> (32 - bits); } + +/******************* + * Hash table base * + *******************/ + +#define __KHASHL_TYPE(HType, khkey_t) \ + typedef struct HType { \ + khint_t bits, count; \ + khint32_t *used; \ + khkey_t *keys; \ + } HType; + +#define __KHASHL_PROTOTYPES(HType, prefix, khkey_t) \ + extern HType *prefix##_init(void); \ + extern void prefix##_destroy(HType *h); \ + extern void prefix##_clear(HType *h); \ + extern khint_t prefix##_getp(const HType *h, const khkey_t *key); \ + extern void prefix##_resize(HType *h, khint_t new_n_buckets); \ + extern khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent); \ + extern void prefix##_del(HType *h, khint_t k); + +#define __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \ + SCOPE HType *prefix##_init(void) { \ + return (HType*)kcalloc(1, sizeof(HType)); \ + } \ + SCOPE void prefix##_release(HType *h) { \ + kfree((void *)h->keys); kfree(h->used); \ + } \ + SCOPE void prefix##_destroy(HType *h) { \ + if (!h) return; \ + prefix##_release(h); \ + kfree(h); \ + } \ + SCOPE void prefix##_clear(HType *h) { \ + if (h && h->used) { \ + khint_t n_buckets = (khint_t)1U << h->bits; \ + memset(h->used, 0, __kh_fsize(n_buckets) * sizeof(khint32_t)); \ + h->count = 0; \ + } \ + } + +#define __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + SCOPE khint_t prefix##_getp_core(const HType *h, const khkey_t *key, khint_t hash) { \ + khint_t i, last, n_buckets, mask; \ + if (!h->keys) return 0; \ + n_buckets = (khint_t)1U << h->bits; \ + mask = n_buckets - 1U; \ + i = last = __kh_h2b(hash, h->bits); \ + while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \ + i = (i + 1U) & mask; \ + if (i == last) return n_buckets; \ + } \ + return !__kh_used(h->used, i)? n_buckets : i; \ + } \ + SCOPE khint_t prefix##_getp(const HType *h, const khkey_t *key) { return prefix##_getp_core(h, key, __hash_fn(*key)); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { return prefix##_getp_core(h, &key, __hash_fn(key)); } + +#define __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { \ + khint32_t *new_used = NULL; \ + khint_t j = 0, x = new_n_buckets, n_buckets, new_bits, new_mask; \ + while ((x >>= 1) != 0) ++j; \ + if (new_n_buckets & (new_n_buckets - 1)) ++j; \ + new_bits = j > 2? j : 2; \ + new_n_buckets = (khint_t)1U << new_bits; \ + if (h->count > (new_n_buckets>>1) + (new_n_buckets>>2)) return; /* noop, requested size is too small */ \ + new_used = (khint32_t*)kcalloc(__kh_fsize(new_n_buckets), sizeof(khint32_t)); \ + n_buckets = h->keys? (khint_t)1U<<h->bits : 0U; \ + if (n_buckets < new_n_buckets) { /* expand */ \ + h->keys = (khkey_t *)xreallocarray(h->keys, \ + new_n_buckets, sizeof(khkey_t)); \ + } /* otherwise shrink */ \ + new_mask = new_n_buckets - 1; \ + for (j = 0; j != n_buckets; ++j) { \ + khkey_t key; \ + if (!__kh_used(h->used, j)) continue; \ + key = h->keys[j]; \ + __kh_set_unused(h->used, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t i; \ + i = __kh_h2b(__hash_fn(key), new_bits); \ + while (__kh_used(new_used, i)) i = (i + 1) & new_mask; \ + __kh_set_used(new_used, i); \ + if (i < n_buckets && __kh_used(h->used, i)) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + __kh_set_unused(h->used, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + break; \ + } \ + } \ + } \ + if (n_buckets > new_n_buckets) /* shrink the hash table */ \ + h->keys = (khkey_t *)xreallocarray(h->keys, \ + new_n_buckets, sizeof(khkey_t)); \ + kfree(h->used); /* free the working space */ \ + h->used = new_used, h->bits = new_bits; \ + } + +#define __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + SCOPE khint_t prefix##_putp_core(HType *h, const khkey_t *key, khint_t hash, int *absent) { \ + khint_t n_buckets, i, last, mask; \ + n_buckets = h->keys? (khint_t)1U<<h->bits : 0U; \ + *absent = -1; \ + if (h->count >= (n_buckets>>1) + (n_buckets>>2)) { /* rehashing */ \ + prefix##_resize(h, n_buckets + 1U); \ + n_buckets = (khint_t)1U<<h->bits; \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + mask = n_buckets - 1; \ + i = last = __kh_h2b(hash, h->bits); \ + while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \ + i = (i + 1U) & mask; \ + if (i == last) break; \ + } \ + if (!__kh_used(h->used, i)) { /* not present at all */ \ + h->keys[i] = *key; \ + __kh_set_used(h->used, i); \ + ++h->count; \ + *absent = 1; \ + } else *absent = 0; /* Don't touch h->keys[i] if present */ \ + return i; \ + } \ + SCOPE khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent) { return prefix##_putp_core(h, key, __hash_fn(*key), absent); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { return prefix##_putp_core(h, &key, __hash_fn(key), absent); } + +#define __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) \ + SCOPE int prefix##_del(HType *h, khint_t i) { \ + khint_t j = i, k, mask, n_buckets; \ + if (!h->keys) return 0; \ + n_buckets = (khint_t)1U<<h->bits; \ + mask = n_buckets - 1U; \ + while (1) { \ + j = (j + 1U) & mask; \ + if (j == i || !__kh_used(h->used, j)) break; /* j==i only when the table is completely full */ \ + k = __kh_h2b(__hash_fn(h->keys[j]), h->bits); \ + if ((j > i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) \ + h->keys[i] = h->keys[j], i = j; \ + } \ + __kh_set_unused(h->used, i); \ + --h->count; \ + return 1; \ + } + +#define KHASHL_DECLARE(HType, prefix, khkey_t) \ + __KHASHL_TYPE(HType, khkey_t) \ + __KHASHL_PROTOTYPES(HType, prefix, khkey_t) + +#define KHASHL_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_TYPE(HType, khkey_t) \ + __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \ + __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) + +#define KHASHE_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + KHASHL_INIT(KH_LOCAL, HType##_sub, prefix##_sub, khkey_t, __hash_fn, __hash_eq) \ + typedef struct HType { \ + khint64_t count:54, bits:8; \ + HType##_sub *sub; \ + } HType; \ + SCOPE HType *prefix##_init_sub(HType *g, size_t bits) { \ + g->bits = bits; \ + g->sub = (HType##_sub*)kcalloc(1U<<bits, sizeof(*g->sub)); \ + return g; \ + } \ + SCOPE HType *prefix##_init(void) { \ + HType *g; \ + g = (HType*)kcalloc(1, sizeof(*g)); \ + return prefix##_init_sub(g, 0); /* unsure about default */ \ + } \ + SCOPE void prefix##_release(HType *g) { \ + int t; \ + for (t = 0; t < 1<<g->bits; ++t) \ + prefix##_sub_release(&g->sub[t]); \ + kfree(g->sub); \ + } \ + SCOPE void prefix##_destroy(HType *g) { \ + if (!g) return; \ + prefix##_release(g); \ + kfree(g); \ + } \ + SCOPE void prefix##_clear(HType *g) { \ + int t; \ + if (!g) return; \ + for (t = 0; t < 1<<g->bits; ++t) \ + prefix##_sub_clear(&g->sub[t]); \ + } \ + SCOPE kh_ensitr_t prefix##_getp(const HType *g, const khkey_t *key) { \ + khint_t hash, low, ret; \ + kh_ensitr_t r; \ + HType##_sub *h; \ + hash = __hash_fn(*key); \ + low = hash & ((1U<<g->bits) - 1); \ + h = &g->sub[low]; \ + ret = prefix##_sub_getp_core(h, key, hash); \ + if (ret >= kh_end(h)) r.sub = low, r.pos = (khint_t)-1; \ + else r.sub = low, r.pos = ret; \ + return r; \ + } \ + SCOPE kh_ensitr_t prefix##_get(const HType *g, const khkey_t key) { return prefix##_getp(g, &key); } \ + SCOPE kh_ensitr_t prefix##_putp(HType *g, const khkey_t *key, int *absent) { \ + khint_t hash, low, ret; \ + kh_ensitr_t r; \ + HType##_sub *h; \ + hash = __hash_fn(*key); \ + low = hash & ((1U<<g->bits) - 1); \ + h = &g->sub[low]; \ + ret = prefix##_sub_putp_core(h, key, hash, absent); \ + if (*absent) ++g->count; \ + if (ret == 1U<<h->bits) r.sub = low, r.pos = (khint_t)-1; \ + else r.sub = low, r.pos = ret; \ + return r; \ + } \ + SCOPE kh_ensitr_t prefix##_put(HType *g, const khkey_t key, int *absent) { return prefix##_putp(g, &key, absent); } \ + SCOPE int prefix##_del(HType *g, kh_ensitr_t itr) { \ + HType##_sub *h = &g->sub[itr.sub]; \ + int ret; \ + ret = prefix##_sub_del(h, itr.pos); \ + if (ret) --g->count; \ + return ret; \ + } + +/***************************** + * More convenient interface * + *****************************/ + +#define __kh_packed /* noop, we use -Werror=address-of-packed-member */ +#define __kh_cached_hash(x) ((x).hash) + +#define KHASHL_SET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; } __kh_packed HType##_s_bucket_t; \ + static kh_inline khint_t prefix##_s_hash(HType##_s_bucket_t x) { return __hash_fn(x.key); } \ + static kh_inline int prefix##_s_eq(HType##_s_bucket_t x, HType##_s_bucket_t y) { return __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_s, HType##_s_bucket_t, prefix##_s_hash, prefix##_s_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_s_init(); } \ + SCOPE void prefix##_release(HType *h) { prefix##_s_release(h); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_s_destroy(h); } \ + SCOPE void prefix##_clear(HType *h) { prefix##_s_clear(h); } \ + SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { prefix##_s_resize(h, new_n_buckets); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_s_bucket_t t; t.key = key; return prefix##_s_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_s_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_s_bucket_t t; t.key = key; return prefix##_s_putp(h, &t, absent); } \ + +#define KHASHL_MAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; kh_val_t val; } __kh_packed HType##_m_bucket_t; \ + static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { return __hash_fn(x.key); } \ + static kh_inline int prefix##_m_eq(HType##_m_bucket_t x, HType##_m_bucket_t y) { return __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_m, HType##_m_bucket_t, prefix##_m_hash, prefix##_m_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_m_init(); } \ + SCOPE void prefix##_release(HType *h) { prefix##_m_release(h); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_m_destroy(h); } \ + SCOPE void prefix##_clear(HType *h) { prefix##_m_clear(h); } \ + SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { prefix##_m_resize(h, new_n_buckets); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_m_bucket_t t; t.key = key; return prefix##_m_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_m_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_m_bucket_t t; t.key = key; return prefix##_m_putp(h, &t, absent); } \ + +#define KHASHL_CSET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; khint_t hash; } __kh_packed HType##_cs_bucket_t; \ + static kh_inline int prefix##_cs_eq(HType##_cs_bucket_t x, HType##_cs_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_cs, HType##_cs_bucket_t, __kh_cached_hash, prefix##_cs_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_cs_init(); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_cs_destroy(h); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cs_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cs_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cs_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cs_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cs_putp(h, &t, absent); } + +#define KHASHL_CMAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; kh_val_t val; khint_t hash; } __kh_packed HType##_cm_bucket_t; \ + static kh_inline int prefix##_cm_eq(HType##_cm_bucket_t x, HType##_cm_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_cm, HType##_cm_bucket_t, __kh_cached_hash, prefix##_cm_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_cm_init(); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_cm_destroy(h); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cm_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cm_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cm_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cm_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cm_putp(h, &t, absent); } + +#define KHASHE_MAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; kh_val_t val; } __kh_packed HType##_m_bucket_t; \ + static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { return __hash_fn(x.key); } \ + static kh_inline int prefix##_m_eq(HType##_m_bucket_t x, HType##_m_bucket_t y) { return __hash_eq(x.key, y.key); } \ + KHASHE_INIT(KH_LOCAL, HType, prefix##_m, HType##_m_bucket_t, prefix##_m_hash, prefix##_m_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_m_init(); } \ + SCOPE void prefix##_release(HType *h) { prefix##_m_release(h); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_m_destroy(h); } \ + SCOPE void prefix##_clear(HType *h) { prefix##_m_clear(h); } \ + SCOPE void prefix##_resize(HType *h, khint_t ignore) { /* noop */ } \ + SCOPE kh_ensitr_t prefix##_get(const HType *h, khkey_t key) { HType##_m_bucket_t t; t.key = key; return prefix##_m_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, kh_ensitr_t k) { return prefix##_m_del(h, k); } \ + SCOPE kh_ensitr_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_m_bucket_t t; t.key = key; return prefix##_m_putp(h, &t, absent); } \ + +/************************** + * Public macro functions * + **************************/ + +#define kh_bucket(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->count) + +#define kh_capacity(h) ((h)->keys? 1U<<(h)->bits : 0U) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table + @return The end iterator [khint_t] + */ +#define kh_end(h) kh_capacity(h) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x].key) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->keys[x].val) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) kh_val(h, x) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) __kh_used((h)->used, (x)) + +#define kh_ens_key(g, x) kh_key(&(g)->sub[(x).sub], (x).pos) +#define kh_ens_val(g, x) kh_val(&(g)->sub[(x).sub], (x).pos) +#define kh_ens_exist(g, x) kh_exist(&(g)->sub[(x).sub], (x).pos) +#define kh_ens_is_end(x) ((x).pos == (khint_t)-1) +#define kh_ens_size(g) ((g)->count) + +/************************************** + * Common hash and equality functions * + **************************************/ + +#define kh_eq_generic(a, b) ((a) == (b)) +#define kh_eq_str(a, b) (strcmp((a), (b)) == 0) +#define kh_hash_dummy(x) ((khint_t)(x)) + +static kh_inline khint_t kh_hash_uint32(khint_t key) { + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +static kh_inline khint_t kh_hash_uint64(khint64_t key) { + key = ~key + (key << 21); + key = key ^ key >> 24; + key = (key + (key << 3)) + (key << 8); + key = key ^ key >> 14; + key = (key + (key << 2)) + (key << 4); + key = key ^ key >> 28; + key = key + (key << 31); + return (khint_t)key; +} + +#define KH_FNV_SEED 11 + +static kh_inline khint_t kh_hash_str(const char *s) { /* FNV1a */ + khint_t h = KH_FNV_SEED ^ 2166136261U; + const unsigned char *t = (const unsigned char*)s; + for (; *t; ++t) + h ^= *t, h *= 16777619; + return h; +} + +static kh_inline khint_t kh_hash_bytes(int len, const unsigned char *s) { + khint_t h = KH_FNV_SEED ^ 2166136261U; + int i; + for (i = 0; i < len; ++i) + h ^= s[i], h *= 16777619; + return h; +} + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +#define kh_ens_foreach(g, kvar, vvar, code) do { \ + size_t t; \ + for (t = 0; t < 1<<g->bits; ++t) \ + kh_foreach(&g->sub[t], kvar, vvar, code); \ +} while (0) + +#define kh_ens_foreach_value(g, vvar, code) do { \ + size_t t; \ + for (t = 0; t < 1<<g->bits; ++t) \ + kh_foreach_value(&g->sub[t], vvar, code); \ +} while (0) + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +#endif /* __AC_KHASHL_H */ diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h index a30a8768..51ab48bf 100644 --- a/lib/PublicInbox/xap_helper.h +++ b/lib/PublicInbox/xap_helper.h @@ -7,7 +7,7 @@ * this is not linked to Perl in any way. * C (not C++) is used as much as possible to lower the contribution * barrier for hackers who mainly know C (this includes the maintainer). - * Yes, that means we use C stdlib stuff like hsearch and open_memstream + * Yes, that means we use C stdlib stuff like open_memstream * instead their equivalents in the C++ stdlib :P * Everything here is an unstable internal API of public-inbox and * NOT intended for ordinary users; only public-inbox hackers @@ -15,6 +15,9 @@ #ifndef _ALL_SOURCE # define _ALL_SOURCE #endif +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif #if defined(__NetBSD__) && !defined(_OPENBSD_SOURCE) // for reallocarray(3) # define _OPENBSD_SOURCE #endif @@ -34,7 +37,6 @@ #include <errno.h> #include <fcntl.h> #include <limits.h> -#include <search.h> #include <signal.h> #include <stddef.h> #include <stdint.h> @@ -83,6 +85,62 @@ #define ABORT(...) do { warnx(__VA_ARGS__); abort(); } while (0) #define EABORT(...) do { warn(__VA_ARGS__); abort(); } while (0) +static void *xcalloc(size_t nmemb, size_t size) +{ + void *ret = calloc(nmemb, size); + if (!ret) EABORT("calloc(%zu, %zu)", nmemb, size); + return ret; +} + +#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \ + MY_VER(__GLIBC__, __GLIBC_MINOR__, 0) >= MY_VER(2, 28, 0) +# define HAVE_REALLOCARRAY 1 +#elif defined(__OpenBSD__) || defined(__DragonFly__) || \ + defined(__FreeBSD__) || defined(__NetBSD__) +# define HAVE_REALLOCARRAY 1 +#endif + +static void *xreallocarray(void *ptr, size_t nmemb, size_t size) +{ +#ifdef HAVE_REALLOCARRAY + void *ret = reallocarray(ptr, nmemb, size); +#else // can't rely on __builtin_mul_overflow in gcc 4.x :< + void *ret = NULL; + if (nmemb && size > SIZE_MAX / nmemb) + errno = ENOMEM; + else + ret = realloc(ptr, nmemb * size); +#endif + if (!ret) EABORT("reallocarray(..., %zu, %zu)", nmemb, size); + return ret; +} + +#include "khashl.h" + +struct srch { + int ckey_len; // int for comparisons + unsigned qp_flags; + bool qp_extra_done; + Xapian::Database *db; + Xapian::QueryParser *qp; + unsigned char ckey[]; // $shard_path0\0$shard_path1\0... +}; + +static khint_t srch_hash(const struct srch *srch) +{ + return kh_hash_bytes(srch->ckey_len, srch->ckey); +} + +static int srch_eq(const struct srch *a, const struct srch *b) +{ + return a->ckey_len == b->ckey_len ? + !memcmp(a->ckey, b->ckey, (size_t)a->ckey_len) : 0; +} + +KHASHL_CSET_INIT(KH_LOCAL, srch_set, srch_set, struct srch *, + srch_hash, srch_eq) +static srch_set *srch_cache; +static long my_fd_max, shard_nfd; // sock_fd is modified in signal handler, yes, it's SOCK_SEQPACKET static volatile int sock_fd = STDIN_FILENO; static sigset_t fullset, workerset; @@ -91,7 +149,6 @@ static bool alive = true; static FILE *orig_err = stderr; #endif static int orig_err_fd = -1; -static void *srch_tree; // tsearch + tdelete + twalk static pid_t *worker_pids; // nr => pid #define WORKER_MAX USHRT_MAX static unsigned long nworker, nworker_hwm; @@ -111,15 +168,6 @@ enum exc_iter { ITER_ABORT }; -struct srch { - int paths_len; // int for comparisons - unsigned qp_flags; - bool qp_extra_done; - Xapian::Database *db; - Xapian::QueryParser *qp; - char paths[]; // $shard_path0\0$shard_path1\0... -}; - #define MY_ARG_MAX 256 typedef bool (*cmd)(struct req *); @@ -128,6 +176,7 @@ struct req { // argv and pfxv point into global rbuf char *argv[MY_ARG_MAX]; char *pfxv[MY_ARG_MAX]; // -A <prefix> char *qpfxv[MY_ARG_MAX]; // -Q <user_prefix>[:=]<INTERNAL_PREFIX> + char *dirv[MY_ARG_MAX]; // -d /path/to/XDB(shard) size_t *lenv; // -A <prefix>LENGTH struct srch *srch; char *Pgit_dir; @@ -139,9 +188,7 @@ struct req { // argv and pfxv point into global rbuf unsigned long timeout_sec; size_t nr_out; long sort_col; // value column, negative means BoolWeight - int argc; - int pfxc; - int qpfxc; + int argc, pfxc, qpfxc, dirc; FILE *fp[2]; // [0] response pipe or sock, [1] status/errors (optional) bool has_input; // fp[0] is bidirectional bool collapse_threads; @@ -375,25 +422,6 @@ static size_t off2size(off_t n) return (size_t)n; } -static char *hsearch_enter_key(char *s) -{ -#if defined(__OpenBSD__) || defined(__DragonFly__) - // hdestroy frees each key on some platforms, - // so give it something to free: - char *ret = strdup(s); - if (!ret) err(EXIT_FAILURE, "strdup"); - return ret; -// AFAIK there's no way to detect musl, assume non-glibc Linux is musl: -#elif defined(__GLIBC__) || defined(__linux__) || \ - defined(__FreeBSD__) || defined(__NetBSD__) - // do nothing on these platforms -#else -#warning untested platform detected, unsure if hdestroy(3) frees keys -#warning contact us at meta@public-inbox.org if you get segfaults -#endif - return s; -} - // for test usage only, we need to ensure the compiler supports // __cleanup__ when exceptions are thrown struct inspect { struct req *req; }; @@ -512,15 +540,6 @@ again: return false; } -static int srch_cmp(const void *pa, const void *pb) // for tfind|tsearch -{ - const struct srch *a = (const struct srch *)pa; - const struct srch *b = (const struct srch *)pb; - int diff = a->paths_len - b->paths_len; - - return diff ? diff : memcmp(a->paths, b->paths, (size_t)a->paths_len); -} - static bool is_chert(const char *dir) { char iamchert[PATH_MAX]; @@ -534,49 +553,85 @@ static bool is_chert(const char *dir) return false; } -static bool srch_init(struct req *req) +static void srch_free(struct srch *srch) +{ + delete srch->qp; + delete srch->db; + free(srch); +} + +static void srch_cache_renew(struct srch *keep) +{ + khint_t k; + + // can't delete while iterating, so just free each + clear + for (k = kh_begin(srch_cache); k != kh_end(srch_cache); k++) { + if (!kh_exist(srch_cache, k)) continue; + struct srch *cur = kh_key(srch_cache, k); + + if (cur != keep) + srch_free(cur); + } + srch_set_cs_clear(srch_cache); + if (keep) { + int absent; + k = srch_set_put(srch_cache, keep, &absent); + assert(absent); + assert(k < kh_end(srch_cache)); + } +} + +static void srch_init(struct req *req) { - char *dirv[MY_ARG_MAX]; int i; struct srch *srch = req->srch; - int dirc = (int)SPLIT2ARGV(dirv, srch->paths, (size_t)srch->paths_len); const unsigned FLAG_PHRASE = Xapian::QueryParser::FLAG_PHRASE; - srch->qp_flags = FLAG_PHRASE | - Xapian::QueryParser::FLAG_BOOLEAN | + srch->qp_flags = Xapian::QueryParser::FLAG_BOOLEAN | Xapian::QueryParser::FLAG_LOVEHATE | Xapian::QueryParser::FLAG_WILDCARD; - if (is_chert(dirv[0])) - srch->qp_flags &= ~FLAG_PHRASE; - try { - srch->db = new Xapian::Database(dirv[0]); - } catch (...) { - warn("E: Xapian::Database(%s)", dirv[0]); - return false; + long nfd = req->dirc * SHARD_COST; + + shard_nfd += nfd; + if (shard_nfd > my_fd_max) { + srch_cache_renew(srch); + shard_nfd = nfd; } - try { - for (i = 1; i < dirc; i++) { - if (srch->qp_flags & FLAG_PHRASE && is_chert(dirv[i])) + for (int retried = 0; retried < 2; retried++) { + srch->qp_flags |= FLAG_PHRASE; + i = 0; + try { + srch->db = new Xapian::Database(req->dirv[i]); + if (is_chert(req->dirv[0])) srch->qp_flags &= ~FLAG_PHRASE; - srch->db->add_database(Xapian::Database(dirv[i])); + for (i = 1; i < req->dirc; i++) { + const char *dir = req->dirv[i]; + if (srch->qp_flags & FLAG_PHRASE && + is_chert(dir)) + srch->qp_flags &= ~FLAG_PHRASE; + srch->db->add_database(Xapian::Database(dir)); + } + break; + } catch (const Xapian::Error & e) { + warnx("E: Xapian::Error: %s (%s)", + e.get_description().c_str(), req->dirv[i]); + } catch (...) { // does this happen? + warn("E: add_database(%s)", req->dirv[i]); + } + if (retried) { + errx(EXIT_FAILURE, "E: can't open %s", req->dirv[i]); + } else { + warnx("retrying..."); + if (srch->db) + delete srch->db; + srch->db = NULL; + srch_cache_renew(srch); } - } catch (...) { - warn("E: add_database(%s)", dirv[i]); - return false; - } - try { - srch->qp = new Xapian::QueryParser; - } catch (...) { - perror("E: Xapian::QueryParser"); - return false; } + // these will raise and die on ENOMEM or other errors + srch->qp = new Xapian::QueryParser; srch->qp->set_default_op(Xapian::Query::OP_AND); srch->qp->set_database(*srch->db); - try { - srch->qp->set_stemmer(Xapian::Stem("english")); - } catch (...) { - perror("E: Xapian::Stem"); - return false; - } + srch->qp->set_stemmer(Xapian::Stem("english")); srch->qp->set_stemming_strategy(Xapian::QueryParser::STEM_SOME); srch->qp->SET_MAX_EXPANSION(100); @@ -584,7 +639,6 @@ static bool srch_init(struct req *req) qp_init_code_search(srch->qp); // CodeSearch.pm else qp_init_mail_search(srch->qp); // Search.pm - return true; } // setup query parser for altid and arbitrary headers @@ -612,14 +666,6 @@ static void srch_init_extra(struct req *req) req->srch->qp_extra_done = true; } -static void free_srch(void *p) // tdestroy -{ - struct srch *srch = (struct srch *)p; - delete srch->qp; - delete srch->db; - free(srch); -} - static void dispatch(struct req *req) { int c; @@ -630,7 +676,6 @@ static void dispatch(struct req *req) } kbuf; char *end; FILE *kfp; - struct srch **s; req->threadid = ULLONG_MAX; for (c = 0; c < (int)MY_ARRAY_SIZE(cmds); c++) { if (cmds[c].fn_len == size && @@ -644,7 +689,7 @@ static void dispatch(struct req *req) kfp = open_memstream(&kbuf.ptr, &size); if (!kfp) err(EXIT_FAILURE, "open_memstream(kbuf)"); // write padding, first (contents don't matter) - fwrite(&req->argv[0], offsetof(struct srch, paths), 1, kfp); + fwrite(&req->argv[0], offsetof(struct srch, ckey), 1, kfp); // global getopt variables: optopt = 0; @@ -656,7 +701,11 @@ static void dispatch(struct req *req) switch (c) { case 'a': req->asc = true; break; case 'c': req->code_search = true; break; - case 'd': fwrite(optarg, strlen(optarg) + 1, 1, kfp); break; + case 'd': + req->dirv[req->dirc++] = optarg; + if (MY_ARG_MAX == req->dirc) ABORT("too many -d"); + fprintf(kfp, "-d%c%s%c", 0, optarg, 0); + break; case 'g': req->Pgit_dir = optarg - 1; break; // pad "P" prefix case 'k': req->sort_col = strtol(optarg, &end, 10); @@ -696,6 +745,7 @@ static void dispatch(struct req *req) case 'Q': req->qpfxv[req->qpfxc++] = optarg; if (MY_ARG_MAX == req->qpfxc) ABORT("too many -Q"); + fprintf(kfp, "-Q%c%s%c", 0, optarg, 0); break; default: ABORT("bad switch `-%c'", c); } @@ -704,22 +754,20 @@ static void dispatch(struct req *req) kbuf.srch->db = NULL; kbuf.srch->qp = NULL; kbuf.srch->qp_extra_done = false; - kbuf.srch->paths_len = size - offsetof(struct srch, paths); - if (kbuf.srch->paths_len <= 0) - ABORT("no -d args"); - s = (struct srch **)tsearch(kbuf.srch, &srch_tree, srch_cmp); - if (!s) err(EXIT_FAILURE, "tsearch"); // likely ENOMEM - req->srch = *s; - if (req->srch != kbuf.srch) { // reuse existing - free_srch(kbuf.srch); + kbuf.srch->ckey_len = size - offsetof(struct srch, ckey); + if (kbuf.srch->ckey_len <= 0 || !req->dirc) + ABORT("no -d args (or too many)"); + + int absent; + khint_t ki = srch_set_put(srch_cache, kbuf.srch, &absent); + assert(ki < kh_end(srch_cache)); + req->srch = kh_key(srch_cache, ki); + if (absent) { + srch_init(req); + } else { + assert(req->srch != kbuf.srch); + srch_free(kbuf.srch); req->srch->db->reopen(); - } else if (!srch_init(req)) { - assert(kbuf.srch == *((struct srch **)tfind( - kbuf.srch, &srch_tree, srch_cmp))); - void *del = tdelete(kbuf.srch, &srch_tree, srch_cmp); - assert(del); - free_srch(kbuf.srch); - goto cmd_err; // srch_init already warned } if (req->qpfxc && !req->srch->qp_extra_done) srch_init_extra(req); @@ -736,8 +784,6 @@ static void dispatch(struct req *req) } if (req->timeout_sec) alarm(0); -cmd_err: - return; // just be silent on errors, for now } static void cleanup_pids(void) @@ -877,10 +923,11 @@ static void start_workers(void) static void cleanup_all(void) { cleanup_pids(); -#ifdef __GLIBC__ - tdestroy(srch_tree, free_srch); - srch_tree = NULL; -#endif + if (!srch_cache) + return; + srch_cache_renew(NULL); + srch_set_destroy(srch_cache); + srch_cache = NULL; } static void parent_reopen_logs(void) @@ -1014,14 +1061,23 @@ int main(int argc, char *argv[]) socklen_t slen = (socklen_t)sizeof(c); stdout_path = getenv("STDOUT_PATH"); stderr_path = getenv("STDERR_PATH"); + struct rlimit rl; if (getsockopt(sock_fd, SOL_SOCKET, SO_TYPE, &c, &slen)) err(EXIT_FAILURE, "getsockopt"); if (c != SOCK_SEQPACKET) errx(EXIT_FAILURE, "stdin is not SOCK_SEQPACKET"); + if (getrlimit(RLIMIT_NOFILE, &rl)) + err(EXIT_FAILURE, "getrlimit"); + my_fd_max = rl.rlim_cur; + if (my_fd_max < 72) + warnx("W: RLIMIT_NOFILE=%ld too low\n", my_fd_max); + my_fd_max -= 64; + mail_nrp_init(); code_nrp_init(); + srch_cache = srch_set_init(); atexit(cleanup_all); if (!STDERR_ASSIGNABLE) { @@ -1082,8 +1138,7 @@ int main(int argc, char *argv[]) CHECK(int, 0, sigdelset(&workerset, SIGTERM)); CHECK(int, 0, sigdelset(&workerset, SIGCHLD)); nworker_hwm = nworker; - worker_pids = (pid_t *)calloc(nworker, sizeof(pid_t)); - if (!worker_pids) err(EXIT_FAILURE, "calloc"); + worker_pids = (pid_t *)xcalloc(nworker, sizeof(pid_t)); if (pipe(pipefds)) err(EXIT_FAILURE, "pipe"); int fl = fcntl(pipefds[1], F_GETFL); diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h index 311ca05f..8cc6a845 100644 --- a/lib/PublicInbox/xh_cidx.h +++ b/lib/PublicInbox/xh_cidx.h @@ -3,11 +3,38 @@ // This file is only intended to be included by xap_helper.h // it implements pieces used by CodeSearchIdx.pm +// TODO: consider making PublicInbox::CodeSearchIdx emit binary +// (20 or 32-bit) OIDs instead of ASCII hex. It would require +// more code in both Perl and C++, though... + +// assumes trusted data from same host +static inline unsigned int hex2uint(char c) +{ + switch (c) { + case '0' ... '9': return c - '0'; + case 'a' ... 'f': return c - 'a' + 10; + default: return 0xff; // oh well... + } +} + +// assumes trusted data from same host +static kh_inline khint_t sha_hex_hash(const char *hex) +{ + khint_t ret = 0; + + for (size_t shift = 32; shift; ) + ret |= hex2uint(*hex++) << (shift -= 4); + + return ret; +} + +KHASHL_CMAP_INIT(KH_LOCAL, root2id_map, root2id, + const char *, const char *, + sha_hex_hash, kh_eq_str) + static void term_length_extract(struct req *req) { - req->lenv = (size_t *)calloc(req->pfxc, sizeof(size_t)); - if (!req->lenv) - EABORT("lenv = calloc(%d %zu)", req->pfxc, sizeof(size_t)); + req->lenv = (size_t *)xcalloc(req->pfxc, sizeof(size_t)); for (int i = 0; i < req->pfxc; i++) { char *pfx = req->pfxv[i]; // extract trailing digits as length: @@ -101,6 +128,7 @@ struct dump_roots_tmp { void *mm_ptr; char **entries; struct fbuf wbuf; + root2id_map *root2id; int root2off_fd; }; @@ -110,7 +138,8 @@ static void dump_roots_ensure(void *ptr) struct dump_roots_tmp *drt = (struct dump_roots_tmp *)ptr; if (drt->root2off_fd >= 0) xclose(drt->root2off_fd); - hdestroy(); // idempotent + if (drt->root2id) + root2id_cm_destroy(drt->root2id); size_t size = off2size(drt->sb.st_size); if (drt->mm_ptr && munmap(drt->mm_ptr, size)) EABORT("BUG: munmap(%p, %zu)", drt->mm_ptr, size); @@ -118,23 +147,21 @@ static void dump_roots_ensure(void *ptr) fbuf_ensure(&drt->wbuf); } -static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc) +static bool root2offs_str(struct dump_roots_tmp *drt, + struct fbuf *root_offs, Xapian::Document *doc) { Xapian::TermIterator cur = doc->termlist_begin(); Xapian::TermIterator end = doc->termlist_end(); - ENTRY e, *ep; fbuf_init(root_offs); for (cur.skip_to("G"); cur != end; cur++) { std::string tn = *cur; if (!starts_with(&tn, "G", 1)) break; - union { const char *in; char *out; } u; - u.in = tn.c_str() + 1; - e.key = u.out; - ep = hsearch(e, FIND); - if (!ep) ABORT("hsearch miss `%s'", e.key); - // ep->data is a NUL-terminated string matching /[0-9]+/ + khint_t i = root2id_get(drt->root2id, tn.c_str() + 1); + if (i >= kh_end(drt->root2id)) + ABORT("kh get miss `%s'", tn.c_str() + 1); fputc(' ', root_offs->fp); - fputs((const char *)ep->data, root_offs->fp); + // kh_val(...) is a NUL-terminated string matching /[0-9]+/ + fputs(kh_val(drt->root2id, i), root_offs->fp); } fputc('\n', root_offs->fp); ERR_CLOSE(root_offs->fp, EXIT_FAILURE); // ENOMEM @@ -198,7 +225,7 @@ static enum exc_iter dump_roots_iter(struct req *req, CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n" try { Xapian::Document doc = i->get_document(); - if (!root2offs_str(&root_offs, &doc)) + if (!root2offs_str(drt, &root_offs, &doc)) return ITER_ABORT; // bad request, abort for (int p = 0; p < req->pfxc; p++) dump_roots_term(req, p, drt, &root_offs, &doc); @@ -226,8 +253,7 @@ static bool cmd_dump_roots(struct req *req) if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM? err(EXIT_FAILURE, "fstat(%s)", root2off_file); // each entry is at least 43 bytes ({OIDHEX}\0{INT}\0), - // so /32 overestimates the number of expected entries by - // ~%25 (as recommended by Linux hcreate(3) manpage) + // so /32 overestimates the number of expected entries size_t size = off2size(drt.sb.st_size); size_t est = (size / 32) + 1; //+1 for "\0" termination drt.mm_ptr = mmap(NULL, size, PROT_READ, @@ -236,20 +262,19 @@ static bool cmd_dump_roots(struct req *req) err(EXIT_FAILURE, "mmap(%zu, %s)", size, root2off_file); size_t asize = est * 2; if (asize < est) ABORT("too many entries: %zu", est); - drt.entries = (char **)calloc(asize, sizeof(char *)); - if (!drt.entries) - err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *)); + drt.entries = (char **)xcalloc(asize, sizeof(char *)); size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr, size, asize); if (tot <= 0) return false; // split2argv already warned on error - if (!hcreate(est)) - err(EXIT_FAILURE, "hcreate(%zu)", est); + drt.root2id = root2id_init(); + root2id_cm_resize(drt.root2id, est); for (size_t i = 0; i < tot; ) { - ENTRY e; - e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM - e.data = drt.entries[i++]; - if (!hsearch(e, ENTER)) - err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key, - (const char *)e.data); + int absent; + const char *key = drt.entries[i++]; + khint_t k = root2id_put(drt.root2id, key, &absent); + if (!absent) + err(EXIT_FAILURE, "put(%s => %s, ENTER)", + key, drt.entries[i]); + kh_val(drt.root2id, k) = drt.entries[i++]; } req->asc = true; req->sort_col = -1; |