diff options
Diffstat (limited to 'lib/PublicInbox')
26 files changed, 943 insertions, 259 deletions
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index d6300610..e1843912 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -13,6 +13,7 @@ use v5.10.1; use parent qw(Exporter); our @EXPORT_OK = qw(glob2re rel2abs_collapsed); use PublicInbox::Inbox; +use PublicInbox::Git qw(git_exe); use PublicInbox::Spawn qw(popen_rd run_qx); our $LD_PRELOAD = $ENV{LD_PRELOAD}; # only valid at startup our $DEDUPE; # set to {} to dedupe or clear cache @@ -188,7 +189,7 @@ sub git_config_dump { unshift(@opt_c, '-c', "include.path=$file") if defined($file); tmp_cmd_opt(\%env, $opt); } - my @cmd = ('git', @opt_c, qw(config -z -l --includes)); + my @cmd = (git_exe, @opt_c, qw(config -z -l --includes)); push(@cmd, '-f', $file) if !@opt_c && defined($file); my $fh = popen_rd(\@cmd, \%env, $opt); my $rv = config_fh_parse($fh, "\0", "\n"); @@ -516,7 +517,9 @@ sub _fill_ibx { delete $ibx->{newsgroup}; warn "newsgroup name invalid: `$ngname'\n"; } else { - my $lc = $ibx->{newsgroup} = lc $ngname; + %dedupe = (lc($ngname) => undef); + my ($lc) = keys %dedupe; + $ibx->{newsgroup} = $lc; warn <<EOM if $lc ne $ngname; W: newsgroup=`$ngname' lowercased to `$lc' EOM @@ -608,7 +611,7 @@ sub config_cmd { my ($self, $env, $opt) = @_; my $f = $self->{-f} // default_file(); my @opt_c = @{$self->{-opt_c} // []}; - my @cmd = ('git', @opt_c, 'config'); + my @cmd = (git_exe, @opt_c, 'config'); @opt_c ? tmp_cmd_opt($env, $opt) : push(@cmd, '-f', $f); \@cmd; } diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 774fa47b..934197c0 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -543,13 +543,7 @@ sub _ibx_for ($$$) { sub _fd_constrained ($) { my ($self) = @_; $self->{-fd_constrained} //= do { - my $soft; - if (eval { require BSD::Resource; 1 }) { - my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); - ($soft, undef) = BSD::Resource::getrlimit($NOFILE); - } else { - chomp($soft = `sh -c 'ulimit -n'`); - } + my $soft = PublicInbox::Search::ulimit_n; if (defined($soft)) { # $want is an estimate my $want = scalar(@{$self->{ibx_active}}) + 64; @@ -1288,10 +1282,10 @@ sub idx_init { # similar to V2Writable $self->{mg}->write_alternates($mode, $alt, $new); my $restore = $self->with_umask; if ($git_midx && ($opt->{'multi-pack-index'} // 1)) { - my @cmd = ('multi-pack-index'); - push @cmd, '--no-progress' if ($opt->{quiet}//0) > 1; + my $cmd = $self->git->cmd('multi-pack-index'); + push @$cmd, '--no-progress' if ($opt->{quiet}//0) > 1; my $lk = $self->lock_for_scope; - system('git', "--git-dir=$ALL", @cmd, 'write'); + system(@$cmd, 'write'); # ignore errors, fairly new command, may not exist } $self->parallel_init($self->{indexlevel}); diff --git a/lib/PublicInbox/Fetch.pm b/lib/PublicInbox/Fetch.pm index b0f1437c..814d6e8e 100644 --- a/lib/PublicInbox/Fetch.pm +++ b/lib/PublicInbox/Fetch.pm @@ -12,6 +12,7 @@ use PublicInbox::LeiCurl; use PublicInbox::LeiMirror; use PublicInbox::SHA qw(sha_all); use File::Temp (); +use PublicInbox::Git qw(git_exe); sub new { bless {}, __PACKAGE__ } @@ -19,7 +20,7 @@ sub remote_url ($$) { my ($lei, $dir) = @_; my $rn = $lei->{opt}->{'try-remote'} // [ 'origin', '_grokmirror' ]; for my $r (@$rn) { - my $cmd = [ qw(git config), "remote.$r.url" ]; + my $cmd = [ git_exe, 'config', "remote.$r.url" ]; my $url = run_qx($cmd, undef, { -C => $dir, 2 => $lei->{2} }); next if $?; $url =~ s!/*\n!!s; @@ -92,7 +93,7 @@ sub do_manifest ($$$) { sub get_fingerprint2 { my ($git_dir) = @_; - my $rd = popen_rd([qw(git show-ref)], undef, { -C => $git_dir }); + my $rd = popen_rd([git_exe, 'show-ref'], undef, { -C => $git_dir }); sha_all(256, $rd)->digest; # ignore show-ref errors } @@ -132,8 +133,8 @@ sub do_fetch { # main entry point warn "W: $edir missing remote.*.url\n"; my $o = { -C => $edir }; $o->{1} = $o->{2} = $lei->{2}; - run_wait([qw(git config -l)], undef, $o) and - $lei->child_error($?); + run_wait([git_exe, qw(config -l)], undef, $o) + and $lei->child_error($?); } } @epochs = grep { !$skip->{$_} } @epochs if $skip; @@ -188,7 +189,7 @@ EOM my $opt = {}; # for spawn if (-d $d) { $fp2->[0] = get_fingerprint2($d) if $fp2; - $cmd = [ @$torsocks, 'git', "--git-dir=$d", + $cmd = [ @$torsocks, git_exe, "--git-dir=$d", PublicInbox::LeiMirror::fetch_args($lei, $opt)]; } else { my $e_uri = $ibx_uri->clone; diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm index 78392990..acc2091c 100644 --- a/lib/PublicInbox/Gcf2.pm +++ b/lib/PublicInbox/Gcf2.pm @@ -11,7 +11,7 @@ use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); use IO::Handle; # autoflush use PublicInbox::Git qw($ck_unlinked_packs); use PublicInbox::Lock; -use autodie qw(close open seek truncate); +use autodie qw(open seek truncate); BEGIN { my (%CFG, $c_src); diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index aea389e8..6b722023 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -10,6 +10,7 @@ package PublicInbox::Git; use strict; use v5.10.1; use parent qw(Exporter PublicInbox::DS); +use PublicInbox::DS qw(now); use autodie qw(socketpair read); use POSIX (); use Socket qw(AF_UNIX SOCK_STREAM); @@ -25,7 +26,7 @@ use PublicInbox::SHA qw(sha_all); our %HEXLEN2SHA = (40 => 1, 64 => 256); our %OFMT2HEXLEN = (sha1 => 40, sha256 => 64); our @EXPORT_OK = qw(git_unquote git_quote %HEXLEN2SHA %OFMT2HEXLEN - $ck_unlinked_packs); + $ck_unlinked_packs git_exe); our $in_cleanup; our $async_warn; # true in read-only daemons @@ -54,7 +55,11 @@ my %ESC_GIT = map { $GIT_ESC{$_} => $_ } keys %GIT_ESC; my $EXE_ST = ''; # pack('dd', st_dev, st_ino); # no `q' in some 32-bit builds my ($GIT_EXE, $GIT_VER); -sub check_git_exe () { +sub git_exe () { + my $now = now; + state $next_check = $now - 10; + return $GIT_EXE if $now < $next_check; + $next_check = $now + 10; $GIT_EXE = which('git') // die "git not found in $ENV{PATH}"; my @st = stat(_) or die "stat($GIT_EXE): $!"; # can't do HiRes w/ _ my $st = pack('dd', $st[0], $st[1]); @@ -69,8 +74,8 @@ sub check_git_exe () { $GIT_EXE; } -sub git_version { - check_git_exe(); +sub git_version () { + git_exe; $GIT_VER; } @@ -103,9 +108,10 @@ sub new { sub git_path ($$) { my ($self, $path) = @_; $self->{-git_path}->{$path} //= do { - my $d = "$self->{git_dir}/$path"; - if (-e $d) { - $d; + my $d = $self->{git_dir}; + my $f = "$d/$path"; + if (-d "$d/objects") { + $f; } else { local $/ = "\n"; my $rdr = { 2 => \my $err }; @@ -114,7 +120,7 @@ sub git_path ($$) { chomp $s; # git prior to 2.5.0 did not understand --git-path - $s eq "--git-path\n$path" ? $d : $s; + $s eq "--git-path\n$path" ? $f : $s; } }; } @@ -174,7 +180,7 @@ sub _sock_cmd { # git 2.31.0+ supports -c core.abbrev=no, don't bother with # core.abbrev=64 since not many releases had SHA-256 prior to 2.31 - my $abbr = $GIT_VER lt v2.31.0 ? 40 : 'no'; + my $abbr = git_version lt v2.31.0 ? 40 : 'no'; my @cmd = ($GIT_EXE, "--git-dir=$gd", '-c', "core.abbrev=$abbr", 'cat-file', "--$batch"); if ($err_c) { @@ -203,7 +209,7 @@ sub cat_async_retry ($$) { $oid = \$oid if !@$new_inflight; # to indicate oid retried push @$new_inflight, $oid, $cb, $arg; } - $sock->close if $sock; # only safe once old_inflight is empty + undef $sock; # gcf_drain may run from PublicInbox::IO::DESTROY cat_async_step($self, $new_inflight); # take one step } @@ -287,8 +293,7 @@ sub cat_async_wait ($) { sub batch_prepare ($) { my ($self) = @_; - check_git_exe(); - if ($GIT_VER ge BATCH_CMD_VER) { + if (git_version ge BATCH_CMD_VER) { $self->{-bc} = 1; _sock_cmd($self, 'batch-command', 1); } else { @@ -344,8 +349,7 @@ sub ck { sub check_async_begin ($) { my ($self) = @_; cleanup($self) if alternates_changed($self); - check_git_exe(); - if ($GIT_VER ge BATCH_CMD_VER) { + if (git_version ge BATCH_CMD_VER) { $self->{-bc} = 1; _sock_cmd($self, 'batch-command', 1); } else { @@ -421,15 +425,15 @@ sub async_err ($$$$$) { sub cmd { my $self = shift; - [ $GIT_EXE // check_git_exe(), "--git-dir=$self->{git_dir}", @_ ] + [ git_exe(), "--git-dir=$self->{git_dir}", @_ ] } # $git->popen(qw(show f00)); # or # $git->popen(qw(show f00), { GIT_CONFIG => ... }, { 2 => ... }); sub popen { my ($self, $cmd) = splice(@_, 0, 2); - $cmd = [ 'git', "--git-dir=$self->{git_dir}", - ref($cmd) ? @$cmd : ($cmd, grep { defined && !ref } @_) ]; + $cmd = $self->cmd(ref($cmd) ? @$cmd : + ($cmd, grep { defined && !ref } @_)); popen_rd($cmd, grep { !defined || ref } @_); # env and opt } @@ -577,9 +581,8 @@ sub cloneurl { # templates/this--description in git.git sub manifest_entry { my ($self, $epoch, $default_desc) = @_; - check_git_exe(); my $gd = $self->{git_dir}; - my @git = ($GIT_EXE, "--git-dir=$gd"); + my @git = (git_exe, "--git-dir=$gd"); my $sr = popen_rd([@git, 'show-ref']); my $own = popen_rd([@git, qw(config gitweb.owner)]); my $mod = popen_rd([@git, @MODIFIED_DATE]); @@ -637,7 +640,8 @@ sub event_step { my ($self) = @_; my $inflight = gcf_inflight($self); if ($inflight && @$inflight) { - $self->cat_async_step($inflight); + eval { $self->cat_async_step($inflight) }; + warn "E: $self->{git_dir}: $@" if $@; return $self->close unless $self->{sock}; # don't loop here to keep things fair, but we must requeue # if there's already-read data in pi_io_rbuf @@ -662,10 +666,9 @@ sub watch_async ($) { sub close { my ($self) = @_; - my $sock = $self->{sock}; delete @$self{qw(-bc err_c inflight)}; delete($self->{epwatch}) ? $self->SUPER::close : delete($self->{sock}); - $sock->close if $sock; # calls gcf_drain via awaitpid + # gcf_drain may run from PublicInbox::IO::DESTROY } package PublicInbox::GitCheck; # only for git <2.36 diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm index ed34d548..fefc282a 100644 --- a/lib/PublicInbox/Import.pm +++ b/lib/PublicInbox/Import.pm @@ -73,8 +73,8 @@ sub gfi_start { die "fatal: ls-tree -r -z --name-only $ref: \$?=$?" if $?; $self->{-tree} = { map { $_ => 1 } split(/\0/, $t) }; } - my $gfi = [ 'git', "--git-dir=$git->{git_dir}", qw(fast-import - --quiet --done --date-format=raw) ]; + my $gfi = $git->cmd(qw(fast-import + --quiet --done --date-format=raw)); my $pid = spawn($gfi, undef, { 0 => $s2, 1 => $s2 }); $self->{nchg} = 0; $self->{io} = PublicInbox::IO::attach_pid($io, $pid); @@ -161,7 +161,7 @@ sub _update_git_info ($$) { # for compatibility with existing ssoma installations # we can probably remove this entirely by 2020 my $git_dir = $self->{git}->{git_dir}; - my @cmd = ('git', "--git-dir=$git_dir"); + my @cmd = @{$self->{git}->cmd}; my $index = "$git_dir/ssoma.index"; if (-e $index && !$ENV{FAST}) { my $env = { GIT_INDEX_FILE => $index }; @@ -631,7 +631,7 @@ sub replace_oids { chomp(my $cmt = $self->get_mark(":$mark")) if $nreplace; $self->{nchg} = 0; # prevent _update_git_info until update-ref: $self->done; - my @git = ('git', "--git-dir=$git->{git_dir}"); + my @git = @{$git->cmd}; run_die([@git, qw(update-ref), $old, $tmp]) if $nreplace; diff --git a/lib/PublicInbox/LeiBlob.pm b/lib/PublicInbox/LeiBlob.pm index 00697097..7b2ea434 100644 --- a/lib/PublicInbox/LeiBlob.pm +++ b/lib/PublicInbox/LeiBlob.pm @@ -36,14 +36,13 @@ sub solver_user_cb { # called by solver when done ref($res) eq 'ARRAY' or return $lei->child_error(0, $$log_buf); $lei->qerr($$log_buf); my ($git, $oid, $type, $size, $di) = @$res; - my $gd = $git->{git_dir}; # don't try to support all the git-show(1) options for non-blob, # this is just a convenience: - $type ne 'blob' and - warn "# $oid is a $type of $size bytes in:\n#\t$gd\n"; - - my $cmd = [ 'git', "--git-dir=$gd", 'show', $oid ]; + $type ne 'blob' and warn <<EOM; +# $oid is a $type of $size bytes in:\n#\t$git->{git_dir} +EOM + my $cmd = $git->cmd('show', $oid); my $rdr = { 1 => $lei->{1}, 2 => $lei->{2} }; run_wait($cmd, $lei->{env}, $rdr) and $lei->child_error($?); } diff --git a/lib/PublicInbox/LeiMirror.pm b/lib/PublicInbox/LeiMirror.pm index 08e61e4b..e7c265bd 100644 --- a/lib/PublicInbox/LeiMirror.pm +++ b/lib/PublicInbox/LeiMirror.pm @@ -24,6 +24,7 @@ use POSIX qw(strftime); use PublicInbox::Admin qw(fmt_localtime); use autodie qw(chdir chmod close open pipe readlink seek symlink sysopen sysseek truncate unlink); +use PublicInbox::Git qw(git_exe); our $LIVE; # pid => callback our $FGRP_TODO; # objstore -> [[ to resume ], [ to clone ]] @@ -105,7 +106,7 @@ E: confused by scraping <$uri>, got ambiguous results: sub clone_cmd { my ($lei, $opt) = @_; - my @cmd = qw(git); + my @cmd = (git_exe); $opt->{$_} = $lei->{$_} for (0..2); # we support "-c $key=$val" for arbitrary git config options # e.g.: git -c http.proxy=socks5h://127.0.0.1:9050 @@ -291,7 +292,7 @@ sub upr { # feed `git update-ref --stdin -z' verbosely sub start_update_ref { my ($fgrp) = @_; pipe(my $r, my $w); - my $cmd = [ 'git', "--git-dir=$fgrp->{cur_dst}", + my $cmd = [ git_exe, "--git-dir=$fgrp->{cur_dst}", qw(update-ref --stdin -z) ]; my $pack = on_destroy \&satellite_done, $fgrp; start_cmd($fgrp, $cmd, { 0 => $r, 2 => $fgrp->{lei}->{2} }, $pack); @@ -353,7 +354,7 @@ sub satellite_done { sub pack_refs { my ($self, $git_dir) = @_; - my $cmd = [ 'git', "--git-dir=$git_dir", qw(pack-refs --all --prune) ]; + my $cmd = [git_exe, "--git-dir=$git_dir", qw(pack-refs --all --prune)]; start_cmd($self, $cmd, { 2 => $self->{lei}->{2} }); } @@ -374,14 +375,15 @@ sub fgrpv_done { my $rn = $fgrp->{-remote}; my %opt = ( 2 => $fgrp->{lei}->{2} ); my $update_ref = on_destroy \&fgrp_update, $fgrp; - my $src = [ 'git', "--git-dir=$fgrp->{-osdir}", 'for-each-ref', + my $src = [ git_exe, "--git-dir=$fgrp->{-osdir}", + 'for-each-ref', "--format=refs/%(refname:lstrip=3)%00%(objectname)", "refs/remotes/$rn/" ]; open(my $sfh, '+>', undef); $fgrp->{srcfh} = $sfh; start_cmd($fgrp, $src, { %opt, 1 => $sfh }, $update_ref); - my $dst = [ 'git', "--git-dir=$fgrp->{cur_dst}", 'for-each-ref', - '--format=%(refname)%00%(objectname)' ]; + my $dst = [ git_exe, "--git-dir=$fgrp->{cur_dst}", + 'for-each-ref', '--format=%(refname)%00%(objectname)' ]; open(my $dfh, '+>', undef); $fgrp->{dstfh} = $dfh; start_cmd($fgrp, $dst, { %opt, 1 => $dfh }, $update_ref); @@ -399,7 +401,7 @@ sub fgrp_fetch_all { # system argv limits: my $grp = 'fgrptmp'; - my @git = (@{$self->{-torsocks}}, 'git'); + my @git = (@{$self->{-torsocks}}, git_exe); my $j = $self->{lei}->{opt}->{jobs}; my $opt = {}; my @fetch = do { @@ -413,7 +415,7 @@ sub fgrp_fetch_all { my ($old, $new) = @$fgrp_old_new; @$old = sort { $b->{-sort} <=> $a->{-sort} } @$old; # $new is ordered by {references} - my $cmd = ['git', "--git-dir=$osdir", qw(config -f), $f ]; + my $cmd = [ git_exe, "--git-dir=$osdir", qw(config -f), $f ]; # clobber settings from previous run atomically for ("remotes.$grp", 'fetch.hideRefs') { @@ -541,7 +543,7 @@ sub cmp_fp_do { return if $cur_ent->{fingerprint} eq $new; } my $dst = $self->{cur_dst} // $self->{dst}; - my $cmd = ['git', "--git-dir=$dst", 'show-ref']; + my $cmd = [git_exe, "--git-dir=$dst", 'show-ref']; my $opt = { 2 => $self->{lei}->{2} }; open($opt->{1}, '+>', undef); $self->{-show_ref} = $opt->{1}; @@ -555,7 +557,7 @@ sub resume_fetch { my ($self, $uri, $fini) = @_; return if !keep_going($self); my $dst = $self->{cur_dst} // $self->{dst}; - my @git = ('git', "--git-dir=$dst"); + my @git = (git_exe, "--git-dir=$dst"); my $opt = { 2 => $self->{lei}->{2} }; my $rn = 'random'.int(rand(1 << 30)); for ("url=$uri", "fetch=+refs/*:refs/*", 'mirror=true') { @@ -755,7 +757,7 @@ sub update_ent { my $cur = $self->{-local_manifest}->{$key}->{fingerprint} // "\0"; my $dst = $self->{cur_dst} // $self->{dst}; if (defined($new) && $new ne $cur) { - my $cmd = ['git', "--git-dir=$dst", 'show-ref']; + my $cmd = [git_exe, "--git-dir=$dst", 'show-ref']; my $opt = { 2 => $self->{lei}->{2} }; open($opt->{1}, '+>', undef); $self->{-show_ref_up} = $opt->{1}; @@ -766,7 +768,7 @@ sub update_ent { $cur = $self->{-local_manifest}->{$key}->{head} // "\0"; if (defined($new) && $new ne $cur) { # n.b. grokmirror writes raw contents to $dst/HEAD w/o locking - my $cmd = [ 'git', "--git-dir=$dst" ]; + my $cmd = [ git_exe, "--git-dir=$dst" ]; if ($new =~ s/\Aref: //) { push @$cmd, qw(symbolic-ref HEAD), $new; } elsif ($new =~ /\A[a-f0-9]{40,}\z/) { @@ -811,7 +813,8 @@ sub update_ent { $cur = $self->{-local_manifest}->{$key}->{owner} // "\0"; return if $cur eq $new; utf8::encode($new); # to octets - my $cmd = [ qw(git config -f), "$dst/config", 'gitweb.owner', $new ]; + my $cmd = [ git_exe, qw(config -f), "$dst/config", + 'gitweb.owner', $new ]; start_cmd($self, $cmd, { 2 => $self->{lei}->{2} }); } diff --git a/lib/PublicInbox/LeiRediff.pm b/lib/PublicInbox/LeiRediff.pm index 35728330..66359dd4 100644 --- a/lib/PublicInbox/LeiRediff.pm +++ b/lib/PublicInbox/LeiRediff.pm @@ -119,17 +119,16 @@ EOM map { $_->git_path('objects')."\n" } @{$self->{gits}}; $rw = PublicInbox::Git->new($d); } - my $w = popen_wr(['git', "--git-dir=$rw->{git_dir}", - qw(fast-import --quiet --done --date-format=raw)], + my $w = popen_wr($rw->cmd(qw(fast-import + --quiet --done --date-format=raw)), $lei->{env}, { 2 => $lei->{2} }); print $w $ta, "\n", $tb, "\ndone\n" or die "print fast-import: $!"; $w->close or die "close w fast-import: \$?=$? \$!=$!"; - my $cmd = [ 'diff' ]; + my $cmd = $rw->cmd('diff'); _lei_diff_prepare($lei, $cmd); - $lei->qerr("# git @$cmd"); + $lei->qerr("# git @$cmd[2..$#$cmd]"); push @$cmd, qw(A B); - unshift @$cmd, 'git', "--git-dir=$rw->{git_dir}"; run_wait($cmd, $lei->{env}, { 2 => $lei->{2}, 1 => $lei->{1} }) and $lei->child_error($?); # for git diff --exit-code undef; diff --git a/lib/PublicInbox/LeiSavedSearch.pm b/lib/PublicInbox/LeiSavedSearch.pm index 9ae9dcdb..1c8a1f73 100644 --- a/lib/PublicInbox/LeiSavedSearch.pm +++ b/lib/PublicInbox/LeiSavedSearch.pm @@ -263,8 +263,6 @@ sub reset_dedupe { sub mm { undef } -sub altid_map { {} } - sub cloneurl { [] } # find existing directory containing a `lei.saved-search' file based on diff --git a/lib/PublicInbox/RepoAtom.pm b/lib/PublicInbox/RepoAtom.pm index ab0f2fcc..eb0ed3c7 100644 --- a/lib/PublicInbox/RepoAtom.pm +++ b/lib/PublicInbox/RepoAtom.pm @@ -94,11 +94,10 @@ xmlns="http://www.w3.org/1999/xhtml"><pre style="white-space:pre-wrap"> sub srv_tags_atom { my ($ctx) = @_; my $max = 50; # TODO configurable - my @cmd = ('git', "--git-dir=$ctx->{git}->{git_dir}", - qw(for-each-ref --sort=-creatordate), "--count=$max", - '--perl', $EACH_REF_FMT, 'refs/tags'); + my $cmd = $ctx->{git}->cmd(qw(for-each-ref --sort=-creatordate), + "--count=$max", '--perl', $EACH_REF_FMT, 'refs/tags'); $ctx->{-feed_title} = "$ctx->{git}->{nick} tags"; - my $qsp = PublicInbox::Qspawn->new(\@cmd); + my $qsp = PublicInbox::Qspawn->new($cmd); $ctx->{-is_tag} = 1; $qsp->psgi_yield($ctx->{env}, undef, \&atom_ok, $ctx); } @@ -107,20 +106,19 @@ sub srv_atom { my ($ctx, $path) = @_; return if index($path, '//') >= 0 || index($path, '/') == 0; my $max = 50; # TODO configurable - my @cmd = ('git', "--git-dir=$ctx->{git}->{git_dir}", - qw(log --no-notes --no-color --no-abbrev), - $ATOM_FMT, "-$max"); + my $cmd = $ctx->{git}->cmd(qw(log --no-notes --no-color --no-abbrev), + $ATOM_FMT, "-$max"); my $tip = $ctx->{qp}->{h}; # same as cgit $ctx->{-feed_title} = $ctx->{git}->{nick}; $ctx->{-feed_title} .= " $path" if $path ne ''; if (defined($tip)) { - push @cmd, $tip; + push @$cmd, $tip; $ctx->{-feed_title} .= ", $tip"; } # else: let git decide based on HEAD if $tip isn't defined - push @cmd, '--'; - push @cmd, $path if $path ne ''; - my $qsp = PublicInbox::Qspawn->new(\@cmd, undef, + push @$cmd, '--'; + push @$cmd, $path if $path ne ''; + my $qsp = PublicInbox::Qspawn->new($cmd, undef, { quiet => 1, 2 => $ctx->{lh} }); $qsp->psgi_yield($ctx->{env}, undef, \&atom_ok, $ctx); } diff --git a/lib/PublicInbox/RepoSnapshot.pm b/lib/PublicInbox/RepoSnapshot.pm index 4c372569..bff97bc8 100644 --- a/lib/PublicInbox/RepoSnapshot.pm +++ b/lib/PublicInbox/RepoSnapshot.pm @@ -50,15 +50,13 @@ sub ver_check { # git->check_async callback delete($ctx->{env}->{'qspawn.wcb'})->(r(404)); } else { # found, done: $ctx->{etag} = $oid; - my @cfg; + my $cmd = $ctx->{git}->cmd; if (my $cmd = $FMT_CFG{$ctx->{snap_fmt}}) { - @cfg = ('-c', "tar.$ctx->{snap_fmt}.command=$cmd"); + push @$cmd, '-c', "tar.$ctx->{snap_fmt}.command=$cmd"; } - my $qsp = PublicInbox::Qspawn->new(['git', @cfg, - "--git-dir=$ctx->{git}->{git_dir}", 'archive', - "--prefix=$ctx->{snap_pfx}/", - "--format=$ctx->{snap_fmt}", $treeish], undef, - { quiet => 1 }); + push @$cmd, 'archive', "--prefix=$ctx->{snap_pfx}/", + "--format=$ctx->{snap_fmt}", $treeish; + my $qsp = PublicInbox::Qspawn->new($cmd, undef, { quiet => 1 }); $qsp->psgi_yield($ctx->{env}, undef, \&archive_hdr, $ctx); } } diff --git a/lib/PublicInbox/RepoTree.pm b/lib/PublicInbox/RepoTree.pm index 5c73531a..4c85f9a8 100644 --- a/lib/PublicInbox/RepoTree.pm +++ b/lib/PublicInbox/RepoTree.pm @@ -51,8 +51,8 @@ sub find_missing { $res->[0] = 404; return delete($ctx->{-wcb})->($res); } - my $cmd = ['git', "--git-dir=$ctx->{git}->{git_dir}", - qw(log --no-color -1), '--pretty=%H %h %s (%as)' ]; + my $cmd = $ctx->{git}->cmd(qw(log --no-color -1), + '--pretty=%H %h %s (%as)'); push @$cmd, $ctx->{qp}->{h} if defined($ctx->{qp}->{h}); push @$cmd, '--'; push @$cmd, $ctx->{-path}; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index fbdb48a3..eb5e67ba 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -54,6 +54,9 @@ use constant { # # v1.6.0 adds BYTES, UID and THREADID values SCHEMA_VERSION => 15, + + # we may have up to 8 FDs per shard (depends on Xapian *shrug*) + SHARD_COST => 8, }; use PublicInbox::Smsg; @@ -92,6 +95,7 @@ our @XH_SPEC = ( 'K=i', # timeout kill after i seconds 'O=s', # eidx_key 'T=i', # threadid + 'Q=s@', # query prefixes "$user_prefix[:=]$XPREFIX" ); sub load_xapian () { @@ -435,8 +439,8 @@ sub xhc_start_maybe (@) { $xhc; } -sub xh_opt ($) { - my ($opt) = @_; +sub xh_opt ($$) { + my ($self, $opt) = @_; my $lim = $opt->{limit} || 50; my @ret; push @ret, '-o', $opt->{offset} if $opt->{offset}; @@ -458,7 +462,16 @@ sub xh_opt ($) { push @ret, '-t' if $opt->{threads}; push @ret, '-T', $opt->{threadid} if defined $opt->{threadid}; push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key}; - @ret; + my $apfx = $self->{-alt_pfx} //= do { + my @tmp; + for (grep /\Aserial:/, @{$self->{altid} // []}) { + my (undef, $pfx) = split /:/, $_; + push @tmp, '-Q', "$pfx=X\U$pfx"; + } + # TODO: arbitrary header indexing goes here + \@tmp; + }; + (@ret, @$apfx); } # returns a true value if actually handled asynchronously, @@ -467,7 +480,7 @@ sub async_mset { my ($self, $qry_str, $opt, $cb, @args) = @_; if ($XHC) { # unconditionally retrieving pct + rank for now xdb($self); # populate {nshards} - my @margs = ($self->xh_args, xh_opt($opt)); + my @margs = ($self->xh_args, xh_opt($self, $opt), '--'); my $ret = eval { my $rd = $XHC->mkreq(undef, 'mset', @margs, $qry_str); PublicInbox::XhcMset->maybe_new($rd, $self, $cb, @args); @@ -630,7 +643,7 @@ EOM $ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n} } } - # TODO: altid support + # altid support is handled in xh_opt and srch_init_extra in XH for my $name (sort keys %prob_prefix) { for (split(/ /, $prob_prefix{$name})) { $ret .= qq{\tqp->add_prefix("$name", "$_");\n} @@ -719,4 +732,17 @@ sub get_doc ($$) { } } +# not sure where best to put this... +sub ulimit_n () { + my $n; + if (eval { require BSD::Resource; 1 }) { + my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); + ($n, undef) = BSD::Resource::getrlimit($NOFILE); + } else { + require PublicInbox::Spawn; + $n = PublicInbox::Spawn::run_qx([qw(/bin/sh -c), 'ulimit -n']); + } + $n; +} + 1; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 1cbf6d23..4fd493d9 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -119,7 +119,7 @@ sub load_xapian_writable () { my $ver = eval 'v'.join('.', eval($xap.'::major_version()'), eval($xap.'::minor_version()'), eval($xap.'::revision()')); - if ($ver ge 1.4) { # new flags in Xapian 1.4 + if ($ver ge v1.4) { # new flags in Xapian 1.4 $DB_NO_SYNC = 0x4; $DB_DANGEROUS = 0x10; } @@ -1003,8 +1003,7 @@ sub prepare_stack ($$) { sub is_ancestor ($$$) { my ($git, $cur, $tip) = @_; return 0 unless $git->check($cur); - my $cmd = [ 'git', "--git-dir=$git->{git_dir}", - qw(merge-base --is-ancestor), $cur, $tip ]; + my $cmd = $git->cmd(qw(merge-base --is-ancestor), $cur, $tip); run_wait($cmd) == 0; } diff --git a/lib/PublicInbox/SolverGit.pm b/lib/PublicInbox/SolverGit.pm index 296e7d17..b5f6b96e 100644 --- a/lib/PublicInbox/SolverGit.pm +++ b/lib/PublicInbox/SolverGit.pm @@ -13,7 +13,7 @@ use v5.10.1; use File::Temp 0.19 (); # 0.19 for ->newdir use autodie qw(mkdir); use Fcntl qw(SEEK_SET); -use PublicInbox::Git qw(git_unquote git_quote); +use PublicInbox::Git qw(git_unquote git_quote git_exe); use PublicInbox::IO qw(write_file); use PublicInbox::MsgIter qw(msg_part_text); use PublicInbox::Qspawn; @@ -136,6 +136,12 @@ sub extract_diff ($$) { if ($cte =~ /\bquoted-printable\b/i && $part->crlf eq "\n") { $s =~ s/\r\n/\n/sg; } + + # Quiet "Complex regular subexpression recursion limit" warning. + # Not much we can do about it, but it's no longer relevant to + # Perl 5.3x (the warning was removed in 5.37.1, and actual + # recursino sometime before then). + no warnings 'regexp'; $s =~ m!( # $1 start header lines we save for debugging: # everything before ^index is optional, but we don't @@ -287,7 +293,7 @@ sub prepare_index ($) { dbg($self, 'preparing index'); my $rdr = { 0 => $in }; - my $cmd = [ qw(git update-index -z --index-info) ]; + my $cmd = [ git_exe, qw(update-index -z --index-info) ]; my $qsp = PublicInbox::Qspawn->new($cmd, $self->{git_env}, $rdr); $path_a = git_quote($path_a); $self->{-msg} = "index prepared:\n$mode_a $oid_full\t$path_a"; @@ -467,7 +473,7 @@ sub apply_result ($$) { # qx_cb skip_identical($self, $patches, $di->{oid_b}); } - my @cmd = qw(git ls-files -s -z); + my @cmd = (git_exe, qw(ls-files -s -z)); my $qsp = PublicInbox::Qspawn->new(\@cmd, $self->{git_env}); $self->{-cur_di} = $di; qsp_qx $self, $qsp, \&ls_files_result; @@ -478,7 +484,7 @@ sub do_git_apply ($) { my $patches = $self->{patches}; # we need --ignore-whitespace because some patches are CRLF - my @cmd = (qw(git apply --cached --ignore-whitespace + my @cmd = (git_exe, qw(apply --cached --ignore-whitespace --unidiff-zero --whitespace=warn --verbose)); my $len = length(join(' ', @cmd)); my $di; # keep track of the last one for "git ls-files" diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm index aeff5d1d..3a67ab54 100644 --- a/lib/PublicInbox/TestCommon.pm +++ b/lib/PublicInbox/TestCommon.pm @@ -168,7 +168,7 @@ sub require_git_http_backend (;$) { my ($nr) = @_; state $ok = do { require PublicInbox::Git; - my $git = PublicInbox::Git::check_git_exe() or plan + my $git = PublicInbox::Git::git_exe() or plan skip_all => 'nothing in public-inbox works w/o git'; my $rdr = { 1 => \my $out, 2 => \my $err }; xsys([$git, qw(http-backend)], undef, $rdr); diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 43f37f60..15a73158 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1071,8 +1071,8 @@ sub unindex_todo ($$$) { return if $before == $after; # ensure any blob can not longer be accessed via dumb HTTP - run_die(['git', "--git-dir=$unit->{git}->{git_dir}", - qw(-c gc.reflogExpire=now gc --prune=all --quiet)]); + run_die($unit->{git}->cmd(qw(-c gc.reflogExpire=now gc + --prune=all --quiet))); } sub sync_ranges ($$) { diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm index f47c2703..83a83698 100644 --- a/lib/PublicInbox/ViewVCS.pm +++ b/lib/PublicInbox/ViewVCS.pm @@ -106,7 +106,7 @@ sub stream_large_blob ($$) { my ($ctx, $res) = @_; $ctx->{-res} = $res; my ($git, $oid, $type, $size, $di) = @$res; - my $cmd = ['git', "--git-dir=$git->{git_dir}", 'cat-file', $type, $oid]; + my $cmd = $git->cmd('cat-file', $type, $oid); my $qsp = PublicInbox::Qspawn->new($cmd); $ctx->{env}->{'qspawn.wcb'} = $ctx->{-wcb}; $qsp->psgi_yield($ctx->{env}, undef, \&stream_blob_parse_hdr, $ctx); @@ -368,10 +368,9 @@ sub stream_patch_parse_hdr { # {parse_hdr} for Qspawn sub show_patch ($$) { my ($ctx, $res) = @_; my ($git, $oid) = @$res; - my @cmd = ('git', "--git-dir=$git->{git_dir}", - qw(format-patch -1 --stdout -C), + my $cmd = $git->cmd(qw(format-patch -1 --stdout -C), "--signature=git format-patch -1 --stdout -C $oid", $oid); - my $qsp = PublicInbox::Qspawn->new(\@cmd); + my $qsp = PublicInbox::Qspawn->new($cmd); $ctx->{env}->{'qspawn.wcb'} = $ctx->{-wcb}; $ctx->{patch_oid} = $oid; $qsp->psgi_yield($ctx->{env}, undef, \&stream_patch_parse_hdr, $ctx); @@ -400,8 +399,8 @@ sub show_other ($$) { # just in case... my ($git, $oid, $type, $size) = @$res; $size > $MAX_SIZE and return html_page($ctx, 200, ascii_html($type)." $oid is too big to show\n". dbg_log($ctx)); - my $cmd = ['git', "--git-dir=$git->{git_dir}", - qw(show --encoding=UTF-8 --no-color --no-abbrev), $oid ]; + my $cmd = $git->cmd(qw(show --encoding=UTF-8 + --no-color --no-abbrev), $oid); my $qsp = PublicInbox::Qspawn->new($cmd); $qsp->{qsp_err} = \($ctx->{-qsp_err} = ''); $qsp->psgi_qx($ctx->{env}, undef, \&show_other_result, $ctx); @@ -487,8 +486,7 @@ sub show_tree ($$) { # also used by RepoTree my ($git, $oid, undef, $size) = @$res; $size > $MAX_SIZE and return html_page($ctx, 200, "tree $oid is too big to show\n". dbg_log($ctx)); - my $cmd = [ 'git', "--git-dir=$git->{git_dir}", - qw(ls-tree -z -l --no-abbrev), $oid ]; + my $cmd = $git->cmd(qw(ls-tree -z -l --no-abbrev), $oid); my $qsp = PublicInbox::Qspawn->new($cmd); $ctx->{tree_oid} = $oid; $qsp->{qsp_err} = \($ctx->{-qsp_err} = ''); diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm index a5e2dc4a..8587f530 100644 --- a/lib/PublicInbox/WwwCoderepo.pm +++ b/lib/PublicInbox/WwwCoderepo.pm @@ -186,7 +186,7 @@ EOM print $zfh "...\n" if $last; # README - my ($bref, $oid, $ref_path) = @{delete $ctx->{qx_res}->{readme}}; + my ($bref, $oid, $ref_path) = @{delete $ctx->{qx_res}->{readme} // []}; if ($bref) { my $l = PublicInbox::Linkify->new; $$bref =~ s/\s*\z//sm; diff --git a/lib/PublicInbox/WwwText.pm b/lib/PublicInbox/WwwText.pm index 5e23005e..8279591a 100644 --- a/lib/PublicInbox/WwwText.pm +++ b/lib/PublicInbox/WwwText.pm @@ -37,7 +37,7 @@ sub get_text { } my $env = $ctx->{env}; if ($raw) { - my $h = delete $ctx->{-res_hdr}; + my $h = delete $ctx->{-res_hdr} // []; $txt = gzf_maybe($h, $env)->zflush($txt) if $code == 200; push @$h, 'Content-Type', 'text/plain', 'Content-Length', length($txt); diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm index 2e20660e..ba41b5d2 100644 --- a/lib/PublicInbox/XapHelper.pm +++ b/lib/PublicInbox/XapHelper.pm @@ -18,7 +18,7 @@ use POSIX qw(:signal_h); use Fcntl qw(LOCK_UN LOCK_EX); use Carp qw(croak); my $X = \%PublicInbox::Search::X; -our (%SRCH, %WORKERS, $nworker, $workerset, $in); +our (%SRCH, %WORKERS, $nworker, $workerset, $in, $SHARD_NFD, $MY_FD_MAX); our $stderr = \*STDERR; sub cmd_test_inspect { @@ -172,29 +172,65 @@ sub cmd_mset { # to be used by WWW + IMAP } } +sub srch_init_extra ($) { + my ($req) = @_; + my $qp = $req->{srch}->{qp}; + for (@{$req->{Q}}) { + my ($upfx, $m, $xpfx) = split /([:=])/; + $xpfx // die "E: bad -Q $_"; + $m = $m eq '=' ? 'add_boolean_prefix' : 'add_prefix'; + $qp->$m($upfx, $xpfx); + } + $req->{srch}->{qp_extra_done} = 1; +} + sub dispatch { my ($req, $cmd, @argv) = @_; my $fn = $req->can("cmd_$cmd") or return; $GLP->getoptionsfromarray(\@argv, $req, @PublicInbox::Search::XH_SPEC) or return; my $dirs = delete $req->{d} or die 'no -d args'; - my $key = join("\0", @$dirs); - $req->{srch} = $SRCH{$key} //= do { - my $new = { qp_flags => $PublicInbox::Search::QP_FLAGS }; + my $key = "-d\0".join("\0-d\0", @$dirs); + $key .= "\0".join("\0", map { ('-Q', $_) } @{$req->{Q}}) if $req->{Q}; + my $new; + $req->{srch} = $SRCH{$key} // do { + $new = { qp_flags => $PublicInbox::Search::QP_FLAGS }; + my $nfd = scalar(@$dirs) * PublicInbox::Search::SHARD_COST; + $SHARD_NFD += $nfd; + if ($SHARD_NFD > $MY_FD_MAX) { + $SHARD_NFD = $nfd; + %SRCH = (); + } my $first = shift @$dirs; - my $slow_phrase = -f "$first/iamchert"; - $new->{xdb} = $X->{Database}->new($first); - for (@$dirs) { - $slow_phrase ||= -f "$_/iamchert"; - $new->{xdb}->add_database($X->{Database}->new($_)); + for my $retried (0, 1) { + my $slow_phrase = -f "$first/iamchert"; + eval { + $new->{xdb} = $X->{Database}->new($first); + for (@$dirs) { + $slow_phrase ||= -f "$_/iamchert"; + $new->{xdb}->add_database( + $X->{Database}->new($_)) + } + }; + last unless $@; + if ($retried) { + die "E: $@\n"; + } else { # may be EMFILE/ENFILE/ENOMEM.... + warn "W: $@, retrying...\n"; + %SRCH = (); + $SHARD_NFD = $nfd; + } + $slow_phrase or $new->{qp_flags} + |= PublicInbox::Search::FLAG_PHRASE(); } - $slow_phrase or - $new->{qp_flags} |= PublicInbox::Search::FLAG_PHRASE(); bless $new, $req->{c} ? 'PublicInbox::CodeSearch' : 'PublicInbox::Search'; $new->{qp} = $new->qparse_new; - $new; + $SRCH{$key} = $new; }; + $req->{srch}->{xdb}->reopen unless $new; + $req->{Q} && !$req->{srch}->{qp_extra_done} and + srch_init_extra $req; my $timeo = $req->{K}; alarm($timeo) if $timeo; $fn->($req, @argv); @@ -288,7 +324,7 @@ sub start (@) { my $c = getsockopt(local $in = \*STDIN, SOL_SOCKET, SO_TYPE); unpack('i', $c) == SOCK_SEQPACKET or die 'stdin is not SOCK_SEQPACKET'; - local (%SRCH, %WORKERS); + local (%SRCH, %WORKERS, $SHARD_NFD, $MY_FD_MAX); PublicInbox::Search::load_xapian(); $GLP->getoptionsfromarray(\@argv, my $opt = { j => 1 }, 'j=i') or die 'bad args'; @@ -297,6 +333,10 @@ sub start (@) { for (@PublicInbox::DS::UNBLOCKABLE, POSIX::SIGUSR1) { $workerset->delset($_) or die "delset($_): $!"; } + $MY_FD_MAX = PublicInbox::Search::ulimit_n // + die "E: unable to get RLIMIT_NOFILE: $!"; + warn "W: RLIMIT_NOFILE=$MY_FD_MAX too low\n" if $MY_FD_MAX < 72; + $MY_FD_MAX -= 64; local $nworker = $opt->{j}; return recv_loop() if $nworker == 0; diff --git a/lib/PublicInbox/XapHelperCxx.pm b/lib/PublicInbox/XapHelperCxx.pm index 74852ad1..922bd583 100644 --- a/lib/PublicInbox/XapHelperCxx.pm +++ b/lib/PublicInbox/XapHelperCxx.pm @@ -34,6 +34,7 @@ my $ldflags = '-Wl,-O1'; $ldflags .= ' -Wl,--compress-debug-sections=zlib' if $^O ne 'openbsd'; my $xflags = ($ENV{CXXFLAGS} // '-Wall -ggdb3 -pipe') . ' ' . ' -DTHREADID=' . PublicInbox::Search::THREADID . + ' -DSHARD_COST=' . PublicInbox::Search::SHARD_COST . ' -DXH_SPEC="'.join('', map { s/=.*/:/; $_ } @PublicInbox::Search::XH_SPEC) . '" ' . ($ENV{LDFLAGS} // $ldflags); diff --git a/lib/PublicInbox/khashl.h b/lib/PublicInbox/khashl.h new file mode 100644 index 00000000..170b81ff --- /dev/null +++ b/lib/PublicInbox/khashl.h @@ -0,0 +1,502 @@ +/* The MIT License + + Copyright (c) 2019-2023 by Attractive Chaos <attractor@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef __AC_KHASHL_H +#define __AC_KHASHL_H + +#define AC_VERSION_KHASHL_H "0.2" + +typedef uint32_t khint32_t; +typedef uint64_t khint64_t; + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define kh_inline inline +#define KH_LOCAL static kh_inline + +#ifndef kcalloc +#define kcalloc(N,Z) xcalloc(N,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +/**************************** + * Simple private functions * + ****************************/ + +#define __kh_used(flag, i) (flag[i>>5] >> (i&0x1fU) & 1U) +#define __kh_set_used(flag, i) (flag[i>>5] |= 1U<<(i&0x1fU)) +#define __kh_set_unused(flag, i) (flag[i>>5] &= ~(1U<<(i&0x1fU))) + +#define __kh_fsize(m) ((m) < 32? 1 : (m)>>5) + +static kh_inline khint_t __kh_h2b(khint_t hash, khint_t bits) { return hash * 2654435769U >> (32 - bits); } + +/******************* + * Hash table base * + *******************/ + +#define __KHASHL_TYPE(HType, khkey_t) \ + typedef struct HType { \ + khint_t bits, count; \ + khint32_t *used; \ + khkey_t *keys; \ + } HType; + +#define __KHASHL_PROTOTYPES(HType, prefix, khkey_t) \ + extern HType *prefix##_init(void); \ + extern void prefix##_destroy(HType *h); \ + extern void prefix##_clear(HType *h); \ + extern khint_t prefix##_getp(const HType *h, const khkey_t *key); \ + extern void prefix##_resize(HType *h, khint_t new_n_buckets); \ + extern khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent); \ + extern void prefix##_del(HType *h, khint_t k); + +#define __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \ + SCOPE HType *prefix##_init(void) { \ + return (HType*)kcalloc(1, sizeof(HType)); \ + } \ + SCOPE void prefix##_release(HType *h) { \ + kfree((void *)h->keys); kfree(h->used); \ + } \ + SCOPE void prefix##_destroy(HType *h) { \ + if (!h) return; \ + prefix##_release(h); \ + kfree(h); \ + } \ + SCOPE void prefix##_clear(HType *h) { \ + if (h && h->used) { \ + khint_t n_buckets = (khint_t)1U << h->bits; \ + memset(h->used, 0, __kh_fsize(n_buckets) * sizeof(khint32_t)); \ + h->count = 0; \ + } \ + } + +#define __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + SCOPE khint_t prefix##_getp_core(const HType *h, const khkey_t *key, khint_t hash) { \ + khint_t i, last, n_buckets, mask; \ + if (!h->keys) return 0; \ + n_buckets = (khint_t)1U << h->bits; \ + mask = n_buckets - 1U; \ + i = last = __kh_h2b(hash, h->bits); \ + while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \ + i = (i + 1U) & mask; \ + if (i == last) return n_buckets; \ + } \ + return !__kh_used(h->used, i)? n_buckets : i; \ + } \ + SCOPE khint_t prefix##_getp(const HType *h, const khkey_t *key) { return prefix##_getp_core(h, key, __hash_fn(*key)); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { return prefix##_getp_core(h, &key, __hash_fn(key)); } + +#define __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { \ + khint32_t *new_used = NULL; \ + khint_t j = 0, x = new_n_buckets, n_buckets, new_bits, new_mask; \ + while ((x >>= 1) != 0) ++j; \ + if (new_n_buckets & (new_n_buckets - 1)) ++j; \ + new_bits = j > 2? j : 2; \ + new_n_buckets = (khint_t)1U << new_bits; \ + if (h->count > (new_n_buckets>>1) + (new_n_buckets>>2)) return; /* noop, requested size is too small */ \ + new_used = (khint32_t*)kcalloc(__kh_fsize(new_n_buckets), sizeof(khint32_t)); \ + n_buckets = h->keys? (khint_t)1U<<h->bits : 0U; \ + if (n_buckets < new_n_buckets) { /* expand */ \ + h->keys = (khkey_t *)xreallocarray(h->keys, \ + new_n_buckets, sizeof(khkey_t)); \ + } /* otherwise shrink */ \ + new_mask = new_n_buckets - 1; \ + for (j = 0; j != n_buckets; ++j) { \ + khkey_t key; \ + if (!__kh_used(h->used, j)) continue; \ + key = h->keys[j]; \ + __kh_set_unused(h->used, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t i; \ + i = __kh_h2b(__hash_fn(key), new_bits); \ + while (__kh_used(new_used, i)) i = (i + 1) & new_mask; \ + __kh_set_used(new_used, i); \ + if (i < n_buckets && __kh_used(h->used, i)) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + __kh_set_unused(h->used, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + break; \ + } \ + } \ + } \ + if (n_buckets > new_n_buckets) /* shrink the hash table */ \ + h->keys = (khkey_t *)xreallocarray(h->keys, \ + new_n_buckets, sizeof(khkey_t)); \ + kfree(h->used); /* free the working space */ \ + h->used = new_used, h->bits = new_bits; \ + } + +#define __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + SCOPE khint_t prefix##_putp_core(HType *h, const khkey_t *key, khint_t hash, int *absent) { \ + khint_t n_buckets, i, last, mask; \ + n_buckets = h->keys? (khint_t)1U<<h->bits : 0U; \ + *absent = -1; \ + if (h->count >= (n_buckets>>1) + (n_buckets>>2)) { /* rehashing */ \ + prefix##_resize(h, n_buckets + 1U); \ + n_buckets = (khint_t)1U<<h->bits; \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + mask = n_buckets - 1; \ + i = last = __kh_h2b(hash, h->bits); \ + while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \ + i = (i + 1U) & mask; \ + if (i == last) break; \ + } \ + if (!__kh_used(h->used, i)) { /* not present at all */ \ + h->keys[i] = *key; \ + __kh_set_used(h->used, i); \ + ++h->count; \ + *absent = 1; \ + } else *absent = 0; /* Don't touch h->keys[i] if present */ \ + return i; \ + } \ + SCOPE khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent) { return prefix##_putp_core(h, key, __hash_fn(*key), absent); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { return prefix##_putp_core(h, &key, __hash_fn(key), absent); } + +#define __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) \ + SCOPE int prefix##_del(HType *h, khint_t i) { \ + khint_t j = i, k, mask, n_buckets; \ + if (!h->keys) return 0; \ + n_buckets = (khint_t)1U<<h->bits; \ + mask = n_buckets - 1U; \ + while (1) { \ + j = (j + 1U) & mask; \ + if (j == i || !__kh_used(h->used, j)) break; /* j==i only when the table is completely full */ \ + k = __kh_h2b(__hash_fn(h->keys[j]), h->bits); \ + if ((j > i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) \ + h->keys[i] = h->keys[j], i = j; \ + } \ + __kh_set_unused(h->used, i); \ + --h->count; \ + return 1; \ + } + +#define KHASHL_DECLARE(HType, prefix, khkey_t) \ + __KHASHL_TYPE(HType, khkey_t) \ + __KHASHL_PROTOTYPES(HType, prefix, khkey_t) + +#define KHASHL_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_TYPE(HType, khkey_t) \ + __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \ + __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) + +#define KHASHE_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + KHASHL_INIT(KH_LOCAL, HType##_sub, prefix##_sub, khkey_t, __hash_fn, __hash_eq) \ + typedef struct HType { \ + khint64_t count:54, bits:8; \ + HType##_sub *sub; \ + } HType; \ + SCOPE HType *prefix##_init_sub(HType *g, size_t bits) { \ + g->bits = bits; \ + g->sub = (HType##_sub*)kcalloc(1U<<bits, sizeof(*g->sub)); \ + return g; \ + } \ + SCOPE HType *prefix##_init(void) { \ + HType *g; \ + g = (HType*)kcalloc(1, sizeof(*g)); \ + return prefix##_init_sub(g, 0); /* unsure about default */ \ + } \ + SCOPE void prefix##_release(HType *g) { \ + int t; \ + for (t = 0; t < 1<<g->bits; ++t) \ + prefix##_sub_release(&g->sub[t]); \ + kfree(g->sub); \ + } \ + SCOPE void prefix##_destroy(HType *g) { \ + if (!g) return; \ + prefix##_release(g); \ + kfree(g); \ + } \ + SCOPE void prefix##_clear(HType *g) { \ + int t; \ + if (!g) return; \ + for (t = 0; t < 1<<g->bits; ++t) \ + prefix##_sub_clear(&g->sub[t]); \ + } \ + SCOPE kh_ensitr_t prefix##_getp(const HType *g, const khkey_t *key) { \ + khint_t hash, low, ret; \ + kh_ensitr_t r; \ + HType##_sub *h; \ + hash = __hash_fn(*key); \ + low = hash & ((1U<<g->bits) - 1); \ + h = &g->sub[low]; \ + ret = prefix##_sub_getp_core(h, key, hash); \ + if (ret >= kh_end(h)) r.sub = low, r.pos = (khint_t)-1; \ + else r.sub = low, r.pos = ret; \ + return r; \ + } \ + SCOPE kh_ensitr_t prefix##_get(const HType *g, const khkey_t key) { return prefix##_getp(g, &key); } \ + SCOPE kh_ensitr_t prefix##_putp(HType *g, const khkey_t *key, int *absent) { \ + khint_t hash, low, ret; \ + kh_ensitr_t r; \ + HType##_sub *h; \ + hash = __hash_fn(*key); \ + low = hash & ((1U<<g->bits) - 1); \ + h = &g->sub[low]; \ + ret = prefix##_sub_putp_core(h, key, hash, absent); \ + if (*absent) ++g->count; \ + if (ret == 1U<<h->bits) r.sub = low, r.pos = (khint_t)-1; \ + else r.sub = low, r.pos = ret; \ + return r; \ + } \ + SCOPE kh_ensitr_t prefix##_put(HType *g, const khkey_t key, int *absent) { return prefix##_putp(g, &key, absent); } \ + SCOPE int prefix##_del(HType *g, kh_ensitr_t itr) { \ + HType##_sub *h = &g->sub[itr.sub]; \ + int ret; \ + ret = prefix##_sub_del(h, itr.pos); \ + if (ret) --g->count; \ + return ret; \ + } + +/***************************** + * More convenient interface * + *****************************/ + +#define __kh_packed /* noop, we use -Werror=address-of-packed-member */ +#define __kh_cached_hash(x) ((x).hash) + +#define KHASHL_SET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; } __kh_packed HType##_s_bucket_t; \ + static kh_inline khint_t prefix##_s_hash(HType##_s_bucket_t x) { return __hash_fn(x.key); } \ + static kh_inline int prefix##_s_eq(HType##_s_bucket_t x, HType##_s_bucket_t y) { return __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_s, HType##_s_bucket_t, prefix##_s_hash, prefix##_s_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_s_init(); } \ + SCOPE void prefix##_release(HType *h) { prefix##_s_release(h); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_s_destroy(h); } \ + SCOPE void prefix##_clear(HType *h) { prefix##_s_clear(h); } \ + SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { prefix##_s_resize(h, new_n_buckets); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_s_bucket_t t; t.key = key; return prefix##_s_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_s_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_s_bucket_t t; t.key = key; return prefix##_s_putp(h, &t, absent); } \ + +#define KHASHL_MAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; kh_val_t val; } __kh_packed HType##_m_bucket_t; \ + static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { return __hash_fn(x.key); } \ + static kh_inline int prefix##_m_eq(HType##_m_bucket_t x, HType##_m_bucket_t y) { return __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_m, HType##_m_bucket_t, prefix##_m_hash, prefix##_m_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_m_init(); } \ + SCOPE void prefix##_release(HType *h) { prefix##_m_release(h); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_m_destroy(h); } \ + SCOPE void prefix##_clear(HType *h) { prefix##_m_clear(h); } \ + SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { prefix##_m_resize(h, new_n_buckets); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_m_bucket_t t; t.key = key; return prefix##_m_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_m_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_m_bucket_t t; t.key = key; return prefix##_m_putp(h, &t, absent); } \ + +#define KHASHL_CSET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; khint_t hash; } __kh_packed HType##_cs_bucket_t; \ + static kh_inline int prefix##_cs_eq(HType##_cs_bucket_t x, HType##_cs_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_cs, HType##_cs_bucket_t, __kh_cached_hash, prefix##_cs_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_cs_init(); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_cs_destroy(h); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cs_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cs_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cs_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cs_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cs_putp(h, &t, absent); } + +#define KHASHL_CMAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; kh_val_t val; khint_t hash; } __kh_packed HType##_cm_bucket_t; \ + static kh_inline int prefix##_cm_eq(HType##_cm_bucket_t x, HType##_cm_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \ + KHASHL_INIT(KH_LOCAL, HType, prefix##_cm, HType##_cm_bucket_t, __kh_cached_hash, prefix##_cm_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_cm_init(); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_cm_destroy(h); } \ + SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cm_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cm_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cm_del(h, k); } \ + SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cm_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cm_putp(h, &t, absent); } + +#define KHASHE_MAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \ + typedef struct { khkey_t key; kh_val_t val; } __kh_packed HType##_m_bucket_t; \ + static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { return __hash_fn(x.key); } \ + static kh_inline int prefix##_m_eq(HType##_m_bucket_t x, HType##_m_bucket_t y) { return __hash_eq(x.key, y.key); } \ + KHASHE_INIT(KH_LOCAL, HType, prefix##_m, HType##_m_bucket_t, prefix##_m_hash, prefix##_m_eq) \ + SCOPE HType *prefix##_init(void) { return prefix##_m_init(); } \ + SCOPE void prefix##_release(HType *h) { prefix##_m_release(h); } \ + SCOPE void prefix##_destroy(HType *h) { prefix##_m_destroy(h); } \ + SCOPE void prefix##_clear(HType *h) { prefix##_m_clear(h); } \ + SCOPE void prefix##_resize(HType *h, khint_t ignore) { /* noop */ } \ + SCOPE kh_ensitr_t prefix##_get(const HType *h, khkey_t key) { HType##_m_bucket_t t; t.key = key; return prefix##_m_getp(h, &t); } \ + SCOPE int prefix##_del(HType *h, kh_ensitr_t k) { return prefix##_m_del(h, k); } \ + SCOPE kh_ensitr_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_m_bucket_t t; t.key = key; return prefix##_m_putp(h, &t, absent); } \ + +/************************** + * Public macro functions * + **************************/ + +#define kh_bucket(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->count) + +#define kh_capacity(h) ((h)->keys? 1U<<(h)->bits : 0U) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table + @return The end iterator [khint_t] + */ +#define kh_end(h) kh_capacity(h) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x].key) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->keys[x].val) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) kh_val(h, x) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) __kh_used((h)->used, (x)) + +#define kh_ens_key(g, x) kh_key(&(g)->sub[(x).sub], (x).pos) +#define kh_ens_val(g, x) kh_val(&(g)->sub[(x).sub], (x).pos) +#define kh_ens_exist(g, x) kh_exist(&(g)->sub[(x).sub], (x).pos) +#define kh_ens_is_end(x) ((x).pos == (khint_t)-1) +#define kh_ens_size(g) ((g)->count) + +/************************************** + * Common hash and equality functions * + **************************************/ + +#define kh_eq_generic(a, b) ((a) == (b)) +#define kh_eq_str(a, b) (strcmp((a), (b)) == 0) +#define kh_hash_dummy(x) ((khint_t)(x)) + +static kh_inline khint_t kh_hash_uint32(khint_t key) { + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +static kh_inline khint_t kh_hash_uint64(khint64_t key) { + key = ~key + (key << 21); + key = key ^ key >> 24; + key = (key + (key << 3)) + (key << 8); + key = key ^ key >> 14; + key = (key + (key << 2)) + (key << 4); + key = key ^ key >> 28; + key = key + (key << 31); + return (khint_t)key; +} + +#define KH_FNV_SEED 11 + +static kh_inline khint_t kh_hash_str(const char *s) { /* FNV1a */ + khint_t h = KH_FNV_SEED ^ 2166136261U; + const unsigned char *t = (const unsigned char*)s; + for (; *t; ++t) + h ^= *t, h *= 16777619; + return h; +} + +static kh_inline khint_t kh_hash_bytes(int len, const unsigned char *s) { + khint_t h = KH_FNV_SEED ^ 2166136261U; + int i; + for (i = 0; i < len; ++i) + h ^= s[i], h *= 16777619; + return h; +} + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +#define kh_ens_foreach(g, kvar, vvar, code) do { \ + size_t t; \ + for (t = 0; t < 1<<g->bits; ++t) \ + kh_foreach(&g->sub[t], kvar, vvar, code); \ +} while (0) + +#define kh_ens_foreach_value(g, vvar, code) do { \ + size_t t; \ + for (t = 0; t < 1<<g->bits; ++t) \ + kh_foreach_value(&g->sub[t], vvar, code); \ +} while (0) + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +#endif /* __AC_KHASHL_H */ diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h index 3df3ce91..51ab48bf 100644 --- a/lib/PublicInbox/xap_helper.h +++ b/lib/PublicInbox/xap_helper.h @@ -7,7 +7,7 @@ * this is not linked to Perl in any way. * C (not C++) is used as much as possible to lower the contribution * barrier for hackers who mainly know C (this includes the maintainer). - * Yes, that means we use C stdlib stuff like hsearch and open_memstream + * Yes, that means we use C stdlib stuff like open_memstream * instead their equivalents in the C++ stdlib :P * Everything here is an unstable internal API of public-inbox and * NOT intended for ordinary users; only public-inbox hackers @@ -15,6 +15,9 @@ #ifndef _ALL_SOURCE # define _ALL_SOURCE #endif +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif #if defined(__NetBSD__) && !defined(_OPENBSD_SOURCE) // for reallocarray(3) # define _OPENBSD_SOURCE #endif @@ -34,7 +37,6 @@ #include <errno.h> #include <fcntl.h> #include <limits.h> -#include <search.h> #include <signal.h> #include <stddef.h> #include <stdint.h> @@ -83,6 +85,62 @@ #define ABORT(...) do { warnx(__VA_ARGS__); abort(); } while (0) #define EABORT(...) do { warn(__VA_ARGS__); abort(); } while (0) +static void *xcalloc(size_t nmemb, size_t size) +{ + void *ret = calloc(nmemb, size); + if (!ret) EABORT("calloc(%zu, %zu)", nmemb, size); + return ret; +} + +#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \ + MY_VER(__GLIBC__, __GLIBC_MINOR__, 0) >= MY_VER(2, 28, 0) +# define HAVE_REALLOCARRAY 1 +#elif defined(__OpenBSD__) || defined(__DragonFly__) || \ + defined(__FreeBSD__) || defined(__NetBSD__) +# define HAVE_REALLOCARRAY 1 +#endif + +static void *xreallocarray(void *ptr, size_t nmemb, size_t size) +{ +#ifdef HAVE_REALLOCARRAY + void *ret = reallocarray(ptr, nmemb, size); +#else // can't rely on __builtin_mul_overflow in gcc 4.x :< + void *ret = NULL; + if (nmemb && size > SIZE_MAX / nmemb) + errno = ENOMEM; + else + ret = realloc(ptr, nmemb * size); +#endif + if (!ret) EABORT("reallocarray(..., %zu, %zu)", nmemb, size); + return ret; +} + +#include "khashl.h" + +struct srch { + int ckey_len; // int for comparisons + unsigned qp_flags; + bool qp_extra_done; + Xapian::Database *db; + Xapian::QueryParser *qp; + unsigned char ckey[]; // $shard_path0\0$shard_path1\0... +}; + +static khint_t srch_hash(const struct srch *srch) +{ + return kh_hash_bytes(srch->ckey_len, srch->ckey); +} + +static int srch_eq(const struct srch *a, const struct srch *b) +{ + return a->ckey_len == b->ckey_len ? + !memcmp(a->ckey, b->ckey, (size_t)a->ckey_len) : 0; +} + +KHASHL_CSET_INIT(KH_LOCAL, srch_set, srch_set, struct srch *, + srch_hash, srch_eq) +static srch_set *srch_cache; +static long my_fd_max, shard_nfd; // sock_fd is modified in signal handler, yes, it's SOCK_SEQPACKET static volatile int sock_fd = STDIN_FILENO; static sigset_t fullset, workerset; @@ -91,7 +149,6 @@ static bool alive = true; static FILE *orig_err = stderr; #endif static int orig_err_fd = -1; -static void *srch_tree; // tsearch + tdelete + twalk static pid_t *worker_pids; // nr => pid #define WORKER_MAX USHRT_MAX static unsigned long nworker, nworker_hwm; @@ -111,14 +168,6 @@ enum exc_iter { ITER_ABORT }; -struct srch { - int paths_len; // int for comparisons - unsigned qp_flags; - Xapian::Database *db; - Xapian::QueryParser *qp; - char paths[]; // $shard_path0\0$shard_path1\0... -}; - #define MY_ARG_MAX 256 typedef bool (*cmd)(struct req *); @@ -126,6 +175,8 @@ typedef bool (*cmd)(struct req *); struct req { // argv and pfxv point into global rbuf char *argv[MY_ARG_MAX]; char *pfxv[MY_ARG_MAX]; // -A <prefix> + char *qpfxv[MY_ARG_MAX]; // -Q <user_prefix>[:=]<INTERNAL_PREFIX> + char *dirv[MY_ARG_MAX]; // -d /path/to/XDB(shard) size_t *lenv; // -A <prefix>LENGTH struct srch *srch; char *Pgit_dir; @@ -137,8 +188,7 @@ struct req { // argv and pfxv point into global rbuf unsigned long timeout_sec; size_t nr_out; long sort_col; // value column, negative means BoolWeight - int argc; - int pfxc; + int argc, pfxc, qpfxc, dirc; FILE *fp[2]; // [0] response pipe or sock, [1] status/errors (optional) bool has_input; // fp[0] is bidirectional bool collapse_threads; @@ -372,25 +422,6 @@ static size_t off2size(off_t n) return (size_t)n; } -static char *hsearch_enter_key(char *s) -{ -#if defined(__OpenBSD__) || defined(__DragonFly__) - // hdestroy frees each key on some platforms, - // so give it something to free: - char *ret = strdup(s); - if (!ret) err(EXIT_FAILURE, "strdup"); - return ret; -// AFAIK there's no way to detect musl, assume non-glibc Linux is musl: -#elif defined(__GLIBC__) || defined(__linux__) || \ - defined(__FreeBSD__) || defined(__NetBSD__) - // do nothing on these platforms -#else -#warning untested platform detected, unsure if hdestroy(3) frees keys -#warning contact us at meta@public-inbox.org if you get segfaults -#endif - return s; -} - // for test usage only, we need to ensure the compiler supports // __cleanup__ when exceptions are thrown struct inspect { struct req *req; }; @@ -509,15 +540,6 @@ again: return false; } -static int srch_cmp(const void *pa, const void *pb) // for tfind|tsearch -{ - const struct srch *a = (const struct srch *)pa; - const struct srch *b = (const struct srch *)pb; - int diff = a->paths_len - b->paths_len; - - return diff ? diff : memcmp(a->paths, b->paths, (size_t)a->paths_len); -} - static bool is_chert(const char *dir) { char iamchert[PATH_MAX]; @@ -531,49 +553,85 @@ static bool is_chert(const char *dir) return false; } -static bool srch_init(struct req *req) +static void srch_free(struct srch *srch) +{ + delete srch->qp; + delete srch->db; + free(srch); +} + +static void srch_cache_renew(struct srch *keep) +{ + khint_t k; + + // can't delete while iterating, so just free each + clear + for (k = kh_begin(srch_cache); k != kh_end(srch_cache); k++) { + if (!kh_exist(srch_cache, k)) continue; + struct srch *cur = kh_key(srch_cache, k); + + if (cur != keep) + srch_free(cur); + } + srch_set_cs_clear(srch_cache); + if (keep) { + int absent; + k = srch_set_put(srch_cache, keep, &absent); + assert(absent); + assert(k < kh_end(srch_cache)); + } +} + +static void srch_init(struct req *req) { - char *dirv[MY_ARG_MAX]; int i; struct srch *srch = req->srch; - int dirc = (int)SPLIT2ARGV(dirv, srch->paths, (size_t)srch->paths_len); const unsigned FLAG_PHRASE = Xapian::QueryParser::FLAG_PHRASE; - srch->qp_flags = FLAG_PHRASE | - Xapian::QueryParser::FLAG_BOOLEAN | + srch->qp_flags = Xapian::QueryParser::FLAG_BOOLEAN | Xapian::QueryParser::FLAG_LOVEHATE | Xapian::QueryParser::FLAG_WILDCARD; - if (is_chert(dirv[0])) - srch->qp_flags &= ~FLAG_PHRASE; - try { - srch->db = new Xapian::Database(dirv[0]); - } catch (...) { - warn("E: Xapian::Database(%s)", dirv[0]); - return false; + long nfd = req->dirc * SHARD_COST; + + shard_nfd += nfd; + if (shard_nfd > my_fd_max) { + srch_cache_renew(srch); + shard_nfd = nfd; } - try { - for (i = 1; i < dirc; i++) { - if (srch->qp_flags & FLAG_PHRASE && is_chert(dirv[i])) + for (int retried = 0; retried < 2; retried++) { + srch->qp_flags |= FLAG_PHRASE; + i = 0; + try { + srch->db = new Xapian::Database(req->dirv[i]); + if (is_chert(req->dirv[0])) srch->qp_flags &= ~FLAG_PHRASE; - srch->db->add_database(Xapian::Database(dirv[i])); + for (i = 1; i < req->dirc; i++) { + const char *dir = req->dirv[i]; + if (srch->qp_flags & FLAG_PHRASE && + is_chert(dir)) + srch->qp_flags &= ~FLAG_PHRASE; + srch->db->add_database(Xapian::Database(dir)); + } + break; + } catch (const Xapian::Error & e) { + warnx("E: Xapian::Error: %s (%s)", + e.get_description().c_str(), req->dirv[i]); + } catch (...) { // does this happen? + warn("E: add_database(%s)", req->dirv[i]); + } + if (retried) { + errx(EXIT_FAILURE, "E: can't open %s", req->dirv[i]); + } else { + warnx("retrying..."); + if (srch->db) + delete srch->db; + srch->db = NULL; + srch_cache_renew(srch); } - } catch (...) { - warn("E: add_database(%s)", dirv[i]); - return false; - } - try { - srch->qp = new Xapian::QueryParser; - } catch (...) { - perror("E: Xapian::QueryParser"); - return false; } + // these will raise and die on ENOMEM or other errors + srch->qp = new Xapian::QueryParser; srch->qp->set_default_op(Xapian::Query::OP_AND); srch->qp->set_database(*srch->db); - try { - srch->qp->set_stemmer(Xapian::Stem("english")); - } catch (...) { - perror("E: Xapian::Stem"); - return false; - } + srch->qp->set_stemmer(Xapian::Stem("english")); srch->qp->set_stemming_strategy(Xapian::QueryParser::STEM_SOME); srch->qp->SET_MAX_EXPANSION(100); @@ -581,15 +639,31 @@ static bool srch_init(struct req *req) qp_init_code_search(srch->qp); // CodeSearch.pm else qp_init_mail_search(srch->qp); // Search.pm - return true; } -static void free_srch(void *p) // tdestroy +// setup query parser for altid and arbitrary headers +static void srch_init_extra(struct req *req) { - struct srch *srch = (struct srch *)p; - delete srch->qp; - delete srch->db; - free(srch); + const char *XPFX; + for (int i = 0; i < req->qpfxc; i++) { + size_t len = strlen(req->qpfxv[i]); + char *c = (char *)memchr(req->qpfxv[i], '=', len); + + if (c) { // it's boolean "gmane=XGMANE" + XPFX = c + 1; + *c = 0; + req->srch->qp->add_boolean_prefix(req->qpfxv[i], XPFX); + continue; + } + // maybe it's a non-boolean prefix "blob:XBLOBID" + c = (char *)memchr(req->qpfxv[i], ':', len); + if (!c) + errx(EXIT_FAILURE, "bad -Q %s", req->qpfxv[i]); + XPFX = c + 1; + *c = 0; + req->srch->qp->add_prefix(req->qpfxv[i], XPFX); + } + req->srch->qp_extra_done = true; } static void dispatch(struct req *req) @@ -602,7 +676,6 @@ static void dispatch(struct req *req) } kbuf; char *end; FILE *kfp; - struct srch **s; req->threadid = ULLONG_MAX; for (c = 0; c < (int)MY_ARRAY_SIZE(cmds); c++) { if (cmds[c].fn_len == size && @@ -616,7 +689,7 @@ static void dispatch(struct req *req) kfp = open_memstream(&kbuf.ptr, &size); if (!kfp) err(EXIT_FAILURE, "open_memstream(kbuf)"); // write padding, first (contents don't matter) - fwrite(&req->argv[0], offsetof(struct srch, paths), 1, kfp); + fwrite(&req->argv[0], offsetof(struct srch, ckey), 1, kfp); // global getopt variables: optopt = 0; @@ -628,7 +701,11 @@ static void dispatch(struct req *req) switch (c) { case 'a': req->asc = true; break; case 'c': req->code_search = true; break; - case 'd': fwrite(optarg, strlen(optarg) + 1, 1, kfp); break; + case 'd': + req->dirv[req->dirc++] = optarg; + if (MY_ARG_MAX == req->dirc) ABORT("too many -d"); + fprintf(kfp, "-d%c%s%c", 0, optarg, 0); + break; case 'g': req->Pgit_dir = optarg - 1; break; // pad "P" prefix case 'k': req->sort_col = strtol(optarg, &end, 10); @@ -665,28 +742,35 @@ static void dispatch(struct req *req) if (*end || req->threadid == ULLONG_MAX) ABORT("-T %s", optarg); break; + case 'Q': + req->qpfxv[req->qpfxc++] = optarg; + if (MY_ARG_MAX == req->qpfxc) ABORT("too many -Q"); + fprintf(kfp, "-Q%c%s%c", 0, optarg, 0); + break; default: ABORT("bad switch `-%c'", c); } } ERR_CLOSE(kfp, EXIT_FAILURE); // may ENOMEM, sets kbuf.srch kbuf.srch->db = NULL; kbuf.srch->qp = NULL; - kbuf.srch->paths_len = size - offsetof(struct srch, paths); - if (kbuf.srch->paths_len <= 0) - ABORT("no -d args"); - s = (struct srch **)tsearch(kbuf.srch, &srch_tree, srch_cmp); - if (!s) err(EXIT_FAILURE, "tsearch"); // likely ENOMEM - req->srch = *s; - if (req->srch != kbuf.srch) { // reuse existing - free_srch(kbuf.srch); - } else if (!srch_init(req)) { - assert(kbuf.srch == *((struct srch **)tfind( - kbuf.srch, &srch_tree, srch_cmp))); - void *del = tdelete(kbuf.srch, &srch_tree, srch_cmp); - assert(del); - free_srch(kbuf.srch); - goto cmd_err; // srch_init already warned + kbuf.srch->qp_extra_done = false; + kbuf.srch->ckey_len = size - offsetof(struct srch, ckey); + if (kbuf.srch->ckey_len <= 0 || !req->dirc) + ABORT("no -d args (or too many)"); + + int absent; + khint_t ki = srch_set_put(srch_cache, kbuf.srch, &absent); + assert(ki < kh_end(srch_cache)); + req->srch = kh_key(srch_cache, ki); + if (absent) { + srch_init(req); + } else { + assert(req->srch != kbuf.srch); + srch_free(kbuf.srch); + req->srch->db->reopen(); } + if (req->qpfxc && !req->srch->qp_extra_done) + srch_init_extra(req); if (req->timeout_sec) alarm(req->timeout_sec > UINT_MAX ? UINT_MAX : (unsigned)req->timeout_sec); @@ -700,8 +784,6 @@ static void dispatch(struct req *req) } if (req->timeout_sec) alarm(0); -cmd_err: - return; // just be silent on errors, for now } static void cleanup_pids(void) @@ -841,10 +923,11 @@ static void start_workers(void) static void cleanup_all(void) { cleanup_pids(); -#ifdef __GLIBC__ - tdestroy(srch_tree, free_srch); - srch_tree = NULL; -#endif + if (!srch_cache) + return; + srch_cache_renew(NULL); + srch_set_destroy(srch_cache); + srch_cache = NULL; } static void parent_reopen_logs(void) @@ -978,14 +1061,23 @@ int main(int argc, char *argv[]) socklen_t slen = (socklen_t)sizeof(c); stdout_path = getenv("STDOUT_PATH"); stderr_path = getenv("STDERR_PATH"); + struct rlimit rl; if (getsockopt(sock_fd, SOL_SOCKET, SO_TYPE, &c, &slen)) err(EXIT_FAILURE, "getsockopt"); if (c != SOCK_SEQPACKET) errx(EXIT_FAILURE, "stdin is not SOCK_SEQPACKET"); + if (getrlimit(RLIMIT_NOFILE, &rl)) + err(EXIT_FAILURE, "getrlimit"); + my_fd_max = rl.rlim_cur; + if (my_fd_max < 72) + warnx("W: RLIMIT_NOFILE=%ld too low\n", my_fd_max); + my_fd_max -= 64; + mail_nrp_init(); code_nrp_init(); + srch_cache = srch_set_init(); atexit(cleanup_all); if (!STDERR_ASSIGNABLE) { @@ -1046,8 +1138,7 @@ int main(int argc, char *argv[]) CHECK(int, 0, sigdelset(&workerset, SIGTERM)); CHECK(int, 0, sigdelset(&workerset, SIGCHLD)); nworker_hwm = nworker; - worker_pids = (pid_t *)calloc(nworker, sizeof(pid_t)); - if (!worker_pids) err(EXIT_FAILURE, "calloc"); + worker_pids = (pid_t *)xcalloc(nworker, sizeof(pid_t)); if (pipe(pipefds)) err(EXIT_FAILURE, "pipe"); int fl = fcntl(pipefds[1], F_GETFL); diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h index 311ca05f..8cc6a845 100644 --- a/lib/PublicInbox/xh_cidx.h +++ b/lib/PublicInbox/xh_cidx.h @@ -3,11 +3,38 @@ // This file is only intended to be included by xap_helper.h // it implements pieces used by CodeSearchIdx.pm +// TODO: consider making PublicInbox::CodeSearchIdx emit binary +// (20 or 32-bit) OIDs instead of ASCII hex. It would require +// more code in both Perl and C++, though... + +// assumes trusted data from same host +static inline unsigned int hex2uint(char c) +{ + switch (c) { + case '0' ... '9': return c - '0'; + case 'a' ... 'f': return c - 'a' + 10; + default: return 0xff; // oh well... + } +} + +// assumes trusted data from same host +static kh_inline khint_t sha_hex_hash(const char *hex) +{ + khint_t ret = 0; + + for (size_t shift = 32; shift; ) + ret |= hex2uint(*hex++) << (shift -= 4); + + return ret; +} + +KHASHL_CMAP_INIT(KH_LOCAL, root2id_map, root2id, + const char *, const char *, + sha_hex_hash, kh_eq_str) + static void term_length_extract(struct req *req) { - req->lenv = (size_t *)calloc(req->pfxc, sizeof(size_t)); - if (!req->lenv) - EABORT("lenv = calloc(%d %zu)", req->pfxc, sizeof(size_t)); + req->lenv = (size_t *)xcalloc(req->pfxc, sizeof(size_t)); for (int i = 0; i < req->pfxc; i++) { char *pfx = req->pfxv[i]; // extract trailing digits as length: @@ -101,6 +128,7 @@ struct dump_roots_tmp { void *mm_ptr; char **entries; struct fbuf wbuf; + root2id_map *root2id; int root2off_fd; }; @@ -110,7 +138,8 @@ static void dump_roots_ensure(void *ptr) struct dump_roots_tmp *drt = (struct dump_roots_tmp *)ptr; if (drt->root2off_fd >= 0) xclose(drt->root2off_fd); - hdestroy(); // idempotent + if (drt->root2id) + root2id_cm_destroy(drt->root2id); size_t size = off2size(drt->sb.st_size); if (drt->mm_ptr && munmap(drt->mm_ptr, size)) EABORT("BUG: munmap(%p, %zu)", drt->mm_ptr, size); @@ -118,23 +147,21 @@ static void dump_roots_ensure(void *ptr) fbuf_ensure(&drt->wbuf); } -static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc) +static bool root2offs_str(struct dump_roots_tmp *drt, + struct fbuf *root_offs, Xapian::Document *doc) { Xapian::TermIterator cur = doc->termlist_begin(); Xapian::TermIterator end = doc->termlist_end(); - ENTRY e, *ep; fbuf_init(root_offs); for (cur.skip_to("G"); cur != end; cur++) { std::string tn = *cur; if (!starts_with(&tn, "G", 1)) break; - union { const char *in; char *out; } u; - u.in = tn.c_str() + 1; - e.key = u.out; - ep = hsearch(e, FIND); - if (!ep) ABORT("hsearch miss `%s'", e.key); - // ep->data is a NUL-terminated string matching /[0-9]+/ + khint_t i = root2id_get(drt->root2id, tn.c_str() + 1); + if (i >= kh_end(drt->root2id)) + ABORT("kh get miss `%s'", tn.c_str() + 1); fputc(' ', root_offs->fp); - fputs((const char *)ep->data, root_offs->fp); + // kh_val(...) is a NUL-terminated string matching /[0-9]+/ + fputs(kh_val(drt->root2id, i), root_offs->fp); } fputc('\n', root_offs->fp); ERR_CLOSE(root_offs->fp, EXIT_FAILURE); // ENOMEM @@ -198,7 +225,7 @@ static enum exc_iter dump_roots_iter(struct req *req, CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n" try { Xapian::Document doc = i->get_document(); - if (!root2offs_str(&root_offs, &doc)) + if (!root2offs_str(drt, &root_offs, &doc)) return ITER_ABORT; // bad request, abort for (int p = 0; p < req->pfxc; p++) dump_roots_term(req, p, drt, &root_offs, &doc); @@ -226,8 +253,7 @@ static bool cmd_dump_roots(struct req *req) if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM? err(EXIT_FAILURE, "fstat(%s)", root2off_file); // each entry is at least 43 bytes ({OIDHEX}\0{INT}\0), - // so /32 overestimates the number of expected entries by - // ~%25 (as recommended by Linux hcreate(3) manpage) + // so /32 overestimates the number of expected entries size_t size = off2size(drt.sb.st_size); size_t est = (size / 32) + 1; //+1 for "\0" termination drt.mm_ptr = mmap(NULL, size, PROT_READ, @@ -236,20 +262,19 @@ static bool cmd_dump_roots(struct req *req) err(EXIT_FAILURE, "mmap(%zu, %s)", size, root2off_file); size_t asize = est * 2; if (asize < est) ABORT("too many entries: %zu", est); - drt.entries = (char **)calloc(asize, sizeof(char *)); - if (!drt.entries) - err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *)); + drt.entries = (char **)xcalloc(asize, sizeof(char *)); size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr, size, asize); if (tot <= 0) return false; // split2argv already warned on error - if (!hcreate(est)) - err(EXIT_FAILURE, "hcreate(%zu)", est); + drt.root2id = root2id_init(); + root2id_cm_resize(drt.root2id, est); for (size_t i = 0; i < tot; ) { - ENTRY e; - e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM - e.data = drt.entries[i++]; - if (!hsearch(e, ENTER)) - err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key, - (const char *)e.data); + int absent; + const char *key = drt.entries[i++]; + khint_t k = root2id_put(drt.root2id, key, &absent); + if (!absent) + err(EXIT_FAILURE, "put(%s => %s, ENTER)", + key, drt.entries[i]); + kh_val(drt.root2id, k) = drt.entries[i++]; } req->asc = true; req->sort_col = -1; |