From be998d9f32501d8c3acdaf4d5128a6343d5cb268 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 06:38:58 +0000 Subject: doc: rename our Xapian "partitions" to "shards" For consistency with Xapian documentation (in the "master" branch). --- Documentation/public-inbox-v2-format.pod | 10 +++++----- Documentation/public-inbox-xcpdb.pod | 11 +++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/Documentation/public-inbox-v2-format.pod b/Documentation/public-inbox-v2-format.pod index bdfe7abc..28d3550c 100644 --- a/Documentation/public-inbox-v2-format.pod +++ b/Documentation/public-inbox-v2-format.pod @@ -16,7 +16,7 @@ Message-IDs. The key change in v2 is the inbox is no longer a bare git repository, but a directory with two or more git repositories. v2 divides git repositories by time "epochs" and Xapian -databases for parallelism by "partitions". +databases for parallelism by "shards". =head2 INBOX OVERVIEW AND DEFINITIONS @@ -28,7 +28,7 @@ foo/ # assuming "foo" is the name of the list - inbox.lock # lock file (flock) to protect global state - git/$EPOCH.git # normal git repositories - all.git # empty git repo, alternates to git/$EPOCH.git -- xap$SCHEMA_VERSION/$PART # per-partition Xapian DB +- xap$SCHEMA_VERSION/$SHARD # per-shard Xapian DB - xap$SCHEMA_VERSION/over.sqlite3 # OVER-view DB for NNTP and threading - msgmap.sqlite3 # same the v1 msgmap @@ -95,16 +95,16 @@ are documented at: L -=head2 XAPIAN PARTITIONS +=head2 XAPIAN SHARDS Another second scalability problem in v1 was the inability to utilize multiple CPU cores for Xapian indexing. This is -addressed by using partitions in Xapian to perform import +addressed by using shards in Xapian to perform import indexing in parallel. As with git alternates, Xapian natively supports a read-only interface which transparently abstracts away the knowledge of -multiple partitions. This allows us to simplify our read-only +multiple shards. This allows us to simplify our read-only code paths. The performance of the storage device is now the bottleneck on diff --git a/Documentation/public-inbox-xcpdb.pod b/Documentation/public-inbox-xcpdb.pod index fd8770a4..a13c4efa 100644 --- a/Documentation/public-inbox-xcpdb.pod +++ b/Documentation/public-inbox-xcpdb.pod @@ -21,7 +21,7 @@ L or L. =item --compact In addition to performing the copy operation, run L -on each Xapian partition after copying but before finalizing it. +on each Xapian shard after copying but before finalizing it. Compared to the cost of copying a Xapian database, compacting a Xapian database takes only around 5% of the time required to copy. @@ -32,14 +32,13 @@ the compaction to take hours at-a-time. =item --reshard=N / -R N -Repartition the Xapian database on a L -inbox to C partitions. Since L is not suitable -for merging, users can rely on this switch to repartition the +Reshard the Xapian database on a L +inbox to C shards . Since L is not suitable +for merging, users can rely on this switch to reshard the existing Xapian database(s) to any positive value of C. This is useful in case the Xapian DB was created with too few or -too many partitions given the capabilities of the current -hardware. +too many shards given the capabilities of the current hardware. =item --blocksize / --no-full / --fuller -- cgit v1.2.3-24-ge0c7 From f7651f3fea8cea0ee1f6567cc53f93a57a652a47 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 07:34:07 +0000 Subject: v2writable: update comments regarding xcpdb --reshard Using compact to change shard count was abandoned during the v2 development phase. --- lib/PublicInbox/V2Writable.pm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 76e61e86..db905f92 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -58,8 +58,8 @@ sub count_partitions ($) { my $xpfx = $self->{xpfx}; # always load existing partitions in case core count changes: - # Also, partition count may change while -watch is running - # due to -compact + # Also, shard count may change while -watch is running + # due to "xcpdb --reshard" if (-d $xpfx) { foreach my $part (<$xpfx/*>) { -d $part && $part =~ m!/[0-9]+\z! or next; @@ -288,7 +288,7 @@ sub idx_init { $self->lock_acquire unless ($opt && $opt->{-skip_lock}); $over->create; - # -compact can change partition count while -watch is idle + # xcpdb can change shard count while -watch is idle my $nparts = count_partitions($self); if ($nparts && $nparts != $self->{partitions}) { $self->{partitions} = $nparts; -- cgit v1.2.3-24-ge0c7 From a0eabc015e22e51cbf8f6060abafd5b53a0ae72f Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 07:37:03 +0000 Subject: admin|xapcmd: user-facing messages say "shard" We're slowly getting rid of the word "partition" when it comes to remain consistent with Xapian docs. --- lib/PublicInbox/Admin.pm | 2 +- lib/PublicInbox/Xapcmd.pm | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 8a2f2043..5549b855 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -207,7 +207,7 @@ sub index_inbox { my $n = $v2w->{partitions}; if ($jobs != ($n + 1)) { warn -"Unable to respect --jobs=$jobs, inbox was created with $n partitions\n"; +"Unable to respect --jobs=$jobs, inbox was created with $n shards\n"; } } } diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index e1c6fe3a..e303da9e 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -68,11 +68,11 @@ sub commit_changes ($$$) { my $n = $im->count_partitions; if (defined $new_parts && $n != $new_parts) { die -"BUG: counted $n partitions after repartioning to $new_parts"; +"BUG: counted $n shards after resharding to $new_parts"; } my $prev = $im->{partitions}; if ($pr && $prev != $n) { - $pr->("partition count changed: $prev => $n\n"); + $pr->("shard count changed: $prev => $n\n"); $im->{partitions} = $n; } } @@ -177,7 +177,7 @@ sub run { } # we want temporary directories to be as deep as possible, - # so v2 partitions can keep "xap$SCHEMA_VERSION" on a separate FS. + # so v2 shards can keep "xap$SCHEMA_VERSION" on a separate FS. if ($v == 1) { if (defined $new_parts) { warn @@ -355,9 +355,9 @@ sub cpdb ($$) { if (ref($old) eq 'ARRAY') { ($cur_part) = ($new =~ m!xap[0-9]+/([0-9]+)\b!); defined $cur_part or - die "BUG: could not extract partition # from $new"; + die "BUG: could not extract shard # from $new"; $new_parts = $opt->{reshard}; - defined $new_parts or die 'BUG: got array src w/o --partition'; + defined $new_parts or die 'BUG: got array src w/o --reshard'; # repartitioning, M:N copy means have full read access foreach (@$old) { -- cgit v1.2.3-24-ge0c7 From d304870c3c1e82f0421272e7986fc5c9aafa2889 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 07:51:45 +0000 Subject: rename reference to git epochs as "partitions" Try to remain consistent with our own documentation regarding v2 git "epochs", first. --- lib/PublicInbox/Inbox.pm | 18 +++++++++--------- lib/PublicInbox/WWW.pm | 12 ++++++------ lib/PublicInbox/WwwListing.pm | 2 +- lib/PublicInbox/WwwStream.pm | 12 ++++++------ t/view.t | 2 +- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 10f716ca..c0eb640f 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -125,11 +125,11 @@ sub new { bless $opts, $class; } -sub git_part { - my ($self, $part) = @_; +sub git_epoch { + my ($self, $epoch) = @_; ($self->{version} || 1) == 2 or return; - $self->{"$part.git"} ||= eval { - my $git_dir = "$self->{mainrepo}/git/$part.git"; + $self->{"$epoch.git"} ||= eval { + my $git_dir = "$self->{mainrepo}/git/$epoch.git"; my $g = PublicInbox::Git->new($git_dir); $g->{-httpbackend_limiter} = $self->{-httpbackend_limiter}; # no cleanup needed, we never cat-file off this, only clone @@ -149,13 +149,13 @@ sub git { }; } -sub max_git_part { +sub max_git_epoch { my ($self) = @_; my $v = $self->{version}; return unless defined($v) && $v == 2; - my $part = $self->{-max_git_part}; + my $cur = $self->{-max_git_epoch}; my $changed = git($self)->alternates_changed; - if (!defined($part) || $changed) { + if (!defined($cur) || $changed) { $self->git->cleanup if $changed; my $gits = "$self->{mainrepo}/git"; if (opendir my $dh, $gits) { @@ -164,12 +164,12 @@ sub max_git_part { $git_dir =~ m!\A([0-9]+)\.git\z! or next; $max = $1 if $1 > $max; } - $part = $self->{-max_git_part} = $max if $max >= 0; + $cur = $self->{-max_git_epoch} = $max if $max >= 0; } else { warn "opendir $gits failed: $!\n"; } } - $part; + $cur; } sub mm { diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index e4682636..9021cb52 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -76,9 +76,9 @@ sub call { if ($method eq 'POST') { if ($path_info =~ m!$INBOX_RE/(?:(?:git/)?([0-9]+)(?:\.git)?/)? (git-upload-pack)\z!x) { - my ($part, $path) = ($2, $3); + my ($epoch, $path) = ($2, $3); return invalid_inbox($ctx, $1) || - serve_git($ctx, $part, $path); + serve_git($ctx, $epoch, $path); } elsif ($path_info =~ m!$INBOX_RE/!o) { return invalid_inbox($ctx, $1) || mbox_results($ctx); } @@ -100,8 +100,8 @@ sub call { invalid_inbox($ctx, $1) || get_new($ctx); } elsif ($path_info =~ m!$INBOX_RE/(?:(?:git/)?([0-9]+)(?:\.git)?/)? ($PublicInbox::GitHTTPBackend::ANY)\z!ox) { - my ($part, $path) = ($2, $3); - invalid_inbox($ctx, $1) || serve_git($ctx, $part, $path); + my ($epoch, $path) = ($2, $3); + invalid_inbox($ctx, $1) || serve_git($ctx, $epoch, $path); } elsif ($path_info =~ m!$INBOX_RE/([a-zA-Z0-9_\-]+).mbox\.gz\z!o) { serve_mbox_range($ctx, $1, $2); } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/$END_RE\z!o) { @@ -437,10 +437,10 @@ sub msg_page { } sub serve_git { - my ($ctx, $part, $path) = @_; + my ($ctx, $epoch, $path) = @_; my $env = $ctx->{env}; my $ibx = $ctx->{-inbox}; - my $git = defined $part ? $ibx->git_part($part) : $ibx->git; + my $git = defined $epoch ? $ibx->git_epoch($epoch) : $ibx->git; $git ? PublicInbox::GitHTTPBackend::serve($env, $git, $path) : r404(); } diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm index e2724cc4..e052bbff 100644 --- a/lib/PublicInbox/WwwListing.pm +++ b/lib/PublicInbox/WwwListing.pm @@ -190,7 +190,7 @@ sub js ($$) { my $manifest = { -abs2urlpath => {}, -mtime => 0 }; for my $ibx (@$list) { - if (defined(my $max = $ibx->max_git_part)) { + if (defined(my $max = $ibx->max_git_epoch)) { for my $epoch (0..$max) { manifest_add($manifest, $ibx, $epoch); } diff --git a/lib/PublicInbox/WwwStream.pm b/lib/PublicInbox/WwwStream.pm index f6c50496..082e5ec9 100644 --- a/lib/PublicInbox/WwwStream.pm +++ b/lib/PublicInbox/WwwStream.pm @@ -85,11 +85,11 @@ sub _html_end { my (%seen, @urls); my $http = $ibx->base_url($ctx->{env}); chop $http; # no trailing slash for clone - my $part = $ibx->max_git_part; + my $max = $ibx->max_git_epoch; my $dir = (split(m!/!, $http))[-1]; - if (defined($part)) { # v2 + if (defined($max)) { # v2 $seen{$http} = 1; - for my $i (0..$part) { + for my $i (0..$max) { # old parts my be deleted: -d "$ibx->{mainrepo}/git/$i.git" or next; my $url = "$http/$i"; @@ -101,7 +101,7 @@ sub _html_end { push @urls, $http; } - # FIXME: partitioning in can be different in other repositories, + # FIXME: epoch splits can be different in other repositories, # use the "cloneurl" file as-is for now: foreach my $u (@{$ibx->cloneurl}) { next if $seen{$u}; @@ -109,13 +109,13 @@ sub _html_end { push @urls, $u =~ /\Ahttps?:/ ? qq($u) : $u; } - if (defined($part) || scalar(@urls) > 1) { + if (defined($max) || scalar(@urls) > 1) { $urls .= "\n" . join("\n", map { "\tgit clone --mirror $_" } @urls); } else { $urls .= " git clone --mirror $urls[0]"; } - if (defined $part) { + if (defined $max) { my $addrs = $ibx->{address}; $addrs = join(' ', @$addrs) if ref($addrs) eq 'ARRAY'; $urls .= < sub { 'http://example.com/' }, cloneurl => sub {[]}, nntp_url => sub {[]}, - max_git_part => sub { undef }, + max_git_epoch => sub { undef }, description => sub { '' }), www => Plack::Util::inline_object(style => sub { '' }), }; -- cgit v1.2.3-24-ge0c7 From e48df4770cffd4ef9672a884ea827cbcf566e469 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 07:55:35 +0000 Subject: searchidxpart: start using "shard" in user-visible places We'll name our process title with "shard" instead, and update a few error messages and comments to match. --- lib/PublicInbox/SearchIdxPart.pm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm index 51d81a0a..77fb7d90 100644 --- a/lib/PublicInbox/SearchIdxPart.pm +++ b/lib/PublicInbox/SearchIdxPart.pm @@ -1,8 +1,8 @@ # Copyright (C) 2018 all contributors # License: AGPL-3.0+ -# used to interface with a single Xapian partition in V2 repos. -# See L for more info on how we partition Xapian +# used to interface with a single Xapian shard in V2 repos. +# See L for more info on how we shard Xapian package PublicInbox::SearchIdxPart; use strict; use warnings; @@ -47,7 +47,7 @@ sub spawn_worker { sub partition_worker_loop ($$$$) { my ($self, $r, $part, $bnote) = @_; - $0 = "pi-v2-partition[$part]"; + $0 = "pi-v2-shard[$part]"; my $current_info = ''; my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; local $SIG{__WARN__} = sub { @@ -89,7 +89,7 @@ sub index_raw { my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime) = @_; if (my $w = $self->{w}) { print $w "$bytes $artnum $oid $mid0\n", $$msgref or die - "failed to write partition $!\n"; + "failed to write shard $!\n"; $w->flush or die "failed to flush: $!\n"; } else { $$msgref = undef; -- cgit v1.2.3-24-ge0c7 From 9b20aeef079c67ddaa2911b89f0b1209903d72fb Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 07:56:58 +0000 Subject: v2writable: count_partitions => count_shards Another step towards becoming consistent with Xapian terminology --- lib/PublicInbox/V2Writable.pm | 6 +++--- lib/PublicInbox/Xapcmd.pm | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index db905f92..03e6e951 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -52,7 +52,7 @@ sub nproc_parts ($) { $n < 1 ? 1 : $n; } -sub count_partitions ($) { +sub count_shards ($) { my ($self) = @_; my $nparts = 0; my $xpfx = $self->{xpfx}; @@ -103,7 +103,7 @@ sub new { rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR), last_commit => [], # git repo -> commit }; - $self->{partitions} = count_partitions($self) || nproc_parts($creat); + $self->{partitions} = count_shards($self) || nproc_parts($creat); bless $self, $class; } @@ -289,7 +289,7 @@ sub idx_init { $over->create; # xcpdb can change shard count while -watch is idle - my $nparts = count_partitions($self); + my $nparts = count_shards($self); if ($nparts && $nparts != $self->{partitions}) { $self->{partitions} = $nparts; } diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index e303da9e..89bacc50 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -63,9 +63,9 @@ sub commit_changes ($$$) { if (!$opt->{-coarse_lock}) { $opt->{-skip_lock} = 1; - if ($im->can('count_partitions')) { + if ($im->can('count_shards')) { my $pr = $opt->{-progress}; - my $n = $im->count_partitions; + my $n = $im->count_shards; if (defined $new_parts && $n != $new_parts) { die "BUG: counted $n shards after resharding to $new_parts"; -- cgit v1.2.3-24-ge0c7 From 240de56c97d767cd5c819ac0be858359e8d2eff3 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 07:59:53 +0000 Subject: v2writable: rename {partitions} field to {shards} Our internal data structure should be consistent with Xapian terminology. --- lib/PublicInbox/Admin.pm | 2 +- lib/PublicInbox/V2Writable.pm | 10 +++++----- lib/PublicInbox/Xapcmd.pm | 4 ++-- t/v2writable.t | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index 5549b855..29388ad6 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -204,7 +204,7 @@ sub index_inbox { if ($jobs == 0) { $v2w->{parallel} = 0; } else { - my $n = $v2w->{partitions}; + my $n = $v2w->{shards}; if ($jobs != ($n + 1)) { warn "Unable to respect --jobs=$jobs, inbox was created with $n shards\n"; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 03e6e951..aa13aa8f 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -103,7 +103,7 @@ sub new { rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR), last_commit => [], # git repo -> commit }; - $self->{partitions} = count_shards($self) || nproc_parts($creat); + $self->{shards} = count_shards($self) || nproc_parts($creat); bless $self, $class; } @@ -134,7 +134,7 @@ sub add { sub do_idx ($$$$$$$) { my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_; $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); - my $npart = $self->{partitions}; + my $npart = $self->{shards}; my $part = $num % $npart; my $idx = idx_part($self, $part); $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); @@ -290,12 +290,12 @@ sub idx_init { # xcpdb can change shard count while -watch is idle my $nparts = count_shards($self); - if ($nparts && $nparts != $self->{partitions}) { - $self->{partitions} = $nparts; + if ($nparts && $nparts != $self->{shards}) { + $self->{shards} = $nparts; } # need to create all parts before initializing msgmap FD - my $max = $self->{partitions} - 1; + my $max = $self->{shards} - 1; # idx_parts must be visible to all forked processes my $idx = $self->{idx_parts} = []; diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index 89bacc50..322d827a 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -70,10 +70,10 @@ sub commit_changes ($$$) { die "BUG: counted $n shards after resharding to $new_parts"; } - my $prev = $im->{partitions}; + my $prev = $im->{shards}; if ($pr && $prev != $n) { $pr->("shard count changed: $prev => $n\n"); - $im->{partitions} = $n; + $im->{shards} = $n; } } diff --git a/t/v2writable.t b/t/v2writable.t index b0f88d27..88df2d64 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -34,7 +34,7 @@ my $mime = PublicInbox::MIME->create( ); my $im = PublicInbox::V2Writable->new($ibx, {nproc => 1}); -is($im->{partitions}, 1, 'one partition when forced'); +is($im->{shards}, 1, 'one shard when forced'); ok($im->add($mime), 'ordinary message added'); foreach my $f ("$mainrepo/msgmap.sqlite3", glob("$mainrepo/xap*/*"), @@ -199,7 +199,7 @@ EOF my @before = $git0->qx(@log, qw(--pretty=oneline)); my $before = $git0->qx(@log, qw(--pretty=raw --raw -r)); $im = PublicInbox::V2Writable->new($ibx, {nproc => 2}); - is($im->{partitions}, 1, 'detected single partition from previous'); + is($im->{shards}, 1, 'detected single shard from previous'); my $smsg = $im->remove($mime, 'test removal'); $im->done; my @after = $git0->qx(@log, qw(--pretty=oneline)); -- cgit v1.2.3-24-ge0c7 From 2e0abeb5e0e2ffa55ffef650f91f50086f143019 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 08:02:32 +0000 Subject: tests: change messages to use "shard" instead of partition Another potentially user-facing piece made consistent with Xapian terminology. --- t/indexlevels-mirror.t | 2 +- t/psgi_v2.t | 2 +- t/xcpdb-reshard.t | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t index 35974947..b685da14 100644 --- a/t/indexlevels-mirror.t +++ b/t/indexlevels-mirror.t @@ -138,7 +138,7 @@ sub import_index_incremental { if ($v == 2 && $level eq 'basic') { is_deeply([glob("$ibx->{mainrepo}/xap*/?/")], [], - 'no Xapian partition directories for v2 basic'); + 'no Xapian shard directories for v2 basic'); } if ($level ne 'basic') { ($nr, $msgs) = $ro_mirror->search->reopen->query('m:m@2'); diff --git a/t/psgi_v2.t b/t/psgi_v2.t index 5c358cde..36010689 100644 --- a/t/psgi_v2.t +++ b/t/psgi_v2.t @@ -205,7 +205,7 @@ test_psgi(sub { $www->call(@_) }, sub { $res = $cb->(GET('/v2test/0.git/info/refs')); is($res->code, 200, 'got info refs for dumb clones w/ .git suffix'); $res = $cb->(GET('/v2test/info/refs')); - is($res->code, 404, 'unpartitioned git URL fails'); + is($res->code, 404, 'v2 git URL w/o shard fails'); # ensure conflicted attachments can be resolved foreach my $body (qw(old new)) { diff --git a/t/xcpdb-reshard.t b/t/xcpdb-reshard.t index ce552f54..bf56404d 100644 --- a/t/xcpdb-reshard.t +++ b/t/xcpdb-reshard.t @@ -48,14 +48,14 @@ is(scalar(@parts), $nproc, 'got expected parts'); my $orig = $ibx->over->query_xover(1, $ndoc); my %nums = map {; "$_->{num}" => 1 } @$orig; -# ensure we can go up or down in partitions, or stay the same: +# ensure we can go up or down in shards, or stay the same: for my $R (qw(2 4 1 3 3)) { delete $ibx->{search}; # release old handles is(system(@xcpdb, "-R$R", $ibx->{mainrepo}), 0, "xcpdb -R$R"); my @new_parts = grep(m!/\d+\z!, glob("$ibx->{mainrepo}/xap*/*")); - is(scalar(@new_parts), $R, 'repartitioned to two parts'); + is(scalar(@new_parts), $R, 'resharded to two parts'); my $msgs = $ibx->search->query('s:this'); - is(scalar(@$msgs), $ndoc, 'got expected docs after repartitioning'); + is(scalar(@$msgs), $ndoc, 'got expected docs after resharding'); my %by_mid = map {; "$_->{mid}" => $_ } @$msgs; ok($by_mid{"m$_\@example.com"}, "$_ exists") for (1..$ndoc); -- cgit v1.2.3-24-ge0c7 From d963eec26dea82026789834573c2d5c4df91b3cf Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 17:31:28 +0000 Subject: inboxwritable: s/partitions/shards/ in local var More work towards being consistent with Xapian's own terminology --- lib/PublicInbox/InboxWritable.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index 116f423b..f00141d2 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -31,7 +31,7 @@ sub new { } sub init_inbox { - my ($self, $partitions, $skip_epoch, $skip_artnum) = @_; + my ($self, $shards, $skip_epoch, $skip_artnum) = @_; # TODO: honor skip_artnum my $v = $self->{version} || 1; if ($v == 1) { @@ -39,7 +39,7 @@ sub init_inbox { PublicInbox::Import::init_bare($dir); } else { my $v2w = importer($self); - $v2w->init_inbox($partitions, $skip_epoch, $skip_artnum); + $v2w->init_inbox($shards, $skip_epoch, $skip_artnum); } } -- cgit v1.2.3-24-ge0c7 From 0ffd4a9833da64006d558ef241badfef3c096d1b Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 17:35:04 +0000 Subject: v2: rename SearchIdxPart => SearchIdxShard Another step towards keeping our file and package names consistent with Xapian terminology. --- MANIFEST | 2 +- lib/PublicInbox/SearchIdxPart.pm | 116 -------------------------------------- lib/PublicInbox/SearchIdxShard.pm | 116 ++++++++++++++++++++++++++++++++++++++ lib/PublicInbox/V2Writable.pm | 4 +- 4 files changed, 119 insertions(+), 119 deletions(-) delete mode 100644 lib/PublicInbox/SearchIdxPart.pm create mode 100644 lib/PublicInbox/SearchIdxShard.pm diff --git a/MANIFEST b/MANIFEST index 3f0a79a6..c7693976 100644 --- a/MANIFEST +++ b/MANIFEST @@ -119,7 +119,7 @@ lib/PublicInbox/Reply.pm lib/PublicInbox/SaPlugin/ListMirror.pm lib/PublicInbox/Search.pm lib/PublicInbox/SearchIdx.pm -lib/PublicInbox/SearchIdxPart.pm +lib/PublicInbox/SearchIdxShard.pm lib/PublicInbox/SearchMsg.pm lib/PublicInbox/SearchThread.pm lib/PublicInbox/SearchView.pm diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm deleted file mode 100644 index 77fb7d90..00000000 --- a/lib/PublicInbox/SearchIdxPart.pm +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (C) 2018 all contributors -# License: AGPL-3.0+ - -# used to interface with a single Xapian shard in V2 repos. -# See L for more info on how we shard Xapian -package PublicInbox::SearchIdxPart; -use strict; -use warnings; -use base qw(PublicInbox::SearchIdx); - -sub new { - my ($class, $v2writable, $part) = @_; - my $self = $class->SUPER::new($v2writable->{-inbox}, 1, $part); - # create the DB before forking: - $self->_xdb_acquire; - $self->_xdb_release; - $self->spawn_worker($v2writable, $part) if $v2writable->{parallel}; - $self; -} - -sub spawn_worker { - my ($self, $v2writable, $part) = @_; - my ($r, $w); - pipe($r, $w) or die "pipe failed: $!\n"; - binmode $r, ':raw'; - binmode $w, ':raw'; - my $pid = fork; - defined $pid or die "fork failed: $!\n"; - if ($pid == 0) { - my $bnote = $v2writable->atfork_child; - $v2writable = undef; - close $w or die "failed to close: $!"; - - # F_SETPIPE_SZ = 1031 on Linux; increasing the pipe size here - # speeds V2Writable batch imports across 8 cores by nearly 20% - fcntl($r, 1031, 1048576) if $^O eq 'linux'; - - eval { partition_worker_loop($self, $r, $part, $bnote) }; - die "worker $part died: $@\n" if $@; - die "unexpected MM $self->{mm}" if $self->{mm}; - exit; - } - $self->{pid} = $pid; - $self->{w} = $w; - close $r or die "failed to close: $!"; -} - -sub partition_worker_loop ($$$$) { - my ($self, $r, $part, $bnote) = @_; - $0 = "pi-v2-shard[$part]"; - my $current_info = ''; - my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; - local $SIG{__WARN__} = sub { - chomp $current_info; - $warn_cb->("[$part] $current_info: ", @_); - }; - $self->begin_txn_lazy; - while (my $line = $r->getline) { - $current_info = $line; - if ($line eq "commit\n") { - $self->commit_txn_lazy; - } elsif ($line eq "close\n") { - $self->_xdb_release; - } elsif ($line eq "barrier\n") { - $self->commit_txn_lazy; - # no need to lock < 512 bytes is atomic under POSIX - print $bnote "barrier $part\n" or - die "write failed for barrier $!\n"; - } elsif ($line =~ /\AD ([a-f0-9]{40,}) (.+)\n\z/s) { - my ($oid, $mid) = ($1, $2); - $self->begin_txn_lazy; - $self->remove_by_oid($oid, $mid); - } else { - chomp $line; - my ($len, $artnum, $oid, $mid0) = split(/ /, $line); - $self->begin_txn_lazy; - my $n = read($r, my $msg, $len) or die "read: $!\n"; - $n == $len or die "short read: $n != $len\n"; - my $mime = PublicInbox::MIME->new(\$msg); - $artnum = int($artnum); - $self->add_message($mime, $n, $artnum, $oid, $mid0); - } - } - $self->worker_done; -} - -# called by V2Writable -sub index_raw { - my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime) = @_; - if (my $w = $self->{w}) { - print $w "$bytes $artnum $oid $mid0\n", $$msgref or die - "failed to write shard $!\n"; - $w->flush or die "failed to flush: $!\n"; - } else { - $$msgref = undef; - $self->begin_txn_lazy; - $self->add_message($mime, $bytes, $artnum, $oid, $mid0); - } -} - -sub atfork_child { - close $_[0]->{w} or die "failed to close write pipe: $!\n"; -} - -# called by V2Writable: -sub remote_barrier { - my ($self) = @_; - if (my $w = $self->{w}) { - print $w "barrier\n" or die "failed to print: $!"; - $w->flush or die "failed to flush: $!"; - } else { - $self->commit_txn_lazy; - } -} - -1; diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm new file mode 100644 index 00000000..15ec6578 --- /dev/null +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -0,0 +1,116 @@ +# Copyright (C) 2018 all contributors +# License: AGPL-3.0+ + +# used to interface with a single Xapian shard in V2 repos. +# See L for more info on how we shard Xapian +package PublicInbox::SearchIdxShard; +use strict; +use warnings; +use base qw(PublicInbox::SearchIdx); + +sub new { + my ($class, $v2writable, $shard) = @_; + my $self = $class->SUPER::new($v2writable->{-inbox}, 1, $shard); + # create the DB before forking: + $self->_xdb_acquire; + $self->_xdb_release; + $self->spawn_worker($v2writable, $shard) if $v2writable->{parallel}; + $self; +} + +sub spawn_worker { + my ($self, $v2writable, $shard) = @_; + my ($r, $w); + pipe($r, $w) or die "pipe failed: $!\n"; + binmode $r, ':raw'; + binmode $w, ':raw'; + my $pid = fork; + defined $pid or die "fork failed: $!\n"; + if ($pid == 0) { + my $bnote = $v2writable->atfork_child; + $v2writable = undef; + close $w or die "failed to close: $!"; + + # F_SETPIPE_SZ = 1031 on Linux; increasing the pipe size here + # speeds V2Writable batch imports across 8 cores by nearly 20% + fcntl($r, 1031, 1048576) if $^O eq 'linux'; + + eval { shard_worker_loop($self, $r, $shard, $bnote) }; + die "worker $shard died: $@\n" if $@; + die "unexpected MM $self->{mm}" if $self->{mm}; + exit; + } + $self->{pid} = $pid; + $self->{w} = $w; + close $r or die "failed to close: $!"; +} + +sub shard_worker_loop ($$$$) { + my ($self, $r, $shard, $bnote) = @_; + $0 = "pi-v2-shard[$shard]"; + my $current_info = ''; + my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ }; + local $SIG{__WARN__} = sub { + chomp $current_info; + $warn_cb->("[$shard] $current_info: ", @_); + }; + $self->begin_txn_lazy; + while (my $line = $r->getline) { + $current_info = $line; + if ($line eq "commit\n") { + $self->commit_txn_lazy; + } elsif ($line eq "close\n") { + $self->_xdb_release; + } elsif ($line eq "barrier\n") { + $self->commit_txn_lazy; + # no need to lock < 512 bytes is atomic under POSIX + print $bnote "barrier $shard\n" or + die "write failed for barrier $!\n"; + } elsif ($line =~ /\AD ([a-f0-9]{40,}) (.+)\n\z/s) { + my ($oid, $mid) = ($1, $2); + $self->begin_txn_lazy; + $self->remove_by_oid($oid, $mid); + } else { + chomp $line; + my ($len, $artnum, $oid, $mid0) = split(/ /, $line); + $self->begin_txn_lazy; + my $n = read($r, my $msg, $len) or die "read: $!\n"; + $n == $len or die "short read: $n != $len\n"; + my $mime = PublicInbox::MIME->new(\$msg); + $artnum = int($artnum); + $self->add_message($mime, $n, $artnum, $oid, $mid0); + } + } + $self->worker_done; +} + +# called by V2Writable +sub index_raw { + my ($self, $bytes, $msgref, $artnum, $oid, $mid0, $mime) = @_; + if (my $w = $self->{w}) { + print $w "$bytes $artnum $oid $mid0\n", $$msgref or die + "failed to write shard $!\n"; + $w->flush or die "failed to flush: $!\n"; + } else { + $$msgref = undef; + $self->begin_txn_lazy; + $self->add_message($mime, $bytes, $artnum, $oid, $mid0); + } +} + +sub atfork_child { + close $_[0]->{w} or die "failed to close write pipe: $!\n"; +} + +# called by V2Writable: +sub remote_barrier { + my ($self) = @_; + if (my $w = $self->{w}) { + print $w "barrier\n" or die "failed to print: $!"; + $w->flush or die "failed to flush: $!"; + } else { + $self->commit_txn_lazy; + } +} + +1; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index aa13aa8f..cc9ebfed 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -7,7 +7,7 @@ package PublicInbox::V2Writable; use strict; use warnings; use base qw(PublicInbox::Lock); -use PublicInbox::SearchIdxPart; +use PublicInbox::SearchIdxShard; use PublicInbox::MIME; use PublicInbox::Git; use PublicInbox::Import; @@ -300,7 +300,7 @@ sub idx_init { # idx_parts must be visible to all forked processes my $idx = $self->{idx_parts} = []; for my $i (0..$max) { - push @$idx, PublicInbox::SearchIdxPart->new($self, $i); + push @$idx, PublicInbox::SearchIdxShard->new($self, $i); } # Now that all subprocesses are up, we can open the FDs -- cgit v1.2.3-24-ge0c7 From 3f5370d63630701d0f62f57c15e08d566c35db42 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 17:38:01 +0000 Subject: xapcmd: update comments referencing "partitions" Don't confuse future readers of our code. --- lib/PublicInbox/Xapcmd.pm | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index 322d827a..5e4ac87e 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -40,7 +40,7 @@ sub commit_changes ($$$) { $over = undef; } - if (!defined($new)) { # culled partition + if (!defined($new)) { # culled shard push @old_part, $old; next; } @@ -359,7 +359,7 @@ sub cpdb ($$) { $new_parts = $opt->{reshard}; defined $new_parts or die 'BUG: got array src w/o --reshard'; - # repartitioning, M:N copy means have full read access + # resharding, M:N copy means have full read access foreach (@$old) { if ($src) { my $sub = Search::Xapian::Database->new($_); @@ -397,7 +397,7 @@ sub cpdb ($$) { my $lc = $src->get_metadata('last_commit'); $dst->set_metadata('last_commit', $lc) if $lc; - # only the first xapian partition (0) gets 'indexlevel' + # only the first xapian shard (0) gets 'indexlevel' if ($new =~ m!(?:xapian[0-9]+|xap[0-9]+/0)\b!) { my $l = $src->get_metadata('indexlevel'); if ($l eq 'medium') { @@ -407,7 +407,7 @@ sub cpdb ($$) { if ($pr_data) { my $tot = $src->get_doccount; - # we can only estimate when repartitioning, + # we can only estimate when resharding, # because removed spam causes slight imbalance my $est = ''; if (defined $cur_part && $new_parts > 1) { @@ -459,7 +459,7 @@ sub new { # http://www.tldp.org/LDP/abs/html/exitcodes.html $SIG{INT} = sub { exit(130) }; $SIG{HUP} = $SIG{PIPE} = $SIG{TERM} = sub { exit(1) }; - my $self = bless {}, $_[0]; # old partition => new (tmp) partition + my $self = bless {}, $_[0]; # old shard => new (WIP) shard $owner{"$self"} = $$; $self; } @@ -481,7 +481,7 @@ sub DESTROY { my $owner_pid = delete $owner{"$self"} or return; return if $owner_pid != $$; foreach my $new (values %$self) { - defined $new or next; # may be undef if repartitioning + defined $new or next; # may be undef if resharding remove_tree($new) unless -d "$new/old"; } done($self); -- cgit v1.2.3-24-ge0c7 From 097cca4c583a3fe51945406041d1d88c864689a5 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 17:42:57 +0000 Subject: search*: rename {partition} => {shard} Another step towards keeping our internal data structures consistent with Xapian naming. --- lib/PublicInbox/Search.pm | 6 +++--- lib/PublicInbox/SearchIdx.pm | 17 +++++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 098c97cd..45431ecc 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -131,9 +131,9 @@ sub xdir ($;$) { my $dir = "$self->{mainrepo}/xap" . SCHEMA_VERSION; return $dir if $rdonly; - my $part = $self->{partition}; - defined $part or die "partition not given"; - $dir .= "/$part"; + my $shard = $self->{shard}; + defined $shard or die "shard not given"; + $dir .= "/$shard"; } } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index a088ce75..58b23375 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -29,7 +29,7 @@ use constant { my $xapianlevels = qr/\A(?:full|medium)\z/; sub new { - my ($class, $ibx, $creat, $part) = @_; + my ($class, $ibx, $creat, $shard) = @_; ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; my $levels = qr/\A(?:full|medium|basic)\z/; my $mainrepo = $ibx->{mainrepo}; @@ -62,9 +62,9 @@ sub new { my $dir = $self->xdir; $self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3"); } elsif ($version == 2) { - defined $part or die "partition is required for v2\n"; - # partition is a number - $self->{partition} = $part; + defined $shard or die "shard is required for v2\n"; + # shard is a number + $self->{shard} = $shard; $self->{lock_path} = undef; } else { die "unsupported inbox version=$version\n"; @@ -102,8 +102,8 @@ sub _xdb_acquire { $self->lock_acquire; # don't create empty Xapian directories if we don't need Xapian - my $is_part = defined($self->{partition}); - if (!$is_part || ($is_part && need_xapian($self))) { + my $is_shard = defined($self->{shard}); + if (!$is_shard || ($is_shard && need_xapian($self))) { File::Path::mkpath($dir); } } @@ -824,9 +824,10 @@ sub commit_txn_lazy { $self->{-inbox}->with_umask(sub { if (my $xdb = $self->{xdb}) { - # store 'indexlevel=medium' in v2 part=0 and v1 (only part) + # store 'indexlevel=medium' in v2 shard=0 and + # v1 (only one shard) # This metadata is read by Admin::detect_indexlevel: - if (!$self->{partition} # undef or 0, not >0 + if (!$self->{shard} # undef or 0, not >0 && $self->{indexlevel} eq 'medium') { $xdb->set_metadata('indexlevel', 'medium'); } -- cgit v1.2.3-24-ge0c7 From 7c89ad3a335d3e85d5804586330251c4eb68dc44 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 17:50:45 +0000 Subject: v2writable: avoid "part" in internal subs and fields We'll be using the term "shard" from now on to be consistent with Xapian terminology. --- lib/PublicInbox/V2Writable.pm | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index cc9ebfed..a2313905 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -28,10 +28,10 @@ my $PACKING_FACTOR = 0.4; # waste of FDs and space. It can also lead to excessive IO latency # and slow things down. Users on NVME or other fast storage can # use the NPROC env or switches in our script/public-inbox-* programs -# to increase Xapian partitions. +# to increase Xapian shards our $NPROC_MAX_DEFAULT = 4; -sub nproc_parts ($) { +sub nproc_shards ($) { my ($creat_opt) = @_; if (ref($creat_opt) eq 'HASH') { if (defined(my $n = $creat_opt->{nproc})) { @@ -103,7 +103,7 @@ sub new { rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR), last_commit => [], # git repo -> commit }; - $self->{shards} = count_shards($self) || nproc_parts($creat); + $self->{shards} = count_shards($self) || nproc_shards($creat); bless $self, $class; } @@ -136,7 +136,7 @@ sub do_idx ($$$$$$$) { $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); my $npart = $self->{shards}; my $part = $num % $npart; - my $idx = idx_part($self, $part); + my $idx = idx_shard($self, $part); $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); my $n = $self->{transact_bytes} += $len; $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $npart); @@ -252,15 +252,15 @@ sub num_for_harder { $num; } -sub idx_part { +sub idx_shard { my ($self, $part) = @_; - $self->{idx_parts}->[$part]; + $self->{idx_shards}->[$part]; } # idempotent sub idx_init { my ($self, $opt) = @_; - return if $self->{idx_parts}; + return if $self->{idx_shards}; my $ibx = $self->{-inbox}; # do not leak read-only FDs to child processes, we only have these @@ -297,8 +297,8 @@ sub idx_init { # need to create all parts before initializing msgmap FD my $max = $self->{shards} - 1; - # idx_parts must be visible to all forked processes - my $idx = $self->{idx_parts} = []; + # idx_shards must be visible to all forked processes + my $idx = $self->{idx_shards} = []; for my $i (0..$max) { push @$idx, PublicInbox::SearchIdxShard->new($self, $i); } @@ -370,7 +370,7 @@ sub rewrite_internal ($$;$$$) { } my $over = $self->{over}; my $cids = content_ids($old_mime); - my $parts = $self->{idx_parts}; + my $parts = $self->{idx_shards}; my $removed; my $mids = mids($old_mime->header_obj); @@ -605,7 +605,7 @@ sub checkpoint ($;$) { $im->checkpoint; } } - my $parts = $self->{idx_parts}; + my $parts = $self->{idx_shards}; if ($parts) { my $dbh = $self->{mm}->{dbh}; @@ -652,7 +652,7 @@ sub done { checkpoint($self); my $mm = delete $self->{mm}; $mm->{dbh}->commit if $mm; - my $parts = delete $self->{idx_parts}; + my $parts = delete $self->{idx_shards}; if ($parts) { $_->remote_close for @$parts; } @@ -827,7 +827,7 @@ sub atfork_child { my ($self) = @_; my $fh = delete $self->{reindex_pipe}; close $fh if $fh; - if (my $parts = $self->{idx_parts}) { + if (my $parts = $self->{idx_shards}) { $_->atfork_child foreach @$parts; } if (my $im = $self->{im}) { @@ -1051,7 +1051,7 @@ sub sync_prepare ($$$) { sub unindex_oid_remote ($$$) { my ($self, $oid, $mid) = @_; - $_->remote_remove($oid, $mid) foreach @{$self->{idx_parts}}; + $_->remote_remove($oid, $mid) foreach @{$self->{idx_shards}}; $self->{over}->remove_oid($oid, $mid); } -- cgit v1.2.3-24-ge0c7 From e5f9466ebdac655d0062c37a7b38a61c587c0975 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 18:00:20 +0000 Subject: v2writable: rename local vars to match Xapian terminology --- lib/PublicInbox/V2Writable.pm | 53 ++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index a2313905..502824c8 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -54,22 +54,22 @@ sub nproc_shards ($) { sub count_shards ($) { my ($self) = @_; - my $nparts = 0; + my $n = 0; my $xpfx = $self->{xpfx}; # always load existing partitions in case core count changes: # Also, shard count may change while -watch is running # due to "xcpdb --reshard" if (-d $xpfx) { - foreach my $part (<$xpfx/*>) { - -d $part && $part =~ m!/[0-9]+\z! or next; + foreach my $shard (<$xpfx/*>) { + -d $shard && $shard =~ m!/[0-9]+\z! or next; eval { - Search::Xapian::Database->new($part)->close; - $nparts++; + Search::Xapian::Database->new($shard)->close; + $n++; }; } } - $nparts; + $n; } sub new { @@ -134,12 +134,10 @@ sub add { sub do_idx ($$$$$$$) { my ($self, $msgref, $mime, $len, $num, $oid, $mid0) = @_; $self->{over}->add_overview($mime, $len, $num, $oid, $mid0); - my $npart = $self->{shards}; - my $part = $num % $npart; - my $idx = idx_shard($self, $part); + my $idx = idx_shard($self, $num % $self->{shards}); $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime); my $n = $self->{transact_bytes} += $len; - $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $npart); + $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards}); } sub _add { @@ -253,8 +251,8 @@ sub num_for_harder { } sub idx_shard { - my ($self, $part) = @_; - $self->{idx_shards}->[$part]; + my ($self, $shard_i) = @_; + $self->{idx_shards}->[$shard_i]; } # idempotent @@ -289,9 +287,9 @@ sub idx_init { $over->create; # xcpdb can change shard count while -watch is idle - my $nparts = count_shards($self); - if ($nparts && $nparts != $self->{shards}) { - $self->{shards} = $nparts; + my $nshards = count_shards($self); + if ($nshards && $nshards != $self->{shards}) { + $self->{shards} = $nshards; } # need to create all parts before initializing msgmap FD @@ -370,7 +368,6 @@ sub rewrite_internal ($$;$$$) { } my $over = $self->{over}; my $cids = content_ids($old_mime); - my $parts = $self->{idx_shards}; my $removed; my $mids = mids($old_mime->header_obj); @@ -590,7 +587,7 @@ sub barrier_wait { while (scalar keys %$barrier) { defined(my $l = $r->getline) or die "EOF on barrier_wait: $!"; $l =~ /\Abarrier (\d+)/ or die "bad line on barrier_wait: $l"; - delete $barrier->{$1} or die "bad part[$1] on barrier wait"; + delete $barrier->{$1} or die "bad shard[$1] on barrier wait"; } } @@ -605,8 +602,8 @@ sub checkpoint ($;$) { $im->checkpoint; } } - my $parts = $self->{idx_shards}; - if ($parts) { + my $shards = $self->{idx_shards}; + if ($shards) { my $dbh = $self->{mm}->{dbh}; # SQLite msgmap data is second in importance @@ -617,15 +614,15 @@ sub checkpoint ($;$) { # Now deal with Xapian if ($wait) { - my $barrier = $self->barrier_init(scalar @$parts); + my $barrier = $self->barrier_init(scalar @$shards); # each partition needs to issue a barrier command - $_->remote_barrier for @$parts; + $_->remote_barrier for @$shards; # wait for each Xapian partition $self->barrier_wait($barrier); } else { - $_->remote_commit for @$parts; + $_->remote_commit for @$shards; } # last_commit is special, don't commit these until @@ -652,14 +649,14 @@ sub done { checkpoint($self); my $mm = delete $self->{mm}; $mm->{dbh}->commit if $mm; - my $parts = delete $self->{idx_shards}; - if ($parts) { - $_->remote_close for @$parts; + my $shards = delete $self->{idx_shards}; + if ($shards) { + $_->remote_close for @$shards; } $self->{over}->disconnect; delete $self->{bnote}; $self->{transact_bytes} = 0; - $self->lock_release if $parts; + $self->lock_release if $shards; $self->{-inbox}->git->cleanup; } @@ -827,8 +824,8 @@ sub atfork_child { my ($self) = @_; my $fh = delete $self->{reindex_pipe}; close $fh if $fh; - if (my $parts = $self->{idx_shards}) { - $_->atfork_child foreach @$parts; + if (my $shards = $self->{idx_shards}) { + $_->atfork_child foreach @$shards; } if (my $im = $self->{im}) { $im->atfork_child; -- cgit v1.2.3-24-ge0c7 From 609f38288ce413c186142225ef9225d31fcd908b Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 18:14:42 +0000 Subject: adminedit: "part" => "shard" for local variables --- lib/PublicInbox/AdminEdit.pm | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/PublicInbox/AdminEdit.pm b/lib/PublicInbox/AdminEdit.pm index 169feba0..2e2a8629 100644 --- a/lib/PublicInbox/AdminEdit.pm +++ b/lib/PublicInbox/AdminEdit.pm @@ -29,15 +29,15 @@ sub check_editable ($) { # $ibx->{search} is populated by $ibx->over call my $xdir_ro = $ibx->{search}->xdir(1); - my $npart = 0; - foreach my $part (<$xdir_ro/*>) { - if (-d $part && $part =~ m!/[0-9]+\z!) { + my $nshard = 0; + foreach my $shard (<$xdir_ro/*>) { + if (-d $shard && $shard =~ m!/[0-9]+\z!) { my $bytes = 0; - $bytes += -s $_ foreach glob("$part/*"); - $npart++ if $bytes; + $bytes += -s $_ foreach glob("$shard/*"); + $nshard++ if $bytes; } } - if ($npart) { + if ($nshard) { PublicInbox::Admin::require_or_die('-search'); } else { # somebody could "rm -r" all the Xapian directories; -- cgit v1.2.3-24-ge0c7 From 9126f3ca9f8ebb6b04afbba1e6f84e705f81e1b6 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 18:16:09 +0000 Subject: v2writable: use "epoch" consistently when referring to git repos Be consistent with our own terminology and use "epoch" for [0-9]+\.git repos. The term "partition" is going away entirely. --- lib/PublicInbox/V2Writable.pm | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 502824c8..7a89093c 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -556,7 +556,7 @@ W: $list $rewritten->{rewrites}; } -sub last_commit_part ($$;$) { +sub last_epoch_commit ($$;$) { my ($self, $i, $cmt) = @_; my $v = PublicInbox::Search::SCHEMA_VERSION(); $self->{mm}->last_commit_xap($v, $i, $cmt); @@ -569,7 +569,7 @@ sub set_last_commits ($) { foreach my $i (0..$epoch_max) { defined(my $cmt = $last_commit->[$i]) or next; $last_commit->[$i] = undef; - last_commit_part($self, $i, $cmt); + last_epoch_commit($self, $i, $cmt); } } @@ -927,13 +927,13 @@ sub reindex_oid ($$$$) { # only update last_commit for $i on reindex iff newer than current sub update_last_commit ($$$$) { my ($self, $git, $i, $cmt) = @_; - my $last = last_commit_part($self, $i); + my $last = last_epoch_commit($self, $i); if (defined $last && is_ancestor($git, $last, $cmt)) { my @cmd = (qw(rev-list --count), "$last..$cmt"); chomp(my $n = $git->qx(@cmd)); return if $n ne '' && $n == 0; } - last_commit_part($self, $i, $cmt); + last_epoch_commit($self, $i, $cmt); } sub git_dir_n ($$) { "$_[0]->{-inbox}->{mainrepo}/git/$_[1].git" } @@ -942,7 +942,7 @@ sub last_commits ($$) { my ($self, $epoch_max) = @_; my $heads = []; for (my $i = $epoch_max; $i >= 0; $i--) { - $heads->[$i] = last_commit_part($self, $i); + $heads->[$i] = last_epoch_commit($self, $i); } $heads; } @@ -1013,7 +1013,7 @@ sub sync_prepare ($$$) { for (my $i = $epoch_max; $i >= 0; $i--) { die 'BUG: already indexing!' if $self->{reindex_pipe}; my $git_dir = git_dir_n($self, $i); - -d $git_dir or next; # missing parts are fine + -d $git_dir or next; # missing epochs are fine my $git = PublicInbox::Git->new($git_dir); if ($reindex_heads) { $head = $reindex_heads->[$i] or next; @@ -1123,7 +1123,7 @@ sub index_epoch ($$$) { my $git_dir = git_dir_n($self, $i); die 'BUG: already reindexing!' if $self->{reindex_pipe}; - -d $git_dir or return; # missing parts are fine + -d $git_dir or return; # missing epochs are fine fill_alternates($self, $i); my $git = PublicInbox::Git->new($git_dir); if (my $unindex_range = delete $sync->{unindex_range}->{$i}) { -- cgit v1.2.3-24-ge0c7 From ec32087fbd0af943017d2047eedbdbd59d3291da Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 18:18:06 +0000 Subject: search: use "shard" for local variable Another small step towards terminology consistency with Xapian. --- lib/PublicInbox/Search.pm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 45431ecc..60fc861a 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -143,15 +143,15 @@ sub _xdb ($) { my ($xdb, $slow_phrase); my $qpf = \($self->{qp_flags} ||= $QP_FLAGS); if ($self->{version} >= 2) { - foreach my $part (<$dir/*>) { - -d $part && $part =~ m!/[0-9]+\z! or next; - my $sub = Search::Xapian::Database->new($part); + foreach my $shard (<$dir/*>) { + -d $shard && $shard =~ m!/[0-9]+\z! or next; + my $sub = Search::Xapian::Database->new($shard); if ($xdb) { $xdb->add_database($sub); } else { $xdb = $sub; } - $slow_phrase ||= -f "$part/iamchert"; + $slow_phrase ||= -f "$shard/iamchert"; } } else { $slow_phrase = -f "$dir/iamchert"; -- cgit v1.2.3-24-ge0c7 From a45af9138d4a708ff7818dba91e03b6d4d22e69c Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 21:29:37 +0000 Subject: xapcmd: favor 'shard' over 'part' in local variables Yet another step to keeping our naming consistent with Xapian terminology. --- lib/PublicInbox/Xapcmd.pm | 70 +++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm index 5e4ac87e..819d7829 100644 --- a/lib/PublicInbox/Xapcmd.pm +++ b/lib/PublicInbox/Xapcmd.pm @@ -17,13 +17,13 @@ our @COMPACT_OPT = qw(jobs|j=i quiet|q blocksize|b=s no-full|n fuller|F); sub commit_changes ($$$) { my ($ibx, $tmp, $opt) = @_; - my $new_parts = $opt->{reshard}; + my $reshard = $opt->{reshard}; my $reindex = $opt->{reindex}; my $im = $ibx->importer(0); $im->lock_acquire if !$opt->{-coarse_lock}; $SIG{INT} or die 'BUG: $SIG{INT} not handled'; - my @old_part; + my @old_shard; while (my ($old, $new) = each %$tmp) { my @st = stat($old); @@ -41,7 +41,7 @@ sub commit_changes ($$$) { } if (!defined($new)) { # culled shard - push @old_part, $old; + push @old_shard, $old; next; } @@ -58,7 +58,7 @@ sub commit_changes ($$$) { die "failed to remove $prev: $!\n"; } } - remove_tree(@old_part); + remove_tree(@old_shard); $tmp->done; if (!$opt->{-coarse_lock}) { $opt->{-skip_lock} = 1; @@ -66,9 +66,9 @@ sub commit_changes ($$$) { if ($im->can('count_shards')) { my $pr = $opt->{-progress}; my $n = $im->count_shards; - if (defined $new_parts && $n != $new_parts) { + if (defined $reshard && $n != $reshard) { die -"BUG: counted $n shards after resharding to $new_parts"; +"BUG: counted $n shards after resharding to $reshard"; } my $prev = $im->{shards}; if ($pr && $prev != $n) { @@ -171,17 +171,17 @@ sub run { my $tmp = PublicInbox::Xtmpdirs->new; my $v = $ibx->{version} ||= 1; my @q; - my $new_parts = $opt->{reshard}; - if (defined $new_parts && $new_parts <= 0) { + my $reshard = $opt->{reshard}; + if (defined $reshard && $reshard <= 0) { die "--reshard must be a positive number\n"; } # we want temporary directories to be as deep as possible, # so v2 shards can keep "xap$SCHEMA_VERSION" on a separate FS. if ($v == 1) { - if (defined $new_parts) { + if (defined $reshard) { warn -"--reshard=$new_parts ignored for v1 $ibx->{mainrepo}\n"; +"--reshard=$reshard ignored for v1 $ibx->{mainrepo}\n"; } my $old_parent = dirname($old); same_fs_or_die($old_parent, $old); @@ -191,28 +191,28 @@ sub run { push @q, [ $old, $wip ]; } else { opendir my $dh, $old or die "Failed to opendir $old: $!\n"; - my @old_parts; + my @old_shards; while (defined(my $dn = readdir($dh))) { if ($dn =~ /\A[0-9]+\z/) { - push @old_parts, $dn; + push @old_shards, $dn; } elsif ($dn eq '.' || $dn eq '..') { } elsif ($dn =~ /\Aover\.sqlite3/) { } else { warn "W: skipping unknown dir: $old/$dn\n" } } - die "No Xapian parts found in $old\n" unless @old_parts; + die "No Xapian shards found in $old\n" unless @old_shards; - my ($src, $max_part); - if (!defined($new_parts) || $new_parts == scalar(@old_parts)) { + my ($src, $max_shard); + if (!defined($reshard) || $reshard == scalar(@old_shards)) { # 1:1 copy - $max_part = scalar(@old_parts) - 1; + $max_shard = scalar(@old_shards) - 1; } else { # M:N copy - $max_part = $new_parts - 1; - $src = [ map { "$old/$_" } @old_parts ]; + $max_shard = $reshard - 1; + $src = [ map { "$old/$_" } @old_shards ]; } - foreach my $dn (0..$max_part) { + foreach my $dn (0..$max_shard) { my $tmpl = "$dn-XXXXXXXX"; my $wip = tempdir($tmpl, DIR => $old); same_fs_or_die($old, $wip); @@ -220,7 +220,7 @@ sub run { push @q, [ $src // $cur , $wip ]; $tmp->{$cur} = $wip; } - # mark old parts to be unlinked + # mark old shards to be unlinked if ($src) { $tmp->{$_} ||= undef for @$src; } @@ -305,7 +305,7 @@ sub compact ($$) { } sub cpdb_loop ($$$;$$) { - my ($src, $dst, $pr_data, $cur_part, $new_parts) = @_; + my ($src, $dst, $pr_data, $cur_shard, $reshard) = @_; my ($pr, $fmt, $nr, $pfx); if ($pr_data) { $pr = $pr_data->{pr}; @@ -326,9 +326,9 @@ sub cpdb_loop ($$$;$$) { eval { for (; $it != $end; $it++) { my $docid = $it->get_docid; - if (defined $new_parts) { - my $dst_part = $docid % $new_parts; - next if $dst_part != $cur_part; + if (defined $reshard) { + my $dst_shard = $docid % $reshard; + next if $dst_shard != $cur_shard; } my $doc = $src->get_document($docid); $dst->replace_document($docid, $doc); @@ -350,14 +350,14 @@ sub cpdb_loop ($$$;$$) { sub cpdb ($$) { my ($args, $opt) = @_; my ($old, $new) = @$args; - my ($src, $cur_part); - my $new_parts; + my ($src, $cur_shard); + my $reshard; if (ref($old) eq 'ARRAY') { - ($cur_part) = ($new =~ m!xap[0-9]+/([0-9]+)\b!); - defined $cur_part or + ($cur_shard) = ($new =~ m!xap[0-9]+/([0-9]+)\b!); + defined $cur_shard or die "BUG: could not extract shard # from $new"; - $new_parts = $opt->{reshard}; - defined $new_parts or die 'BUG: got array src w/o --reshard'; + $reshard = $opt->{reshard}; + defined $reshard or die 'BUG: got array src w/o --reshard'; # resharding, M:N copy means have full read access foreach (@$old) { @@ -410,8 +410,8 @@ sub cpdb ($$) { # we can only estimate when resharding, # because removed spam causes slight imbalance my $est = ''; - if (defined $cur_part && $new_parts > 1) { - $tot = int($tot/$new_parts); + if (defined $cur_shard && $reshard > 1) { + $tot = int($tot/$reshard); $est = 'around '; } my $fmt = "$pfx % ".length($tot)."u/$tot\n"; @@ -422,15 +422,15 @@ sub cpdb ($$) { }; } while (cpdb_retryable($src, $pfx)); - if (defined $new_parts) { + if (defined $reshard) { # we rely on document IDs matching NNTP article number, - # so we can't have the combined DB support rewriting + # so we can't have the Xapian sharding DB support rewriting # document IDs. Thus we iterate through each shard # individually. $src = undef; foreach (@$old) { my $old = Search::Xapian::Database->new($_); - cpdb_loop($old, $dst, $pr_data, $cur_part, $new_parts); + cpdb_loop($old, $dst, $pr_data, $cur_shard, $reshard); } } else { cpdb_loop($src, $dst, $pr_data); -- cgit v1.2.3-24-ge0c7 From a060a5a94d02045b48ccb8b3f4105170e52719b2 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 14 Jun 2019 21:30:31 +0000 Subject: t/xcpdb-reshard: use 'shard' term in local variables Another step in maintaining consistency with Xapian docs. --- t/xcpdb-reshard.t | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/t/xcpdb-reshard.t b/t/xcpdb-reshard.t index bf56404d..d921e12f 100644 --- a/t/xcpdb-reshard.t +++ b/t/xcpdb-reshard.t @@ -43,8 +43,8 @@ for my $i (1..$ndoc) { ok($im->add($mime), "message $i added"); } $im->done; -my @parts = grep(m!/\d+\z!, glob("$ibx->{mainrepo}/xap*/*")); -is(scalar(@parts), $nproc, 'got expected parts'); +my @shards = grep(m!/\d+\z!, glob("$ibx->{mainrepo}/xap*/*")); +is(scalar(@shards), $nproc, 'got expected shards'); my $orig = $ibx->over->query_xover(1, $ndoc); my %nums = map {; "$_->{num}" => 1 } @$orig; @@ -52,8 +52,8 @@ my %nums = map {; "$_->{num}" => 1 } @$orig; for my $R (qw(2 4 1 3 3)) { delete $ibx->{search}; # release old handles is(system(@xcpdb, "-R$R", $ibx->{mainrepo}), 0, "xcpdb -R$R"); - my @new_parts = grep(m!/\d+\z!, glob("$ibx->{mainrepo}/xap*/*")); - is(scalar(@new_parts), $R, 'resharded to two parts'); + my @new_shards = grep(m!/\d+\z!, glob("$ibx->{mainrepo}/xap*/*")); + is(scalar(@new_shards), $R, 'resharded to two shards'); my $msgs = $ibx->search->query('s:this'); is(scalar(@$msgs), $ndoc, 'got expected docs after resharding'); my %by_mid = map {; "$_->{mid}" => $_ } @$msgs; @@ -64,7 +64,7 @@ for my $R (qw(2 4 1 3 3)) { # ensure docids in Xapian match NNTP article numbers my $tot = 0; my %tmp = %nums; - foreach my $d (@new_parts) { + foreach my $d (@new_shards) { my $xdb = Search::Xapian::Database->new($d); $tot += $xdb->get_doccount; my $it = $xdb->postlist_begin(''); -- cgit v1.2.3-24-ge0c7 From 27658d2c8b8e51fa64f523c873587273f4f16c46 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Sat, 15 Jun 2019 08:25:44 +0000 Subject: comments: replace "partition" with "shard" Now that the code matches Xapian terminology, ensure our comments match, too. --- lib/PublicInbox/SearchIdx.pm | 2 +- lib/PublicInbox/V2Writable.pm | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 58b23375..665f673a 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -797,7 +797,7 @@ sub remote_close { sub remote_remove { my ($self, $oid, $mid) = @_; if (my $w = $self->{w}) { - # triggers remove_by_oid in a partition + # triggers remove_by_oid in a shard print $w "D $oid $mid\n" or die "failed to write remove $!"; } else { $self->begin_txn_lazy; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 7a89093c..2b3ffa63 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -24,7 +24,7 @@ use IO::Handle; my $PACKING_FACTOR = 0.4; # SATA storage lags behind what CPUs are capable of, so relying on -# nproc(1) can be misleading and having extra Xapian partions is a +# nproc(1) can be misleading and having extra Xapian shards is a # waste of FDs and space. It can also lead to excessive IO latency # and slow things down. Users on NVME or other fast storage can # use the NPROC env or switches in our script/public-inbox-* programs @@ -57,7 +57,7 @@ sub count_shards ($) { my $n = 0; my $xpfx = $self->{xpfx}; - # always load existing partitions in case core count changes: + # always load existing shards in case core count changes: # Also, shard count may change while -watch is running # due to "xcpdb --reshard" if (-d $xpfx) { @@ -292,7 +292,7 @@ sub idx_init { $self->{shards} = $nshards; } - # need to create all parts before initializing msgmap FD + # need to create all shards before initializing msgmap FD my $max = $self->{shards} - 1; # idx_shards must be visible to all forked processes @@ -616,17 +616,17 @@ sub checkpoint ($;$) { if ($wait) { my $barrier = $self->barrier_init(scalar @$shards); - # each partition needs to issue a barrier command + # each shard needs to issue a barrier command $_->remote_barrier for @$shards; - # wait for each Xapian partition + # wait for each Xapian shard $self->barrier_wait($barrier); } else { $_->remote_commit for @$shards; } # last_commit is special, don't commit these until - # remote partitions are done: + # remote shards are done: $dbh->begin_work; set_last_commits($self); $dbh->commit; -- cgit v1.2.3-24-ge0c7