about summary refs log tree commit homepage
path: root/script
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2023-03-21 23:07:21 +0000
committer Eric Wong <e@80x24.org> 2023-03-25 09:37:45 +0000
commit 32fa6be4222d9af593c22a7dc101d8d5e8835511 (patch)
tree 16bbac338b62675b1214bd1fceea4ca4ab2d40cd /script
parent 72dfac803728571c30e7ab8caf005229bc1f39f8 (diff)
download public-inbox-32fa6be4222d9af593c22a7dc101d8d5e8835511.tar.gz
It seems relying on root commits is a reasonable way to
deduplicate and handle repositories with common history.

I initially wanted to shoehorn this into extindex, but decided a
separate Xapian index layout capable of being EITHER external to
handle many forks or internal (in $GIT_DIR/public-inbox-cindex)
for small projects is the right way to go.

Unlike most existing parts of public-inbox, this relies on
absolute paths of $GIT_DIR stored in the Xapian DB and does not
rely on the config file.  We'll be relying on the config file to
map absolute paths to public URL paths for WWW.
Diffstat (limited to 'script')
-rwxr-xr-x script/public-inbox-cindex 75
1 files changed, 75 insertions, 0 deletions
diff --git a/script/public-inbox-cindex b/script/public-inbox-cindex
new file mode 100755
index 00000000..166c8261
--- /dev/null
+++ b/script/public-inbox-cindex
@@ -0,0 +1,75 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
+my $help = <<EOF; # usage text; should fit w/o scrolling in 80x24 term:
+usage: public-inbox-cindex [options] GIT_DIR...
+usage: public-inbox-cindex [options] --project-list=FILE PROJECT_ROOT
+
+  Create and update search indices for code repos
+
+  -d EXTDIR           use EXTDIR instead of GIT_DIR/public-inbox-cindex
+  --no-fsync          speed up indexing, risk corruption on power outage
+  -L LEVEL            `medium', or `full' (default: medium)
+  --project-list=FILE use a cgit/gitweb-compatible list of projects
+  --update | -u       update previously-indexed code repos with `-d'
+  --jobs=NUM          set or disable parallelization (NUM=0)
+  --batch-size=BYTES  flush changes to OS after a given number of bytes
+  --prune             prune old repos and commits
+  --reindex           reindex previously indexed repos
+  --verbose | -v      increase verbosity (may be repeated)
+
+BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
+See public-inbox-cindex(1) man page for full documentation.
+EOF
+my $opt = { fsync => 1, scan => 1 }; # defaults; --no-scan is hidden
+GetOptions($opt, qw(quiet|q verbose|v+ reindex jobs|j=i fsync|sync! dangerous
+                indexlevel|index-level|L=s batch_size|batch-size=s
+                project-list=s
+                d=s update|u scan! prune dry-run|n C=s@ help|h))
+        or die $help; # bad options: usage goes to stderr, non-zero exit
+if ($opt->{help}) { print $help; exit 0 }; # explicit --help is a success
+die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
+require IO::Handle; # for ->autoflush on the standard handles
+STDOUT->autoflush(1);
+STDERR->autoflush(1);
+local $SIG{USR1} = 'IGNORE'; # to be overridden in cidx_sync
+# require lazily to speed up --help
+require PublicInbox::Admin;
+PublicInbox::Admin::do_chdir(delete $opt->{C}); # -C is removed from $opt
+my $cfg = PublicInbox::Config->new; # NOTE(review): assumes Admin loads Config
+my $cidx_dir = $opt->{d}; # -d EXTDIR, undef when not given
+PublicInbox::Admin::require_or_die('Search::Xapian');
+PublicInbox::Admin::progress_prepare($opt);
+my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
+%ENV = (%ENV, %$env) if $env; # keys from $env override existing ones
+
+require PublicInbox::CodeSearchIdx; # unstable internal API
+my @git_dirs; # resolved GIT_DIRs to index, from --project-list or @ARGV
+if (defined(my $pl = $opt->{'project-list'})) { # FILE: one repo per line
+        my $pfx = shift @ARGV // die <<EOM; # repo paths are relative to this
+PROJECT_ROOT required for --project-list
+EOM
+        open my $fh, '<', $pl or die "open($pl): $!\n";
+        chomp(@git_dirs = grep { $_ ne "\n" } <$fh>); # skip empty lines
+        $_ = PublicInbox::Admin::resolve_git_dir("$pfx/$_") for @git_dirs;
+} else { # every positional argument is a GIT_DIR
+        @git_dirs = map { PublicInbox::Admin::resolve_git_dir($_) } @ARGV;
+}
+if (!defined $cidx_dir) { # internal: one index inside each GIT_DIR
+        @git_dirs or die $help; # nothing to index, show usage
+        for my $git_dir (@git_dirs) {
+                my $idx_dir = "$git_dir/public-inbox-cindex";
+                # each index object gets its own shallow copy of the options
+                my $idx = PublicInbox::CodeSearchIdx->new($idx_dir, { %$opt });
+                $idx->{-internal} = 1;
+                @{$idx->{git_dirs}} = ($git_dir);
+                $idx->cidx_run;
+        }
+} else { # external index (-d EXTDIR) covering all given GIT_DIRs
+        index($cidx_dir, '%') < 0 or die "`%' is not allowed in $cidx_dir\n";
+        my $ext = PublicInbox::CodeSearchIdx->new($cidx_dir, $opt);
+        @{$ext->{git_dirs}} = @git_dirs; # may be empty
+        $ext->cidx_run;
+}