From 32fa6be4222d9af593c22a7dc101d8d5e8835511 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Tue, 21 Mar 2023 23:07:21 +0000 Subject: codesearch: initial cut w/ -cindex tool It seems relying on root commits is a reasonable way to deduplicate and handle repositories with common history. I initially wanted to shoehorn this into extindex, but decided a separate Xapian index layout capable of being EITHER external to handle many forks or internal (in $GIT_DIR/public-inbox-cindex) for small projects is the right way to go. Unlike most existing parts of public-inbox, this relies on absolute paths of $GIT_DIR stored in the Xapian DB and does not rely on the config file. We'll be relying on the config file to map absolute paths to public URL paths for WWW. --- script/public-inbox-cindex | 75 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100755 script/public-inbox-cindex (limited to 'script') diff --git a/script/public-inbox-cindex b/script/public-inbox-cindex new file mode 100755 index 00000000..166c8261 --- /dev/null +++ b/script/public-inbox-cindex @@ -0,0 +1,75 @@ +#!perl -w +# Copyright (C) all contributors +# License: AGPL-3.0+ +use v5.12; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); +my $help = < 1, scan => 1 }; # --no-scan is hidden +GetOptions($opt, qw(quiet|q verbose|v+ reindex jobs|j=i fsync|sync! dangerous + indexlevel|index-level|L=s batch_size|batch-size=s + project-list=s + d=s update|u scan! prune dry-run|n C=s@ help|h)) + or die $help; +if ($opt->{help}) { print $help; exit 0 }; +die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; +require IO::Handle; +STDOUT->autoflush(1); +STDERR->autoflush(1); +local $SIG{USR1} = 'IGNORE'; # to be overridden in cidx_sync +# require lazily to speed up --help +require PublicInbox::Admin; +PublicInbox::Admin::do_chdir(delete $opt->{C}); +my $cfg = PublicInbox::Config->new; +my $cidx_dir = $opt->{d}; +PublicInbox::Admin::require_or_die('Search::Xapian'); +PublicInbox::Admin::progress_prepare($opt); +my $env = PublicInbox::Admin::index_prepare($opt, $cfg); +%ENV = (%ENV, %$env) if $env; + +require PublicInbox::CodeSearchIdx; # unstable internal API +my @git_dirs; +if (defined(my $pl = $opt->{'project-list'})) { + my $pfx = shift @ARGV // die <); + $_ = PublicInbox::Admin::resolve_git_dir("$pfx/$_") for @git_dirs; +} else { + @git_dirs = map { PublicInbox::Admin::resolve_git_dir($_) } @ARGV; +} +if (defined $cidx_dir) { # external index + die "`%' is not allowed in $cidx_dir\n" if $cidx_dir =~ /\%/; + my $cidx = PublicInbox::CodeSearchIdx->new($cidx_dir, $opt); + @{$cidx->{git_dirs}} = @git_dirs; # may be empty + $cidx->cidx_run; +} elsif (!@git_dirs) { + die $help +} else { + for my $gd (@git_dirs) { + my $cd = "$gd/public-inbox-cindex"; + my $cidx = PublicInbox::CodeSearchIdx->new($cd, { %$opt }); + $cidx->{-internal} = 1; + @{$cidx->{git_dirs}} = ($gd); + $cidx->cidx_run; + } +} -- cgit v1.2.3-24-ge0c7