diff options
author | Eric Wong <e@80x24.org> | 2021-09-12 07:47:12 +0000 |
---|---|---|
committer | Eric Wong <e@80x24.org> | 2021-09-12 07:48:56 +0000 |
commit | b45a1dffa647f6427d0c900fcc55753db7a1994c (patch) | |
tree | eafb5e666662bc65afd1aad33aeb203134b1efd7 /lib/PublicInbox/Fetch.pm | |
parent | 02a0f3959b2e74f7217fcdca848822e7230acd6b (diff) | |
download | public-inbox-b45a1dffa647f6427d0c900fcc55753db7a1994c.tar.gz |
Setting up and maintaining git-only mirrors of v2 inboxes is complex since multiple commands are required to clone and fetch into epochs. Unlike grokmirror, these commands do not require any configuration. Instead, they rely on existing git config files and work like "git clone --mirror" and "git fetch", respectively. Like grokmirror, they use manifest.js.gz, but only on a per-inbox basis so users won't have to clone every inbox of a large instance nor edit config files to include/exclude inboxes they're interested in.
Diffstat (limited to 'lib/PublicInbox/Fetch.pm')
-rw-r--r-- | lib/PublicInbox/Fetch.pm | 145 |
1 files changed, 145 insertions, 0 deletions
# Copyright (C) all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
# Wrapper to "git fetch" remote public-inboxes
package PublicInbox::Fetch;
use strict;
use v5.10.1;
use parent qw(PublicInbox::IPC);
use URI ();
use PublicInbox::Spawn qw(popen_rd);
use PublicInbox::Admin;
use PublicInbox::LEI;
use PublicInbox::LeiCurl;
use PublicInbox::LeiMirror;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use File::Temp ();

# Constructor: returns an empty hashref blessed into this package.
# The invocant is ignored.
sub new { bless {}, __PACKAGE__ }

# Builds the argv list for "git fetch".
#   $lei - object whose {opt} hash supplies the "c" (arrayref of
#          key=value strings), "quiet" and "verbose" options
#   $opt - hashref of spawn options; slots 0..2 are overwritten with
#          $lei->{0}..$lei->{2} (presumably the stdio handles of the
#          client -- confirm against PublicInbox::LEI)
# Returns the command as a list, not an arrayref.
sub fetch_cmd ($$) {
	my ($lei, $opt) = @_;
	my @cmd = qw(git);
	$opt->{$_} = $lei->{$_} for (0..2);
	# we support "-c $key=$val" for arbitrary git config options
	# e.g.: git -c http.proxy=socks5h://127.0.0.1:9050
	push(@cmd, '-c', $_) for @{$lei->{opt}->{c} // []};
	push @cmd, 'fetch';
	push @cmd, '-q' if $lei->{opt}->{quiet};
	push @cmd, '-v' if $lei->{opt}->{verbose};
	@cmd;
}

# Reads remote.origin.url via "git config", run with -C => $dir.
# Returns the first line of output without its trailing newline, or
# nothing when closing the pipe fails or when no output was produced.
sub remote_url ($$) {
	my ($lei, $dir) = @_; # TODO: support non-"origin"?
	my $cmd = [ qw(git config remote.origin.url) ];
	my $fh = popen_rd($cmd, undef, { -C => $dir, 2 => $lei->{2} });
	my $url = <$fh>;
	close $fh or return;
	# no output at all: avoid chomp on undef
	defined($url) or return;
	chomp $url;
	$url;
}

# Fetches updates into the existing mirror found at $cd.
#   $cls - invocant (unused)
#   $lei - object providing options, stdio slots, qerr/err/child_error
#   $cd  - directory handed to PublicInbox::Admin::resolve_inboxdir
#
# v1 inboxes get a single "git fetch" in the inbox directory.
# v2 inboxes:
#   1. take remote.origin.url from the newest epoch that has one and
#      strip the epoch suffix to obtain the inbox URL
#   2. download $inbox_url/manifest.js.gz with curl, conditionally
#      (-z) when a local manifest could be decoded
#   3. drop manifest entries whose fingerprint and modified time are
#      unchanged, then fetch (or clone, if missing) the remaining epochs
#   4. install the new manifest only after every epoch succeeded
# Child process failures are reported via $lei->child_error; other
# failures die.
sub do_fetch {
	my ($cls, $lei, $cd) = @_;
	my $ibx_ver;
	my $curl = PublicInbox::LeiCurl->new($lei) or return;
	my $dir = PublicInbox::Admin::resolve_inboxdir($cd, \$ibx_ver);
	if ($ibx_ver == 1) {
		my $url = remote_url($lei, $dir) //
			die "E: $dir missing remote.origin.url\n";
		my $uri = URI->new($url);
		my $torsocks = $curl->torsocks($lei, $uri);
		my $opt = { -C => $dir };
		my $cmd = [ @$torsocks, fetch_cmd($lei, $opt) ];
		my $cerr = PublicInbox::LeiMirror::run_reap($lei, $cmd, $opt);
		$lei->child_error($cerr, "@$cmd failed") if $cerr;
		return;
	}
	# v2:
	opendir my $dh, "$dir/git" or die "opendir $dir/git: $!";
	# epoch numbers of all "$nr.git" entries, highest first
	my @epochs = sort { $b <=> $a } map { substr($_, 0, -4) + 0 }
				grep(/\A[0-9]+\.git\z/, readdir($dh));
	my ($git_url, $epoch);
	for my $nr (@epochs) { # try newest epoch, first
		my $edir = "$dir/git/$nr.git";
		if (defined(my $url = remote_url($lei, $edir))) {
			$git_url = $url;
			$epoch = $nr;
			last;
		} else {
			warn "W: $edir missing remote.origin.url\n";
		}
	}
	$git_url or die "Unable to determine git URL\n";
	my $inbox_url = $git_url;
	# accept both ".../git/$epoch.git" and ".../$epoch.git" layouts
	$inbox_url =~ s!/git/$epoch(?:\.git)?/?\z!! or
		$inbox_url =~ s!/$epoch(?:\.git)?/?\z!! or die <<EOM;
Unable to infer inbox URL from <$git_url>
EOM
	$lei->qerr("# inbox URL: $inbox_url/");
	my $muri = URI->new("$inbox_url/manifest.js.gz");
	# download next to the destination so the final rename() stays
	# within one directory
	my $ft = File::Temp->new(TEMPLATE => 'manifest-XXXX',
				UNLINK => 1, DIR => $dir);
	my $fn = $ft->filename;
	my @opt = (qw(-R -o), $fn);
	my $mf = "$dir/manifest.js.gz";
	my $m0; # current manifest.js.gz contents
	if (open my $fh, '<', $mf) {
		$m0 = eval {
			PublicInbox::LeiMirror::decode_manifest($fh, $mf, $mf)
		};
		$lei->err($@) if $@;
		# only request a conditional download if we can diff against
		# the old manifest
		push @opt, '-z', $mf if defined($m0);
	} elsif (!$!{ENOENT}) {
		# a missing manifest is normal (first fetch); anything else
		# is worth reporting, but we still fetch unconditionally
		$lei->err("W: open($mf): $!");
	}
	my $curl_cmd = $curl->for_uri($lei, $muri, @opt);
	my $opt = {};
	$opt->{$_} = $lei->{$_} for (0..2);
	my $cerr = PublicInbox::LeiMirror::run_reap($lei, $curl_cmd, $opt);
	return $lei->child_error($cerr, "@$curl_cmd failed") if $cerr;
	return if !-s $ft; # 304 Not Modified via curl -z

	my $m1 = PublicInbox::LeiMirror::decode_manifest($ft, $fn, $muri);
	my $mdiff = { %$m1 };

	# filter out unchanged entries
	while (my ($k, $v0) = each %{$m0 // {}}) {
		my $cur = $m1->{$k} // next;
		my $f0 = $v0->{fingerprint} // next;
		my $f1 = $cur->{fingerprint} // next;
		my $t0 = $v0->{modified} // next;
		my $t1 = $cur->{modified} // next;
		delete($mdiff->{$k}) if $f0 eq $f1 && $t0 == $t1;
	}
	my $ibx_uri = URI->new("$inbox_url/");
	my ($path_pfx, $v1_bare, @v2_epochs) =
		PublicInbox::LeiMirror::deduce_epochs($mdiff, $ibx_uri->path);
	defined($v1_bare) and die <<EOM;
E: got v1 `$v1_bare' when expecting v2 epoch(s) in <$muri>, WTF?
EOM
	# oldest epoch first
	my @epoch_nr = sort { $a <=> $b }
		map { my ($nr) = (m!/([0-9]+)\.git\z!g) } @v2_epochs;

	# n.b. this expects all epochs are from the same host
	my $torsocks = $curl->torsocks($lei, $muri);
	for my $nr (@epoch_nr) {
		my $edir = "$dir/git/$nr.git";
		my $cmd;
		my $opt = {};
		if (-d $edir) { # existing epoch: fetch inside it
			$opt->{-C} = $edir;
			$cmd = [ @$torsocks, fetch_cmd($lei, $opt) ];
		} else { # new epoch: clone it
			my $e_uri = $ibx_uri->clone;
			$e_uri->path($ibx_uri->path."git/$nr.git");
			$cmd = [ @$torsocks,
				PublicInbox::LeiMirror::clone_cmd($lei, $opt),
				$$e_uri, $edir ];
		}
		my $cerr = PublicInbox::LeiMirror::run_reap($lei, $cmd, $opt);
		return $lei->child_error($cerr, "@$cmd failed") if $cerr;
	}
	# File::Temp creates $fn with mode 0600; keep the mode of the
	# manifest being replaced, or fall back to the umask default, so
	# the installed manifest does not silently become owner-only
	my @st = stat($mf);
	my $mode = @st ? ($st[2] & 07777) : (0666 & ~umask);
	chmod($mode, $fn) or die "E: chmod($fn): $!\n";
	rename($fn, $mf) or die "E: rename($fn, $mf): $!\n";
	$ft->unlink_on_destroy(0);
}

1;