From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: AS43350 77.247.176.0/21 X-Spam-Status: No, score=-2.1 required=3.0 tests=AWL,BAYES_00, RCVD_IN_MSPIKE_BL,RCVD_IN_MSPIKE_ZBI,RCVD_IN_XBL,SPF_FAIL,SPF_HELO_FAIL, TO_EQ_FM_DOM_SPF_FAIL shortcircuit=no autolearn=no autolearn_force=no version=3.4.0 Received: from 80x24.org (chomsky.torservers.net [77.247.181.162]) by dcvr.yhbt.net (Postfix) with ESMTP id 92E531FD99 for ; Wed, 10 Aug 2016 23:31:30 +0000 (UTC) From: Eric Wong To: spew@80x24.org Subject: [PATCH] search: support alt-ID for mapping legacy serial numbers Date: Wed, 10 Aug 2016 23:31:28 +0000 Message-Id: <20160810233128.11364-1-e@80x24.org> List-Id: For some existing mailing list archives, messages are identified by serial number (such as NNTP article numbers in gmane). Those links may become inaccessible (as is the current case for gmane), so ensure users can still search based on old serial numbers. Now, I run the following periodically to get article numbers from gmane (while news.gmane.org remains): NNTPSERVER=news.gmane.org export NNTPSERVER GROUP=gmane.comp.version-control.git perl -I lib scripts/xhdr-num2mid $GROUP --msgmap=/path/to/gmane.sqlite3 (I might integrate this further with public-inbox-* scripts one day). My ~/.public-inbox/config has an added "altid" snippet which now looks like this: [publicinbox "git"] address = git@vger.kernel.org mainrepo = /path/to/git.vger.git newsgroup = inbox.comp.version-control.git ; relative pathnames expand to $mainrepo/public-inbox/$file altid = serial:gmane:file=gmane.sqlite3 And run "public-inbox-index --reindex /path/to/git.vger.git" periodically. This ought to allow searching for "gmane:12345" to work for Xapian-enabled instances. Disclaimer: while public-inbox supports NNTP and stable article serial numbers, use of those for public links is discouraged since it encourages centralization. 
--- MANIFEST | 2 ++ lib/PublicInbox/AltId.pm | 38 +++++++++++++++++++++++++++ lib/PublicInbox/Config.pm | 7 +++++ lib/PublicInbox/Msgmap.pm | 20 ++++++++++++++- lib/PublicInbox/Search.pm | 16 ++++++++++-- lib/PublicInbox/SearchIdx.pm | 24 +++++++++++++++-- scripts/xhdr-num2mid | 27 ++++++++++++++++++-- t/altid.t | 61 ++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 188 insertions(+), 7 deletions(-) create mode 100644 lib/PublicInbox/AltId.pm create mode 100644 t/altid.t diff --git a/MANIFEST b/MANIFEST index 308da06..f5ea455 100644 --- a/MANIFEST +++ b/MANIFEST @@ -35,6 +35,7 @@ examples/unsubscribe.milter examples/unsubscribe.psgi examples/varnish-4.vcl lib/PublicInbox/Address.pm +lib/PublicInbox/AltId.pm lib/PublicInbox/Config.pm lib/PublicInbox/Daemon.pm lib/PublicInbox/Emergency.pm @@ -104,6 +105,7 @@ scripts/slrnspool2maildir scripts/ssoma-replay scripts/xhdr-num2mid t/address.t +t/altid.t t/cgi.t t/check-www-inbox.perl t/common.perl diff --git a/lib/PublicInbox/AltId.pm b/lib/PublicInbox/AltId.pm new file mode 100644 index 0000000..6fdc3a2 --- /dev/null +++ b/lib/PublicInbox/AltId.pm @@ -0,0 +1,38 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ + +package PublicInbox::AltId; +use strict; +use warnings; +use URI::Escape qw(uri_unescape); + +# spec: TYPE:PREFIX:param1=value1&param2=value2&... 
+# Example: serial:gmane:file=/path/to/altmsgmap.sqlite3 +sub new { + my ($class, $inbox, $spec) = @_; + my ($type, $prefix, $query) = split(/:/, $spec, 3); + $type eq 'serial' or die "non-serial not supported, yet\n"; + + require PublicInbox::Msgmap; + + my %params = map { + my ($k, $v) = split(/=/, uri_unescape($_), 2); + $v = '' unless defined $v; + ($k, $v); + } split(/[&;]/, $query); + my $f = $params{file} or die "file: required for $type spec $spec\n"; + unless (index($f, '/') == 0) { + $f = "$inbox->{mainrepo}/public-inbox/$f"; + } + bless { + mm_alt => PublicInbox::Msgmap->new_file($f), + xprefix => 'X'.uc($prefix), + }, $class; +} + +sub mid2alt { + my ($self, $mid) = @_; + $self->{mm_alt}->num_for($mid); +} + +1; diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 1256fb1..a2781a7 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -145,6 +145,13 @@ sub _fill { my $v = $self->{"$pfx.$k"}; $rv->{$k} = $v if defined $v; } + foreach my $k (qw(altid)) { # TODO: more arrays + my $v = $self->{"$pfx.$k"}; + if (defined $v) { + $rv->{$k} = []; + } + } + return unless $rv->{mainrepo}; my $name = $pfx; $name =~ s/\Apublicinbox\.//; diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm index 2583ff4..3fb3805 100644 --- a/lib/PublicInbox/Msgmap.pm +++ b/lib/PublicInbox/Msgmap.pm @@ -20,7 +20,12 @@ sub new { my $err = $!; -d $d or die "$d not created: $err"; } - my $f = "$d/msgmap.sqlite3"; + new_file($class, "$d/msgmap.sqlite3", $writable); +} + +sub new_file { + my ($class, $f, $writable) = @_; + my $dbh = DBI->connect("dbi:SQLite:dbname=$f",'','', { AutoCommit => 1, RaiseError => 1, @@ -40,6 +45,7 @@ sub new { $self; } +# n.b. 
invoked directly by scripts/xhdr-num2mid sub meta_accessor { my ($self, $key, $value) = @_; use constant { @@ -154,6 +160,7 @@ sub create_tables { 'val VARCHAR(255) NOT NULL)'); } +# used by NNTP.pm sub id_batch { my ($self, $num, $cb) = @_; my $dbh = $self->{dbh}; @@ -167,4 +174,15 @@ sub id_batch { $nr; } +# only used for mapping external serial numbers (e.g. articles from gmane) +# see scripts/xhdr-num2mid for usage +sub mid_set { + my ($self, $num, $mid) = @_; + my $sth = $self->{mid_set} ||= do { + my $sql = 'INSERT INTO msgmap (num, mid) VALUES (?,?)'; + $self->{dbh}->prepare($sql); + }; + $sth->execute($num, $mid); +} + 1; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 3a908ac..018fcb5 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -79,10 +79,10 @@ sub xdir { } sub new { - my ($class, $git_dir) = @_; + my ($class, $git_dir, $altid) = @_; my $dir = $class->xdir($git_dir); my $db = Search::Xapian::Database->new($dir); - bless { xdb => $db, git_dir => $git_dir }, $class; + bless { xdb => $db, git_dir => $git_dir, altid => $altid }, $class; } sub reopen { $_[0]->{xdb}->reopen } @@ -186,6 +186,18 @@ sub qp { $qp->add_boolean_prefix($name, $prefix); } + # we do not actually create AltId objects, + # just parse the spec to avoid the extra DB handles for now. 
+ if (my $altid = $self->{altid}) { + for (@$altid) { + # $_ = 'serial:gmane:/path/to/gmane.msgmap.sqlite3' + /\Aserial:(\w+):/ or next; + my $pfx = $1; + # gmane => XGMANE + $qp->add_boolean_prefix($pfx, 'X'.uc($pfx)); + } + } + while (my ($name, $prefix) = each %prob_prefix) { $qp->add_prefix($name, $prefix); } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 0582526..9f7f256 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -30,9 +30,21 @@ use constant { }; sub new { - my ($class, $git_dir, $creat) = @_; + my ($class, $inbox, $creat) = @_; + my $git_dir = $inbox; + my $altid; + if (ref $inbox) { + $git_dir = $inbox->{mainrepo}; + $altid = $inbox->{altid}; + if ($altid) { + require PublicInbox::AltId; + $altid = [ map { + PublicInbox::AltId->new($inbox, $_); + } @$altid ]; + } + } require Search::Xapian::WritableDatabase; - my $self = bless { git_dir => $git_dir }, $class; + my $self = bless { git_dir => $git_dir, -altid => $altid }, $class; my $perm = $self->_git_config_perm; my $umask = _umask_for($perm); $self->{umask} = $umask; @@ -170,6 +182,14 @@ sub add_message { link_message($self, $smsg, $old_tid); $doc->set_data($smsg->to_doc_data($blob)); + + if (my $altid = $self->{-altid}) { + foreach my $alt (@$altid) { + my $id = $alt->mid2alt($mid); + next unless defined $id; + $doc->add_term($alt->{xprefix} . 
$id); + } + } if (defined $doc_id) { $db->replace_document($doc_id, $doc); } else { diff --git a/scripts/xhdr-num2mid b/scripts/xhdr-num2mid index f1e7ea3..bc3ede6 100755 --- a/scripts/xhdr-num2mid +++ b/scripts/xhdr-num2mid @@ -5,8 +5,18 @@ use strict; use warnings; use Net::NNTP; -use Data::Dumper; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); my $usage = "usage: NNTPSERVER=news.example.org $0 GROUP [FIRST_NUM]\n"; +my ($msgmap, $mm); +my %opts = ( '--msgmap=s' => \$msgmap ); +GetOptions(%opts) or die "bad command-line args\n$usage"; + +if ($msgmap) { + require PublicInbox::Msgmap; + require PublicInbox::MID; # mid_clean + $mm = PublicInbox::Msgmap->new_file($msgmap, 1); +} + my $group = shift or die $usage; my $nntp = Net::NNTP->new($ENV{NNTPSERVER} || '127.0.0.1'); my ($num, $first, $last) = $nntp->group($group); @@ -15,16 +25,29 @@ my $arg_first = shift; if (defined $arg_first) { $arg_first =~ /\A\d+\z/ or die $usage; $first = $arg_first; +} elsif ($mm) { + my $last_article = $mm->meta_accessor('last_article'); + $first = $last_article + 1 if defined $last_article; } my $batch = 1000; my $i; for ($i = $first; $i < $last; $i += $batch) { - my $j = $i + $batch; + my $j = $i + $batch - 1; $j = $last if $j > $last; my $num2mid = $nntp->xhdr('Message-ID', "$i-$j"); + + $mm->{dbh}->begin_work if $mm; for my $n ($i..$j) { defined(my $mid = $num2mid->{$n}) or next; print "$n $mid\n"; + if ($mm) { + $mid = PublicInbox::MID::mid_clean($mid); + $mm->mid_set($n, $mid); + } + } + if ($mm) { + $mm->meta_accessor('last_article', $j); + $mm->{dbh}->commit; } } diff --git a/t/altid.t b/t/altid.t new file mode 100644 index 0000000..887d548 --- /dev/null +++ b/t/altid.t @@ -0,0 +1,61 @@ +# Copyright (C) 2016 all contributors +# License: AGPL-3.0+ +use strict; +use warnings; +use Test::More; +use File::Temp qw/tempdir/; +foreach my $mod (qw(DBD::SQLite Search::Xapian)) { + eval "require $mod"; + plan skip_all => "$mod missing for altid.t" if $@; +} + 
+use_ok 'PublicInbox::Msgmap'; +use_ok 'PublicInbox::SearchIdx'; +use_ok 'PublicInbox::Import'; +use_ok 'PublicInbox::Inbox'; +my $tmpdir = tempdir('pi-altid-XXXXXX', TMPDIR => 1, CLEANUP => 1); +my $git_dir = "$tmpdir/a.git"; +my $alt_file = "$tmpdir/another-nntp.sqlite3"; +my $altid = [ "serial:gmane:file=$alt_file" ]; + +{ + my $mm = PublicInbox::Msgmap->new_file($alt_file, 1); + $mm->mid_set(1234, 'a@example.com'); +} + +{ + is(system(qw(git init -q --bare), $git_dir), 0, 'git init ok'); + my $git = PublicInbox::Git->new($git_dir); + my $im = PublicInbox::Import->new($git, 'testbox', 'test@example'); + $im->add(Email::MIME->create( + header => [ + From => 'a@example.com', + To => 'b@example.com', + 'Content-Type' => 'text/plain', + Subject => 'boo!', + 'Message-ID' => '<a@example.com>', + ], + body => "hello world gmane:666\n", + )); + $im->done; +} +{ + my $inbox = PublicInbox::Inbox->new({mainrepo=>$git_dir}); + $inbox->{altid} = $altid; + my $rw = PublicInbox::SearchIdx->new($inbox, 1); + $rw->index_sync; +} + +{ + my $ro = PublicInbox::Search->new($git_dir, $altid); + my $res = $ro->query("gmane:1234"); + is($res->{total}, 1, 'got one match'); + is($res->{msgs}->[0]->mid, 'a@example.com'); + + $res = $ro->query("gmane:666"); + is($res->{total}, 0, 'body did NOT match'); +}; + +done_testing(); + +1; -- EW