* [PATCH 1/8] import_slrnspool: reimplement using fast-import
@ 2016-08-13 23:53 Eric Wong
2016-08-13 23:53 ` [PATCH 2/8] searchidx: do not release Xapian lock while (only) Msgmap is indexing Eric Wong
` (6 more replies)
0 siblings, 7 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
To: spew
I needed to use this to resurrect some messages missing
from my initial downloads from gmane...
---
scripts/import_slrnspool | 94 ++++++++++++++++++++----------------------------
1 file changed, 38 insertions(+), 56 deletions(-)
diff --git a/scripts/import_slrnspool b/scripts/import_slrnspool
index 687809b..9885059 100755
--- a/scripts/import_slrnspool
+++ b/scripts/import_slrnspool
@@ -1,6 +1,6 @@
#!/usr/bin/perl -w
-# Copyright (C) 2015 all contributors <meta@public-inbox.org>
-# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
+# Copyright (C) 2015-2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
#
# Incremental (or one-shot) importer of a slrnpull news spool
=begin usage
@@ -11,8 +11,9 @@
use strict;
use warnings;
use PublicInbox::Config;
-use Email::Filter;
-use Email::LocalDelivery;
+use Email::MIME;
+use PublicInbox::Import;
+use PublicInbox::Git;
sub usage { "Usage:\n".join('',grep(/\t/, `head -n 10 $0`)) }
my $exit = 0;
my $sighandler = sub { $exit = 1 };
@@ -22,43 +23,33 @@ my $spool = shift @ARGV or die usage();
my $recipient = $ENV{ORIGINAL_RECIPIENT};
defined $recipient or die usage();
my $config = PublicInbox::Config->new;
-my $cfg = $config->lookup($recipient);
-defined $cfg or exit(1);
-my @mda;
-if ($ENV{'FILTER'}) {
- @mda = qw(public-inbox-mda);
-} else {
- @mda = (qw(ssoma-mda -1), $cfg->{mainrepo});
-}
+my $ibx = $config->lookup($recipient);
+my $git = $ibx->git;
+my $im = PublicInbox::Import->new($git, $ibx->{name}, $ibx->{-primary_address});
sub key {
- my ($cfg) = @_;
- "publicinbox.$cfg->{inbox}.importslrnspoolstate";
+ "publicinbox.$ibx->{name}.importslrnspoolstate";
}
sub get_min {
my $f = PublicInbox::Config->default_file;
- my @cmd = (qw/git config/, "--file=$f", key($cfg));
- use IPC::Run qw/run/;
-
- my $in = '';
- my $out = '';
- unless (run(\@cmd, \$in, \$out)) {
- $out = 0;
- }
- int($out);
+ my $out = $git->qx('config', "--file=$f", key($ibx));
+ $out ||= 0;
+ chomp $out;
+ $out =~ /\A\d+\z/ and return $out;
+ 0;
}
sub set_min {
- my ($cfg, $num) = @_;
+ my ($num) = @_;
my $f = PublicInbox::Config->default_file;
- my @cmd = (qw/git config/, "--file=$f", key($cfg), $num);
+ my @cmd = (qw/git config/, "--file=$f", key($ibx), $num);
system(@cmd) == 0 or die join(' ', @cmd). " failed: $?\n";
}
my $n = get_min();
my $ok;
-my $max_gap = 10000;
+my $max_gap = 200000;
my $max = $n + $max_gap;
for (; $exit == 0 && $n < $max; $n++) {
@@ -67,40 +58,31 @@ for (; $exit == 0 && $n < $max; $n++) {
open(my $fh, '<', $fn) or next;
$max = $n + $max_gap;
- # prevent process growth by forking a new process for each message
- my $pid = fork;
- die "failed to fork: $!\n" unless defined $pid;
-
- if ($pid == 0) {
- my $f = Email::Filter->new(data => eval { local $/; <$fh> });
- close $fh;
- $fh = undef;
- my $s = $f->simple;
+ my $mime = Email::MIME->new(eval { local $/; <$fh> });
+ my $hdr = $mime->header_obj;
- # gmane rewrites Received headers, which increases spamminess
- # Some older archives set Original-To
- foreach my $x (qw(Received To)) {
- my @h = $s->header("Original-$x");
- if (@h) {
- $s->header_set($x, @h);
- $s->header_set("Original-$x");
- }
+ # gmane rewrites Received headers, which increases spamminess
+ # Some older archives set Original-To
+ foreach my $x (qw(Received To)) {
+ my @h = $hdr->header_raw("Original-$x");
+ if (@h) {
+ $hdr->header_set($x, @h);
+ $hdr->header_set("Original-$x");
}
+ }
- # triggers for the SA HEADER_SPAM rule
- foreach my $drop (qw(Approved)) { $s->header_set($drop) }
+ # Approved triggers for the SA HEADER_SPAM rule,
+ # X-From is gmane specific
+ foreach my $drop (qw(Approved X-From)) {
+ $hdr->header_set($drop);
+ }
- # appears to be an old gmane bug:
- $s->header_set('connect()');
+ # appears to be an old gmane bug:
+ $hdr->header_set('connect()');
+ $im->add($mime);
- $f->exit(0);
- $f->pipe(@mda);
- exit 0;
- } else {
- close $fh;
- waitpid($pid, 0);
- die "error: $?\n" if $?;
- }
$ok = $n + 1;
- set_min($cfg, $ok);
+ set_min($ok);
}
+
+$im->done;
--
EW
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 2/8] searchidx: do not release Xapian lock while (only) Msgmap is indexing
2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
2016-08-13 23:53 ` [PATCH 3/8] www: do not unecessarily escape some chars in paths Eric Wong
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
To: spew
SQLite might index quickly, so we hold the lock used by Xapian
for the duration. This probably needs to be reworked entirely,
actually.
---
lib/PublicInbox/SearchIdx.pm | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 0eb07a1..f8155ec 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -436,19 +436,24 @@ sub _index_sync {
my $mm = _msgmap_init($self);
my $dbh = $mm->{dbh} if $mm;
+ my $mm_only;
my $cb = sub {
my ($commit, $more) = @_;
if ($dbh) {
$mm->last_commit($commit) if $commit;
$dbh->commit;
}
- $xdb->set_metadata($mkey, $commit) if $mkey && $commit;
- $xdb->commit_transaction;
- $xdb = _xdb_release($self);
+ if (!$mm_only) {
+ $xdb->set_metadata($mkey, $commit) if $mkey && $commit;
+ $xdb->commit_transaction;
+ $xdb = _xdb_release($self);
+ }
# let another process do some work... <
if ($more) {
- $xdb = _xdb_acquire($self);
- $xdb->begin_transaction;
+ if (!$mm_only) {
+ $xdb = _xdb_acquire($self);
+ $xdb->begin_transaction;
+ }
$dbh->begin_work if $dbh;
}
};
@@ -472,14 +477,13 @@ sub _index_sync {
my $mkey_prev = $mkey;
$mkey = undef; # ignore xapian, for now
my $mlog = _git_log($self, $r);
+ $mm_only = 1;
rlog($self, $mlog, *index_mm, *unindex_mm, $cb);
- $mlog = undef;
+ $mm_only = $mlog = undef;
# now deal with Xapian
$mkey = $mkey_prev;
$dbh = undef;
- $xdb = _xdb_acquire($self);
- $xdb->begin_transaction;
rlog($self, $xlog, *index_mm2, *unindex_mm2, $cb);
}
} else {
--
EW
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 3/8] www: do not unecessarily escape some chars in paths
2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
2016-08-13 23:53 ` [PATCH 2/8] searchidx: do not release Xapian lock while (only) Msgmap is indexing Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
2016-08-13 23:53 ` [PATCH 4/8] www: do not double-clean Message-IDs from internal DBs Eric Wong
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
To: spew
Based on reading RFC 3986, it seems '@', ':', '!', '$', '&',
"'", '; '(', ')', '*', '+', ',', ';', '=' are all allowed
in path-absolute where we have the Message-ID.
In any case, it seems '@' is fairly common in path components
nowadays and too common in Message-IDs.
---
MANIFEST | 1 +
lib/PublicInbox/ExtMsg.pm | 6 +++---
lib/PublicInbox/Feed.pm | 4 ++--
lib/PublicInbox/Hval.pm | 5 ++---
lib/PublicInbox/MID.pm | 7 ++++++-
lib/PublicInbox/NNTP.pm | 4 ++--
lib/PublicInbox/SearchView.pm | 6 +++---
lib/PublicInbox/View.pm | 31 +++++++++++++++----------------
lib/PublicInbox/WWW.pm | 5 +++--
scripts/ssoma-replay | 2 +-
t/cgi.t | 14 +++++++-------
t/check-www-inbox.perl | 4 ----
t/mid.t | 11 +++++++++++
t/nntp.t | 4 ++--
t/plack.t | 28 ++++++++++++++--------------
t/psgi_mount.t | 2 +-
16 files changed, 73 insertions(+), 61 deletions(-)
create mode 100644 t/mid.t
diff --git a/MANIFEST b/MANIFEST
index f5ea455..bed6050 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -132,6 +132,7 @@ t/init.t
t/linkify.t
t/main-bin/spamc
t/mda.t
+t/mid.t
t/msg_iter.t
t/msgmap.t
t/nntp.t
diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm
index 955ada7..b93d555 100644
--- a/lib/PublicInbox/ExtMsg.pm
+++ b/lib/PublicInbox/ExtMsg.pm
@@ -100,7 +100,7 @@ again:
my $code = 404;
my $h = PublicInbox::Hval->new_msgid($mid);
- my $href = $h->as_href;
+ my $href = $h->{href};
my $html = $h->as_html;
my $title = "<$html> not found";
my $s = "<pre>Message-ID <$html>\nnot found\n";
@@ -115,7 +115,7 @@ again:
my $u = $ibx->base_url($env) or next;
foreach my $m (@$res) {
my $p = PublicInbox::Hval->new_msgid($m);
- my $r = $p->as_href;
+ my $r = $p->{href};
my $t = $p->as_html;
$s .= qq{<a\nhref="$u$r/">$u$t/</a>\n};
}
@@ -153,7 +153,7 @@ sub ext_urls {
sub exact {
my ($ctx, $found, $mid) = @_;
my $h = PublicInbox::Hval->new_msgid($mid);
- my $href = $h->as_href;
+ my $href = $h->{href};
my $html = $h->as_html;
my $title = "<$html> found in ";
my $end = @$found == 1 ? 'another inbox' : 'other inboxes';
diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm
index 240c336..232a91c 100644
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -137,7 +137,7 @@ sub emit_atom_thread {
my $fh = $cb->([200, ['Content-Type' => 'application/atom+xml']]);
my $ibx = $ctx->{-inbox};
my $html_url = $ibx->base_url($ctx->{env});
- $html_url .= PublicInbox::Hval->new_msgid($mid)->as_href;
+ $html_url .= PublicInbox::Hval->new_msgid($mid)->{href};
$feed_opts->{url} = $html_url;
$feed_opts->{emit_header} = 1;
@@ -269,7 +269,7 @@ sub feed_entry {
my $mid = $header_obj->header_raw('Message-ID');
defined $mid or return;
$mid = PublicInbox::Hval->new_msgid($mid);
- my $href = $midurl . $mid->as_href . '/';
+ my $href = $midurl . $mid->{href}. '/';
my $date = $header_obj->header('Date');
my $updated = feed_updated($date);
diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm
index 652aef3..ab05238 100644
--- a/lib/PublicInbox/Hval.pm
+++ b/lib/PublicInbox/Hval.pm
@@ -8,7 +8,7 @@ use strict;
use warnings;
use Encode qw(find_encoding);
use URI::Escape qw(uri_escape_utf8);
-use PublicInbox::MID qw/mid_clean/;
+use PublicInbox::MID qw/mid_clean mid_escape/;
use base qw/Exporter/;
our @EXPORT_OK = qw/ascii_html/;
@@ -33,7 +33,7 @@ sub new {
sub new_msgid {
my ($class, $msgid) = @_;
$msgid = mid_clean($msgid);
- $class->new($msgid, $msgid);
+ $class->new($msgid, mid_escape($msgid));
}
sub new_oneline {
@@ -60,7 +60,6 @@ sub ascii_html {
}
sub as_html { ascii_html($_[0]->{raw}) }
-sub as_href { ascii_html(uri_escape_utf8($_[0]->{href})) }
sub raw {
if (defined $_[1]) {
diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm
index 78952b9..74a4875 100644
--- a/lib/PublicInbox/MID.pm
+++ b/lib/PublicInbox/MID.pm
@@ -6,7 +6,8 @@ package PublicInbox::MID;
use strict;
use warnings;
use base qw/Exporter/;
-our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime/;
+our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape/;
+use URI::Escape qw(uri_escape_utf8);
use Digest::SHA qw/sha1_hex/;
use constant MID_MAX => 40; # SHA-1 hex length
@@ -44,4 +45,8 @@ sub mid2path {
sub mid_mime ($) { $_[0]->header_obj->header_raw('Message-ID') }
+# RFC3986, section 3.3:
+sub MID_ESC () { '^A-Za-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@' }
+sub mid_escape ($) { uri_escape_utf8($_[0], MID_ESC) }
+
1;
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 0c61dd8..7bfc6dd 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -9,12 +9,12 @@ use base qw(Danga::Socket);
use fields qw(nntpd article rbuf ng long_res);
use PublicInbox::Search;
use PublicInbox::Msgmap;
+use PublicInbox::MID qw(mid_escape);
use PublicInbox::Git;
require PublicInbox::EvCleanup;
use Email::Simple;
use POSIX qw(strftime);
use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC);
-use URI::Escape qw(uri_escape_utf8);
use constant {
r501 => '501 command syntax error',
r221 => '221 Header follows',
@@ -421,7 +421,7 @@ sub set_nntp_headers {
$hdr->header_set('Xref', xref($ng, $n));
header_append($hdr, 'List-Post', "<mailto:$ng->{-primary_address}>");
if (my $url = $ng->base_url) {
- $mid = uri_escape_utf8($mid);
+ $mid = mid_escape($mid);
header_append($hdr, 'Archived-At', "<$url$mid/>");
header_append($hdr, 'List-Archive', "<$url>");
}
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 80a2ff7..bf879c6 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -8,7 +8,7 @@ use warnings;
use PublicInbox::SearchMsg;
use PublicInbox::Hval qw/ascii_html/;
use PublicInbox::View;
-use PublicInbox::MID qw(mid2path mid_mime mid_clean);
+use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape);
use Email::MIME;
require PublicInbox::Git;
require PublicInbox::Thread;
@@ -74,7 +74,7 @@ sub mset_summary {
my $s = ascii_html($smsg->subject);
my $f = ascii_html($smsg->from_name);
my $ts = PublicInbox::View::fmt_ts($smsg->ts);
- my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->as_href;
+ my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->{href};
$$res .= qq{$rank. <b><a\nhref="$mid/">}.
$s . "</a></b>\n";
$$res .= "$pfx - by $f @ $ts UTC [$pct%]\n\n";
@@ -263,7 +263,7 @@ sub qs_html {
$self = $tmp;
}
- my $q = PublicInbox::Hval->new($self->{'q'})->as_href;
+ my $q = mid_escape($self->{'q'});
$q =~ s/%20/+/g; # improve URL readability
my $qs = "q=$q";
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index ed3c96e..47ffc62 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -10,7 +10,7 @@ use URI::Escape qw/uri_escape_utf8/;
use Date::Parse qw/str2time/;
use PublicInbox::Hval qw/ascii_html/;
use PublicInbox::Linkify;
-use PublicInbox::MID qw/mid_clean id_compress mid_mime/;
+use PublicInbox::MID qw/mid_clean id_compress mid_mime mid_escape/;
use PublicInbox::MsgIter;
use PublicInbox::Address;
use PublicInbox::WwwStream;
@@ -125,7 +125,6 @@ sub index_entry {
my $mid_raw = mid_clean(mid_mime($mime));
my $id = id_compress($mid_raw, 1);
my $id_m = 'm'.$id;
- my $mid = PublicInbox::Hval->new_msgid($mid_raw);
my $root_anchor = $ctx->{root_anchor} || '';
my $irt = in_reply_to($hdr);
@@ -142,7 +141,7 @@ sub index_entry {
}
$rv .= "From: "._hdr_names($hdr, 'From').' @ '._msg_date($hdr)." UTC";
my $upfx = $ctx->{-upfx};
- my $mhref = $upfx . $mid->as_href . '/';
+ my $mhref = $upfx . mid_escape($mid_raw) . '/';
$rv .= qq{ (<a\nhref="$mhref">permalink</a> / };
$rv .= qq{<a\nhref="${mhref}raw">raw</a>)\n};
$rv .= ' '.join('; +', @tocc) . "\n" if @tocc;
@@ -150,7 +149,7 @@ sub index_entry {
my $mapping = $ctx->{mapping};
if (!$mapping && $irt) {
my $mirt = PublicInbox::Hval->new_msgid($irt);
- my $href = $upfx . $mirt->as_href . '/';
+ my $href = $upfx . $mirt->{href}. '/';
my $html = $mirt->as_html;
$rv .= qq(In-Reply-To: <<a\nhref="$href">$html</a>>\n)
}
@@ -568,7 +567,7 @@ sub _parent_headers {
if (defined $irt) {
my $v = PublicInbox::Hval->new_msgid($irt);
my $html = $v->as_html;
- my $href = $v->as_href;
+ my $href = $v->{href};
$rv .= "In-Reply-To: <";
$rv .= "<a\nhref=\"../$href/\">$html</a>>\n";
}
@@ -627,7 +626,7 @@ sub mailto_arg_link {
$subj = "Re: $subj" unless $subj =~ /\bRe:/i;
my $mid = $hdr->header_raw('Message-ID');
push @arg, '--in-reply-to='.squote_maybe(mid_clean($mid));
- my $irt = uri_escape_utf8($mid);
+ my $irt = mid_escape($mid);
delete $cc{$to};
push @arg, "--to=$to";
$to = uri_escape_utf8($to);
@@ -657,17 +656,17 @@ sub html_footer {
$next = $prev = ' ';
if (my $n = $ctx->{next_msg}) {
- $n = PublicInbox::Hval->new_msgid($n)->as_href;
+ $n = PublicInbox::Hval->new_msgid($n)->{href};
$next = "<a\nhref=\"$upfx$n/\"\nrel=next>next</a>";
}
my $u;
my $par = $ctx->{parent_msg};
if ($par) {
- $u = PublicInbox::Hval->new_msgid($par)->as_href;
+ $u = PublicInbox::Hval->new_msgid($par)->{href};
$u = "$upfx$u/";
}
if (my $p = $ctx->{prev_msg}) {
- $prev = PublicInbox::Hval->new_msgid($p)->as_href;
+ $prev = PublicInbox::Hval->new_msgid($p)->{href};
if ($p && $par && $p eq $par) {
$prev = "<a\nhref=\"$upfx$prev/\"\n" .
'rel=prev>prev parent</a>';
@@ -692,7 +691,7 @@ sub html_footer {
sub linkify_ref_nosrch {
my $v = PublicInbox::Hval->new_msgid($_[0]);
my $html = $v->as_html;
- my $href = $v->as_href;
+ my $href = $v->{href};
"<<a\nhref=\"../$href/\">$html</a>>";
}
@@ -707,7 +706,7 @@ sub ghost_parent {
return '[no common parent]' if ($mid eq 'subject dummy');
$mid = PublicInbox::Hval->new_msgid($mid);
- my $href = $mid->as_href;
+ my $href = $mid->{href};
my $html = $mid->as_html;
qq{[parent not found: <<a\nhref="$upfx$href/">$html</a>>]};
}
@@ -793,7 +792,7 @@ sub _skel_header {
$s = PublicInbox::Hval->new($s);
$s = $s->as_html;
}
- my $m = PublicInbox::Hval->new_msgid($mid);
+ my $m;
my $id = '';
my $mapping = $ctx->{mapping};
my $end = defined($s) ? "$s</a> $f\n" : "$f</a>\n";
@@ -804,7 +803,7 @@ sub _skel_header {
$map->[1] = "$d<a\nhref=\"$m\">$end";
$id = "\nid=r".$id;
} else {
- $m = $ctx->{-upfx}.$m->as_href.'/';
+ $m = $ctx->{-upfx}.mid_escape($mid).'/';
}
$$dst .= $d . "<a\nhref=\"$m\"$id>" . $end;
}
@@ -829,7 +828,7 @@ sub skel_dump {
$d .= indent_for($level) . th_pfx($level);
my $upfx = $ctx->{-upfx};
my $m = PublicInbox::Hval->new_msgid($mid);
- my $href = $upfx . $m->as_href . '/';
+ my $href = $upfx . $m->{href} . '/';
my $html = $m->as_html;
if ($map) {
@@ -911,7 +910,7 @@ sub dump_topics {
@$topic = ();
next unless defined $top; # ghost topic
my $mid = delete $seen->{$top};
- my $href = PublicInbox::Hval->new_msgid($mid)->as_href;
+ my $href = mid_escape($mid);
$top = PublicInbox::Hval->new($top)->as_html;
$ts = fmt_ts($ts);
@@ -935,7 +934,7 @@ sub dump_topics {
my $sub = $ex[$i + 1];
$mid = delete $seen->{$sub};
$sub = PublicInbox::Hval->new($sub)->as_html;
- $href = PublicInbox::Hval->new_msgid($mid)->as_href;
+ $href = mid_escape($mid);
$s .= indent_for($level) . TCHILD;
$s .= "<a\nhref=\"$href/T/#u\">$sub</a>\n";
}
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 60cb443..6f6a003 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -15,7 +15,8 @@ use strict;
use warnings;
use PublicInbox::Config;
use PublicInbox::Hval;
-use URI::Escape qw(uri_escape_utf8 uri_unescape);
+use URI::Escape qw(uri_unescape);
+use PublicInbox::MID qw(mid_escape);
require PublicInbox::Git;
use PublicInbox::GitHTTPBackend;
our $INBOX_RE = qr!\A/([\w\.\-]+)!;
@@ -353,7 +354,7 @@ sub r301 {
}
my $url = $obj->base_url($ctx->{env});
my $qs = $ctx->{env}->{QUERY_STRING};
- $url .= (uri_escape_utf8($mid) . '/') if (defined $mid);
+ $url .= (mid_escape($mid) . '/') if (defined $mid);
$url .= $suffix if (defined $suffix);
$url .= "?$qs" if $qs ne '';
diff --git a/scripts/ssoma-replay b/scripts/ssoma-replay
index 91c121d..5b6b8af 100755
--- a/scripts/ssoma-replay
+++ b/scripts/ssoma-replay
@@ -53,7 +53,7 @@ if (defined $list_id) {
if ($mid =~ /\A<(.+)>\z/) {
$mid = $1;
}
- $mid = uri_escape_utf8($mid);
+ $mid = uri_escape_utf8($mid, '^A-Za-z0-9\-\._~@');
$header_obj->header_set('List-Archive', "<$archive_url>");
foreach my $h (qw(Help Unsubscribe Subscribe Owner)) {
diff --git a/t/cgi.t b/t/cgi.t
index a0f09c5..092ad8c 100644
--- a/t/cgi.t
+++ b/t/cgi.t
@@ -118,7 +118,7 @@ EOF
like($res->{head}, qr/Status:\s*206/i, "info/refs partial past end OK");
is($res->{body}, substr($orig, 5), 'partial body OK past end');
}
-
+use Data::Dumper;
# atom feeds
{
local $ENV{HOME} = $home;
@@ -126,7 +126,7 @@ EOF
like($res->{body}, qr/<title>test for public-inbox/,
"set title in XML feed");
like($res->{body},
- qr!http://test\.example\.com/test/blah%40example\.com/!,
+ qr!http://test\.example\.com/test/blah\@example\.com/!,
"link id set");
like($res->{body}, qr/what\?/, "reply included");
}
@@ -148,7 +148,7 @@ EOF
$im->add($reply);
$im->done;
- my $res = cgi_run("/test/slashy%2fasdf%40example.com/raw");
+ my $res = cgi_run("/test/slashy%2fasdf\@example.com/raw");
like($res->{body}, qr/Message-Id: <\Q$slashy_mid\E>/,
"slashy mid raw hit");
@@ -167,20 +167,20 @@ EOF
$res = cgi_run("/test/blahblah\@example.com/f/");
like($res->{head}, qr/Status: 301 Moved/, "301 response");
like($res->{head},
- qr!^Location: http://[^/]+/test/blahblah%40example\.com/\r\n!ms,
+ qr!^Location: http://[^/]+/test/blahblah\@example\.com/\r\n!ms,
'301 redirect location');
$res = cgi_run("/test/blahblah\@example.con/");
like($res->{head}, qr/Status: 300 Multiple Choices/, "mid html miss");
$res = cgi_run("/test/new.html");
- like($res->{body}, qr/slashy%2Fasdf%40example\.com/,
+ like($res->{body}, qr/slashy%2Fasdf\@example\.com/,
"slashy URL generated correctly");
}
# retrieve thread as an mbox
{
local $ENV{HOME} = $home;
- my $path = "/test/blahblah%40example.com/t.mbox.gz";
+ my $path = "/test/blahblah\@example.com/t.mbox.gz";
my $res = cgi_run($path);
like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled");
my $indexed = system($index, $maindir) == 0;
@@ -200,7 +200,7 @@ EOF
my $have_xml_feed = eval { require XML::Feed; 1 } if $indexed;
if ($have_xml_feed) {
- $path = "/test/blahblah%40example.com/t.atom";
+ $path = "/test/blahblah\@example.com/t.atom";
$res = cgi_run($path);
like($res->{head}, qr/^Status: 200 /, "atom returned 200");
like($res->{head}, qr!^Content-Type: application/atom\+xml!m,
diff --git a/t/check-www-inbox.perl b/t/check-www-inbox.perl
index 6be631e..4319049 100644
--- a/t/check-www-inbox.perl
+++ b/t/check-www-inbox.perl
@@ -131,10 +131,6 @@ sub worker_loop {
warn "W: ".$r->code . " $u\n"
}
- # check bad links
- my @at = grep(/@/, @links);
- print "BAD: $u ", join("\n", @at), "\n" if @at;
-
my $s;
# blocking
foreach my $l (@links, "DONE\t$u") {
diff --git a/t/mid.t b/t/mid.t
new file mode 100644
index 0000000..b0af838
--- /dev/null
+++ b/t/mid.t
@@ -0,0 +1,11 @@
+# Copyright (C) 2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use Test::More;
+use PublicInbox::MID qw(mid_escape);
+
+is(mid_escape('foo!@(bar)'), 'foo!@(bar)');
+is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)');
+is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)');
+
+done_testing();
+1;
diff --git a/t/nntp.t b/t/nntp.t
index de07abb..7500d6b 100644
--- a/t/nntp.t
+++ b/t/nntp.t
@@ -112,7 +112,7 @@ use_ok 'PublicInbox::Inbox';
PublicInbox::NNTP::set_nntp_headers($mime->header_obj, $ng, 1, $mid);
is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
'Message-ID unchanged');
- is_deeply([ $mime->header('Archived-At') ], [ "<${u}a%40b/>" ],
+ is_deeply([ $mime->header('Archived-At') ], [ "<${u}a\@b/>" ],
'Archived-At: set');
is_deeply([ $mime->header('List-Archive') ], [ "<$u>" ],
'List-Archive: set');
@@ -128,7 +128,7 @@ use_ok 'PublicInbox::Inbox';
is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
'Message-ID unchanged');
is_deeply([ $mime->header('Archived-At') ],
- [ "<${u}a%40b/>", '<http://mirror.example.com/m/a%40b/>' ],
+ [ "<${u}a\@b/>", '<http://mirror.example.com/m/a@b/>' ],
'Archived-At: appended');
is_deeply([ $mime->header('Xref') ], [ 'example.com test:2' ],
'Old Xref: clobbered');
diff --git a/t/plack.t b/t/plack.t
index db3a9b2..608afb9 100644
--- a/t/plack.t
+++ b/t/plack.t
@@ -97,7 +97,7 @@ EOF
foreach my $t (qw(t T)) {
test_psgi($app, sub {
my ($cb) = @_;
- my $u = $pfx . "/blah%40example.com/$t";
+ my $u = $pfx . "/blah\@example.com/$t";
my $res = $cb->(GET($u));
is(301, $res->code, "redirect for missing /");
my $location = $res->header('Location');
@@ -108,11 +108,11 @@ EOF
foreach my $t (qw(f)) {
test_psgi($app, sub {
my ($cb) = @_;
- my $u = $pfx . "/blah%40example.com/$t";
+ my $u = $pfx . "/blah\@example.com/$t";
my $res = $cb->(GET($u));
is(301, $res->code, "redirect for legacy /f");
my $location = $res->header('Location');
- like($location, qr!/blah%40example\.com/\z!,
+ like($location, qr!/blah\@example\.com/\z!,
'redirected with missing /');
});
}
@@ -124,7 +124,7 @@ EOF
is(200, $res->code, 'success response received');
like($res->content, qr!href="new\.atom"!,
'atom URL generated');
- like($res->content, qr!href="blah%40example\.com/"!,
+ like($res->content, qr!href="blah\@example\.com/"!,
'index generated');
});
@@ -133,13 +133,13 @@ EOF
my $res = $cb->(GET($pfx . '/atom.xml'));
is(200, $res->code, 'success response received for atom');
like($res->content,
- qr!link\s+href="\Q$pfx\E/blah%40example\.com/"!s,
+ qr!link\s+href="\Q$pfx\E/blah\@example\.com/"!s,
'atom feed generated correct URL');
});
test_psgi($app, sub {
my ($cb) = @_;
- my $path = '/blah%40example.com/';
+ my $path = '/blah@example.com/';
my $res = $cb->(GET($pfx . $path));
is(200, $res->code, "success for $path");
like($res->content, qr!<title>hihi - Me</title>!,
@@ -149,13 +149,13 @@ EOF
$res = $cb->(GET($pfx . $path));
is(301, $res->code, "redirect for $path");
my $location = $res->header('Location');
- like($location, qr!/blah%40example\.com/\z!,
+ like($location, qr!/blah\@example\.com/\z!,
'/$MESSAGE_ID/f/ redirected to /$MESSAGE_ID/');
});
test_psgi($app, sub {
my ($cb) = @_;
- my $res = $cb->(GET($pfx . '/blah%40example.com/raw'));
+ my $res = $cb->(GET($pfx . '/blah@example.com/raw'));
is(200, $res->code, 'success response received for /*/raw');
like($res->content, qr!^From !sm, "mbox returned");
});
@@ -164,10 +164,10 @@ EOF
foreach my $t (qw(m f)) {
test_psgi($app, sub {
my ($cb) = @_;
- my $res = $cb->(GET($pfx . "/$t/blah%40example.com.txt"));
+ my $res = $cb->(GET($pfx . "/$t/blah\@example.com.txt"));
is(301, $res->code, "redirect for old $t .txt link");
my $location = $res->header('Location');
- like($location, qr!/blah%40example\.com/raw\z!,
+ like($location, qr!/blah\@example\.com/raw\z!,
".txt redirected to /raw");
});
}
@@ -180,22 +180,22 @@ EOF
while (my ($t, $e) = each %umap) {
test_psgi($app, sub {
my ($cb) = @_;
- my $res = $cb->(GET($pfx . "/$t/blah%40example.com.html"));
+ my $res = $cb->(GET($pfx . "/$t/blah\@example.com.html"));
is(301, $res->code, "redirect for old $t .html link");
my $location = $res->header('Location');
like($location,
- qr!/blah%40example\.com/$e(?:#u)?\z!,
+ qr!/blah\@example\.com/$e(?:#u)?\z!,
".html redirected to new location");
});
}
foreach my $sfx (qw(mbox mbox.gz)) {
test_psgi($app, sub {
my ($cb) = @_;
- my $res = $cb->(GET($pfx . "/t/blah%40example.com.$sfx"));
+ my $res = $cb->(GET($pfx . "/t/blah\@example.com.$sfx"));
is(301, $res->code, 'redirect for old thread link');
my $location = $res->header('Location');
like($location,
- qr!/blah%40example\.com/t\.mbox(?:\.gz)?\z!,
+ qr!/blah\@example\.com/t\.mbox(?:\.gz)?\z!,
"$sfx redirected to /mbox.gz");
});
}
diff --git a/t/psgi_mount.t b/t/psgi_mount.t
index dae45ba..4a515c6 100644
--- a/t/psgi_mount.t
+++ b/t/psgi_mount.t
@@ -67,7 +67,7 @@ test_psgi($app, sub {
is($res->code, 200, 'OK with URLMap mount');
$res = $cb->(GET('/a/test/m/blah%40example.com.html'));
is($res->header('Location'),
- 'http://localhost/a/test/blah%40example.com/',
+ 'http://localhost/a/test/blah@example.com/',
'redirect functions properly under mount');
$res = $cb->(GET('/test/blah%40example.com/'));
--
EW
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 4/8] www: do not double-clean Message-IDs from internal DBs
2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
2016-08-13 23:53 ` [PATCH 2/8] searchidx: do not release Xapian lock while (only) Msgmap is indexing Eric Wong
2016-08-13 23:53 ` [PATCH 3/8] www: do not unecessarily escape some chars in paths Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
2016-08-13 23:53 ` [PATCH 5/8] mid: no wide characters for sha1_hex Eric Wong
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
To: spew
Ensure we always strip one level of '<>' from Message-IDs,
since our internal SQLite, Xapian, and SHA-1 storage all
assume that.
Realistically, we may screw up if somebody has '<<' or '>>',
but those are screwed up mail clients.
---
lib/PublicInbox/Feed.pm | 3 +--
lib/PublicInbox/Hval.pm | 1 -
lib/PublicInbox/View.pm | 2 +-
3 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm
index 232a91c..25fec10 100644
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -266,8 +266,7 @@ sub feed_entry {
my $midurl = $feed_opts->{midurl};
my $header_obj = $mime->header_obj;
- my $mid = $header_obj->header_raw('Message-ID');
- defined $mid or return;
+ my $mid = mid_clean($header_obj->header_raw('Message-ID'));
$mid = PublicInbox::Hval->new_msgid($mid);
my $href = $midurl . $mid->{href}. '/';
diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm
index ab05238..b354aa4 100644
--- a/lib/PublicInbox/Hval.pm
+++ b/lib/PublicInbox/Hval.pm
@@ -32,7 +32,6 @@ sub new {
sub new_msgid {
my ($class, $msgid) = @_;
- $msgid = mid_clean($msgid);
$class->new($msgid, mid_escape($msgid));
}
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 47ffc62..5b352d1 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -491,7 +491,7 @@ sub _msg_html_prepare {
$ctx->{-upfx} = '../';
}
my @title;
- my $mid = $hdr->header_raw('Message-ID');
+ my $mid = mid_clean($hdr->header_raw('Message-ID'));
$mid = PublicInbox::Hval->new_msgid($mid);
foreach my $h (qw(From To Cc Subject Date)) {
my $v = $hdr->header($h);
--
EW
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 5/8] mid: no wide characters for sha1_hex
2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
` (2 preceding siblings ...)
2016-08-13 23:53 ` [PATCH 4/8] www: do not double-clean Message-IDs from internal DBs Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
2016-08-13 23:53 ` [PATCH 6/8] view: allow for missing In-Reply-To mapping Eric Wong
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
To: spew
Apparently there are some really screwed up In-Reply-To
fields out there.
---
lib/PublicInbox/MID.pm | 1 +
1 file changed, 1 insertion(+)
diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm
index 74a4875..b81fc15 100644
--- a/lib/PublicInbox/MID.pm
+++ b/lib/PublicInbox/MID.pm
@@ -26,6 +26,7 @@ sub id_compress {
my ($id, $force) = @_;
if ($force || $id =~ /[^\w\-]/ || length($id) > MID_MAX) {
+ utf8::encode($id);
return sha1_hex($id);
}
$id;
--
EW
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 6/8] view: allow for missing In-Reply-To mapping
2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
` (3 preceding siblings ...)
2016-08-13 23:53 ` [PATCH 5/8] mid: no wide characters for sha1_hex Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
2016-08-13 23:53 ` [PATCH 7/8] view: remove redundant pre closing tag Eric Wong
2016-08-13 23:53 ` [PATCH 8/8] squash Eric Wong
6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
To: spew
Because buggy mail clients exist and generate invalid
In-Reply-To headers we cannot handle across the board...
---
lib/PublicInbox/View.pm | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 5b352d1..3f25028 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -207,8 +207,8 @@ sub _th_index_lite {
my $nr_s = 0;
my $level = $map->[4];
my $idx = $map->[3];
- if (defined $irt) {
- my $irt_map = $mapping->{$irt};
+ my $irt_map = $mapping->{$irt} if defined $irt;
+ if (defined $irt_map) {
my $siblings = $irt_map->[0];
$nr_s = scalar(@$siblings) - 1;
$rv .= $pad . $irt_map->[1];
--
EW
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 7/8] view: remove redundant pre closing tag
2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
` (4 preceding siblings ...)
2016-08-13 23:53 ` [PATCH 6/8] view: allow for missing In-Reply-To mapping Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
2016-08-13 23:53 ` [PATCH 8/8] squash Eric Wong
6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
To: spew
---
lib/PublicInbox/View.pm | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 3f25028..6f79f60 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -377,7 +377,7 @@ sub thread_html {
return index_entry($mime, $ctx, scalar @$msgs);
}
$msgs = undef;
- '</pre>'.$skel;
+ $skel;
});
}
--
EW
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 8/8] squash
2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
` (5 preceding siblings ...)
2016-08-13 23:53 ` [PATCH 7/8] view: remove redundant pre closing tag Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
To: spew
---
lib/PublicInbox/MID.pm | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm
index b81fc15..1c2d75c 100644
--- a/lib/PublicInbox/MID.pm
+++ b/lib/PublicInbox/MID.pm
@@ -38,7 +38,9 @@ sub mid2path {
unless (defined $x38) {
# compatibility with old links (or short Message-IDs :)
- $mid = sha1_hex(mid_clean($mid));
+ $mid = mid_clean($mid);
+ utf8::encode($mid);
+ $mid = sha1_hex($mid);
($x2, $x38) = ($mid =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/);
}
"$x2/$x38";
--
EW
^ permalink raw reply related [flat|nested] 8+ messages in thread
end of thread, other threads:[~2016-08-13 23:53 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
2016-08-13 23:53 ` [PATCH 2/8] searchidx: do not release Xapian lock while (only) Msgmap is indexing Eric Wong
2016-08-13 23:53 ` [PATCH 3/8] www: do not unecessarily escape some chars in paths Eric Wong
2016-08-13 23:53 ` [PATCH 4/8] www: do not double-clean Message-IDs from internal DBs Eric Wong
2016-08-13 23:53 ` [PATCH 5/8] mid: no wide characters for sha1_hex Eric Wong
2016-08-13 23:53 ` [PATCH 6/8] view: allow for missing In-Reply-To mapping Eric Wong
2016-08-13 23:53 ` [PATCH 7/8] view: remove redundant pre closing tag Eric Wong
2016-08-13 23:53 ` [PATCH 8/8] squash Eric Wong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).