dumping ground for random patches and texts
 help / color / mirror / Atom feed
* [PATCH 1/8] import_slrnspool: reimplement using fast-import
@ 2016-08-13 23:53 Eric Wong
  2016-08-13 23:53 ` [PATCH 2/8] searchidx: do not release Xapian lock while (only) Msgmap is indexing Eric Wong
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
  To: spew

I needed to use this to resurrect some messages missing
from my initial downloads from gmane...
---
 scripts/import_slrnspool | 94 ++++++++++++++++++++----------------------------
 1 file changed, 38 insertions(+), 56 deletions(-)

diff --git a/scripts/import_slrnspool b/scripts/import_slrnspool
index 687809b..9885059 100755
--- a/scripts/import_slrnspool
+++ b/scripts/import_slrnspool
@@ -1,6 +1,6 @@
 #!/usr/bin/perl -w
-# Copyright (C) 2015 all contributors <meta@public-inbox.org>
-# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
+# Copyright (C) 2015-2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 #
 # Incremental (or one-shot) importer of a slrnpull news spool
 =begin usage
@@ -11,8 +11,9 @@
 use strict;
 use warnings;
 use PublicInbox::Config;
-use Email::Filter;
-use Email::LocalDelivery;
+use Email::MIME;
+use PublicInbox::Import;
+use PublicInbox::Git;
 sub usage { "Usage:\n".join('',grep(/\t/, `head -n 10 $0`)) }
 my $exit = 0;
 my $sighandler = sub { $exit = 1 };
@@ -22,43 +23,33 @@ my $spool = shift @ARGV or die usage();
 my $recipient = $ENV{ORIGINAL_RECIPIENT};
 defined $recipient or die usage();
 my $config = PublicInbox::Config->new;
-my $cfg = $config->lookup($recipient);
-defined $cfg or exit(1);
-my @mda;
-if ($ENV{'FILTER'}) {
-	@mda = qw(public-inbox-mda);
-} else {
-	@mda = (qw(ssoma-mda -1), $cfg->{mainrepo});
-}
+my $ibx = $config->lookup($recipient);
+my $git = $ibx->git;
+my $im = PublicInbox::Import->new($git, $ibx->{name}, $ibx->{-primary_address});
 
 sub key {
-	my ($cfg) = @_;
-	"publicinbox.$cfg->{inbox}.importslrnspoolstate";
+	"publicinbox.$ibx->{name}.importslrnspoolstate";
 }
 
 sub get_min {
 	my $f = PublicInbox::Config->default_file;
-	my @cmd = (qw/git config/, "--file=$f", key($cfg));
-	use IPC::Run qw/run/;
-
-	my $in = '';
-	my $out = '';
-	unless (run(\@cmd, \$in, \$out)) {
-		$out = 0;
-	}
-	int($out);
+	my $out = $git->qx('config', "--file=$f", key($ibx));
+	$out ||= 0;
+	chomp $out;
+	$out =~ /\A\d+\z/ and return $out;
+	0;
 }
 
 sub set_min {
-	my ($cfg, $num) = @_;
+	my ($num) = @_;
 	my $f = PublicInbox::Config->default_file;
-	my @cmd = (qw/git config/, "--file=$f", key($cfg), $num);
+	my @cmd = (qw/git config/, "--file=$f", key($ibx), $num);
 	system(@cmd) == 0 or die join(' ', @cmd). " failed: $?\n";
 }
 
 my $n = get_min();
 my $ok;
-my $max_gap = 10000;
+my $max_gap = 200000;
 my $max = $n + $max_gap;
 
 for (; $exit == 0 && $n < $max; $n++) {
@@ -67,40 +58,31 @@ for (; $exit == 0 && $n < $max; $n++) {
 	open(my $fh, '<', $fn) or next;
 	$max = $n + $max_gap;
 
-	# prevent process growth by forking a new process for each message
-	my $pid = fork;
-	die "failed to fork: $!\n" unless defined $pid;
-
-	if ($pid == 0) {
-		my $f = Email::Filter->new(data => eval { local $/; <$fh> });
-		close $fh;
-		$fh = undef;
-		my $s = $f->simple;
+	my $mime = Email::MIME->new(eval { local $/; <$fh> });
+	my $hdr = $mime->header_obj;
 
-		# gmane rewrites Received headers, which increases spamminess
-		# Some older archives set Original-To
-		foreach my $x (qw(Received To)) {
-			my @h = $s->header("Original-$x");
-			if (@h) {
-				$s->header_set($x, @h);
-				$s->header_set("Original-$x");
-			}
+	# gmane rewrites Received headers, which increases spamminess
+	# Some older archives set Original-To
+	foreach my $x (qw(Received To)) {
+		my @h = $hdr->header_raw("Original-$x");
+		if (@h) {
+			$hdr->header_set($x, @h);
+			$hdr->header_set("Original-$x");
 		}
+	}
 
-		# triggers for the SA HEADER_SPAM rule
-		foreach my $drop (qw(Approved)) { $s->header_set($drop) }
+	# Approved triggers for the SA HEADER_SPAM rule,
+	# X-From is gmane specific
+	foreach my $drop (qw(Approved X-From)) {
+		$hdr->header_set($drop);
+	}
 
-		# appears to be an old gmane bug:
-		$s->header_set('connect()');
+	# appears to be an old gmane bug:
+	$hdr->header_set('connect()');
+	$im->add($mime);
 
-		$f->exit(0);
-		$f->pipe(@mda);
-		exit 0;
-	} else {
-		close $fh;
-		waitpid($pid, 0);
-		die "error: $?\n" if $?;
-	}
 	$ok = $n + 1;
-	set_min($cfg, $ok);
+	set_min($ok);
 }
+
+$im->done;
-- 
EW


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/8] searchidx: do not release Xapian lock while (only) Msgmap is indexing
  2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
  2016-08-13 23:53 ` [PATCH 3/8] www: do not unecessarily escape some chars in paths Eric Wong
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
  To: spew

SQLite might index quickly, so we hold the lock used by Xapian
for the duration.  This probably needs to be reworked entirely,
actually.
---
 lib/PublicInbox/SearchIdx.pm | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 0eb07a1..f8155ec 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -436,19 +436,24 @@ sub _index_sync {
 
 	my $mm = _msgmap_init($self);
 	my $dbh = $mm->{dbh} if $mm;
+	my $mm_only;
 	my $cb = sub {
 		my ($commit, $more) = @_;
 		if ($dbh) {
 			$mm->last_commit($commit) if $commit;
 			$dbh->commit;
 		}
-		$xdb->set_metadata($mkey, $commit) if $mkey && $commit;
-		$xdb->commit_transaction;
-		$xdb = _xdb_release($self);
+		if (!$mm_only) {
+			$xdb->set_metadata($mkey, $commit) if $mkey && $commit;
+			$xdb->commit_transaction;
+			$xdb = _xdb_release($self);
+		}
 		# let another process do some work... <
 		if ($more) {
-			$xdb = _xdb_acquire($self);
-			$xdb->begin_transaction;
+			if (!$mm_only) {
+				$xdb = _xdb_acquire($self);
+				$xdb->begin_transaction;
+			}
 			$dbh->begin_work if $dbh;
 		}
 	};
@@ -472,14 +477,13 @@ sub _index_sync {
 			my $mkey_prev = $mkey;
 			$mkey = undef; # ignore xapian, for now
 			my $mlog = _git_log($self, $r);
+			$mm_only = 1;
 			rlog($self, $mlog, *index_mm, *unindex_mm, $cb);
-			$mlog = undef;
+			$mm_only = $mlog = undef;
 
 			# now deal with Xapian
 			$mkey = $mkey_prev;
 			$dbh = undef;
-			$xdb = _xdb_acquire($self);
-			$xdb->begin_transaction;
 			rlog($self, $xlog, *index_mm2, *unindex_mm2, $cb);
 		}
 	} else {
-- 
EW


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/8] www: do not unecessarily escape some chars in paths
  2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
  2016-08-13 23:53 ` [PATCH 2/8] searchidx: do not release Xapian lock while (only) Msgmap is indexing Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
  2016-08-13 23:53 ` [PATCH 4/8] www: do not double-clean Message-IDs from internal DBs Eric Wong
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
  To: spew

Based on reading RFC 3986, it seems '@', ':', '!', '$', '&',
"'", '; '(', ')', '*', '+', ',', ';', '=' are all allowed
in path-absolute where we have the Message-ID.

In any case, it seems '@' is fairly common in path components
nowadays and too common in Message-IDs.
---
 MANIFEST                      |  1 +
 lib/PublicInbox/ExtMsg.pm     |  6 +++---
 lib/PublicInbox/Feed.pm       |  4 ++--
 lib/PublicInbox/Hval.pm       |  5 ++---
 lib/PublicInbox/MID.pm        |  7 ++++++-
 lib/PublicInbox/NNTP.pm       |  4 ++--
 lib/PublicInbox/SearchView.pm |  6 +++---
 lib/PublicInbox/View.pm       | 31 +++++++++++++++----------------
 lib/PublicInbox/WWW.pm        |  5 +++--
 scripts/ssoma-replay          |  2 +-
 t/cgi.t                       | 14 +++++++-------
 t/check-www-inbox.perl        |  4 ----
 t/mid.t                       | 11 +++++++++++
 t/nntp.t                      |  4 ++--
 t/plack.t                     | 28 ++++++++++++++--------------
 t/psgi_mount.t                |  2 +-
 16 files changed, 73 insertions(+), 61 deletions(-)
 create mode 100644 t/mid.t

diff --git a/MANIFEST b/MANIFEST
index f5ea455..bed6050 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -132,6 +132,7 @@ t/init.t
 t/linkify.t
 t/main-bin/spamc
 t/mda.t
+t/mid.t
 t/msg_iter.t
 t/msgmap.t
 t/nntp.t
diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm
index 955ada7..b93d555 100644
--- a/lib/PublicInbox/ExtMsg.pm
+++ b/lib/PublicInbox/ExtMsg.pm
@@ -100,7 +100,7 @@ again:
 
 	my $code = 404;
 	my $h = PublicInbox::Hval->new_msgid($mid);
-	my $href = $h->as_href;
+	my $href = $h->{href};
 	my $html = $h->as_html;
 	my $title = "&lt;$html&gt; not found";
 	my $s = "<pre>Message-ID &lt;$html&gt;\nnot found\n";
@@ -115,7 +115,7 @@ again:
 			my $u = $ibx->base_url($env) or next;
 			foreach my $m (@$res) {
 				my $p = PublicInbox::Hval->new_msgid($m);
-				my $r = $p->as_href;
+				my $r = $p->{href};
 				my $t = $p->as_html;
 				$s .= qq{<a\nhref="$u$r/">$u$t/</a>\n};
 			}
@@ -153,7 +153,7 @@ sub ext_urls {
 sub exact {
 	my ($ctx, $found, $mid) = @_;
 	my $h = PublicInbox::Hval->new_msgid($mid);
-	my $href = $h->as_href;
+	my $href = $h->{href};
 	my $html = $h->as_html;
 	my $title = "&lt;$html&gt; found in ";
 	my $end = @$found == 1 ? 'another inbox' : 'other inboxes';
diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm
index 240c336..232a91c 100644
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -137,7 +137,7 @@ sub emit_atom_thread {
 	my $fh = $cb->([200, ['Content-Type' => 'application/atom+xml']]);
 	my $ibx = $ctx->{-inbox};
 	my $html_url = $ibx->base_url($ctx->{env});
-	$html_url .= PublicInbox::Hval->new_msgid($mid)->as_href;
+	$html_url .= PublicInbox::Hval->new_msgid($mid)->{href};
 
 	$feed_opts->{url} = $html_url;
 	$feed_opts->{emit_header} = 1;
@@ -269,7 +269,7 @@ sub feed_entry {
 	my $mid = $header_obj->header_raw('Message-ID');
 	defined $mid or return;
 	$mid = PublicInbox::Hval->new_msgid($mid);
-	my $href = $midurl . $mid->as_href . '/';
+	my $href = $midurl . $mid->{href}. '/';
 
 	my $date = $header_obj->header('Date');
 	my $updated = feed_updated($date);
diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm
index 652aef3..ab05238 100644
--- a/lib/PublicInbox/Hval.pm
+++ b/lib/PublicInbox/Hval.pm
@@ -8,7 +8,7 @@ use strict;
 use warnings;
 use Encode qw(find_encoding);
 use URI::Escape qw(uri_escape_utf8);
-use PublicInbox::MID qw/mid_clean/;
+use PublicInbox::MID qw/mid_clean mid_escape/;
 use base qw/Exporter/;
 our @EXPORT_OK = qw/ascii_html/;
 
@@ -33,7 +33,7 @@ sub new {
 sub new_msgid {
 	my ($class, $msgid) = @_;
 	$msgid = mid_clean($msgid);
-	$class->new($msgid, $msgid);
+	$class->new($msgid, mid_escape($msgid));
 }
 
 sub new_oneline {
@@ -60,7 +60,6 @@ sub ascii_html {
 }
 
 sub as_html { ascii_html($_[0]->{raw}) }
-sub as_href { ascii_html(uri_escape_utf8($_[0]->{href})) }
 
 sub raw {
 	if (defined $_[1]) {
diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm
index 78952b9..74a4875 100644
--- a/lib/PublicInbox/MID.pm
+++ b/lib/PublicInbox/MID.pm
@@ -6,7 +6,8 @@ package PublicInbox::MID;
 use strict;
 use warnings;
 use base qw/Exporter/;
-our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime/;
+our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape/;
+use URI::Escape qw(uri_escape_utf8);
 use Digest::SHA qw/sha1_hex/;
 use constant MID_MAX => 40; # SHA-1 hex length
 
@@ -44,4 +45,8 @@ sub mid2path {
 
 sub mid_mime ($) { $_[0]->header_obj->header_raw('Message-ID') }
 
+# RFC3986, section 3.3:
+sub MID_ESC () { '^A-Za-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@' }
+sub mid_escape ($) { uri_escape_utf8($_[0], MID_ESC) }
+
 1;
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 0c61dd8..7bfc6dd 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -9,12 +9,12 @@ use base qw(Danga::Socket);
 use fields qw(nntpd article rbuf ng long_res);
 use PublicInbox::Search;
 use PublicInbox::Msgmap;
+use PublicInbox::MID qw(mid_escape);
 use PublicInbox::Git;
 require PublicInbox::EvCleanup;
 use Email::Simple;
 use POSIX qw(strftime);
 use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC);
-use URI::Escape qw(uri_escape_utf8);
 use constant {
 	r501 => '501 command syntax error',
 	r221 => '221 Header follows',
@@ -421,7 +421,7 @@ sub set_nntp_headers {
 	$hdr->header_set('Xref', xref($ng, $n));
 	header_append($hdr, 'List-Post', "<mailto:$ng->{-primary_address}>");
 	if (my $url = $ng->base_url) {
-		$mid = uri_escape_utf8($mid);
+		$mid = mid_escape($mid);
 		header_append($hdr, 'Archived-At', "<$url$mid/>");
 		header_append($hdr, 'List-Archive', "<$url>");
 	}
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 80a2ff7..bf879c6 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -8,7 +8,7 @@ use warnings;
 use PublicInbox::SearchMsg;
 use PublicInbox::Hval qw/ascii_html/;
 use PublicInbox::View;
-use PublicInbox::MID qw(mid2path mid_mime mid_clean);
+use PublicInbox::MID qw(mid2path mid_mime mid_clean mid_escape);
 use Email::MIME;
 require PublicInbox::Git;
 require PublicInbox::Thread;
@@ -74,7 +74,7 @@ sub mset_summary {
 		my $s = ascii_html($smsg->subject);
 		my $f = ascii_html($smsg->from_name);
 		my $ts = PublicInbox::View::fmt_ts($smsg->ts);
-		my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->as_href;
+		my $mid = PublicInbox::Hval->new_msgid($smsg->mid)->{href};
 		$$res .= qq{$rank. <b><a\nhref="$mid/">}.
 			$s . "</a></b>\n";
 		$$res .= "$pfx  - by $f @ $ts UTC [$pct%]\n\n";
@@ -263,7 +263,7 @@ sub qs_html {
 		$self = $tmp;
 	}
 
-	my $q = PublicInbox::Hval->new($self->{'q'})->as_href;
+	my $q = mid_escape($self->{'q'});
 	$q =~ s/%20/+/g; # improve URL readability
 	my $qs = "q=$q";
 
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index ed3c96e..47ffc62 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -10,7 +10,7 @@ use URI::Escape qw/uri_escape_utf8/;
 use Date::Parse qw/str2time/;
 use PublicInbox::Hval qw/ascii_html/;
 use PublicInbox::Linkify;
-use PublicInbox::MID qw/mid_clean id_compress mid_mime/;
+use PublicInbox::MID qw/mid_clean id_compress mid_mime mid_escape/;
 use PublicInbox::MsgIter;
 use PublicInbox::Address;
 use PublicInbox::WwwStream;
@@ -125,7 +125,6 @@ sub index_entry {
 	my $mid_raw = mid_clean(mid_mime($mime));
 	my $id = id_compress($mid_raw, 1);
 	my $id_m = 'm'.$id;
-	my $mid = PublicInbox::Hval->new_msgid($mid_raw);
 
 	my $root_anchor = $ctx->{root_anchor} || '';
 	my $irt = in_reply_to($hdr);
@@ -142,7 +141,7 @@ sub index_entry {
 	}
 	$rv .= "From: "._hdr_names($hdr, 'From').' @ '._msg_date($hdr)." UTC";
 	my $upfx = $ctx->{-upfx};
-	my $mhref = $upfx . $mid->as_href . '/';
+	my $mhref = $upfx . mid_escape($mid_raw) . '/';
 	$rv .= qq{ (<a\nhref="$mhref">permalink</a> / };
 	$rv .= qq{<a\nhref="${mhref}raw">raw</a>)\n};
 	$rv .= '  '.join('; +', @tocc) . "\n" if @tocc;
@@ -150,7 +149,7 @@ sub index_entry {
 	my $mapping = $ctx->{mapping};
 	if (!$mapping && $irt) {
 		my $mirt = PublicInbox::Hval->new_msgid($irt);
-		my $href = $upfx . $mirt->as_href . '/';
+		my $href = $upfx . $mirt->{href}. '/';
 		my $html = $mirt->as_html;
 		$rv .= qq(In-Reply-To: &lt;<a\nhref="$href">$html</a>&gt;\n)
 	}
@@ -568,7 +567,7 @@ sub _parent_headers {
 	if (defined $irt) {
 		my $v = PublicInbox::Hval->new_msgid($irt);
 		my $html = $v->as_html;
-		my $href = $v->as_href;
+		my $href = $v->{href};
 		$rv .= "In-Reply-To: &lt;";
 		$rv .= "<a\nhref=\"../$href/\">$html</a>&gt;\n";
 	}
@@ -627,7 +626,7 @@ sub mailto_arg_link {
 	$subj = "Re: $subj" unless $subj =~ /\bRe:/i;
 	my $mid = $hdr->header_raw('Message-ID');
 	push @arg, '--in-reply-to='.squote_maybe(mid_clean($mid));
-	my $irt = uri_escape_utf8($mid);
+	my $irt = mid_escape($mid);
 	delete $cc{$to};
 	push @arg, "--to=$to";
 	$to = uri_escape_utf8($to);
@@ -657,17 +656,17 @@ sub html_footer {
 		$next = $prev = '    ';
 
 		if (my $n = $ctx->{next_msg}) {
-			$n = PublicInbox::Hval->new_msgid($n)->as_href;
+			$n = PublicInbox::Hval->new_msgid($n)->{href};
 			$next = "<a\nhref=\"$upfx$n/\"\nrel=next>next</a>";
 		}
 		my $u;
 		my $par = $ctx->{parent_msg};
 		if ($par) {
-			$u = PublicInbox::Hval->new_msgid($par)->as_href;
+			$u = PublicInbox::Hval->new_msgid($par)->{href};
 			$u = "$upfx$u/";
 		}
 		if (my $p = $ctx->{prev_msg}) {
-			$prev = PublicInbox::Hval->new_msgid($p)->as_href;
+			$prev = PublicInbox::Hval->new_msgid($p)->{href};
 			if ($p && $par && $p eq $par) {
 				$prev = "<a\nhref=\"$upfx$prev/\"\n" .
 					'rel=prev>prev parent</a>';
@@ -692,7 +691,7 @@ sub html_footer {
 sub linkify_ref_nosrch {
 	my $v = PublicInbox::Hval->new_msgid($_[0]);
 	my $html = $v->as_html;
-	my $href = $v->as_href;
+	my $href = $v->{href};
 	"&lt;<a\nhref=\"../$href/\">$html</a>&gt;";
 }
 
@@ -707,7 +706,7 @@ sub ghost_parent {
 	return '[no common parent]' if ($mid eq 'subject dummy');
 
 	$mid = PublicInbox::Hval->new_msgid($mid);
-	my $href = $mid->as_href;
+	my $href = $mid->{href};
 	my $html = $mid->as_html;
 	qq{[parent not found: &lt;<a\nhref="$upfx$href/">$html</a>&gt;]};
 }
@@ -793,7 +792,7 @@ sub _skel_header {
 		$s = PublicInbox::Hval->new($s);
 		$s = $s->as_html;
 	}
-	my $m = PublicInbox::Hval->new_msgid($mid);
+	my $m;
 	my $id = '';
 	my $mapping = $ctx->{mapping};
 	my $end = defined($s) ? "$s</a> $f\n" : "$f</a>\n";
@@ -804,7 +803,7 @@ sub _skel_header {
 		$map->[1] = "$d<a\nhref=\"$m\">$end";
 		$id = "\nid=r".$id;
 	} else {
-		$m = $ctx->{-upfx}.$m->as_href.'/';
+		$m = $ctx->{-upfx}.mid_escape($mid).'/';
 	}
 	$$dst .=  $d . "<a\nhref=\"$m\"$id>" . $end;
 }
@@ -829,7 +828,7 @@ sub skel_dump {
 		$d .= indent_for($level) . th_pfx($level);
 		my $upfx = $ctx->{-upfx};
 		my $m = PublicInbox::Hval->new_msgid($mid);
-		my $href = $upfx . $m->as_href . '/';
+		my $href = $upfx . $m->{href} . '/';
 		my $html = $m->as_html;
 
 		if ($map) {
@@ -911,7 +910,7 @@ sub dump_topics {
 		@$topic = ();
 		next unless defined $top;  # ghost topic
 		my $mid = delete $seen->{$top};
-		my $href = PublicInbox::Hval->new_msgid($mid)->as_href;
+		my $href = mid_escape($mid);
 		$top = PublicInbox::Hval->new($top)->as_html;
 		$ts = fmt_ts($ts);
 
@@ -935,7 +934,7 @@ sub dump_topics {
 			my $sub = $ex[$i + 1];
 			$mid = delete $seen->{$sub};
 			$sub = PublicInbox::Hval->new($sub)->as_html;
-			$href = PublicInbox::Hval->new_msgid($mid)->as_href;
+			$href = mid_escape($mid);
 			$s .= indent_for($level) . TCHILD;
 			$s .= "<a\nhref=\"$href/T/#u\">$sub</a>\n";
 		}
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 60cb443..6f6a003 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -15,7 +15,8 @@ use strict;
 use warnings;
 use PublicInbox::Config;
 use PublicInbox::Hval;
-use URI::Escape qw(uri_escape_utf8 uri_unescape);
+use URI::Escape qw(uri_unescape);
+use PublicInbox::MID qw(mid_escape);
 require PublicInbox::Git;
 use PublicInbox::GitHTTPBackend;
 our $INBOX_RE = qr!\A/([\w\.\-]+)!;
@@ -353,7 +354,7 @@ sub r301 {
 	}
 	my $url = $obj->base_url($ctx->{env});
 	my $qs = $ctx->{env}->{QUERY_STRING};
-	$url .= (uri_escape_utf8($mid) . '/') if (defined $mid);
+	$url .= (mid_escape($mid) . '/') if (defined $mid);
 	$url .= $suffix if (defined $suffix);
 	$url .= "?$qs" if $qs ne '';
 
diff --git a/scripts/ssoma-replay b/scripts/ssoma-replay
index 91c121d..5b6b8af 100755
--- a/scripts/ssoma-replay
+++ b/scripts/ssoma-replay
@@ -53,7 +53,7 @@ if (defined $list_id) {
 		if ($mid =~ /\A<(.+)>\z/) {
 			$mid = $1;
 		}
-		$mid = uri_escape_utf8($mid);
+		$mid = uri_escape_utf8($mid, '^A-Za-z0-9\-\._~@');
 		$header_obj->header_set('List-Archive', "<$archive_url>");
 
 		foreach my $h (qw(Help Unsubscribe Subscribe Owner)) {
diff --git a/t/cgi.t b/t/cgi.t
index a0f09c5..092ad8c 100644
--- a/t/cgi.t
+++ b/t/cgi.t
@@ -118,7 +118,7 @@ EOF
 	like($res->{head}, qr/Status:\s*206/i, "info/refs partial past end OK");
 	is($res->{body}, substr($orig, 5), 'partial body OK past end');
 }
-
+use Data::Dumper;
 # atom feeds
 {
 	local $ENV{HOME} = $home;
@@ -126,7 +126,7 @@ EOF
 	like($res->{body}, qr/<title>test for public-inbox/,
 		"set title in XML feed");
 	like($res->{body},
-		qr!http://test\.example\.com/test/blah%40example\.com/!,
+		qr!http://test\.example\.com/test/blah\@example\.com/!,
 		"link id set");
 	like($res->{body}, qr/what\?/, "reply included");
 }
@@ -148,7 +148,7 @@ EOF
 	$im->add($reply);
 	$im->done;
 
-	my $res = cgi_run("/test/slashy%2fasdf%40example.com/raw");
+	my $res = cgi_run("/test/slashy%2fasdf\@example.com/raw");
 	like($res->{body}, qr/Message-Id: <\Q$slashy_mid\E>/,
 		"slashy mid raw hit");
 
@@ -167,20 +167,20 @@ EOF
 	$res = cgi_run("/test/blahblah\@example.com/f/");
 	like($res->{head}, qr/Status: 301 Moved/, "301 response");
 	like($res->{head},
-		qr!^Location: http://[^/]+/test/blahblah%40example\.com/\r\n!ms,
+		qr!^Location: http://[^/]+/test/blahblah\@example\.com/\r\n!ms,
 		'301 redirect location');
 	$res = cgi_run("/test/blahblah\@example.con/");
 	like($res->{head}, qr/Status: 300 Multiple Choices/, "mid html miss");
 
 	$res = cgi_run("/test/new.html");
-	like($res->{body}, qr/slashy%2Fasdf%40example\.com/,
+	like($res->{body}, qr/slashy%2Fasdf\@example\.com/,
 		"slashy URL generated correctly");
 }
 
 # retrieve thread as an mbox
 {
 	local $ENV{HOME} = $home;
-	my $path = "/test/blahblah%40example.com/t.mbox.gz";
+	my $path = "/test/blahblah\@example.com/t.mbox.gz";
 	my $res = cgi_run($path);
 	like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled");
 	my $indexed = system($index, $maindir) == 0;
@@ -200,7 +200,7 @@ EOF
 
 	my $have_xml_feed = eval { require XML::Feed; 1 } if $indexed;
 	if ($have_xml_feed) {
-		$path = "/test/blahblah%40example.com/t.atom";
+		$path = "/test/blahblah\@example.com/t.atom";
 		$res = cgi_run($path);
 		like($res->{head}, qr/^Status: 200 /, "atom returned 200");
 		like($res->{head}, qr!^Content-Type: application/atom\+xml!m,
diff --git a/t/check-www-inbox.perl b/t/check-www-inbox.perl
index 6be631e..4319049 100644
--- a/t/check-www-inbox.perl
+++ b/t/check-www-inbox.perl
@@ -131,10 +131,6 @@ sub worker_loop {
 			warn "W: ".$r->code . " $u\n"
 		}
 
-		# check bad links
-		my @at = grep(/@/, @links);
-		print "BAD: $u ", join("\n", @at), "\n" if @at;
-
 		my $s;
 		# blocking
 		foreach my $l (@links, "DONE\t$u") {
diff --git a/t/mid.t b/t/mid.t
new file mode 100644
index 0000000..b0af838
--- /dev/null
+++ b/t/mid.t
@@ -0,0 +1,11 @@
+# Copyright (C) 2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use Test::More;
+use PublicInbox::MID qw(mid_escape);
+
+is(mid_escape('foo!@(bar)'), 'foo!@(bar)');
+is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)');
+is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)');
+
+done_testing();
+1;
diff --git a/t/nntp.t b/t/nntp.t
index de07abb..7500d6b 100644
--- a/t/nntp.t
+++ b/t/nntp.t
@@ -112,7 +112,7 @@ use_ok 'PublicInbox::Inbox';
 	PublicInbox::NNTP::set_nntp_headers($mime->header_obj, $ng, 1, $mid);
 	is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
 		'Message-ID unchanged');
-	is_deeply([ $mime->header('Archived-At') ], [ "<${u}a%40b/>" ],
+	is_deeply([ $mime->header('Archived-At') ], [ "<${u}a\@b/>" ],
 		'Archived-At: set');
 	is_deeply([ $mime->header('List-Archive') ], [ "<$u>" ],
 		'List-Archive: set');
@@ -128,7 +128,7 @@ use_ok 'PublicInbox::Inbox';
 	is_deeply([ $mime->header('Message-ID') ], [ "<$mid>" ],
 		'Message-ID unchanged');
 	is_deeply([ $mime->header('Archived-At') ],
-		[ "<${u}a%40b/>", '<http://mirror.example.com/m/a%40b/>' ],
+		[ "<${u}a\@b/>", '<http://mirror.example.com/m/a@b/>' ],
 		'Archived-At: appended');
 	is_deeply([ $mime->header('Xref') ], [ 'example.com test:2' ],
 		'Old Xref: clobbered');
diff --git a/t/plack.t b/t/plack.t
index db3a9b2..608afb9 100644
--- a/t/plack.t
+++ b/t/plack.t
@@ -97,7 +97,7 @@ EOF
 	foreach my $t (qw(t T)) {
 		test_psgi($app, sub {
 			my ($cb) = @_;
-			my $u = $pfx . "/blah%40example.com/$t";
+			my $u = $pfx . "/blah\@example.com/$t";
 			my $res = $cb->(GET($u));
 			is(301, $res->code, "redirect for missing /");
 			my $location = $res->header('Location');
@@ -108,11 +108,11 @@ EOF
 	foreach my $t (qw(f)) {
 		test_psgi($app, sub {
 			my ($cb) = @_;
-			my $u = $pfx . "/blah%40example.com/$t";
+			my $u = $pfx . "/blah\@example.com/$t";
 			my $res = $cb->(GET($u));
 			is(301, $res->code, "redirect for legacy /f");
 			my $location = $res->header('Location');
-			like($location, qr!/blah%40example\.com/\z!,
+			like($location, qr!/blah\@example\.com/\z!,
 				'redirected with missing /');
 		});
 	}
@@ -124,7 +124,7 @@ EOF
 		is(200, $res->code, 'success response received');
 		like($res->content, qr!href="new\.atom"!,
 			'atom URL generated');
-		like($res->content, qr!href="blah%40example\.com/"!,
+		like($res->content, qr!href="blah\@example\.com/"!,
 			'index generated');
 	});
 
@@ -133,13 +133,13 @@ EOF
 		my $res = $cb->(GET($pfx . '/atom.xml'));
 		is(200, $res->code, 'success response received for atom');
 		like($res->content,
-			qr!link\s+href="\Q$pfx\E/blah%40example\.com/"!s,
+			qr!link\s+href="\Q$pfx\E/blah\@example\.com/"!s,
 			'atom feed generated correct URL');
 	});
 
 	test_psgi($app, sub {
 		my ($cb) = @_;
-		my $path = '/blah%40example.com/';
+		my $path = '/blah@example.com/';
 		my $res = $cb->(GET($pfx . $path));
 		is(200, $res->code, "success for $path");
 		like($res->content, qr!<title>hihi - Me</title>!,
@@ -149,13 +149,13 @@ EOF
 		$res = $cb->(GET($pfx . $path));
 		is(301, $res->code, "redirect for $path");
 		my $location = $res->header('Location');
-		like($location, qr!/blah%40example\.com/\z!,
+		like($location, qr!/blah\@example\.com/\z!,
 			'/$MESSAGE_ID/f/ redirected to /$MESSAGE_ID/');
 	});
 
 	test_psgi($app, sub {
 		my ($cb) = @_;
-		my $res = $cb->(GET($pfx . '/blah%40example.com/raw'));
+		my $res = $cb->(GET($pfx . '/blah@example.com/raw'));
 		is(200, $res->code, 'success response received for /*/raw');
 		like($res->content, qr!^From !sm, "mbox returned");
 	});
@@ -164,10 +164,10 @@ EOF
 	foreach my $t (qw(m f)) {
 		test_psgi($app, sub {
 			my ($cb) = @_;
-			my $res = $cb->(GET($pfx . "/$t/blah%40example.com.txt"));
+			my $res = $cb->(GET($pfx . "/$t/blah\@example.com.txt"));
 			is(301, $res->code, "redirect for old $t .txt link");
 			my $location = $res->header('Location');
-			like($location, qr!/blah%40example\.com/raw\z!,
+			like($location, qr!/blah\@example\.com/raw\z!,
 				".txt redirected to /raw");
 		});
 	}
@@ -180,22 +180,22 @@ EOF
 	while (my ($t, $e) = each %umap) {
 		test_psgi($app, sub {
 			my ($cb) = @_;
-			my $res = $cb->(GET($pfx . "/$t/blah%40example.com.html"));
+			my $res = $cb->(GET($pfx . "/$t/blah\@example.com.html"));
 			is(301, $res->code, "redirect for old $t .html link");
 			my $location = $res->header('Location');
 			like($location,
-				qr!/blah%40example\.com/$e(?:#u)?\z!,
+				qr!/blah\@example\.com/$e(?:#u)?\z!,
 				".html redirected to new location");
 		});
 	}
 	foreach my $sfx (qw(mbox mbox.gz)) {
 		test_psgi($app, sub {
 			my ($cb) = @_;
-			my $res = $cb->(GET($pfx . "/t/blah%40example.com.$sfx"));
+			my $res = $cb->(GET($pfx . "/t/blah\@example.com.$sfx"));
 			is(301, $res->code, 'redirect for old thread link');
 			my $location = $res->header('Location');
 			like($location,
-			     qr!/blah%40example\.com/t\.mbox(?:\.gz)?\z!,
+			     qr!/blah\@example\.com/t\.mbox(?:\.gz)?\z!,
 			     "$sfx redirected to /mbox.gz");
 		});
 	}
diff --git a/t/psgi_mount.t b/t/psgi_mount.t
index dae45ba..4a515c6 100644
--- a/t/psgi_mount.t
+++ b/t/psgi_mount.t
@@ -67,7 +67,7 @@ test_psgi($app, sub {
 	is($res->code, 200, 'OK with URLMap mount');
 	$res = $cb->(GET('/a/test/m/blah%40example.com.html'));
 	is($res->header('Location'),
-		'http://localhost/a/test/blah%40example.com/',
+		'http://localhost/a/test/blah@example.com/',
 		'redirect functions properly under mount');
 
 	$res = $cb->(GET('/test/blah%40example.com/'));
-- 
EW


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 4/8] www: do not double-clean Message-IDs from internal DBs
  2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
  2016-08-13 23:53 ` [PATCH 2/8] searchidx: do not release Xapian lock while (only) Msgmap is indexing Eric Wong
  2016-08-13 23:53 ` [PATCH 3/8] www: do not unecessarily escape some chars in paths Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
  2016-08-13 23:53 ` [PATCH 5/8] mid: no wide characters for sha1_hex Eric Wong
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
  To: spew

Ensure we always strip one level of '<>' from Message-IDs,
since our internal SQLite, Xapian, and SHA-1 storage all
assume that.

Realistically, we may screw up if somebody has '<<' or '>>',
but those are screwed up mail clients.
---
 lib/PublicInbox/Feed.pm | 3 +--
 lib/PublicInbox/Hval.pm | 1 -
 lib/PublicInbox/View.pm | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm
index 232a91c..25fec10 100644
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -266,8 +266,7 @@ sub feed_entry {
 	my $midurl = $feed_opts->{midurl};
 
 	my $header_obj = $mime->header_obj;
-	my $mid = $header_obj->header_raw('Message-ID');
-	defined $mid or return;
+	my $mid = mid_clean($header_obj->header_raw('Message-ID'));
 	$mid = PublicInbox::Hval->new_msgid($mid);
 	my $href = $midurl . $mid->{href}. '/';
 
diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm
index ab05238..b354aa4 100644
--- a/lib/PublicInbox/Hval.pm
+++ b/lib/PublicInbox/Hval.pm
@@ -32,7 +32,6 @@ sub new {
 
 sub new_msgid {
 	my ($class, $msgid) = @_;
-	$msgid = mid_clean($msgid);
 	$class->new($msgid, mid_escape($msgid));
 }
 
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 47ffc62..5b352d1 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -491,7 +491,7 @@ sub _msg_html_prepare {
 		$ctx->{-upfx} = '../';
 	}
 	my @title;
-	my $mid = $hdr->header_raw('Message-ID');
+	my $mid = mid_clean($hdr->header_raw('Message-ID'));
 	$mid = PublicInbox::Hval->new_msgid($mid);
 	foreach my $h (qw(From To Cc Subject Date)) {
 		my $v = $hdr->header($h);
-- 
EW


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 5/8] mid: no wide characters for sha1_hex
  2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
                   ` (2 preceding siblings ...)
  2016-08-13 23:53 ` [PATCH 4/8] www: do not double-clean Message-IDs from internal DBs Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
  2016-08-13 23:53 ` [PATCH 6/8] view: allow for missing In-Reply-To mapping Eric Wong
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
  To: spew

Apparently there are some really screwed up In-Reply-To
fields out there.
---
 lib/PublicInbox/MID.pm | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm
index 74a4875..b81fc15 100644
--- a/lib/PublicInbox/MID.pm
+++ b/lib/PublicInbox/MID.pm
@@ -26,6 +26,7 @@ sub id_compress {
 	my ($id, $force) = @_;
 
 	if ($force || $id =~ /[^\w\-]/ || length($id) > MID_MAX) {
+		utf8::encode($id);
 		return sha1_hex($id);
 	}
 	$id;
-- 
EW


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 6/8] view: allow for missing In-Reply-To mapping
  2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
                   ` (3 preceding siblings ...)
  2016-08-13 23:53 ` [PATCH 5/8] mid: no wide characters for sha1_hex Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
  2016-08-13 23:53 ` [PATCH 7/8] view: remove redundant pre closing tag Eric Wong
  2016-08-13 23:53 ` [PATCH 8/8] squash Eric Wong
  6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
  To: spew

Because buggy mail clients exist and generate invalid
In-Reply-To headers we cannot handle across the board...
---
 lib/PublicInbox/View.pm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 5b352d1..3f25028 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -207,8 +207,8 @@ sub _th_index_lite {
 	my $nr_s = 0;
 	my $level = $map->[4];
 	my $idx = $map->[3];
-	if (defined $irt) {
-		my $irt_map = $mapping->{$irt};
+	my $irt_map = $mapping->{$irt} if defined $irt;
+	if (defined $irt_map) {
 		my $siblings = $irt_map->[0];
 		$nr_s = scalar(@$siblings) - 1;
 		$rv .= $pad . $irt_map->[1];
-- 
EW


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 7/8] view: remove redundant pre closing tag
  2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
                   ` (4 preceding siblings ...)
  2016-08-13 23:53 ` [PATCH 6/8] view: allow for missing In-Reply-To mapping Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
  2016-08-13 23:53 ` [PATCH 8/8] squash Eric Wong
  6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
  To: spew

---
 lib/PublicInbox/View.pm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 3f25028..6f79f60 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -377,7 +377,7 @@ sub thread_html {
 			return index_entry($mime, $ctx, scalar @$msgs);
 		}
 		$msgs = undef;
-		'</pre>'.$skel;
+		$skel;
 	});
 }
 
-- 
EW


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 8/8] squash
  2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
                   ` (5 preceding siblings ...)
  2016-08-13 23:53 ` [PATCH 7/8] view: remove redundant pre closing tag Eric Wong
@ 2016-08-13 23:53 ` Eric Wong
  6 siblings, 0 replies; 8+ messages in thread
From: Eric Wong @ 2016-08-13 23:53 UTC (permalink / raw)
  To: spew

---
 lib/PublicInbox/MID.pm | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm
index b81fc15..1c2d75c 100644
--- a/lib/PublicInbox/MID.pm
+++ b/lib/PublicInbox/MID.pm
@@ -38,7 +38,9 @@ sub mid2path {
 
 	unless (defined $x38) {
 		# compatibility with old links (or short Message-IDs :)
-		$mid = sha1_hex(mid_clean($mid));
+		$mid = mid_clean($mid);
+		utf8::encode($mid);
+		$mid = sha1_hex($mid);
 		($x2, $x38) = ($mid =~ /\A([a-f0-9]{2})([a-f0-9]{38})\z/);
 	}
 	"$x2/$x38";
-- 
EW


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-08-13 23:53 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-13 23:53 [PATCH 1/8] import_slrnspool: reimplement using fast-import Eric Wong
2016-08-13 23:53 ` [PATCH 2/8] searchidx: do not release Xapian lock while (only) Msgmap is indexing Eric Wong
2016-08-13 23:53 ` [PATCH 3/8] www: do not unecessarily escape some chars in paths Eric Wong
2016-08-13 23:53 ` [PATCH 4/8] www: do not double-clean Message-IDs from internal DBs Eric Wong
2016-08-13 23:53 ` [PATCH 5/8] mid: no wide characters for sha1_hex Eric Wong
2016-08-13 23:53 ` [PATCH 6/8] view: allow for missing In-Reply-To mapping Eric Wong
2016-08-13 23:53 ` [PATCH 7/8] view: remove redundant pre closing tag Eric Wong
2016-08-13 23:53 ` [PATCH 8/8] squash Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).