dumping ground for random patches and texts
 help / color / mirror / Atom feed
* [PATCH] www_coderepo: fix handling of ancient encodings
@ 2023-10-09 10:47 Eric Wong
  0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2023-10-09 10:47 UTC (permalink / raw)
  To: spew

I figure it's better to display some garbled text than nothing
at all.
---
 lib/PublicInbox/Hval.pm        |  9 +++++++--
 lib/PublicInbox/RepoAtom.pm    |  4 ++--
 lib/PublicInbox/RepoTree.pm    |  4 ++--
 lib/PublicInbox/ViewDiff.pm    |  4 ++--
 lib/PublicInbox/ViewVCS.pm     | 19 +++++++++----------
 lib/PublicInbox/WwwCoderepo.pm |  8 ++++----
 xt/solver.t                    | 30 +++++++++++++++++-------------
 7 files changed, 43 insertions(+), 35 deletions(-)

diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm
index 0677865e..e9b9ae64 100644
--- a/lib/PublicInbox/Hval.pm
+++ b/lib/PublicInbox/Hval.pm
@@ -4,13 +4,13 @@
 # represents a header value in various forms.  Used for HTML generation
 # in our web interface(s)
 package PublicInbox::Hval;
+use v5.10.1; # be careful about unicode_strings in v5.12;
 use strict;
-use warnings;
 use Encode qw(find_encoding);
 use PublicInbox::MID qw/mid_clean mid_escape/;
 use base qw/Exporter/;
 our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape
-		to_attr prurl mid_href fmt_ts ts2str/;
+		to_attr prurl mid_href fmt_ts ts2str utf8_maybe/;
 use POSIX qw(strftime);
 my $enc_ascii = find_encoding('us-ascii');
 
@@ -137,4 +137,9 @@ sub ts2str ($) { strftime('%Y%m%d%H%M%S', gmtime($_[0])) };
 # human-friendly format
 sub fmt_ts ($) { strftime('%Y-%m-%d %k:%M', gmtime($_[0])) }
 
+sub utf8_maybe ($) {
+	utf8::decode($_[0]);
+	utf8::valid($_[0]) or utf8::encode($_[0]); # non-UTF-8 data exists
+}
+
 1;
diff --git a/lib/PublicInbox/RepoAtom.pm b/lib/PublicInbox/RepoAtom.pm
index c89d4551..79b76c12 100644
--- a/lib/PublicInbox/RepoAtom.pm
+++ b/lib/PublicInbox/RepoAtom.pm
@@ -8,7 +8,7 @@ use parent qw(PublicInbox::GzipFilter);
 use POSIX qw(strftime);
 use URI::Escape qw(uri_escape);
 use Scalar::Util ();
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
 
 # git for-each-ref and log use different format fields :<
 my $ATOM_FMT = '--pretty=tformat:'.join('%n',
@@ -50,7 +50,7 @@ sub translate {
 	my $is_tag = $self->{-is_tag};
 	my ($H, $ct, $an, $ae, $at, $s, $bdy);
 	while ($lbuf =~ s/\A([^\0]+)\0\n//s) {
-		utf8::decode($bdy = $1);
+		utf8_maybe($bdy = $1);
 		if ($is_tag) {
 			my %r;
 			eval "$bdy";
diff --git a/lib/PublicInbox/RepoTree.pm b/lib/PublicInbox/RepoTree.pm
index 9c7b86b3..5c73531a 100644
--- a/lib/PublicInbox/RepoTree.pm
+++ b/lib/PublicInbox/RepoTree.pm
@@ -8,7 +8,7 @@ use PublicInbox::ViewDiff qw(uri_escape_path);
 use PublicInbox::WwwStatic qw(r);
 use PublicInbox::Qspawn;
 use PublicInbox::WwwStream qw(html_oneshot);
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
 
 sub rd_404_log {
 	my ($bref, $ctx) = @_;
@@ -26,7 +26,7 @@ sub rd_404_log {
 		$code = 404;
 	} else {
 		my ($H, $h, $s_as) = split(/ /, $$bref, 3);
-		utf8::decode($s_as);
+		utf8_maybe($s_as);
 		my $x = uri_escape_path($ctx->{-path});
 		$s_as = ascii_html($s_as);
 		print $zfh <<EOM;
diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm
index 124a723a..d078c5f9 100644
--- a/lib/PublicInbox/ViewDiff.pm
+++ b/lib/PublicInbox/ViewDiff.pm
@@ -11,7 +11,7 @@ use v5.12;
 use parent qw(Exporter);
 our @EXPORT_OK = qw(flush_diff uri_escape_path);
 use URI::Escape qw(uri_escape_utf8);
-use PublicInbox::Hval qw(ascii_html to_attr);
+use PublicInbox::Hval qw(ascii_html to_attr utf8_maybe);
 use PublicInbox::Git qw(git_unquote);
 
 my $OID_NULL = '0{7,}';
@@ -236,7 +236,7 @@ sub flush_diff ($$) {
 				}
 			}
 			if (!$dctx) {
-				utf8::decode($after);
+				utf8_maybe($after);
 				diff_before_or_after($ctx, \$after);
 			}
 		} else {
diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index e402d557..53e98c71 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -25,7 +25,7 @@ use PublicInbox::ViewDiff qw(flush_diff uri_escape_path);
 use PublicInbox::View;
 use PublicInbox::Eml;
 use Text::Wrap qw(wrap);
-use PublicInbox::Hval qw(ascii_html to_filename prurl);
+use PublicInbox::Hval qw(ascii_html to_filename prurl utf8_maybe);
 use POSIX qw(strftime);
 my $hl = eval {
 	require PublicInbox::HlMod;
@@ -115,14 +115,14 @@ sub show_other_result ($$) { # future-proofing
 				"git show error:$qsp_err");
 	}
 	my $l = PublicInbox::Linkify->new;
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	html_page($ctx, 200, '<pre>', $l->to_html($$bref), '</pre><hr>',
 		dbg_log($ctx));
 }
 
 sub cmt_title { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $ctx) = @_;
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	my $title = $$bref =~ /\r?\n\r?\n([^\r\n]+)\r?\n?/ ? $1 : '';
 	push(@{$ctx->{-cmt_pt}} , ascii_html($title)) == @{$ctx->{-cmt_P}} and
 		cmt_finalize($ctx);
@@ -160,8 +160,7 @@ sub show_commit_start { # ->psgi_qx callback
 	open my $fh, '<', "$ctx->{-tmp}/h" or
 		die "open $ctx->{-tmp}/h: $!";
 	chop(my $buf = do { local $/ = "\0"; <$fh> });
-	utf8::decode($buf);
-	utf8::valid($buf) or utf8::encode($buf); # non-UTF-8 commits exist
+	utf8_maybe($buf); # non-UTF-8 commits exist
 	chomp $buf;
 	my ($P, $p);
 	($P, $p, @{$ctx->{cmt_info}}) = split(/\n/, $buf, 9);
@@ -248,12 +247,12 @@ committer $co
 EOM
 	print $zfh "\n", $ctx->{-linkify}->to_html($bdy) if length($bdy);
 	$bdy = '';
-	open my $fh, '<:utf8', "$ctx->{-tmp}/p" or
-		die "open $ctx->{-tmp}/p: $!";
+	open my $fh, '<', "$ctx->{-tmp}/p" or die "open $ctx->{-tmp}/p: $!";
 	if (-s $fh > $MAX_SIZE) {
 		print $zfh "---\n patch is too large to show\n";
 	} else { # prepare flush_diff:
 		read($fh, $x, -s _);
+		utf8_maybe($x);
 		$ctx->{-apfx} = $ctx->{-spfx} = $upfx;
 		$x =~ s/\r?\n/\n/gs;
 		$ctx->{-anchors} = {} if $x =~ /^diff --git /sm;
@@ -418,7 +417,7 @@ EOM
 		undef $_;
 		($m, $t, $oid, $sz) = split(/ +/, $x, 4);
 		$m = $GIT_MODE{$m} // '?';
-		utf8::decode($f);
+		utf8_maybe($f);
 		$n = ascii_html($f);
 		if ($m eq 'g') { # gitlink submodule commit
 			$$bref .= "\ng\t\t$n @ <a\nhref=#g>commit</a>$oid";
@@ -480,7 +479,7 @@ sub tz_adj ($) {
 
 sub show_tag_result { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $ctx) = @_;
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	my $l = PublicInbox::Linkify->new;
 	$$bref = $l->to_html($$bref);
 	$$bref =~ s!^object ([a-f0-9]+)!object <a
@@ -569,7 +568,7 @@ sub show_blob { # git->cat_async callback
 				" $raw_more</pre>".dbg_log($ctx));
 
 	# TODO: detect + convert to ensure validity
-	utf8::decode($$blob);
+	utf8_maybe($$blob);
 	my $nl = ($$blob =~ s/\r?\n/\n/sg);
 	my $pad = length($nl);
 
diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm
index 834145e9..e8c340b5 100644
--- a/lib/PublicInbox/WwwCoderepo.pm
+++ b/lib/PublicInbox/WwwCoderepo.pm
@@ -14,7 +14,7 @@ use PublicInbox::ViewVCS;
 use PublicInbox::WwwStatic qw(r);
 use PublicInbox::GitHTTPBackend;
 use PublicInbox::WwwStream;
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
 use PublicInbox::ViewDiff qw(uri_escape_path);
 use PublicInbox::RepoSnapshot;
 use PublicInbox::RepoAtom;
@@ -179,7 +179,7 @@ EOM
 
 sub capture { # psgi_qx callback to capture git-for-each-ref
 	my ($bref, $arg) = @_; # arg = [ctx, key, OnDestroy(summary_END)]
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	$arg->[0]->{qx_res}->{$arg->[1]} = $$bref;
 	# summary_END may be called via OnDestroy $arg->[2]
 }
@@ -241,13 +241,13 @@ sub translate {
 	$fbuf .= shift while @_;
 	if ($ctx->{-heads}) {
 		while ($fbuf =~ s/\A([^\n]+)\n//s) {
-			utf8::decode(my $x = $1);
+			utf8_maybe(my $x = $1);
 			push @out, _refs_heads_link($x, '../../');
 		}
 	} else {
 		my ($snap_pfx, @snap_fmt) = _snapshot_link_prep($ctx);
 		while ($fbuf =~ s/\A([^\n]+)\n//s) {
-			utf8::decode(my $x = $1);
+			utf8_maybe(my $x = $1);
 			push @out, _refs_tags_link($x, '../../',
 						$snap_pfx, @snap_fmt);
 		}
diff --git a/xt/solver.t b/xt/solver.t
index 06f5a493..357a3317 100644
--- a/xt/solver.t
+++ b/xt/solver.t
@@ -32,23 +32,29 @@ my $todo = {
 		'c2f3bf071ee90b01f2d629921bb04c4f798f02fa/s/', # tag
 		'7eb93c89651c47c8095d476251f2e4314656b292/s/', # non-UTF-8
 	],
+	'sox-devel' => [
+		'c38987e8d20505621b8d872863afa7d233ed1096/s/', # non-UTF-8
+	]
 };
 
-my ($ibx_name, $urls, @gone);
+my @gone;
 my $client = sub {
 	my ($cb) = @_;
-	for my $u (@$urls) {
-		my $url = "/$ibx_name/$u";
-		my $res = $cb->(GET($url));
-		is($res->code, 200, $url);
-		next if $res->code == 200;
-		diag "$url failed";
-		diag $res->content;
+	while (my ($ibx_name, $urls) = each %$todo) {
+		diag "testing $ibx_name";
+		for my $u (@$urls) {
+			my $url = "/$ibx_name/$u";
+			my $res = $cb->(GET($url));
+			is($res->code, 200, $url);
+			next if $res->code == 200;
+			diag "$url failed";
+			diag $res->content;
+		}
 	}
 };
 
 my $nr = 0;
-while (($ibx_name, $urls) = each %$todo) {
+while (my ($ibx_name, $urls) = each %$todo) {
 	SKIP: {
 		my $ibx = $cfg->lookup_name($ibx_name);
 		if (!$ibx) {
@@ -61,15 +67,13 @@ while (($ibx_name, $urls) = each %$todo) {
 			skip(qq{publicinbox.$ibx_name.coderepo not configured},
 				scalar(@$urls));
 		}
-		test_psgi($app, $client);
 		$nr++;
 	}
 }
 
 delete @$todo{@gone};
+test_psgi($app, $client);
 my $env = { PI_CONFIG => PublicInbox::Config->default_file };
-while (($ibx_name, $urls) = each %$todo) {
-	test_httpd($env, $client, $nr);
-}
+test_httpd($env, $client, $nr);
 
 done_testing();

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2023-10-09 10:47 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-09 10:47 [PATCH] www_coderepo: fix handling of ancient encodings Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).