dumping ground for random patches and texts
 help / color / mirror / Atom feed
* [PATCH] www_coderepo: fix handling of non-UTF-8 git data
@ 2023-10-09 17:53 Eric Wong
  0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2023-10-09 17:53 UTC (permalink / raw)
  To: spew

We can't assume git output is UTF-8, and we'll always have
legacy data in git coderepos.  So attempt to display some
some garbled text rather than nothing at all if Perl croaks
on it.

sox commit c38987e8d20505621b8d872863afa7d233ed1096
(Added raw inverse-bit u-law and A-law support.  Updated *.txt files., 2001-12-13)
is an example of a commit which caused problems for me.
---
 lib/PublicInbox/Hval.pm        |  9 +++++++--
 lib/PublicInbox/RepoAtom.pm    |  4 ++--
 lib/PublicInbox/RepoTree.pm    |  4 ++--
 lib/PublicInbox/ViewDiff.pm    |  4 ++--
 lib/PublicInbox/ViewVCS.pm     | 19 +++++++++----------
 lib/PublicInbox/WwwCoderepo.pm |  8 ++++----
 xt/solver.t                    | 31 ++++++++++++++++++-------------
 7 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm
index 0677865e..e9b9ae64 100644
--- a/lib/PublicInbox/Hval.pm
+++ b/lib/PublicInbox/Hval.pm
@@ -4,13 +4,13 @@
 # represents a header value in various forms.  Used for HTML generation
 # in our web interface(s)
 package PublicInbox::Hval;
+use v5.10.1; # be careful about unicode_strings in v5.12;
 use strict;
-use warnings;
 use Encode qw(find_encoding);
 use PublicInbox::MID qw/mid_clean mid_escape/;
 use base qw/Exporter/;
 our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape
-		to_attr prurl mid_href fmt_ts ts2str/;
+		to_attr prurl mid_href fmt_ts ts2str utf8_maybe/;
 use POSIX qw(strftime);
 my $enc_ascii = find_encoding('us-ascii');
 
@@ -137,4 +137,9 @@ sub ts2str ($) { strftime('%Y%m%d%H%M%S', gmtime($_[0])) };
 # human-friendly format
 sub fmt_ts ($) { strftime('%Y-%m-%d %k:%M', gmtime($_[0])) }
 
+sub utf8_maybe ($) {
+	utf8::decode($_[0]);
+	utf8::valid($_[0]) or utf8::encode($_[0]); # non-UTF-8 data exists
+}
+
 1;
diff --git a/lib/PublicInbox/RepoAtom.pm b/lib/PublicInbox/RepoAtom.pm
index c89d4551..79b76c12 100644
--- a/lib/PublicInbox/RepoAtom.pm
+++ b/lib/PublicInbox/RepoAtom.pm
@@ -8,7 +8,7 @@ use parent qw(PublicInbox::GzipFilter);
 use POSIX qw(strftime);
 use URI::Escape qw(uri_escape);
 use Scalar::Util ();
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
 
 # git for-each-ref and log use different format fields :<
 my $ATOM_FMT = '--pretty=tformat:'.join('%n',
@@ -50,7 +50,7 @@ sub translate {
 	my $is_tag = $self->{-is_tag};
 	my ($H, $ct, $an, $ae, $at, $s, $bdy);
 	while ($lbuf =~ s/\A([^\0]+)\0\n//s) {
-		utf8::decode($bdy = $1);
+		utf8_maybe($bdy = $1);
 		if ($is_tag) {
 			my %r;
 			eval "$bdy";
diff --git a/lib/PublicInbox/RepoTree.pm b/lib/PublicInbox/RepoTree.pm
index 9c7b86b3..5c73531a 100644
--- a/lib/PublicInbox/RepoTree.pm
+++ b/lib/PublicInbox/RepoTree.pm
@@ -8,7 +8,7 @@ use PublicInbox::ViewDiff qw(uri_escape_path);
 use PublicInbox::WwwStatic qw(r);
 use PublicInbox::Qspawn;
 use PublicInbox::WwwStream qw(html_oneshot);
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
 
 sub rd_404_log {
 	my ($bref, $ctx) = @_;
@@ -26,7 +26,7 @@ sub rd_404_log {
 		$code = 404;
 	} else {
 		my ($H, $h, $s_as) = split(/ /, $$bref, 3);
-		utf8::decode($s_as);
+		utf8_maybe($s_as);
 		my $x = uri_escape_path($ctx->{-path});
 		$s_as = ascii_html($s_as);
 		print $zfh <<EOM;
diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm
index 124a723a..d078c5f9 100644
--- a/lib/PublicInbox/ViewDiff.pm
+++ b/lib/PublicInbox/ViewDiff.pm
@@ -11,7 +11,7 @@ use v5.12;
 use parent qw(Exporter);
 our @EXPORT_OK = qw(flush_diff uri_escape_path);
 use URI::Escape qw(uri_escape_utf8);
-use PublicInbox::Hval qw(ascii_html to_attr);
+use PublicInbox::Hval qw(ascii_html to_attr utf8_maybe);
 use PublicInbox::Git qw(git_unquote);
 
 my $OID_NULL = '0{7,}';
@@ -236,7 +236,7 @@ sub flush_diff ($$) {
 				}
 			}
 			if (!$dctx) {
-				utf8::decode($after);
+				utf8_maybe($after);
 				diff_before_or_after($ctx, \$after);
 			}
 		} else {
diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index e402d557..53e98c71 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -25,7 +25,7 @@ use PublicInbox::ViewDiff qw(flush_diff uri_escape_path);
 use PublicInbox::View;
 use PublicInbox::Eml;
 use Text::Wrap qw(wrap);
-use PublicInbox::Hval qw(ascii_html to_filename prurl);
+use PublicInbox::Hval qw(ascii_html to_filename prurl utf8_maybe);
 use POSIX qw(strftime);
 my $hl = eval {
 	require PublicInbox::HlMod;
@@ -115,14 +115,14 @@ sub show_other_result ($$) { # future-proofing
 				"git show error:$qsp_err");
 	}
 	my $l = PublicInbox::Linkify->new;
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	html_page($ctx, 200, '<pre>', $l->to_html($$bref), '</pre><hr>',
 		dbg_log($ctx));
 }
 
 sub cmt_title { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $ctx) = @_;
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	my $title = $$bref =~ /\r?\n\r?\n([^\r\n]+)\r?\n?/ ? $1 : '';
 	push(@{$ctx->{-cmt_pt}} , ascii_html($title)) == @{$ctx->{-cmt_P}} and
 		cmt_finalize($ctx);
@@ -160,8 +160,7 @@ sub show_commit_start { # ->psgi_qx callback
 	open my $fh, '<', "$ctx->{-tmp}/h" or
 		die "open $ctx->{-tmp}/h: $!";
 	chop(my $buf = do { local $/ = "\0"; <$fh> });
-	utf8::decode($buf);
-	utf8::valid($buf) or utf8::encode($buf); # non-UTF-8 commits exist
+	utf8_maybe($buf); # non-UTF-8 commits exist
 	chomp $buf;
 	my ($P, $p);
 	($P, $p, @{$ctx->{cmt_info}}) = split(/\n/, $buf, 9);
@@ -248,12 +247,12 @@ committer $co
 EOM
 	print $zfh "\n", $ctx->{-linkify}->to_html($bdy) if length($bdy);
 	$bdy = '';
-	open my $fh, '<:utf8', "$ctx->{-tmp}/p" or
-		die "open $ctx->{-tmp}/p: $!";
+	open my $fh, '<', "$ctx->{-tmp}/p" or die "open $ctx->{-tmp}/p: $!";
 	if (-s $fh > $MAX_SIZE) {
 		print $zfh "---\n patch is too large to show\n";
 	} else { # prepare flush_diff:
 		read($fh, $x, -s _);
+		utf8_maybe($x);
 		$ctx->{-apfx} = $ctx->{-spfx} = $upfx;
 		$x =~ s/\r?\n/\n/gs;
 		$ctx->{-anchors} = {} if $x =~ /^diff --git /sm;
@@ -418,7 +417,7 @@ EOM
 		undef $_;
 		($m, $t, $oid, $sz) = split(/ +/, $x, 4);
 		$m = $GIT_MODE{$m} // '?';
-		utf8::decode($f);
+		utf8_maybe($f);
 		$n = ascii_html($f);
 		if ($m eq 'g') { # gitlink submodule commit
 			$$bref .= "\ng\t\t$n @ <a\nhref=#g>commit</a>$oid";
@@ -480,7 +479,7 @@ sub tz_adj ($) {
 
 sub show_tag_result { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $ctx) = @_;
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	my $l = PublicInbox::Linkify->new;
 	$$bref = $l->to_html($$bref);
 	$$bref =~ s!^object ([a-f0-9]+)!object <a
@@ -569,7 +568,7 @@ sub show_blob { # git->cat_async callback
 				" $raw_more</pre>".dbg_log($ctx));
 
 	# TODO: detect + convert to ensure validity
-	utf8::decode($$blob);
+	utf8_maybe($$blob);
 	my $nl = ($$blob =~ s/\r?\n/\n/sg);
 	my $pad = length($nl);
 
diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm
index 834145e9..e8c340b5 100644
--- a/lib/PublicInbox/WwwCoderepo.pm
+++ b/lib/PublicInbox/WwwCoderepo.pm
@@ -14,7 +14,7 @@ use PublicInbox::ViewVCS;
 use PublicInbox::WwwStatic qw(r);
 use PublicInbox::GitHTTPBackend;
 use PublicInbox::WwwStream;
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
 use PublicInbox::ViewDiff qw(uri_escape_path);
 use PublicInbox::RepoSnapshot;
 use PublicInbox::RepoAtom;
@@ -179,7 +179,7 @@ EOM
 
 sub capture { # psgi_qx callback to capture git-for-each-ref
 	my ($bref, $arg) = @_; # arg = [ctx, key, OnDestroy(summary_END)]
-	utf8::decode($$bref);
+	utf8_maybe($$bref);
 	$arg->[0]->{qx_res}->{$arg->[1]} = $$bref;
 	# summary_END may be called via OnDestroy $arg->[2]
 }
@@ -241,13 +241,13 @@ sub translate {
 	$fbuf .= shift while @_;
 	if ($ctx->{-heads}) {
 		while ($fbuf =~ s/\A([^\n]+)\n//s) {
-			utf8::decode(my $x = $1);
+			utf8_maybe(my $x = $1);
 			push @out, _refs_heads_link($x, '../../');
 		}
 	} else {
 		my ($snap_pfx, @snap_fmt) = _snapshot_link_prep($ctx);
 		while ($fbuf =~ s/\A([^\n]+)\n//s) {
-			utf8::decode(my $x = $1);
+			utf8_maybe(my $x = $1);
 			push @out, _refs_tags_link($x, '../../',
 						$snap_pfx, @snap_fmt);
 		}
diff --git a/xt/solver.t b/xt/solver.t
index 06f5a493..51b4144c 100644
--- a/xt/solver.t
+++ b/xt/solver.t
@@ -32,23 +32,30 @@ my $todo = {
 		'c2f3bf071ee90b01f2d629921bb04c4f798f02fa/s/', # tag
 		'7eb93c89651c47c8095d476251f2e4314656b292/s/', # non-UTF-8
 	],
+	'sox-devel' => [
+		'c38987e8d20505621b8d872863afa7d233ed1096/s/', # non-UTF-8
+	]
 };
 
-my ($ibx_name, $urls, @gone);
+my @gone;
 my $client = sub {
 	my ($cb) = @_;
-	for my $u (@$urls) {
-		my $url = "/$ibx_name/$u";
-		my $res = $cb->(GET($url));
-		is($res->code, 200, $url);
-		next if $res->code == 200;
-		diag "$url failed";
-		diag $res->content;
+	for my $ibx_name (sort keys %$todo) {
+		diag "testing $ibx_name";
+		my $urls = $todo->{$ibx_name};
+		for my $u (@$urls) {
+			my $url = "/$ibx_name/$u";
+			my $res = $cb->(GET($url));
+			is($res->code, 200, $url);
+			next if $res->code == 200;
+			diag "$url failed";
+			diag $res->content;
+		}
 	}
 };
 
 my $nr = 0;
-while (($ibx_name, $urls) = each %$todo) {
+while (my ($ibx_name, $urls) = each %$todo) {
 	SKIP: {
 		my $ibx = $cfg->lookup_name($ibx_name);
 		if (!$ibx) {
@@ -61,15 +68,13 @@ while (($ibx_name, $urls) = each %$todo) {
 			skip(qq{publicinbox.$ibx_name.coderepo not configured},
 				scalar(@$urls));
 		}
-		test_psgi($app, $client);
 		$nr++;
 	}
 }
 
 delete @$todo{@gone};
+test_psgi($app, $client);
 my $env = { PI_CONFIG => PublicInbox::Config->default_file };
-while (($ibx_name, $urls) = each %$todo) {
-	test_httpd($env, $client, $nr);
-}
+test_httpd($env, $client, $nr);
 
 done_testing();

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2023-10-09 17:53 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-09 17:53 [PATCH] www_coderepo: fix handling of non-UTF-8 git data Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).