* [PATCH] www_coderepo: fix handling of ancient encodings
@ 2023-10-09 10:47 Eric Wong
0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2023-10-09 10:47 UTC (permalink / raw)
To: spew
I figure it's better to display some garbled text than nothing
at all.
---
lib/PublicInbox/Hval.pm | 9 +++++++--
lib/PublicInbox/RepoAtom.pm | 4 ++--
lib/PublicInbox/RepoTree.pm | 4 ++--
lib/PublicInbox/ViewDiff.pm | 4 ++--
lib/PublicInbox/ViewVCS.pm | 19 +++++++++----------
lib/PublicInbox/WwwCoderepo.pm | 8 ++++----
xt/solver.t | 30 +++++++++++++++++-------------
7 files changed, 43 insertions(+), 35 deletions(-)
diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm
index 0677865e..e9b9ae64 100644
--- a/lib/PublicInbox/Hval.pm
+++ b/lib/PublicInbox/Hval.pm
@@ -4,13 +4,13 @@
# represents a header value in various forms. Used for HTML generation
# in our web interface(s)
package PublicInbox::Hval;
+use v5.10.1; # be careful about unicode_strings in v5.12;
use strict;
-use warnings;
use Encode qw(find_encoding);
use PublicInbox::MID qw/mid_clean mid_escape/;
use base qw/Exporter/;
our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape
- to_attr prurl mid_href fmt_ts ts2str/;
+ to_attr prurl mid_href fmt_ts ts2str utf8_maybe/;
use POSIX qw(strftime);
my $enc_ascii = find_encoding('us-ascii');
@@ -137,4 +137,9 @@ sub ts2str ($) { strftime('%Y%m%d%H%M%S', gmtime($_[0])) };
# human-friendly format
sub fmt_ts ($) { strftime('%Y-%m-%d %k:%M', gmtime($_[0])) }
+sub utf8_maybe ($) {
+ utf8::decode($_[0]);
+ utf8::valid($_[0]) or utf8::encode($_[0]); # non-UTF-8 data exists
+}
+
1;
diff --git a/lib/PublicInbox/RepoAtom.pm b/lib/PublicInbox/RepoAtom.pm
index c89d4551..79b76c12 100644
--- a/lib/PublicInbox/RepoAtom.pm
+++ b/lib/PublicInbox/RepoAtom.pm
@@ -8,7 +8,7 @@ use parent qw(PublicInbox::GzipFilter);
use POSIX qw(strftime);
use URI::Escape qw(uri_escape);
use Scalar::Util ();
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
# git for-each-ref and log use different format fields :<
my $ATOM_FMT = '--pretty=tformat:'.join('%n',
@@ -50,7 +50,7 @@ sub translate {
my $is_tag = $self->{-is_tag};
my ($H, $ct, $an, $ae, $at, $s, $bdy);
while ($lbuf =~ s/\A([^\0]+)\0\n//s) {
- utf8::decode($bdy = $1);
+ utf8_maybe($bdy = $1);
if ($is_tag) {
my %r;
eval "$bdy";
diff --git a/lib/PublicInbox/RepoTree.pm b/lib/PublicInbox/RepoTree.pm
index 9c7b86b3..5c73531a 100644
--- a/lib/PublicInbox/RepoTree.pm
+++ b/lib/PublicInbox/RepoTree.pm
@@ -8,7 +8,7 @@ use PublicInbox::ViewDiff qw(uri_escape_path);
use PublicInbox::WwwStatic qw(r);
use PublicInbox::Qspawn;
use PublicInbox::WwwStream qw(html_oneshot);
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
sub rd_404_log {
my ($bref, $ctx) = @_;
@@ -26,7 +26,7 @@ sub rd_404_log {
$code = 404;
} else {
my ($H, $h, $s_as) = split(/ /, $$bref, 3);
- utf8::decode($s_as);
+ utf8_maybe($s_as);
my $x = uri_escape_path($ctx->{-path});
$s_as = ascii_html($s_as);
print $zfh <<EOM;
diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm
index 124a723a..d078c5f9 100644
--- a/lib/PublicInbox/ViewDiff.pm
+++ b/lib/PublicInbox/ViewDiff.pm
@@ -11,7 +11,7 @@ use v5.12;
use parent qw(Exporter);
our @EXPORT_OK = qw(flush_diff uri_escape_path);
use URI::Escape qw(uri_escape_utf8);
-use PublicInbox::Hval qw(ascii_html to_attr);
+use PublicInbox::Hval qw(ascii_html to_attr utf8_maybe);
use PublicInbox::Git qw(git_unquote);
my $OID_NULL = '0{7,}';
@@ -236,7 +236,7 @@ sub flush_diff ($$) {
}
}
if (!$dctx) {
- utf8::decode($after);
+ utf8_maybe($after);
diff_before_or_after($ctx, \$after);
}
} else {
diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index e402d557..53e98c71 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -25,7 +25,7 @@ use PublicInbox::ViewDiff qw(flush_diff uri_escape_path);
use PublicInbox::View;
use PublicInbox::Eml;
use Text::Wrap qw(wrap);
-use PublicInbox::Hval qw(ascii_html to_filename prurl);
+use PublicInbox::Hval qw(ascii_html to_filename prurl utf8_maybe);
use POSIX qw(strftime);
my $hl = eval {
require PublicInbox::HlMod;
@@ -115,14 +115,14 @@ sub show_other_result ($$) { # future-proofing
"git show error:$qsp_err");
}
my $l = PublicInbox::Linkify->new;
- utf8::decode($$bref);
+ utf8_maybe($$bref);
html_page($ctx, 200, '<pre>', $l->to_html($$bref), '</pre><hr>',
dbg_log($ctx));
}
sub cmt_title { # git->cat_async callback
my ($bref, $oid, $type, $size, $ctx) = @_;
- utf8::decode($$bref);
+ utf8_maybe($$bref);
my $title = $$bref =~ /\r?\n\r?\n([^\r\n]+)\r?\n?/ ? $1 : '';
push(@{$ctx->{-cmt_pt}} , ascii_html($title)) == @{$ctx->{-cmt_P}} and
cmt_finalize($ctx);
@@ -160,8 +160,7 @@ sub show_commit_start { # ->psgi_qx callback
open my $fh, '<', "$ctx->{-tmp}/h" or
die "open $ctx->{-tmp}/h: $!";
chop(my $buf = do { local $/ = "\0"; <$fh> });
- utf8::decode($buf);
- utf8::valid($buf) or utf8::encode($buf); # non-UTF-8 commits exist
+ utf8_maybe($buf); # non-UTF-8 commits exist
chomp $buf;
my ($P, $p);
($P, $p, @{$ctx->{cmt_info}}) = split(/\n/, $buf, 9);
@@ -248,12 +247,12 @@ committer $co
EOM
print $zfh "\n", $ctx->{-linkify}->to_html($bdy) if length($bdy);
$bdy = '';
- open my $fh, '<:utf8', "$ctx->{-tmp}/p" or
- die "open $ctx->{-tmp}/p: $!";
+ open my $fh, '<', "$ctx->{-tmp}/p" or die "open $ctx->{-tmp}/p: $!";
if (-s $fh > $MAX_SIZE) {
print $zfh "---\n patch is too large to show\n";
} else { # prepare flush_diff:
read($fh, $x, -s _);
+ utf8_maybe($x);
$ctx->{-apfx} = $ctx->{-spfx} = $upfx;
$x =~ s/\r?\n/\n/gs;
$ctx->{-anchors} = {} if $x =~ /^diff --git /sm;
@@ -418,7 +417,7 @@ EOM
undef $_;
($m, $t, $oid, $sz) = split(/ +/, $x, 4);
$m = $GIT_MODE{$m} // '?';
- utf8::decode($f);
+ utf8_maybe($f);
$n = ascii_html($f);
if ($m eq 'g') { # gitlink submodule commit
$$bref .= "\ng\t\t$n @ <a\nhref=#g>commit</a>$oid";
@@ -480,7 +479,7 @@ sub tz_adj ($) {
sub show_tag_result { # git->cat_async callback
my ($bref, $oid, $type, $size, $ctx) = @_;
- utf8::decode($$bref);
+ utf8_maybe($$bref);
my $l = PublicInbox::Linkify->new;
$$bref = $l->to_html($$bref);
$$bref =~ s!^object ([a-f0-9]+)!object <a
@@ -569,7 +568,7 @@ sub show_blob { # git->cat_async callback
" $raw_more</pre>".dbg_log($ctx));
# TODO: detect + convert to ensure validity
- utf8::decode($$blob);
+ utf8_maybe($$blob);
my $nl = ($$blob =~ s/\r?\n/\n/sg);
my $pad = length($nl);
diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm
index 834145e9..e8c340b5 100644
--- a/lib/PublicInbox/WwwCoderepo.pm
+++ b/lib/PublicInbox/WwwCoderepo.pm
@@ -14,7 +14,7 @@ use PublicInbox::ViewVCS;
use PublicInbox::WwwStatic qw(r);
use PublicInbox::GitHTTPBackend;
use PublicInbox::WwwStream;
-use PublicInbox::Hval qw(ascii_html);
+use PublicInbox::Hval qw(ascii_html utf8_maybe);
use PublicInbox::ViewDiff qw(uri_escape_path);
use PublicInbox::RepoSnapshot;
use PublicInbox::RepoAtom;
@@ -179,7 +179,7 @@ EOM
sub capture { # psgi_qx callback to capture git-for-each-ref
my ($bref, $arg) = @_; # arg = [ctx, key, OnDestroy(summary_END)]
- utf8::decode($$bref);
+ utf8_maybe($$bref);
$arg->[0]->{qx_res}->{$arg->[1]} = $$bref;
# summary_END may be called via OnDestroy $arg->[2]
}
@@ -241,13 +241,13 @@ sub translate {
$fbuf .= shift while @_;
if ($ctx->{-heads}) {
while ($fbuf =~ s/\A([^\n]+)\n//s) {
- utf8::decode(my $x = $1);
+ utf8_maybe(my $x = $1);
push @out, _refs_heads_link($x, '../../');
}
} else {
my ($snap_pfx, @snap_fmt) = _snapshot_link_prep($ctx);
while ($fbuf =~ s/\A([^\n]+)\n//s) {
- utf8::decode(my $x = $1);
+ utf8_maybe(my $x = $1);
push @out, _refs_tags_link($x, '../../',
$snap_pfx, @snap_fmt);
}
diff --git a/xt/solver.t b/xt/solver.t
index 06f5a493..357a3317 100644
--- a/xt/solver.t
+++ b/xt/solver.t
@@ -32,23 +32,29 @@ my $todo = {
'c2f3bf071ee90b01f2d629921bb04c4f798f02fa/s/', # tag
'7eb93c89651c47c8095d476251f2e4314656b292/s/', # non-UTF-8
],
+ 'sox-devel' => [
+ 'c38987e8d20505621b8d872863afa7d233ed1096/s/', # non-UTF-8
+ ]
};
-my ($ibx_name, $urls, @gone);
+my @gone;
my $client = sub {
my ($cb) = @_;
- for my $u (@$urls) {
- my $url = "/$ibx_name/$u";
- my $res = $cb->(GET($url));
- is($res->code, 200, $url);
- next if $res->code == 200;
- diag "$url failed";
- diag $res->content;
+ while (my ($ibx_name, $urls) = each %$todo) {
+ diag "testing $ibx_name";
+ for my $u (@$urls) {
+ my $url = "/$ibx_name/$u";
+ my $res = $cb->(GET($url));
+ is($res->code, 200, $url);
+ next if $res->code == 200;
+ diag "$url failed";
+ diag $res->content;
+ }
}
};
my $nr = 0;
-while (($ibx_name, $urls) = each %$todo) {
+while (my ($ibx_name, $urls) = each %$todo) {
SKIP: {
my $ibx = $cfg->lookup_name($ibx_name);
if (!$ibx) {
@@ -61,15 +67,13 @@ while (($ibx_name, $urls) = each %$todo) {
skip(qq{publicinbox.$ibx_name.coderepo not configured},
scalar(@$urls));
}
- test_psgi($app, $client);
$nr++;
}
}
delete @$todo{@gone};
+test_psgi($app, $client);
my $env = { PI_CONFIG => PublicInbox::Config->default_file };
-while (($ibx_name, $urls) = each %$todo) {
- test_httpd($env, $client, $nr);
-}
+test_httpd($env, $client, $nr);
done_testing();
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2023-10-09 10:47 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-09 10:47 [PATCH] www_coderepo: fix handling of ancient encodings Eric Wong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).