From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 709EC1F454 for ; Mon, 9 Oct 2023 10:47:22 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1696848442; bh=hdqVdiWn18kuvZ+VtTHoibw+rPaZURiibnJRseJAQzM=; h=From:To:Subject:Date:From; b=OybyV4NhUZBuy5FCCEKuzIl6iq+QsRQhpT0Jy0bGRi4o5JTzlRoAyE2mPDYL7V+dM 8oIZq4pjrYXvwTnnXGe010D2mQpa0w95vhYpsKzcxLJk0SOrsZ8xenX0qDHo8wZ99X JekLlgMEhzKJHtb9qogCwBBDAGv8MNiNn6qcQlOI= From: Eric Wong To: spew@80x24.org Subject: [PATCH] www_coderepo: fix handling of ancient encodings Date: Mon, 9 Oct 2023 10:47:22 +0000 Message-ID: <20231009104722.2119951-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: I figure it's better to display some garbled text than nothing at all. --- lib/PublicInbox/Hval.pm | 9 +++++++-- lib/PublicInbox/RepoAtom.pm | 4 ++-- lib/PublicInbox/RepoTree.pm | 4 ++-- lib/PublicInbox/ViewDiff.pm | 4 ++-- lib/PublicInbox/ViewVCS.pm | 19 +++++++++---------- lib/PublicInbox/WwwCoderepo.pm | 8 ++++---- xt/solver.t | 30 +++++++++++++++++------------- 7 files changed, 43 insertions(+), 35 deletions(-) diff --git a/lib/PublicInbox/Hval.pm b/lib/PublicInbox/Hval.pm index 0677865e..e9b9ae64 100644 --- a/lib/PublicInbox/Hval.pm +++ b/lib/PublicInbox/Hval.pm @@ -4,13 +4,13 @@ # represents a header value in various forms. Used for HTML generation # in our web interface(s) package PublicInbox::Hval; +use v5.10.1; # be careful about unicode_strings in v5.12; use strict; -use warnings; use Encode qw(find_encoding); use PublicInbox::MID qw/mid_clean mid_escape/; use base qw/Exporter/; our @EXPORT_OK = qw/ascii_html obfuscate_addrs to_filename src_escape - to_attr prurl mid_href fmt_ts ts2str/; + to_attr prurl mid_href fmt_ts ts2str utf8_maybe/; use POSIX qw(strftime); my $enc_ascii = find_encoding('us-ascii'); @@ -137,4 +137,9 @@ sub ts2str ($) { strftime('%Y%m%d%H%M%S', gmtime($_[0])) }; # human-friendly format sub fmt_ts ($) { strftime('%Y-%m-%d %k:%M', gmtime($_[0])) } +sub utf8_maybe ($) { + utf8::decode($_[0]); + utf8::valid($_[0]) or utf8::encode($_[0]); # non-UTF-8 data exists +} + 1; diff --git a/lib/PublicInbox/RepoAtom.pm b/lib/PublicInbox/RepoAtom.pm index c89d4551..79b76c12 100644 --- a/lib/PublicInbox/RepoAtom.pm +++ b/lib/PublicInbox/RepoAtom.pm @@ -8,7 +8,7 @@ use parent qw(PublicInbox::GzipFilter); use POSIX qw(strftime); use URI::Escape qw(uri_escape); use Scalar::Util (); -use PublicInbox::Hval qw(ascii_html); +use PublicInbox::Hval qw(ascii_html utf8_maybe); # git for-each-ref and log use different format fields :< my $ATOM_FMT = '--pretty=tformat:'.join('%n', @@ -50,7 +50,7 @@ sub translate { my $is_tag = $self->{-is_tag}; my ($H, $ct, $an, $ae, $at, $s, $bdy); while ($lbuf =~ s/\A([^\0]+)\0\n//s) { - utf8::decode($bdy = $1); + utf8_maybe($bdy = $1); if ($is_tag) { my %r; eval "$bdy"; diff --git a/lib/PublicInbox/RepoTree.pm b/lib/PublicInbox/RepoTree.pm index 9c7b86b3..5c73531a 100644 --- a/lib/PublicInbox/RepoTree.pm +++ b/lib/PublicInbox/RepoTree.pm @@ -8,7 +8,7 @@ use PublicInbox::ViewDiff qw(uri_escape_path); use PublicInbox::WwwStatic qw(r); use PublicInbox::Qspawn; use PublicInbox::WwwStream qw(html_oneshot); -use PublicInbox::Hval qw(ascii_html); +use PublicInbox::Hval qw(ascii_html utf8_maybe); sub rd_404_log { my ($bref, $ctx) = @_; @@ -26,7 +26,7 @@ sub rd_404_log { $code = 404; } else { my ($H, $h, $s_as) = split(/ /, $$bref, 3); - utf8::decode($s_as); + utf8_maybe($s_as); my $x = uri_escape_path($ctx->{-path}); $s_as = ascii_html($s_as); print $zfh <new; - utf8::decode($$bref); + utf8_maybe($$bref); html_page($ctx, 200, '
', $l->to_html($$bref), '

', dbg_log($ctx)); } sub cmt_title { # git->cat_async callback my ($bref, $oid, $type, $size, $ctx) = @_; - utf8::decode($$bref); + utf8_maybe($$bref); my $title = $$bref =~ /\r?\n\r?\n([^\r\n]+)\r?\n?/ ? $1 : ''; push(@{$ctx->{-cmt_pt}} , ascii_html($title)) == @{$ctx->{-cmt_P}} and cmt_finalize($ctx); @@ -160,8 +160,7 @@ sub show_commit_start { # ->psgi_qx callback open my $fh, '<', "$ctx->{-tmp}/h" or die "open $ctx->{-tmp}/h: $!"; chop(my $buf = do { local $/ = "\0"; <$fh> }); - utf8::decode($buf); - utf8::valid($buf) or utf8::encode($buf); # non-UTF-8 commits exist + utf8_maybe($buf); # non-UTF-8 commits exist chomp $buf; my ($P, $p); ($P, $p, @{$ctx->{cmt_info}}) = split(/\n/, $buf, 9); @@ -248,12 +247,12 @@ committer $co EOM print $zfh "\n", $ctx->{-linkify}->to_html($bdy) if length($bdy); $bdy = ''; - open my $fh, '<:utf8', "$ctx->{-tmp}/p" or - die "open $ctx->{-tmp}/p: $!"; + open my $fh, '<', "$ctx->{-tmp}/p" or die "open $ctx->{-tmp}/p: $!"; if (-s $fh > $MAX_SIZE) { print $zfh "---\n patch is too large to show\n"; } else { # prepare flush_diff: read($fh, $x, -s _); + utf8_maybe($x); $ctx->{-apfx} = $ctx->{-spfx} = $upfx; $x =~ s/\r?\n/\n/gs; $ctx->{-anchors} = {} if $x =~ /^diff --git /sm; @@ -418,7 +417,7 @@ EOM undef $_; ($m, $t, $oid, $sz) = split(/ +/, $x, 4); $m = $GIT_MODE{$m} // '?'; - utf8::decode($f); + utf8_maybe($f); $n = ascii_html($f); if ($m eq 'g') { # gitlink submodule commit $$bref .= "\ng\t\t$n @ commit$oid"; @@ -480,7 +479,7 @@ sub tz_adj ($) { sub show_tag_result { # git->cat_async callback my ($bref, $oid, $type, $size, $ctx) = @_; - utf8::decode($$bref); + utf8_maybe($$bref); my $l = PublicInbox::Linkify->new; $$bref = $l->to_html($$bref); $$bref =~ s!^object ([a-f0-9]+)!object cat_async callback " $raw_more".dbg_log($ctx)); # TODO: detect + convert to ensure validity - utf8::decode($$blob); + utf8_maybe($$blob); my $nl = ($$blob =~ s/\r?\n/\n/sg); my $pad = length($nl); diff --git a/lib/PublicInbox/WwwCoderepo.pm b/lib/PublicInbox/WwwCoderepo.pm index 834145e9..e8c340b5 100644 --- a/lib/PublicInbox/WwwCoderepo.pm +++ b/lib/PublicInbox/WwwCoderepo.pm @@ -14,7 +14,7 @@ use PublicInbox::ViewVCS; use PublicInbox::WwwStatic qw(r); use PublicInbox::GitHTTPBackend; use PublicInbox::WwwStream; -use PublicInbox::Hval qw(ascii_html); +use PublicInbox::Hval qw(ascii_html utf8_maybe); use PublicInbox::ViewDiff qw(uri_escape_path); use PublicInbox::RepoSnapshot; use PublicInbox::RepoAtom; @@ -179,7 +179,7 @@ EOM sub capture { # psgi_qx callback to capture git-for-each-ref my ($bref, $arg) = @_; # arg = [ctx, key, OnDestroy(summary_END)] - utf8::decode($$bref); + utf8_maybe($$bref); $arg->[0]->{qx_res}->{$arg->[1]} = $$bref; # summary_END may be called via OnDestroy $arg->[2] } @@ -241,13 +241,13 @@ sub translate { $fbuf .= shift while @_; if ($ctx->{-heads}) { while ($fbuf =~ s/\A([^\n]+)\n//s) { - utf8::decode(my $x = $1); + utf8_maybe(my $x = $1); push @out, _refs_heads_link($x, '../../'); } } else { my ($snap_pfx, @snap_fmt) = _snapshot_link_prep($ctx); while ($fbuf =~ s/\A([^\n]+)\n//s) { - utf8::decode(my $x = $1); + utf8_maybe(my $x = $1); push @out, _refs_tags_link($x, '../../', $snap_pfx, @snap_fmt); } diff --git a/xt/solver.t b/xt/solver.t index 06f5a493..357a3317 100644 --- a/xt/solver.t +++ b/xt/solver.t @@ -32,23 +32,29 @@ my $todo = { 'c2f3bf071ee90b01f2d629921bb04c4f798f02fa/s/', # tag '7eb93c89651c47c8095d476251f2e4314656b292/s/', # non-UTF-8 ], + 'sox-devel' => [ + 'c38987e8d20505621b8d872863afa7d233ed1096/s/', # non-UTF-8 + ] }; -my ($ibx_name, $urls, @gone); +my @gone; my $client = sub { my ($cb) = @_; - for my $u (@$urls) { - my $url = "/$ibx_name/$u"; - my $res = $cb->(GET($url)); - is($res->code, 200, $url); - next if $res->code == 200; - diag "$url failed"; - diag $res->content; + while (my ($ibx_name, $urls) = each %$todo) { + diag "testing $ibx_name"; + for my $u (@$urls) { + my $url = "/$ibx_name/$u"; + my $res = $cb->(GET($url)); + is($res->code, 200, $url); + next if $res->code == 200; + diag "$url failed"; + diag $res->content; + } } }; my $nr = 0; -while (($ibx_name, $urls) = each %$todo) { +while (my ($ibx_name, $urls) = each %$todo) { SKIP: { my $ibx = $cfg->lookup_name($ibx_name); if (!$ibx) { @@ -61,15 +67,13 @@ while (($ibx_name, $urls) = each %$todo) { skip(qq{publicinbox.$ibx_name.coderepo not configured}, scalar(@$urls)); } - test_psgi($app, $client); $nr++; } } delete @$todo{@gone}; +test_psgi($app, $client); my $env = { PI_CONFIG => PublicInbox::Config->default_file }; -while (($ibx_name, $urls) = each %$todo) { - test_httpd($env, $client, $nr); -} +test_httpd($env, $client, $nr); done_testing();