From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: AS44716 212.21.64.0/19 X-Spam-Status: No, score=-2.6 required=3.0 tests=AWL,BAYES_00,RCVD_IN_XBL, SPF_FAIL,SPF_HELO_FAIL,SUBJ_ALL_CAPS,TO_EQ_FM_DOM_SPF_FAIL shortcircuit=no autolearn=no autolearn_force=no version=3.4.2 Received: from 80x24.org (tor-exit-4.all.de [212.21.66.6]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by dcvr.yhbt.net (Postfix) with ESMTPS id 0199E1F461 for ; Sat, 11 May 2019 22:55:19 +0000 (UTC) From: Eric Wong To: spew@80x24.org Subject: [PATCH] WIP Date: Sat, 11 May 2019 22:55:13 +0000 Message-Id: <20190511225513.20820-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: --- lib/PublicInbox/ViewDiff.pm | 367 +++++++++++++++++++++++++++++++++++- lib/PublicInbox/ViewVCS.pm | 19 +- 2 files changed, 367 insertions(+), 19 deletions(-) diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm index 85b5314..6d732dd 100644 --- a/lib/PublicInbox/ViewDiff.pm +++ b/lib/PublicInbox/ViewDiff.pm @@ -10,10 +10,11 @@ package PublicInbox::ViewDiff; use strict; use warnings; use base qw(Exporter); -our @EXPORT_OK = qw(flush_diff); +our @EXPORT_OK = qw(flush_diff); # for emails, OO API is for git(1) output use URI::Escape qw(uri_escape_utf8); use PublicInbox::Hval qw(ascii_html to_attr from_attr); use PublicInbox::Git qw(git_unquote); +use PublicInbox::WwwStream; # keep track of state so we can avoid redundant HTML tags for # identically-classed lines @@ -41,6 +42,17 @@ package PublicInbox::ViewDiff; my $PATH_A = '"?a/.+|/dev/null'; my $PATH_B = '"?b/.+|/dev/null'; +my $CMT_FMT = '--pretty=format:'.join('%n', + '%H', '%s', '%an <%ae>', '%ai', '%cn <%ce>', '%ci', + '%t', '%p', '%D', '%b%x00'); + +sub CC_EMPTY () { " This is a merge, and the combined diff is empty.\n" } +sub CC_MERGE () { " This is a merge, showing combined diff:\n\n" } + +# used for "git show" (on commits) and "git diff" +my @DIFF_OPT = qw(-z --numstat -p --encoding=UTF-8 -C -B -D + --no-color --no-abbrev); + sub to_html ($$) { $_[0]->linkify_1($_[1]); $_[0]->linkify_2(ascii_html($_[1])); @@ -143,6 +155,16 @@ ($$$$$) undef } +sub dquery ($$) { + my ($pa, $pb) = @_; + my $q = '?b=' . uri_escape_utf8($pb, UNSAFE); + $q .= '&a=' . uri_escape_utf8($pa, UNSAFE) if $pa ne $pb; + $q +} + +# API for emails. public-inbox assumes we can fit any email entirely +# (because we rely on Email::MIME anyways). +# This interface assumes that. sub flush_diff ($$$) { my ($dst, $ctx, $linkify) = @_; my $diff = $ctx->{-diff}; @@ -174,13 +196,7 @@ ($$$) } $pa = (split('/', git_unquote($pa), 2))[1]; $pb = (split('/', git_unquote($pb), 2))[1]; - $dctx = { - Q => "?b=".uri_escape_utf8($pb, UNSAFE), - }; - if ($pa ne $pb) { - $dctx->{Q} .= '&a='. - uri_escape_utf8($pa, UNSAFE); - } + $dctx = { Q => dquery($pa, $pb) }; anchor1($dst, $ctx, $linkify, $pb, $s) and next; $$dst .= to_html($linkify, $s); } elsif ($s =~ s/^(index $OID_NULL\.\.)($OID_BLOB)\b//o) { @@ -196,7 +212,7 @@ ($$$) $$dst .= to_html($linkify, $s); } elsif ($s =~ s/^@@ (\S+) (\S+) @@//) { $$dst .= '' if $state2class[$state]; - $$dst .= qq(); + $$dst .= qq(); # XHTML $$dst .= diff_hunk($dctx, $spfx, $1, $2); $$dst .= ''; $state = DSTATE_CTX; @@ -235,4 +251,337 @@ ($$$) undef; } +# OO interface +sub _cmd ($$) { + my ($self, $cmd) = @_; + $self->{git_cmd} = join(' ', @$cmd); + $cmd; +} + +sub diff_cmd ($$$) { + my ($self, $oid_a, $oid_b) = @_; + _cmd($self, [ 'diff', @DIFF_OPT, $oid_a, $oid_b, '--' ]); +} + +sub commit_cmd ($$) { + my ($self, $oid) = @_; + _cmd($self, [ qw(show -c), @DIFF_OPT, $CMT_FMT, $oid, '--' ]); +} + +# OO API for parsing output of git-diff(1), git-show(1), etc... +# We try to do as much as possible by streaming, so we act as a +# a stream editor (e.g. 'sed') +sub new { + my ($class) = @_; + my $self = { + dstate => DSTATE_INIT, + dbuf => '', + # mhelp => merge help + # diff_tree => 1 (true if comparing tree-ish) + }; + $self->{ndiff} = $self->{nchg} = $self->{nadd} = $self->{ndel} = 0; + bless $self, $class; +} + +# diffstat links to anchors within the same HTML page +sub git_diffstat_rename ($$$) { + my ($self, $from, $to) = @_; + my $anchor = to_attr(git_unquote($to)); + $self->{anchors}->{$anchor} = $to; + my @from = split('/', $from); + my @to = split('/', $to); + my ($base, @base); + + # only show differing path components + while (@to && @from && $to[0] eq $from[0]) { + push @base, shift(@to); + shift @from; + } + + $base = ascii_html(join('/', @base)) if @base; + $from = ascii_html(join('/', @from)); + $to = ascii_html(join('/', @to)); + $to = qq($to); + @base ? "$base/{$from => $to}" : "$from => $to"; +} + +sub git_diff_sed_stat ($$) { + my ($self, $dst) = @_; + my @stat = split(/\0/, delete $self->{dbuf}, -1); + my $end; # end-of-stat + my $nchg = \($self->{nchg}); + my $nadd = \($self->{nadd}); + my $ndel = \($self->{ndel}); + if (!$self->{dstat_started}) { + $self->{dstat_started} = 1; + + # merges start with an extra '\0' before the diffstat + # non-merge commits start with an extra '\n', instead + if ($self->{mhelp}) { + if ($stat[0] eq '') { + shift @stat; + } else { + warn +'initial merge diffstat line was not empty'; + } + } else { + # for commits, only (not diff-tree) + $stat[0] =~ s/\A\n//s; + } + } + while (defined(my $l = shift @stat)) { + if ($l eq '') { + $end = 1 if $stat[0] && $stat[0] =~ /\Ad/; # "diff --" + last; + } elsif ($l =~ /\Adiff /) { + unshift @stat, $l; + $end = 1; + last; + } + utf8::upgrade($l); + $l =~ /\A(\S+)\t+(\S+)\t+(.*)/ or next; + my ($add, $del, $fn) = ($1, $2, $3); + if ($fn ne '') { # normal modification + # TODO: discard diffs if they are too big + # gigantic changes with many files may still OOM us + my $anchor = to_attr(git_unquote($fn)); + $self->{anchors}->{$anchor} = $fn; + $l = qq().ascii_html($fn).''; + } else { # rename + # incomplete... + if (scalar(@stat) < 2) { + unshift @stat, $l; + last; + } + my $from = shift @stat; + my $to = shift @stat; + utf8::upgrade($from); + utf8::upgrade($to); + $l = git_diffstat_rename($self, $from, $to); + } + + # text changes show numerically, Binary does not + if ($add =~ /\A\d+\z/) { + $$nadd += $add; + $$ndel += $del; + $add = "+$add"; + $del = "-$del"; + } + ++$$nchg; + my $num = sprintf('% 6s/%-6s', $del, $add); + $$dst .= " $num\t$l\n"; + } + + # the rest of the diff: + $self->{dbuf} = join("\0", @stat); + return unless $end; + + $self->{dstate} = DSTATE_HEAD; + $$dst .= "\n $$nchg "; + $$dst .= $$nchg == 1 ? 'file changed, ' : 'files changed, '; + $$dst .= $$nadd; + $$dst .= $$nadd == 1 ? ' insertion(+), ' : ' insertions(+), '; + $$dst .= $$ndel; + $$dst .= $$ndel == 1 ? " deletion(-)\n\n" : " deletions(-)\n\n"; +} + +# index abcdef89..01234567 100644 +sub git_diff_ab_index ($$$$) { + my ($self, $oid_a, $oid_b, $mode) = @_; + $self->{oid_a} = $oid_a; + $self->{oid_b} = $oid_b; + my $range = "$oid_a..$oid_b"; + if ($self->{diff_tree}) { + $range = qq({Q}">$range); + } + 'index ' . $range . ascii_html($mode); +} + +# diff --git a/foo.c b/bar.c +sub git_diff_ab_hdr ($$$) { + my ($self, $pa, $pb) = @_; + my $rv = ''; + if ($self->{dstate} != DSTATE_HEAD) { + to_state(\$rv, $self->{dstate}, DSTATE_HEAD); + } + $pa = (split('/', git_unquote($pa), 2))[1]; + $pb = (split('/', git_unquote($pb), 2))[1]; + $self->{Q} = dquery($pa, $pb) if $self->{diff_tree}; + my $anchor = to_attr($pb); + delete $self->{anchors}->{$anchor}; + + # not wasting bandwidth on links here + # links in hunk headers are far more useful with line offsets + $rv .= qq(diff --git ) . + ascii_html($pa) . ' ' . ascii_html($pb) +} + +# diff (--cc|--combined) +sub git_diff_cc_hdr { + my ($self, $combined, $path) = @_; + $path = git_unquote($path); + $self->{Q} = dquery($path, $path); + my $anchor = to_attr($path); + delete $self->{anchors}->{$anchor}; + qq(diff --$combined ) . ascii_html($path); +} + +sub offset_link ($$$) { + my ($qs, $oid, $offset) = @_; + my ($n) = ($offset =~ /\A[\-\+](\d+)/); + if (defined $n && $n == 0) { + # new or deleted file, don't link it + $offset; + } else { + $n = defined $n ? "#n$n" : ''; + qq($offset) + } +} + +# @@ -1,2 +3,4 @@ (regular diff) +sub git_diff_ab_hunk ($$$$) { + my ($self, $ca, $cb, $func_ctx) = @_; + my $qs = $self->{Q}; + + qq(@@ ) . # HTML + offset_link($qs, $self->{oid_a}, $ca) . + ' ' . offset_link($qs, $self->{oid_b}, $cb) . + ' @@' . ascii_html($func_ctx) . ''; +} + +# index abcdef09,01234567..76543210 +sub git_diff_cc_index { + my ($self, $before, $last, $end) = @_; + $self->{oids_cc} = [ split(',', $before), $last ]; + + # not wasting bandwidth on links here, yet + # links in hunk headers are far more useful with line offsets + "index $before..$last" . ascii_html($end); +} + +# @@@ -1,2 -3,4 +5,6 @@@ (combined diff) +sub git_diff_cc_hunk ($$$$) { + my ($self, $at_signs, $offs, $func_ctx) = @_; + my $pobj = $self->{oids_cc}; + my $i = 0; + my $qs = $self->{Q}; + qq(@@ ) . # HTML + join(' ', $at_signs, map { + offset_link($qs, $pobj->[$i++], $_); + } split(' ', $offs), + $at_signs) . ascii_html($func_ctx) . ''; +} + +# the rest of the diff (beyond diffstat) +sub git_diff_sed_lines ($$) { + my ($self, $dst) = @_; + + my @dlines = split(/\n/, delete $self->{dbuf}, -1); + + # don't touch the last line, it may not be terminated + $self->{dbuf} = pop @dlines; + + if (my $help = delete $self->{mhelp}) { + $$dst .= $help; # CC_MERGE + } + + # reminder: this is stricter than similar code in flush_diff, + # this is for git output (including --cc/--combined) we generate, + # while flush_diff parses mail + my $ndiff = \($self->{ndiff}); + my $linkify = PublicInbox::Linkify->new; + while (defined(my $s = shift @dlines)) { + utf8::upgrade($s); + if ($s =~ m{\Adiff --git ("?a/.+) ("?b/.+)\z}) { # regular + $$dst .= git_diff_ab_hdr($self, $1, $2); + } elsif ($s =~ m{\Adiff --(cc|combined) (.+)\z}) { + $$dst .= git_diff_cc_hdr($self, $1, $2); + } elsif ($s =~ /\Aindex ($OID_BLOB)\.\.($OID_BLOB)(.*)\z/o) { + # regular diff + $$dst .= git_diff_ab_index($self, $1, $2, $3); + } elsif ($s =~ + /\Aindex ($OID_BLOB,[^\.]+)\.\.($OID_BLOB)(.*)\z/o) { + # --cc diff + $$dst .= git_diff_cc_index($self, $1, $2, $3); + } elsif ($s =~ /\A@@ (\S+) (\S+) @@(.*)\z/) { # regular + $$dst .= '' if $state2class[$self->{dstate}]; + $$dst .= git_diff_ab_hunk($self, $1, $2, $3); + $self->{dstate} = DSTATE_CTX; + } elsif ($s =~ /\A(@@@+) (\S+.*\S+) @@@+(.*)\z/) { # --cc + $$dst .= '' if $state2class[$self->{dstate}]; + $$dst .= git_diff_cc_hunk($self, $1, $2, $3); + } elsif ($s =~ /^ /) { + # works for common cases, but not weird/long filenames + if ($self->{dstate} == DSTATE_STAT && + $s =~ /^ (.+)( +\| .*\z)/s) { + anchor0(\$dst, $self, $linkify, $1, $2) and next; + } elsif ($state2class[$self->{dstate}]) { + to_state($dst, $self->{dstate}, DSTATE_CTX); + } + $$dst .= to_html($linkify, $s); + } elsif ($s =~ m!^--- ! || $s =~ m!^\+{3} !) { + # color only (no oid link) if missing dctx->{oid_*} + $self->{dstate} <= DSTATE_STAT and + to_state($dst, $state, DSTATE_HEAD); + $$dst .= to_html($linkify, $s); + } elsif ($s =~ /^\+/) { + if ($self->{dstate} != DSTATE_ADD) { + to_state($dst, $state, DSTATE_ADD); + } + $$dst .= to_html($linkify, $s); + } elsif ($s =~ /^-/) { + if ($$state != DSTATE_DEL && $$state > DSTATE_STAT) { + to_state($dst, $state, DSTATE_DEL); + } + $$dst .= to_html($linkify, $s); + # ignore the following lines in headers: + } elsif ($s =~ /^(?:dis)similarity index/ || + $s =~ /^(?:old|new) mode/ || + $s =~ /^(?:deleted|new) file mode/ || + $s =~ /^(?:copy|rename) (?:from|to) / || + $s =~ /^(?:dis)?similarity index /) { + $$dst .= to_html($linkify, $s); + } else { + $$dst .= to_html($linkify, $s); + } + ++$$ndiff; + } +} + +sub git_diff_sed_run ($$) { + my ($self, $dst) = @_; + $self->{dstate} == DSTATE_STAT and git_diff_sed_stat($self, $dst); + $self->{dstate} > DSTATE_STAT and git_diff_sed_lines($self, $dst); + undef; +} + +sub git_diff_sed_close ($$) { + my ($self, $dst) = @_; + my $tmp = delete $self->{dbuf}; + utf8::upgrade($tmp); + $$dst .= $tmp; + undef; +} + +sub git_diff_sed { + my ($self, $ctx) = @_; + my $ws = { ctx => $ctx }; + my @first = PublicInbox::WwwStream::html_top($ws) . '
';
+	$ctx->{-html_tip} = "
Output of: git $self->{git_cmd}\n";
+	$self->{dstate} = DSTATE_STAT;
+
+	# this filters for $fh->write or $body->getline (see Qspawn)
+	sub {
+		my $dst = shift @first || '';
+		if (defined $_[0]) { # $_[0] == scalar buffer
+			$self->{dbuf} .= $_[0];
+			git_diff_sed_run($self, \$dst);
+		} else { # undef means EOF from "git show", flush the last bit
+			git_diff_sed_close($self, \$dst);
+			$dst .= '
'.PublicInbox::WwwStream::html_end($ws); + } + $dst; + } +} + 1; diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm index c693fcf..2ba09a8 100644 --- a/lib/PublicInbox/ViewVCS.pm +++ b/lib/PublicInbox/ViewVCS.pm @@ -20,7 +20,7 @@ package PublicInbox::ViewVCS; use PublicInbox::SolverGit; use PublicInbox::WwwStream; use PublicInbox::Linkify; -use PublicInbox::ViewDiff qw(flush_diff); +use PublicInbox::ViewDiff; use PublicInbox::Hval qw(ascii_html to_filename); my $hl = eval { require PublicInbox::HlMod; @@ -181,23 +181,22 @@ ($$$$$$) return html_page($ctx, 500, \'seek error'); } $log = do { local $/; <$log> }; - warn "log: $log\n"; + my $vdiff = PublicInbox::ViewDiff->new; my $git_b = $res_b->[0]; - my $cmd = ['git', "--git-dir=$git_b->{git_dir}", 'diff', - $res_a->[1], $res_b->[1] ]; + my $gcmd = $vdiff->diff_cmd($res_a->[1], $res_b->[1]); + my $cmd = ['git', "--git-dir=$git_b->{git_dir}", @$gcmd ]; my $qsp = PublicInbox::Qspawn->new($cmd); my $env = $ctx->{env}; $env->{'qspawn.wcb'} = delete $ctx->{-wcb}; - $qsp->psgi_return($env, undef, sub { + $qsp->psgi_return($env, undef, sub { # parse header my ($r, $bref) = @_; if (!defined $r) { # error html_page($ctx, 500, $log); - } elsif (index($$bref, "\0") >= 0) { - my $ct = 'application/octet-stream'; - [200, ['Content-Type', $ct ] ]; + } elsif ($r == 0) { + PublicInbox::WwwStream::r($ctx, 200, 'empty diff'); } else { - my $ct = 'text/plain; charset=UTF-8'; - [200, ['Content-Type', $ct] ]; + $env->{'qspawn.filter'} = $vdiff->git_diff_sed($ctx); + PublicInbox::WwwStream::r($ctx, 200); } }); } -- EW