Git Mailing List Archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] Teach diff to honor diff algorithms set through git attributes
@ 2023-02-05  3:46 John Cai via GitGitGadget
  2023-02-05  3:46 ` [PATCH 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
                   ` (2 more replies)
  0 siblings, 3 replies; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-05  3:46 UTC (permalink / raw)
  To: git; +Cc: John Cai

When a repository contains different kinds of files, it may be desirable to
use different algorithms based on file type. This is currently not feasible
through the command line or using git configs. However, we can leverage the
fact that gitattributes are path aware.

Teach the diff machinery to check gitattributes when diffing files.

John Cai (2):
  diff: consolidate diff algorithm option parsing
  diff: teach diff to read gitattribute diff-algorithm

 Documentation/gitattributes.txt |  23 +++++++
 diff.c                          | 112 ++++++++++++++++++++++----------
 diff.h                          |   2 +
 t/lib-diff-alternative.sh       |  27 +++++++-
 4 files changed, 130 insertions(+), 34 deletions(-)


base-commit: a6a323b31e2bcbac2518bddec71ea7ad558870eb
Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-git-1452%2Fjohn-cai%2Fjc%2Fattr-diff-algo-v1
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-git-1452/john-cai/jc/attr-diff-algo-v1
Pull-Request: https://github.com/git/git/pull/1452
-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 78+ messages in thread

* [PATCH 1/2] diff: consolidate diff algorithm option parsing
  2023-02-05  3:46 [PATCH 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
@ 2023-02-05  3:46 ` John Cai via GitGitGadget
  2023-02-06 16:20   ` Phillip Wood
  2023-02-05  3:46 ` [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
  2023-02-14 21:40 ` [PATCH v2 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
  2 siblings, 1 reply; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-05  3:46 UTC (permalink / raw)
  To: git; +Cc: John Cai, John Cai

From: John Cai <jcai@gitlab.com>

The diff option parsing for --minimal, --patience, --histgoram can all
be consolidated into one function. This is a preparatory step for the
subsequent commit which teaches diff to keep track of whether or not a
diff algorithm has been set via the command line.

While we're at it, the logic that sets the diff algorithm in
diff_opt_diff_algorithm() can be refactored into a helper that will
allow multiple callsites to set the diff algorithm.

Signed-off-by: John Cai <johncai86@gmail.com>
---
 diff.c | 87 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 54 insertions(+), 33 deletions(-)

diff --git a/diff.c b/diff.c
index 329eebf16a0..a8a31c81fe7 100644
--- a/diff.c
+++ b/diff.c
@@ -3437,6 +3437,22 @@ static int diff_filepair_is_phoney(struct diff_filespec *one,
 	return !DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two);
 }
 
+static int set_diff_algorithm(struct diff_options *opts,
+			      const char *alg)
+{
+	long value = parse_algorithm_value(alg);
+
+	if (value < 0)
+		return 1;
+
+	/* clear out previous settings */
+	DIFF_XDL_CLR(opts, NEED_MINIMAL);
+	opts->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
+	opts->xdl_opts |= value;
+
+	return 0;
+}
+
 static void builtin_diff(const char *name_a,
 			 const char *name_b,
 			 struct diff_filespec *one,
@@ -5107,17 +5123,40 @@ static int diff_opt_diff_algorithm(const struct option *opt,
 				   const char *arg, int unset)
 {
 	struct diff_options *options = opt->value;
-	long value = parse_algorithm_value(arg);
 
 	BUG_ON_OPT_NEG(unset);
-	if (value < 0)
+
+	if (set_diff_algorithm(options, arg))
 		return error(_("option diff-algorithm accepts \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\""));
 
-	/* clear out previous settings */
-	DIFF_XDL_CLR(options, NEED_MINIMAL);
-	options->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
-	options->xdl_opts |= value;
+	return 0;
+}
+
+static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
+				   const char *arg, int unset)
+{
+	struct diff_options *options = opt->value;
+
+	BUG_ON_OPT_NEG(unset);
+	BUG_ON_OPT_ARG(arg);
+
+	if (!strcmp(opt->long_name, "patience")) {
+		int i;
+		/*
+		 * Both --patience and --anchored use PATIENCE_DIFF
+		 * internally, so remove any anchors previously
+		 * specified.
+		 */
+		for (i = 0; i < options->anchors_nr; i++)
+			free(options->anchors[i]);
+		options->anchors_nr = 0;
+	}
+
+	if (set_diff_algorithm(options, opt->long_name))
+		return error(_("available diff algorithms include \"myers\", "
+			       "\"minimal\", \"patience\" and \"histogram\""));
+
 	return 0;
 }
 
@@ -5242,26 +5281,6 @@ static enum parse_opt_result diff_opt_output(struct parse_opt_ctx_t *ctx,
 	return 0;
 }
 
-static int diff_opt_patience(const struct option *opt,
-			     const char *arg, int unset)
-{
-	struct diff_options *options = opt->value;
-	int i;
-
-	BUG_ON_OPT_NEG(unset);
-	BUG_ON_OPT_ARG(arg);
-	options->xdl_opts = DIFF_WITH_ALG(options, PATIENCE_DIFF);
-	/*
-	 * Both --patience and --anchored use PATIENCE_DIFF
-	 * internally, so remove any anchors previously
-	 * specified.
-	 */
-	for (i = 0; i < options->anchors_nr; i++)
-		free(options->anchors[i]);
-	options->anchors_nr = 0;
-	return 0;
-}
-
 static int diff_opt_ignore_regex(const struct option *opt,
 				 const char *arg, int unset)
 {
@@ -5562,9 +5581,10 @@ struct option *add_diff_options(const struct option *opts,
 			    N_("prevent rename/copy detection if the number of rename/copy targets exceeds given limit")),
 
 		OPT_GROUP(N_("Diff algorithm options")),
-		OPT_BIT(0, "minimal", &options->xdl_opts,
-			N_("produce the smallest possible diff"),
-			XDF_NEED_MINIMAL),
+		OPT_CALLBACK_F(0, "minimal", options, NULL,
+			       N_("produce the smallest possible diff"),
+			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
+			       diff_opt_diff_algorithm_no_arg),
 		OPT_BIT_F('w', "ignore-all-space", &options->xdl_opts,
 			  N_("ignore whitespace when comparing lines"),
 			  XDF_IGNORE_WHITESPACE, PARSE_OPT_NONEG),
@@ -5589,10 +5609,11 @@ struct option *add_diff_options(const struct option *opts,
 		OPT_CALLBACK_F(0, "patience", options, NULL,
 			       N_("generate diff using the \"patience diff\" algorithm"),
 			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
-			       diff_opt_patience),
-		OPT_BITOP(0, "histogram", &options->xdl_opts,
-			  N_("generate diff using the \"histogram diff\" algorithm"),
-			  XDF_HISTOGRAM_DIFF, XDF_DIFF_ALGORITHM_MASK),
+			       diff_opt_diff_algorithm_no_arg),
+		OPT_CALLBACK_F(0, "histogram", options, NULL,
+			       N_("generate diff using the \"histogram diff\" algorithm"),
+			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
+			       diff_opt_diff_algorithm_no_arg),
 		OPT_CALLBACK_F(0, "diff-algorithm", options, N_("<algorithm>"),
 			       N_("choose a diff algorithm"),
 			       PARSE_OPT_NONEG, diff_opt_diff_algorithm),
-- 
gitgitgadget


^ permalink raw reply related	[flat|nested] 78+ messages in thread

* [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-05  3:46 [PATCH 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
  2023-02-05  3:46 ` [PATCH 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
@ 2023-02-05  3:46 ` John Cai via GitGitGadget
  2023-02-05 17:50   ` Eric Sunshine
                     ` (3 more replies)
  2023-02-14 21:40 ` [PATCH v2 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
  2 siblings, 4 replies; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-05  3:46 UTC (permalink / raw)
  To: git; +Cc: John Cai, John Cai

From: John Cai <johncai86@gmail.com>

It can be useful to specify diff algorithms per file type. For example,
one may want to use the minimal diff algorithm for .json files, another
for .c files, etc.

Teach the diff machinery to check attributes for a diff algorithm.
Enforce precedence by favoring the command line option, then looking at
attributes, then finally the config.

To enforce precedence order, set the `xdl_opts_command_line` member
during options pasing to indicate the diff algorithm was set via command
line args.

Signed-off-by: John Cai <johncai86@gmail.com>
---
 Documentation/gitattributes.txt | 23 +++++++++++++++++++++++
 diff.c                          | 25 +++++++++++++++++++++++++
 diff.h                          |  2 ++
 t/lib-diff-alternative.sh       | 27 ++++++++++++++++++++++++++-
 4 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
index c19e64ea0ef..501dd536037 100644
--- a/Documentation/gitattributes.txt
+++ b/Documentation/gitattributes.txt
@@ -736,6 +736,29 @@ String::
 	by the configuration variables in the "diff.foo" section of the
 	Git config file.
 
+`diff-algorithm`
+^^^^^^^^^^^^^^^^
+
+The attribute `diff-algorithm` affects which algorithm Git uses to generate
+diffs. This allows defining diff algorithms per file extension. Precedence rules
+are as follows, in order from highest to lowest:
+
+*Command line option*
+
+Pass in the `--diff-algorithm` command line option int git-diff(1)
+
+*Git attributes*
+
+------------------------
+*.json	diff-algorithm=histogram
+------------------------
+
+*Git config*
+
+----------------------------------------------------------------
+[diff]
+	algorithm = histogram
+----------------------------------------------------------------
 
 Defining an external diff driver
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/diff.c b/diff.c
index a8a31c81fe7..c78e28daeb0 100644
--- a/diff.c
+++ b/diff.c
@@ -3652,6 +3652,27 @@ static void builtin_diff(const char *name_a,
 		ecbdata.opt = o;
 		if (header.len && !o->flags.suppress_diff_headers)
 			ecbdata.header = &header;
+
+		if (!o->xdl_opts_command_line) {
+			static struct attr_check *check;
+			const char *one_diff_algo;
+			const char *two_diff_algo;
+
+			check = attr_check_alloc();
+			attr_check_append(check, git_attr("diff-algorithm"));
+
+			git_check_attr(the_repository->index, NULL, one->path, check);
+			one_diff_algo = check->items[0].value;
+			git_check_attr(the_repository->index, NULL, two->path, check);
+			two_diff_algo = check->items[0].value;
+
+			if (!ATTR_UNSET(one_diff_algo) && !ATTR_UNSET(two_diff_algo) &&
+				!strcmp(one_diff_algo, two_diff_algo))
+				set_diff_algorithm(o, one_diff_algo);
+
+			attr_check_free(check);
+		}
+
 		xpp.flags = o->xdl_opts;
 		xpp.ignore_regex = o->ignore_regex;
 		xpp.ignore_regex_nr = o->ignore_regex_nr;
@@ -5130,6 +5151,8 @@ static int diff_opt_diff_algorithm(const struct option *opt,
 		return error(_("option diff-algorithm accepts \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\""));
 
+	options->xdl_opts_command_line = 1;
+
 	return 0;
 }
 
@@ -5157,6 +5180,8 @@ static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
 		return error(_("available diff algorithms include \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\""));
 
+	options->xdl_opts_command_line = 1;
+
 	return 0;
 }
 
diff --git a/diff.h b/diff.h
index 41eb2c3d428..46b565abfd4 100644
--- a/diff.h
+++ b/diff.h
@@ -333,6 +333,8 @@ struct diff_options {
 	int prefix_length;
 	const char *stat_sep;
 	int xdl_opts;
+	/* If xdl_opts has been set via the command line. */
+	int xdl_opts_command_line;
 
 	/* see Documentation/diff-options.txt */
 	char **anchors;
diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
index 8d1e408bb58..630c98ea65a 100644
--- a/t/lib-diff-alternative.sh
+++ b/t/lib-diff-alternative.sh
@@ -107,8 +107,27 @@ EOF
 
 	STRATEGY=$1
 
+	test_expect_success "$STRATEGY diff from attributes" '
+		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
+		test_must_fail git diff --no-index file1 file2 > output &&
+		test_cmp expect output
+	'
+
 	test_expect_success "$STRATEGY diff" '
-		test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
+		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff command line precedence before attributes" '
+		echo "file* diff-algorithm=meyers" >.gitattributes &&
+		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff attributes precedence before config" '
+		git config diff.algorithm default &&
+		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
+		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
 		test_cmp expect output
 	'
 
@@ -166,5 +185,11 @@ EOF
 		test_must_fail git diff --no-index "--$STRATEGY" uniq1 uniq2 > output &&
 		test_cmp expect output
 	'
+
+	test_expect_success "$STRATEGY diff from attributes" '
+		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
+		test_must_fail git diff --no-index uniq1 uniq2 > output &&
+		test_cmp expect output
+	'
 }
 
-- 
gitgitgadget

^ permalink raw reply related	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-05  3:46 ` [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
@ 2023-02-05 17:50   ` Eric Sunshine
  2023-02-06 13:10     ` John Cai
  2023-02-06 16:27   ` Phillip Wood
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 78+ messages in thread
From: Eric Sunshine @ 2023-02-05 17:50 UTC (permalink / raw)
  To: John Cai via GitGitGadget; +Cc: git, John Cai

On Sat, Feb 4, 2023 at 11:47 PM John Cai via GitGitGadget
<gitgitgadget@gmail.com> wrote:
> It can be useful to specify diff algorithms per file type. For example,
> one may want to use the minimal diff algorithm for .json files, another
> for .c files, etc.
>
> Teach the diff machinery to check attributes for a diff algorithm.
> Enforce precedence by favoring the command line option, then looking at
> attributes, then finally the config.
>
> To enforce precedence order, set the `xdl_opts_command_line` member
> during options pasing to indicate the diff algorithm was set via command
> line args.
>
> Signed-off-by: John Cai <johncai86@gmail.com>
> ---
> diff --git a/diff.c b/diff.c
> @@ -3652,6 +3652,27 @@ static void builtin_diff(const char *name_a,
> +               if (!o->xdl_opts_command_line) {
> +                       static struct attr_check *check;

`check` is declared static...

> +                       const char *one_diff_algo;
> +                       const char *two_diff_algo;
> +
> +                       check = attr_check_alloc();

... is allocated here...

> +                       attr_check_append(check, git_attr("diff-algorithm"));
> +
> +                       git_check_attr(the_repository->index, NULL, one->path, check);
> +                       one_diff_algo = check->items[0].value;
> +                       git_check_attr(the_repository->index, NULL, two->path, check);
> +                       two_diff_algo = check->items[0].value;
> +
> +                       if (!ATTR_UNSET(one_diff_algo) && !ATTR_UNSET(two_diff_algo) &&
> +                               !strcmp(one_diff_algo, two_diff_algo))
> +                               set_diff_algorithm(o, one_diff_algo);
> +
> +                       attr_check_free(check);

... and freed here...

> +               }

... so the reason for the `static` declaration is not clear. Am I
missing something obvious?

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-05 17:50   ` Eric Sunshine
@ 2023-02-06 13:10     ` John Cai
  0 siblings, 0 replies; 78+ messages in thread
From: John Cai @ 2023-02-06 13:10 UTC (permalink / raw)
  To: Eric Sunshine; +Cc: John Cai via GitGitGadget, git

Hi Eric,

On 5 Feb 2023, at 12:50, Eric Sunshine wrote:

> On Sat, Feb 4, 2023 at 11:47 PM John Cai via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>> It can be useful to specify diff algorithms per file type. For example,
>> one may want to use the minimal diff algorithm for .json files, another
>> for .c files, etc.
>>
>> Teach the diff machinery to check attributes for a diff algorithm.
>> Enforce precedence by favoring the command line option, then looking at
>> attributes, then finally the config.
>>
>> To enforce precedence order, set the `xdl_opts_command_line` member
>> during options pasing to indicate the diff algorithm was set via command
>> line args.
>>
>> Signed-off-by: John Cai <johncai86@gmail.com>
>> ---
>> diff --git a/diff.c b/diff.c
>> @@ -3652,6 +3652,27 @@ static void builtin_diff(const char *name_a,
>> +               if (!o->xdl_opts_command_line) {
>> +                       static struct attr_check *check;
>
> `check` is declared static...
>
>> +                       const char *one_diff_algo;
>> +                       const char *two_diff_algo;
>> +
>> +                       check = attr_check_alloc();
>
> ... is allocated here...
>
>> +                       attr_check_append(check, git_attr("diff-algorithm"));
>> +
>> +                       git_check_attr(the_repository->index, NULL, one->path, check);
>> +                       one_diff_algo = check->items[0].value;
>> +                       git_check_attr(the_repository->index, NULL, two->path, check);
>> +                       two_diff_algo = check->items[0].value;
>> +
>> +                       if (!ATTR_UNSET(one_diff_algo) && !ATTR_UNSET(two_diff_algo) &&
>> +                               !strcmp(one_diff_algo, two_diff_algo))
>> +                               set_diff_algorithm(o, one_diff_algo);
>> +
>> +                       attr_check_free(check);
>
> ... and freed here...
>
>> +               }
>
> ... so the reason for the `static` declaration is not clear. Am I
> missing something obvious?

No, you are correct. No reason for the static declaration. `check` is not used outside of the scope of this
conditional. I think this made it in from an earlier iteration and I didn't catch the oversight.

thanks
John



^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 1/2] diff: consolidate diff algorithm option parsing
  2023-02-05  3:46 ` [PATCH 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
@ 2023-02-06 16:20   ` Phillip Wood
  0 siblings, 0 replies; 78+ messages in thread
From: Phillip Wood @ 2023-02-06 16:20 UTC (permalink / raw)
  To: John Cai via GitGitGadget, git; +Cc: John Cai, John Cai

Hi John

On 05/02/2023 03:46, John Cai via GitGitGadget wrote:
> From: John Cai <jcai@gitlab.com>
> 
> The diff option parsing for --minimal, --patience, --histgoram can all
> be consolidated into one function. This is a preparatory step for the
> subsequent commit which teaches diff to keep track of whether or not a
> diff algorithm has been set via the command line.
> 
> While we're at it, the logic that sets the diff algorithm in
> diff_opt_diff_algorithm() can be refactored into a helper that will
> allow multiple callsites to set the diff algorithm.

You say "while  we're at it" but isn't it a wholly necessary change for 
what you want to do?

This patch basically looks good, I've left a couple of comments below, 
thanks for separating it out as a preparatory step

> Signed-off-by: John Cai <johncai86@gmail.com>
> ---
>   diff.c | 87 ++++++++++++++++++++++++++++++++++++----------------------
>   1 file changed, 54 insertions(+), 33 deletions(-)
> 
> diff --git a/diff.c b/diff.c
> index 329eebf16a0..a8a31c81fe7 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -3437,6 +3437,22 @@ static int diff_filepair_is_phoney(struct diff_filespec *one,
>   	return !DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two);
>   }
>   
> +static int set_diff_algorithm(struct diff_options *opts,
> +			      const char *alg)
> +{
> +	long value = parse_algorithm_value(alg);
> +
> +	if (value < 0)
> +		return 1;
> +
> +	/* clear out previous settings */
> +	DIFF_XDL_CLR(opts, NEED_MINIMAL);
> +	opts->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
> +	opts->xdl_opts |= value;
> +
> +	return 0;
> +}
> +
>   static void builtin_diff(const char *name_a,
>   			 const char *name_b,
>   			 struct diff_filespec *one,
> @@ -5107,17 +5123,40 @@ static int diff_opt_diff_algorithm(const struct option *opt,
>   				   const char *arg, int unset)
>   {
>   	struct diff_options *options = opt->value;
> -	long value = parse_algorithm_value(arg);
>   
>   	BUG_ON_OPT_NEG(unset);
> -	if (value < 0)
> +
> +	if (set_diff_algorithm(options, arg))
>   		return error(_("option diff-algorithm accepts \"myers\", "
>   			       "\"minimal\", \"patience\" and \"histogram\""));
>   
> -	/* clear out previous settings */
> -	DIFF_XDL_CLR(options, NEED_MINIMAL);
> -	options->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
> -	options->xdl_opts |= value;
> +	return 0;
> +}
> +
> +static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
> +				   const char *arg, int unset)
> +{
> +	struct diff_options *options = opt->value;
> +
> +	BUG_ON_OPT_NEG(unset);
> +	BUG_ON_OPT_ARG(arg);
> +
> +	if (!strcmp(opt->long_name, "patience")) {
> +		int i;

This is copied from the existing code but as `options->anchors_nr` is a 
size_t it is probably worth converting `i` to a size_t here.

> +		/*
> +		 * Both --patience and --anchored use PATIENCE_DIFF
> +		 * internally, so remove any anchors previously
> +		 * specified.
> +		 */
> +		for (i = 0; i < options->anchors_nr; i++)
> +			free(options->anchors[i]);
> +		options->anchors_nr = 0;
> +	}
> +
> +	if (set_diff_algorithm(options, opt->long_name))
> +		return error(_("available diff algorithms include \"myers\", "
> +			       "\"minimal\", \"patience\" and \"histogram\""));

I think this should be a BUG() as it is a programming error if we reach 
this point.

Best Wishes

Phillip

> +
>   	return 0;
>   }
>   
> @@ -5242,26 +5281,6 @@ static enum parse_opt_result diff_opt_output(struct parse_opt_ctx_t *ctx,
>   	return 0;
>   }
>   
> -static int diff_opt_patience(const struct option *opt,
> -			     const char *arg, int unset)
> -{
> -	struct diff_options *options = opt->value;
> -	int i;
> -
> -	BUG_ON_OPT_NEG(unset);
> -	BUG_ON_OPT_ARG(arg);
> -	options->xdl_opts = DIFF_WITH_ALG(options, PATIENCE_DIFF);
> -	/*
> -	 * Both --patience and --anchored use PATIENCE_DIFF
> -	 * internally, so remove any anchors previously
> -	 * specified.
> -	 */
> -	for (i = 0; i < options->anchors_nr; i++)
> -		free(options->anchors[i]);
> -	options->anchors_nr = 0;
> -	return 0;
> -}
> -
>   static int diff_opt_ignore_regex(const struct option *opt,
>   				 const char *arg, int unset)
>   {
> @@ -5562,9 +5581,10 @@ struct option *add_diff_options(const struct option *opts,
>   			    N_("prevent rename/copy detection if the number of rename/copy targets exceeds given limit")),
>   
>   		OPT_GROUP(N_("Diff algorithm options")),
> -		OPT_BIT(0, "minimal", &options->xdl_opts,
> -			N_("produce the smallest possible diff"),
> -			XDF_NEED_MINIMAL),
> +		OPT_CALLBACK_F(0, "minimal", options, NULL,
> +			       N_("produce the smallest possible diff"),
> +			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
> +			       diff_opt_diff_algorithm_no_arg),
>   		OPT_BIT_F('w', "ignore-all-space", &options->xdl_opts,
>   			  N_("ignore whitespace when comparing lines"),
>   			  XDF_IGNORE_WHITESPACE, PARSE_OPT_NONEG),
> @@ -5589,10 +5609,11 @@ struct option *add_diff_options(const struct option *opts,
>   		OPT_CALLBACK_F(0, "patience", options, NULL,
>   			       N_("generate diff using the \"patience diff\" algorithm"),
>   			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
> -			       diff_opt_patience),
> -		OPT_BITOP(0, "histogram", &options->xdl_opts,
> -			  N_("generate diff using the \"histogram diff\" algorithm"),
> -			  XDF_HISTOGRAM_DIFF, XDF_DIFF_ALGORITHM_MASK),
> +			       diff_opt_diff_algorithm_no_arg),
> +		OPT_CALLBACK_F(0, "histogram", options, NULL,
> +			       N_("generate diff using the \"histogram diff\" algorithm"),
> +			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
> +			       diff_opt_diff_algorithm_no_arg),
>   		OPT_CALLBACK_F(0, "diff-algorithm", options, N_("<algorithm>"),
>   			       N_("choose a diff algorithm"),
>   			       PARSE_OPT_NONEG, diff_opt_diff_algorithm),

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-05  3:46 ` [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
  2023-02-05 17:50   ` Eric Sunshine
@ 2023-02-06 16:27   ` Phillip Wood
  2023-02-06 18:14     ` Eric Sunshine
  2023-02-06 19:50     ` John Cai
  2023-02-06 16:39   ` Ævar Arnfjörð Bjarmason
  2023-02-07 17:56   ` Jeff King
  3 siblings, 2 replies; 78+ messages in thread
From: Phillip Wood @ 2023-02-06 16:27 UTC (permalink / raw)
  To: John Cai via GitGitGadget, git; +Cc: John Cai

Hi John

On 05/02/2023 03:46, John Cai via GitGitGadget wrote:
> From: John Cai <johncai86@gmail.com>
> 
> It can be useful to specify diff algorithms per file type. For example,
> one may want to use the minimal diff algorithm for .json files, another
> for .c files, etc.

Have you got any examples of why this is useful? I find myself 
occasionally changing the algorithm when the default gives a sub-optimal 
diff but I've not really noticed any pattern with respect to file types.

> Teach the diff machinery to check attributes for a diff algorithm.
> Enforce precedence by favoring the command line option, then looking at
> attributes, then finally the config.
> 
> To enforce precedence order, set the `xdl_opts_command_line` member
> during options pasing to indicate the diff algorithm was set via command
> line args.

I've only commented on the tests as it looks like Eric and had a careful 
look at the code

> diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
> index 8d1e408bb58..630c98ea65a 100644
> --- a/t/lib-diff-alternative.sh
> +++ b/t/lib-diff-alternative.sh
> @@ -107,8 +107,27 @@ EOF
>   
>   	STRATEGY=$1
>   
> +	test_expect_success "$STRATEGY diff from attributes" '
> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
> +		test_must_fail git diff --no-index file1 file2 > output &&
> +		test_cmp expect output
> +	'
> +
>   	test_expect_success "$STRATEGY diff" '
> -		test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
> +		test_cmp expect output
> +	'
> +
> +	test_expect_success "$STRATEGY diff command line precedence before attributes" '
> +		echo "file* diff-algorithm=meyers" >.gitattributes &&
> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
> +		test_cmp expect output
> +	'
> +
> +	test_expect_success "$STRATEGY diff attributes precedence before config" '
> +		git config diff.algorithm default &&
> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&

This test passes with or without the code changes in this patch. I think 
you need to drop --diff-algorithm=$STRATEGY from the diff command.

>   		test_cmp expect output
>   	'
>   
> @@ -166,5 +185,11 @@ EOF
>   		test_must_fail git diff --no-index "--$STRATEGY" uniq1 uniq2 > output &&
>   		test_cmp expect output
>   	'
> +
> +	test_expect_success "$STRATEGY diff from attributes" '
> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
> +		test_must_fail git diff --no-index uniq1 uniq2 > output &&
> +		test_cmp expect output

This test also passes with or without the code changes in this patch. It 
  is the same as the first test added above but with files that give the 
same diff irrespective of the algorithm chosen so I don't think it is 
doing anything useful. Unless I've missed something it should be dropped.

Best Wishes

Phillip

> +	'
>   }
>   

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-05  3:46 ` [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
  2023-02-05 17:50   ` Eric Sunshine
  2023-02-06 16:27   ` Phillip Wood
@ 2023-02-06 16:39   ` Ævar Arnfjörð Bjarmason
  2023-02-06 20:37     ` John Cai
  2023-02-09  7:50     ` Elijah Newren
  2023-02-07 17:56   ` Jeff King
  3 siblings, 2 replies; 78+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2023-02-06 16:39 UTC (permalink / raw)
  To: John Cai via GitGitGadget; +Cc: git, John Cai


On Sun, Feb 05 2023, John Cai via GitGitGadget wrote:

> From: John Cai <johncai86@gmail.com>
> [...]
> +
> +		if (!o->xdl_opts_command_line) {
> +			static struct attr_check *check;
> +			const char *one_diff_algo;
> +			const char *two_diff_algo;
> +
> +			check = attr_check_alloc();
> +			attr_check_append(check, git_attr("diff-algorithm"));
> +
> +			git_check_attr(the_repository->index, NULL, one->path, check);
> +			one_diff_algo = check->items[0].value;
> +			git_check_attr(the_repository->index, NULL, two->path, check);
> +			two_diff_algo = check->items[0].value;
> +
> +			if (!ATTR_UNSET(one_diff_algo) && !ATTR_UNSET(two_diff_algo) &&
> +				!strcmp(one_diff_algo, two_diff_algo))
> +				set_diff_algorithm(o, one_diff_algo);
> +
> +			attr_check_free(check);

This is a bit nitpicky, but I for one would find this much easier to
read with some shorter variables, here just with "a" rather than
"one_diff_algo", "b" instead of "two_diff_algo", and splitting
"the_repository->index" into "istate" (untested):
	
	+		if (!o->xdl_opts_command_line) {
	+			static struct attr_check *check;
	+			const char *a;
	+			const char *b;
	+			struct index_state *istate = the_repository->index;
	+
	+			check = attr_check_alloc();
	+			attr_check_append(check, git_attr("diff-algorithm"));
	+
	+			git_check_attr(istate, NULL, one->path, check);
	+			a = check->items[0].value;
	+			git_check_attr(istate, NULL, two->path, check);
	+			b = check->items[0].value;
	+
	+			if (!ATTR_UNSET(a) && !ATTR_UNSET(b) && !strcmp(a, b))
	+				set_diff_algorithm(o, a);
	+
	+			attr_check_free(check);
	+		}

That also nicely keeps the line length shorter.

> @@ -333,6 +333,8 @@ struct diff_options {
>  	int prefix_length;
>  	const char *stat_sep;
>  	int xdl_opts;
> +	/* If xdl_opts has been set via the command line. */
> +	int xdl_opts_command_line;
>  
>  	/* see Documentation/diff-options.txt */
>  	char **anchors;
> diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
> index 8d1e408bb58..630c98ea65a 100644
> --- a/t/lib-diff-alternative.sh
> +++ b/t/lib-diff-alternative.sh
> @@ -107,8 +107,27 @@ EOF
>  
>  	STRATEGY=$1
>  
> +	test_expect_success "$STRATEGY diff from attributes" '
> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
> +		test_must_fail git diff --no-index file1 file2 > output &&
> +		test_cmp expect output
> +	'
> +
>  	test_expect_success "$STRATEGY diff" '
> -		test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&

Nit: The usual style is ">output", not "> output".

> +		test_cmp expect output
> +	'
> +
> +	test_expect_success "$STRATEGY diff command line precedence before attributes" '
> +		echo "file* diff-algorithm=meyers" >.gitattributes &&
> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
> +		test_cmp expect output
> +	'
> +
> +	test_expect_success "$STRATEGY diff attributes precedence before config" '
> +		git config diff.algorithm default &&
> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>  		test_cmp expect output
>  	'
>  
> @@ -166,5 +185,11 @@ EOF
>  		test_must_fail git diff --no-index "--$STRATEGY" uniq1 uniq2 > output &&
>  		test_cmp expect output
>  	'
> +
> +	test_expect_success "$STRATEGY diff from attributes" '
> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
> +		test_must_fail git diff --no-index uniq1 uniq2 > output &&
> +		test_cmp expect output
> +	'
>  }

For some non-nitpicking, I do worry about exposing this as a DoS vector,
e.g. here's a diff between two distant points in git.git with the
various algorithms:

	$ hyperfine -r 1 -L a patience,minimal,histogram,myers 'git diff --diff-algorithm={a} v2.0.0 v2.28.0'
	Benchmark 1: git diff --diff-algorithm=patience v2.0.0 v2.28.0
	  Time (abs ≡):        42.121 s               [User: 41.879 s, System: 0.144 s]
	
	Benchmark 2: git diff --diff-algorithm=minimal v2.0.0 v2.28.0
	  Time (abs ≡):        35.634 s               [User: 35.473 s, System: 0.160 s]
	
	Benchmark 3: git diff --diff-algorithm=histogram v2.0.0 v2.28.0
	  Time (abs ≡):        46.912 s               [User: 46.657 s, System: 0.228 s]
	
	Benchmark 4: git diff --diff-algorithm=myers v2.0.0 v2.28.0
	  Time (abs ≡):        33.233 s               [User: 33.072 s, System: 0.160 s]
	
	Summary
	  'git diff --diff-algorithm=myers v2.0.0 v2.28.0' ran
	    1.07 times faster than 'git diff --diff-algorithm=minimal v2.0.0 v2.28.0'
	    1.27 times faster than 'git diff --diff-algorithm=patience v2.0.0 v2.28.0'
	    1.41 times faster than 'git diff --diff-algorithm=histogram v2.0.0 v2.28.0'

Now, all of those are very slow overall, but some much more than
others. I seem to recall that the non-default ones also had some
pathological cases.

Another thing to think about is that we've so far considered the diff
algorithm to be purely about presentation, with some notable exceptions
such as "patch-id".

I've advocated for us getting to the point of having an in-repo
.gitconfig or .gitattributes before with a whitelist of settings like
diff.context for certain paths, or a diff.orderFile.

But those seem easy to promise future behavior for, v.s. an entire diff
algorithm (which we of course had before, but now we'd have it in
repository data).

Maybe that's not a distinction worth worrying about, just putting that
out there.

I think if others are concerned about the above something that would
neatly side-step those is to have it opt-in via the .git/config somehow,
similar to e.g. how you can commit *.gpg content, put this in
.gitattributes:

	*.gpg diff=gpg

But not have it do anything until this is in the repo's .git/config (or
similar):

	[diff "gpg"]
        	textconv = gpg --no-tty --decrypt

For that you could still keep the exact .gitattributes format you have
here, i.e.:

	file* diff-algorithm=$STRATEGY

But we to pick it up we'd need either:

	[diff-algorithm]
        	histogram = myers

Or:

	[diff-algorithm "histogram"]
        	allow = true

The former form being one that would allow you to map the .gitattributes
of the repo (but maybe that would be redundant to
.git/info/attributes)...

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-06 16:27   ` Phillip Wood
@ 2023-02-06 18:14     ` Eric Sunshine
  2023-02-06 19:50     ` John Cai
  1 sibling, 0 replies; 78+ messages in thread
From: Eric Sunshine @ 2023-02-06 18:14 UTC (permalink / raw)
  To: phillip.wood; +Cc: John Cai via GitGitGadget, git, John Cai

On Mon, Feb 6, 2023 at 11:46 AM Phillip Wood <phillip.wood123@gmail.com> wrote:
> On 05/02/2023 03:46, John Cai via GitGitGadget wrote:
> > Teach the diff machinery to check attributes for a diff algorithm.
> > Enforce precedence by favoring the command line option, then looking at
> > attributes, then finally the config.
> >
> > To enforce precedence order, set the `xdl_opts_command_line` member
> > during options pasing to indicate the diff algorithm was set via command
> > line args.
>
> I've only commented on the tests as it looks like Eric and had a careful
> look at the code

I only very quickly ran my eye over the code; didn't even really read it.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-06 16:27   ` Phillip Wood
  2023-02-06 18:14     ` Eric Sunshine
@ 2023-02-06 19:50     ` John Cai
  2023-02-09  8:26       ` Elijah Newren
  1 sibling, 1 reply; 78+ messages in thread
From: John Cai @ 2023-02-06 19:50 UTC (permalink / raw)
  To: phillip.wood; +Cc: John Cai via GitGitGadget, git

Hi Phillip,

On 6 Feb 2023, at 11:27, Phillip Wood wrote:

> Hi John
>
> On 05/02/2023 03:46, John Cai via GitGitGadget wrote:
>> From: John Cai <johncai86@gmail.com>
>>
>> It can be useful to specify diff algorithms per file type. For example,
>> one may want to use the minimal diff algorithm for .json files, another
>> for .c files, etc.
>
> Have you got any examples of why this is useful? I find myself occasionally changing the algorithm when the default gives a sub-optimal diff but I've not really noticed any pattern with respect to file types.

At $DAYJOB, there has been a discussion and request for a feature like this [1].
One use case that came up was to be able to set a different diff algorithm for
.json files.

1. https://gitlab.com/gitlab-org/gitaly/-/issues/2591

>
>> Teach the diff machinery to check attributes for a diff algorithm.
>> Enforce precedence by favoring the command line option, then looking at
>> attributes, then finally the config.
>>
>> To enforce precedence order, set the `xdl_opts_command_line` member
>> during options pasing to indicate the diff algorithm was set via command
>> line args.
>
> I've only commented on the tests as it looks like Eric and had a careful look at the code
>
>> diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
>> index 8d1e408bb58..630c98ea65a 100644
>> --- a/t/lib-diff-alternative.sh
>> +++ b/t/lib-diff-alternative.sh
>> @@ -107,8 +107,27 @@ EOF
>>    	STRATEGY=$1
>>  +	test_expect_success "$STRATEGY diff from attributes" '
>> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
>> +		test_must_fail git diff --no-index file1 file2 > output &&
>> +		test_cmp expect output
>> +	'
>> +
>>   	test_expect_success "$STRATEGY diff" '
>> -		test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
>> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>> +		test_cmp expect output
>> +	'
>> +
>> +	test_expect_success "$STRATEGY diff command line precedence before attributes" '
>> +		echo "file* diff-algorithm=meyers" >.gitattributes &&
>> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>> +		test_cmp expect output
>> +	'
>> +
>> +	test_expect_success "$STRATEGY diff attributes precedence before config" '
>> +		git config diff.algorithm default &&
>> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
>> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>
> This test passes with or without the code changes in this patch. I think you need to drop --diff-algorithm=$STRATEGY from the diff command.

an oversight indeed, thanks for catching this.

>
>>   		test_cmp expect output
>>   	'
>>  @@ -166,5 +185,11 @@ EOF
>>   		test_must_fail git diff --no-index "--$STRATEGY" uniq1 uniq2 > output &&
>>   		test_cmp expect output
>>   	'
>> +
>> +	test_expect_success "$STRATEGY diff from attributes" '
>> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
>> +		test_must_fail git diff --no-index uniq1 uniq2 > output &&
>> +		test_cmp expect output
>
> This test also passes with or without the code changes in this patch. It  is the same as the first test added above but with files that give the same diff irrespective of the algorithm chosen so I don't think it is doing anything useful. Unless I've missed something it should be dropped.

I should have been more thorough with this one as well, thanks.

>
> Best Wishes
>
> Phillip
>
>> +	'
>>   }
>>

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-06 16:39   ` Ævar Arnfjörð Bjarmason
@ 2023-02-06 20:37     ` John Cai
  2023-02-07 14:55       ` Phillip Wood
  2023-02-09  8:44       ` Elijah Newren
  2023-02-09  7:50     ` Elijah Newren
  1 sibling, 2 replies; 78+ messages in thread
From: John Cai @ 2023-02-06 20:37 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason; +Cc: John Cai via GitGitGadget, git

Hi Ævar,

On 6 Feb 2023, at 11:39, Ævar Arnfjörð Bjarmason wrote:

> On Sun, Feb 05 2023, John Cai via GitGitGadget wrote:
>
>> From: John Cai <johncai86@gmail.com>
>> [...]
>> +
>> +		if (!o->xdl_opts_command_line) {
>> +			static struct attr_check *check;
>> +			const char *one_diff_algo;
>> +			const char *two_diff_algo;
>> +
>> +			check = attr_check_alloc();
>> +			attr_check_append(check, git_attr("diff-algorithm"));
>> +
>> +			git_check_attr(the_repository->index, NULL, one->path, check);
>> +			one_diff_algo = check->items[0].value;
>> +			git_check_attr(the_repository->index, NULL, two->path, check);
>> +			two_diff_algo = check->items[0].value;
>> +
>> +			if (!ATTR_UNSET(one_diff_algo) && !ATTR_UNSET(two_diff_algo) &&
>> +				!strcmp(one_diff_algo, two_diff_algo))
>> +				set_diff_algorithm(o, one_diff_algo);
>> +
>> +			attr_check_free(check);
>
> This is a bit nitpicky, but I for one would find this much easier to
> read with some shorter variables, here just with "a" rather than
> "one_diff_algo", "b" instead of "two_diff_algo", and splitting
> "the_repository->index" into "istate" (untested):
> 	
> 	+		if (!o->xdl_opts_command_line) {
> 	+			static struct attr_check *check;
> 	+			const char *a;
> 	+			const char *b;
> 	+			struct index_state *istate = the_repository->index;
> 	+
> 	+			check = attr_check_alloc();
> 	+			attr_check_append(check, git_attr("diff-algorithm"));
> 	+
> 	+			git_check_attr(istate, NULL, one->path, check);
> 	+			a = check->items[0].value;
> 	+			git_check_attr(istate, NULL, two->path, check);
> 	+			b = check->items[0].value;
> 	+
> 	+			if (!ATTR_UNSET(a) && !ATTR_UNSET(b) && !strcmp(a, b))
> 	+				set_diff_algorithm(o, a);
> 	+
> 	+			attr_check_free(check);
> 	+		}
>
> That also nicely keeps the line length shorter.

Thanks, I think this does look better.

>
>> @@ -333,6 +333,8 @@ struct diff_options {
>>  	int prefix_length;
>>  	const char *stat_sep;
>>  	int xdl_opts;
>> +	/* If xdl_opts has been set via the command line. */
>> +	int xdl_opts_command_line;
>>
>>  	/* see Documentation/diff-options.txt */
>>  	char **anchors;
>> diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
>> index 8d1e408bb58..630c98ea65a 100644
>> --- a/t/lib-diff-alternative.sh
>> +++ b/t/lib-diff-alternative.sh
>> @@ -107,8 +107,27 @@ EOF
>>
>>  	STRATEGY=$1
>>
>> +	test_expect_success "$STRATEGY diff from attributes" '
>> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
>> +		test_must_fail git diff --no-index file1 file2 > output &&
>> +		test_cmp expect output
>> +	'
>> +
>>  	test_expect_success "$STRATEGY diff" '
>> -		test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
>> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>
> Nit: The usual style is ">output", not "> output".

noted!

>
>> +		test_cmp expect output
>> +	'
>> +
>> +	test_expect_success "$STRATEGY diff command line precedence before attributes" '
>> +		echo "file* diff-algorithm=meyers" >.gitattributes &&
>> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>> +		test_cmp expect output
>> +	'
>> +
>> +	test_expect_success "$STRATEGY diff attributes precedence before config" '
>> +		git config diff.algorithm default &&
>> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
>> +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>>  		test_cmp expect output
>>  	'
>>
>> @@ -166,5 +185,11 @@ EOF
>>  		test_must_fail git diff --no-index "--$STRATEGY" uniq1 uniq2 > output &&
>>  		test_cmp expect output
>>  	'
>> +
>> +	test_expect_success "$STRATEGY diff from attributes" '
>> +		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
>> +		test_must_fail git diff --no-index uniq1 uniq2 > output &&
>> +		test_cmp expect output
>> +	'
>>  }
>
> For some non-nitpicking, I do worry about exposing this as a DoS vector,
> e.g. here's a diff between two distant points in git.git with the
> various algorithms:
>
> 	$ hyperfine -r 1 -L a patience,minimal,histogram,myers 'git diff --diff-algorithm={a} v2.0.0 v2.28.0'
> 	Benchmark 1: git diff --diff-algorithm=patience v2.0.0 v2.28.0
> 	  Time (abs ≡):        42.121 s               [User: 41.879 s, System: 0.144 s]
> 	
> 	Benchmark 2: git diff --diff-algorithm=minimal v2.0.0 v2.28.0
> 	  Time (abs ≡):        35.634 s               [User: 35.473 s, System: 0.160 s]
> 	
> 	Benchmark 3: git diff --diff-algorithm=histogram v2.0.0 v2.28.0
> 	  Time (abs ≡):        46.912 s               [User: 46.657 s, System: 0.228 s]
> 	
> 	Benchmark 4: git diff --diff-algorithm=myers v2.0.0 v2.28.0
> 	  Time (abs ≡):        33.233 s               [User: 33.072 s, System: 0.160 s]
> 	
> 	Summary
> 	  'git diff --diff-algorithm=myers v2.0.0 v2.28.0' ran
> 	    1.07 times faster than 'git diff --diff-algorithm=minimal v2.0.0 v2.28.0'
> 	    1.27 times faster than 'git diff --diff-algorithm=patience v2.0.0 v2.28.0'
> 	    1.41 times faster than 'git diff --diff-algorithm=histogram v2.0.0 v2.28.0'

Thanks for this analysis. To clarify, .gitconfig's diff.algorithm setting is
already an attack vector right? I see how this would be adding another one.

That being said, here's a separate issue. I benchmarked the usage of
.gitattributes as introduced in this patch series, and indeed it does look like
there is additional latency:

$ echo "* diff-algorithm=patience >> .gitattributes
$ hyperfine -r 5 'git-bin-wrapper diff --diff-algorithm=patience v2.0.0 v2.28.0'                      ✭
Benchmark 1: git-bin-wrapper diff --diff-algorithm=patience v2.0.0 v2.28.0
  Time (mean ± σ):     889.4 ms ± 113.8 ms    [User: 715.7 ms, System: 65.3 ms]
  Range (min … max):   764.1 ms … 1029.3 ms    5 runs

$ hyperfine -r 5 'git-bin-wrapper diff v2.0.0 v2.28.0'                                                ✭
Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0
  Time (mean ± σ):      2.146 s ±  0.368 s    [User: 0.827 s, System: 0.243 s]
  Range (min … max):    1.883 s …  2.795 s    5 runs

and I imagine the latency scales with the size of .gitattributes. Although I'm
not familiar with other parts of the codebase and how it deals with the latency
introduced by reading attributes files.

>
> Now, all of those are very slow overall, but some much more than
> others. I seem to recall that the non-default ones also had some
> pathological cases.
>
> Another thing to think about is that we've so far considered the diff
> algorithm to be purely about presentation, with some notable exceptions
> such as "patch-id".
>
> I've advocated for us getting to the point of having an in-repo
> .gitconfig or .gitattributes before with a whitelist of settings like
> diff.context for certain paths, or a diff.orderFile.
>
> But those seem easy to promise future behavior for, v.s. an entire diff
> algorithm (which we of course had before, but now we'd have it in
> repository data).
>
> Maybe that's not a distinction worth worrying about, just putting that
> out there.
>
> I think if others are concerned about the above something that would
> neatly side-step those is to have it opt-in via the .git/config somehow,
> similar to e.g. how you can commit *.gpg content, put this in
> .gitattributes:
>
> 	*.gpg diff=gpg
>
> But not have it do anything until this is in the repo's .git/config (or
> similar):
>
> 	[diff "gpg"]
>         	textconv = gpg --no-tty --decrypt
>
> For that you could still keep the exact .gitattributes format you have
> here, i.e.:
>
> 	file* diff-algorithm=$STRATEGY
>
> But we to pick it up we'd need either:
>
> 	[diff-algorithm]
>         	histogram = myers
>
> Or:
>
> 	[diff-algorithm "histogram"]
>         	allow = true

This would help address slowness from the diff algorithm itself. I'm not opposed
to adding this config if this attack vector is concerning to people.

However, it wouldn't help address the additional latency of scanning
.gitattributes to find the diff algorithm.

Would a separate config to allow gitattributes be helpful here?

[diff-algorithm]
	attributes = true

>
> The former form being one that would allow you to map the .gitattributes
> of the repo (but maybe that would be redundant to
> .git/info/attributes)...

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-06 20:37     ` John Cai
@ 2023-02-07 14:55       ` Phillip Wood
  2023-02-07 17:00         ` John Cai
  2023-02-07 17:27         ` Ævar Arnfjörð Bjarmason
  2023-02-09  8:44       ` Elijah Newren
  1 sibling, 2 replies; 78+ messages in thread
From: Phillip Wood @ 2023-02-07 14:55 UTC (permalink / raw)
  To: John Cai, Ævar Arnfjörð Bjarmason
  Cc: John Cai via GitGitGadget, git, Patrick Steinhardt

Hi John

On 06/02/2023 20:37, John Cai wrote:
> On 6 Feb 2023, at 11:39, Ævar Arnfjörð Bjarmason wrote:
>> On Sun, Feb 05 2023, John Cai via GitGitGadget wrote:
>> For some non-nitpicking, I do worry about exposing this as a DoS vector,
>> e.g. here's a diff between two distant points in git.git with the
>> various algorithms:
>>
>> 	$ hyperfine -r 1 -L a patience,minimal,histogram,myers 'git diff --diff-algorithm={a} v2.0.0 v2.28.0'
>> 	Benchmark 1: git diff --diff-algorithm=patience v2.0.0 v2.28.0
>> 	  Time (abs ≡):        42.121 s               [User: 41.879 s, System: 0.144 s]
>> 	
>> 	Benchmark 2: git diff --diff-algorithm=minimal v2.0.0 v2.28.0
>> 	  Time (abs ≡):        35.634 s               [User: 35.473 s, System: 0.160 s]
>> 	
>> 	Benchmark 3: git diff --diff-algorithm=histogram v2.0.0 v2.28.0
>> 	  Time (abs ≡):        46.912 s               [User: 46.657 s, System: 0.228 s]
>> 	
>> 	Benchmark 4: git diff --diff-algorithm=myers v2.0.0 v2.28.0
>> 	  Time (abs ≡):        33.233 s               [User: 33.072 s, System: 0.160 s]
>> 	
>> 	Summary
>> 	  'git diff --diff-algorithm=myers v2.0.0 v2.28.0' ran
>> 	    1.07 times faster than 'git diff --diff-algorithm=minimal v2.0.0 v2.28.0'
>> 	    1.27 times faster than 'git diff --diff-algorithm=patience v2.0.0 v2.28.0'
>> 	    1.41 times faster than 'git diff --diff-algorithm=histogram v2.0.0 v2.28.0'
> 
> Thanks for this analysis. To clarify, .gitconfig's diff.algorithm setting is
> already an attack vector right?

.gitconfig is under the user's control though whereas .gitattributes is 
attacker controlled if one clones a malicious repository. Having said 
the worst results above are for the historgram algorithm that merge-ort 
uses internally and no one has complained about ort's performance.

> I see how this would be adding another one.
> 
> That being said, here's a separate issue. I benchmarked the usage of
> .gitattributes as introduced in this patch series, and indeed it does look like
> there is additional latency:
> 
> $ echo "* diff-algorithm=patience >> .gitattributes
> $ hyperfine -r 5 'git-bin-wrapper diff --diff-algorithm=patience v2.0.0 v2.28.0'                      ✭
> Benchmark 1: git-bin-wrapper diff --diff-algorithm=patience v2.0.0 v2.28.0
>    Time (mean ± σ):     889.4 ms ± 113.8 ms    [User: 715.7 ms, System: 65.3 ms]
>    Range (min … max):   764.1 ms … 1029.3 ms    5 runs
> 
> $ hyperfine -r 5 'git-bin-wrapper diff v2.0.0 v2.28.0'                                                ✭
> Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0
>    Time (mean ± σ):      2.146 s ±  0.368 s    [User: 0.827 s, System: 0.243 s]
>    Range (min … max):    1.883 s …  2.795 s    5 runs
> 
> and I imagine the latency scales with the size of .gitattributes. Although I'm
> not familiar with other parts of the codebase and how it deals with the latency
> introduced by reading attributes files.

Ouch! Thanks for benchmarking that is a suspiciously large slow down. 
I've a feeling the attributes code parses all the .gitattribute files 
from scratch for each path that's queried, so there may be scope for 
making it a bit smarter. I see some slow down but no where near as much


$ hyperfine -r 3 'bin-wrappers/git diff --diff-algorithm=patience 
--no-color --no-color-moved v2.0.0 v2.28.0'
Benchmark 1: bin-wrappers/git diff --diff-algorithm=patience --no-color 
--no-color-moved v2.0.0 v2.28.0
   Time (mean ± σ):      1.996 s ±  0.008 s    [User: 1.706 s, System: 
0.286 s]
   Range (min … max):    1.989 s …  2.004 s    3 runs

$ hyperfine -r 5 'bin-wrappers/git diff --no-color --no-color-moved 
v2.0.0 v2.28.0'
Benchmark 1: bin-wrappers/git diff --no-color --no-color-moved v2.0.0 
v2.28.0
   Time (mean ± σ):      2.238 s ±  0.037 s    [User: 1.880 s, System: 
0.350 s]
   Range (min … max):    2.216 s …  2.303 s    5 runs


Perhaps I'm over simplifying but having read the issue you linked to I 
couldn't help feeling that the majority of users might be satisfied by 
just changing gitlab to use the patience algorithm when generating 
diffs. The idea to use .gitattributes seems to have come from Patrick 
rather than a user request.

This is slightly off topic but one thing I'd really like is a way to 
tell diff use automatically use --diff-words on some files (e.g. 
Documentation/*)

Best Wishes

Phillip

>> Now, all of those are very slow overall, but some much more than
>> others. I seem to recall that the non-default ones also had some
>> pathological cases.
>>
>> Another thing to think about is that we've so far considered the diff
>> algorithm to be purely about presentation, with some notable exceptions
>> such as "patch-id".
>>
>> I've advocated for us getting to the point of having an in-repo
>> .gitconfig or .gitattributes before with a whitelist of settings like
>> diff.context for certain paths, or a diff.orderFile.
>>
>> But those seem easy to promise future behavior for, v.s. an entire diff
>> algorithm (which we of course had before, but now we'd have it in
>> repository data).
>>
>> Maybe that's not a distinction worth worrying about, just putting that
>> out there.
>>
>> I think if others are concerned about the above something that would
>> neatly side-step those is to have it opt-in via the .git/config somehow,
>> similar to e.g. how you can commit *.gpg content, put this in
>> .gitattributes:
>>
>> 	*.gpg diff=gpg
>>
>> But not have it do anything until this is in the repo's .git/config (or
>> similar):
>>
>> 	[diff "gpg"]
>>          	textconv = gpg --no-tty --decrypt
>>
>> For that you could still keep the exact .gitattributes format you have
>> here, i.e.:
>>
>> 	file* diff-algorithm=$STRATEGY
>>
>> But we to pick it up we'd need either:
>>
>> 	[diff-algorithm]
>>          	histogram = myers
>>
>> Or:
>>
>> 	[diff-algorithm "histogram"]
>>          	allow = true
> 
> This would help address slowness from the diff algorithm itself. I'm not opposed
> to adding this config if this attack vector is concerning to people.
> 
> However, it wouldn't help address the additional latency of scanning
> .gitattributes to find the diff algorithm.
> 
> Would a separate config to allow gitattributes be helpful here?
> 
> [diff-algorithm]
> 	attributes = true
> 
>>
>> The former form being one that would allow you to map the .gitattributes
>> of the repo (but maybe that would be redundant to
>> .git/info/attributes)...

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-07 14:55       ` Phillip Wood
@ 2023-02-07 17:00         ` John Cai
  2023-02-09  9:09           ` Elijah Newren
  2023-02-07 17:27         ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 78+ messages in thread
From: John Cai @ 2023-02-07 17:00 UTC (permalink / raw)
  To: phillip.wood
  Cc: Ævar Arnfjörð Bjarmason, John Cai via GitGitGadget,
	git, Patrick Steinhardt

Hi Phillip,

On 7 Feb 2023, at 9:55, Phillip Wood wrote:

> Hi John
>
> On 06/02/2023 20:37, John Cai wrote:
>> On 6 Feb 2023, at 11:39, Ævar Arnfjörð Bjarmason wrote:
>>> On Sun, Feb 05 2023, John Cai via GitGitGadget wrote:
>>> For some non-nitpicking, I do worry about exposing this as a DoS vector,
>>> e.g. here's a diff between two distant points in git.git with the
>>> various algorithms:
>>>
>>> 	$ hyperfine -r 1 -L a patience,minimal,histogram,myers 'git diff --diff-algorithm={a} v2.0.0 v2.28.0'
>>> 	Benchmark 1: git diff --diff-algorithm=patience v2.0.0 v2.28.0
>>> 	  Time (abs ≡):        42.121 s               [User: 41.879 s, System: 0.144 s]
>>> 	
>>> 	Benchmark 2: git diff --diff-algorithm=minimal v2.0.0 v2.28.0
>>> 	  Time (abs ≡):        35.634 s               [User: 35.473 s, System: 0.160 s]
>>> 	
>>> 	Benchmark 3: git diff --diff-algorithm=histogram v2.0.0 v2.28.0
>>> 	  Time (abs ≡):        46.912 s               [User: 46.657 s, System: 0.228 s]
>>> 	
>>> 	Benchmark 4: git diff --diff-algorithm=myers v2.0.0 v2.28.0
>>> 	  Time (abs ≡):        33.233 s               [User: 33.072 s, System: 0.160 s]
>>> 	
>>> 	Summary
>>> 	  'git diff --diff-algorithm=myers v2.0.0 v2.28.0' ran
>>> 	    1.07 times faster than 'git diff --diff-algorithm=minimal v2.0.0 v2.28.0'
>>> 	    1.27 times faster than 'git diff --diff-algorithm=patience v2.0.0 v2.28.0'
>>> 	    1.41 times faster than 'git diff --diff-algorithm=histogram v2.0.0 v2.28.0'
>>
>> Thanks for this analysis. To clarify, .gitconfig's diff.algorithm setting is
>> already an attack vector right?
>
> .gitconfig is under the user's control though whereas .gitattributes is attacker controlled if one clones a malicious repository. Having said the worst results above are for the historgram algorithm that merge-ort uses internally and no one has complained about ort's performance.
>
>> I see how this would be adding another one.
>>
>> That being said, here's a separate issue. I benchmarked the usage of
>> .gitattributes as introduced in this patch series, and indeed it does look like
>> there is additional latency:
>>
>> $ echo "* diff-algorithm=patience >> .gitattributes
>> $ hyperfine -r 5 'git-bin-wrapper diff --diff-algorithm=patience v2.0.0 v2.28.0'                      ✭
>> Benchmark 1: git-bin-wrapper diff --diff-algorithm=patience v2.0.0 v2.28.0
>>    Time (mean ± σ):     889.4 ms ± 113.8 ms    [User: 715.7 ms, System: 65.3 ms]
>>    Range (min … max):   764.1 ms … 1029.3 ms    5 runs
>>
>> $ hyperfine -r 5 'git-bin-wrapper diff v2.0.0 v2.28.0'                                                ✭
>> Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0
>>    Time (mean ± σ):      2.146 s ±  0.368 s    [User: 0.827 s, System: 0.243 s]
>>    Range (min … max):    1.883 s …  2.795 s    5 runs
>>
>> and I imagine the latency scales with the size of .gitattributes. Although I'm
>> not familiar with other parts of the codebase and how it deals with the latency
>> introduced by reading attributes files.
>
> Ouch! Thanks for benchmarking that is a suspiciously large slow down. I've a feeling the attributes code parses all the .gitattribute files from scratch for each path that's queried, so there may be scope for making it a bit smarter. I see some slow down but no where near as much

Yes, definitely worth looking into how this can be sped up.

>
>
> $ hyperfine -r 3 'bin-wrappers/git diff --diff-algorithm=patience --no-color --no-color-moved v2.0.0 v2.28.0'
> Benchmark 1: bin-wrappers/git diff --diff-algorithm=patience --no-color --no-color-moved v2.0.0 v2.28.0
>   Time (mean ± σ):      1.996 s ±  0.008 s    [User: 1.706 s, System: 0.286 s]
>   Range (min … max):    1.989 s …  2.004 s    3 runs
>
> $ hyperfine -r 5 'bin-wrappers/git diff --no-color --no-color-moved v2.0.0 v2.28.0'
> Benchmark 1: bin-wrappers/git diff --no-color --no-color-moved v2.0.0 v2.28.0
>   Time (mean ± σ):      2.238 s ±  0.037 s    [User: 1.880 s, System: 0.350 s]
>   Range (min … max):    2.216 s …  2.303 s    5 runs
>
>
> Perhaps I'm over simplifying but having read the issue you linked to I couldn't help feeling that the majority of users might be satisfied by just changing gitlab to use the patience algorithm when generating diffs.

Right, I recognize this is a judgment call that may be best left up to the list.

We don't have a way in GitLab to change the diff algorithm currently. Of course
that can be implemented outside of Git, but having it as part of Git may have a
wider benefit for users who would appreciate the convenience of automatically
having certain files use one diff algorithm and other files use another, without
having to pass in the diff algorithm through the command line each time.

>
> The idea to use .gitattributes seems to have come from Patrick rather than a user request.
>
> This is slightly off topic but one thing I'd really like is a way to tell diff use automatically use --diff-words on some files (e.g. Documentation/*)

I think it's a bit similar to the spirit of this desired feature.

thanks
John

>
> Best Wishes
>
> Phillip
>
>>> Now, all of those are very slow overall, but some much more than
>>> others. I seem to recall that the non-default ones also had some
>>> pathological cases.
>>>
>>> Another thing to think about is that we've so far considered the diff
>>> algorithm to be purely about presentation, with some notable exceptions
>>> such as "patch-id".
>>>
>>> I've advocated for us getting to the point of having an in-repo
>>> .gitconfig or .gitattributes before with a whitelist of settings like
>>> diff.context for certain paths, or a diff.orderFile.
>>>
>>> But those seem easy to promise future behavior for, v.s. an entire diff
>>> algorithm (which we of course had before, but now we'd have it in
>>> repository data).
>>>
>>> Maybe that's not a distinction worth worrying about, just putting that
>>> out there.
>>>
>>> I think if others are concerned about the above something that would
>>> neatly side-step those is to have it opt-in via the .git/config somehow,
>>> similar to e.g. how you can commit *.gpg content, put this in
>>> .gitattributes:
>>>
>>> 	*.gpg diff=gpg
>>>
>>> But not have it do anything until this is in the repo's .git/config (or
>>> similar):
>>>
>>> 	[diff "gpg"]
>>>          	textconv = gpg --no-tty --decrypt
>>>
>>> For that you could still keep the exact .gitattributes format you have
>>> here, i.e.:
>>>
>>> 	file* diff-algorithm=$STRATEGY
>>>
>>> But we to pick it up we'd need either:
>>>
>>> 	[diff-algorithm]
>>>          	histogram = myers
>>>
>>> Or:
>>>
>>> 	[diff-algorithm "histogram"]
>>>          	allow = true
>>
>> This would help address slowness from the diff algorithm itself. I'm not opposed
>> to adding this config if this attack vector is concerning to people.
>>
>> However, it wouldn't help address the additional latency of scanning
>> .gitattributes to find the diff algorithm.
>>
>> Would a separate config to allow gitattributes be helpful here?
>>
>> [diff-algorithm]
>> 	attributes = true
>>
>>>
>>> The former form being one that would allow you to map the .gitattributes
>>> of the repo (but maybe that would be redundant to
>>> .git/info/attributes)...

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-07 14:55       ` Phillip Wood
  2023-02-07 17:00         ` John Cai
@ 2023-02-07 17:27         ` Ævar Arnfjörð Bjarmason
  2023-02-15 14:47           ` Phillip Wood
  1 sibling, 1 reply; 78+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2023-02-07 17:27 UTC (permalink / raw)
  To: phillip.wood; +Cc: John Cai, John Cai via GitGitGadget, git, Patrick Steinhardt


On Tue, Feb 07 2023, Phillip Wood wrote:

> This is slightly off topic but one thing I'd really like is a way to
> tell diff use automatically use --diff-words on some files
> (e.g. Documentation/*)

Unlike changing the algorithm, -U options, diff.orderFile etc. doing
that would give you a diff that can't be applied with "git apply" or
other tools that expect a valid unified diff.

So I can imagine that it would be neat in some contexts, but such a
change would have much wider implications than options that tweak how a
valid unified diff looks, or is generated.

We'd need some way to mark a diff as "for ad-hoc viewing only".

But as it sounds like you want this for git.git the
Documentation/doc-diff script is much better than anything word-diff
could spew out, as it diffs the resulting generated docs.

I wonder (but haven't tried) whether you can't "diff" that using the
same method that can be used to diff binary files using a custom driver.

Hrm, except that in that case (with includes etc) there isn't really a
1=1 mapping between files within Documentation/ and generated docs (due
to includes etc.). But I suppose it could be used only for those files
that 1=1 correspond to the generated manpages.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-05  3:46 ` [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
                     ` (2 preceding siblings ...)
  2023-02-06 16:39   ` Ævar Arnfjörð Bjarmason
@ 2023-02-07 17:56   ` Jeff King
  2023-02-07 20:18     ` Ævar Arnfjörð Bjarmason
  2023-02-09 16:34     ` John Cai
  3 siblings, 2 replies; 78+ messages in thread
From: Jeff King @ 2023-02-07 17:56 UTC (permalink / raw)
  To: John Cai via GitGitGadget
  Cc: Ævar Arnfjörð Bjarmason, git, John Cai

On Sun, Feb 05, 2023 at 03:46:21AM +0000, John Cai via GitGitGadget wrote:

> +`diff-algorithm`
> +^^^^^^^^^^^^^^^^
> +
> +The attribute `diff-algorithm` affects which algorithm Git uses to generate
> +diffs. This allows defining diff algorithms per file extension. Precedence rules
> +are as follows, in order from highest to lowest:
> +
> +*Command line option*
> +
> +Pass in the `--diff-algorithm` command line option int git-diff(1)
> +
> +*Git attributes*
> +
> +------------------------
> +*.json	diff-algorithm=histogram
> +------------------------
> +
> +*Git config*
> +
> +----------------------------------------------------------------
> +[diff]
> +	algorithm = histogram
> +----------------------------------------------------------------

From the user's perspective, this is weirdly inconsistent with the
existing diff attributes, which would be more like:

  # in .gitattributes
  *.json diff=json 

  # in config
  [diff "json"]
  algorithm = histogram

I know why one might choose the scheme you did; it kicks in if the repo
sets the algorithm, without users having to set up any extra config.
Which is sort of nice, if we assume that malicious actors don't have any
incentive to pick the algorithm. In theory they don't, though I saw Ævar
mention possible DoS elsewhere in the thread.

  Side note: It's also possible that algorithm selection could be
  required to trigger a separate security bug (say, a buffer overflow in
  the patience code or something), so restricting that works in a
  belt-and-suspenders way. But that somehow feels like like the wrong
  side of the paranoia-vs-feature line.

So I dunno. I recognize that this scheme fulfills your immediate needs
better, but I fear that we'll be stuck with a weird split between "diff"
and "diff-*" attributes forever. In the long run, having a way for the
repo to say "and here is some config I recommend to you" would give you
the best of both, but that is a challenging topic that has been
discussed and punted on for many years.

-Peff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-07 17:56   ` Jeff King
@ 2023-02-07 20:18     ` Ævar Arnfjörð Bjarmason
  2023-02-07 20:47       ` Junio C Hamano
  2023-02-09 16:34     ` John Cai
  1 sibling, 1 reply; 78+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2023-02-07 20:18 UTC (permalink / raw)
  To: Jeff King; +Cc: John Cai via GitGitGadget, git, John Cai


On Tue, Feb 07 2023, Jeff King wrote:

> On Sun, Feb 05, 2023 at 03:46:21AM +0000, John Cai via GitGitGadget wrote:

> From the user's perspective, this is weirdly inconsistent with the
> existing diff attributes, which would be more like:
>
>   # in .gitattributes
>   *.json diff=json 
>
>   # in config
>   [diff "json"]
>   algorithm = histogram

That does look more elegant.

> I know why one might choose the scheme you did; it kicks in if the repo
> sets the algorithm, without users having to set up any extra config.
> Which is sort of nice, if we assume that malicious actors don't have any
> incentive to pick the algorithm. In theory they don't, though I saw Ævar
> mention possible DoS elsewhere in the thread.
>
>   Side note: It's also possible that algorithm selection could be
>   required to trigger a separate security bug (say, a buffer overflow in
>   the patience code or something), so restricting that works in a
>   belt-and-suspenders way. But that somehow feels like like the wrong
>   side of the paranoia-vs-feature line.
>
> So I dunno. I recognize that this scheme fulfills your immediate needs
> better, but I fear that we'll be stuck with a weird split between "diff"
> and "diff-*" attributes forever. In the long run, having a way for the
> repo to say "and here is some config I recommend to you" would give you
> the best of both, but that is a challenging topic that has been
> discussed and punted on for many years.

If (and I'm not taking a stance on whether this is the case here) we
think that a hypothetical .gitconfig in-repo combined with
.gitattributes would be more elegant for this or other cases, I don't
see why the path to some very limited version of that where we read
literally one whitelisted config variable from such a .gitconfig, and
ignore everything else, wouldn't be OK.

I.e. the more general in-repo .gitconfig discussion (of which there was
some discussion this past Git Merge in Chicago) includes potentially
much harder things

Like what to do with changing genreal in-repo config, if and how to
compare that against some local whitelist of what sort of config we
should trust (or none) etc. etc.

But if we agreed that we'd be willing to trust the remote with some
config-by-another-name unconditionally, as is being proposed here, we
could easily read that one thing from an in-repo .gitconfig, ignore
everything else, and then just document that interface as a
special-case.

We have several such "limited config" readers already, and the relevant
bits of the config parser already have to handle arbitrary "git config"
files in-tree, due to .gitmodules.

We even have special-snowflake config in the configspace already that
doesn't obey the usual rules: The trace2.* config is only read from the
--system config, as it has an inherent bootsrtapping problem in wanting
to log as early as possible.

The advantage of the limited in-repo .gitconfig would be that if we
think the interface was more elegant this would be a way to start small
in a way that would be future-proof as far as the permanent UX goes.

If there's interest the read_early_config(), read_very_early_config()
and gitmodules_config_oid() functions are good places to look.

The last one in particular could be generalized pretty easily to read a
limited in-tree .gitconfig.

Although plugging it into the "config order" might be tricky, I haven't
tried.

Even a minimal implementation of such a thing would probably want a "git
config --repository" (or whatever we call the flag) to go with
"--local", "--system" etc, to spew out something sensible from
"--show-origin" etc.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-07 20:18     ` Ævar Arnfjörð Bjarmason
@ 2023-02-07 20:47       ` Junio C Hamano
  2023-02-07 21:05         ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 78+ messages in thread
From: Junio C Hamano @ 2023-02-07 20:47 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Jeff King, John Cai via GitGitGadget, git, John Cai

Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

>> From the user's perspective, this is weirdly inconsistent with the
>> existing diff attributes, which would be more like:
>>
>>   # in .gitattributes
>>   *.json diff=json 
>>
>>   # in config
>>   [diff "json"]
>>   algorithm = histogram
>
> That does look more elegant.

We use attributes to define what it is, while configurations are
used to define what to do on different things.  The separation of
attributes and configuration came not from "elegance" or "security"
but from a lot more practical reasons.

For a tracked file, the fact that it contains JSON text as payload
does not change per user who cloned the project, or per platform the
user used to do so.  In-tree .gitattributes that the project
controls is a perfect way to define what it is for each file.

On the other hand, the diff program suitable to compare two JSON
files may vary per platform (your favorite Windows program may not
be available to me) and per user (a platform may support more than
one and the choice becomes the matter of personal taste).

The security aspect of giving users tighter control over which exact
programs are to be run by not allowing the attributes or so called
in-tree configuration mechansim is a small bonus that fell out as a
consequence.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-07 20:47       ` Junio C Hamano
@ 2023-02-07 21:05         ` Ævar Arnfjörð Bjarmason
  2023-02-07 21:28           ` Junio C Hamano
  0 siblings, 1 reply; 78+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2023-02-07 21:05 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Jeff King, John Cai via GitGitGadget, git, John Cai


On Tue, Feb 07 2023, Junio C Hamano wrote:

> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
>
>>> From the user's perspective, this is weirdly inconsistent with the
>>> existing diff attributes, which would be more like:
>>>
>>>   # in .gitattributes
>>>   *.json diff=json 
>>>
>>>   # in config
>>>   [diff "json"]
>>>   algorithm = histogram
>>
>> That does look more elegant.
>
> We use attributes to define what it is, while configurations are
> used to define what to do on different things.  The separation of
> attributes and configuration came not from "elegance" or "security"
> but from a lot more practical reasons.
>
> For a tracked file, the fact that it contains JSON text as payload
> does not change per user who cloned the project, or per platform the
> user used to do so.  In-tree .gitattributes that the project
> controls is a perfect way to define what it is for each file.
>
> On the other hand, the diff program suitable to compare two JSON
> files may vary per platform (your favorite Windows program may not
> be available to me) and per user (a platform may support more than
> one and the choice becomes the matter of personal taste).
>
> The security aspect of giving users tighter control over which exact
> programs are to be run by not allowing the attributes or so called
> in-tree configuration mechansim is a small bonus that fell out as a
> consequence.

To clarify, I'm not suggesting that we ever read arbitrary parts of the
"diff.<driver>.<key>" config space, but that we could whitelist one set
of "diff.<driver>.<known-key>"="<known-values>".

The reason to do it being that, as Jeff points out, that config
mechanism is already established, and arguably more elegant. I.e. that
this is a natural fit for "diff=<driver>" in .gitattributes), and that
mechanism is already tied to config as Jeff's example shows.

Some of your reply seems like it assumed that I was suggesting that we
read "diff.algorithm" (i.e. a config setting to apply for all paths)
from an in-repo .gitconfig.

I wasn't suggesting that, nor that we open Pandora's box on starting a
limited in-repo .gitconfig support with anything remotely to do with
executing arbitrary commands (which the full "diff.<driver>.<key>" space
does support).

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-07 21:05         ` Ævar Arnfjörð Bjarmason
@ 2023-02-07 21:28           ` Junio C Hamano
  2023-02-07 21:44             ` Ævar Arnfjörð Bjarmason
  0 siblings, 1 reply; 78+ messages in thread
From: Junio C Hamano @ 2023-02-07 21:28 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Jeff King, John Cai via GitGitGadget, git, John Cai

Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:

> To clarify, I'm not suggesting that we ever read arbitrary parts of the
> "diff.<driver>.<key>" config space, but that we could whitelist one set
> of "diff.<driver>.<known-key>"="<known-values>".

When the value names the path to an executable or the command line
to invoke a program, there is no "portable" value that is useful.
Whitelisting macOS only program only because its pathname is one of
the known values does not help me running something else.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-07 21:28           ` Junio C Hamano
@ 2023-02-07 21:44             ` Ævar Arnfjörð Bjarmason
  0 siblings, 0 replies; 78+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2023-02-07 21:44 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Jeff King, John Cai via GitGitGadget, git, John Cai


On Tue, Feb 07 2023, Junio C Hamano wrote:

> Ævar Arnfjörð Bjarmason <avarab@gmail.com> writes:
>
>> To clarify, I'm not suggesting that we ever read arbitrary parts of the
>> "diff.<driver>.<key>" config space, but that we could whitelist one set
>> of "diff.<driver>.<known-key>"="<known-values>".
>
> When the value names the path to an executable or the command line
> to invoke a program, there is no "portable" value that is useful.
> Whitelisting macOS only program only because its pathname is one of
> the known values does not help me running something else.

We're not talking about invoking an executable of any kind, but a
mechanism to pick up a "diff.algorithm" setting referring to one of our
built-in algorithms, either via the in-repo .gitattributes, as in John's
original proposal:

	*.json diff-algorithm=histogram

Or via some hybrid .gitattributes/.gitconfig mechanism, where (if I
understand Jeff's suggestion correctly) the two would contain:

	# In .gitattributes
	*.json diff=json 
	# In .gitconfig
	[diff "json"]
	algorithm = histogram

In terms of implementation this would be pretty much what we do with
"submodule.<name>.update", where "Allowed values here are checkout,
rebase, merge or none.".

I don't see where you're getting this suggestion of "[a] diff program
suitable to compare two JSON files" from something I said.

That is a thing we generally support, but I've only mentioned that
arbitrary command execution (e.g. in [1]) as something we explicitly
*don't* want to support in this context.

The "submodule.<name>.update" example is particularly relevant because
it also supports arbitrary command execution with "!"-prefixed values
(the rest being the arbitrary command).

But that's something we specifically exclude when reading the in-repo
version, it's only allowed in the user-controlled config.

I'm not saying that we should be going for the read-just-the-one-key
in-repo .gitconfig approach here, just *if* we like that interface
better the implementation should be relatively straightforward, assuming
that we like the proposal in principle, and are just bikeshedding about
what the mecanhism to enable it would be.

The advantage of that being that it more naturally slots into existing
config, e.g. "diff.<driver>.binary=<bool>", this being just another
"diff.<driver>.<known-key>".

And that the way to do that securely is something we're doing for
several "submodule.*" keys, so if we pick that approach tweaking or
stealing ideas from that code should be relatively straightforward.

1. https://lore.kernel.org/git/230206.865yce7n1w.gmgdl@evledraar.gmail.com/

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-06 16:39   ` Ævar Arnfjörð Bjarmason
  2023-02-06 20:37     ` John Cai
@ 2023-02-09  7:50     ` Elijah Newren
  2023-02-09  9:41       ` Ævar Arnfjörð Bjarmason
  1 sibling, 1 reply; 78+ messages in thread
From: Elijah Newren @ 2023-02-09  7:50 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: John Cai via GitGitGadget, git, John Cai

On Mon, Feb 6, 2023 at 9:24 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>
> On Sun, Feb 05 2023, John Cai via GitGitGadget wrote:
>
> > From: John Cai <johncai86@gmail.com>
> > [...]
> > +
> > +             if (!o->xdl_opts_command_line) {
> > +                     static struct attr_check *check;
> > +                     const char *one_diff_algo;
> > +                     const char *two_diff_algo;
> > +
> > +                     check = attr_check_alloc();
> > +                     attr_check_append(check, git_attr("diff-algorithm"));
> > +
> > +                     git_check_attr(the_repository->index, NULL, one->path, check);
> > +                     one_diff_algo = check->items[0].value;
> > +                     git_check_attr(the_repository->index, NULL, two->path, check);
> > +                     two_diff_algo = check->items[0].value;
> > +
> > +                     if (!ATTR_UNSET(one_diff_algo) && !ATTR_UNSET(two_diff_algo) &&
> > +                             !strcmp(one_diff_algo, two_diff_algo))
> > +                             set_diff_algorithm(o, one_diff_algo);
> > +
> > +                     attr_check_free(check);
>
> This is a bit nitpicky, but I for one would find this much easier to
> read with some shorter variables, here just with "a" rather than
> "one_diff_algo", "b" instead of "two_diff_algo", and splitting
> "the_repository->index" into "istate" (untested):
>
>         +               if (!o->xdl_opts_command_line) {
>         +                       static struct attr_check *check;
>         +                       const char *a;
>         +                       const char *b;
>         +                       struct index_state *istate = the_repository->index;
>         +
>         +                       check = attr_check_alloc();
>         +                       attr_check_append(check, git_attr("diff-algorithm"));
>         +
>         +                       git_check_attr(istate, NULL, one->path, check);
>         +                       a = check->items[0].value;
>         +                       git_check_attr(istate, NULL, two->path, check);
>         +                       b = check->items[0].value;
>         +
>         +                       if (!ATTR_UNSET(a) && !ATTR_UNSET(b) && !strcmp(a, b))
>         +                               set_diff_algorithm(o, a);
>         +
>         +                       attr_check_free(check);
>         +               }
>
> That also nicely keeps the line length shorter.
>
> > @@ -333,6 +333,8 @@ struct diff_options {
> >       int prefix_length;
> >       const char *stat_sep;
> >       int xdl_opts;
> > +     /* If xdl_opts has been set via the command line. */
> > +     int xdl_opts_command_line;
> >
> >       /* see Documentation/diff-options.txt */
> >       char **anchors;
> > diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
> > index 8d1e408bb58..630c98ea65a 100644
> > --- a/t/lib-diff-alternative.sh
> > +++ b/t/lib-diff-alternative.sh
> > @@ -107,8 +107,27 @@ EOF
> >
> >       STRATEGY=$1
> >
> > +     test_expect_success "$STRATEGY diff from attributes" '
> > +             echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
> > +             test_must_fail git diff --no-index file1 file2 > output &&
> > +             test_cmp expect output
> > +     '
> > +
> >       test_expect_success "$STRATEGY diff" '
> > -             test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
> > +             test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>
> Nit: The usual style is ">output", not "> output".
>
> > +             test_cmp expect output
> > +     '
> > +
> > +     test_expect_success "$STRATEGY diff command line precedence before attributes" '
> > +             echo "file* diff-algorithm=meyers" >.gitattributes &&
> > +             test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
> > +             test_cmp expect output
> > +     '
> > +
> > +     test_expect_success "$STRATEGY diff attributes precedence before config" '
> > +             git config diff.algorithm default &&
> > +             echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
> > +             test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
> >               test_cmp expect output
> >       '
> >
> > @@ -166,5 +185,11 @@ EOF
> >               test_must_fail git diff --no-index "--$STRATEGY" uniq1 uniq2 > output &&
> >               test_cmp expect output
> >       '
> > +
> > +     test_expect_success "$STRATEGY diff from attributes" '
> > +             echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
> > +             test_must_fail git diff --no-index uniq1 uniq2 > output &&
> > +             test_cmp expect output
> > +     '
> >  }
>
> For some non-nitpicking, I do worry about exposing this as a DoS vector,
> e.g. here's a diff between two distant points in git.git with the
> various algorithms:
>
>         $ hyperfine -r 1 -L a patience,minimal,histogram,myers 'git diff --diff-algorithm={a} v2.0.0 v2.28.0'
>         Benchmark 1: git diff --diff-algorithm=patience v2.0.0 v2.28.0
>           Time (abs ≡):        42.121 s               [User: 41.879 s, System: 0.144 s]
>
>         Benchmark 2: git diff --diff-algorithm=minimal v2.0.0 v2.28.0
>           Time (abs ≡):        35.634 s               [User: 35.473 s, System: 0.160 s]
>
>         Benchmark 3: git diff --diff-algorithm=histogram v2.0.0 v2.28.0
>           Time (abs ≡):        46.912 s               [User: 46.657 s, System: 0.228 s]
>
>         Benchmark 4: git diff --diff-algorithm=myers v2.0.0 v2.28.0
>           Time (abs ≡):        33.233 s               [User: 33.072 s, System: 0.160 s]
>
>         Summary
>           'git diff --diff-algorithm=myers v2.0.0 v2.28.0' ran
>             1.07 times faster than 'git diff --diff-algorithm=minimal v2.0.0 v2.28.0'
>             1.27 times faster than 'git diff --diff-algorithm=patience v2.0.0 v2.28.0'
>             1.41 times faster than 'git diff --diff-algorithm=histogram v2.0.0 v2.28.0'

I'm really surprised by these numbers; they aren't remotely close to
what I compute.  Am I correct in understanding you ran these in
git.git?  Was your computer overloaded?  Was your git.git in some
serious need of repacking?  Was it on a network filesystem?  If you
run with more than 1 run, are your numbers even repeatable?

Using git compiled from current main, I see:

$ hyperfine -L a patience,minimal,histogram,myers './git diff
--diff-algorithm={a} v2.0.0 v2.28.0'
Benchmark 1: ./git diff --diff-algorithm=patience v2.0.0 v2.28.0
  Time (mean ± σ):      1.142 s ±  0.033 s    [User: 1.022 s, System: 0.114 s]
  Range (min … max):    1.117 s …  1.212 s    10 runs

Benchmark 2: ./git diff --diff-algorithm=minimal v2.0.0 v2.28.0
  Time (mean ± σ):      1.959 s ±  0.011 s    [User: 1.830 s, System: 0.120 s]
  Range (min … max):    1.947 s …  1.976 s    10 runs

Benchmark 3: ./git diff --diff-algorithm=histogram v2.0.0 v2.28.0
  Time (mean ± σ):      1.187 s ±  0.007 s    [User: 1.065 s, System: 0.115 s]
  Range (min … max):    1.175 s …  1.200 s    10 runs

Benchmark 4: ./git diff --diff-algorithm=myers v2.0.0 v2.28.0
  Time (mean ± σ):      1.194 s ±  0.007 s    [User: 1.068 s, System: 0.120 s]
  Range (min … max):    1.184 s …  1.206 s    10 runs

Summary
  './git diff --diff-algorithm=patience v2.0.0 v2.28.0' ran
    1.04 ± 0.03 times faster than './git diff
--diff-algorithm=histogram v2.0.0 v2.28.0'
    1.05 ± 0.03 times faster than './git diff --diff-algorithm=myers
v2.0.0 v2.28.0'
    1.71 ± 0.05 times faster than './git diff --diff-algorithm=minimal
v2.0.0 v2.28.0'

And this is on a kind-of low-end refurbished laptop from a few years
ago (although the repo was recently gc'ed).

I'm biased towards histogram (and making it the default rather than
making it configurable per file), but that's probably obvious given
that I made ort use it unconditionally.  And when I made ort use it,
it was actually a minor performance penalty (~1% IIRC)[*], but I
thought it was worth it since (a) histogram diffs are more
understandable to users in general, (b) the histogram diff data
structures provide an idea for possibly solving some ugly corner cases
that I don't see a way to solve with the other diffs.

[*] Phillip came along and made histogram faster after my measurements
(663c5ad035 ("diff histogram: intern strings", 2021-11-17)), so that
may not be true anymore.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-06 19:50     ` John Cai
@ 2023-02-09  8:26       ` Elijah Newren
  2023-02-09 10:31         ` "bad" diffs (was: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm) Ævar Arnfjörð Bjarmason
  2023-02-09 16:37         ` [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai
  0 siblings, 2 replies; 78+ messages in thread
From: Elijah Newren @ 2023-02-09  8:26 UTC (permalink / raw)
  To: John Cai; +Cc: phillip.wood, John Cai via GitGitGadget, git

Hi John,

On Mon, Feb 6, 2023 at 12:02 PM John Cai <johncai86@gmail.com> wrote:
>
> Hi Phillip,
>
> On 6 Feb 2023, at 11:27, Phillip Wood wrote:
>
> > Hi John
> >
> > On 05/02/2023 03:46, John Cai via GitGitGadget wrote:
> >> From: John Cai <johncai86@gmail.com>
> >>
> >> It can be useful to specify diff algorithms per file type. For example,
> >> one may want to use the minimal diff algorithm for .json files, another
> >> for .c files, etc.
> >
> > Have you got any examples of why this is useful? I find myself occasionally changing the algorithm when the default gives a sub-optimal diff but I've not really noticed any pattern with respect to file types.
>
> At $DAYJOB, there has been a discussion and request for a feature like this [1].
> One use case that came up was to be able to set a different diff algorithm for
> .json files.
>
> 1. https://gitlab.com/gitlab-org/gitaly/-/issues/2591

A couple points:

First, there seems to be a misunderstanding in that issue.  In
particular, the merge algorithm does call into the xdiff library to do
the three-way content merge of individual files, and when it does so,
it has to specify the diff algorithm (or take the default, currently
myers).  merge-recursive allows the diff algorithm to be specified by
the user (there are
-Xdiff-algorithm={histogram,minimal,patience,myers} flags to
merge/rebase for it), while merge-ort uses histogram (though it uses
the same parser as merge-recursive and thus gets the variables set
from the -Xdiff-algorithm flag, it just ignores those values and
hardcodes histogram).

Second, I also think the user request got converted to a particular
solution without looking at the wider problem space:  The idea seemed
to assume "myers" is default for a good reason, and thus asked for an
option to use something else.  I'm not sure the assumption is valid; I
think "myers" is default for historical reasons and histogram is
better not just for special Salesforce xml files, but code files too.
The output makes more sense to users.  So much so that even though my
simple testing suggested it had a 2% performance penalty compared to
myers, I forced ort to use it[1] even though I designed  everything
else in that algorithm around eking out maximum performance.  Others
who have tested the diff algorithms have also found histogram has very
similar performance to myers, and oftentimes even beats it[2][3].
Also, worries about invalidating rerere caches[4] was real, but we
already paid that price when we switched to ort.  And if performance
is still a worry, [3] gives me reason to believe we can make our
histogram implementation faster.  Finally, for the period of time when
Palantir was letting me make an internal git distribution (mostly for
testing ort), I also carried a patch that changed the default diff
algorithm to histogram (not just for ort, but for diff/log/etc. as
well).  Never had any complaints from the users from it.  Perhaps you
could do the same in your local version of git used by gitaly?

[1] See c8017176ac ("merge-ort: use histogram diff", 2020-12-13)
[2] From 85551232b5 ("perf: compare diff algorithms", 2012-03-06):
"This does indeed show that histogram diff slightly beats Myers, while
patience is much slower than the others."
[3] https://github.com/pascalkuthe/imara-diff
[4] https://lore.kernel.org/git/20120307114714.GA14990@sigill.intra.peff.net/

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-06 20:37     ` John Cai
  2023-02-07 14:55       ` Phillip Wood
@ 2023-02-09  8:44       ` Elijah Newren
  2023-02-14 21:16         ` John Cai
  1 sibling, 1 reply; 78+ messages in thread
From: Elijah Newren @ 2023-02-09  8:44 UTC (permalink / raw)
  To: John Cai
  Cc: Ævar Arnfjörð Bjarmason, John Cai via GitGitGadget,
	git

Hi John,

On Mon, Feb 6, 2023 at 12:47 PM John Cai <johncai86@gmail.com> wrote:
>
[...]
> That being said, here's a separate issue. I benchmarked the usage of
> .gitattributes as introduced in this patch series, and indeed it does look like
> there is additional latency:
>
> $ echo "* diff-algorithm=patience >> .gitattributes
> $ hyperfine -r 5 'git-bin-wrapper diff --diff-algorithm=patience v2.0.0 v2.28.0'                      ✭
> Benchmark 1: git-bin-wrapper diff --diff-algorithm=patience v2.0.0 v2.28.0
>   Time (mean ± σ):     889.4 ms ± 113.8 ms    [User: 715.7 ms, System: 65.3 ms]
>   Range (min … max):   764.1 ms … 1029.3 ms    5 runs
>
> $ hyperfine -r 5 'git-bin-wrapper diff v2.0.0 v2.28.0'                                                ✭
> Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0
>   Time (mean ± σ):      2.146 s ±  0.368 s    [User: 0.827 s, System: 0.243 s]
>   Range (min … max):    1.883 s …  2.795 s    5 runs
>
> and I imagine the latency scales with the size of .gitattributes. Although I'm
> not familiar with other parts of the codebase and how it deals with the latency
> introduced by reading attributes files.

Yeah, that seems like a large relative performance penalty.  I had the
feeling that histogram wasn't made the default over myers mostly due
to inertia and due to a potential 2% loss in performance (since
potentially corrected by Phillip's 663c5ad035 ("diff histogram: intern
strings", 2021-11-17)).  If we had changed the default diff algorithm
to histogram, I suspect folks wouldn't have been asking for per-file
knobs to use a better diff algorithm.  And the performance penalty for
this alternative is clearly much larger than 2%, which makes me think
we might want to just revisit the default instead of allowing per-file
tweaks.

And on a separate note...

There's another set of considerations we might need to include here as
well that I haven't seen anyone else in this thread talk about:

* When trying to diff files, do we read the .gitattributes file from
the current checkout to determine the diff algorithm(s)?  Or the
index?  Or the commit we are diffing against?
* If we use the current checkout or index, what about bare clones or
diffing between two different commits?
* If diffing between two different commits, and the .gitattributes has
changed between those commits, which .gitattributes file wins?
* If diffing between two different commits, and the .gitattributes has
NOT changed, BUT a file has been renamed and the old and new names
have different rules, which rule wins?

* If per-file diff algorithms are adopted widely enough, will we be
forced to change the merge algorithm to also pay attention to them?
If it does, more complicated rename cases occur and we need rules for
how to handle those.
* If the merge algorithm has to pay attention to .gitattributes for
this too, we'll have even more corner cases around what happens if
there are merge conflicts in .gitattributes itself (which is already
kind of ugly and kludged)


Anyway, I know I'm a bit animated and biased in this area, and I
apologize if I'm a bit too much so.  Even if I am, hopefully my
comments at least provide some useful context.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-07 17:00         ` John Cai
@ 2023-02-09  9:09           ` Elijah Newren
  2023-02-09 14:44             ` Phillip Wood
  0 siblings, 1 reply; 78+ messages in thread
From: Elijah Newren @ 2023-02-09  9:09 UTC (permalink / raw)
  To: John Cai
  Cc: phillip.wood, Ævar Arnfjörð Bjarmason,
	John Cai via GitGitGadget, git, Patrick Steinhardt

Hi John and Phillip,

On Tue, Feb 7, 2023 at 9:05 AM John Cai <johncai86@gmail.com> wrote:
>
[...]
> > Perhaps I'm over simplifying but having read the issue you linked to I couldn't help feeling that the majority of users might be satisfied by just changing gitlab to use the patience algorithm when generating diffs.
>
> Right, I recognize this is a judgment call that may be best left up to the list.
>
> We don't have a way in GitLab to change the diff algorithm currently. Of course
> that can be implemented outside of Git,

Well, the below doesn't allow users to make diffs better for
*individual* files of interest, but if you agree with me that we
should just make diffs better for all users automatically, it's a
two-line change in git.git that I'd love to eventually convince the
project to take (though obviously doing that would also require some
documentation changes and some good messaging in release notes and
whatnot).  I've used it for a good long while, and had a few dozen
users using this patch too, all without complaint:

diff --git a/diff.c b/diff.c
index 329eebf16a..77a46d5b7d 100644
--- a/diff.c
+++ b/diff.c
@@ -55,7 +55,7 @@ static int diff_relative;
 static int diff_stat_graph_width;
 static int diff_dirstat_permille_default = 30;
 static struct diff_options default_diff_options;
-static long diff_algorithm;
+static long diff_algorithm = XDF_HISTOGRAM_DIFF;
 static unsigned ws_error_highlight_default = WSEH_NEW;

 static char diff_colors[][COLOR_MAXLEN] = {
diff --git a/t/t4015-diff-whitespace.sh b/t/t4015-diff-whitespace.sh
index b298f220e0..2f663eab72 100755
--- a/t/t4015-diff-whitespace.sh
+++ b/t/t4015-diff-whitespace.sh
@@ -1549,7 +1549,7 @@ test_expect_success 'short lines of opposite
sign do not get marked as moved' '
        this line should be marked as oldMoved newMoved
        unchanged 4
        EOF
-       test_expect_code 1 git diff --no-index --color --color-moved=zebra \
+       test_expect_code 1 git diff --diff-algorithm=myers --no-index
--color --color-moved=zebra \
                old.txt new.txt >output && cat output &&
        grep -v index output | test_decode_color >actual &&
        cat >expect <<-\EOF &&


I used histogram above rather than patience, since (a) it's what git's
merge backend uses, (b) it produces roughly similar results to
patience from a user perspective, (c) past testing has shown it to be
somewhat faster than patience, and (d) we've potentially got some
leads in how to speed up our histogram implementation from the README
over at https://github.com/pascalkuthe/imara-diff.  But, if you really
wanted to use patience as the default, it'd also be an easy tweak.

Anyway, just some food for thought.

^ permalink raw reply related	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-09  7:50     ` Elijah Newren
@ 2023-02-09  9:41       ` Ævar Arnfjörð Bjarmason
  2023-02-11  2:04         ` Jeff King
  0 siblings, 1 reply; 78+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2023-02-09  9:41 UTC (permalink / raw)
  To: Elijah Newren; +Cc: John Cai via GitGitGadget, git, John Cai


On Wed, Feb 08 2023, Elijah Newren wrote:

> On Mon, Feb 6, 2023 at 9:24 AM Ævar Arnfjörð Bjarmason <avarab@gmail.com> wrote:
>>
>> On Sun, Feb 05 2023, John Cai via GitGitGadget wrote:
>>
>> > From: John Cai <johncai86@gmail.com>
>> > [...]
>> > +
>> > +             if (!o->xdl_opts_command_line) {
>> > +                     static struct attr_check *check;
>> > +                     const char *one_diff_algo;
>> > +                     const char *two_diff_algo;
>> > +
>> > +                     check = attr_check_alloc();
>> > +                     attr_check_append(check, git_attr("diff-algorithm"));
>> > +
>> > +                     git_check_attr(the_repository->index, NULL, one->path, check);
>> > +                     one_diff_algo = check->items[0].value;
>> > +                     git_check_attr(the_repository->index, NULL, two->path, check);
>> > +                     two_diff_algo = check->items[0].value;
>> > +
>> > +                     if (!ATTR_UNSET(one_diff_algo) && !ATTR_UNSET(two_diff_algo) &&
>> > +                             !strcmp(one_diff_algo, two_diff_algo))
>> > +                             set_diff_algorithm(o, one_diff_algo);
>> > +
>> > +                     attr_check_free(check);
>>
>> This is a bit nitpicky, but I for one would find this much easier to
>> read with some shorter variables, here just with "a" rather than
>> "one_diff_algo", "b" instead of "two_diff_algo", and splitting
>> "the_repository->index" into "istate" (untested):
>>
>>         +               if (!o->xdl_opts_command_line) {
>>         +                       static struct attr_check *check;
>>         +                       const char *a;
>>         +                       const char *b;
>>         +                       struct index_state *istate = the_repository->index;
>>         +
>>         +                       check = attr_check_alloc();
>>         +                       attr_check_append(check, git_attr("diff-algorithm"));
>>         +
>>         +                       git_check_attr(istate, NULL, one->path, check);
>>         +                       a = check->items[0].value;
>>         +                       git_check_attr(istate, NULL, two->path, check);
>>         +                       b = check->items[0].value;
>>         +
>>         +                       if (!ATTR_UNSET(a) && !ATTR_UNSET(b) && !strcmp(a, b))
>>         +                               set_diff_algorithm(o, a);
>>         +
>>         +                       attr_check_free(check);
>>         +               }
>>
>> That also nicely keeps the line length shorter.
>>
>> > @@ -333,6 +333,8 @@ struct diff_options {
>> >       int prefix_length;
>> >       const char *stat_sep;
>> >       int xdl_opts;
>> > +     /* If xdl_opts has been set via the command line. */
>> > +     int xdl_opts_command_line;
>> >
>> >       /* see Documentation/diff-options.txt */
>> >       char **anchors;
>> > diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
>> > index 8d1e408bb58..630c98ea65a 100644
>> > --- a/t/lib-diff-alternative.sh
>> > +++ b/t/lib-diff-alternative.sh
>> > @@ -107,8 +107,27 @@ EOF
>> >
>> >       STRATEGY=$1
>> >
>> > +     test_expect_success "$STRATEGY diff from attributes" '
>> > +             echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
>> > +             test_must_fail git diff --no-index file1 file2 > output &&
>> > +             test_cmp expect output
>> > +     '
>> > +
>> >       test_expect_success "$STRATEGY diff" '
>> > -             test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
>> > +             test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>>
>> Nit: The usual style is ">output", not "> output".
>>
>> > +             test_cmp expect output
>> > +     '
>> > +
>> > +     test_expect_success "$STRATEGY diff command line precedence before attributes" '
>> > +             echo "file* diff-algorithm=meyers" >.gitattributes &&
>> > +             test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>> > +             test_cmp expect output
>> > +     '
>> > +
>> > +     test_expect_success "$STRATEGY diff attributes precedence before config" '
>> > +             git config diff.algorithm default &&
>> > +             echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
>> > +             test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>> >               test_cmp expect output
>> >       '
>> >
>> > @@ -166,5 +185,11 @@ EOF
>> >               test_must_fail git diff --no-index "--$STRATEGY" uniq1 uniq2 > output &&
>> >               test_cmp expect output
>> >       '
>> > +
>> > +     test_expect_success "$STRATEGY diff from attributes" '
>> > +             echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
>> > +             test_must_fail git diff --no-index uniq1 uniq2 > output &&
>> > +             test_cmp expect output
>> > +     '
>> >  }
>>
>> For some non-nitpicking, I do worry about exposing this as a DoS vector,
>> e.g. here's a diff between two distant points in git.git with the
>> various algorithms:
>>
>>         $ hyperfine -r 1 -L a patience,minimal,histogram,myers 'git diff --diff-algorithm={a} v2.0.0 v2.28.0'
>>         Benchmark 1: git diff --diff-algorithm=patience v2.0.0 v2.28.0
>>           Time (abs ≡):        42.121 s               [User: 41.879 s, System: 0.144 s]
>>
>>         Benchmark 2: git diff --diff-algorithm=minimal v2.0.0 v2.28.0
>>           Time (abs ≡):        35.634 s               [User: 35.473 s, System: 0.160 s]
>>
>>         Benchmark 3: git diff --diff-algorithm=histogram v2.0.0 v2.28.0
>>           Time (abs ≡):        46.912 s               [User: 46.657 s, System: 0.228 s]
>>
>>         Benchmark 4: git diff --diff-algorithm=myers v2.0.0 v2.28.0
>>           Time (abs ≡):        33.233 s               [User: 33.072 s, System: 0.160 s]
>>
>>         Summary
>>           'git diff --diff-algorithm=myers v2.0.0 v2.28.0' ran
>>             1.07 times faster than 'git diff --diff-algorithm=minimal v2.0.0 v2.28.0'
>>             1.27 times faster than 'git diff --diff-algorithm=patience v2.0.0 v2.28.0'
>>             1.41 times faster than 'git diff --diff-algorithm=histogram v2.0.0 v2.28.0'

[snip around]

> If you run with more than 1 run, are your numbers even repeatable?

Yes, but tl;dr it's diff.colorMoved=true, sorry, see below.

> I'm really surprised by these numbers; they aren't remotely close to
> what I compute.  Am I correct in understanding you ran these in
> git.git?  Was your computer overloaded?  Was your git.git in some
> serious need of repacking?  Was it on a network filesystem?  

Just on the box I regularly hack on, which isn't too overloaded, but I
re-did these a bit more seriously.

This is/was on a Hetzner EX41S
(https://docs.hetzner.com/robot/dedicated-server/general-information/root-server-hardware/),
but I tried it again now in /dev/shm/git.git with a fresh repo from:

	git clone --bare git@github.com:git/git.git

And (I didn't prune out the notice here where it says it's fuzzy (not
earlier either)).

There's some Java thing eating ~50% of 1/8 CPU cores, but otherwise the
box is pretty idle, and this is current "master" with "make CFLAGS=-O3":
	
	$ hyperfine -w 2 -r 10 -L a patience,minimal,histogram,myers './git -C /dev/shm/git.git diff --diff-algorithm={a} v2.0.0 v2.28.0'
	Benchmark 1: ./git -C /dev/shm/git.git diff --diff-algorithm=patience v2.0.0 v2.28.0
	  Time (mean ± σ):     41.131 s ±  0.550 s    [User: 40.990 s, System: 0.104 s]
	  Range (min … max):   40.323 s … 42.172 s    10 runs
	
	Benchmark 2: ./git -C /dev/shm/git.git diff --diff-algorithm=minimal v2.0.0 v2.28.0
	  Time (mean ± σ):     34.821 s ±  0.307 s    [User: 34.707 s, System: 0.100 s]
	  Range (min … max):   34.512 s … 35.523 s    10 runs
	
	Benchmark 3: ./git -C /dev/shm/git.git diff --diff-algorithm=histogram v2.0.0 v2.28.0
	  Time (mean ± σ):     45.443 s ±  0.274 s    [User: 45.328 s, System: 0.107 s]
	  Range (min … max):   44.932 s … 45.810 s    10 runs
	
	Benchmark 4: ./git -C /dev/shm/git.git diff --diff-algorithm=myers v2.0.0 v2.28.0
	  Time (mean ± σ):     33.016 s ±  0.505 s    [User: 32.893 s, System: 0.094 s]
	  Range (min … max):   32.376 s … 33.999 s    10 runs
	
	Summary
	  './git -C /dev/shm/git.git diff --diff-algorithm=myers v2.0.0 v2.28.0' ran
	    1.05 ± 0.02 times faster than './git -C /dev/shm/git.git diff --diff-algorithm=minimal v2.0.0 v2.28.0'
	    1.25 ± 0.03 times faster than './git -C /dev/shm/git.git diff --diff-algorithm=patience v2.0.0 v2.28.0'
	    1.38 ± 0.02 times faster than './git -C /dev/shm/git.git diff --diff-algorithm=histogram v2.0.0 v2.28.0'

So that's pretty much the same as my earlier results, but between the
fresh repo, ram disk, warmup & 10 measuremnets for each these should be
more accurate.

> Using git compiled from current main, I see:
>
> $ hyperfine -L a patience,minimal,histogram,myers './git diff
> --diff-algorithm={a} v2.0.0 v2.28.0'
> Benchmark 1: ./git diff --diff-algorithm=patience v2.0.0 v2.28.0
>   Time (mean ± σ):      1.142 s ±  0.033 s    [User: 1.022 s, System: 0.114 s]
>   Range (min … max):    1.117 s …  1.212 s    10 runs
>
> Benchmark 2: ./git diff --diff-algorithm=minimal v2.0.0 v2.28.0
>   Time (mean ± σ):      1.959 s ±  0.011 s    [User: 1.830 s, System: 0.120 s]
>   Range (min … max):    1.947 s …  1.976 s    10 runs
>
> Benchmark 3: ./git diff --diff-algorithm=histogram v2.0.0 v2.28.0
>   Time (mean ± σ):      1.187 s ±  0.007 s    [User: 1.065 s, System: 0.115 s]
>   Range (min … max):    1.175 s …  1.200 s    10 runs
>
> Benchmark 4: ./git diff --diff-algorithm=myers v2.0.0 v2.28.0
>   Time (mean ± σ):      1.194 s ±  0.007 s    [User: 1.068 s, System: 0.120 s]
>   Range (min … max):    1.184 s …  1.206 s    10 runs
>
> Summary
>   './git diff --diff-algorithm=patience v2.0.0 v2.28.0' ran
>     1.04 ± 0.03 times faster than './git diff
> --diff-algorithm=histogram v2.0.0 v2.28.0'
>     1.05 ± 0.03 times faster than './git diff --diff-algorithm=myers
> v2.0.0 v2.28.0'
>     1.71 ± 0.05 times faster than './git diff --diff-algorithm=minimal
> v2.0.0 v2.28.0'

But without diff.colorMoved=true I see basically your results:
	
	$ hyperfine -w 2 -r 10 -L a patience,minimal,histogram,myers './git -c diff.colorMoved=false -C /dev/shm/git.git diff --diff-algorithm={a} v2.0.0 v2.28.0'
	Benchmark 1: ./git -c diff.colorMoved=false -C /dev/shm/git.git diff --diff-algorithm=patience v2.0.0 v2.28.0
	  Time (mean ± σ):     760.9 ms ±  45.2 ms    [User: 698.6 ms, System: 62.1 ms]
	  Range (min … max):   719.0 ms … 862.2 ms    10 runs
	 
	Benchmark 2: ./git -c diff.colorMoved=false -C /dev/shm/git.git diff --diff-algorithm=minimal v2.0.0 v2.28.0
	  Time (mean ± σ):      1.347 s ±  0.041 s    [User: 1.281 s, System: 0.065 s]
	  Range (min … max):    1.305 s …  1.417 s    10 runs
	 
	Benchmark 3: ./git -c diff.colorMoved=false -C /dev/shm/git.git diff --diff-algorithm=histogram v2.0.0 v2.28.0
	  Time (mean ± σ):     826.3 ms ±  51.1 ms    [User: 767.5 ms, System: 58.6 ms]
	  Range (min … max):   773.7 ms … 929.8 ms    10 runs
	 
	Benchmark 4: ./git -c diff.colorMoved=false -C /dev/shm/git.git diff --diff-algorithm=myers v2.0.0 v2.28.0
	  Time (mean ± σ):     801.1 ms ±  39.4 ms    [User: 736.0 ms, System: 64.9 ms]
	  Range (min … max):   771.6 ms … 904.2 ms    10 runs
	 
	Summary
	  './git -c diff.colorMoved=false -C /dev/shm/git.git diff --diff-algorithm=patience v2.0.0 v2.28.0' ran
	    1.05 ± 0.08 times faster than './git -c diff.colorMoved=false -C /dev/shm/git.git diff --diff-algorithm=myers v2.0.0 v2.28.0'
	    1.09 ± 0.09 times faster than './git -c diff.colorMoved=false -C /dev/shm/git.git diff --diff-algorithm=histogram v2.0.0 v2.28.0'
	    1.77 ± 0.12 times faster than './git -c diff.colorMoved=false -C /dev/shm/git.git diff --diff-algorithm=minimal v2.0.0 v2.28.0'

So they're all within the fuzz-factor, except "minimal".

> And this is on a kind-of low-end refurbished laptop from a few years
> ago (although the repo was recently gc'ed).
>
> I'm biased towards histogram (and making it the default rather than
> making it configurable per file), but that's probably obvious given
> that I made ort use it unconditionally.  And when I made ort use it,
> it was actually a minor performance penalty (~1% IIRC)[*], but I
> thought it was worth it since (a) histogram diffs are more
> understandable to users in general, (b) the histogram diff data
> structures provide an idea for possibly solving some ugly corner cases
> that I don't see a way to solve with the other diffs.

To bring this all home the thing I was going for upthread was to raise
"is this a concern?" I was inclined to think "no", but didn't
know. Since someone who knows way more about the diffs than I probably
ever will (i.e. you :) isn't waiving their hands in panic here I think
we can just consider this checkbox ticked.

I.e. I've seen cases (and this is from vague recollection, I've got no
examples in front of me) where one diff algorithm is worse than others
on performance.

But if your intent is to DoS a service provider you can also just commit
some very long files or whatever, so whether the user can change the
diff algorithm is probably not that interesting, as long as the
performance is within some reasonable bound.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* "bad" diffs (was: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm)
  2023-02-09  8:26       ` Elijah Newren
@ 2023-02-09 10:31         ` Ævar Arnfjörð Bjarmason
  2023-02-09 16:37         ` [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai
  1 sibling, 0 replies; 78+ messages in thread
From: Ævar Arnfjörð Bjarmason @ 2023-02-09 10:31 UTC (permalink / raw)
  To: Elijah Newren; +Cc: John Cai, phillip.wood, John Cai via GitGitGadget, git


On Thu, Feb 09 2023, Elijah Newren wrote:

> [...] I
> think "myers" is default for historical reasons and histogram is
> better not just for special Salesforce xml files, but code files too.
> The output makes more sense to users.  So much so that even though my
> simple testing suggested it had a 2% performance penalty compared to
> myers, I forced ort to use it[1] even though I designed  everything
> else in that algorithm around eking out maximum performance.  Others
> who have tested the diff algorithms have also found histogram has very
> similar performance to myers, and oftentimes even beats it[2][3].
> Also, worries about invalidating rerere caches[4] was real, but we
> already paid that price when we switched to ort.

FWIW as someone who went through that one-time pain for my many git.git
topics it wasn't even a price, it was paying me!

Maybe it's just confirmation bias, or looking at the conflicts with
fresh eyes, but I found I mis-solved some of them seemingly because the
output from the old conflicts was so confusing, but much better with
"ort".

> And if performance
> is still a worry, [3] gives me reason to believe we can make our
> histogram implementation faster.  Finally, for the period of time when
> Palantir was letting me make an internal git distribution (mostly for
> testing ort), I also carried a patch that changed the default diff
> algorithm to histogram (not just for ort, but for diff/log/etc. as
> well).  Never had any complaints from the users from it.  Perhaps you
> could do the same in your local version of git used by gitaly?

I think that might be worth considering for GitLab, John? Although the
bias has definitely been to go with vanilla git semantics, but it sounds
like you might be in favor of endorsing a patch to change the default,
so if that happens to solve the problem...

> I also think the user request got converted to a particular
> solution without looking at the wider problem space:  The idea seemed
> to assume "myers" is default for a good reason, and thus asked for an
> option to use something else.  I'm not sure the assumption is valid; 

Just on the "wider problem", I've also looked at a lot of "bad diffs"
and there's interesting cases where histogram does equally bad as the
others *from most user's POV*, but from a "let's make a small diff" for
a computer it's perfect.

I've seen this most commonly with repetative file formats, e.g JSON-like
ones are a good example. You've probably looked at this exact thing N
times, but in case it's useful consider:
	
	$ cat a
	{
	        thing => 'foo',
	        a => 'b',
	},
	{
	        c => 'd',
	        thing => 'bar',
	}
	$ cat b
	{
	        thing => 'foo',
	        a => 'b',
	},
	{
	        c => 'd',
	},
	{
	        e => 'f',
	        thing => 'bar',
	}

Here all of our diff algorithms generate this equally terrible diff, or
a great diff, depending on your POV :):
	
	diff --git a/a b/b
	index 186017c1f38..5afadde1800 100644
	--- a/a
	+++ b/b
	@@ -4,5 +4,8 @@
	 },
	 {
	        c => 'd',
	+},
	+{
	+       e => 'f',
	        thing => 'bar',
	 }

The problem here is that from the user's POV they didn't add a closing
bracket, comma, open bracket etc. They added a whole new copy/pasted
block, and then modified a key-value in the subsequent one.

But the diff engine doesn't know about any of that, and will "helpfully"
proceed to "steal" parts of the previous block.

All of the cases where users have asked me (in person) about some bad
diffs have pretty much come down to this sort of thing.

In those cases one of the algorithms sometimes *happened* to find a
"better" diff, but in my experience it's been a
wrong-clock-is-right-twice-a-day sort of thing.

I've wondered if we couldn't have some much more stupid but effective
solution to these common cases. All of the ones I remember could
basically be expressed as a hypothetical:

	[diff "c"]
        balanceBrackets = true

Where we'd try as hard as we could not to produce diffs that had
un-balanced brackets. I.e. in this case (I manually produced this by
converting the brackets[1] to []'s, then changing them back:
	
	diff --git a/a b/b
	index 186017c1f38..05cdd03bfa4 100644
	--- a/a
	+++ b/b
	@@ -2,7 +2,10 @@
	        thing => 'foo',
	        a => 'b',
	 },
	+{
	+       x => 'y',
	+},
	 {
	-       c => 'd',
	+       c => 'f',
	        thing => 'bar',
	 }

That's a much worse diff to a computer (now 11 lines, v.s. 8 lines
before), but I'd think to most users that's *much* more understandable,
just by knowing just a bit about the language (although I'd argue we
could go as far as assuming this in general, with how common balanced
brackets[1] are across languages).

P.S.: Funny story: at <pastjob> I once helped a user who'd been
      struggling for hours to turn their already working change into
      something that made more sense with "git diff".

      We eventually managed to come up with something that looked
      "right", I can't remember how, probably some mixture of -U<n>,
      diff algorithm etc.

      Their next question was "Ok, so how do I commit this?", referring
      to "this particular version of the diff".

      Which, having already spent more time than I'd like to admit in
      trying to "help" them was a good reminder to first ask what
      problem we're trying to solve :)

1. By "balanced bracket" I'm referring not just to "{}", but what's
   considered a "mirrored" character in Unicode. I.e. not just ()[]{}<>
   etc., but also ∈∋ and the like (see
   e.g. https://www.compart.com/en/unicode/U+2208)

   For better or worse the Perl 6 language has this as part of its
   grammar, see e.g.:
   https://andrewshitov.com/2018/01/23/embedded-comment-delimiters-in-perl-6/

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-09  9:09           ` Elijah Newren
@ 2023-02-09 14:44             ` Phillip Wood
  2023-02-10  9:57               ` Elijah Newren
  2023-02-11  1:59               ` Jeff King
  0 siblings, 2 replies; 78+ messages in thread
From: Phillip Wood @ 2023-02-09 14:44 UTC (permalink / raw)
  To: Elijah Newren, John Cai
  Cc: Ævar Arnfjörð Bjarmason, John Cai via GitGitGadget,
	git, Patrick Steinhardt

Hi Elijah

On 09/02/2023 09:09, Elijah Newren wrote:
> Hi John and Phillip,
> 
> On Tue, Feb 7, 2023 at 9:05 AM John Cai <johncai86@gmail.com> wrote:
>>
> [...]
>>> Perhaps I'm over simplifying but having read the issue you linked to I couldn't help feeling that the majority of users might be satisfied by just changing gitlab to use the patience algorithm when generating diffs.
>>
>> Right, I recognize this is a judgment call that may be best left up to the list.
>>
>> We don't have a way in GitLab to change the diff algorithm currently. Of course
>> that can be implemented outside of Git,
> 
> Well, the below doesn't allow users to make diffs better for
> *individual* files of interest, but if you agree with me that we
> should just make diffs better for all users automatically, it's a
> two-line change in git.git that I'd love to eventually convince the
> project to take (though obviously doing that would also require some
> documentation changes and some good messaging in release notes and
> whatnot).  I've used it for a good long while, and had a few dozen
> users using this patch too, all without complaint:

I'd support a change to either patience or histogram as the default 
algorithm. My personal preference would be for the patience algorithm as 
I think it generally gives nicer diffs in the cases that the two 
disagree (see below, I've tried changing diff.algorithm to histogram a 
few times and I always end up changing it back to patience pretty 
quickly). However I can see there is an advantage in having "diff" and 
"merge" use the same algorithm as users who diffing either side to the 
merge base will see the same diff that the merge is using. The histogram 
algorithm is known to produce sub-optimal diffs in certain cases[1] but 
I'm not sure how much worse it is in that respect than any of the other 
algorithms.

To see the differences between the output of patience and histogram 
algorithms I diffed the output of "git log -p --no-merges 
--diff-algorithm=patience" and "git log -p --no-merges 
--diff-algorithm=histogram". The first three differences are

- 6c065f72b8 (http: support CURLOPT_PROTOCOLS_STR, 2023-01-16)
   In get_curl_allowed_protocols() the patience algorithm shows the
   change in the return statement more clearly

- 47cfc9bd7d (attr: add flag `--source` to work with tree-ish, 2023-01-14)
    The histogram algorithm shows read_attr_from_index() being moved
    whereas the patience algorithm does not making the diff easier to
    follow.

- b0226007f0 (fsmonitor: eliminate call to deprecated FSEventStream 
function, 2022-12-14)
   In fsm_listen__stop_async() the histogram algorithm shows
   data->shutdown_style = SHUTDOWN_EVENT;
   being moved, which is not as clear as the patience output which
   shows it as a context line.

I think there is a degree of personal preference when it comes to which 
out of patience or histogram is best and the user can easily select 
their preferred algorithm so I'd be happy with either.

Best Wishes

Phillip

[1] 
https://lore.kernel.org/git/CAGZ79kZYO6hHiAM8Sfp3J=VX11c=0-7YDSx3_EAKt5-uvvt-Ew@mail.gmail.com/

> diff --git a/diff.c b/diff.c
> index 329eebf16a..77a46d5b7d 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -55,7 +55,7 @@ static int diff_relative;
>   static int diff_stat_graph_width;
>   static int diff_dirstat_permille_default = 30;
>   static struct diff_options default_diff_options;
> -static long diff_algorithm;
> +static long diff_algorithm = XDF_HISTOGRAM_DIFF;
>   static unsigned ws_error_highlight_default = WSEH_NEW;
> 
>   static char diff_colors[][COLOR_MAXLEN] = {
> diff --git a/t/t4015-diff-whitespace.sh b/t/t4015-diff-whitespace.sh
> index b298f220e0..2f663eab72 100755
> --- a/t/t4015-diff-whitespace.sh
> +++ b/t/t4015-diff-whitespace.sh
> @@ -1549,7 +1549,7 @@ test_expect_success 'short lines of opposite
> sign do not get marked as moved' '
>          this line should be marked as oldMoved newMoved
>          unchanged 4
>          EOF
> -       test_expect_code 1 git diff --no-index --color --color-moved=zebra \
> +       test_expect_code 1 git diff --diff-algorithm=myers --no-index
> --color --color-moved=zebra \
>                  old.txt new.txt >output && cat output &&
>          grep -v index output | test_decode_color >actual &&
>          cat >expect <<-\EOF &&
> 
> 
> I used histogram above rather than patience, since (a) it's what git's
> merge backend uses, (b) it produces roughly similar results to
> patience from a user perspective, (c) past testing has shown it to be
> somewhat faster than patience, and (d) we've potentially got some
> leads in how to speed up our histogram implementation from the README
> over at https://github.com/pascalkuthe/imara-diff.  But, if you really
> wanted to use patience as the default, it'd also be an easy tweak.
> 
> Anyway, just some food for thought.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-07 17:56   ` Jeff King
  2023-02-07 20:18     ` Ævar Arnfjörð Bjarmason
@ 2023-02-09 16:34     ` John Cai
  2023-02-11  1:39       ` Jeff King
  1 sibling, 1 reply; 78+ messages in thread
From: John Cai @ 2023-02-09 16:34 UTC (permalink / raw)
  To: Jeff King
  Cc: John Cai via GitGitGadget, Ævar Arnfjörð Bjarmason,
	git, gitster

On 23/02/07 12:56PM, Jeff King wrote:
> On Sun, Feb 05, 2023 at 03:46:21AM +0000, John Cai via GitGitGadget wrote:
> 
> > +`diff-algorithm`
> > +^^^^^^^^^^^^^^^^
> > +
> > +The attribute `diff-algorithm` affects which algorithm Git uses to generate
> > +diffs. This allows defining diff algorithms per file extension. Precedence rules
> > +are as follows, in order from highest to lowest:
> > +
> > +*Command line option*
> > +
> > +Pass in the `--diff-algorithm` command line option int git-diff(1)
> > +
> > +*Git attributes*
> > +
> > +------------------------
> > +*.json	diff-algorithm=histogram
> > +------------------------
> > +
> > +*Git config*
> > +
> > +----------------------------------------------------------------
> > +[diff]
> > +	algorithm = histogram
> > +----------------------------------------------------------------
> 
> From the user's perspective, this is weirdly inconsistent with the
> existing diff attributes, which would be more like:
> 
>   # in .gitattributes
>   *.json diff=json 
> 
>   # in config
>   [diff "json"]
>   algorithm = histogram

Thanks for this suggestion, Peff. What I like about this is that it builds off
of the existing diff.<driver> scheme rather than inventing another one.
Additionally, we won't get hit with a performance penalty since we already read
gitattrbitues to see if a driver has been set or not.

Thinking out loud, if we add "algorithm" as a key for diff.<driver>, it would be
mutually exclusive with "command" where "command" takes precedence, correct?

> 
> I know why one might choose the scheme you did; it kicks in if the repo
> sets the algorithm, without users having to set up any extra config.
> Which is sort of nice, if we assume that malicious actors don't have any
> incentive to pick the algorithm. In theory they don't, though I saw Ævar
> mention possible DoS elsewhere in the thread.
> 
>   Side note: It's also possible that algorithm selection could be
>   required to trigger a separate security bug (say, a buffer overflow in
>   the patience code or something), so restricting that works in a
>   belt-and-suspenders way. But that somehow feels like like the wrong
>   side of the paranoia-vs-feature line.
> 
> So I dunno. I recognize that this scheme fulfills your immediate needs
> better, but I fear that we'll be stuck with a weird split between "diff"
> and "diff-*" attributes forever. In the long run, having a way for the
> repo to say "and here is some config I recommend to you" would give you
> the best of both, but that is a challenging topic that has been
> discussed and punted on for many years.
> 
> -Peff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-09  8:26       ` Elijah Newren
  2023-02-09 10:31         ` "bad" diffs (was: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm) Ævar Arnfjörð Bjarmason
@ 2023-02-09 16:37         ` John Cai
  1 sibling, 0 replies; 78+ messages in thread
From: John Cai @ 2023-02-09 16:37 UTC (permalink / raw)
  To: Elijah Newren; +Cc: phillip.wood, John Cai via GitGitGadget, git

On 23/02/09 12:26AM, Elijah Newren wrote:
> Hi John,
> 
> On Mon, Feb 6, 2023 at 12:02 PM John Cai <johncai86@gmail.com> wrote:
> >
> > Hi Phillip,
> >
> > On 6 Feb 2023, at 11:27, Phillip Wood wrote:
> >
> > > Hi John
> > >
> > > On 05/02/2023 03:46, John Cai via GitGitGadget wrote:
> > >> From: John Cai <johncai86@gmail.com>
> > >>
> > >> It can be useful to specify diff algorithms per file type. For example,
> > >> one may want to use the minimal diff algorithm for .json files, another
> > >> for .c files, etc.
> > >
> > > Have you got any examples of why this is useful? I find myself occasionally changing the algorithm when the default gives a sub-optimal diff but I've not really noticed any pattern with respect to file types.
> >
> > At $DAYJOB, there has been a discussion and request for a feature like this [1].
> > One use case that came up was to be able to set a different diff algorithm for
> > .json files.
> >
> > 1. https://gitlab.com/gitlab-org/gitaly/-/issues/2591
> 
> A couple points:
> 
> First, there seems to be a misunderstanding in that issue.  In
> particular, the merge algorithm does call into the xdiff library to do
> the three-way content merge of individual files, and when it does so,
> it has to specify the diff algorithm (or take the default, currently
> myers).  merge-recursive allows the diff algorithm to be specified by
> the user (there are
> -Xdiff-algorithm={histogram,minimal,patience,myers} flags to
> merge/rebase for it), while merge-ort uses histogram (though it uses
> the same parser as merge-recursive and thus gets the variables set
> from the -Xdiff-algorithm flag, it just ignores those values and
> hardcodes histogram).
> 
> Second, I also think the user request got converted to a particular
> solution without looking at the wider problem space:  The idea seemed
> to assume "myers" is default for a good reason, and thus asked for an
> option to use something else.  I'm not sure the assumption is valid; I
> think "myers" is default for historical reasons and histogram is
> better not just for special Salesforce xml files, but code files too.
> The output makes more sense to users.  So much so that even though my
> simple testing suggested it had a 2% performance penalty compared to
> myers, I forced ort to use it[1] even though I designed  everything
> else in that algorithm around eking out maximum performance.  Others
> who have tested the diff algorithms have also found histogram has very
> similar performance to myers, and oftentimes even beats it[2][3].
> Also, worries about invalidating rerere caches[4] was real, but we
> already paid that price when we switched to ort.  And if performance
> is still a worry, [3] gives me reason to believe we can make our
> histogram implementation faster.  Finally, for the period of time when
> Palantir was letting me make an internal git distribution (mostly for
> testing ort), I also carried a patch that changed the default diff
> algorithm to histogram (not just for ort, but for diff/log/etc. as
> well).  Never had any complaints from the users from it.  Perhaps you
> could do the same in your local version of git used by gitaly?

Thanks for that suggestion, Elijah. Changing the diff alg to histogram has been
something we've considered doing as well. However, we've gotten customer
requests to also have the option to view their diffs with patience. Seeing as
the diff algorithm is sometimes a matter of preference, we thought giving users
the control through the repository would be nice.

thanks
John

> 
> [1] See c8017176ac ("merge-ort: use histogram diff", 2020-12-13)
> [2] From 85551232b5 ("perf: compare diff algorithms", 2012-03-06):
> "This does indeed show that histogram diff slightly beats Myers, while
> patience is much slower than the others."
> [3] https://github.com/pascalkuthe/imara-diff
> [4] https://lore.kernel.org/git/20120307114714.GA14990@sigill.intra.peff.net/

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-09 14:44             ` Phillip Wood
@ 2023-02-10  9:57               ` Elijah Newren
  2023-02-11 17:39                 ` Phillip Wood
  2023-02-11  1:59               ` Jeff King
  1 sibling, 1 reply; 78+ messages in thread
From: Elijah Newren @ 2023-02-10  9:57 UTC (permalink / raw)
  To: phillip.wood
  Cc: John Cai, Ævar Arnfjörð Bjarmason,
	John Cai via GitGitGadget, git, Patrick Steinhardt

Hi Phillip,

On Thu, Feb 9, 2023 at 6:44 AM Phillip Wood <phillip.wood123@gmail.com> wrote:
>
> Hi Elijah
>
> On 09/02/2023 09:09, Elijah Newren wrote:
> > Hi John and Phillip,
> >
> > On Tue, Feb 7, 2023 at 9:05 AM John Cai <johncai86@gmail.com> wrote:
> >>
> > [...]
> >>> Perhaps I'm over simplifying but having read the issue you linked to I couldn't help feeling that the majority of users might be satisfied by just changing gitlab to use the patience algorithm when generating diffs.
> >>
> >> Right, I recognize this is a judgment call that may be best left up to the list.
> >>
> >> We don't have a way in GitLab to change the diff algorithm currently. Of course
> >> that can be implemented outside of Git,
> >
> > Well, the below doesn't allow users to make diffs better for
> > *individual* files of interest, but if you agree with me that we
> > should just make diffs better for all users automatically, it's a
> > two-line change in git.git that I'd love to eventually convince the
> > project to take (though obviously doing that would also require some
> > documentation changes and some good messaging in release notes and
> > whatnot).  I've used it for a good long while, and had a few dozen
> > users using this patch too, all without complaint:
>
> I'd support a change to either patience or histogram as the default
> algorithm. My personal preference would be for the patience algorithm as
> I think it generally gives nicer diffs in the cases that the two
> disagree (see below, I've tried changing diff.algorithm to histogram a
> few times and I always end up changing it back to patience pretty
> quickly). However I can see there is an advantage in having "diff" and
> "merge" use the same algorithm as users who diffing either side to the
> merge base will see the same diff that the merge is using. The histogram
> algorithm is known to produce sub-optimal diffs in certain cases[1] but
> I'm not sure how much worse it is in that respect than any of the other
> algorithms.
[...]
> [1]
> https://lore.kernel.org/git/CAGZ79kZYO6hHiAM8Sfp3J=VX11c=0-7YDSx3_EAKt5-uvvt-Ew@mail.gmail.com/

Thanks, I might have a fix, though I'm a bit worried my tweaks might
trigger issues elsewhere or cost a bit of performance; I'll need to
test.  Are there any other good known testcases where histogram
produces sub-optimal diffs?

> To see the differences between the output of patience and histogram
> algorithms I diffed the output of "git log -p --no-merges
> --diff-algorithm=patience" and "git log -p --no-merges
> --diff-algorithm=histogram". The first three differences are
>
> - 6c065f72b8 (http: support CURLOPT_PROTOCOLS_STR, 2023-01-16)
>    In get_curl_allowed_protocols() the patience algorithm shows the
>    change in the return statement more clearly
>
> - 47cfc9bd7d (attr: add flag `--source` to work with tree-ish, 2023-01-14)
>     The histogram algorithm shows read_attr_from_index() being moved
>     whereas the patience algorithm does not making the diff easier to
>     follow.
>
> - b0226007f0 (fsmonitor: eliminate call to deprecated FSEventStream
> function, 2022-12-14)
>    In fsm_listen__stop_async() the histogram algorithm shows
>    data->shutdown_style = SHUTDOWN_EVENT;
>    being moved, which is not as clear as the patience output which
>    shows it as a context line.

If my current changes are "good", then they also remove the
differences between patience and histogram for the second and third
commits above.  (And the differences between the two algorithms for
the first commit look really minor.)

> I think there is a degree of personal preference when it comes to which
> out of patience or histogram is best and the user can easily select
> their preferred algorithm so I'd be happy with either.

:-)

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-09 16:34     ` John Cai
@ 2023-02-11  1:39       ` Jeff King
  0 siblings, 0 replies; 78+ messages in thread
From: Jeff King @ 2023-02-11  1:39 UTC (permalink / raw)
  To: John Cai
  Cc: John Cai via GitGitGadget, Ævar Arnfjörð Bjarmason,
	git, gitster

On Thu, Feb 09, 2023 at 11:34:00AM -0500, John Cai wrote:

> > From the user's perspective, this is weirdly inconsistent with the
> > existing diff attributes, which would be more like:
> > 
> >   # in .gitattributes
> >   *.json diff=json 
> > 
> >   # in config
> >   [diff "json"]
> >   algorithm = histogram
> 
> Thanks for this suggestion, Peff. What I like about this is that it builds off
> of the existing diff.<driver> scheme rather than inventing another one.
> Additionally, we won't get hit with a performance penalty since we already read
> gitattrbitues to see if a driver has been set or not.
> 
> Thinking out loud, if we add "algorithm" as a key for diff.<driver>, it would be
> mutually exclusive with "command" where "command" takes precedence, correct?

Yes. I think the documentation would be something like "When generating
a diff internally, use <algorithm> to do so." And external diffs
obviously skip that code path internally.

I didn't check whether your patch does this or not, but should this
feature (however it is engaged) apply to diff-stats, too? I think
--patience, etc, does, which makes sense. But that might be something
worth elaborating in the description, too.

-Peff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-09 14:44             ` Phillip Wood
  2023-02-10  9:57               ` Elijah Newren
@ 2023-02-11  1:59               ` Jeff King
  2023-02-15  2:35                 ` Elijah Newren
  2023-02-15 14:44                 ` Phillip Wood
  1 sibling, 2 replies; 78+ messages in thread
From: Jeff King @ 2023-02-11  1:59 UTC (permalink / raw)
  To: phillip.wood
  Cc: Elijah Newren, John Cai, Ævar Arnfjörð Bjarmason,
	John Cai via GitGitGadget, git, Patrick Steinhardt

On Thu, Feb 09, 2023 at 02:44:15PM +0000, Phillip Wood wrote:

> To see the differences between the output of patience and histogram
> algorithms I diffed the output of "git log -p --no-merges
> --diff-algorithm=patience" and "git log -p --no-merges
> --diff-algorithm=histogram". The first three differences are
> 
> - 6c065f72b8 (http: support CURLOPT_PROTOCOLS_STR, 2023-01-16)
>   In get_curl_allowed_protocols() the patience algorithm shows the
>   change in the return statement more clearly
> 
> - 47cfc9bd7d (attr: add flag `--source` to work with tree-ish, 2023-01-14)
>    The histogram algorithm shows read_attr_from_index() being moved
>    whereas the patience algorithm does not making the diff easier to
>    follow.
> 
> - b0226007f0 (fsmonitor: eliminate call to deprecated FSEventStream
> function, 2022-12-14)
>   In fsm_listen__stop_async() the histogram algorithm shows
>   data->shutdown_style = SHUTDOWN_EVENT;
>   being moved, which is not as clear as the patience output which
>   shows it as a context line.

Just a small counter-point, since I happened to be looking at myers vs
patience for something elsewhere in the thread, but:

  git show 35bd13fcd2caa4185bf3729655ca20b6a5fe9b6f builtin/add.c

looks slightly better to me with myers, even though it is 2 lines
longer. The issue is that patience and histogram are very eager to use
blank lines as anchor points, so a diff like:

  -some words
  -
  -and some more
  +unrelated content
  +
  +but it happens to also be two paragraphs

in myers becomes:

  -some words
  +unrelated content
  
  -and some more
  +but it happens to also be two paragraphs

in patience (here I'm using single lines, but in practice these may be
paragraphs, or stanzas of code). I think that's also the _strength_ of
patience in many cases, but it really depends on the content. Replacing
a multi-stanza block with another one may be the best explanation for
what happened. Or the two stanzas may be independent, and showing the
change for each one may be better.

I'm not sure which one happens more often. And you'd probably want to
weight it by how good/bad the change is. In the example I showed I don't
find patience very much worse, since it's already a pretty ugly diff.
But in cases where patience shines, it may be making things
significantly more readable.

I don't have a super strong opinion, but I just wanted to chime in that
it is not clear to me that patience/histogram is always a win over myers
(yes, I know your examples were comparing patience vs histogram, but the
larger thread is discussing the other).

-Peff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-09  9:41       ` Ævar Arnfjörð Bjarmason
@ 2023-02-11  2:04         ` Jeff King
  0 siblings, 0 replies; 78+ messages in thread
From: Jeff King @ 2023-02-11  2:04 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: Elijah Newren, John Cai via GitGitGadget, git, John Cai

On Thu, Feb 09, 2023 at 10:41:33AM +0100, Ævar Arnfjörð Bjarmason wrote:

> > If you run with more than 1 run, are your numbers even repeatable?
> 
> Yes, but tl;dr it's diff.colorMoved=true, sorry, see below.

Wow, that's really slow. I was slightly surprised, because I also use
colorMoved. But I set it to "plain", which is way faster:

  $ time git diff --color-moved=default v2.0.0 v2.28.0 >/dev/null
  real	0m18.492s
  user	0m18.411s
  sys	0m0.081s

  $ time git diff --color-moved=plain v2.0.0 v2.28.0 >/dev/null
  real	0m0.942s
  user	0m0.841s
  sys	0m0.101s

I didn't dig into why, but it's possible there's some low-hanging fruit
in the zebra/block code.

I also have a mild feeling of deja vu that we may have discussed this
before, but a quick search in the archive didn't yield anything. So I'll
leave it for somebody to investigate further if they're interested.

-Peff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-10  9:57               ` Elijah Newren
@ 2023-02-11 17:39                 ` Phillip Wood
  0 siblings, 0 replies; 78+ messages in thread
From: Phillip Wood @ 2023-02-11 17:39 UTC (permalink / raw)
  To: Elijah Newren
  Cc: John Cai, Ævar Arnfjörð Bjarmason,
	John Cai via GitGitGadget, git, Patrick Steinhardt

Hi Elijah

On 10/02/2023 09:57, Elijah Newren wrote:
> Hi Phillip,
> 
> On Thu, Feb 9, 2023 at 6:44 AM Phillip Wood <phillip.wood123@gmail.com> wrote:
>>
>> Hi Elijah
>>
>> On 09/02/2023 09:09, Elijah Newren wrote:
>> I'd support a change to either patience or histogram as the default
>> algorithm. My personal preference would be for the patience algorithm as
>> I think it generally gives nicer diffs in the cases that the two
>> disagree (see below, I've tried changing diff.algorithm to histogram a
>> few times and I always end up changing it back to patience pretty
>> quickly). However I can see there is an advantage in having "diff" and
>> "merge" use the same algorithm as users who diffing either side to the
>> merge base will see the same diff that the merge is using. The histogram
>> algorithm is known to produce sub-optimal diffs in certain cases[1] but
>> I'm not sure how much worse it is in that respect than any of the other
>> algorithms.
> [...]
>> [1]
>> https://lore.kernel.org/git/CAGZ79kZYO6hHiAM8Sfp3J=VX11c=0-7YDSx3_EAKt5-uvvt-Ew@mail.gmail.com/
> 
> Thanks, I might have a fix, though I'm a bit worried my tweaks might
> trigger issues elsewhere or cost a bit of performance; I'll need to
> test.  Are there any other good known testcases where histogram
> produces sub-optimal diffs?

Not that I'm aware of (I've a feeling there might have been something on 
the JGit mailing list but I only managed to find a copy of Stefan's 
message). Loosely related is [1] which talks about hash collisions and 
I've never found the time to look at properly. I suspect any hash 
collision problem is more likely to affect xdl_classify_record() which 
is used by all the algorithms.

>> To see the differences between the output of patience and histogram
>> algorithms I diffed the output of "git log -p --no-merges
>> --diff-algorithm=patience" and "git log -p --no-merges
>> --diff-algorithm=histogram". The first three differences are
>>
>> - 6c065f72b8 (http: support CURLOPT_PROTOCOLS_STR, 2023-01-16)
>>     In get_curl_allowed_protocols() the patience algorithm shows the
>>     change in the return statement more clearly
>>
>> - 47cfc9bd7d (attr: add flag `--source` to work with tree-ish, 2023-01-14)
>>      The histogram algorithm shows read_attr_from_index() being moved
>>      whereas the patience algorithm does not making the diff easier to
>>      follow.
>>
>> - b0226007f0 (fsmonitor: eliminate call to deprecated FSEventStream
>> function, 2022-12-14)
>>     In fsm_listen__stop_async() the histogram algorithm shows
>>     data->shutdown_style = SHUTDOWN_EVENT;
>>     being moved, which is not as clear as the patience output which
>>     shows it as a context line.
> 
> If my current changes are "good", then they also remove the
> differences between patience and histogram for the second and third
> commits above.  (And the differences between the two algorithms for
> the first commit look really minor.)

Interesting. I agree the differences for the first commit are small. 
Interestingly I think they come from patience algorithm falling back to 
the myers implementation because it cannot find any unique context lines 
(I have a patch that removes the fallback[2] and it gives the same 
result as the histogram implementation).

Best Wishes

Phillip

[1] 
https://lore.kernel.org/git/4e0eff48-4a3e-4f0e-9ed2-d01ec38442a5@www.fastmail.com/

[2] https://github.com/phillipwood/git/commits/pure-patience-diff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-09  8:44       ` Elijah Newren
@ 2023-02-14 21:16         ` John Cai
  2023-02-15  3:41           ` Elijah Newren
  0 siblings, 1 reply; 78+ messages in thread
From: John Cai @ 2023-02-14 21:16 UTC (permalink / raw)
  To: Elijah Newren
  Cc: Ævar Arnfjörð Bjarmason, John Cai via GitGitGadget,
	git

Hi Elijah,

On 9 Feb 2023, at 3:44, Elijah Newren wrote:

> Hi John,
>
> On Mon, Feb 6, 2023 at 12:47 PM John Cai <johncai86@gmail.com> wrote:
>>
> [...]
>> That being said, here's a separate issue. I benchmarked the usage of
>> .gitattributes as introduced in this patch series, and indeed it does look like
>> there is additional latency:
>>
>> $ echo "* diff-algorithm=patience >> .gitattributes
>> $ hyperfine -r 5 'git-bin-wrapper diff --diff-algorithm=patience v2.0.0 v2.28.0'                      ✭
>> Benchmark 1: git-bin-wrapper diff --diff-algorithm=patience v2.0.0 v2.28.0
>>   Time (mean ± σ):     889.4 ms ± 113.8 ms    [User: 715.7 ms, System: 65.3 ms]
>>   Range (min … max):   764.1 ms … 1029.3 ms    5 runs
>>
>> $ hyperfine -r 5 'git-bin-wrapper diff v2.0.0 v2.28.0'                                                ✭
>> Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0
>>   Time (mean ± σ):      2.146 s ±  0.368 s    [User: 0.827 s, System: 0.243 s]
>>   Range (min … max):    1.883 s …  2.795 s    5 runs
>>
>> and I imagine the latency scales with the size of .gitattributes. Although I'm
>> not familiar with other parts of the codebase and how it deals with the latency
>> introduced by reading attributes files.
>
> Yeah, that seems like a large relative performance penalty.  I had the
> feeling that histogram wasn't made the default over myers mostly due
> to inertia and due to a potential 2% loss in performance (since
> potentially corrected by Phillip's 663c5ad035 ("diff histogram: intern
> strings", 2021-11-17)).  If we had changed the default diff algorithm
> to histogram, I suspect folks wouldn't have been asking for per-file
> knobs to use a better diff algorithm.  And the performance penalty for
> this alternative is clearly much larger than 2%, which makes me think
> we might want to just revisit the default instead of allowing per-file
> tweaks.

It seems like the performance penalty was because I was adding calls to parse
attribute files. Piggy backing off of the attribute parsing in userdiff.h will
allow us to not incur this performance penalty:

$ hyperfine -r 5 -L a bin-wrappers/git,git '{a} diff v2.0.0 v2.28.0'
Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0
  Time (mean ± σ):      1.072 s ±  0.289 s    [User: 0.626 s, System: 0.081 s]
  Range (min … max):    0.772 s …  1.537 s    5 runs

Benchmark 2: git diff v2.0.0 v2.28.0
  Time (mean ± σ):      1.003 s ±  0.065 s    [User: 0.684 s, System: 0.067 s]
  Range (min … max):    0.914 s …  1.091 s    5 runs

Summary
  'git diff v2.0.0 v2.28.0' ran
    1.07 ± 0.30 times faster than 'git-bin-wrapper diff v2.0.0 v2.28.0'

>
> And on a separate note...
>
> There's another set of considerations we might need to include here as
> well that I haven't seen anyone else in this thread talk about:

These are some great questions. I'll do my best to answer them.
>
> * When trying to diff files, do we read the .gitattributes file from
> the current checkout to determine the diff algorithm(s)?  Or the
> index?  Or the commit we are diffing against?
> * If we use the current checkout or index, what about bare clones or
> diffing between two different commits?
> * If diffing between two different commits, and the .gitattributes has
> changed between those commits, which .gitattributes file wins?
> * If diffing between two different commits, and the .gitattributes has
> NOT changed, BUT a file has been renamed and the old and new names
> have different rules, which rule wins?

In the next version I plan on using Peff's suggestion of utilizing the existing
diff driver scheme [1]. I believe these four questions are addressed if we use
the existing userdiff.h API, which in turn calls the attr.h API. We check the
worktree, then fallback to the index.

By using the userdiff.h API, the behavior will match what users already expect
when they for instance set an external driver.

1. https://lore.kernel.org/git/Y+KQtqNPews3vBS8@coredump.intra.peff.net/

>
> * If per-file diff algorithms are adopted widely enough, will we be
> forced to change the merge algorithm to also pay attention to them?
> If it does, more complicated rename cases occur and we need rules for
> how to handle those.
> * If the merge algorithm has to pay attention to .gitattributes for
> this too, we'll have even more corner cases around what happens if
> there are merge conflicts in .gitattributes itself (which is already
> kind of ugly and kludged)

I see this feature as a user-experience type convenience feature, so I don't
believe there's need for the merge machinery to also pay attention to the diff
algorithm set through gitattrbutes. We can clarify this in the documentation.

>
>
> Anyway, I know I'm a bit animated and biased in this area, and I
> apologize if I'm a bit too much so.  Even if I am, hopefully my
> comments at least provide some useful context.

No problem! thanks for raising these issues.

thanks
John

^ permalink raw reply	[flat|nested] 78+ messages in thread

* [PATCH v2 0/2] Teach diff to honor diff algorithms set through git attributes
  2023-02-05  3:46 [PATCH 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
  2023-02-05  3:46 ` [PATCH 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
  2023-02-05  3:46 ` [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
@ 2023-02-14 21:40 ` John Cai via GitGitGadget
  2023-02-14 21:40   ` [PATCH v2 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
                     ` (2 more replies)
  2 siblings, 3 replies; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-14 21:40 UTC (permalink / raw)
  To: git
  Cc: Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai

When a repository contains different kinds of files, it may be desirable to
use different algorithms based on file type. This is currently not feasible
through the command line or using git configs. However, we can leverage the
fact that gitattributes are path aware.

Teach the diff machinery to check gitattributes when diffing files.

Changes since V1:

 * utilize the existing diff.<driver>.* scheme where the driver is defined
   in gitattributes, but the algorithm is defined in the gitconfig.

To address some of the performance concerns in the previous series, a
benchmark shows that a performance penalty is no longer incurred, now that
we are no longer adding an additional attributes parsing call:

$ hyperfine -r 5 -L a bin-wrappers/git,git '{a} diff v2.0.0 v2.28.0'
Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0 Time (mean ± σ): 1.072 s ±
0.289 s [User: 0.626 s, System: 0.081 s] Range (min … max): 0.772 s … 1.537
s 5 runs

Benchmark 2: git diff v2.0.0 v2.28.0 Time (mean ± σ): 1.003 s ± 0.065 s
[User: 0.684 s, System: 0.067 s] Range (min … max): 0.914 s … 1.091 s 5 runs

Summary 'git diff v2.0.0 v2.28.0' ran 1.07 ± 0.30 times faster than
'git-bin-wrapper diff v2.0.0 v2.28.0'

John Cai (2):
  diff: consolidate diff algorithm option parsing
  diff: teach diff to read gitattribute diff-algorithm

 Documentation/gitattributes.txt |  41 +++++++++++-
 diff.c                          | 112 ++++++++++++++++++++------------
 diff.h                          |   2 +
 t/lib-diff-alternative.sh       |  38 ++++++++++-
 userdiff.c                      |   4 +-
 userdiff.h                      |   1 +
 6 files changed, 154 insertions(+), 44 deletions(-)


base-commit: c867e4fa180bec4750e9b54eb10f459030dbebfd
Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-git-1452%2Fjohn-cai%2Fjc%2Fattr-diff-algo-v2
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-git-1452/john-cai/jc/attr-diff-algo-v2
Pull-Request: https://github.com/git/git/pull/1452

Range-diff vs v1:

 1:  f56bd38ac3f ! 1:  0c5e1fc6c26 diff: consolidate diff algorithm option parsing
     @@
       ## Metadata ##
     -Author: John Cai <jcai@gitlab.com>
     +Author: John Cai <johncai86@gmail.com>
      
       ## Commit message ##
          diff: consolidate diff algorithm option parsing
     @@ Commit message
          subsequent commit which teaches diff to keep track of whether or not a
          diff algorithm has been set via the command line.
      
     -    While we're at it, the logic that sets the diff algorithm in
     -    diff_opt_diff_algorithm() can be refactored into a helper that will
     +    Additionally, the logic that sets the diff algorithm in
     +    diff_opt_diff_algorithm() can  be refactored into a helper that will
          allow multiple callsites to set the diff algorithm.
      
          Signed-off-by: John Cai <johncai86@gmail.com>
     @@ diff.c: static int diff_opt_diff_algorithm(const struct option *opt,
      +	BUG_ON_OPT_ARG(arg);
      +
      +	if (!strcmp(opt->long_name, "patience")) {
     -+		int i;
     ++		size_t i;
      +		/*
      +		 * Both --patience and --anchored use PATIENCE_DIFF
      +		 * internally, so remove any anchors previously
     @@ diff.c: static int diff_opt_diff_algorithm(const struct option *opt,
      +	}
      +
      +	if (set_diff_algorithm(options, opt->long_name))
     -+		return error(_("available diff algorithms include \"myers\", "
     -+			       "\"minimal\", \"patience\" and \"histogram\""));
     ++		BUG("available diff algorithms include \"myers\", "
     ++			       "\"minimal\", \"patience\" and \"histogram\"");
      +
       	return 0;
       }
 2:  8e73793b0db ! 2:  cb030563149 diff: teach diff to read gitattribute diff-algorithm
     @@ Commit message
          one may want to use the minimal diff algorithm for .json files, another
          for .c files, etc.
      
     -    Teach the diff machinery to check attributes for a diff algorithm.
     -    Enforce precedence by favoring the command line option, then looking at
     -    attributes, then finally the config.
     +    Teach the diff machinery to check attributes for a diff driver. Also
     +    teach the diff driver parser a new type "algorithm" to look for in the
     +    config, which will be used if a driver has been specified through the
     +    attributes.
      
     -    To enforce precedence order, set the `xdl_opts_command_line` member
     +    Enforce precedence of diff algorithm by favoring the command line option,
     +    then looking at the driver attributes & config combination, then finally
     +    the diff.algorithm config.
     +
     +    To enforce precedence order, use the `xdl_opts_command_line` member
          during options pasing to indicate the diff algorithm was set via command
          line args.
      
     @@ Documentation/gitattributes.txt: String::
       	by the configuration variables in the "diff.foo" section of the
       	Git config file.
       
     -+`diff-algorithm`
     -+^^^^^^^^^^^^^^^^
     -+
     -+The attribute `diff-algorithm` affects which algorithm Git uses to generate
     -+diffs. This allows defining diff algorithms per file extension. Precedence rules
     -+are as follows, in order from highest to lowest:
     +-
     + Defining an external diff driver
     + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     + 
     +@@ Documentation/gitattributes.txt: with the above configuration, i.e. `j-c-diff`, with 7
     + parameters, just like `GIT_EXTERNAL_DIFF` program is called.
     + See linkgit:git[1] for details.
     + 
     ++Setting the internal diff algorithm
     ++^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
      +
     -+*Command line option*
     ++The diff algorithm can be set through the `diff.algorithm` config key, but
     ++sometimes it may be helpful to set the diff algorithm by path. For example, one
     ++might wish to set a diff algorithm automatically for all `.json` files such that
     ++the user would not need to pass in a separate command line `--diff-algorithm` flag each
     ++time.
      +
     -+Pass in the `--diff-algorithm` command line option int git-diff(1)
     ++First, in `.gitattributes`, you would assign the `diff` attribute for paths.
      +
      +*Git attributes*
     -+
      +------------------------
     -+*.json	diff-algorithm=histogram
     ++*.json diff=<name>
      +------------------------
      +
     ++Then, you would define a "diff.<name>.algorithm" configuration to specify the
     ++diff algorithm, choosing from `meyers`, `patience`, `minimal`, and `histogram`.
     ++
     ++*Git config*
     ++
     ++----------------------------------------------------------------
     ++[diff "<name>"]
     ++  algorithm = histogram
     ++----------------------------------------------------------------
     ++
     ++This diff algorithm applies to git-diff(1), including the `--stat` output.
     ++
     ++NOTE: If the `command` key also exists, then Git will treat this as an external
     ++diff and attempt to use the value set for `command` as an external program. For
     ++instance, the following config, combined with the above `.gitattributes` file,
     ++will result in `command` favored over `algorithm`.
     ++
      +*Git config*
      +
      +----------------------------------------------------------------
     -+[diff]
     -+	algorithm = histogram
     ++[diff "<name>"]
     ++  command = j-c-diff
     ++  algorithm = histogram
      +----------------------------------------------------------------
       
     - Defining an external diff driver
     - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     + Defining a custom hunk-header
     + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
      
       ## diff.c ##
     -@@ diff.c: static void builtin_diff(const char *name_a,
     - 		ecbdata.opt = o;
     - 		if (header.len && !o->flags.suppress_diff_headers)
     - 			ecbdata.header = &header;
     -+
     -+		if (!o->xdl_opts_command_line) {
     -+			static struct attr_check *check;
     -+			const char *one_diff_algo;
     -+			const char *two_diff_algo;
     -+
     -+			check = attr_check_alloc();
     -+			attr_check_append(check, git_attr("diff-algorithm"));
     -+
     -+			git_check_attr(the_repository->index, NULL, one->path, check);
     -+			one_diff_algo = check->items[0].value;
     -+			git_check_attr(the_repository->index, NULL, two->path, check);
     -+			two_diff_algo = check->items[0].value;
     -+
     -+			if (!ATTR_UNSET(one_diff_algo) && !ATTR_UNSET(two_diff_algo) &&
     -+				!strcmp(one_diff_algo, two_diff_algo))
     -+				set_diff_algorithm(o, one_diff_algo);
     -+
     -+			attr_check_free(check);
     -+		}
     -+
     - 		xpp.flags = o->xdl_opts;
     - 		xpp.ignore_regex = o->ignore_regex;
     - 		xpp.ignore_regex_nr = o->ignore_regex_nr;
     +@@ diff.c: static void run_diff_cmd(const char *pgm,
     + 	const char *xfrm_msg = NULL;
     + 	int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
     + 	int must_show_header = 0;
     ++	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, attr_path);
     + 
     +-
     +-	if (o->flags.allow_external) {
     +-		struct userdiff_driver *drv;
     +-
     +-		drv = userdiff_find_by_path(o->repo->index, attr_path);
     ++	if (o->flags.allow_external)
     + 		if (drv && drv->external)
     + 			pgm = drv->external;
     +-	}
     + 
     + 	if (msg) {
     + 		/*
     +@@ diff.c: static void run_diff_cmd(const char *pgm,
     + 		run_external_diff(pgm, name, other, one, two, xfrm_msg, o);
     + 		return;
     + 	}
     +-	if (one && two)
     ++	if (one && two) {
     ++		if (!o->xdl_opts_command_line)
     ++			if (drv && drv->algorithm)
     ++				set_diff_algorithm(o, drv->algorithm);
     ++
     + 		builtin_diff(name, other ? other : name,
     + 			     one, two, xfrm_msg, must_show_header,
     + 			     o, complete_rewrite);
     +-	else
     ++	} else {
     + 		fprintf(o->file, "* Unmerged path %s\n", name);
     ++	}
     + }
     + 
     + static void diff_fill_oid_info(struct diff_filespec *one, struct index_state *istate)
     +@@ diff.c: static void run_diffstat(struct diff_filepair *p, struct diff_options *o,
     + 	const char *name;
     + 	const char *other;
     + 
     ++	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
     ++	if (drv && drv->algorithm)
     ++		set_diff_algorithm(o, drv->algorithm);
     ++
     + 	if (DIFF_PAIR_UNMERGED(p)) {
     + 		/* unmerged */
     + 		builtin_diffstat(p->one->path, NULL, NULL, NULL,
      @@ diff.c: static int diff_opt_diff_algorithm(const struct option *opt,
       		return error(_("option diff-algorithm accepts \"myers\", "
       			       "\"minimal\", \"patience\" and \"histogram\""));
     @@ diff.c: static int diff_opt_diff_algorithm(const struct option *opt,
       }
       
      @@ diff.c: static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
     - 		return error(_("available diff algorithms include \"myers\", "
     - 			       "\"minimal\", \"patience\" and \"histogram\""));
     + 		BUG("available diff algorithms include \"myers\", "
     + 			       "\"minimal\", \"patience\" and \"histogram\"");
       
      +	options->xdl_opts_command_line = 1;
      +
     @@ diff.h: struct diff_options {
       	char **anchors;
      
       ## t/lib-diff-alternative.sh ##
     -@@ t/lib-diff-alternative.sh: EOF
     +@@ t/lib-diff-alternative.sh: index $file1..$file2 100644
     +  }
     + EOF
       
     ++	cat >expect_diffstat <<EOF
     ++ file1 => file2 | 21 ++++++++++-----------
     ++ 1 file changed, 10 insertions(+), 11 deletions(-)
     ++EOF
     ++
       	STRATEGY=$1
       
      +	test_expect_success "$STRATEGY diff from attributes" '
     -+		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
     ++		echo "file* diff=driver" >.gitattributes &&
     ++		git config diff.driver.algorithm "$STRATEGY" &&
      +		test_must_fail git diff --no-index file1 file2 > output &&
     ++		cat expect &&
     ++		cat output &&
      +		test_cmp expect output
      +	'
     ++
     ++	test_expect_success "$STRATEGY diff from attributes has valid diffstat" '
     ++		echo "file* diff=driver" >.gitattributes &&
     ++		git config diff.driver.algorithm "$STRATEGY" &&
     ++		test_must_fail git diff --stat --no-index file1 file2 > output &&
     ++		test_cmp expect_diffstat output
     ++	'
      +
       	test_expect_success "$STRATEGY diff" '
      -		test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
     @@ t/lib-diff-alternative.sh: EOF
      +	'
      +
      +	test_expect_success "$STRATEGY diff command line precedence before attributes" '
     -+		echo "file* diff-algorithm=meyers" >.gitattributes &&
     ++		echo "file* diff=driver" >.gitattributes &&
     ++		git config diff.driver.algorithm meyers &&
      +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
      +		test_cmp expect output
      +	'
      +
      +	test_expect_success "$STRATEGY diff attributes precedence before config" '
      +		git config diff.algorithm default &&
     -+		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
     -+		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
     ++		echo "file* diff=driver" >.gitattributes &&
     ++		git config diff.driver.algorithm "$STRATEGY" &&
     ++		test_must_fail git diff --no-index file1 file2 > output &&
       		test_cmp expect output
       	'
       
     -@@ t/lib-diff-alternative.sh: EOF
     - 		test_must_fail git diff --no-index "--$STRATEGY" uniq1 uniq2 > output &&
     - 		test_cmp expect output
     - 	'
     -+
     -+	test_expect_success "$STRATEGY diff from attributes" '
     -+		echo "file* diff-algorithm=$STRATEGY" >.gitattributes &&
     -+		test_must_fail git diff --no-index uniq1 uniq2 > output &&
     -+		test_cmp expect output
     -+	'
     - }
     +
     + ## userdiff.c ##
     +@@ userdiff.c: PATTERNS("scheme",
     + 	 "|([^][)(}{[ \t])+"),
     + PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
     + 	 "\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
     +-{ "default", NULL, -1, { NULL, 0 } },
     ++{ "default", NULL, NULL, -1, { NULL, 0 } },
     + };
     + #undef PATTERNS
     + #undef IPATTERN
     +@@ userdiff.c: int userdiff_config(const char *k, const char *v)
     + 		return parse_bool(&drv->textconv_want_cache, k, v);
     + 	if (!strcmp(type, "wordregex"))
     + 		return git_config_string(&drv->word_regex, k, v);
     ++	if (!strcmp(type, "algorithm"))
     ++		return git_config_string(&drv->algorithm, k, v);
       
     + 	return 0;
     + }
     +
     + ## userdiff.h ##
     +@@ userdiff.h: struct userdiff_funcname {
     + struct userdiff_driver {
     + 	const char *name;
     + 	const char *external;
     ++	const char *algorithm;
     + 	int binary;
     + 	struct userdiff_funcname funcname;
     + 	const char *word_regex;

-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 78+ messages in thread

* [PATCH v2 1/2] diff: consolidate diff algorithm option parsing
  2023-02-14 21:40 ` [PATCH v2 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
@ 2023-02-14 21:40   ` John Cai via GitGitGadget
  2023-02-15  2:38     ` Junio C Hamano
  2023-02-14 21:40   ` [PATCH v2 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
  2023-02-17 20:21   ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
  2 siblings, 1 reply; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-14 21:40 UTC (permalink / raw)
  To: git
  Cc: Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai, John Cai

From: John Cai <johncai86@gmail.com>

The diff option parsing for --minimal, --patience, --histgoram can all
be consolidated into one function. This is a preparatory step for the
subsequent commit which teaches diff to keep track of whether or not a
diff algorithm has been set via the command line.

Additionally, the logic that sets the diff algorithm in
diff_opt_diff_algorithm() can  be refactored into a helper that will
allow multiple callsites to set the diff algorithm.

Signed-off-by: John Cai <johncai86@gmail.com>
---
 diff.c | 87 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 54 insertions(+), 33 deletions(-)

diff --git a/diff.c b/diff.c
index 329eebf16a0..92a0eab942e 100644
--- a/diff.c
+++ b/diff.c
@@ -3437,6 +3437,22 @@ static int diff_filepair_is_phoney(struct diff_filespec *one,
 	return !DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two);
 }
 
+static int set_diff_algorithm(struct diff_options *opts,
+			      const char *alg)
+{
+	long value = parse_algorithm_value(alg);
+
+	if (value < 0)
+		return 1;
+
+	/* clear out previous settings */
+	DIFF_XDL_CLR(opts, NEED_MINIMAL);
+	opts->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
+	opts->xdl_opts |= value;
+
+	return 0;
+}
+
 static void builtin_diff(const char *name_a,
 			 const char *name_b,
 			 struct diff_filespec *one,
@@ -5107,17 +5123,40 @@ static int diff_opt_diff_algorithm(const struct option *opt,
 				   const char *arg, int unset)
 {
 	struct diff_options *options = opt->value;
-	long value = parse_algorithm_value(arg);
 
 	BUG_ON_OPT_NEG(unset);
-	if (value < 0)
+
+	if (set_diff_algorithm(options, arg))
 		return error(_("option diff-algorithm accepts \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\""));
 
-	/* clear out previous settings */
-	DIFF_XDL_CLR(options, NEED_MINIMAL);
-	options->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
-	options->xdl_opts |= value;
+	return 0;
+}
+
+static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
+				   const char *arg, int unset)
+{
+	struct diff_options *options = opt->value;
+
+	BUG_ON_OPT_NEG(unset);
+	BUG_ON_OPT_ARG(arg);
+
+	if (!strcmp(opt->long_name, "patience")) {
+		size_t i;
+		/*
+		 * Both --patience and --anchored use PATIENCE_DIFF
+		 * internally, so remove any anchors previously
+		 * specified.
+		 */
+		for (i = 0; i < options->anchors_nr; i++)
+			free(options->anchors[i]);
+		options->anchors_nr = 0;
+	}
+
+	if (set_diff_algorithm(options, opt->long_name))
+		BUG("available diff algorithms include \"myers\", "
+			       "\"minimal\", \"patience\" and \"histogram\"");
+
 	return 0;
 }
 
@@ -5242,26 +5281,6 @@ static enum parse_opt_result diff_opt_output(struct parse_opt_ctx_t *ctx,
 	return 0;
 }
 
-static int diff_opt_patience(const struct option *opt,
-			     const char *arg, int unset)
-{
-	struct diff_options *options = opt->value;
-	int i;
-
-	BUG_ON_OPT_NEG(unset);
-	BUG_ON_OPT_ARG(arg);
-	options->xdl_opts = DIFF_WITH_ALG(options, PATIENCE_DIFF);
-	/*
-	 * Both --patience and --anchored use PATIENCE_DIFF
-	 * internally, so remove any anchors previously
-	 * specified.
-	 */
-	for (i = 0; i < options->anchors_nr; i++)
-		free(options->anchors[i]);
-	options->anchors_nr = 0;
-	return 0;
-}
-
 static int diff_opt_ignore_regex(const struct option *opt,
 				 const char *arg, int unset)
 {
@@ -5562,9 +5581,10 @@ struct option *add_diff_options(const struct option *opts,
 			    N_("prevent rename/copy detection if the number of rename/copy targets exceeds given limit")),
 
 		OPT_GROUP(N_("Diff algorithm options")),
-		OPT_BIT(0, "minimal", &options->xdl_opts,
-			N_("produce the smallest possible diff"),
-			XDF_NEED_MINIMAL),
+		OPT_CALLBACK_F(0, "minimal", options, NULL,
+			       N_("produce the smallest possible diff"),
+			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
+			       diff_opt_diff_algorithm_no_arg),
 		OPT_BIT_F('w', "ignore-all-space", &options->xdl_opts,
 			  N_("ignore whitespace when comparing lines"),
 			  XDF_IGNORE_WHITESPACE, PARSE_OPT_NONEG),
@@ -5589,10 +5609,11 @@ struct option *add_diff_options(const struct option *opts,
 		OPT_CALLBACK_F(0, "patience", options, NULL,
 			       N_("generate diff using the \"patience diff\" algorithm"),
 			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
-			       diff_opt_patience),
-		OPT_BITOP(0, "histogram", &options->xdl_opts,
-			  N_("generate diff using the \"histogram diff\" algorithm"),
-			  XDF_HISTOGRAM_DIFF, XDF_DIFF_ALGORITHM_MASK),
+			       diff_opt_diff_algorithm_no_arg),
+		OPT_CALLBACK_F(0, "histogram", options, NULL,
+			       N_("generate diff using the \"histogram diff\" algorithm"),
+			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
+			       diff_opt_diff_algorithm_no_arg),
 		OPT_CALLBACK_F(0, "diff-algorithm", options, N_("<algorithm>"),
 			       N_("choose a diff algorithm"),
 			       PARSE_OPT_NONEG, diff_opt_diff_algorithm),
-- 
gitgitgadget


^ permalink raw reply related	[flat|nested] 78+ messages in thread

* [PATCH v2 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-14 21:40 ` [PATCH v2 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
  2023-02-14 21:40   ` [PATCH v2 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
@ 2023-02-14 21:40   ` John Cai via GitGitGadget
  2023-02-15  2:56     ` Junio C Hamano
  2023-02-17 20:21   ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
  2 siblings, 1 reply; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-14 21:40 UTC (permalink / raw)
  To: git
  Cc: Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai, John Cai

From: John Cai <johncai86@gmail.com>

It can be useful to specify diff algorithms per file type. For example,
one may want to use the minimal diff algorithm for .json files, another
for .c files, etc.

Teach the diff machinery to check attributes for a diff driver. Also
teach the diff driver parser a new type "algorithm" to look for in the
config, which will be used if a driver has been specified through the
attributes.

Enforce precedence of diff algorithm by favoring the command line option,
then looking at the driver attributes & config combination, then finally
the diff.algorithm config.

To enforce precedence order, use the `xdl_opts_command_line` member
during options pasing to indicate the diff algorithm was set via command
line args.

Signed-off-by: John Cai <johncai86@gmail.com>
---
 Documentation/gitattributes.txt | 41 ++++++++++++++++++++++++++++++++-
 diff.c                          | 25 +++++++++++++-------
 diff.h                          |  2 ++
 t/lib-diff-alternative.sh       | 38 +++++++++++++++++++++++++++++-
 userdiff.c                      |  4 +++-
 userdiff.h                      |  1 +
 6 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
index c19e64ea0ef..7e69f509d0a 100644
--- a/Documentation/gitattributes.txt
+++ b/Documentation/gitattributes.txt
@@ -736,7 +736,6 @@ String::
 	by the configuration variables in the "diff.foo" section of the
 	Git config file.
 
-
 Defining an external diff driver
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -758,6 +757,46 @@ with the above configuration, i.e. `j-c-diff`, with 7
 parameters, just like `GIT_EXTERNAL_DIFF` program is called.
 See linkgit:git[1] for details.
 
+Setting the internal diff algorithm
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The diff algorithm can be set through the `diff.algorithm` config key, but
+sometimes it may be helpful to set the diff algorithm by path. For example, one
+might wish to set a diff algorithm automatically for all `.json` files such that
+the user would not need to pass in a separate command line `--diff-algorithm` flag each
+time.
+
+First, in `.gitattributes`, you would assign the `diff` attribute for paths.
+
+*Git attributes*
+------------------------
+*.json diff=<name>
+------------------------
+
+Then, you would define a "diff.<name>.algorithm" configuration to specify the
+diff algorithm, choosing from `meyers`, `patience`, `minimal`, and `histogram`.
+
+*Git config*
+
+----------------------------------------------------------------
+[diff "<name>"]
+  algorithm = histogram
+----------------------------------------------------------------
+
+This diff algorithm applies to git-diff(1), including the `--stat` output.
+
+NOTE: If the `command` key also exists, then Git will treat this as an external
+diff and attempt to use the value set for `command` as an external program. For
+instance, the following config, combined with the above `.gitattributes` file,
+will result in `command` favored over `algorithm`.
+
+*Git config*
+
+----------------------------------------------------------------
+[diff "<name>"]
+  command = j-c-diff
+  algorithm = histogram
+----------------------------------------------------------------
 
 Defining a custom hunk-header
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/diff.c b/diff.c
index 92a0eab942e..24da439e56f 100644
--- a/diff.c
+++ b/diff.c
@@ -4456,15 +4456,11 @@ static void run_diff_cmd(const char *pgm,
 	const char *xfrm_msg = NULL;
 	int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
 	int must_show_header = 0;
+	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, attr_path);
 
-
-	if (o->flags.allow_external) {
-		struct userdiff_driver *drv;
-
-		drv = userdiff_find_by_path(o->repo->index, attr_path);
+	if (o->flags.allow_external)
 		if (drv && drv->external)
 			pgm = drv->external;
-	}
 
 	if (msg) {
 		/*
@@ -4481,12 +4477,17 @@ static void run_diff_cmd(const char *pgm,
 		run_external_diff(pgm, name, other, one, two, xfrm_msg, o);
 		return;
 	}
-	if (one && two)
+	if (one && two) {
+		if (!o->xdl_opts_command_line)
+			if (drv && drv->algorithm)
+				set_diff_algorithm(o, drv->algorithm);
+
 		builtin_diff(name, other ? other : name,
 			     one, two, xfrm_msg, must_show_header,
 			     o, complete_rewrite);
-	else
+	} else {
 		fprintf(o->file, "* Unmerged path %s\n", name);
+	}
 }
 
 static void diff_fill_oid_info(struct diff_filespec *one, struct index_state *istate)
@@ -4583,6 +4584,10 @@ static void run_diffstat(struct diff_filepair *p, struct diff_options *o,
 	const char *name;
 	const char *other;
 
+	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
+	if (drv && drv->algorithm)
+		set_diff_algorithm(o, drv->algorithm);
+
 	if (DIFF_PAIR_UNMERGED(p)) {
 		/* unmerged */
 		builtin_diffstat(p->one->path, NULL, NULL, NULL,
@@ -5130,6 +5135,8 @@ static int diff_opt_diff_algorithm(const struct option *opt,
 		return error(_("option diff-algorithm accepts \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\""));
 
+	options->xdl_opts_command_line = 1;
+
 	return 0;
 }
 
@@ -5157,6 +5164,8 @@ static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
 		BUG("available diff algorithms include \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\"");
 
+	options->xdl_opts_command_line = 1;
+
 	return 0;
 }
 
diff --git a/diff.h b/diff.h
index 41eb2c3d428..46b565abfd4 100644
--- a/diff.h
+++ b/diff.h
@@ -333,6 +333,8 @@ struct diff_options {
 	int prefix_length;
 	const char *stat_sep;
 	int xdl_opts;
+	/* If xdl_opts has been set via the command line. */
+	int xdl_opts_command_line;
 
 	/* see Documentation/diff-options.txt */
 	char **anchors;
diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
index 8d1e408bb58..2dc02bca873 100644
--- a/t/lib-diff-alternative.sh
+++ b/t/lib-diff-alternative.sh
@@ -105,10 +105,46 @@ index $file1..$file2 100644
  }
 EOF
 
+	cat >expect_diffstat <<EOF
+ file1 => file2 | 21 ++++++++++-----------
+ 1 file changed, 10 insertions(+), 11 deletions(-)
+EOF
+
 	STRATEGY=$1
 
+	test_expect_success "$STRATEGY diff from attributes" '
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm "$STRATEGY" &&
+		test_must_fail git diff --no-index file1 file2 > output &&
+		cat expect &&
+		cat output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff from attributes has valid diffstat" '
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm "$STRATEGY" &&
+		test_must_fail git diff --stat --no-index file1 file2 > output &&
+		test_cmp expect_diffstat output
+	'
+
 	test_expect_success "$STRATEGY diff" '
-		test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
+		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff command line precedence before attributes" '
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm meyers &&
+		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff attributes precedence before config" '
+		git config diff.algorithm default &&
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm "$STRATEGY" &&
+		test_must_fail git diff --no-index file1 file2 > output &&
 		test_cmp expect output
 	'
 
diff --git a/userdiff.c b/userdiff.c
index d71b82feb74..ff25cfc4b4c 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -293,7 +293,7 @@ PATTERNS("scheme",
 	 "|([^][)(}{[ \t])+"),
 PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
 	 "\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
-{ "default", NULL, -1, { NULL, 0 } },
+{ "default", NULL, NULL, -1, { NULL, 0 } },
 };
 #undef PATTERNS
 #undef IPATTERN
@@ -394,6 +394,8 @@ int userdiff_config(const char *k, const char *v)
 		return parse_bool(&drv->textconv_want_cache, k, v);
 	if (!strcmp(type, "wordregex"))
 		return git_config_string(&drv->word_regex, k, v);
+	if (!strcmp(type, "algorithm"))
+		return git_config_string(&drv->algorithm, k, v);
 
 	return 0;
 }
diff --git a/userdiff.h b/userdiff.h
index aee91bc77e6..24419db6973 100644
--- a/userdiff.h
+++ b/userdiff.h
@@ -14,6 +14,7 @@ struct userdiff_funcname {
 struct userdiff_driver {
 	const char *name;
 	const char *external;
+	const char *algorithm;
 	int binary;
 	struct userdiff_funcname funcname;
 	const char *word_regex;
-- 
gitgitgadget

^ permalink raw reply related	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-11  1:59               ` Jeff King
@ 2023-02-15  2:35                 ` Elijah Newren
  2023-02-15  4:21                   ` Jeff King
  2023-02-15 14:44                 ` Phillip Wood
  1 sibling, 1 reply; 78+ messages in thread
From: Elijah Newren @ 2023-02-15  2:35 UTC (permalink / raw)
  To: Jeff King
  Cc: phillip.wood, John Cai, Ævar Arnfjörð Bjarmason,
	John Cai via GitGitGadget, git, Patrick Steinhardt

On Fri, Feb 10, 2023 at 5:59 PM Jeff King <peff@peff.net> wrote:
>
> On Thu, Feb 09, 2023 at 02:44:15PM +0000, Phillip Wood wrote:
>
> > To see the differences between the output of patience and histogram
> > algorithms I diffed the output of "git log -p --no-merges
> > --diff-algorithm=patience" and "git log -p --no-merges
> > --diff-algorithm=histogram". The first three differences are
> >
> > - 6c065f72b8 (http: support CURLOPT_PROTOCOLS_STR, 2023-01-16)
> >   In get_curl_allowed_protocols() the patience algorithm shows the
> >   change in the return statement more clearly
> >
> > - 47cfc9bd7d (attr: add flag `--source` to work with tree-ish, 2023-01-14)
> >    The histogram algorithm shows read_attr_from_index() being moved
> >    whereas the patience algorithm does not making the diff easier to
> >    follow.
> >
> > - b0226007f0 (fsmonitor: eliminate call to deprecated FSEventStream
> > function, 2022-12-14)
> >   In fsm_listen__stop_async() the histogram algorithm shows
> >   data->shutdown_style = SHUTDOWN_EVENT;
> >   being moved, which is not as clear as the patience output which
> >   shows it as a context line.
>
> Just a small counter-point, since I happened to be looking at myers vs
> patience for something elsewhere in the thread, but:
>
>   git show 35bd13fcd2caa4185bf3729655ca20b6a5fe9b6f builtin/add.c

"fatal: bad object 35bd13fcd2caa4185bf3729655ca20b6a5fe9b6f"

Is that a local commit of yours?

> looks slightly better to me with myers, even though it is 2 lines
> longer. The issue is that patience and histogram are very eager to use
> blank lines as anchor points, so a diff like:
>
>   -some words
>   -
>   -and some more
>   +unrelated content
>   +
>   +but it happens to also be two paragraphs
>
> in myers becomes:
>
>   -some words
>   +unrelated content
>
>   -and some more
>   +but it happens to also be two paragraphs
>
> in patience (here I'm using single lines, but in practice these may be
> paragraphs, or stanzas of code). I think that's also the _strength_ of
> patience in many cases, but it really depends on the content. Replacing
> a multi-stanza block with another one may be the best explanation for
> what happened. Or the two stanzas may be independent, and showing the
> change for each one may be better.
>
> I'm not sure which one happens more often. And you'd probably want to
> weight it by how good/bad the change is. In the example I showed I don't
> find patience very much worse, since it's already a pretty ugly diff.
> But in cases where patience shines, it may be making things
> significantly more readable.
>
> I don't have a super strong opinion, but I just wanted to chime in that
> it is not clear to me that patience/histogram is always a win over myers
> (yes, I know your examples were comparing patience vs histogram, but the
> larger thread is discussing the other).

Oh, I agree histogram is not always a win over myers.  I just feel it
is the majority of the time.  But if you want more than "feels",
here's some solid data to back that up...

I found a study on the subject over at
https://link.springer.com/article/10.1007/s10664-019-09772-z.  They
were particularly interested in whether other academic studies could
have been affected by git's different diff algorithms, and came away
with the answer that it did.  They looked at a few hundred thousand
commits across two dozen different repositories and found (note that
they only looked at myers and histogram, ignoring patience and
minimal):

   * 92.4% - 98.6% of the diffs (depending on repo) are identical
whether you use myers or histogram
   * 93.8% - 99.2% of the diffs (depending on repo) have the same
number of added/deleted lines with myers and histogram
   * Of the >20k diffs that were not the identical, they selected a
random sample of 377 diffs (taking care to make sure they were
statistically representative)
   * They divided the 377 diffs into "code" and "non-code" diffs, i.e.
those modifying source code and those modifying other textual files
   * They had two people annotating the diffs and independently
scoring them, and then checked for agreement between their answers
afterwards.  (No, they didn't always agree, but they did have
substantial agreement.)

For the (again, non-identical) diffs modifying non-code, they found
(see table 11) that:
   * 14.9% of the myers diffs are better
   * 13.4% of the histogram diffs are better
   * 71.6% of the diffs have equal quality

For the (non-identical) diffs modifying code, they found (again, see
table 11) that:
   * 16.9% of the myers diffs are better
   * 62.6% of the histogram diffs are better
   * 20.6% of the diffs have equal quality

A ratio of 4 to 1 for histogram being better on code diffs is pretty
weighty to me.

It's possible these results would have been even better were it not
for a couple of bugs in the histogram code (ported from the original
in jgit).  Phillip pointed me to a problematic testcase that Stefan
Beller found, and in attempting to fix it (I'm on fix #4 or so), I
believe I found another issue.  However, I don't want to go into too
much detail yet, as I found problems with some of my previous fixes
and already invalidated things I told Phillip just last week.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v2 1/2] diff: consolidate diff algorithm option parsing
  2023-02-14 21:40   ` [PATCH v2 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
@ 2023-02-15  2:38     ` Junio C Hamano
  2023-02-15 23:34       ` John Cai
  0 siblings, 1 reply; 78+ messages in thread
From: Junio C Hamano @ 2023-02-15  2:38 UTC (permalink / raw)
  To: John Cai via GitGitGadget
  Cc: git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai

"John Cai via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: John Cai <johncai86@gmail.com>
>
> The diff option parsing for --minimal, --patience, --histgoram can all
> be consolidated into one function. This is a preparatory step for the
> subsequent commit which teaches diff to keep track of whether or not a
> diff algorithm has been set via the command line.

Everybody other than patience used to be just a bit-op but now
everybody is a callback?

> diff --git a/diff.c b/diff.c
> index 329eebf16a0..92a0eab942e 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -3437,6 +3437,22 @@ static int diff_filepair_is_phoney(struct diff_filespec *one,
>  	return !DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two);
>  }
>  
> +static int set_diff_algorithm(struct diff_options *opts,
> +			      const char *alg)
> +{
> +	long value = parse_algorithm_value(alg);
> +
> +	if (value < 0)
> +		return 1;
> +
> +	/* clear out previous settings */
> +	DIFF_XDL_CLR(opts, NEED_MINIMAL);
> +	opts->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
> +	opts->xdl_opts |= value;
> +
> +	return 0;
> +}

The above is a faithful copy of diff_opt_diff_algorithm(), except
that it returns 1 (not -1) on failure, which is unexpected in this
codebase, and should be corrected if this patch gets rerolled.

>  static void builtin_diff(const char *name_a,
>  			 const char *name_b,
>  			 struct diff_filespec *one,
> @@ -5107,17 +5123,40 @@ static int diff_opt_diff_algorithm(const struct option *opt,
>  				   const char *arg, int unset)
>  {
>  	struct diff_options *options = opt->value;
> -	long value = parse_algorithm_value(arg);
>  
>  	BUG_ON_OPT_NEG(unset);
> -	if (value < 0)
> +
> +	if (set_diff_algorithm(options, arg))
>  		return error(_("option diff-algorithm accepts \"myers\", "
>  			       "\"minimal\", \"patience\" and \"histogram\""));
>  
> -	/* clear out previous settings */
> -	DIFF_XDL_CLR(options, NEED_MINIMAL);
> -	options->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
> -	options->xdl_opts |= value;
> +	return 0;
> +}

This version of diff_opt_diff_algorithm() behaves identically from
the version before this patch, which is excellent.

> +static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
> +				   const char *arg, int unset)
> +{
> +	struct diff_options *options = opt->value;
> +
> +	BUG_ON_OPT_NEG(unset);
> +	BUG_ON_OPT_ARG(arg);
> +
> +	if (!strcmp(opt->long_name, "patience")) {
> +		size_t i;
> +		/*
> +		 * Both --patience and --anchored use PATIENCE_DIFF
> +		 * internally, so remove any anchors previously
> +		 * specified.
> +		 */
> +		for (i = 0; i < options->anchors_nr; i++)
> +			free(options->anchors[i]);
> +		options->anchors_nr = 0;
> +	}
> +
> +	if (set_diff_algorithm(options, opt->long_name))
> +		BUG("available diff algorithms include \"myers\", "
> +			       "\"minimal\", \"patience\" and \"histogram\"");
> +
>  	return 0;
>  }

Calling this instead of diff_opt_patience() would make "--patience"
parsed identically as before without this patch, which is excellent.

> @@ -5562,9 +5581,10 @@ struct option *add_diff_options(const struct option *opts,
>  			    N_("prevent rename/copy detection if the number of rename/copy targets exceeds given limit")),
>  
>  		OPT_GROUP(N_("Diff algorithm options")),
> -		OPT_BIT(0, "minimal", &options->xdl_opts,
> -			N_("produce the smallest possible diff"),
> -			XDF_NEED_MINIMAL),
> +		OPT_CALLBACK_F(0, "minimal", options, NULL,
> +			       N_("produce the smallest possible diff"),
> +			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
> +			       diff_opt_diff_algorithm_no_arg),

I offhand cannot say that these two are equivalent, even though they
ought to be (otherwise this patch would break things).  The callback
seems to do much more than just a simple "flip the NEED_MINIMAL bit
on".

> -		OPT_BITOP(0, "histogram", &options->xdl_opts,
> -			  N_("generate diff using the \"histogram diff\" algorithm"),
> -			  XDF_HISTOGRAM_DIFF, XDF_DIFF_ALGORITHM_MASK),
> +			       diff_opt_diff_algorithm_no_arg),
> +		OPT_CALLBACK_F(0, "histogram", options, NULL,
> +			       N_("generate diff using the \"histogram diff\" algorithm"),
> +			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
> +			       diff_opt_diff_algorithm_no_arg),

Likewise.

By nature, patience (and anchored) needs to do much more than
everybody else, so it almost feels that it is OK (and preferable,
even) to leave it a special case to make the distinction stand out.
Consolidating everybody else who are much simpler to share the
more complex callback does not look like a good change to me, at
least at the first glance.

Thanks.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v2 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-14 21:40   ` [PATCH v2 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
@ 2023-02-15  2:56     ` Junio C Hamano
  2023-02-15  3:20       ` Junio C Hamano
  0 siblings, 1 reply; 78+ messages in thread
From: Junio C Hamano @ 2023-02-15  2:56 UTC (permalink / raw)
  To: John Cai via GitGitGadget
  Cc: git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai

"John Cai via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: John Cai <johncai86@gmail.com>
>
> It can be useful to specify diff algorithms per file type. For example,
> one may want to use the minimal diff algorithm for .json files, another
> for .c files, etc.
>
> Teach the diff machinery to check attributes for a diff driver. Also
> teach the diff driver parser a new type "algorithm" to look for in the
> config, which will be used if a driver has been specified through the
> attributes.
>
> Enforce precedence of diff algorithm by favoring the command line option,
> then looking at the driver attributes & config combination, then finally
> the diff.algorithm config.
>
> To enforce precedence order, use the `xdl_opts_command_line` member
> during options pasing to indicate the diff algorithm was set via command
> line args.
>
> Signed-off-by: John Cai <johncai86@gmail.com>
> ---
>  Documentation/gitattributes.txt | 41 ++++++++++++++++++++++++++++++++-
>  diff.c                          | 25 +++++++++++++-------
>  diff.h                          |  2 ++
>  t/lib-diff-alternative.sh       | 38 +++++++++++++++++++++++++++++-
>  userdiff.c                      |  4 +++-
>  userdiff.h                      |  1 +
>  6 files changed, 100 insertions(+), 11 deletions(-)
>
> diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
> index c19e64ea0ef..7e69f509d0a 100644
> --- a/Documentation/gitattributes.txt
> +++ b/Documentation/gitattributes.txt
> @@ -736,7 +736,6 @@ String::
>  	by the configuration variables in the "diff.foo" section of the
>  	Git config file.
>  
> -
>  Defining an external diff driver
>  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Unrelated change?  Wider paragraph gap between two sections than
other inter-paragraph gaps inside a single section is what original
had, and I think that is a reasonable thing to keep.

> @@ -758,6 +757,46 @@ with the above configuration, i.e. `j-c-diff`, with 7
>  parameters, just like `GIT_EXTERNAL_DIFF` program is called.
>  See linkgit:git[1] for details.

In other words, this new section wants another blank line before to match.

>  
> +Setting the internal diff algorithm
> +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> +
> +The diff algorithm can be set through the `diff.algorithm` config key, but
> +sometimes it may be helpful to set the diff algorithm by path. For example, one
> +might wish to set a diff algorithm automatically for all `.json` files such that
> +the user would not need to pass in a separate command line `--diff-algorithm` flag each
> +time.

That's an overly wide paragraph.

> +
> +First, in `.gitattributes`, you would assign the `diff` attribute for paths.
> +
> +*Git attributes*

Discard this line (mimic an existing section, like "Defining a
custom hunk-header").

> +------------------------
> +*.json diff=<name>
> +------------------------
> +
> +Then, you would define a "diff.<name>.algorithm" configuration to specify the
> +diff algorithm, choosing from `meyers`, `patience`, `minimal`, and `histogram`.
> +
> +*Git config*

Likewise, discard this line (I won't repeat but the next hunk has
the same issue).

> diff --git a/diff.c b/diff.c
> index 92a0eab942e..24da439e56f 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -4456,15 +4456,11 @@ static void run_diff_cmd(const char *pgm,
>  	const char *xfrm_msg = NULL;
>  	int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
>  	int must_show_header = 0;
> +	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, attr_path);

Do we run this look-up unconditionally, even when .allow_external
bit is not set?  Why?

> -
> -	if (o->flags.allow_external) {
> -		struct userdiff_driver *drv;
> -
> -		drv = userdiff_find_by_path(o->repo->index, attr_path);
> +	if (o->flags.allow_external)
>  		if (drv && drv->external)
>  			pgm = drv->external;
> -	}
>  
>  	if (msg) {
>  		/*
> @@ -4481,12 +4477,17 @@ static void run_diff_cmd(const char *pgm,
>  		run_external_diff(pgm, name, other, one, two, xfrm_msg, o);
>  		return;
>  	}
> -	if (one && two)
> +	if (one && two) {
> +		if (!o->xdl_opts_command_line)
> +			if (drv && drv->algorithm)
> +				set_diff_algorithm(o, drv->algorithm);

The idea here seems to be "if there is no explicit instruction, and
if the diff driver specifies an algorithm, then use that one", which
is very straightforward and sensible.  Can we reliably tell if we
had an explicit instruction to override the driver?  That should
probably appear in other parts of the code, I guess.

>  		builtin_diff(name, other ? other : name,
>  			     one, two, xfrm_msg, must_show_header,
>  			     o, complete_rewrite);
> -	else
> +	} else {
>  		fprintf(o->file, "* Unmerged path %s\n", name);
> +	}
>  }



> @@ -4583,6 +4584,10 @@ static void run_diffstat(struct diff_filepair *p, struct diff_options *o,
>  	const char *name;
>  	const char *other;
>  
> +	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
> +	if (drv && drv->algorithm)
> +		set_diff_algorithm(o, drv->algorithm);

Interesting.  Does external diff play a role, like in run_diff_cmd()
we saw earlier?

> @@ -5130,6 +5135,8 @@ static int diff_opt_diff_algorithm(const struct option *opt,
>  		return error(_("option diff-algorithm accepts \"myers\", "
>  			       "\"minimal\", \"patience\" and \"histogram\""));
>  
> +	options->xdl_opts_command_line = 1;

OK, calling this member "xdl_" anything is highly misleading, as it
has nothing to do with the xdiff machinery.  How about calling it
after what it does, i.e. allowing the attribute driven diff driver
to specify the algorithm?  options.ignore_driver_algorithm or
something?  The options coming _from_ the command line may happen to
be the condition to trigger this behaviour in this current
implementation, but it does not have to stay that way forever.
Losing "command line" from the name of the flag would make it
clearer what is essential (i.e. this controls if the diff driver is
allowed to affect the choice of the algorithm) and what is not (i.e.
we happen to let it decided based on the presence or absense of
command line choice).

Thanks.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v2 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-15  2:56     ` Junio C Hamano
@ 2023-02-15  3:20       ` Junio C Hamano
  2023-02-16 20:37         ` John Cai
  0 siblings, 1 reply; 78+ messages in thread
From: Junio C Hamano @ 2023-02-15  3:20 UTC (permalink / raw)
  To: John Cai via GitGitGadget
  Cc: git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai

Junio C Hamano <gitster@pobox.com> writes:

>> diff --git a/diff.c b/diff.c
>> index 92a0eab942e..24da439e56f 100644
>> --- a/diff.c
>> +++ b/diff.c
>> @@ -4456,15 +4456,11 @@ static void run_diff_cmd(const char *pgm,
>>  	const char *xfrm_msg = NULL;
>>  	int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
>>  	int must_show_header = 0;
>> +	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, attr_path);
>
> Do we run this look-up unconditionally, even when .allow_external
> bit is not set?  Why?

Ah, this is perfectly fine.  It used to be that this codepath can
tell that there is no need to check the diff driver when it is told
never to use any external diff driver.  Now, even when it is computing
the diff internally, it needs to check the diff driver to find out
the favoured algorithm for the path.

Strictly speaking, if we are told NOT to use external diff driver,
and if we are told NOT to pay attention to algorithm given by the
diff driver, then we know we can skip the overhead of attribute
look-up.  I.e. we could do this to avoid attribute look-up:

	struct userdiff_driver *drv = NULL;

	if (o->flags.allow_external || !o->ignore_driver_algorithm)
		drv = userdiff_find_by_path(...);

	if (drv && o->flags.allow_external && drv->external)
		pgm = drv->external;
	...
	if (pgm)
		... do the external diff thing ...
	if (one && two) {
		if (drv && !o->ignore_driver_algorithm && drv->algorithm)
			set_diff_algo(...)

I was not sure if it would be worth it before writing the above
down, but the resulting flow does not look _too_ bad.

>> @@ -4583,6 +4584,10 @@ static void run_diffstat(struct diff_filepair *p, struct diff_options *o,
>>  	const char *name;
>>  	const char *other;
>>  
>> +	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
>> +	if (drv && drv->algorithm)
>> +		set_diff_algorithm(o, drv->algorithm);
>
> Interesting.  Does external diff play a role, like in run_diff_cmd()
> we saw earlier?

As whoever wrote "diffstat" did not think of counting output from
external diff driver, of course in this codepath external diff would
not appear.  So what we see is very much expected.

Just move the blank line we see before these new lines one line
down, so that the variable decls are grouped together, with a blank
line before the first executable statement.  I.e.

	const char *name;
	const char *other;
+       struct userdiff_driver *drv;
+
+	drv = userdiff_find_by_path(...);
+	if (drv && drv->algorithm)
+		set_diff_algorithm(o, drv->algorithm);

Shouldn't this function refrain from setting algorithm from the
driver when the algorithm was given elsewhere?  E.g.

	$ git show --histogram --stat
	
or something?  IOW, shouldn't it also pay attention to
o->ignore_driver_algorithm bit, just like run_diff_cmd() did?



^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-14 21:16         ` John Cai
@ 2023-02-15  3:41           ` Elijah Newren
  0 siblings, 0 replies; 78+ messages in thread
From: Elijah Newren @ 2023-02-15  3:41 UTC (permalink / raw)
  To: John Cai
  Cc: Ævar Arnfjörð Bjarmason, John Cai via GitGitGadget,
	git

Hi John!

On Tue, Feb 14, 2023 at 1:16 PM John Cai <johncai86@gmail.com> wrote:
> On 9 Feb 2023, at 3:44, Elijah Newren wrote:
> > On Mon, Feb 6, 2023 at 12:47 PM John Cai <johncai86@gmail.com> wrote:
> >>
[...]
> It seems like the performance penalty was because I was adding calls to parse
> attribute files. Piggy backing off of the attribute parsing in userdiff.h will
> allow us to not incur this performance penalty:
>
> $ hyperfine -r 5 -L a bin-wrappers/git,git '{a} diff v2.0.0 v2.28.0'
> Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0
>   Time (mean ± σ):      1.072 s ±  0.289 s    [User: 0.626 s, System: 0.081 s]
>   Range (min … max):    0.772 s …  1.537 s    5 runs
>
> Benchmark 2: git diff v2.0.0 v2.28.0
>   Time (mean ± σ):      1.003 s ±  0.065 s    [User: 0.684 s, System: 0.067 s]
>   Range (min … max):    0.914 s …  1.091 s    5 runs
>
> Summary
>   'git diff v2.0.0 v2.28.0' ran
>     1.07 ± 0.30 times faster than 'git-bin-wrapper diff v2.0.0 v2.28.0'

Yaay!  Much better.  :-)

I'm curious, though, whether you are showing here a 7% slowdown (which
would still be bad), or just that the feature is correctly choosing a
different (but slower) algorithm for some files, or some kind of mix.

What is the performance difference if you have this feature included,
but don't have any directives in .gitattributes selecting a different
diff algorithm for any files?

> > And on a separate note...
> >
> > There's another set of considerations we might need to include here as
> > well that I haven't seen anyone else in this thread talk about:
>
> These are some great questions. I'll do my best to answer them.
> >
> > * When trying to diff files, do we read the .gitattributes file from
> > the current checkout to determine the diff algorithm(s)?  Or the
> > index?  Or the commit we are diffing against?
> > * If we use the current checkout or index, what about bare clones or
> > diffing between two different commits?
> > * If diffing between two different commits, and the .gitattributes has
> > changed between those commits, which .gitattributes file wins?
> > * If diffing between two different commits, and the .gitattributes has
> > NOT changed, BUT a file has been renamed and the old and new names
> > have different rules, which rule wins?
>
> In the next version I plan on using Peff's suggestion of utilizing the existing
> diff driver scheme [1]. I believe these four questions are addressed if we use
> the existing userdiff.h API, which in turn calls the attr.h API. We check the
> worktree, then fallback to the index.

So...it sounds like we're just ignoring all the special cases listed
above, and living with bugs related to them?  That's not a criticism;
in fact, it might be okay -- after all, that's exactly what the
existing .gitattributes handling does and you are just hooking into
it.

I am a bit concerned, though, that we're increasing the visibility of
the interactions of .gitattributes with respect to these kinds of
cases.  I think external drivers are probably much less used than what
your feature might be, so folks are more likely to stumble into these
cases and complain.  Perhaps those cases are rare enough that we don't
care, but it might be at least worth documenting the issues (both to
manage user expectations and to give people a heads up about the
potential issues.)

(Also, it may be worth mentioning that I tend to focus on unusual
cases for anything that might touch merging; Junio once named one of
my patchsets "en/t6042-insane-merge-rename-testcases".  It's possible
I worry about corner cases more than is justified given their real
world likelihood.)

> By using the userdiff.h API, the behavior will match what users already expect
> when they for instance set an external driver.

s/already expect/already get/

The bugs also affect external drivers; I just suspect external drivers
aren't used enough that users have complained very loudly (yet?).

> 1. https://lore.kernel.org/git/Y+KQtqNPews3vBS8@coredump.intra.peff.net/
>
> >
> > * If per-file diff algorithms are adopted widely enough, will we be
> > forced to change the merge algorithm to also pay attention to them?
> > If it does, more complicated rename cases occur and we need rules for
> > how to handle those.
> > * If the merge algorithm has to pay attention to .gitattributes for
> > this too, we'll have even more corner cases around what happens if
> > there are merge conflicts in .gitattributes itself (which is already
> > kind of ugly and kludged)
>
> I see this feature as a user-experience type convenience feature, so I don't
> believe there's need for the merge machinery to also pay attention to the diff
> algorithm set through gitattrbutes. We can clarify this in the documentation.

That would be awesome; *please* do this.  This is my primary concern
with this patchset.

I've spent an awful lot of time dealing with weird corner cases in the
merge machinery, and this appears to open a big can of worms to me.
It'd be a huge relief if we just agreed that the .gitattributes
handling here is only meant for user-facing diffs and will not be
consulted by the merge machinery.

> > Anyway, I know I'm a bit animated and biased in this area, and I
> > apologize if I'm a bit too much so.  Even if I am, hopefully my
> > comments at least provide some useful context.
>
> No problem! thanks for raising these issues.
>
> thanks
> John

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-15  2:35                 ` Elijah Newren
@ 2023-02-15  4:21                   ` Jeff King
  2023-02-15  5:20                     ` Junio C Hamano
  0 siblings, 1 reply; 78+ messages in thread
From: Jeff King @ 2023-02-15  4:21 UTC (permalink / raw)
  To: Elijah Newren
  Cc: phillip.wood, John Cai, Ævar Arnfjörð Bjarmason,
	John Cai via GitGitGadget, git, Patrick Steinhardt

On Tue, Feb 14, 2023 at 06:35:00PM -0800, Elijah Newren wrote:

> > Just a small counter-point, since I happened to be looking at myers vs
> > patience for something elsewhere in the thread, but:
> >
> >   git show 35bd13fcd2caa4185bf3729655ca20b6a5fe9b6f builtin/add.c
> 
> "fatal: bad object 35bd13fcd2caa4185bf3729655ca20b6a5fe9b6f"
> 
> Is that a local commit of yours?

Oh, sorry. It's not my commit, but it may have been something I picked
up off the list. The version from Junio is 20b813d7d3d.

>    * They had two people annotating the diffs and independently
> scoring them, and then checked for agreement between their answers
> afterwards.  (No, they didn't always agree, but they did have
> substantial agreement.)

Kind of a weird methodology. I'd think you'd do better to put those
diffs in front of a lot of people to get a consensus. But OK...

> For the (again, non-identical) diffs modifying non-code, they found
> (see table 11) that:
>    * 14.9% of the myers diffs are better
>    * 13.4% of the histogram diffs are better
>    * 71.6% of the diffs have equal quality
> 
> For the (non-identical) diffs modifying code, they found (again, see
> table 11) that:
>    * 16.9% of the myers diffs are better
>    * 62.6% of the histogram diffs are better
>    * 20.6% of the diffs have equal quality
> 
> A ratio of 4 to 1 for histogram being better on code diffs is pretty
> weighty to me.

Interesting. They do slightly worse on non-code, but probably not enough
to worry about.

Despite my complaint above, I do find those results compelling. Or at
least, it is much closer to being real data than anything I have seen
before, so it seems like the best guide we have. :)

Like I said, I don't have a very strong opinion myself. Mostly I wanted
to note that while it is easy for us to remember the times we saw a
Myers diff, said "yuck", and tried patience and it was better, most of
us may be blind to the opposite case. If patience is not the default,
most of us would not have hit "yuck" case for it at all.

FWIW, I coincidentally hit this case earlier today where patience does a
_much_ better job than myers:

  https://lore.kernel.org/git/Y+vV8Ifkj1QV7KF0@coredump.intra.peff.net/

So don't count me as an objection, but just a curious observer. ;)

-Peff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-15  4:21                   ` Jeff King
@ 2023-02-15  5:20                     ` Junio C Hamano
  0 siblings, 0 replies; 78+ messages in thread
From: Junio C Hamano @ 2023-02-15  5:20 UTC (permalink / raw)
  To: Jeff King
  Cc: Elijah Newren, phillip.wood, John Cai,
	Ævar Arnfjörð Bjarmason, John Cai via GitGitGadget,
	git, Patrick Steinhardt

Jeff King <peff@peff.net> writes:

>> >   git show 35bd13fcd2caa4185bf3729655ca20b6a5fe9b6f builtin/add.c
>> 
>> "fatal: bad object 35bd13fcd2caa4185bf3729655ca20b6a5fe9b6f"
>> 
>> Is that a local commit of yours?
>
> Oh, sorry. It's not my commit, but it may have been something I picked
> up off the list. The version from Junio is 20b813d7d3d.

$ git show -s --notes=amlog --format='%N %s' 20b813d7d3d
Message-Id: <patch-v2-1.3-71c7922b25f-20230206T225639Z-avarab@gmail.com>
 add: remove "add.interactive.useBuiltin" & Perl "git add--interactive"

> FWIW, I coincidentally hit this case earlier today where patience does a
> _much_ better job than myers:
>
>   https://lore.kernel.org/git/Y+vV8Ifkj1QV7KF0@coredump.intra.peff.net/
>
> So don't count me as an objection, but just a curious observer. ;)

Yeah, I know some people consider that Patience is the best thing
since sliced bread, and it does produce more readable diffs often.
I haven't used it often enough to cite/remember a bad case, though.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-11  1:59               ` Jeff King
  2023-02-15  2:35                 ` Elijah Newren
@ 2023-02-15 14:44                 ` Phillip Wood
  2023-02-15 15:00                   ` Jeff King
  1 sibling, 1 reply; 78+ messages in thread
From: Phillip Wood @ 2023-02-15 14:44 UTC (permalink / raw)
  To: Jeff King
  Cc: Elijah Newren, John Cai, Ævar Arnfjörð Bjarmason,
	John Cai via GitGitGadget, git, Patrick Steinhardt

Hi Peff

Thanks for the counter example

On 11/02/2023 01:59, Jeff King wrote:
> Just a small counter-point, since I happened to be looking at myers vs
> patience for something elsewhere in the thread, but:
> 
>    git show 35bd13fcd2caa4185bf3729655ca20b6a5fe9b6f builtin/add.c
> 
> looks slightly better to me with myers, even though it is 2 lines
> longer. The issue is that patience and histogram are very eager to use
> blank lines as anchor points, so a diff like:
> 
>    -some words
>    -
>    -and some more
>    +unrelated content
>    +
>    +but it happens to also be two paragraphs
> 
> in myers becomes:
> 
>    -some words
>    +unrelated content
>    
>    -and some more
>    +but it happens to also be two paragraphs
> 
> in patience (here I'm using single lines, but in practice these may be
> paragraphs, or stanzas of code). I think that's also the _strength_ of
> patience in many cases, but it really depends on the content.

Indeed. Ironically as there are no unique context lines in that example 
the blank lines are being matched by patience implementation falling 
back to the myers algorithm. Normally the myers implementation tries to 
avoid matching common context lines between two blocks of changed lines 
but I think because in this case it is only called on a small part of 
the file the blank lines are not common enough to trigger that 
heuristic. I've got a patch[1] that stops the patience implementation 
falling back to the myers algorithm and just trims any leading and 
trailing context. On the whole it I think it gives more readable diffs 
but I've not got any systematic data to back that up. I also suspect 
there are pathological cases such as each line in the file being 
duplicated where the falling back to the myers algorithm gives a much 
better result.

> Replacing
> a multi-stanza block with another one may be the best explanation for
> what happened. Or the two stanzas may be independent, and showing the
> change for each one may be better.
 >
> I'm not sure which one happens more often. And you'd probably want to
> weight it by how good/bad the change is. In the example I showed I don't
> find patience very much worse, since it's already a pretty ugly diff.
> But in cases where patience shines, it may be making things
> significantly more readable.

I agree that having some data would be useful if we're going to change 
the default but collecting it would entail quite a bit of work and as 
the scoring is subjective we'd want a few people doing it. It's great 
that someone has done that for the histogram algorithm in the paper 
Elijah cited.

> I don't have a super strong opinion, but I just wanted to chime in that
> it is not clear to me that patience/histogram is always a win over myers
> (yes, I know your examples were comparing patience vs histogram, but the
> larger thread is discussing the other).

Agreed, there are definitely cases where myers gives more readable 
diffs, I think if we're going to change the default the question we need 
to answer is which algorithm gives the best result most of the time.

Best Wishes

Phillip

[1] https://github.com/phillipwood/git/commits/pure-patience-diff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-07 17:27         ` Ævar Arnfjörð Bjarmason
@ 2023-02-15 14:47           ` Phillip Wood
  0 siblings, 0 replies; 78+ messages in thread
From: Phillip Wood @ 2023-02-15 14:47 UTC (permalink / raw)
  To: Ævar Arnfjörð Bjarmason
  Cc: John Cai, John Cai via GitGitGadget, git, Patrick Steinhardt

Hi Ævar

On 07/02/2023 17:27, Ævar Arnfjörð Bjarmason wrote:
> 
> On Tue, Feb 07 2023, Phillip Wood wrote:
> 
>> This is slightly off topic but one thing I'd really like is a way to
>> tell diff use automatically use --diff-words on some files
>> (e.g. Documentation/*)
> 
> Unlike changing the algorithm, -U options, diff.orderFile etc. doing
> that would give you a diff that can't be applied with "git apply" or
> other tools that expect a valid unified diff.

That's a good point, we'd probably would want to guard it by checking if 
the output is going to a tty.

> So I can imagine that it would be neat in some contexts, but such a
> change would have much wider implications than options that tweak how a
> valid unified diff looks, or is generated.
> 
> We'd need some way to mark a diff as "for ad-hoc viewing only".
> 
> But as it sounds like you want this for git.git the
> Documentation/doc-diff script is much better than anything word-diff
> could spew out, as it diffs the resulting generated docs.

It's true that I should look at the doc-diff script for git, but it is a 
wider wish for text files in other projects as well.

Best Wishes

Phillip

> I wonder (but haven't tried) whether you can't "diff" that using the
> same method that can be used to diff binary files using a custom driver.
> 
> Hrm, except that in that case (with includes etc) there isn't really a
> 1=1 mapping between files within Documentation/ and generated docs (due
> to includes etc.). But I suppose it could be used only for those files
> that 1=1 correspond to the generated manpages.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-15 14:44                 ` Phillip Wood
@ 2023-02-15 15:00                   ` Jeff King
  0 siblings, 0 replies; 78+ messages in thread
From: Jeff King @ 2023-02-15 15:00 UTC (permalink / raw)
  To: phillip.wood
  Cc: Elijah Newren, John Cai, Ævar Arnfjörð Bjarmason,
	John Cai via GitGitGadget, git, Patrick Steinhardt

On Wed, Feb 15, 2023 at 02:44:59PM +0000, Phillip Wood wrote:

> Indeed. Ironically as there are no unique context lines in that example the
> blank lines are being matched by patience implementation falling back to the
> myers algorithm. Normally the myers implementation tries to avoid matching
> common context lines between two blocks of changed lines but I think because
> in this case it is only called on a small part of the file the blank lines
> are not common enough to trigger that heuristic. I've got a patch[1] that
> stops the patience implementation falling back to the myers algorithm and
> just trims any leading and trailing context. On the whole it I think it
> gives more readable diffs but I've not got any systematic data to back that
> up. I also suspect there are pathological cases such as each line in the
> file being duplicated where the falling back to the myers algorithm gives a
> much better result.

Ah, I should have suspected it was something like that (since one of the
purposes of patience is trying not to key on meaningless lines).

I tried your patch on my test case, and the result is even more readable
than the myers output, because it really was effectively a complete
rewrite of the function. It is, of course, not the minimal diff. I'm not
sure if there would be cases where you'd prefer the minimal. I guess if
each stanza's change really was independent of the others. But if there
is no commonality except for blank lines, I find it hard to imagine that
it's much worse to just treat the whole thing as a block.

Anyway, thank you (and Elijah) for explaining. I'm getting more
comfortable with the idea of switching the default.

-Peff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v2 1/2] diff: consolidate diff algorithm option parsing
  2023-02-15  2:38     ` Junio C Hamano
@ 2023-02-15 23:34       ` John Cai
  2023-02-15 23:42         ` Junio C Hamano
  0 siblings, 1 reply; 78+ messages in thread
From: John Cai @ 2023-02-15 23:34 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren

Hi Junio,

On 14 Feb 2023, at 21:38, Junio C Hamano wrote:

> "John Cai via GitGitGadget" <gitgitgadget@gmail.com> writes:
>
>> From: John Cai <johncai86@gmail.com>
>>
>> The diff option parsing for --minimal, --patience, --histgoram can all
>> be consolidated into one function. This is a preparatory step for the
>> subsequent commit which teaches diff to keep track of whether or not a
>> diff algorithm has been set via the command line.
>
> Everybody other than patience used to be just a bit-op but now
> everybody is a callback?
>
>> diff --git a/diff.c b/diff.c
>> index 329eebf16a0..92a0eab942e 100644
>> --- a/diff.c
>> +++ b/diff.c
>> @@ -3437,6 +3437,22 @@ static int diff_filepair_is_phoney(struct diff_filespec *one,
>>  	return !DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two);
>>  }
>>
>> +static int set_diff_algorithm(struct diff_options *opts,
>> +			      const char *alg)
>> +{
>> +	long value = parse_algorithm_value(alg);
>> +
>> +	if (value < 0)
>> +		return 1;
>> +
>> +	/* clear out previous settings */
>> +	DIFF_XDL_CLR(opts, NEED_MINIMAL);
>> +	opts->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
>> +	opts->xdl_opts |= value;
>> +
>> +	return 0;
>> +}
>
> The above is a faithful copy of diff_opt_diff_algorithm(), except
> that it returns 1 (not -1) on failure, which is unexpected in this
> codebase, and should be corrected if this patch gets rerolled.
>
>>  static void builtin_diff(const char *name_a,
>>  			 const char *name_b,
>>  			 struct diff_filespec *one,
>> @@ -5107,17 +5123,40 @@ static int diff_opt_diff_algorithm(const struct option *opt,
>>  				   const char *arg, int unset)
>>  {
>>  	struct diff_options *options = opt->value;
>> -	long value = parse_algorithm_value(arg);
>>
>>  	BUG_ON_OPT_NEG(unset);
>> -	if (value < 0)
>> +
>> +	if (set_diff_algorithm(options, arg))
>>  		return error(_("option diff-algorithm accepts \"myers\", "
>>  			       "\"minimal\", \"patience\" and \"histogram\""));
>>
>> -	/* clear out previous settings */
>> -	DIFF_XDL_CLR(options, NEED_MINIMAL);
>> -	options->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
>> -	options->xdl_opts |= value;
>> +	return 0;
>> +}
>
> This version of diff_opt_diff_algorithm() behaves identically from
> the version before this patch, which is excellent.
>
>> +static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
>> +				   const char *arg, int unset)
>> +{
>> +	struct diff_options *options = opt->value;
>> +
>> +	BUG_ON_OPT_NEG(unset);
>> +	BUG_ON_OPT_ARG(arg);
>> +
>> +	if (!strcmp(opt->long_name, "patience")) {
>> +		size_t i;
>> +		/*
>> +		 * Both --patience and --anchored use PATIENCE_DIFF
>> +		 * internally, so remove any anchors previously
>> +		 * specified.
>> +		 */
>> +		for (i = 0; i < options->anchors_nr; i++)
>> +			free(options->anchors[i]);
>> +		options->anchors_nr = 0;
>> +	}
>> +
>> +	if (set_diff_algorithm(options, opt->long_name))
>> +		BUG("available diff algorithms include \"myers\", "
>> +			       "\"minimal\", \"patience\" and \"histogram\"");
>> +
>>  	return 0;
>>  }
>
> Calling this instead of diff_opt_patience() would make "--patience"
> parsed identically as before without this patch, which is excellent.
>
>> @@ -5562,9 +5581,10 @@ struct option *add_diff_options(const struct option *opts,
>>  			    N_("prevent rename/copy detection if the number of rename/copy targets exceeds given limit")),
>>
>>  		OPT_GROUP(N_("Diff algorithm options")),
>> -		OPT_BIT(0, "minimal", &options->xdl_opts,
>> -			N_("produce the smallest possible diff"),
>> -			XDF_NEED_MINIMAL),
>> +		OPT_CALLBACK_F(0, "minimal", options, NULL,
>> +			       N_("produce the smallest possible diff"),
>> +			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
>> +			       diff_opt_diff_algorithm_no_arg),
>
> I offhand cannot say that these two are equivalent, even though they
> ought to be (otherwise this patch would break things).  The callback
> seems to do much more than just a simple "flip the NEED_MINIMAL bit
> on".
>
>> -		OPT_BITOP(0, "histogram", &options->xdl_opts,
>> -			  N_("generate diff using the \"histogram diff\" algorithm"),
>> -			  XDF_HISTOGRAM_DIFF, XDF_DIFF_ALGORITHM_MASK),
>> +			       diff_opt_diff_algorithm_no_arg),
>> +		OPT_CALLBACK_F(0, "histogram", options, NULL,
>> +			       N_("generate diff using the \"histogram diff\" algorithm"),
>> +			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
>> +			       diff_opt_diff_algorithm_no_arg),
>
> Likewise.
>
> By nature, patience (and anchored) needs to do much more than
> everybody else, so it almost feels that it is OK (and preferable,
> even) to leave it a special case to make the distinction stand out.
> Consolidating everybody else who are much simpler to share the
> more complex callback does not look like a good change to me, at
> least at the first glance.

Yeah, it's not great to pull things out from a bit flip to a callback but I
needed some way of setting the xdl_opts_command_line member in the next commit
when we know that the diff algorithm was set via options on the command line so
that we can honor the precedence. If there's a different way to do that other
than using callbacks for command options parsing, I'd agree that keeping these
as bit flips would be ideal.

>
> Thanks.

Thanks!
John

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v2 1/2] diff: consolidate diff algorithm option parsing
  2023-02-15 23:34       ` John Cai
@ 2023-02-15 23:42         ` Junio C Hamano
  2023-02-16  2:14           ` Jeff King
  0 siblings, 1 reply; 78+ messages in thread
From: Junio C Hamano @ 2023-02-15 23:42 UTC (permalink / raw)
  To: John Cai
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren

John Cai <johncai86@gmail.com> writes:

> Yeah, it's not great to pull things out from a bit flip to a callback but I
> needed some way of setting the xdl_opts_command_line member in the next commit

If that is the case (and after reading [2/2], of course, readers of
the series can tell that it was the reason), it would be good to be
honest and document _that_ as the reason why you are using callbacks
for all of them in the proposed log message for [1/2].

And I think that is a reasonable way to use callback to do "more
than just setting a bit".  Even in that case, I am not sure if it is
a good idea to share the same callback that has conditional code
that only is relevant to the "patience" case, though.

THanks.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v2 1/2] diff: consolidate diff algorithm option parsing
  2023-02-15 23:42         ` Junio C Hamano
@ 2023-02-16  2:14           ` Jeff King
  2023-02-16  2:57             ` Junio C Hamano
  0 siblings, 1 reply; 78+ messages in thread
From: Jeff King @ 2023-02-16  2:14 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: John Cai, John Cai via GitGitGadget, git, Eric Sunshine,
	Phillip Wood, Ævar Arnfjörð Bjarmason,
	Elijah Newren

On Wed, Feb 15, 2023 at 03:42:12PM -0800, Junio C Hamano wrote:

> And I think that is a reasonable way to use callback to do "more
> than just setting a bit".  Even in that case, I am not sure if it is
> a good idea to share the same callback that has conditional code
> that only is relevant to the "patience" case, though.

I have to admit I did a double-take at the string comparisons with
opt->long. I guess we can rely on it, since it is coming from the
options struct (and is not the string that the user wrote on the command
line!). But it still feels a little error-prone, just because there's no
help from the type system or the compiler.

I'd have expected it with individual callbacks like:

  int handle_patience(...)
  {
	do_patience_specific_stuff();
	do_shared_stuff(PATIENCE_DIFF);
  }

  int handle_histogram(...)
  {
	do_shared_stuff(HISTOGRAM_DIFF);
  }

and so on. That's a bit more verbose, but the call stack reflects the
flow we expect.

I can live with it either way, though.

-Peff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v2 1/2] diff: consolidate diff algorithm option parsing
  2023-02-16  2:14           ` Jeff King
@ 2023-02-16  2:57             ` Junio C Hamano
  2023-02-16 20:34               ` John Cai
  0 siblings, 1 reply; 78+ messages in thread
From: Junio C Hamano @ 2023-02-16  2:57 UTC (permalink / raw)
  To: Jeff King
  Cc: John Cai, John Cai via GitGitGadget, git, Eric Sunshine,
	Phillip Wood, Ævar Arnfjörð Bjarmason,
	Elijah Newren

Jeff King <peff@peff.net> writes:

> I'd have expected it with individual callbacks like:
>
>   int handle_patience(...)
>   {
> 	do_patience_specific_stuff();
> 	do_shared_stuff(PATIENCE_DIFF);
>   }
>
>   int handle_histogram(...)
>   {
> 	do_shared_stuff(HISTOGRAM_DIFF);
>   }
>
> and so on. That's a bit more verbose, but the call stack reflects the
> flow we expect.

Exactly.

Thanks for spelling it out.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v2 1/2] diff: consolidate diff algorithm option parsing
  2023-02-16  2:57             ` Junio C Hamano
@ 2023-02-16 20:34               ` John Cai
  0 siblings, 0 replies; 78+ messages in thread
From: John Cai @ 2023-02-16 20:34 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Jeff King, John Cai via GitGitGadget, git, Eric Sunshine,
	Phillip Wood, Ævar Arnfjörð Bjarmason,
	Elijah Newren

On 23/02/15 06:57PM, Junio C Hamano wrote:
> Jeff King <peff@peff.net> writes:
> 
> > I'd have expected it with individual callbacks like:
> >
> >   int handle_patience(...)
> >   {
> > 	do_patience_specific_stuff();
> > 	do_shared_stuff(PATIENCE_DIFF);
> >   }
> >
> >   int handle_histogram(...)
> >   {
> > 	do_shared_stuff(HISTOGRAM_DIFF);
> >   }
> >
> > and so on. That's a bit more verbose, but the call stack reflects the
> > flow we expect.
> 
> Exactly.
> 
> Thanks for spelling it out.

Thanks for this suggestion--makes sense to me. Will incorporate in the next
version.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v2 2/2] diff: teach diff to read gitattribute diff-algorithm
  2023-02-15  3:20       ` Junio C Hamano
@ 2023-02-16 20:37         ` John Cai
  0 siblings, 0 replies; 78+ messages in thread
From: John Cai @ 2023-02-16 20:37 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren

On 23/02/14 07:20PM, Junio C Hamano wrote:
> Junio C Hamano <gitster@pobox.com> writes:
> 
> >> diff --git a/diff.c b/diff.c
> >> index 92a0eab942e..24da439e56f 100644
> >> --- a/diff.c
> >> +++ b/diff.c
> >> @@ -4456,15 +4456,11 @@ static void run_diff_cmd(const char *pgm,
> >>  	const char *xfrm_msg = NULL;
> >>  	int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
> >>  	int must_show_header = 0;
> >> +	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, attr_path);
> >
> > Do we run this look-up unconditionally, even when .allow_external
> > bit is not set?  Why?
> 
> Ah, this is perfectly fine.  It used to be that this codepath can
> tell that there is no need to check the diff driver when it is told
> never to use any external diff driver.  Now, even when it is computing
> the diff internally, it needs to check the diff driver to find out
> the favoured algorithm for the path.
> 
> Strictly speaking, if we are told NOT to use external diff driver,
> and if we are told NOT to pay attention to algorithm given by the
> diff driver, then we know we can skip the overhead of attribute
> look-up.  I.e. we could do this to avoid attribute look-up:
> 
> 	struct userdiff_driver *drv = NULL;
> 
> 	if (o->flags.allow_external || !o->ignore_driver_algorithm)
> 		drv = userdiff_find_by_path(...);
> 
> 	if (drv && o->flags.allow_external && drv->external)
> 		pgm = drv->external;
> 	...
> 	if (pgm)
> 		... do the external diff thing ...
> 	if (one && two) {
> 		if (drv && !o->ignore_driver_algorithm && drv->algorithm)
> 			set_diff_algo(...)
> 
> I was not sure if it would be worth it before writing the above
> down, but the resulting flow does not look _too_ bad.

Yes I think it's worth it to save on execution if we know we are not using
external diff algorithm.

> 
> >> @@ -4583,6 +4584,10 @@ static void run_diffstat(struct diff_filepair *p, struct diff_options *o,
> >>  	const char *name;
> >>  	const char *other;
> >>  
> >> +	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
> >> +	if (drv && drv->algorithm)
> >> +		set_diff_algorithm(o, drv->algorithm);
> >
> > Interesting.  Does external diff play a role, like in run_diff_cmd()
> > we saw earlier?
> 
> As whoever wrote "diffstat" did not think of counting output from
> external diff driver, of course in this codepath external diff would
> not appear.  So what we see is very much expected.
> 
> Just move the blank line we see before these new lines one line
> down, so that the variable decls are grouped together, with a blank
> line before the first executable statement.  I.e.
> 
> 	const char *name;
> 	const char *other;
> +       struct userdiff_driver *drv;
> +
> +	drv = userdiff_find_by_path(...);
> +	if (drv && drv->algorithm)
> +		set_diff_algorithm(o, drv->algorithm);

makes sense, thanks.

> 
> Shouldn't this function refrain from setting algorithm from the
> driver when the algorithm was given elsewhere?  E.g.
> 
> 	$ git show --histogram --stat
> 	
> or something?  IOW, shouldn't it also pay attention to
> o->ignore_driver_algorithm bit, just like run_diff_cmd() did?

Yes we should add the same guard as above.

> 
> 

^ permalink raw reply	[flat|nested] 78+ messages in thread

* [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes
  2023-02-14 21:40 ` [PATCH v2 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
  2023-02-14 21:40   ` [PATCH v2 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
  2023-02-14 21:40   ` [PATCH v2 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
@ 2023-02-17 20:21   ` John Cai via GitGitGadget
  2023-02-17 20:21     ` [PATCH v3 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
                       ` (3 more replies)
  2 siblings, 4 replies; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-17 20:21 UTC (permalink / raw)
  To: git
  Cc: Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai

When a repository contains different kinds of files, it may be desirable to
use different algorithms based on file type. This is currently not feasible
through the command line or using git configs. However, we can leverage the
fact that gitattributes are path aware.

Teach the diff machinery to check gitattributes when diffing files by using
the existing diff. scheme, and add an "algorithm" type to the external
driver config.

Changes since V2:

 * minor clean up and variable renaming
 * avoid parsing attribute files for the driver if the diff algorithm is set
   through the command line

Changes since V1:

 * utilize the existing diff.<driver>.* scheme where the driver is defined
   in gitattributes, but the algorithm is defined in the gitconfig.

To address some of the performance concerns in the previous series, a
benchmark shows that a performance penalty is no longer incurred, now that
we are no longer adding an additional attributes parsing call:

$ hyperfine -r 5 -L a bin-wrappers/git,git '{a} diff v2.0.0 v2.28.0'
Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0 Time (mean ± σ): 1.072 s ±
0.289 s [User: 0.626 s, System: 0.081 s] Range (min … max): 0.772 s … 1.537
s 5 runs

Benchmark 2: git diff v2.0.0 v2.28.0 Time (mean ± σ): 1.003 s ± 0.065 s
[User: 0.684 s, System: 0.067 s] Range (min … max): 0.914 s … 1.091 s 5 runs

Summary 'git diff v2.0.0 v2.28.0' ran 1.07 ± 0.30 times faster than
'git-bin-wrapper diff v2.0.0 v2.28.0'

John Cai (2):
  diff: consolidate diff algorithm option parsing
  diff: teach diff to read algorithm from diff driver

 Documentation/gitattributes.txt | 37 ++++++++++++++
 diff.c                          | 90 ++++++++++++++++++++++++---------
 diff.h                          |  1 +
 t/lib-diff-alternative.sh       | 38 +++++++++++++-
 userdiff.c                      |  4 +-
 userdiff.h                      |  1 +
 6 files changed, 146 insertions(+), 25 deletions(-)


base-commit: c867e4fa180bec4750e9b54eb10f459030dbebfd
Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-git-1452%2Fjohn-cai%2Fjc%2Fattr-diff-algo-v3
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-git-1452/john-cai/jc/attr-diff-algo-v3
Pull-Request: https://github.com/git/git/pull/1452

Range-diff vs v2:

 1:  0c5e1fc6c26 ! 1:  816c47aa414 diff: consolidate diff algorithm option parsing
     @@ Metadata
       ## Commit message ##
          diff: consolidate diff algorithm option parsing
      
     -    The diff option parsing for --minimal, --patience, --histgoram can all
     -    be consolidated into one function. This is a preparatory step for the
     -    subsequent commit which teaches diff to keep track of whether or not a
     -    diff algorithm has been set via the command line.
     +    A subsequent commit will need the ability to tell if the diff algorithm
     +    was set through the command line through setting a new member of
     +    diff_options. While this logic can be added to the
     +    diff_opt_diff_algorithm() callback, the `--minimal` and `--histogram`
     +    options are handled via OPT_BIT without a callback.
      
     -    Additionally, the logic that sets the diff algorithm in
     -    diff_opt_diff_algorithm() can  be refactored into a helper that will
     +    Remedy this by consolidating the options parsing logic for --minimal and
     +    --histogram into one callback. This way we can modify `diff_options` in
     +    that function.
     +
     +    As an additional refactor, the logic that sets the diff algorithm in
     +    diff_opt_diff_algorithm() can be refactored into a helper that will
          allow multiple callsites to set the diff algorithm.
      
          Signed-off-by: John Cai <johncai86@gmail.com>
     @@ diff.c: static int diff_filepair_is_phoney(struct diff_filespec *one,
      +	long value = parse_algorithm_value(alg);
      +
      +	if (value < 0)
     -+		return 1;
     ++		return -1;
      +
      +	/* clear out previous settings */
      +	DIFF_XDL_CLR(opts, NEED_MINIMAL);
     @@ diff.c: static int diff_opt_diff_algorithm(const struct option *opt,
      +	BUG_ON_OPT_NEG(unset);
      +	BUG_ON_OPT_ARG(arg);
      +
     -+	if (!strcmp(opt->long_name, "patience")) {
     -+		size_t i;
     -+		/*
     -+		 * Both --patience and --anchored use PATIENCE_DIFF
     -+		 * internally, so remove any anchors previously
     -+		 * specified.
     -+		 */
     -+		for (i = 0; i < options->anchors_nr; i++)
     -+			free(options->anchors[i]);
     -+		options->anchors_nr = 0;
     -+	}
     -+
      +	if (set_diff_algorithm(options, opt->long_name))
      +		BUG("available diff algorithms include \"myers\", "
      +			       "\"minimal\", \"patience\" and \"histogram\"");
     @@ diff.c: static int diff_opt_diff_algorithm(const struct option *opt,
       	return 0;
       }
       
     -@@ diff.c: static enum parse_opt_result diff_opt_output(struct parse_opt_ctx_t *ctx,
     - 	return 0;
     - }
     +@@ diff.c: static int diff_opt_patience(const struct option *opt,
       
     --static int diff_opt_patience(const struct option *opt,
     --			     const char *arg, int unset)
     --{
     --	struct diff_options *options = opt->value;
     --	int i;
     --
     --	BUG_ON_OPT_NEG(unset);
     --	BUG_ON_OPT_ARG(arg);
     + 	BUG_ON_OPT_NEG(unset);
     + 	BUG_ON_OPT_ARG(arg);
      -	options->xdl_opts = DIFF_WITH_ALG(options, PATIENCE_DIFF);
     --	/*
     --	 * Both --patience and --anchored use PATIENCE_DIFF
     --	 * internally, so remove any anchors previously
     --	 * specified.
     --	 */
     --	for (i = 0; i < options->anchors_nr; i++)
     --		free(options->anchors[i]);
     --	options->anchors_nr = 0;
     + 	/*
     + 	 * Both --patience and --anchored use PATIENCE_DIFF
     + 	 * internally, so remove any anchors previously
     +@@ diff.c: static int diff_opt_patience(const struct option *opt,
     + 	for (i = 0; i < options->anchors_nr; i++)
     + 		free(options->anchors[i]);
     + 	options->anchors_nr = 0;
      -	return 0;
     --}
     --
     ++
     ++	return set_diff_algorithm(options, "patience");
     + }
     + 
       static int diff_opt_ignore_regex(const struct option *opt,
     - 				 const char *arg, int unset)
     - {
      @@ diff.c: struct option *add_diff_options(const struct option *opts,
       			    N_("prevent rename/copy detection if the number of rename/copy targets exceeds given limit")),
       
     @@ diff.c: struct option *add_diff_options(const struct option *opts,
       			  N_("ignore whitespace when comparing lines"),
       			  XDF_IGNORE_WHITESPACE, PARSE_OPT_NONEG),
      @@ diff.c: struct option *add_diff_options(const struct option *opts,
     - 		OPT_CALLBACK_F(0, "patience", options, NULL,
       			       N_("generate diff using the \"patience diff\" algorithm"),
       			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
     --			       diff_opt_patience),
     + 			       diff_opt_patience),
      -		OPT_BITOP(0, "histogram", &options->xdl_opts,
      -			  N_("generate diff using the \"histogram diff\" algorithm"),
      -			  XDF_HISTOGRAM_DIFF, XDF_DIFF_ALGORITHM_MASK),
     -+			       diff_opt_diff_algorithm_no_arg),
      +		OPT_CALLBACK_F(0, "histogram", options, NULL,
      +			       N_("generate diff using the \"histogram diff\" algorithm"),
      +			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
 2:  cb030563149 ! 2:  b330222ce83 diff: teach diff to read gitattribute diff-algorithm
     @@ Metadata
      Author: John Cai <johncai86@gmail.com>
      
       ## Commit message ##
     -    diff: teach diff to read gitattribute diff-algorithm
     +    diff: teach diff to read algorithm from diff driver
      
          It can be useful to specify diff algorithms per file type. For example,
          one may want to use the minimal diff algorithm for .json files, another
          for .c files, etc.
      
     -    Teach the diff machinery to check attributes for a diff driver. Also
     -    teach the diff driver parser a new type "algorithm" to look for in the
     +    The diff machinery already checks attributes for a diff driver. Teach
     +    the diff driver parser a new type "algorithm" to look for in the
          config, which will be used if a driver has been specified through the
          attributes.
      
     -    Enforce precedence of diff algorithm by favoring the command line option,
     -    then looking at the driver attributes & config combination, then finally
     -    the diff.algorithm config.
     +    Enforce precedence of the diff algorithm by favoring the command line
     +    option, then looking at the driver attributes & config combination, then
     +    finally the diff.algorithm config.
      
     -    To enforce precedence order, use the `xdl_opts_command_line` member
     +    To enforce precedence order, use a new `ignore_driver_algorithm` member
          during options pasing to indicate the diff algorithm was set via command
          line args.
      
          Signed-off-by: John Cai <johncai86@gmail.com>
      
       ## Documentation/gitattributes.txt ##
     -@@ Documentation/gitattributes.txt: String::
     - 	by the configuration variables in the "diff.foo" section of the
     - 	Git config file.
     - 
     --
     - Defining an external diff driver
     - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     - 
      @@ Documentation/gitattributes.txt: with the above configuration, i.e. `j-c-diff`, with 7
       parameters, just like `GIT_EXTERNAL_DIFF` program is called.
       See linkgit:git[1] for details.
     @@ Documentation/gitattributes.txt: with the above configuration, i.e. `j-c-diff`,
      +The diff algorithm can be set through the `diff.algorithm` config key, but
      +sometimes it may be helpful to set the diff algorithm by path. For example, one
      +might wish to set a diff algorithm automatically for all `.json` files such that
     -+the user would not need to pass in a separate command line `--diff-algorithm` flag each
     -+time.
     ++the user would not need to pass in a separate command line `--diff-algorithm`
     ++flag each time.
      +
     -+First, in `.gitattributes`, you would assign the `diff` attribute for paths.
     ++First, in `.gitattributes`, assign the `diff` attribute for paths.
      +
     -+*Git attributes*
      +------------------------
      +*.json diff=<name>
      +------------------------
      +
     -+Then, you would define a "diff.<name>.algorithm" configuration to specify the
     -+diff algorithm, choosing from `meyers`, `patience`, `minimal`, and `histogram`.
     -+
     -+*Git config*
     ++Then, define a "diff.<name>.algorithm" configuration to specify the diff
     ++algorithm, choosing from `meyers`, `patience`, `minimal`, or `histogram`.
      +
      +----------------------------------------------------------------
      +[diff "<name>"]
      +  algorithm = histogram
      +----------------------------------------------------------------
      +
     -+This diff algorithm applies to git-diff(1), including the `--stat` output.
     ++This diff algorithm applies to user facing diff output like git-diff(1),
     ++git-show(1) and is used for the `--stat` output as well. The merge machinery
     ++will not use the diff algorithm set through this method.
      +
      +NOTE: If the `command` key also exists, then Git will treat this as an external
      +diff and attempt to use the value set for `command` as an external program. For
      +instance, the following config, combined with the above `.gitattributes` file,
      +will result in `command` favored over `algorithm`.
      +
     -+*Git config*
     -+
      +----------------------------------------------------------------
      +[diff "<name>"]
      +  command = j-c-diff
     @@ diff.c: static void run_diff_cmd(const char *pgm,
       	const char *xfrm_msg = NULL;
       	int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
       	int must_show_header = 0;
     -+	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, attr_path);
     ++	struct userdiff_driver *drv = NULL;
       
      -
      -	if (o->flags.allow_external) {
      -		struct userdiff_driver *drv;
      -
     --		drv = userdiff_find_by_path(o->repo->index, attr_path);
     -+	if (o->flags.allow_external)
     - 		if (drv && drv->external)
     - 			pgm = drv->external;
     ++	if (o->flags.allow_external || !o->ignore_driver_algorithm)
     + 		drv = userdiff_find_by_path(o->repo->index, attr_path);
     +-		if (drv && drv->external)
     +-			pgm = drv->external;
      -	}
     ++
     ++	if (o->flags.allow_external && drv && drv->external)
     ++		pgm = drv->external;
       
       	if (msg) {
       		/*
     @@ diff.c: static void run_diff_cmd(const char *pgm,
       	}
      -	if (one && two)
      +	if (one && two) {
     -+		if (!o->xdl_opts_command_line)
     -+			if (drv && drv->algorithm)
     -+				set_diff_algorithm(o, drv->algorithm);
     ++		if (drv && !o->ignore_driver_algorithm && drv->algorithm)
     ++			set_diff_algorithm(o, drv->algorithm);
      +
       		builtin_diff(name, other ? other : name,
       			     one, two, xfrm_msg, must_show_header,
     @@ diff.c: static void run_diffstat(struct diff_filepair *p, struct diff_options *o
       	const char *name;
       	const char *other;
       
     -+	struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
     -+	if (drv && drv->algorithm)
     -+		set_diff_algorithm(o, drv->algorithm);
     ++	if (!o->ignore_driver_algorithm) {
     ++		struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
     ++
     ++		if (drv && drv->algorithm) {
     ++			set_diff_algorithm(o, drv->algorithm);
     ++		}
     ++	}
      +
       	if (DIFF_PAIR_UNMERGED(p)) {
       		/* unmerged */
     @@ diff.c: static int diff_opt_diff_algorithm(const struct option *opt,
       		return error(_("option diff-algorithm accepts \"myers\", "
       			       "\"minimal\", \"patience\" and \"histogram\""));
       
     -+	options->xdl_opts_command_line = 1;
     ++	options->ignore_driver_algorithm = 1;
      +
       	return 0;
       }
     @@ diff.c: static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
       		BUG("available diff algorithms include \"myers\", "
       			       "\"minimal\", \"patience\" and \"histogram\"");
       
     -+	options->xdl_opts_command_line = 1;
     ++	options->ignore_driver_algorithm = 1;
      +
       	return 0;
       }
       
     +@@ diff.c: static int diff_opt_patience(const struct option *opt,
     + 	for (i = 0; i < options->anchors_nr; i++)
     + 		free(options->anchors[i]);
     + 	options->anchors_nr = 0;
     ++	options->ignore_driver_algorithm = 1;
     + 
     + 	return set_diff_algorithm(options, "patience");
     + }
      
       ## diff.h ##
      @@ diff.h: struct diff_options {
       	int prefix_length;
       	const char *stat_sep;
       	int xdl_opts;
     -+	/* If xdl_opts has been set via the command line. */
     -+	int xdl_opts_command_line;
     ++	int ignore_driver_algorithm;
       
       	/* see Documentation/diff-options.txt */
       	char **anchors;

-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 78+ messages in thread

* [PATCH v3 1/2] diff: consolidate diff algorithm option parsing
  2023-02-17 20:21   ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
@ 2023-02-17 20:21     ` John Cai via GitGitGadget
  2023-02-17 21:27       ` Junio C Hamano
  2023-02-18  1:36       ` Elijah Newren
  2023-02-17 20:21     ` [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver John Cai via GitGitGadget
                       ` (2 subsequent siblings)
  3 siblings, 2 replies; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-17 20:21 UTC (permalink / raw)
  To: git
  Cc: Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai, John Cai

From: John Cai <johncai86@gmail.com>

A subsequent commit will need the ability to tell if the diff algorithm
was set through the command line through setting a new member of
diff_options. While this logic can be added to the
diff_opt_diff_algorithm() callback, the `--minimal` and `--histogram`
options are handled via OPT_BIT without a callback.

Remedy this by consolidating the options parsing logic for --minimal and
--histogram into one callback. This way we can modify `diff_options` in
that function.

As an additional refactor, the logic that sets the diff algorithm in
diff_opt_diff_algorithm() can be refactored into a helper that will
allow multiple callsites to set the diff algorithm.

Signed-off-by: John Cai <johncai86@gmail.com>
---
 diff.c | 57 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/diff.c b/diff.c
index 329eebf16a0..5efc22ca06b 100644
--- a/diff.c
+++ b/diff.c
@@ -3437,6 +3437,22 @@ static int diff_filepair_is_phoney(struct diff_filespec *one,
 	return !DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two);
 }
 
+static int set_diff_algorithm(struct diff_options *opts,
+			      const char *alg)
+{
+	long value = parse_algorithm_value(alg);
+
+	if (value < 0)
+		return -1;
+
+	/* clear out previous settings */
+	DIFF_XDL_CLR(opts, NEED_MINIMAL);
+	opts->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
+	opts->xdl_opts |= value;
+
+	return 0;
+}
+
 static void builtin_diff(const char *name_a,
 			 const char *name_b,
 			 struct diff_filespec *one,
@@ -5107,17 +5123,28 @@ static int diff_opt_diff_algorithm(const struct option *opt,
 				   const char *arg, int unset)
 {
 	struct diff_options *options = opt->value;
-	long value = parse_algorithm_value(arg);
 
 	BUG_ON_OPT_NEG(unset);
-	if (value < 0)
+
+	if (set_diff_algorithm(options, arg))
 		return error(_("option diff-algorithm accepts \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\""));
 
-	/* clear out previous settings */
-	DIFF_XDL_CLR(options, NEED_MINIMAL);
-	options->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
-	options->xdl_opts |= value;
+	return 0;
+}
+
+static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
+				   const char *arg, int unset)
+{
+	struct diff_options *options = opt->value;
+
+	BUG_ON_OPT_NEG(unset);
+	BUG_ON_OPT_ARG(arg);
+
+	if (set_diff_algorithm(options, opt->long_name))
+		BUG("available diff algorithms include \"myers\", "
+			       "\"minimal\", \"patience\" and \"histogram\"");
+
 	return 0;
 }
 
@@ -5250,7 +5277,6 @@ static int diff_opt_patience(const struct option *opt,
 
 	BUG_ON_OPT_NEG(unset);
 	BUG_ON_OPT_ARG(arg);
-	options->xdl_opts = DIFF_WITH_ALG(options, PATIENCE_DIFF);
 	/*
 	 * Both --patience and --anchored use PATIENCE_DIFF
 	 * internally, so remove any anchors previously
@@ -5259,7 +5285,8 @@ static int diff_opt_patience(const struct option *opt,
 	for (i = 0; i < options->anchors_nr; i++)
 		free(options->anchors[i]);
 	options->anchors_nr = 0;
-	return 0;
+
+	return set_diff_algorithm(options, "patience");
 }
 
 static int diff_opt_ignore_regex(const struct option *opt,
@@ -5562,9 +5589,10 @@ struct option *add_diff_options(const struct option *opts,
 			    N_("prevent rename/copy detection if the number of rename/copy targets exceeds given limit")),
 
 		OPT_GROUP(N_("Diff algorithm options")),
-		OPT_BIT(0, "minimal", &options->xdl_opts,
-			N_("produce the smallest possible diff"),
-			XDF_NEED_MINIMAL),
+		OPT_CALLBACK_F(0, "minimal", options, NULL,
+			       N_("produce the smallest possible diff"),
+			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
+			       diff_opt_diff_algorithm_no_arg),
 		OPT_BIT_F('w', "ignore-all-space", &options->xdl_opts,
 			  N_("ignore whitespace when comparing lines"),
 			  XDF_IGNORE_WHITESPACE, PARSE_OPT_NONEG),
@@ -5590,9 +5618,10 @@ struct option *add_diff_options(const struct option *opts,
 			       N_("generate diff using the \"patience diff\" algorithm"),
 			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
 			       diff_opt_patience),
-		OPT_BITOP(0, "histogram", &options->xdl_opts,
-			  N_("generate diff using the \"histogram diff\" algorithm"),
-			  XDF_HISTOGRAM_DIFF, XDF_DIFF_ALGORITHM_MASK),
+		OPT_CALLBACK_F(0, "histogram", options, NULL,
+			       N_("generate diff using the \"histogram diff\" algorithm"),
+			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
+			       diff_opt_diff_algorithm_no_arg),
 		OPT_CALLBACK_F(0, "diff-algorithm", options, N_("<algorithm>"),
 			       N_("choose a diff algorithm"),
 			       PARSE_OPT_NONEG, diff_opt_diff_algorithm),
-- 
gitgitgadget


^ permalink raw reply related	[flat|nested] 78+ messages in thread

* [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-17 20:21   ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
  2023-02-17 20:21     ` [PATCH v3 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
@ 2023-02-17 20:21     ` John Cai via GitGitGadget
  2023-02-17 21:50       ` Junio C Hamano
  2023-02-18  2:56       ` Elijah Newren
  2023-02-18  1:16     ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes Elijah Newren
  2023-02-20 21:04     ` [PATCH v4 " John Cai via GitGitGadget
  3 siblings, 2 replies; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-17 20:21 UTC (permalink / raw)
  To: git
  Cc: Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai, John Cai

From: John Cai <johncai86@gmail.com>

It can be useful to specify diff algorithms per file type. For example,
one may want to use the minimal diff algorithm for .json files, another
for .c files, etc.

The diff machinery already checks attributes for a diff driver. Teach
the diff driver parser a new type "algorithm" to look for in the
config, which will be used if a driver has been specified through the
attributes.

Enforce precedence of the diff algorithm by favoring the command line
option, then looking at the driver attributes & config combination, then
finally the diff.algorithm config.

To enforce precedence order, use a new `ignore_driver_algorithm` member
during options pasing to indicate the diff algorithm was set via command
line args.

Signed-off-by: John Cai <johncai86@gmail.com>
---
 Documentation/gitattributes.txt | 37 ++++++++++++++++++++++++++++++++
 diff.c                          | 33 ++++++++++++++++++++--------
 diff.h                          |  1 +
 t/lib-diff-alternative.sh       | 38 ++++++++++++++++++++++++++++++++-
 userdiff.c                      |  4 +++-
 userdiff.h                      |  1 +
 6 files changed, 103 insertions(+), 11 deletions(-)

diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
index c19e64ea0ef..f212079a131 100644
--- a/Documentation/gitattributes.txt
+++ b/Documentation/gitattributes.txt
@@ -758,6 +758,43 @@ with the above configuration, i.e. `j-c-diff`, with 7
 parameters, just like `GIT_EXTERNAL_DIFF` program is called.
 See linkgit:git[1] for details.
 
+Setting the internal diff algorithm
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The diff algorithm can be set through the `diff.algorithm` config key, but
+sometimes it may be helpful to set the diff algorithm by path. For example, one
+might wish to set a diff algorithm automatically for all `.json` files such that
+the user would not need to pass in a separate command line `--diff-algorithm`
+flag each time.
+
+First, in `.gitattributes`, assign the `diff` attribute for paths.
+
+------------------------
+*.json diff=<name>
+------------------------
+
+Then, define a "diff.<name>.algorithm" configuration to specify the diff
+algorithm, choosing from `meyers`, `patience`, `minimal`, or `histogram`.
+
+----------------------------------------------------------------
+[diff "<name>"]
+  algorithm = histogram
+----------------------------------------------------------------
+
+This diff algorithm applies to user facing diff output like git-diff(1),
+git-show(1) and is used for the `--stat` output as well. The merge machinery
+will not use the diff algorithm set through this method.
+
+NOTE: If the `command` key also exists, then Git will treat this as an external
+diff and attempt to use the value set for `command` as an external program. For
+instance, the following config, combined with the above `.gitattributes` file,
+will result in `command` favored over `algorithm`.
+
+----------------------------------------------------------------
+[diff "<name>"]
+  command = j-c-diff
+  algorithm = histogram
+----------------------------------------------------------------
 
 Defining a custom hunk-header
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/diff.c b/diff.c
index 5efc22ca06b..04469da6d34 100644
--- a/diff.c
+++ b/diff.c
@@ -4456,15 +4456,13 @@ static void run_diff_cmd(const char *pgm,
 	const char *xfrm_msg = NULL;
 	int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
 	int must_show_header = 0;
+	struct userdiff_driver *drv = NULL;
 
-
-	if (o->flags.allow_external) {
-		struct userdiff_driver *drv;
-
+	if (o->flags.allow_external || !o->ignore_driver_algorithm)
 		drv = userdiff_find_by_path(o->repo->index, attr_path);
-		if (drv && drv->external)
-			pgm = drv->external;
-	}
+
+	if (o->flags.allow_external && drv && drv->external)
+		pgm = drv->external;
 
 	if (msg) {
 		/*
@@ -4481,12 +4479,16 @@ static void run_diff_cmd(const char *pgm,
 		run_external_diff(pgm, name, other, one, two, xfrm_msg, o);
 		return;
 	}
-	if (one && two)
+	if (one && two) {
+		if (drv && !o->ignore_driver_algorithm && drv->algorithm)
+			set_diff_algorithm(o, drv->algorithm);
+
 		builtin_diff(name, other ? other : name,
 			     one, two, xfrm_msg, must_show_header,
 			     o, complete_rewrite);
-	else
+	} else {
 		fprintf(o->file, "* Unmerged path %s\n", name);
+	}
 }
 
 static void diff_fill_oid_info(struct diff_filespec *one, struct index_state *istate)
@@ -4583,6 +4585,14 @@ static void run_diffstat(struct diff_filepair *p, struct diff_options *o,
 	const char *name;
 	const char *other;
 
+	if (!o->ignore_driver_algorithm) {
+		struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
+
+		if (drv && drv->algorithm) {
+			set_diff_algorithm(o, drv->algorithm);
+		}
+	}
+
 	if (DIFF_PAIR_UNMERGED(p)) {
 		/* unmerged */
 		builtin_diffstat(p->one->path, NULL, NULL, NULL,
@@ -5130,6 +5140,8 @@ static int diff_opt_diff_algorithm(const struct option *opt,
 		return error(_("option diff-algorithm accepts \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\""));
 
+	options->ignore_driver_algorithm = 1;
+
 	return 0;
 }
 
@@ -5145,6 +5157,8 @@ static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
 		BUG("available diff algorithms include \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\"");
 
+	options->ignore_driver_algorithm = 1;
+
 	return 0;
 }
 
@@ -5285,6 +5299,7 @@ static int diff_opt_patience(const struct option *opt,
 	for (i = 0; i < options->anchors_nr; i++)
 		free(options->anchors[i]);
 	options->anchors_nr = 0;
+	options->ignore_driver_algorithm = 1;
 
 	return set_diff_algorithm(options, "patience");
 }
diff --git a/diff.h b/diff.h
index 41eb2c3d428..8d770b1d579 100644
--- a/diff.h
+++ b/diff.h
@@ -333,6 +333,7 @@ struct diff_options {
 	int prefix_length;
 	const char *stat_sep;
 	int xdl_opts;
+	int ignore_driver_algorithm;
 
 	/* see Documentation/diff-options.txt */
 	char **anchors;
diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
index 8d1e408bb58..2dc02bca873 100644
--- a/t/lib-diff-alternative.sh
+++ b/t/lib-diff-alternative.sh
@@ -105,10 +105,46 @@ index $file1..$file2 100644
  }
 EOF
 
+	cat >expect_diffstat <<EOF
+ file1 => file2 | 21 ++++++++++-----------
+ 1 file changed, 10 insertions(+), 11 deletions(-)
+EOF
+
 	STRATEGY=$1
 
+	test_expect_success "$STRATEGY diff from attributes" '
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm "$STRATEGY" &&
+		test_must_fail git diff --no-index file1 file2 > output &&
+		cat expect &&
+		cat output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff from attributes has valid diffstat" '
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm "$STRATEGY" &&
+		test_must_fail git diff --stat --no-index file1 file2 > output &&
+		test_cmp expect_diffstat output
+	'
+
 	test_expect_success "$STRATEGY diff" '
-		test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
+		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff command line precedence before attributes" '
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm meyers &&
+		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff attributes precedence before config" '
+		git config diff.algorithm default &&
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm "$STRATEGY" &&
+		test_must_fail git diff --no-index file1 file2 > output &&
 		test_cmp expect output
 	'
 
diff --git a/userdiff.c b/userdiff.c
index d71b82feb74..ff25cfc4b4c 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -293,7 +293,7 @@ PATTERNS("scheme",
 	 "|([^][)(}{[ \t])+"),
 PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
 	 "\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
-{ "default", NULL, -1, { NULL, 0 } },
+{ "default", NULL, NULL, -1, { NULL, 0 } },
 };
 #undef PATTERNS
 #undef IPATTERN
@@ -394,6 +394,8 @@ int userdiff_config(const char *k, const char *v)
 		return parse_bool(&drv->textconv_want_cache, k, v);
 	if (!strcmp(type, "wordregex"))
 		return git_config_string(&drv->word_regex, k, v);
+	if (!strcmp(type, "algorithm"))
+		return git_config_string(&drv->algorithm, k, v);
 
 	return 0;
 }
diff --git a/userdiff.h b/userdiff.h
index aee91bc77e6..24419db6973 100644
--- a/userdiff.h
+++ b/userdiff.h
@@ -14,6 +14,7 @@ struct userdiff_funcname {
 struct userdiff_driver {
 	const char *name;
 	const char *external;
+	const char *algorithm;
 	int binary;
 	struct userdiff_funcname funcname;
 	const char *word_regex;
-- 
gitgitgadget

^ permalink raw reply related	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 1/2] diff: consolidate diff algorithm option parsing
  2023-02-17 20:21     ` [PATCH v3 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
@ 2023-02-17 21:27       ` Junio C Hamano
  2023-02-18  1:36       ` Elijah Newren
  1 sibling, 0 replies; 78+ messages in thread
From: Junio C Hamano @ 2023-02-17 21:27 UTC (permalink / raw)
  To: John Cai via GitGitGadget
  Cc: git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai

"John Cai via GitGitGadget" <gitgitgadget@gmail.com> writes:

> From: John Cai <johncai86@gmail.com>
>
> A subsequent commit will need the ability to tell if the diff algorithm
> was set through the command line through setting a new member of
> diff_options. While this logic can be added to the
> diff_opt_diff_algorithm() callback, the `--minimal` and `--histogram`
> options are handled via OPT_BIT without a callback.
>
> Remedy this by consolidating the options parsing logic for --minimal and
> --histogram into one callback. This way we can modify `diff_options` in
> that function.
>
> As an additional refactor, the logic that sets the diff algorithm in
> diff_opt_diff_algorithm() can be refactored into a helper that will
> allow multiple callsites to set the diff algorithm.
>
> Signed-off-by: John Cai <johncai86@gmail.com>
> ---
>  diff.c | 57 +++++++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 43 insertions(+), 14 deletions(-)

Nicely done.  Much easier to follow.  Will queue.  Thanks.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-17 20:21     ` [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver John Cai via GitGitGadget
@ 2023-02-17 21:50       ` Junio C Hamano
  2023-02-18  2:56       ` Elijah Newren
  1 sibling, 0 replies; 78+ messages in thread
From: Junio C Hamano @ 2023-02-17 21:50 UTC (permalink / raw)
  To: John Cai via GitGitGadget
  Cc: git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai

"John Cai via GitGitGadget" <gitgitgadget@gmail.com> writes:

Looking good.  Some comments below.  Many of them minor.

> +Setting the internal diff algorithm
> +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> +
> +The diff algorithm can be set through the `diff.algorithm` config key, but
> +sometimes it may be helpful to set the diff algorithm by path. For example, one

I would have expected "per path" instead of "by path".

> +might wish to set a diff algorithm automatically for all `.json` files such that
> +the user would not need to pass in a separate command line `--diff-algorithm`
> +flag each time.

While this is not incorrect per-se, I think the first paragraph of
the proposed commit log message was a lot more convincing.  Your
changes may not be limited to a single kind of files, and a command
line option is simply not enough.  You may want one algorithm for
".json" while using another for ".c", which was really an excellent
example you gave.

> +This diff algorithm applies to user facing diff output like git-diff(1),
> +git-show(1) and is used for the `--stat` output as well. The merge machinery
> +will not use the diff algorithm set through this method.

Is "format-patch" considered "user-facing"?

> +NOTE: If the `command` key also exists, then Git will treat this as an external
> +diff and attempt to use the value set for `command` as an external program. For
> +instance, the following config, combined with the above `.gitattributes` file,
> +will result in `command` favored over `algorithm`.
> +
> +----------------------------------------------------------------
> +[diff "<name>"]
> +  command = j-c-diff
> +  algorithm = histogram
> +----------------------------------------------------------------

Isn't this a bit too verbose, given that the reader has just seen
the external diff driver section.  I wonder something like this is
sufficient, without any sample configuration?

    NOTE: If `diff.<name>.command` is defined for path with the
    `diff=<name>` attribute, it is executed as an external diff driver
    (see above), and adding `diff.<name>.algorithm` has no effect (the
    algorithm is not passed to the external diff driver).


> diff --git a/diff.c b/diff.c
> index 5efc22ca06b..04469da6d34 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -4456,15 +4456,13 @@ static void run_diff_cmd(const char *pgm,
>  	const char *xfrm_msg = NULL;
>  	int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
>  	int must_show_header = 0;
> +	struct userdiff_driver *drv = NULL;
>  
> -
> -	if (o->flags.allow_external) {
> -		struct userdiff_driver *drv;
> -
> +	if (o->flags.allow_external || !o->ignore_driver_algorithm)
>  		drv = userdiff_find_by_path(o->repo->index, attr_path);
> -		if (drv && drv->external)
> -			pgm = drv->external;
> -	}
> +
> +	if (o->flags.allow_external && drv && drv->external)
> +		pgm = drv->external;

OK.  There is no explicit "pgm = NULL" initialization in this
function, but that is done by the caller passing NULL to the
function as a parameter, so it all makes sense.

> @@ -4481,12 +4479,16 @@ static void run_diff_cmd(const char *pgm,
>  		run_external_diff(pgm, name, other, one, two, xfrm_msg, o);
>  		return;
>  	}
> -	if (one && two)
> +	if (one && two) {
> +		if (drv && !o->ignore_driver_algorithm && drv->algorithm)
> +			set_diff_algorithm(o, drv->algorithm);

For symmetry with the above choice of pgm we just saw, the order of
the condition might be easier to follow if written like so:

	if (!o->ignore_driver_algorithm && drv && drv->algorithm)

It would not make any measurable difference performance-wise either way.

> @@ -4583,6 +4585,14 @@ static void run_diffstat(struct diff_filepair *p, struct diff_options *o,
>  	const char *name;
>  	const char *other;
>  
> +	if (!o->ignore_driver_algorithm) {
> +		struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);

That's an overlong line.

> +
> +		if (drv && drv->algorithm) {
> +			set_diff_algorithm(o, drv->algorithm);
> +		}

No need to have {} around a single statement block.

> +	}
> +
>  	if (DIFF_PAIR_UNMERGED(p)) {
>  		/* unmerged */
>  		builtin_diffstat(p->one->path, NULL, NULL, NULL,
> @@ -5130,6 +5140,8 @@ static int diff_opt_diff_algorithm(const struct option *opt,
>  		return error(_("option diff-algorithm accepts \"myers\", "
>  			       "\"minimal\", \"patience\" and \"histogram\""));
>  
> +	options->ignore_driver_algorithm = 1;
> +
>  	return 0;
>  }
>  
> @@ -5145,6 +5157,8 @@ static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
>  		BUG("available diff algorithms include \"myers\", "
>  			       "\"minimal\", \"patience\" and \"histogram\"");
>  
> +	options->ignore_driver_algorithm = 1;
> +
>  	return 0;
>  }
> @@ -5285,6 +5299,7 @@ static int diff_opt_patience(const struct option *opt,
>  	for (i = 0; i < options->anchors_nr; i++)
>  		free(options->anchors[i]);
>  	options->anchors_nr = 0;
> +	options->ignore_driver_algorithm = 1;
>  
>  	return set_diff_algorithm(options, "patience");
>  }


I was hoping that set_diff_algorithm() can be the shared common one
that signals we were told to use a specific algorithm, but it also
is called from internal codepaths so it cannot be it.

It is probably not worth introducing an extra helper that only calls
set_diff_algorithm() and sets ignore_driver_algorithm bit only for
that to reduce three-times repetition.

OK.

> diff --git a/userdiff.c b/userdiff.c
> index d71b82feb74..ff25cfc4b4c 100644
> --- a/userdiff.c
> +++ b/userdiff.c
> @@ -293,7 +293,7 @@ PATTERNS("scheme",
>  	 "|([^][)(}{[ \t])+"),
>  PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
>  	 "\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
> -{ "default", NULL, -1, { NULL, 0 } },
> +{ "default", NULL, NULL, -1, { NULL, 0 } },
>  };

I was surprised that there is so little damage to the built-in
userdiff driver definitions, but this is thanks to the PATTERNS()
and IPATTERN() macro that use designated initializers.  Very nice.

Nicely done.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes
  2023-02-17 20:21   ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
  2023-02-17 20:21     ` [PATCH v3 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
  2023-02-17 20:21     ` [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver John Cai via GitGitGadget
@ 2023-02-18  1:16     ` Elijah Newren
  2023-02-20 13:37       ` John Cai
  2023-02-20 21:04     ` [PATCH v4 " John Cai via GitGitGadget
  3 siblings, 1 reply; 78+ messages in thread
From: Elijah Newren @ 2023-02-18  1:16 UTC (permalink / raw)
  To: John Cai via GitGitGadget
  Cc: git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, John Cai

On Fri, Feb 17, 2023 at 12:21 PM John Cai via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> When a repository contains different kinds of files, it may be desirable to
> use different algorithms based on file type. This is currently not feasible
> through the command line or using git configs. However, we can leverage the
> fact that gitattributes are path aware.
>
> Teach the diff machinery to check gitattributes when diffing files by using
> the existing diff. scheme, and add an "algorithm" type to the external
> driver config.
[...]
> To address some of the performance concerns in the previous series, a
> benchmark shows that a performance penalty is no longer incurred, now that
> we are no longer adding an additional attributes parsing call:
>
> $ hyperfine -r 5 -L a bin-wrappers/git,git '{a} diff v2.0.0 v2.28.0'
> Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0 Time (mean ± σ): 1.072 s ±
> 0.289 s [User: 0.626 s, System: 0.081 s] Range (min … max): 0.772 s … 1.537
> s 5 runs
>
> Benchmark 2: git diff v2.0.0 v2.28.0 Time (mean ± σ): 1.003 s ± 0.065 s
> [User: 0.684 s, System: 0.067 s] Range (min … max): 0.914 s … 1.091 s 5 runs
>
> Summary 'git diff v2.0.0 v2.28.0' ran 1.07 ± 0.30 times faster than
> 'git-bin-wrapper diff v2.0.0 v2.28.0'

I'm sorry, I don't understand this.  What are you measuring?  I
presume bin-wrappers/git refers to the version of git built with your
changes, but what version of git does "git" refer to?  Also, do you
have any .gitattributes or .git/config changes present when you are
testing to trigger the new functionality you have written?

Also, doesn't this benchmark demonstrate the opposite of your claim?
You said there was no performance penalty, but the benchmark shows a
7% slowdown.  We've battled hard to get smaller improvements than
that, so this is still worrisome, even if it's no longer a factor of 2
or whatever it was.  But, again, I'm not sure what is being measured.
If the difference is because patience diff was used for some files,
then it's not an apples-to-apples comparison, and a 7% slowdown would
be no cause for concern.

Since I was curious, I compiled both a version of git from directly
before your series, and directly after, then added a '*.[ch]
diff=other' line to the end of .gitattributes, then ran:

$ hyperfine -L a ./older-git,./newer-git '{a} -c
diff.other.algorithm=myers diff --numstat v2.0.0 v2.28.0'
Benchmark 1: ./older-git -c diff.other.algorithm=myers diff --numstat
v2.0.0 v2.28.0
  Time (mean ± σ):     870.2 ms ±   4.4 ms    [User: 755.2 ms, System: 109.8 ms]
  Range (min … max):   861.0 ms … 876.8 ms    10 runs

Benchmark 2: ./newer-git -c diff.other.algorithm=myers diff --numstat
v2.0.0 v2.28.0
  Time (mean ± σ):     876.9 ms ±   4.8 ms    [User: 758.0 ms, System: 113.1 ms]
  Range (min … max):   870.7 ms … 884.1 ms    10 runs

Summary
  './older-git -c diff.other.algorithm=myers diff --numstat v2.0.0 v2.28.0' ran
    1.01 ± 0.01 times faster than './newer-git -c
diff.other.algorithm=myers diff --numstat v2.0.0 v2.28.0'

I specifically specified 'myers' to match what we'd get from the
default anyway, so I would only be testing the slowdown from the
.gitattribute parsing.  So, I think the performance overhead comes out
to just 1% rather than 7% (and further that's when I make it only
print overall stats about the diff rather than the full diff, since I
know that's faster.  If I didn't do that, the perf hit might appear to
be less than 1%).

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 1/2] diff: consolidate diff algorithm option parsing
  2023-02-17 20:21     ` [PATCH v3 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
  2023-02-17 21:27       ` Junio C Hamano
@ 2023-02-18  1:36       ` Elijah Newren
  1 sibling, 0 replies; 78+ messages in thread
From: Elijah Newren @ 2023-02-18  1:36 UTC (permalink / raw)
  To: John Cai via GitGitGadget
  Cc: git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, John Cai

On Fri, Feb 17, 2023 at 12:21 PM John Cai via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: John Cai <johncai86@gmail.com>
>
> A subsequent commit will need the ability to tell if the diff algorithm
> was set through the command line through setting a new member of
> diff_options. While this logic can be added to the
> diff_opt_diff_algorithm() callback, the `--minimal` and `--histogram`
> options are handled via OPT_BIT without a callback.
>
> Remedy this by consolidating the options parsing logic for --minimal and
> --histogram into one callback. This way we can modify `diff_options` in
> that function.
>
> As an additional refactor, the logic that sets the diff algorithm in
> diff_opt_diff_algorithm() can be refactored into a helper that will
> allow multiple callsites to set the diff algorithm.
>
> Signed-off-by: John Cai <johncai86@gmail.com>
> ---
>  diff.c | 57 +++++++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 43 insertions(+), 14 deletions(-)
>
> diff --git a/diff.c b/diff.c
> index 329eebf16a0..5efc22ca06b 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -3437,6 +3437,22 @@ static int diff_filepair_is_phoney(struct diff_filespec *one,
>         return !DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two);
>  }
>
> +static int set_diff_algorithm(struct diff_options *opts,
> +                             const char *alg)
> +{
> +       long value = parse_algorithm_value(alg);
> +
> +       if (value < 0)
> +               return -1;
> +
> +       /* clear out previous settings */
> +       DIFF_XDL_CLR(opts, NEED_MINIMAL);
> +       opts->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
> +       opts->xdl_opts |= value;
> +
> +       return 0;
> +}
> +
>  static void builtin_diff(const char *name_a,
>                          const char *name_b,
>                          struct diff_filespec *one,
> @@ -5107,17 +5123,28 @@ static int diff_opt_diff_algorithm(const struct option *opt,
>                                    const char *arg, int unset)
>  {
>         struct diff_options *options = opt->value;
> -       long value = parse_algorithm_value(arg);
>
>         BUG_ON_OPT_NEG(unset);
> -       if (value < 0)
> +
> +       if (set_diff_algorithm(options, arg))
>                 return error(_("option diff-algorithm accepts \"myers\", "
>                                "\"minimal\", \"patience\" and \"histogram\""));
>
> -       /* clear out previous settings */
> -       DIFF_XDL_CLR(options, NEED_MINIMAL);
> -       options->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
> -       options->xdl_opts |= value;
> +       return 0;
> +}
> +
> +static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
> +                                  const char *arg, int unset)
> +{
> +       struct diff_options *options = opt->value;
> +
> +       BUG_ON_OPT_NEG(unset);
> +       BUG_ON_OPT_ARG(arg);
> +
> +       if (set_diff_algorithm(options, opt->long_name))
> +               BUG("available diff algorithms include \"myers\", "
> +                              "\"minimal\", \"patience\" and \"histogram\"");
> +
>         return 0;
>  }
>
> @@ -5250,7 +5277,6 @@ static int diff_opt_patience(const struct option *opt,
>
>         BUG_ON_OPT_NEG(unset);
>         BUG_ON_OPT_ARG(arg);
> -       options->xdl_opts = DIFF_WITH_ALG(options, PATIENCE_DIFF);
>         /*
>          * Both --patience and --anchored use PATIENCE_DIFF
>          * internally, so remove any anchors previously
> @@ -5259,7 +5285,8 @@ static int diff_opt_patience(const struct option *opt,
>         for (i = 0; i < options->anchors_nr; i++)
>                 free(options->anchors[i]);
>         options->anchors_nr = 0;
> -       return 0;
> +
> +       return set_diff_algorithm(options, "patience");
>  }
>
>  static int diff_opt_ignore_regex(const struct option *opt,
> @@ -5562,9 +5589,10 @@ struct option *add_diff_options(const struct option *opts,
>                             N_("prevent rename/copy detection if the number of rename/copy targets exceeds given limit")),
>
>                 OPT_GROUP(N_("Diff algorithm options")),
> -               OPT_BIT(0, "minimal", &options->xdl_opts,
> -                       N_("produce the smallest possible diff"),
> -                       XDF_NEED_MINIMAL),
> +               OPT_CALLBACK_F(0, "minimal", options, NULL,
> +                              N_("produce the smallest possible diff"),
> +                              PARSE_OPT_NONEG | PARSE_OPT_NOARG,
> +                              diff_opt_diff_algorithm_no_arg),
>                 OPT_BIT_F('w', "ignore-all-space", &options->xdl_opts,
>                           N_("ignore whitespace when comparing lines"),
>                           XDF_IGNORE_WHITESPACE, PARSE_OPT_NONEG),
> @@ -5590,9 +5618,10 @@ struct option *add_diff_options(const struct option *opts,
>                                N_("generate diff using the \"patience diff\" algorithm"),
>                                PARSE_OPT_NONEG | PARSE_OPT_NOARG,
>                                diff_opt_patience),
> -               OPT_BITOP(0, "histogram", &options->xdl_opts,
> -                         N_("generate diff using the \"histogram diff\" algorithm"),
> -                         XDF_HISTOGRAM_DIFF, XDF_DIFF_ALGORITHM_MASK),
> +               OPT_CALLBACK_F(0, "histogram", options, NULL,
> +                              N_("generate diff using the \"histogram diff\" algorithm"),
> +                              PARSE_OPT_NONEG | PARSE_OPT_NOARG,
> +                              diff_opt_diff_algorithm_no_arg),
>                 OPT_CALLBACK_F(0, "diff-algorithm", options, N_("<algorithm>"),
>                                N_("choose a diff algorithm"),
>                                PARSE_OPT_NONEG, diff_opt_diff_algorithm),
> --
> gitgitgadget

This patch looks good to me.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-17 20:21     ` [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver John Cai via GitGitGadget
  2023-02-17 21:50       ` Junio C Hamano
@ 2023-02-18  2:56       ` Elijah Newren
  2023-02-20 15:32         ` John Cai
  1 sibling, 1 reply; 78+ messages in thread
From: Elijah Newren @ 2023-02-18  2:56 UTC (permalink / raw)
  To: John Cai via GitGitGadget
  Cc: git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, John Cai

On Fri, Feb 17, 2023 at 12:21 PM John Cai via GitGitGadget
<gitgitgadget@gmail.com> wrote:
>
> From: John Cai <johncai86@gmail.com>
>
> It can be useful to specify diff algorithms per file type. For example,
> one may want to use the minimal diff algorithm for .json files, another
> for .c files, etc.
>
> The diff machinery already checks attributes for a diff driver. Teach
> the diff driver parser a new type "algorithm" to look for in the
> config, which will be used if a driver has been specified through the
> attributes.
>
> Enforce precedence of the diff algorithm by favoring the command line
> option, then looking at the driver attributes & config combination, then
> finally the diff.algorithm config.
>
> To enforce precedence order, use a new `ignore_driver_algorithm` member
> during options pasing to indicate the diff algorithm was set via command
> line args.

s/pasing/parsing/

> Signed-off-by: John Cai <johncai86@gmail.com>
> ---
>  Documentation/gitattributes.txt | 37 ++++++++++++++++++++++++++++++++
>  diff.c                          | 33 ++++++++++++++++++++--------
>  diff.h                          |  1 +
>  t/lib-diff-alternative.sh       | 38 ++++++++++++++++++++++++++++++++-
>  userdiff.c                      |  4 +++-
>  userdiff.h                      |  1 +
>  6 files changed, 103 insertions(+), 11 deletions(-)
>
> diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
> index c19e64ea0ef..f212079a131 100644
> --- a/Documentation/gitattributes.txt
> +++ b/Documentation/gitattributes.txt
> @@ -758,6 +758,43 @@ with the above configuration, i.e. `j-c-diff`, with 7
>  parameters, just like `GIT_EXTERNAL_DIFF` program is called.
>  See linkgit:git[1] for details.
>
> +Setting the internal diff algorithm
> +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> +
> +The diff algorithm can be set through the `diff.algorithm` config key, but
> +sometimes it may be helpful to set the diff algorithm by path. For example, one
> +might wish to set a diff algorithm automatically for all `.json` files such that
> +the user would not need to pass in a separate command line `--diff-algorithm`
> +flag each time.
> +
> +First, in `.gitattributes`, assign the `diff` attribute for paths.
> +
> +------------------------
> +*.json diff=<name>
> +------------------------
> +
> +Then, define a "diff.<name>.algorithm" configuration to specify the diff

Should there be a link to `git-config` right after "configuration"?
Otherwise, users may think that they are being told to specify
additional configuration within the .gitattributes file.

> +algorithm, choosing from `meyers`, `patience`, `minimal`, or `histogram`.

s/meyers/myers/

> +
> +----------------------------------------------------------------
> +[diff "<name>"]
> +  algorithm = histogram
> +----------------------------------------------------------------

It's pretty easy to assume the above is meant to be part of the
.gitattributes file instead of the .git/config file.  Don't most users
run `git config` commands directly rather than edit the .git/config
file?  Should we provide a sample command rather than showing what the
config file will contain?

> +
> +This diff algorithm applies to user facing diff output like git-diff(1),
> +git-show(1) and is used for the `--stat` output as well. The merge machinery
> +will not use the diff algorithm set through this method.

Yaay, thanks for including this!


I'm still curious if this should this also include warnings/caveats, such as:
  * The diff attribute specified in .gitattributes will be ignored in
a bare clone
  * The diff attribute specified in .gitattributes will be ignored if
it is only specified in another branch (e.g. on a branch "special-file
diff=patience" recorded in .gitattributes, then checkout master but
run `git log -1 -p $branch`)
  * When a file is renamed, the diff attribute for the pre-image name
is the only one the system pays attention to (thus adding "-R" can
flip which diff algorithm is run for the renamed file).

Also, since I tested the three items above to verify they are valid
warnings, I'm a bit confused.  I thought your intent was to use this
server-side[1], so isn't the bare clone aspect a deal-breaker for your
intended usecase?

[1] https://lore.kernel.org/git/7852AC7B-7A4E-4DD0-ADEA-CFFD5D16C595@gmail.com/

> +
> +NOTE: If the `command` key also exists, then Git will treat this as an external
> +diff and attempt to use the value set for `command` as an external program. For
> +instance, the following config, combined with the above `.gitattributes` file,
> +will result in `command` favored over `algorithm`.
> +
> +----------------------------------------------------------------
> +[diff "<name>"]
> +  command = j-c-diff
> +  algorithm = histogram
> +----------------------------------------------------------------
>
>  Defining a custom hunk-header
>  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> diff --git a/diff.c b/diff.c
> index 5efc22ca06b..04469da6d34 100644
> --- a/diff.c
> +++ b/diff.c
> @@ -4456,15 +4456,13 @@ static void run_diff_cmd(const char *pgm,
>         const char *xfrm_msg = NULL;
>         int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
>         int must_show_header = 0;
> +       struct userdiff_driver *drv = NULL;
>
> -
> -       if (o->flags.allow_external) {
> -               struct userdiff_driver *drv;
> -
> +       if (o->flags.allow_external || !o->ignore_driver_algorithm)
>                 drv = userdiff_find_by_path(o->repo->index, attr_path);
> -               if (drv && drv->external)
> -                       pgm = drv->external;
> -       }
> +
> +       if (o->flags.allow_external && drv && drv->external)
> +               pgm = drv->external;
>
>         if (msg) {
>                 /*
> @@ -4481,12 +4479,16 @@ static void run_diff_cmd(const char *pgm,
>                 run_external_diff(pgm, name, other, one, two, xfrm_msg, o);
>                 return;
>         }
> -       if (one && two)
> +       if (one && two) {
> +               if (drv && !o->ignore_driver_algorithm && drv->algorithm)
> +                       set_diff_algorithm(o, drv->algorithm);
> +
>                 builtin_diff(name, other ? other : name,
>                              one, two, xfrm_msg, must_show_header,
>                              o, complete_rewrite);
> -       else
> +       } else {
>                 fprintf(o->file, "* Unmerged path %s\n", name);
> +       }
>  }
>
>  static void diff_fill_oid_info(struct diff_filespec *one, struct index_state *istate)
> @@ -4583,6 +4585,14 @@ static void run_diffstat(struct diff_filepair *p, struct diff_options *o,
>         const char *name;
>         const char *other;
>
> +       if (!o->ignore_driver_algorithm) {
> +               struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
> +
> +               if (drv && drv->algorithm) {
> +                       set_diff_algorithm(o, drv->algorithm);
> +               }
> +       }
> +
>         if (DIFF_PAIR_UNMERGED(p)) {
>                 /* unmerged */
>                 builtin_diffstat(p->one->path, NULL, NULL, NULL,
> @@ -5130,6 +5140,8 @@ static int diff_opt_diff_algorithm(const struct option *opt,
>                 return error(_("option diff-algorithm accepts \"myers\", "
>                                "\"minimal\", \"patience\" and \"histogram\""));
>
> +       options->ignore_driver_algorithm = 1;
> +
>         return 0;
>  }
>
> @@ -5145,6 +5157,8 @@ static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
>                 BUG("available diff algorithms include \"myers\", "
>                                "\"minimal\", \"patience\" and \"histogram\"");
>
> +       options->ignore_driver_algorithm = 1;
> +
>         return 0;
>  }
>
> @@ -5285,6 +5299,7 @@ static int diff_opt_patience(const struct option *opt,
>         for (i = 0; i < options->anchors_nr; i++)
>                 free(options->anchors[i]);
>         options->anchors_nr = 0;
> +       options->ignore_driver_algorithm = 1;
>
>         return set_diff_algorithm(options, "patience");
>  }
> diff --git a/diff.h b/diff.h
> index 41eb2c3d428..8d770b1d579 100644
> --- a/diff.h
> +++ b/diff.h
> @@ -333,6 +333,7 @@ struct diff_options {
>         int prefix_length;
>         const char *stat_sep;
>         int xdl_opts;
> +       int ignore_driver_algorithm;
>
>         /* see Documentation/diff-options.txt */
>         char **anchors;
> diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
> index 8d1e408bb58..2dc02bca873 100644
> --- a/t/lib-diff-alternative.sh
> +++ b/t/lib-diff-alternative.sh
> @@ -105,10 +105,46 @@ index $file1..$file2 100644
>   }
>  EOF
>
> +       cat >expect_diffstat <<EOF
> + file1 => file2 | 21 ++++++++++-----------
> + 1 file changed, 10 insertions(+), 11 deletions(-)
> +EOF
> +
>         STRATEGY=$1
>
> +       test_expect_success "$STRATEGY diff from attributes" '
> +               echo "file* diff=driver" >.gitattributes &&
> +               git config diff.driver.algorithm "$STRATEGY" &&
> +               test_must_fail git diff --no-index file1 file2 > output &&
> +               cat expect &&
> +               cat output &&
> +               test_cmp expect output
> +       '
> +
> +       test_expect_success "$STRATEGY diff from attributes has valid diffstat" '
> +               echo "file* diff=driver" >.gitattributes &&
> +               git config diff.driver.algorithm "$STRATEGY" &&
> +               test_must_fail git diff --stat --no-index file1 file2 > output &&
> +               test_cmp expect_diffstat output
> +       '
> +
>         test_expect_success "$STRATEGY diff" '
> -               test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
> +               test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
> +               test_cmp expect output
> +       '
> +
> +       test_expect_success "$STRATEGY diff command line precedence before attributes" '
> +               echo "file* diff=driver" >.gitattributes &&
> +               git config diff.driver.algorithm meyers &&

Is this misspelling of myers intentional?  I think with the typo, the
code falls back to the default algorithm, which happens to be myers,
so I think the test works either way, but were you intending to test
fallback behavior in case of a typo here, or was that accidental?



> +               test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
> +               test_cmp expect output
> +       '
> +
> +       test_expect_success "$STRATEGY diff attributes precedence before config" '
> +               git config diff.algorithm default &&
> +               echo "file* diff=driver" >.gitattributes &&
> +               git config diff.driver.algorithm "$STRATEGY" &&
> +               test_must_fail git diff --no-index file1 file2 > output &&
>                 test_cmp expect output
>         '
>
> diff --git a/userdiff.c b/userdiff.c
> index d71b82feb74..ff25cfc4b4c 100644
> --- a/userdiff.c
> +++ b/userdiff.c
> @@ -293,7 +293,7 @@ PATTERNS("scheme",
>          "|([^][)(}{[ \t])+"),
>  PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
>          "\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
> -{ "default", NULL, -1, { NULL, 0 } },
> +{ "default", NULL, NULL, -1, { NULL, 0 } },
>  };
>  #undef PATTERNS
>  #undef IPATTERN
> @@ -394,6 +394,8 @@ int userdiff_config(const char *k, const char *v)
>                 return parse_bool(&drv->textconv_want_cache, k, v);
>         if (!strcmp(type, "wordregex"))
>                 return git_config_string(&drv->word_regex, k, v);
> +       if (!strcmp(type, "algorithm"))
> +               return git_config_string(&drv->algorithm, k, v);
>
>         return 0;
>  }
> diff --git a/userdiff.h b/userdiff.h
> index aee91bc77e6..24419db6973 100644
> --- a/userdiff.h
> +++ b/userdiff.h
> @@ -14,6 +14,7 @@ struct userdiff_funcname {
>  struct userdiff_driver {
>         const char *name;
>         const char *external;
> +       const char *algorithm;
>         int binary;
>         struct userdiff_funcname funcname;
>         const char *word_regex;
> --
> gitgitgadget

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes
  2023-02-18  1:16     ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes Elijah Newren
@ 2023-02-20 13:37       ` John Cai
  0 siblings, 0 replies; 78+ messages in thread
From: John Cai @ 2023-02-20 13:37 UTC (permalink / raw)
  To: Elijah Newren
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King

Hi Elijah,

On 17 Feb 2023, at 20:16, Elijah Newren wrote:

> On Fri, Feb 17, 2023 at 12:21 PM John Cai via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> When a repository contains different kinds of files, it may be desirable to
>> use different algorithms based on file type. This is currently not feasible
>> through the command line or using git configs. However, we can leverage the
>> fact that gitattributes are path aware.
>>
>> Teach the diff machinery to check gitattributes when diffing files by using
>> the existing diff. scheme, and add an "algorithm" type to the external
>> driver config.
> [...]
>> To address some of the performance concerns in the previous series, a
>> benchmark shows that a performance penalty is no longer incurred, now that
>> we are no longer adding an additional attributes parsing call:
>>
>> $ hyperfine -r 5 -L a bin-wrappers/git,git '{a} diff v2.0.0 v2.28.0'
>> Benchmark 1: git-bin-wrapper diff v2.0.0 v2.28.0 Time (mean ± σ): 1.072 s ±
>> 0.289 s [User: 0.626 s, System: 0.081 s] Range (min … max): 0.772 s … 1.537
>> s 5 runs
>>
>> Benchmark 2: git diff v2.0.0 v2.28.0 Time (mean ± σ): 1.003 s ± 0.065 s
>> [User: 0.684 s, System: 0.067 s] Range (min … max): 0.914 s … 1.091 s 5 runs
>>
>> Summary 'git diff v2.0.0 v2.28.0' ran 1.07 ± 0.30 times faster than
>> 'git-bin-wrapper diff v2.0.0 v2.28.0'
>
> I'm sorry, I don't understand this.  What are you measuring?  I
> presume bin-wrappers/git refers to the version of git built with your
> changes, but what version of git does "git" refer to?  Also, do you
> have any .gitattributes or .git/config changes present when you are
> testing to trigger the new functionality you have written?
>
> Also, doesn't this benchmark demonstrate the opposite of your claim?
> You said there was no performance penalty, but the benchmark shows a
> 7% slowdown.  We've battled hard to get smaller improvements than
> that, so this is still worrisome, even if it's no longer a factor of 2
> or whatever it was.  But, again, I'm not sure what is being measured.
> If the difference is because patience diff was used for some files,
> then it's not an apples-to-apples comparison, and a 7% slowdown would
> be no cause for concern.
>
> Since I was curious, I compiled both a version of git from directly
> before your series, and directly after, then added a '*.[ch]
> diff=other' line to the end of .gitattributes, then ran:
>
> $ hyperfine -L a ./older-git,./newer-git '{a} -c
> diff.other.algorithm=myers diff --numstat v2.0.0 v2.28.0'
> Benchmark 1: ./older-git -c diff.other.algorithm=myers diff --numstat
> v2.0.0 v2.28.0
>   Time (mean ± σ):     870.2 ms ±   4.4 ms    [User: 755.2 ms, System: 109.8 ms]
>   Range (min … max):   861.0 ms … 876.8 ms    10 runs
>
> Benchmark 2: ./newer-git -c diff.other.algorithm=myers diff --numstat
> v2.0.0 v2.28.0
>   Time (mean ± σ):     876.9 ms ±   4.8 ms    [User: 758.0 ms, System: 113.1 ms]
>   Range (min … max):   870.7 ms … 884.1 ms    10 runs
>
> Summary
>   './older-git -c diff.other.algorithm=myers diff --numstat v2.0.0 v2.28.0' ran
>     1.01 ± 0.01 times faster than './newer-git -c
> diff.other.algorithm=myers diff --numstat v2.0.0 v2.28.0'
>
> I specifically specified 'myers' to match what we'd get from the
> default anyway, so I would only be testing the slowdown from the
> .gitattribute parsing.  So, I think the performance overhead comes out
> to just 1% rather than 7% (and further that's when I make it only
> print overall stats about the diff rather than the full diff, since I
> know that's faster.  If I didn't do that, the perf hit might appear to
> be less than 1%).

Thanks for taking the time to do this! I should have been a bit more careful
about this benchmark, and more explicit about what it was benchmarking. I just
ran it again and made sure that the same algorithm was used, and I got results
similar to you.

Will update the cover letter, thanks!

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-18  2:56       ` Elijah Newren
@ 2023-02-20 15:32         ` John Cai
  2023-02-20 16:21           ` Elijah Newren
  0 siblings, 1 reply; 78+ messages in thread
From: John Cai @ 2023-02-20 15:32 UTC (permalink / raw)
  To: Elijah Newren
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King

Hi Elijah,

On 17 Feb 2023, at 21:56, Elijah Newren wrote:

> On Fri, Feb 17, 2023 at 12:21 PM John Cai via GitGitGadget
> <gitgitgadget@gmail.com> wrote:
>>
>> From: John Cai <johncai86@gmail.com>
>>
>> It can be useful to specify diff algorithms per file type. For example,
>> one may want to use the minimal diff algorithm for .json files, another
>> for .c files, etc.
>>
>> The diff machinery already checks attributes for a diff driver. Teach
>> the diff driver parser a new type "algorithm" to look for in the
>> config, which will be used if a driver has been specified through the
>> attributes.
>>
>> Enforce precedence of the diff algorithm by favoring the command line
>> option, then looking at the driver attributes & config combination, then
>> finally the diff.algorithm config.
>>
>> To enforce precedence order, use a new `ignore_driver_algorithm` member
>> during options pasing to indicate the diff algorithm was set via command
>> line args.
>
> s/pasing/parsing/

thanks for noticing this!

>
>> Signed-off-by: John Cai <johncai86@gmail.com>
>> ---
>>  Documentation/gitattributes.txt | 37 ++++++++++++++++++++++++++++++++
>>  diff.c                          | 33 ++++++++++++++++++++--------
>>  diff.h                          |  1 +
>>  t/lib-diff-alternative.sh       | 38 ++++++++++++++++++++++++++++++++-
>>  userdiff.c                      |  4 +++-
>>  userdiff.h                      |  1 +
>>  6 files changed, 103 insertions(+), 11 deletions(-)
>>
>> diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
>> index c19e64ea0ef..f212079a131 100644
>> --- a/Documentation/gitattributes.txt
>> +++ b/Documentation/gitattributes.txt
>> @@ -758,6 +758,43 @@ with the above configuration, i.e. `j-c-diff`, with 7
>>  parameters, just like `GIT_EXTERNAL_DIFF` program is called.
>>  See linkgit:git[1] for details.
>>
>> +Setting the internal diff algorithm
>> +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +The diff algorithm can be set through the `diff.algorithm` config key, but
>> +sometimes it may be helpful to set the diff algorithm by path. For example, one
>> +might wish to set a diff algorithm automatically for all `.json` files such that
>> +the user would not need to pass in a separate command line `--diff-algorithm`
>> +flag each time.
>> +
>> +First, in `.gitattributes`, assign the `diff` attribute for paths.
>> +
>> +------------------------
>> +*.json diff=<name>
>> +------------------------
>> +
>> +Then, define a "diff.<name>.algorithm" configuration to specify the diff
>
> Should there be a link to `git-config` right after "configuration"?
> Otherwise, users may think that they are being told to specify
> additional configuration within the .gitattributes file.
>
>> +algorithm, choosing from `meyers`, `patience`, `minimal`, or `histogram`.
>
> s/meyers/myers/

likewise.

>
>> +
>> +----------------------------------------------------------------
>> +[diff "<name>"]
>> +  algorithm = histogram
>> +----------------------------------------------------------------
>
> It's pretty easy to assume the above is meant to be part of the
> .gitattributes file instead of the .git/config file.  Don't most users
> run `git config` commands directly rather than edit the .git/config
> file?  Should we provide a sample command rather than showing what the
> config file will contain?

Not sure--I think I was was just following what I saw in the existing
documentation for driver configuration. But would be interested to see what
other folks think.

>
>> +
>> +This diff algorithm applies to user facing diff output like git-diff(1),
>> +git-show(1) and is used for the `--stat` output as well. The merge machinery
>> +will not use the diff algorithm set through this method.
>
> Yaay, thanks for including this!
>
>
> I'm still curious if this should this also include warnings/caveats, such as:
>   * The diff attribute specified in .gitattributes will be ignored in
> a bare clone
>   * The diff attribute specified in .gitattributes will be ignored if
> it is only specified in another branch (e.g. on a branch "special-file
> diff=patience" recorded in .gitattributes, then checkout master but
> run `git log -1 -p $branch`)
>   * When a file is renamed, the diff attribute for the pre-image name
> is the only one the system pays attention to (thus adding "-R" can
> flip which diff algorithm is run for the renamed file).

I would be fine with adding that--though originally I was thinking that these
can be inferred from the way that gitattributes are documented in [1]. Calling
these out would make it more clear though, so I could go either way.

>
> Also, since I tested the three items above to verify they are valid
> warnings, I'm a bit confused.  I thought your intent was to use this
> server-side[1], so isn't the bare clone aspect a deal-breaker for your
> intended usecase?
>
> [1] https://lore.kernel.org/git/7852AC7B-7A4E-4DD0-ADEA-CFFD5D16C595@gmail.com/

yes, indeed. I was planning on adding bare repository support in a separate
patch series, since the additions in [2] allows .gitattributes to be read from a
bare repository.

1. https://git-scm.com/docs/gitattributes
2. https://lore.kernel.org/git/0ca8b2458921fc40269b0c43b5ec86eba77d6b54.1673684790.git.karthik.188@gmail.com/

thanks!
John

>
>> +
>> +NOTE: If the `command` key also exists, then Git will treat this as an external
>> +diff and attempt to use the value set for `command` as an external program. For
>> +instance, the following config, combined with the above `.gitattributes` file,
>> +will result in `command` favored over `algorithm`.
>> +
>> +----------------------------------------------------------------
>> +[diff "<name>"]
>> +  command = j-c-diff
>> +  algorithm = histogram
>> +----------------------------------------------------------------
>>
>>  Defining a custom hunk-header
>>  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>> diff --git a/diff.c b/diff.c
>> index 5efc22ca06b..04469da6d34 100644
>> --- a/diff.c
>> +++ b/diff.c
>> @@ -4456,15 +4456,13 @@ static void run_diff_cmd(const char *pgm,
>>         const char *xfrm_msg = NULL;
>>         int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
>>         int must_show_header = 0;
>> +       struct userdiff_driver *drv = NULL;
>>
>> -
>> -       if (o->flags.allow_external) {
>> -               struct userdiff_driver *drv;
>> -
>> +       if (o->flags.allow_external || !o->ignore_driver_algorithm)
>>                 drv = userdiff_find_by_path(o->repo->index, attr_path);
>> -               if (drv && drv->external)
>> -                       pgm = drv->external;
>> -       }
>> +
>> +       if (o->flags.allow_external && drv && drv->external)
>> +               pgm = drv->external;
>>
>>         if (msg) {
>>                 /*
>> @@ -4481,12 +4479,16 @@ static void run_diff_cmd(const char *pgm,
>>                 run_external_diff(pgm, name, other, one, two, xfrm_msg, o);
>>                 return;
>>         }
>> -       if (one && two)
>> +       if (one && two) {
>> +               if (drv && !o->ignore_driver_algorithm && drv->algorithm)
>> +                       set_diff_algorithm(o, drv->algorithm);
>> +
>>                 builtin_diff(name, other ? other : name,
>>                              one, two, xfrm_msg, must_show_header,
>>                              o, complete_rewrite);
>> -       else
>> +       } else {
>>                 fprintf(o->file, "* Unmerged path %s\n", name);
>> +       }
>>  }
>>
>>  static void diff_fill_oid_info(struct diff_filespec *one, struct index_state *istate)
>> @@ -4583,6 +4585,14 @@ static void run_diffstat(struct diff_filepair *p, struct diff_options *o,
>>         const char *name;
>>         const char *other;
>>
>> +       if (!o->ignore_driver_algorithm) {
>> +               struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
>> +
>> +               if (drv && drv->algorithm) {
>> +                       set_diff_algorithm(o, drv->algorithm);
>> +               }
>> +       }
>> +
>>         if (DIFF_PAIR_UNMERGED(p)) {
>>                 /* unmerged */
>>                 builtin_diffstat(p->one->path, NULL, NULL, NULL,
>> @@ -5130,6 +5140,8 @@ static int diff_opt_diff_algorithm(const struct option *opt,
>>                 return error(_("option diff-algorithm accepts \"myers\", "
>>                                "\"minimal\", \"patience\" and \"histogram\""));
>>
>> +       options->ignore_driver_algorithm = 1;
>> +
>>         return 0;
>>  }
>>
>> @@ -5145,6 +5157,8 @@ static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
>>                 BUG("available diff algorithms include \"myers\", "
>>                                "\"minimal\", \"patience\" and \"histogram\"");
>>
>> +       options->ignore_driver_algorithm = 1;
>> +
>>         return 0;
>>  }
>>
>> @@ -5285,6 +5299,7 @@ static int diff_opt_patience(const struct option *opt,
>>         for (i = 0; i < options->anchors_nr; i++)
>>                 free(options->anchors[i]);
>>         options->anchors_nr = 0;
>> +       options->ignore_driver_algorithm = 1;
>>
>>         return set_diff_algorithm(options, "patience");
>>  }
>> diff --git a/diff.h b/diff.h
>> index 41eb2c3d428..8d770b1d579 100644
>> --- a/diff.h
>> +++ b/diff.h
>> @@ -333,6 +333,7 @@ struct diff_options {
>>         int prefix_length;
>>         const char *stat_sep;
>>         int xdl_opts;
>> +       int ignore_driver_algorithm;
>>
>>         /* see Documentation/diff-options.txt */
>>         char **anchors;
>> diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
>> index 8d1e408bb58..2dc02bca873 100644
>> --- a/t/lib-diff-alternative.sh
>> +++ b/t/lib-diff-alternative.sh
>> @@ -105,10 +105,46 @@ index $file1..$file2 100644
>>   }
>>  EOF
>>
>> +       cat >expect_diffstat <<EOF
>> + file1 => file2 | 21 ++++++++++-----------
>> + 1 file changed, 10 insertions(+), 11 deletions(-)
>> +EOF
>> +
>>         STRATEGY=$1
>>
>> +       test_expect_success "$STRATEGY diff from attributes" '
>> +               echo "file* diff=driver" >.gitattributes &&
>> +               git config diff.driver.algorithm "$STRATEGY" &&
>> +               test_must_fail git diff --no-index file1 file2 > output &&
>> +               cat expect &&
>> +               cat output &&
>> +               test_cmp expect output
>> +       '
>> +
>> +       test_expect_success "$STRATEGY diff from attributes has valid diffstat" '
>> +               echo "file* diff=driver" >.gitattributes &&
>> +               git config diff.driver.algorithm "$STRATEGY" &&
>> +               test_must_fail git diff --stat --no-index file1 file2 > output &&
>> +               test_cmp expect_diffstat output
>> +       '
>> +
>>         test_expect_success "$STRATEGY diff" '
>> -               test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
>> +               test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>> +               test_cmp expect output
>> +       '
>> +
>> +       test_expect_success "$STRATEGY diff command line precedence before attributes" '
>> +               echo "file* diff=driver" >.gitattributes &&
>> +               git config diff.driver.algorithm meyers &&
>
> Is this misspelling of myers intentional?  I think with the typo, the
> code falls back to the default algorithm, which happens to be myers,
> so I think the test works either way, but were you intending to test
> fallback behavior in case of a typo here, or was that accidental?
>
>
>
>> +               test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
>> +               test_cmp expect output
>> +       '
>> +
>> +       test_expect_success "$STRATEGY diff attributes precedence before config" '
>> +               git config diff.algorithm default &&
>> +               echo "file* diff=driver" >.gitattributes &&
>> +               git config diff.driver.algorithm "$STRATEGY" &&
>> +               test_must_fail git diff --no-index file1 file2 > output &&
>>                 test_cmp expect output
>>         '
>>
>> diff --git a/userdiff.c b/userdiff.c
>> index d71b82feb74..ff25cfc4b4c 100644
>> --- a/userdiff.c
>> +++ b/userdiff.c
>> @@ -293,7 +293,7 @@ PATTERNS("scheme",
>>          "|([^][)(}{[ \t])+"),
>>  PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
>>          "\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
>> -{ "default", NULL, -1, { NULL, 0 } },
>> +{ "default", NULL, NULL, -1, { NULL, 0 } },
>>  };
>>  #undef PATTERNS
>>  #undef IPATTERN
>> @@ -394,6 +394,8 @@ int userdiff_config(const char *k, const char *v)
>>                 return parse_bool(&drv->textconv_want_cache, k, v);
>>         if (!strcmp(type, "wordregex"))
>>                 return git_config_string(&drv->word_regex, k, v);
>> +       if (!strcmp(type, "algorithm"))
>> +               return git_config_string(&drv->algorithm, k, v);
>>
>>         return 0;
>>  }
>> diff --git a/userdiff.h b/userdiff.h
>> index aee91bc77e6..24419db6973 100644
>> --- a/userdiff.h
>> +++ b/userdiff.h
>> @@ -14,6 +14,7 @@ struct userdiff_funcname {
>>  struct userdiff_driver {
>>         const char *name;
>>         const char *external;
>> +       const char *algorithm;
>>         int binary;
>>         struct userdiff_funcname funcname;
>>         const char *word_regex;
>> --
>> gitgitgadget

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-20 15:32         ` John Cai
@ 2023-02-20 16:21           ` Elijah Newren
  2023-02-20 16:49             ` John Cai
  0 siblings, 1 reply; 78+ messages in thread
From: Elijah Newren @ 2023-02-20 16:21 UTC (permalink / raw)
  To: John Cai
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King

On Mon, Feb 20, 2023 at 7:32 AM John Cai <johncai86@gmail.com> wrote:
[...]
> > I'm still curious if this should this also include warnings/caveats, such as:
> >   * The diff attribute specified in .gitattributes will be ignored in
> > a bare clone
> >   * The diff attribute specified in .gitattributes will be ignored if
> > it is only specified in another branch (e.g. on a branch "special-file
> > diff=patience" recorded in .gitattributes, then checkout master but
> > run `git log -1 -p $branch`)
> >   * When a file is renamed, the diff attribute for the pre-image name
> > is the only one the system pays attention to (thus adding "-R" can
> > flip which diff algorithm is run for the renamed file).
>
> I would be fine with adding that--though originally I was thinking that these
> can be inferred from the way that gitattributes are documented in [1]. Calling
> these out would make it more clear though, so I could go either way.
>
> >
> > Also, since I tested the three items above to verify they are valid
> > warnings, I'm a bit confused.  I thought your intent was to use this
> > server-side[1], so isn't the bare clone aspect a deal-breaker for your
> > intended usecase?
> >
> > [1] https://lore.kernel.org/git/7852AC7B-7A4E-4DD0-ADEA-CFFD5D16C595@gmail.com/
>
> yes, indeed. I was planning on adding bare repository support in a separate
> patch series, since the additions in [2] allows .gitattributes to be read from a
> bare repository.
>
> 1. https://git-scm.com/docs/gitattributes
> 2. https://lore.kernel.org/git/0ca8b2458921fc40269b0c43b5ec86eba77d6b54.1673684790.git.karthik.188@gmail.com/
>
> thanks!
> John

Oh, interesting, I didn't know about [2].  So, is the plan to take the
--source option from that series and add it to diff (perhaps with a
different name, since log tends to consume diff options and --source
is already taken)?

And do you expect to get the tree-ish from the two the users are
already specifying to diff?  If so, which one do you use (the two
commits being diffed might have differing .gitattributes files)?  If
not, what does that mean for users of e.g. the GitLab UI who have to
specify a third tree when diffing?

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-20 16:21           ` Elijah Newren
@ 2023-02-20 16:49             ` John Cai
  2023-02-20 17:32               ` Elijah Newren
  0 siblings, 1 reply; 78+ messages in thread
From: John Cai @ 2023-02-20 16:49 UTC (permalink / raw)
  To: Elijah Newren
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King

Hi Elijah,

On 20 Feb 2023, at 11:21, Elijah Newren wrote:

> On Mon, Feb 20, 2023 at 7:32 AM John Cai <johncai86@gmail.com> wrote:
> [...]
>>> I'm still curious if this should this also include warnings/caveats, such as:
>>>   * The diff attribute specified in .gitattributes will be ignored in
>>> a bare clone
>>>   * The diff attribute specified in .gitattributes will be ignored if
>>> it is only specified in another branch (e.g. on a branch "special-file
>>> diff=patience" recorded in .gitattributes, then checkout master but
>>> run `git log -1 -p $branch`)
>>>   * When a file is renamed, the diff attribute for the pre-image name
>>> is the only one the system pays attention to (thus adding "-R" can
>>> flip which diff algorithm is run for the renamed file).
>>
>> I would be fine with adding that--though originally I was thinking that these
>> can be inferred from the way that gitattributes are documented in [1]. Calling
>> these out would make it more clear though, so I could go either way.
>>
>>>
>>> Also, since I tested the three items above to verify they are valid
>>> warnings, I'm a bit confused.  I thought your intent was to use this
>>> server-side[1], so isn't the bare clone aspect a deal-breaker for your
>>> intended usecase?
>>>
>>> [1] https://lore.kernel.org/git/7852AC7B-7A4E-4DD0-ADEA-CFFD5D16C595@gmail.com/
>>
>> yes, indeed. I was planning on adding bare repository support in a separate
>> patch series, since the additions in [2] allows .gitattributes to be read from a
>> bare repository.
>>
>> 1. https://git-scm.com/docs/gitattributes
>> 2. https://lore.kernel.org/git/0ca8b2458921fc40269b0c43b5ec86eba77d6b54.1673684790.git.karthik.188@gmail.com/
>>
>> thanks!
>> John
>
> Oh, interesting, I didn't know about [2].  So, is the plan to take the
> --source option from that series and add it to diff (perhaps with a
> different name, since log tends to consume diff options and --source
> is already taken)?

Yep, that would be the general idea

>
> And do you expect to get the tree-ish from the two the users are
> already specifying to diff?  If so, which one do you use (the two
> commits being diffed might have differing .gitattributes files)?  If
> not, what does that mean for users of e.g. the GitLab UI who have to
> specify a third tree when diffing?

Good question! Since it seems that when `git-diff(1)` considers diff.<driver>,
it goes with the path of the first one. (might need some confirmation here)

in diff.c:


static void run_diff(struct diff_filepair *p, struct diff_options *o)
{
	const char *pgm = external_diff();
	struct strbuf msg;
	struct diff_filespec *one = p->one;
	struct diff_filespec *two = p->two;
	const char *name;
	const char *other;
	const char *attr_path;

	name  = one->path;
	other = (strcmp(name, two->path) ? two->path : NULL);
	attr_path = name;
	if (o->prefix_length)

I was thinking we would just use the tree-ish of the first one

thanks
John

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-20 16:49             ` John Cai
@ 2023-02-20 17:32               ` Elijah Newren
  2023-02-20 20:53                 ` John Cai
  2023-02-22 19:47                 ` Jeff King
  0 siblings, 2 replies; 78+ messages in thread
From: Elijah Newren @ 2023-02-20 17:32 UTC (permalink / raw)
  To: John Cai
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King

On Mon, Feb 20, 2023 at 8:49 AM John Cai <johncai86@gmail.com> wrote:
>
> Hi Elijah,
>
> On 20 Feb 2023, at 11:21, Elijah Newren wrote:
>
> > On Mon, Feb 20, 2023 at 7:32 AM John Cai <johncai86@gmail.com> wrote:
> > [...]
> >>> I'm still curious if this should this also include warnings/caveats, such as:
> >>>   * The diff attribute specified in .gitattributes will be ignored in
> >>> a bare clone
> >>>   * The diff attribute specified in .gitattributes will be ignored if
> >>> it is only specified in another branch (e.g. on a branch "special-file
> >>> diff=patience" recorded in .gitattributes, then checkout master but
> >>> run `git log -1 -p $branch`)
> >>>   * When a file is renamed, the diff attribute for the pre-image name
> >>> is the only one the system pays attention to (thus adding "-R" can
> >>> flip which diff algorithm is run for the renamed file).
> >>
> >> I would be fine with adding that--though originally I was thinking that these
> >> can be inferred from the way that gitattributes are documented in [1]. Calling
> >> these out would make it more clear though, so I could go either way.
> >>
> >>>
> >>> Also, since I tested the three items above to verify they are valid
> >>> warnings, I'm a bit confused.  I thought your intent was to use this
> >>> server-side[1], so isn't the bare clone aspect a deal-breaker for your
> >>> intended usecase?
> >>>
> >>> [1] https://lore.kernel.org/git/7852AC7B-7A4E-4DD0-ADEA-CFFD5D16C595@gmail.com/
> >>
> >> yes, indeed. I was planning on adding bare repository support in a separate
> >> patch series, since the additions in [2] allows .gitattributes to be read from a
> >> bare repository.
> >>
> >> 1. https://git-scm.com/docs/gitattributes
> >> 2. https://lore.kernel.org/git/0ca8b2458921fc40269b0c43b5ec86eba77d6b54.1673684790.git.karthik.188@gmail.com/
> >>
> >> thanks!
> >> John
> >
> > Oh, interesting, I didn't know about [2].  So, is the plan to take the
> > --source option from that series and add it to diff (perhaps with a
> > different name, since log tends to consume diff options and --source
> > is already taken)?
>
> Yep, that would be the general idea
>
> >
> > And do you expect to get the tree-ish from the two the users are
> > already specifying to diff?  If so, which one do you use (the two
> > commits being diffed might have differing .gitattributes files)?  If
> > not, what does that mean for users of e.g. the GitLab UI who have to
> > specify a third tree when diffing?
>
> Good question! Since it seems that when `git-diff(1)` considers diff.<driver>,
> it goes with the path of the first one. (might need some confirmation here)
>
> in diff.c:
>
>
> static void run_diff(struct diff_filepair *p, struct diff_options *o)
> {
>         const char *pgm = external_diff();
>         struct strbuf msg;
>         struct diff_filespec *one = p->one;
>         struct diff_filespec *two = p->two;
>         const char *name;
>         const char *other;
>         const char *attr_path;
>
>         name  = one->path;
>         other = (strcmp(name, two->path) ? two->path : NULL);
>         attr_path = name;
>         if (o->prefix_length)
>
> I was thinking we would just use the tree-ish of the first one

That would certainly simplify, but it'd be pretty important to
document.  (Incidentally, this kind of decision was my reason for
asking about all those special cases earlier, i.e. how to handle diff
between different commits, how to handle renames, how to handle bare
repositories, etc.)

This kind of decision probably also means you'd need a variety of
testcases where .gitattributes is different in every commit & the
index & the working tree, and then you start testing several of the
possible pairings to make sure the right .gitattributes file is used
(e.g. (commit, commit), (commit, index), (index, commit), (worktree,
index), etc.)

However, I'm curious again.  You brought this up because you want to
use it in GitLab, yet configuration of using this option as it appears
in this series requires changing _both_ .gitattributes and
.git/config.  How will users of the GitLab UI do the configuration
necessary for the git-config side to take effect?

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-20 17:32               ` Elijah Newren
@ 2023-02-20 20:53                 ` John Cai
  2023-02-22 19:47                 ` Jeff King
  1 sibling, 0 replies; 78+ messages in thread
From: John Cai @ 2023-02-20 20:53 UTC (permalink / raw)
  To: Elijah Newren
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King

Hi Elijah,

On 20 Feb 2023, at 12:32, Elijah Newren wrote:

> On Mon, Feb 20, 2023 at 8:49 AM John Cai <johncai86@gmail.com> wrote:
>>
>> Hi Elijah,
>>
>> On 20 Feb 2023, at 11:21, Elijah Newren wrote:
>>
>>> On Mon, Feb 20, 2023 at 7:32 AM John Cai <johncai86@gmail.com> wrote:
>>> [...]
>>>>> I'm still curious if this should this also include warnings/caveats, such as:
>>>>>   * The diff attribute specified in .gitattributes will be ignored in
>>>>> a bare clone
>>>>>   * The diff attribute specified in .gitattributes will be ignored if
>>>>> it is only specified in another branch (e.g. on a branch "special-file
>>>>> diff=patience" recorded in .gitattributes, then checkout master but
>>>>> run `git log -1 -p $branch`)
>>>>>   * When a file is renamed, the diff attribute for the pre-image name
>>>>> is the only one the system pays attention to (thus adding "-R" can
>>>>> flip which diff algorithm is run for the renamed file).
>>>>
>>>> I would be fine with adding that--though originally I was thinking that these
>>>> can be inferred from the way that gitattributes are documented in [1]. Calling
>>>> these out would make it more clear though, so I could go either way.
>>>>
>>>>>
>>>>> Also, since I tested the three items above to verify they are valid
>>>>> warnings, I'm a bit confused.  I thought your intent was to use this
>>>>> server-side[1], so isn't the bare clone aspect a deal-breaker for your
>>>>> intended usecase?
>>>>>
>>>>> [1] https://lore.kernel.org/git/7852AC7B-7A4E-4DD0-ADEA-CFFD5D16C595@gmail.com/
>>>>
>>>> yes, indeed. I was planning on adding bare repository support in a separate
>>>> patch series, since the additions in [2] allows .gitattributes to be read from a
>>>> bare repository.
>>>>
>>>> 1. https://git-scm.com/docs/gitattributes
>>>> 2. https://lore.kernel.org/git/0ca8b2458921fc40269b0c43b5ec86eba77d6b54.1673684790.git.karthik.188@gmail.com/
>>>>
>>>> thanks!
>>>> John
>>>
>>> Oh, interesting, I didn't know about [2].  So, is the plan to take the
>>> --source option from that series and add it to diff (perhaps with a
>>> different name, since log tends to consume diff options and --source
>>> is already taken)?
>>
>> Yep, that would be the general idea
>>
>>>
>>> And do you expect to get the tree-ish from the two the users are
>>> already specifying to diff?  If so, which one do you use (the two
>>> commits being diffed might have differing .gitattributes files)?  If
>>> not, what does that mean for users of e.g. the GitLab UI who have to
>>> specify a third tree when diffing?
>>
>> Good question! Since it seems that when `git-diff(1)` considers diff.<driver>,
>> it goes with the path of the first one. (might need some confirmation here)
>>
>> in diff.c:
>>
>>
>> static void run_diff(struct diff_filepair *p, struct diff_options *o)
>> {
>>         const char *pgm = external_diff();
>>         struct strbuf msg;
>>         struct diff_filespec *one = p->one;
>>         struct diff_filespec *two = p->two;
>>         const char *name;
>>         const char *other;
>>         const char *attr_path;
>>
>>         name  = one->path;
>>         other = (strcmp(name, two->path) ? two->path : NULL);
>>         attr_path = name;
>>         if (o->prefix_length)
>>
>> I was thinking we would just use the tree-ish of the first one
>
> That would certainly simplify, but it'd be pretty important to
> document.  (Incidentally, this kind of decision was my reason for
> asking about all those special cases earlier, i.e. how to handle diff
> between different commits, how to handle renames, how to handle bare
> repositories, etc.)

Good point--that would be good to document.

>
> This kind of decision probably also means you'd need a variety of
> testcases where .gitattributes is different in every commit & the
> index & the working tree, and then you start testing several of the
> possible pairings to make sure the right .gitattributes file is used
> (e.g. (commit, commit), (commit, index), (index, commit), (worktree,
> index), etc.)
>
> However, I'm curious again.  You brought this up because you want to
> use it in GitLab, yet configuration of using this option as it appears
> in this series requires changing _both_ .gitattributes and
> .git/config.  How will users of the GitLab UI do the configuration
> necessary for the git-config side to take effect?

Good question--we would likely need to add some pre-baked configuration server
side with a driver config that users could then tap into with their
gitattributes files.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* [PATCH v4 0/2] Teach diff to honor diff algorithms set through git attributes
  2023-02-17 20:21   ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
                       ` (2 preceding siblings ...)
  2023-02-18  1:16     ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes Elijah Newren
@ 2023-02-20 21:04     ` John Cai via GitGitGadget
  2023-02-20 21:04       ` [PATCH v4 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
                         ` (2 more replies)
  3 siblings, 3 replies; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-20 21:04 UTC (permalink / raw)
  To: git
  Cc: Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai

When a repository contains different kinds of files, it may be desirable to
use different algorithms based on file type. This is currently not feasible
through the command line or using git configs. However, we can leverage the
fact that gitattributes are path aware.

Teach the diff machinery to check gitattributes when diffing files by using
the existing diff. scheme, and add an "algorithm" type to the external
driver config.

Change since V3:

 * cleaned up documentation, typos
 * minor cleanup such as if statement ordering, and overly long lines

Changes since V2:

 * minor clean up and variable renaming
 * avoid parsing attribute files for the driver if the diff algorithm is set
   through the command line

Changes since V1:

 * utilize the existing diff.<driver>.* scheme where the driver is defined
   in gitattributes, but the algorithm is defined in the gitconfig.

To address some of the performance concerns in the previous series, a
benchmark shows that now only a minor performance penalty is incurred, now
that we are no longer adding an additional attributes parsing call:

$ echo "*.[ch] diff=other" >> .gitattributes $ hyperfine -r 10 -L a
git-bin-wrapper,git '{a} -c diff.other.algorithm=myers diff v2.0.0 v2.28.0'
Benchmark 1: git-bin-wrapper -c diff.other.algorithm=myers diff v2.0.0
v2.28.0 Time (mean ± σ): 716.3 ms ± 3.8 ms [User: 660.2 ms, System: 50.8 ms]
Range (min … max): 709.8 ms … 720.6 ms 10 runs

Benchmark 2: git -c diff.other.algorithm=myers diff v2.0.0 v2.28.0 Time
(mean ± σ): 704.3 ms ± 2.9 ms [User: 656.6 ms, System: 44.3 ms] Range (min …
max): 700.1 ms … 708.6 ms 10 runs

Summary 'git -c diff.other.algorithm=myers diff v2.0.0 v2.28.0' ran 1.02 ±
0.01 times faster than 'git-bin-wrapper -c diff.other.algorithm=myers diff
v2.0.0 v2.28.0'

John Cai (2):
  diff: consolidate diff algorithm option parsing
  diff: teach diff to read algorithm from diff driver

 Documentation/gitattributes.txt | 31 ++++++++++++
 diff.c                          | 90 ++++++++++++++++++++++++---------
 diff.h                          |  1 +
 t/lib-diff-alternative.sh       | 38 +++++++++++++-
 userdiff.c                      |  4 +-
 userdiff.h                      |  1 +
 6 files changed, 140 insertions(+), 25 deletions(-)


base-commit: c867e4fa180bec4750e9b54eb10f459030dbebfd
Published-As: https://github.com/gitgitgadget/git/releases/tag/pr-git-1452%2Fjohn-cai%2Fjc%2Fattr-diff-algo-v4
Fetch-It-Via: git fetch https://github.com/gitgitgadget/git pr-git-1452/john-cai/jc/attr-diff-algo-v4
Pull-Request: https://github.com/git/git/pull/1452

Range-diff vs v3:

 1:  816c47aa414 = 1:  816c47aa414 diff: consolidate diff algorithm option parsing
 2:  b330222ce83 ! 2:  77e66ab98fc diff: teach diff to read algorithm from diff driver
     @@ Commit message
          finally the diff.algorithm config.
      
          To enforce precedence order, use a new `ignore_driver_algorithm` member
     -    during options pasing to indicate the diff algorithm was set via command
     +    during options parsing to indicate the diff algorithm was set via command
          line args.
      
          Signed-off-by: John Cai <johncai86@gmail.com>
     @@ Documentation/gitattributes.txt: with the above configuration, i.e. `j-c-diff`,
      +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
      +
      +The diff algorithm can be set through the `diff.algorithm` config key, but
     -+sometimes it may be helpful to set the diff algorithm by path. For example, one
     -+might wish to set a diff algorithm automatically for all `.json` files such that
     -+the user would not need to pass in a separate command line `--diff-algorithm`
     -+flag each time.
     ++sometimes it may be helpful to set the diff algorithm per path. For example,
     ++one may want to use the `minimal` diff algorithm for .json files, and the
     ++`histogram` for .c files, and so on without having to pass in the algorithm
     ++through the command line each time.
      +
      +First, in `.gitattributes`, assign the `diff` attribute for paths.
      +
     @@ Documentation/gitattributes.txt: with the above configuration, i.e. `j-c-diff`,
      +------------------------
      +
      +Then, define a "diff.<name>.algorithm" configuration to specify the diff
     -+algorithm, choosing from `meyers`, `patience`, `minimal`, or `histogram`.
     ++algorithm, choosing from `myers`, `patience`, `minimal`, or `histogram`.
      +
      +----------------------------------------------------------------
      +[diff "<name>"]
     @@ Documentation/gitattributes.txt: with the above configuration, i.e. `j-c-diff`,
      +git-show(1) and is used for the `--stat` output as well. The merge machinery
      +will not use the diff algorithm set through this method.
      +
     -+NOTE: If the `command` key also exists, then Git will treat this as an external
     -+diff and attempt to use the value set for `command` as an external program. For
     -+instance, the following config, combined with the above `.gitattributes` file,
     -+will result in `command` favored over `algorithm`.
     -+
     -+----------------------------------------------------------------
     -+[diff "<name>"]
     -+  command = j-c-diff
     -+  algorithm = histogram
     -+----------------------------------------------------------------
     ++NOTE: If `diff.<name>.command` is defined for path with the
     ++`diff=<name>` attribute, it is executed as an external diff driver
     ++(see above), and adding `diff.<name>.algorithm` has no effect, as the
     ++algorithm is not passed to the external diff driver.
       
       Defining a custom hunk-header
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     @@ diff.c: static void run_diff_cmd(const char *pgm,
       	}
      -	if (one && two)
      +	if (one && two) {
     -+		if (drv && !o->ignore_driver_algorithm && drv->algorithm)
     ++		if (!o->ignore_driver_algorithm && drv && drv->algorithm)
      +			set_diff_algorithm(o, drv->algorithm);
      +
       		builtin_diff(name, other ? other : name,
     @@ diff.c: static void run_diffstat(struct diff_filepair *p, struct diff_options *o
       	const char *other;
       
      +	if (!o->ignore_driver_algorithm) {
     -+		struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index, p->one->path);
     ++		struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index,
     ++								    p->one->path);
      +
     -+		if (drv && drv->algorithm) {
     ++		if (drv && drv->algorithm)
      +			set_diff_algorithm(o, drv->algorithm);
     -+		}
      +	}
      +
       	if (DIFF_PAIR_UNMERGED(p)) {
     @@ t/lib-diff-alternative.sh: index $file1..$file2 100644
      +
      +	test_expect_success "$STRATEGY diff command line precedence before attributes" '
      +		echo "file* diff=driver" >.gitattributes &&
     -+		git config diff.driver.algorithm meyers &&
     ++		git config diff.driver.algorithm myers &&
      +		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
      +		test_cmp expect output
      +	'

-- 
gitgitgadget

^ permalink raw reply	[flat|nested] 78+ messages in thread

* [PATCH v4 1/2] diff: consolidate diff algorithm option parsing
  2023-02-20 21:04     ` [PATCH v4 " John Cai via GitGitGadget
@ 2023-02-20 21:04       ` John Cai via GitGitGadget
  2023-02-20 21:04       ` [PATCH v4 2/2] diff: teach diff to read algorithm from diff driver John Cai via GitGitGadget
  2023-02-21 17:34       ` [PATCH v4 0/2] Teach diff to honor diff algorithms set through git attributes Junio C Hamano
  2 siblings, 0 replies; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-20 21:04 UTC (permalink / raw)
  To: git
  Cc: Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai, John Cai

From: John Cai <johncai86@gmail.com>

A subsequent commit will need the ability to tell if the diff algorithm
was set through the command line through setting a new member of
diff_options. While this logic can be added to the
diff_opt_diff_algorithm() callback, the `--minimal` and `--histogram`
options are handled via OPT_BIT without a callback.

Remedy this by consolidating the options parsing logic for --minimal and
--histogram into one callback. This way we can modify `diff_options` in
that function.

As an additional refactor, the logic that sets the diff algorithm in
diff_opt_diff_algorithm() can be refactored into a helper that will
allow multiple callsites to set the diff algorithm.

Signed-off-by: John Cai <johncai86@gmail.com>
---
 diff.c | 57 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/diff.c b/diff.c
index 329eebf16a0..5efc22ca06b 100644
--- a/diff.c
+++ b/diff.c
@@ -3437,6 +3437,22 @@ static int diff_filepair_is_phoney(struct diff_filespec *one,
 	return !DIFF_FILE_VALID(one) && !DIFF_FILE_VALID(two);
 }
 
+static int set_diff_algorithm(struct diff_options *opts,
+			      const char *alg)
+{
+	long value = parse_algorithm_value(alg);
+
+	if (value < 0)
+		return -1;
+
+	/* clear out previous settings */
+	DIFF_XDL_CLR(opts, NEED_MINIMAL);
+	opts->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
+	opts->xdl_opts |= value;
+
+	return 0;
+}
+
 static void builtin_diff(const char *name_a,
 			 const char *name_b,
 			 struct diff_filespec *one,
@@ -5107,17 +5123,28 @@ static int diff_opt_diff_algorithm(const struct option *opt,
 				   const char *arg, int unset)
 {
 	struct diff_options *options = opt->value;
-	long value = parse_algorithm_value(arg);
 
 	BUG_ON_OPT_NEG(unset);
-	if (value < 0)
+
+	if (set_diff_algorithm(options, arg))
 		return error(_("option diff-algorithm accepts \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\""));
 
-	/* clear out previous settings */
-	DIFF_XDL_CLR(options, NEED_MINIMAL);
-	options->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
-	options->xdl_opts |= value;
+	return 0;
+}
+
+static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
+				   const char *arg, int unset)
+{
+	struct diff_options *options = opt->value;
+
+	BUG_ON_OPT_NEG(unset);
+	BUG_ON_OPT_ARG(arg);
+
+	if (set_diff_algorithm(options, opt->long_name))
+		BUG("available diff algorithms include \"myers\", "
+			       "\"minimal\", \"patience\" and \"histogram\"");
+
 	return 0;
 }
 
@@ -5250,7 +5277,6 @@ static int diff_opt_patience(const struct option *opt,
 
 	BUG_ON_OPT_NEG(unset);
 	BUG_ON_OPT_ARG(arg);
-	options->xdl_opts = DIFF_WITH_ALG(options, PATIENCE_DIFF);
 	/*
 	 * Both --patience and --anchored use PATIENCE_DIFF
 	 * internally, so remove any anchors previously
@@ -5259,7 +5285,8 @@ static int diff_opt_patience(const struct option *opt,
 	for (i = 0; i < options->anchors_nr; i++)
 		free(options->anchors[i]);
 	options->anchors_nr = 0;
-	return 0;
+
+	return set_diff_algorithm(options, "patience");
 }
 
 static int diff_opt_ignore_regex(const struct option *opt,
@@ -5562,9 +5589,10 @@ struct option *add_diff_options(const struct option *opts,
 			    N_("prevent rename/copy detection if the number of rename/copy targets exceeds given limit")),
 
 		OPT_GROUP(N_("Diff algorithm options")),
-		OPT_BIT(0, "minimal", &options->xdl_opts,
-			N_("produce the smallest possible diff"),
-			XDF_NEED_MINIMAL),
+		OPT_CALLBACK_F(0, "minimal", options, NULL,
+			       N_("produce the smallest possible diff"),
+			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
+			       diff_opt_diff_algorithm_no_arg),
 		OPT_BIT_F('w', "ignore-all-space", &options->xdl_opts,
 			  N_("ignore whitespace when comparing lines"),
 			  XDF_IGNORE_WHITESPACE, PARSE_OPT_NONEG),
@@ -5590,9 +5618,10 @@ struct option *add_diff_options(const struct option *opts,
 			       N_("generate diff using the \"patience diff\" algorithm"),
 			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
 			       diff_opt_patience),
-		OPT_BITOP(0, "histogram", &options->xdl_opts,
-			  N_("generate diff using the \"histogram diff\" algorithm"),
-			  XDF_HISTOGRAM_DIFF, XDF_DIFF_ALGORITHM_MASK),
+		OPT_CALLBACK_F(0, "histogram", options, NULL,
+			       N_("generate diff using the \"histogram diff\" algorithm"),
+			       PARSE_OPT_NONEG | PARSE_OPT_NOARG,
+			       diff_opt_diff_algorithm_no_arg),
 		OPT_CALLBACK_F(0, "diff-algorithm", options, N_("<algorithm>"),
 			       N_("choose a diff algorithm"),
 			       PARSE_OPT_NONEG, diff_opt_diff_algorithm),
-- 
gitgitgadget


^ permalink raw reply related	[flat|nested] 78+ messages in thread

* [PATCH v4 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-20 21:04     ` [PATCH v4 " John Cai via GitGitGadget
  2023-02-20 21:04       ` [PATCH v4 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
@ 2023-02-20 21:04       ` John Cai via GitGitGadget
  2023-02-21 17:34       ` [PATCH v4 0/2] Teach diff to honor diff algorithms set through git attributes Junio C Hamano
  2 siblings, 0 replies; 78+ messages in thread
From: John Cai via GitGitGadget @ 2023-02-20 21:04 UTC (permalink / raw)
  To: git
  Cc: Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai, John Cai

From: John Cai <johncai86@gmail.com>

It can be useful to specify diff algorithms per file type. For example,
one may want to use the minimal diff algorithm for .json files, another
for .c files, etc.

The diff machinery already checks attributes for a diff driver. Teach
the diff driver parser a new type "algorithm" to look for in the
config, which will be used if a driver has been specified through the
attributes.

Enforce precedence of the diff algorithm by favoring the command line
option, then looking at the driver attributes & config combination, then
finally the diff.algorithm config.

To enforce precedence order, use a new `ignore_driver_algorithm` member
during options parsing to indicate the diff algorithm was set via command
line args.

Signed-off-by: John Cai <johncai86@gmail.com>
---
 Documentation/gitattributes.txt | 31 +++++++++++++++++++++++++++
 diff.c                          | 33 ++++++++++++++++++++--------
 diff.h                          |  1 +
 t/lib-diff-alternative.sh       | 38 ++++++++++++++++++++++++++++++++-
 userdiff.c                      |  4 +++-
 userdiff.h                      |  1 +
 6 files changed, 97 insertions(+), 11 deletions(-)

diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt
index c19e64ea0ef..39bfbca1ffe 100644
--- a/Documentation/gitattributes.txt
+++ b/Documentation/gitattributes.txt
@@ -758,6 +758,37 @@ with the above configuration, i.e. `j-c-diff`, with 7
 parameters, just like `GIT_EXTERNAL_DIFF` program is called.
 See linkgit:git[1] for details.
 
+Setting the internal diff algorithm
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The diff algorithm can be set through the `diff.algorithm` config key, but
+sometimes it may be helpful to set the diff algorithm per path. For example,
+one may want to use the `minimal` diff algorithm for .json files, and the
+`histogram` for .c files, and so on without having to pass in the algorithm
+through the command line each time.
+
+First, in `.gitattributes`, assign the `diff` attribute for paths.
+
+------------------------
+*.json diff=<name>
+------------------------
+
+Then, define a "diff.<name>.algorithm" configuration to specify the diff
+algorithm, choosing from `myers`, `patience`, `minimal`, or `histogram`.
+
+----------------------------------------------------------------
+[diff "<name>"]
+  algorithm = histogram
+----------------------------------------------------------------
+
+This diff algorithm applies to user facing diff output like git-diff(1),
+git-show(1) and is used for the `--stat` output as well. The merge machinery
+will not use the diff algorithm set through this method.
+
+NOTE: If `diff.<name>.command` is defined for path with the
+`diff=<name>` attribute, it is executed as an external diff driver
+(see above), and adding `diff.<name>.algorithm` has no effect, as the
+algorithm is not passed to the external diff driver.
 
 Defining a custom hunk-header
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/diff.c b/diff.c
index 5efc22ca06b..469e18aed20 100644
--- a/diff.c
+++ b/diff.c
@@ -4456,15 +4456,13 @@ static void run_diff_cmd(const char *pgm,
 	const char *xfrm_msg = NULL;
 	int complete_rewrite = (p->status == DIFF_STATUS_MODIFIED) && p->score;
 	int must_show_header = 0;
+	struct userdiff_driver *drv = NULL;
 
-
-	if (o->flags.allow_external) {
-		struct userdiff_driver *drv;
-
+	if (o->flags.allow_external || !o->ignore_driver_algorithm)
 		drv = userdiff_find_by_path(o->repo->index, attr_path);
-		if (drv && drv->external)
-			pgm = drv->external;
-	}
+
+	if (o->flags.allow_external && drv && drv->external)
+		pgm = drv->external;
 
 	if (msg) {
 		/*
@@ -4481,12 +4479,16 @@ static void run_diff_cmd(const char *pgm,
 		run_external_diff(pgm, name, other, one, two, xfrm_msg, o);
 		return;
 	}
-	if (one && two)
+	if (one && two) {
+		if (!o->ignore_driver_algorithm && drv && drv->algorithm)
+			set_diff_algorithm(o, drv->algorithm);
+
 		builtin_diff(name, other ? other : name,
 			     one, two, xfrm_msg, must_show_header,
 			     o, complete_rewrite);
-	else
+	} else {
 		fprintf(o->file, "* Unmerged path %s\n", name);
+	}
 }
 
 static void diff_fill_oid_info(struct diff_filespec *one, struct index_state *istate)
@@ -4583,6 +4585,14 @@ static void run_diffstat(struct diff_filepair *p, struct diff_options *o,
 	const char *name;
 	const char *other;
 
+	if (!o->ignore_driver_algorithm) {
+		struct userdiff_driver *drv = userdiff_find_by_path(o->repo->index,
+								    p->one->path);
+
+		if (drv && drv->algorithm)
+			set_diff_algorithm(o, drv->algorithm);
+	}
+
 	if (DIFF_PAIR_UNMERGED(p)) {
 		/* unmerged */
 		builtin_diffstat(p->one->path, NULL, NULL, NULL,
@@ -5130,6 +5140,8 @@ static int diff_opt_diff_algorithm(const struct option *opt,
 		return error(_("option diff-algorithm accepts \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\""));
 
+	options->ignore_driver_algorithm = 1;
+
 	return 0;
 }
 
@@ -5145,6 +5157,8 @@ static int diff_opt_diff_algorithm_no_arg(const struct option *opt,
 		BUG("available diff algorithms include \"myers\", "
 			       "\"minimal\", \"patience\" and \"histogram\"");
 
+	options->ignore_driver_algorithm = 1;
+
 	return 0;
 }
 
@@ -5285,6 +5299,7 @@ static int diff_opt_patience(const struct option *opt,
 	for (i = 0; i < options->anchors_nr; i++)
 		free(options->anchors[i]);
 	options->anchors_nr = 0;
+	options->ignore_driver_algorithm = 1;
 
 	return set_diff_algorithm(options, "patience");
 }
diff --git a/diff.h b/diff.h
index 41eb2c3d428..8d770b1d579 100644
--- a/diff.h
+++ b/diff.h
@@ -333,6 +333,7 @@ struct diff_options {
 	int prefix_length;
 	const char *stat_sep;
 	int xdl_opts;
+	int ignore_driver_algorithm;
 
 	/* see Documentation/diff-options.txt */
 	char **anchors;
diff --git a/t/lib-diff-alternative.sh b/t/lib-diff-alternative.sh
index 8d1e408bb58..a8f5d3274a5 100644
--- a/t/lib-diff-alternative.sh
+++ b/t/lib-diff-alternative.sh
@@ -105,10 +105,46 @@ index $file1..$file2 100644
  }
 EOF
 
+	cat >expect_diffstat <<EOF
+ file1 => file2 | 21 ++++++++++-----------
+ 1 file changed, 10 insertions(+), 11 deletions(-)
+EOF
+
 	STRATEGY=$1
 
+	test_expect_success "$STRATEGY diff from attributes" '
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm "$STRATEGY" &&
+		test_must_fail git diff --no-index file1 file2 > output &&
+		cat expect &&
+		cat output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff from attributes has valid diffstat" '
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm "$STRATEGY" &&
+		test_must_fail git diff --stat --no-index file1 file2 > output &&
+		test_cmp expect_diffstat output
+	'
+
 	test_expect_success "$STRATEGY diff" '
-		test_must_fail git diff --no-index "--$STRATEGY" file1 file2 > output &&
+		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff command line precedence before attributes" '
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm myers &&
+		test_must_fail git diff --no-index "--diff-algorithm=$STRATEGY" file1 file2 > output &&
+		test_cmp expect output
+	'
+
+	test_expect_success "$STRATEGY diff attributes precedence before config" '
+		git config diff.algorithm default &&
+		echo "file* diff=driver" >.gitattributes &&
+		git config diff.driver.algorithm "$STRATEGY" &&
+		test_must_fail git diff --no-index file1 file2 > output &&
 		test_cmp expect output
 	'
 
diff --git a/userdiff.c b/userdiff.c
index d71b82feb74..ff25cfc4b4c 100644
--- a/userdiff.c
+++ b/userdiff.c
@@ -293,7 +293,7 @@ PATTERNS("scheme",
 	 "|([^][)(}{[ \t])+"),
 PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
 	 "\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
-{ "default", NULL, -1, { NULL, 0 } },
+{ "default", NULL, NULL, -1, { NULL, 0 } },
 };
 #undef PATTERNS
 #undef IPATTERN
@@ -394,6 +394,8 @@ int userdiff_config(const char *k, const char *v)
 		return parse_bool(&drv->textconv_want_cache, k, v);
 	if (!strcmp(type, "wordregex"))
 		return git_config_string(&drv->word_regex, k, v);
+	if (!strcmp(type, "algorithm"))
+		return git_config_string(&drv->algorithm, k, v);
 
 	return 0;
 }
diff --git a/userdiff.h b/userdiff.h
index aee91bc77e6..24419db6973 100644
--- a/userdiff.h
+++ b/userdiff.h
@@ -14,6 +14,7 @@ struct userdiff_funcname {
 struct userdiff_driver {
 	const char *name;
 	const char *external;
+	const char *algorithm;
 	int binary;
 	struct userdiff_funcname funcname;
 	const char *word_regex;
-- 
gitgitgadget

^ permalink raw reply related	[flat|nested] 78+ messages in thread

* Re: [PATCH v4 0/2] Teach diff to honor diff algorithms set through git attributes
  2023-02-20 21:04     ` [PATCH v4 " John Cai via GitGitGadget
  2023-02-20 21:04       ` [PATCH v4 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
  2023-02-20 21:04       ` [PATCH v4 2/2] diff: teach diff to read algorithm from diff driver John Cai via GitGitGadget
@ 2023-02-21 17:34       ` Junio C Hamano
  2023-02-21 18:05         ` Elijah Newren
  2 siblings, 1 reply; 78+ messages in thread
From: Junio C Hamano @ 2023-02-21 17:34 UTC (permalink / raw)
  To: John Cai via GitGitGadget
  Cc: git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, Elijah Newren,
	John Cai

"John Cai via GitGitGadget" <gitgitgadget@gmail.com> writes:

> When a repository contains different kinds of files, it may be desirable to
> use different algorithms based on file type. This is currently not feasible
> through the command line or using git configs. However, we can leverage the
> fact that gitattributes are path aware.
> ...
> To address some of the performance concerns in the previous series, a
> benchmark shows that now only a minor performance penalty is incurred, now
> that we are no longer adding an additional attributes parsing call:
>
> $ echo "*.[ch] diff=other" >> .gitattributes $ hyperfine -r 10 -L a
> git-bin-wrapper,git '{a} -c diff.other.algorithm=myers diff v2.0.0 v2.28.0'
> Benchmark 1: git-bin-wrapper -c diff.other.algorithm=myers diff v2.0.0
> v2.28.0 Time (mean ± σ): 716.3 ms ± 3.8 ms [User: 660.2 ms, System: 50.8 ms]
> Range (min … max): 709.8 ms … 720.6 ms 10 runs
>
> Benchmark 2: git -c diff.other.algorithm=myers diff v2.0.0 v2.28.0 Time
> (mean ± σ): 704.3 ms ± 2.9 ms [User: 656.6 ms, System: 44.3 ms] Range (min …
> max): 700.1 ms … 708.6 ms 10 runs
>
> Summary 'git -c diff.other.algorithm=myers diff v2.0.0 v2.28.0' ran 1.02 ±
> 0.01 times faster than 'git-bin-wrapper -c diff.other.algorithm=myers diff
> v2.0.0 v2.28.0'

Hopefully this round can immediately be merged down to 'next'?
Thanks.


^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v4 0/2] Teach diff to honor diff algorithms set through git attributes
  2023-02-21 17:34       ` [PATCH v4 0/2] Teach diff to honor diff algorithms set through git attributes Junio C Hamano
@ 2023-02-21 18:05         ` Elijah Newren
  2023-02-21 18:51           ` Junio C Hamano
  0 siblings, 1 reply; 78+ messages in thread
From: Elijah Newren @ 2023-02-21 18:05 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, John Cai

On Tue, Feb 21, 2023 at 9:34 AM Junio C Hamano <gitster@pobox.com> wrote:
>
> "John Cai via GitGitGadget" <gitgitgadget@gmail.com> writes:
>
> > When a repository contains different kinds of files, it may be desirable to
> > use different algorithms based on file type. This is currently not feasible
> > through the command line or using git configs. However, we can leverage the
> > fact that gitattributes are path aware.
> > ...
> > To address some of the performance concerns in the previous series, a
> > benchmark shows that now only a minor performance penalty is incurred, now
> > that we are no longer adding an additional attributes parsing call:
> >
> > $ echo "*.[ch] diff=other" >> .gitattributes $ hyperfine -r 10 -L a
> > git-bin-wrapper,git '{a} -c diff.other.algorithm=myers diff v2.0.0 v2.28.0'
> > Benchmark 1: git-bin-wrapper -c diff.other.algorithm=myers diff v2.0.0
> > v2.28.0 Time (mean ± σ): 716.3 ms ± 3.8 ms [User: 660.2 ms, System: 50.8 ms]
> > Range (min … max): 709.8 ms … 720.6 ms 10 runs
> >
> > Benchmark 2: git -c diff.other.algorithm=myers diff v2.0.0 v2.28.0 Time
> > (mean ± σ): 704.3 ms ± 2.9 ms [User: 656.6 ms, System: 44.3 ms] Range (min …
> > max): 700.1 ms … 708.6 ms 10 runs
> >
> > Summary 'git -c diff.other.algorithm=myers diff v2.0.0 v2.28.0' ran 1.02 ±
> > 0.01 times faster than 'git-bin-wrapper -c diff.other.algorithm=myers diff
> > v2.0.0 v2.28.0'
>
> Hopefully this round can immediately be merged down to 'next'?
> Thanks.

I'll leave that up to you and John, but are we risking merging code
that could go unused or that we need to fundamentally change?  I don't
see how to handle the issues over at
https://lore.kernel.org/git/647D3D49-B85B-4B66-A857-695CFF9685EE@gmail.com/
(not that I've spent much thought on it besides asking how things are
going to work), and would be worried that we'd end up needing to
change the mechanism somehow to cover the things John ultimately
wants.

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v4 0/2] Teach diff to honor diff algorithms set through git attributes
  2023-02-21 18:05         ` Elijah Newren
@ 2023-02-21 18:51           ` Junio C Hamano
  2023-02-21 19:36             ` John Cai
  0 siblings, 1 reply; 78+ messages in thread
From: Junio C Hamano @ 2023-02-21 18:51 UTC (permalink / raw)
  To: Elijah Newren
  Cc: John Cai via GitGitGadget, git, Eric Sunshine, Phillip Wood,
	Ævar Arnfjörð Bjarmason, Jeff King, John Cai

Elijah Newren <newren@gmail.com> writes:

> I'll leave that up to you and John, but are we risking merging code
> that could go unused or that we need to fundamentally change?  I don't
> see how to handle the issues over at
> https://lore.kernel.org/git/647D3D49-B85B-4B66-A857-695CFF9685EE@gmail.com/

If this is useful enough for desktop users already, then that is a
good enough reason to take it, I would say.

GitLab can easily add WebUI that says "You can define what diff
algorithm is used for files with which suffix" to allow you to
configure a table like this:

	json | histogram
	c    | patience
	*    | myers

and populate the server-side equivalent of .git/info/attributes and
.git/config based on that, and without anything further than the
posted patches, the result should just work, no?

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v4 0/2] Teach diff to honor diff algorithms set through git attributes
  2023-02-21 18:51           ` Junio C Hamano
@ 2023-02-21 19:36             ` John Cai
  2023-02-21 20:16               ` Elijah Newren
  0 siblings, 1 reply; 78+ messages in thread
From: John Cai @ 2023-02-21 19:36 UTC (permalink / raw)
  To: Junio C Hamano
  Cc: Elijah Newren, John Cai via GitGitGadget, git, Eric Sunshine,
	Phillip Wood, Ævar Arnfjörð Bjarmason, Jeff King



On 21 Feb 2023, at 13:51, Junio C Hamano wrote:

> Elijah Newren <newren@gmail.com> writes:
>
>> I'll leave that up to you and John, but are we risking merging code
>> that could go unused or that we need to fundamentally change?  I don't
>> see how to handle the issues over at
>> https://lore.kernel.org/git/647D3D49-B85B-4B66-A857-695CFF9685EE@gmail.com/
>
> If this is useful enough for desktop users already, then that is a
> good enough reason to take it, I would say.
>
> GitLab can easily add WebUI that says "You can define what diff
> algorithm is used for files with which suffix" to allow you to
> configure a table like this:
>
> 	json | histogram
> 	c    | patience
> 	*    | myers
>
> and populate the server-side equivalent of .git/info/attributes and
> .git/config based on that, and without anything further than the
> posted patches, the result should just work, no?

Yes, that is my thinking too. The goal was to make this user friendly for
everyone on the Git command line, and then GitLab can hook into this accordingly
to make it ussable with the GitLab UI.

thanks
John

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v4 0/2] Teach diff to honor diff algorithms set through git attributes
  2023-02-21 19:36             ` John Cai
@ 2023-02-21 20:16               ` Elijah Newren
  0 siblings, 0 replies; 78+ messages in thread
From: Elijah Newren @ 2023-02-21 20:16 UTC (permalink / raw)
  To: John Cai
  Cc: Junio C Hamano, John Cai via GitGitGadget, git, Eric Sunshine,
	Phillip Wood, Ævar Arnfjörð Bjarmason, Jeff King

On Tue, Feb 21, 2023 at 11:36 AM John Cai <johncai86@gmail.com> wrote:
>
> On 21 Feb 2023, at 13:51, Junio C Hamano wrote:
>
> > Elijah Newren <newren@gmail.com> writes:
> >
> >> I'll leave that up to you and John, but are we risking merging code
> >> that could go unused or that we need to fundamentally change?  I don't
> >> see how to handle the issues over at
> >> https://lore.kernel.org/git/647D3D49-B85B-4B66-A857-695CFF9685EE@gmail.com/
> >
> > If this is useful enough for desktop users already, then that is a
> > good enough reason to take it, I would say.
> >
> > GitLab can easily add WebUI that says "You can define what diff
> > algorithm is used for files with which suffix" to allow you to
> > configure a table like this:
> >
> >       json | histogram
> >       c    | patience
> >       *    | myers
> >
> > and populate the server-side equivalent of .git/info/attributes and
> > .git/config based on that, and without anything further than the
> > posted patches, the result should just work, no?
>
> Yes, that is my thinking too. The goal was to make this user friendly for
> everyone on the Git command line, and then GitLab can hook into this accordingly
> to make it ussable with the GitLab UI.

Ok, sounds good.  Merge away.  :-)

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-20 17:32               ` Elijah Newren
  2023-02-20 20:53                 ` John Cai
@ 2023-02-22 19:47                 ` Jeff King
  2023-02-24 17:44                   ` John Cai
  1 sibling, 1 reply; 78+ messages in thread
From: Jeff King @ 2023-02-22 19:47 UTC (permalink / raw)
  To: Elijah Newren
  Cc: John Cai, John Cai via GitGitGadget, git, Eric Sunshine,
	Phillip Wood, Ævar Arnfjörð Bjarmason

On Mon, Feb 20, 2023 at 09:32:52AM -0800, Elijah Newren wrote:

> > I was thinking we would just use the tree-ish of the first one
> 
> That would certainly simplify, but it'd be pretty important to
> document.  (Incidentally, this kind of decision was my reason for
> asking about all those special cases earlier, i.e. how to handle diff
> between different commits, how to handle renames, how to handle bare
> repositories, etc.)
> 
> This kind of decision probably also means you'd need a variety of
> testcases where .gitattributes is different in every commit & the
> index & the working tree, and then you start testing several of the
> possible pairings to make sure the right .gitattributes file is used
> (e.g. (commit, commit), (commit, index), (index, commit), (worktree,
> index), etc.)

There may be some prior art here in how we handle mailmaps in a bare
repository. In that case, we pull them from HEAD (or really any commit
of your choosing, but the default is HEAD). That may seem a bit weird,
but it matches how non-bare repositories work, which read the mailmap
from the working tree.

So likewise, even looking at an old commit like "git show HEAD~1000", in
a non-bare repository we will read .gitattributes from the working tree,
which means it is (roughly) coming from HEAD. In some ways that is good
(there may be improvements to the attributes) and in some ways it is
weird and confusing (the meaning of the attributes may have been
different back then, or the two histories may even be somewhat
unrelated!). So I think you can make an argument either way on what is
useful, but harmonizing the non-bare and bare cases seems like the best
place to start.

And then that machinery would probably be enough to let people ask for
specific things on top (like "git show --attributes-from=HEAD~1000
HEAD~1000" if they really wanted).

-Peff

^ permalink raw reply	[flat|nested] 78+ messages in thread

* Re: [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver
  2023-02-22 19:47                 ` Jeff King
@ 2023-02-24 17:44                   ` John Cai
  0 siblings, 0 replies; 78+ messages in thread
From: John Cai @ 2023-02-24 17:44 UTC (permalink / raw)
  To: Jeff King
  Cc: Elijah Newren, John Cai via GitGitGadget, git, Eric Sunshine,
	Phillip Wood, Ævar Arnfjörð Bjarmason

Hey Peff,

On 22 Feb 2023, at 14:47, Jeff King wrote:

> On Mon, Feb 20, 2023 at 09:32:52AM -0800, Elijah Newren wrote:
>
>>> I was thinking we would just use the tree-ish of the first one
>>
>> That would certainly simplify, but it'd be pretty important to
>> document.  (Incidentally, this kind of decision was my reason for
>> asking about all those special cases earlier, i.e. how to handle diff
>> between different commits, how to handle renames, how to handle bare
>> repositories, etc.)
>>
>> This kind of decision probably also means you'd need a variety of
>> testcases where .gitattributes is different in every commit & the
>> index & the working tree, and then you start testing several of the
>> possible pairings to make sure the right .gitattributes file is used
>> (e.g. (commit, commit), (commit, index), (index, commit), (worktree,
>> index), etc.)
>
> There may be some prior art here in how we handle mailmaps in a bare
> repository. In that case, we pull them from HEAD (or really any commit
> of your choosing, but the default is HEAD). That may seem a bit weird,
> but it matches how non-bare repositories work, which read the mailmap
> from the working tree.
>
> So likewise, even looking at an old commit like "git show HEAD~1000", in
> a non-bare repository we will read .gitattributes from the working tree,
> which means it is (roughly) coming from HEAD. In some ways that is good
> (there may be improvements to the attributes) and in some ways it is
> weird and confusing (the meaning of the attributes may have been
> different back then, or the two histories may even be somewhat
> unrelated!). So I think you can make an argument either way on what is
> useful, but harmonizing the non-bare and bare cases seems like the best
> place to start.

Thanks for the historical context and suggestion here
>
> And then that machinery would probably be enough to let people ask for
> specific things on top (like "git show --attributes-from=HEAD~1000
> HEAD~1000" if they really wanted).

Makes sense to me

thanks
John

>
> -Peff

^ permalink raw reply	[flat|nested] 78+ messages in thread

end of thread, other threads:[~2023-02-24 17:44 UTC | newest]

Thread overview: 78+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-05  3:46 [PATCH 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
2023-02-05  3:46 ` [PATCH 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
2023-02-06 16:20   ` Phillip Wood
2023-02-05  3:46 ` [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
2023-02-05 17:50   ` Eric Sunshine
2023-02-06 13:10     ` John Cai
2023-02-06 16:27   ` Phillip Wood
2023-02-06 18:14     ` Eric Sunshine
2023-02-06 19:50     ` John Cai
2023-02-09  8:26       ` Elijah Newren
2023-02-09 10:31         ` "bad" diffs (was: [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm) Ævar Arnfjörð Bjarmason
2023-02-09 16:37         ` [PATCH 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai
2023-02-06 16:39   ` Ævar Arnfjörð Bjarmason
2023-02-06 20:37     ` John Cai
2023-02-07 14:55       ` Phillip Wood
2023-02-07 17:00         ` John Cai
2023-02-09  9:09           ` Elijah Newren
2023-02-09 14:44             ` Phillip Wood
2023-02-10  9:57               ` Elijah Newren
2023-02-11 17:39                 ` Phillip Wood
2023-02-11  1:59               ` Jeff King
2023-02-15  2:35                 ` Elijah Newren
2023-02-15  4:21                   ` Jeff King
2023-02-15  5:20                     ` Junio C Hamano
2023-02-15 14:44                 ` Phillip Wood
2023-02-15 15:00                   ` Jeff King
2023-02-07 17:27         ` Ævar Arnfjörð Bjarmason
2023-02-15 14:47           ` Phillip Wood
2023-02-09  8:44       ` Elijah Newren
2023-02-14 21:16         ` John Cai
2023-02-15  3:41           ` Elijah Newren
2023-02-09  7:50     ` Elijah Newren
2023-02-09  9:41       ` Ævar Arnfjörð Bjarmason
2023-02-11  2:04         ` Jeff King
2023-02-07 17:56   ` Jeff King
2023-02-07 20:18     ` Ævar Arnfjörð Bjarmason
2023-02-07 20:47       ` Junio C Hamano
2023-02-07 21:05         ` Ævar Arnfjörð Bjarmason
2023-02-07 21:28           ` Junio C Hamano
2023-02-07 21:44             ` Ævar Arnfjörð Bjarmason
2023-02-09 16:34     ` John Cai
2023-02-11  1:39       ` Jeff King
2023-02-14 21:40 ` [PATCH v2 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
2023-02-14 21:40   ` [PATCH v2 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
2023-02-15  2:38     ` Junio C Hamano
2023-02-15 23:34       ` John Cai
2023-02-15 23:42         ` Junio C Hamano
2023-02-16  2:14           ` Jeff King
2023-02-16  2:57             ` Junio C Hamano
2023-02-16 20:34               ` John Cai
2023-02-14 21:40   ` [PATCH v2 2/2] diff: teach diff to read gitattribute diff-algorithm John Cai via GitGitGadget
2023-02-15  2:56     ` Junio C Hamano
2023-02-15  3:20       ` Junio C Hamano
2023-02-16 20:37         ` John Cai
2023-02-17 20:21   ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes John Cai via GitGitGadget
2023-02-17 20:21     ` [PATCH v3 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
2023-02-17 21:27       ` Junio C Hamano
2023-02-18  1:36       ` Elijah Newren
2023-02-17 20:21     ` [PATCH v3 2/2] diff: teach diff to read algorithm from diff driver John Cai via GitGitGadget
2023-02-17 21:50       ` Junio C Hamano
2023-02-18  2:56       ` Elijah Newren
2023-02-20 15:32         ` John Cai
2023-02-20 16:21           ` Elijah Newren
2023-02-20 16:49             ` John Cai
2023-02-20 17:32               ` Elijah Newren
2023-02-20 20:53                 ` John Cai
2023-02-22 19:47                 ` Jeff King
2023-02-24 17:44                   ` John Cai
2023-02-18  1:16     ` [PATCH v3 0/2] Teach diff to honor diff algorithms set through git attributes Elijah Newren
2023-02-20 13:37       ` John Cai
2023-02-20 21:04     ` [PATCH v4 " John Cai via GitGitGadget
2023-02-20 21:04       ` [PATCH v4 1/2] diff: consolidate diff algorithm option parsing John Cai via GitGitGadget
2023-02-20 21:04       ` [PATCH v4 2/2] diff: teach diff to read algorithm from diff driver John Cai via GitGitGadget
2023-02-21 17:34       ` [PATCH v4 0/2] Teach diff to honor diff algorithms set through git attributes Junio C Hamano
2023-02-21 18:05         ` Elijah Newren
2023-02-21 18:51           ` Junio C Hamano
2023-02-21 19:36             ` John Cai
2023-02-21 20:16               ` Elijah Newren

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).