[RFC PATCH 0/1] Unit tests for khash.h

Git Mailing List Archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH 0/1] Unit tests for khash.h
@ 2023-05-31 15:51 Siddharth Singh
  2023-05-31 15:51 ` [RFC PATCH 1/1] khash_test.c: add unit tests Siddharth Singh
                   ` (4 more replies)
  0 siblings, 5 replies; 12+ messages in thread
From: Siddharth Singh @ 2023-05-31 15:51 UTC (permalink / raw)
  To: git; +Cc: Siddharth Singh

This RFC patch adds unit tests for khash.h. It uses the C TAP harness to illustrate the test cases [1]. This is not intended to be a complete implementation. The purpose of this patch to get your thoughts on the unit test content, not the test framework itself.

The tests cover a wide range of functionality, including the creation and destruction of hash tables, the insertion and deletion of elements and the querying of hash tables. I would appreciate feedback from reviewers on what other tests would be useful for khash.h and if these tests provide enough value.

[1] https://lore.kernel.org/git/20230427175007.902278-1-calvinwan@google.com/T/#me379c06257ebe2847d71b604c031328c7edc3843

Siddharth Singh (1):
  khash_test.c: add unit tests

 Makefile       |   1 +
 t/khash_test.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 t/khash_test.c

-- 
2.40.1.606.ga4b1b128d6-goog

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFC PATCH 1/1] khash_test.c: add unit tests
  2023-05-31 15:51 [RFC PATCH 0/1] Unit tests for khash.h Siddharth Singh
@ 2023-05-31 15:51 ` Siddharth Singh
  2023-05-31 22:35 ` [RFC PATCH 0/1] Unit tests for khash.h Taylor Blau
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 12+ messages in thread
From: Siddharth Singh @ 2023-05-31 15:51 UTC (permalink / raw)
  To: git; +Cc: Siddharth Singh

The tests check if a hashmap
1. Gets hash value for a string
2. Has correct value for '__ac_HASH_UPPER'
3. Initializes succesfully.
4. Put a key value pair.
5. Gets the value from key.
6. Delete the key-value pair.
7. Updates the value after deleting it.
8. Update Value of before deleting.
9. Contains the right number of elements.

Signed-off-by: Siddharth Singh <siddhartth@google.com>
---
 Makefile       |   1 +
 t/khash_test.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 t/khash_test.c

diff --git a/Makefile b/Makefile
index 660c72d72e..59cb76e5e5 100644
--- a/Makefile
+++ b/Makefile
@@ -3852,6 +3852,7 @@ $(UNIT_TEST_PROGS): $(CTAP_OBJS) $(LIBS)
 	$(QUIET)mkdir -p t/unit-tests
 	$(QUIET_CC)$(CC) -It -o t/unit-tests/unit-test-t t/unit-test.c $(CTAP_OBJS)
 	$(QUIET_CC)$(CC) -It -o t/unit-tests/strbuf-t t/strbuf-test.c -DSKIP_COMMON_MAIN common-main.c $(CTAP_OBJS) $(LIBS)
+	$(QUIET_CC)$(CC) -It -o t/unit-tests/khash-t t/khash_test.c -DSKIP_COMMON_MAIN common-main.c $(CTAP_OBJS) $(LIBS)
 
 .PHONY: unit-tests
 unit-tests: $(UNIT_TEST_PROGS) $(UNIT_TEST_RUNNER)
diff --git a/t/khash_test.c b/t/khash_test.c
new file mode 100644
index 0000000000..cd27618d4d
--- /dev/null
+++ b/t/khash_test.c
@@ -0,0 +1,173 @@
+#include <tap/basic.h>
+#include "../git-compat-util.h"
+
+#include "../khash.h"
+
+int khash_testACX31HashString(){
+    const char * str = "foobar";
+    khint_t value = __ac_X31_hash_string(str);
+    khint_t expected = 3026088333;
+    if(value == expected)
+        return 1;
+    return 0;
+}
+
+int khash_testAC_HASH_UPPER(){
+    const double expected = 0.77;
+    if(expected == __ac_HASH_UPPER){
+        return 1;
+    }
+    return 0;
+}
+
+KHASH_INIT(str, const char *, int *, 1, kh_str_hash_func, kh_str_hash_equal);
+
+int khash_testInit(){
+    kh_str_t * hashmap = kh_init_str();
+    if(hashmap != NULL){
+        return 1;
+    }
+    return 0;
+}
+
+int khash_testPut(){
+    kh_str_t * hashmap = kh_init_str();
+    int ret;
+    int pos = kh_put_str(hashmap,"test_key",&ret);
+    int value = 14;
+    if(ret){
+        kh_key(hashmap, pos) = xstrdup("test_key");
+		kh_value(hashmap, pos) = &value;
+    }else return 0;
+    if(*kh_value(hashmap,pos) == value){
+        return 1;
+    }
+    return 0;
+}
+
+int khash_testGet(){
+    kh_str_t * hashmap = kh_init_str();
+    int ret;
+    int pos = kh_put_str(hashmap,"test_key",&ret);
+    int value = 14;
+    if(ret){
+        kh_key(hashmap, pos) = xstrdup("test_key");
+		kh_value(hashmap, pos) = &value;
+    }else return 0;
+    int value_pos = kh_get_str(hashmap,"test_key");    
+    int returned_value = *kh_value(hashmap,value_pos);
+    if(returned_value == value){
+        return 1;
+    }
+    return 0;
+}
+
+int khash_testDelete(){
+    kh_str_t * hashmap = kh_init_str();
+    int ret;
+    int pos = kh_put_str(hashmap,"test_key",&ret);
+    int value = 14;
+    if(ret){
+        kh_key(hashmap, pos) = xstrdup("test_key");
+		kh_value(hashmap, pos) = &value;
+    }else return 0;
+    int current_size = hashmap->size;
+    kh_del_str(hashmap,pos);
+    if(hashmap->size + 1 == current_size && !kh_exist(hashmap, pos)){
+        return 1;
+    }
+    return 0;
+}
+
+int khash_testUpdateValueAfterDeleting(){
+    kh_str_t * hashmap = kh_init_str();
+    int ret;
+    int pos1 = kh_put_str(hashmap,"test_key",&ret);
+    int value1 = 14;
+    if(ret){
+        kh_key(hashmap, pos1) = xstrdup("test_key");
+		kh_value(hashmap, pos1) = &value1;
+    }else return 0;
+
+    kh_del_str(hashmap,pos1);
+    int pos2 = kh_put_str(hashmap,"test_key",&ret);
+    int value2 = 15;
+    
+    if(ret){
+        kh_key(hashmap, pos2) = xstrdup("test_key");
+		kh_value(hashmap, pos2) = &value2;
+    }else return 0;
+
+    int value_pos = kh_get_str(hashmap,"test_key");    
+    int returned_value = *kh_value(hashmap,value_pos);
+    if(returned_value == value2){
+        return 1;
+    }
+    return 0;
+}
+
+int khash_testUpdateValueBeforeDeleting(){
+    kh_str_t * hashmap = kh_init_str();
+    int ret;
+    int pos1 = kh_put_str(hashmap,"test_key",&ret);
+    int value1 = 14;
+    if(ret){
+        kh_key(hashmap, pos1) = xstrdup("test_key");
+		kh_value(hashmap, pos1) = &value1;
+    }else return 0;
+
+    int pos2 = kh_put_str(hashmap,"test_key",&ret);
+    int value2 = 15;
+    
+    if(ret == 0){
+        return 1;
+    }
+    return 0;
+}
+
+int khash_testSize(){
+    kh_str_t * hashmap = kh_init_str();
+    int ret;
+    int pos1 = kh_put_str(hashmap,"test_key1",&ret);
+    int value1 = 14;
+    if(ret){
+        kh_key(hashmap, pos1) = xstrdup("test_key1");
+		kh_value(hashmap, pos1) = &value1;
+    }else return 0;
+
+    int pos2 = kh_put_str(hashmap,"test_key2",&ret);
+    int value2 = 15;
+    if(ret){
+        kh_key(hashmap, pos2) = xstrdup("test_key2");
+		kh_value(hashmap, pos2) = &value2;
+    }else return 0;
+
+
+    int pos3 = kh_put_str(hashmap,"test_key3",&ret);
+    int value3 = 16;
+    if(ret){
+        kh_key(hashmap, pos3) = xstrdup("test_key3");
+		kh_value(hashmap, pos3) = &value3;
+    }else return 0;
+
+    if(kh_size(hashmap) == 3){
+        return 1;
+    }
+    return 0;
+}
+
+int main(void){
+    plan(9);
+    
+    ok(khash_testACX31HashString(),"__ac_X31_hash_string works");
+    ok(khash_testAC_HASH_UPPER(),"__ac_HASH_UPPER has correct value");
+    ok(khash_testInit(), "Initialize hashmap");
+    ok(khash_testPut(), "Put Key-Value pair");
+    ok(khash_testGet(), "Get the value from hashmap");
+    ok(khash_testDelete(), "Delete the key-value from hashmap");
+    ok(khash_testUpdateValueAfterDeleting(), "Update Value of a Key after deleting");
+    ok(khash_testUpdateValueBeforeDeleting(), "Update Value of a Key before deleting");
+    ok(khash_testSize(),"Test if 3 elements are present in hashmap");
+
+    return 0;
+}
\ No newline at end of file
-- 
2.40.1.606.ga4b1b128d6-goog


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/1] Unit tests for khash.h
  2023-05-31 15:51 [RFC PATCH 0/1] Unit tests for khash.h Siddharth Singh
  2023-05-31 15:51 ` [RFC PATCH 1/1] khash_test.c: add unit tests Siddharth Singh
@ 2023-05-31 22:35 ` Taylor Blau
  2023-06-01  6:26   ` Junio C Hamano
  2023-06-01 13:02   ` Phillip Wood
  2023-05-31 23:35 ` Eric Wong
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 12+ messages in thread
From: Taylor Blau @ 2023-05-31 22:35 UTC (permalink / raw)
  To: Siddharth Singh; +Cc: git

Hi Siddharth,

On Wed, May 31, 2023 at 05:51:41PM +0200, Siddharth Singh wrote:
> This RFC patch adds unit tests for khash.h. It uses the C TAP harness
> to illustrate the test cases [1]. This is not intended to be a
> complete implementation. The purpose of this patch to get your
> thoughts on the unit test content, not the test framework itself.

Thanks for working on this, and for opening the discussion up. I took
only a brief look through the actual changes. But I think the much more
interesting discussion is on the approach, so I'll refrain from
commenting on the tests themselves.

I am somewhat skeptical of this as a productive direction, primarily
because khash already has tests [1] that exercise its functionality. I'm
not necessarily opposed to (light) testing of oid_set, oid_map, and
oid_pos, which are khash structures, but declared by Git.

Even still, I don't think that we are testing much in that case, since
the boundary between Git and khash is limited to the KHASH_INIT macro.

So... I dunno. I'm not strongly opposed here, but I think this is
probably not the most productive place to start adding tests.

Thanks,
Taylor

[1]: https://github.com/attractivechaos/klib/blob/master/test/khash_test.c

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/1] Unit tests for khash.h
  2023-05-31 15:51 [RFC PATCH 0/1] Unit tests for khash.h Siddharth Singh
  2023-05-31 15:51 ` [RFC PATCH 1/1] khash_test.c: add unit tests Siddharth Singh
  2023-05-31 22:35 ` [RFC PATCH 0/1] Unit tests for khash.h Taylor Blau
@ 2023-05-31 23:35 ` Eric Wong
  2023-06-01 13:23 ` Phillip Wood
  2023-06-27 13:20 ` [RFC PATCH v2] khash_test.c : add unit tests Siddharth Singh
  4 siblings, 0 replies; 12+ messages in thread
From: Eric Wong @ 2023-05-31 23:35 UTC (permalink / raw)
  To: Siddharth Singh; +Cc: git

Siddharth Singh <siddhartth@google.com> wrote:
> This RFC patch adds unit tests for khash.h.

I agree with what Taylor said about being skeptical of these
tests being productive.

I'm curious about switching parts of git to use AC's newer `khashl'.
khashl is smaller than the original khash but slower for deletes.
Since git hardly deletes and khash memory use is high (especially on
packing); it should be a benefit all around.

Also, see Documentation/CodingGuidelines for coding style.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/1] Unit tests for khash.h
  2023-05-31 22:35 ` [RFC PATCH 0/1] Unit tests for khash.h Taylor Blau
@ 2023-06-01  6:26   ` Junio C Hamano
  2023-06-01 13:02   ` Phillip Wood
  1 sibling, 0 replies; 12+ messages in thread
From: Junio C Hamano @ 2023-06-01  6:26 UTC (permalink / raw)
  To: Taylor Blau; +Cc: Siddharth Singh, git

Taylor Blau <me@ttaylorr.com> writes:

> Hi Siddharth,
>
> On Wed, May 31, 2023 at 05:51:41PM +0200, Siddharth Singh wrote:
>> This RFC patch adds unit tests for khash.h. It uses the C TAP harness
>> to illustrate the test cases [1]. This is not intended to be a
>> complete implementation. The purpose of this patch to get your
>> thoughts on the unit test content, not the test framework itself.
>
> Thanks for working on this, and for opening the discussion up. I took
> only a brief look through the actual changes. But I think the much more
> interesting discussion is on the approach, so I'll refrain from
> commenting on the tests themselves.
>
> I am somewhat skeptical of this as a productive direction, primarily
> because khash already has tests [1] that exercise its functionality. I'm
> not necessarily opposed to (light) testing of oid_set, oid_map, and
> oid_pos, which are khash structures, but declared by Git.

Yes, as a fairly isolated and supposedly well tested piece of code,
using khash as a guinea pig for the technology demonstration of C
TAP harness they propose to use would be an excellent choice, and I
found the request not to talk about the harness quite odd.  It felt
almost pointless.

The "main()" did illustrate a few things people found problematic
with the harness, among which having to declare plan(9) upfront.  An
obvious disregard of the coding guidelines did not help well,
either.  Overall, I am not impressed.

Thanks.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/1] Unit tests for khash.h
  2023-05-31 22:35 ` [RFC PATCH 0/1] Unit tests for khash.h Taylor Blau
  2023-06-01  6:26   ` Junio C Hamano
@ 2023-06-01 13:02   ` Phillip Wood
  1 sibling, 0 replies; 12+ messages in thread
From: Phillip Wood @ 2023-06-01 13:02 UTC (permalink / raw)
  To: Taylor Blau, Siddharth Singh; +Cc: git, Junio C Hamano

Hi Taylor

On 31/05/2023 23:35, Taylor Blau wrote:
> Hi Siddharth,
> 
> On Wed, May 31, 2023 at 05:51:41PM +0200, Siddharth Singh wrote:
>> This RFC patch adds unit tests for khash.h. It uses the C TAP harness
>> to illustrate the test cases [1]. This is not intended to be a
>> complete implementation. The purpose of this patch to get your
>> thoughts on the unit test content, not the test framework itself.
> 
> Thanks for working on this, and for opening the discussion up. I took
> only a brief look through the actual changes. But I think the much more
> interesting discussion is on the approach, so I'll refrain from
> commenting on the tests themselves.
> 
> I am somewhat skeptical of this as a productive direction, primarily
> because khash already has tests [1] that exercise its functionality. I'm
> not necessarily opposed to (light) testing of oid_set, oid_map, and
> oid_pos, which are khash structures, but declared by Git.

I agree that starting by adding tests for third party code is a bit 
curious, but those tests you linked to look like they're testing 
performance rather than correctness.

Best Wishes

Phillip

> Even still, I don't think that we are testing much in that case, since
> the boundary between Git and khash is limited to the KHASH_INIT macro.
> 
> So... I dunno. I'm not strongly opposed here, but I think this is
> probably not the most productive place to start adding tests.
> 
> Thanks,
> Taylor
> 
> [1]: https://github.com/attractivechaos/klib/blob/master/test/khash_test.c


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH 0/1] Unit tests for khash.h
  2023-05-31 15:51 [RFC PATCH 0/1] Unit tests for khash.h Siddharth Singh
                   ` (2 preceding siblings ...)
  2023-05-31 23:35 ` Eric Wong
@ 2023-06-01 13:23 ` Phillip Wood
  2023-06-27 13:20 ` [RFC PATCH v2] khash_test.c : add unit tests Siddharth Singh
  4 siblings, 0 replies; 12+ messages in thread
From: Phillip Wood @ 2023-06-01 13:23 UTC (permalink / raw)
  To: Siddharth Singh, git; +Cc: Taylor Blau, Junio C Hamano

Hi Siddharth

On 31/05/2023 16:51, Siddharth Singh wrote:
> This RFC patch adds unit tests for khash.h. It uses the C TAP harness to illustrate the test cases [1]. This is not intended to be a complete implementation. The purpose of this patch to get your thoughts on the unit test content, not the test framework itself.

It is a bit hard to comment on the test content when you say they are 
incomplete so we don't know what you're planning on adding in the 
future. I agree with Taylor and Junio that khash is probably not the 
best place to start adding tests (I've not checked how far away from 
upstream our code has drifted over time) but from a quick glance I found 
these tests rather tame. I'd like to see tests that are trying to break 
things, for example adding keys with hash collisions and checking that 
they can still be retrieved, updated  after some have been deleted. You 
could use a hash function that returns the first character of a string 
and add words starting with the same letter to generate collisions.

It is also hard to separate the tests from the framework as a good test 
framework will make it easy to write tests that have good diagnostic 
messages when they fail that pin-point the line where the failure 
occurred and the values used in the comparison that failed. A good 
framework will also minimize the amount of " if (...) return 0" boiler 
plate required when a comparison fails. The tests here do not benefit 
from such a framework.

I think this patch also raises the question of what should be tested by 
our test-tool helper and what should be tested in unit tests. Ævar 
raised the idea of improving our test-helper tests rather than adding 
unit tests [1]. It would be good to get a response to that point of view.

Finally I would recommend that you chat to your colleagues about the 
expectations around including an explanation of the reasons for a change 
in the commit message and code style for patches posted on this list.

Best Wishes

Phillip

[1] https://lore.kernel.org/git/230502.861qjyj0cb.gmgdl@evledraar.gmail.com

> The tests cover a wide range of functionality, including the creation and destruction of hash tables, the insertion and deletion of elements and the querying of hash tables. I would appreciate feedback from reviewers on what other tests would be useful for khash.h and if these tests provide enough value.
> 
> [1] https://lore.kernel.org/git/20230427175007.902278-1-calvinwan@google.com/T/#me379c06257ebe2847d71b604c031328c7edc3843
> 
> 
> Siddharth Singh (1):
>    khash_test.c: add unit tests
> 
>   Makefile       |   1 +
>   t/khash_test.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 174 insertions(+)
>   create mode 100644 t/khash_test.c
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [RFC PATCH v2] khash_test.c : add unit tests
  2023-05-31 15:51 [RFC PATCH 0/1] Unit tests for khash.h Siddharth Singh
                   ` (3 preceding siblings ...)
  2023-06-01 13:23 ` Phillip Wood
@ 2023-06-27 13:20 ` Siddharth Singh
  2023-06-30  0:53   ` Glen Choo
  2023-06-30  1:05   ` Glen Choo
  4 siblings, 2 replies; 12+ messages in thread
From: Siddharth Singh @ 2023-06-27 13:20 UTC (permalink / raw)
  To: git

Signed-off-by: Siddharth Singh <siddhartth@google.com>
---
Since v1
- rewrote the code keeping coding style guidelines in mind.
- refactored tests to improve their implementation..
- added a few more tests that cause collisions in the hashmap.
In the v1 patch, there were concerns that new tests were unnecessary because of upstream tests. However, I believe it would be beneficial to have tests based on the khash implementation in the git codebase because the existing tests[1] for khash do not use the same code for khash as the git codebase. 
E.g. in khash.h file of “attractivechaos/klib/khash.h” [2]. There's an implementation for `KHASH_MAP_INIT_INT64` which isn’t defined in “git/git/khash.h”[3]. So, there’s a possibility that the khash.h in a different repository might move forward and end up being different from out implementation, so writing tests for “git/git/khash.h” would ensure that our tests are relevant to the actual usage of the library.
While my colleagues are currently investigating whether C Tap harness is the best way to test libified code, this RFC is intended to discuss the content of unit tests and what other tests would be useful for khash.h. I am unsure of what tests will be valuable in the future, so I would appreciate your thoughts on the matter. Many test cases are easier to construct in C TAP harness than in test-tool. For example, In C TAP harness, non-string values can be used directly in hash maps. However, in test-tool, non-string values must be passed as an argument to a shell executable, parsed into the correct type, and then operated on.
I'm also very new to writing unit tests in C and git codebase in general, so I'll appreciate constructive feedback and discussion..
With this RFC, I hope you can help me answer these questions.
What are the unit test cases to include in khash.h?
What are the other types of tests that would be useful for khash.h?
[1] https://github.com/attractivechaos/klib/blob/master/test/khash_test.c
[2] https://github.com/attractivechaos/klib/blob/master/khash.h
[3] https://github.com/git/git/blob/master/khash.h

 Makefile       |   2 +
 t/khash_test.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 t/khash_test.c

diff --git a/Makefile b/Makefile
index 660c72d72e..d3ad2fa23c 100644
--- a/Makefile
+++ b/Makefile
@@ -3847,11 +3847,13 @@ $(UNIT_TEST_RUNNER): t/runtests.c
 
 UNIT_TEST_PROGS += t/unit-tests/unit-test-t
 UNIT_TEST_PROGS += t/unit-tests/strbuf-t
+UNIT_TEST_PROGS += t/unit-tests/khash-t
 
 $(UNIT_TEST_PROGS): $(CTAP_OBJS) $(LIBS)
 	$(QUIET)mkdir -p t/unit-tests
 	$(QUIET_CC)$(CC) -It -o t/unit-tests/unit-test-t t/unit-test.c $(CTAP_OBJS)
 	$(QUIET_CC)$(CC) -It -o t/unit-tests/strbuf-t t/strbuf-test.c -DSKIP_COMMON_MAIN common-main.c $(CTAP_OBJS) $(LIBS)
+	$(QUIET_CC)$(CC) -It -o t/unit-tests/khash-t t/khash_test.c -DSKIP_COMMON_MAIN common-main.c $(CTAP_OBJS) $(LIBS)
 
 .PHONY: unit-tests
 unit-tests: $(UNIT_TEST_PROGS) $(UNIT_TEST_RUNNER)
diff --git a/t/khash_test.c b/t/khash_test.c
new file mode 100644
index 0000000000..d1a43add13
--- /dev/null
+++ b/t/khash_test.c
@@ -0,0 +1,214 @@
+#include "../git-compat-util.h"
+#include "../khash.h"
+#include <tap/basic.h>
+
+/*
+  These tests are for base assumptions; if they fails, library is broken
+*/
+int hash_string_succeeds() {
+  const char *str = "test_string";
+  khint_t value = __ac_X31_hash_string(str);
+  return value == 0xf1a6305e;
+}
+
+int hash_string_with_non_alphabets_succeeds() {
+  const char *str = "test_string #2";
+  khint_t value = __ac_X31_hash_string(str);
+  return value == 0xfa970771;
+}
+
+KHASH_INIT(str, const char *, int *, 1, kh_str_hash_func, kh_str_hash_equal);
+
+int initialized_hashmap_pointer_not_null() {
+  kh_str_t *hashmap = kh_init_str();
+
+  if(hashmap != NULL){
+    free(hashmap);
+    return 1;
+  }
+  return 0;
+}
+
+int put_key_into_hashmap_once_succeeds() {
+  int ret, value;
+  khint_t pos;
+  kh_str_t *hashmap = kh_init_str();
+  pos = kh_put_str(hashmap, "test_key", &ret);
+  if(!ret)
+    return 0;
+  return pos != 0;
+}
+
+int put_same_key_into_hashmap_twice_fails() {
+  int first_return_value, second_return_value, value;
+  khint_t pos;
+  kh_str_t *hashmap = kh_init_str();
+  kh_put_str(hashmap, "test_key", &first_return_value);
+  kh_put_str(hashmap, "test_key", &second_return_value);
+  return !second_return_value;
+}
+
+int put_value_into_hashmap_once_succeeds() {
+  int ret, value = 14;
+  khint_t pos;
+  kh_str_t *hashmap = kh_init_str();
+  pos = kh_put_str(hashmap, "test_key", &ret);
+  if (!ret)
+    return 0;
+  kh_key(hashmap, pos) = xstrdup("test_key");
+  kh_value(hashmap, pos) = &value;
+  return *kh_value(hashmap, pos) == value;
+}
+
+int put_same_value_into_hashmap_twice_fails() {
+  int first_return_value, second_return_value, value = 14;
+  khint_t pos;
+  kh_str_t *hashmap = kh_init_str();
+  pos = kh_put_str(hashmap, "test_key", &first_return_value);
+  if (!first_return_value)
+    return 0;
+  kh_key(hashmap, pos) = xstrdup("test_key");
+  kh_value(hashmap, pos) = &value;
+  kh_put_str(hashmap, "test_key", &second_return_value);
+  return !second_return_value;
+}
+
+int get_existing_kv_from_hashmap_succeeds() {
+  int ret, value =14;
+  int expected = value;
+  khint_t pos;
+  kh_str_t *hashmap = kh_init_str();
+  pos = kh_put_str(hashmap, "test_key", &ret);
+  if (!ret)
+    return 0;
+  kh_key(hashmap, pos) = xstrdup("test_key");
+  kh_value(hashmap, pos) = &value;
+  return *kh_value(hashmap, kh_get_str(hashmap, "test_key")) == expected;
+}
+
+int get_nonexisting_kv_from_hashmap_fails() {
+  int value = 14;
+  khint_t pos;
+  kh_str_t *hashmap = kh_init_str();
+  return !kh_get_str(hashmap, "test_key");
+}
+
+int deletion_from_hashmap_sets_flag() {
+  int ret, value = 14;
+  khint_t pos;
+  kh_str_t *hashmap = kh_init_str();
+  pos = kh_put_str(hashmap, "test_key", &ret);
+  if (!ret)
+    return 0;
+  if(!kh_exist(hashmap, pos))
+    return 0;
+  kh_key(hashmap, pos) = xstrdup("test_key");
+  kh_value(hashmap, pos) = &value;
+  kh_del_str(hashmap, pos);
+  return !kh_exist(hashmap, pos);
+}
+
+int deletion_from_hashmap_updates_size() {
+  int ret, value = 14, current_size;
+  khint_t pos;
+  kh_str_t *hashmap = kh_init_str();
+  pos = kh_put_str(hashmap, "test_key", &ret);
+  if (!ret)
+    return 0;
+  kh_key(hashmap, pos) = xstrdup("test_key");
+  kh_value(hashmap, pos) = &value;
+  current_size = hashmap->size;
+  kh_del_str(hashmap, pos);
+  return hashmap->size + 1 == current_size;
+}
+
+int deletion_from_hashmap_does_not_update_kv() {
+  int ret, value = 14, current_size;
+  khint_t pos;
+  kh_str_t *hashmap = kh_init_str();
+  pos = kh_put_str(hashmap, "test_key", &ret);
+  if (!ret)
+    return 0;
+  kh_key(hashmap, pos) = xstrdup("test_key");
+  kh_value(hashmap, pos) = &value;
+  kh_del_str(hashmap, pos);
+  return !strcmp(kh_key(hashmap, pos),"test_key");
+}
+
+int update_value_after_deletion_succeeds() {
+  int ret, value1 = 14, value2 = 15;
+  khint_t pos1, pos2;
+  kh_str_t *hashmap = kh_init_str();
+  // adding the kv to the hashmap
+  pos1 = kh_put_str(hashmap, "test_key", &ret);
+  if (!ret)
+    return 0;
+  kh_key(hashmap, pos1) = xstrdup("test_key");
+  kh_value(hashmap, pos1) = &value1;
+  // delete the kv from the hashmap
+  kh_del_str(hashmap, pos1);
+  // adding the same key with different value
+  pos2 = kh_put_str(hashmap, "test_key", &ret);
+  if (!ret)
+    return 0;
+  kh_key(hashmap, pos2) = xstrdup("test_key");
+  kh_value(hashmap, pos2) = &value2;
+  // check if the value is different
+  return *kh_value(hashmap, kh_get_str(hashmap, "test_key")) == value2;
+}
+
+int hashmap_size_matches_number_of_added_elements() {
+  int ret, value1 = 14, value2 = 15, value3 = 16;
+  khint_t pos1, pos2, pos3;
+  kh_str_t *hashmap = kh_init_str();
+  pos1 = kh_put_str(hashmap, "test_key1", &ret);
+  if (!ret)
+    return 0;
+  kh_key(hashmap, pos1) = xstrdup("test_key1");
+  kh_value(hashmap, pos1) = &value1;
+  pos2 = kh_put_str(hashmap, "test_key2", &ret);
+  if (!ret)
+    return 0;
+  kh_key(hashmap, pos2) = xstrdup("test_key2");
+  kh_value(hashmap, pos2) = &value2;
+  pos3 = kh_put_str(hashmap, "test_key3", &ret);
+  if (!ret)
+    return 0;
+  kh_key(hashmap, pos3) = xstrdup("test_key3");
+  kh_value(hashmap, pos3) = &value3;
+  return kh_size(hashmap) == 3;
+}
+
+int main(void) {
+  plan(14);
+
+  ok(hash_string_succeeds(), "X31 hash value of a string is correct");
+  ok(hash_string_with_non_alphabets_succeeds(),
+     "Get X31 hash value for a string with non-alphabets in it.");
+  ok(initialized_hashmap_pointer_not_null(),
+     "Successfully initialize a hashmap and it's not NULL");
+  ok(put_key_into_hashmap_once_succeeds(),
+     "Put a key into hashmap that doesn't exist.");
+  ok(put_same_key_into_hashmap_twice_fails(),
+     "Put a key into hashmap twice that causes collision.");
+  ok(put_value_into_hashmap_once_succeeds(),
+     "Put a unique key-value pair into hashmap.");
+  ok(put_same_value_into_hashmap_twice_fails(),
+     "Put a key-value pair into hashmap twice that causes collision.");
+  ok(get_existing_kv_from_hashmap_succeeds(),
+     "Get the value from a key that exists in hashmap");
+  ok(get_nonexisting_kv_from_hashmap_fails(),
+     "Get the value from a key that doesn't exist in hashmap");
+  ok(deletion_from_hashmap_sets_flag(),
+     "Delete the key-value from hashmap and the flag is set to false");
+  ok(deletion_from_hashmap_updates_size(),
+     "Delete the key-value from hashmap and the size changes");
+  ok(deletion_from_hashmap_does_not_update_kv(),
+     "Delete the key-value from hashmap but it can be fetched using iterator");
+  ok(update_value_after_deletion_succeeds(),
+     "Updates the value of a key after deleting it");
+  ok(hashmap_size_matches_number_of_added_elements(),
+     "size of hashmap is correct after adding 3 elements");
+
+  return 0;
+}
-- 
2.40.GIT


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH v2] khash_test.c : add unit tests
  2023-06-27 13:20 ` [RFC PATCH v2] khash_test.c : add unit tests Siddharth Singh
@ 2023-06-30  0:53   ` Glen Choo
  2023-07-07 15:04     ` Siddharth Singh
  2023-06-30  1:05   ` Glen Choo
  1 sibling, 1 reply; 12+ messages in thread
From: Glen Choo @ 2023-06-30  0:53 UTC (permalink / raw)
  To: Siddharth Singh, git

Siddharth Singh <siddhartth@google.com> writes:

> - rewrote the code keeping coding style guidelines in mind.

I noticed several style issues in the patch. I'll avoid commenting on
those here so that we can focus on the other, big questions.

> I'm also very new to writing unit tests in C and git codebase in general, so I'll appreciate constructive feedback and discussion..

Okay. I think this is a good chance for the ML to decide on what sorts
of unit test practices we want too.

> What are the unit test cases to include in khash.h?
> What are the other types of tests that would be useful for khash.h?

For a third party library, I mostly agree with upthread commenters that
testing the correctness of the _library_ code isn't that valuable on its
own, but unit tests are also useful for demonstrating how the code
should be used. In a lot of projects, unit tests are a pseudo-official
demonstration of how an interface should be used, and that's sometimes
better than seeing the code in production, because unit tests are
typically a lot simpler and easier to reason about.

As an aside, this is something that a unit test framework will do _much
better_ than test-tool, because it makes it easy to read and write these
idtiomatic demos without worrying about how the test-tool executable
will be used.

I think that if we consider these two factors together (correctness and
developer education), the fact that we don't have great documentation of
khash.h in-tree, and that the external docs aren't that comprehensive
either, I think it would be useful to exercise each of the khash
'functions' to show how they work.

As Taylor mentioned upthread, I think the most valuable thing to test is
the way our code uses khash.h (not the library code itself). IOW:

  KHASH_INIT(oid_set, struct object_id, int, 0, oidhash_by_value, oideq_by_value)
  KHASH_INIT(oid_map, struct object_id, void *, 1, oidhash_by_value, oideq_by_value)
  KHASH_INIT(oid_pos, struct object_id, int, 1, oidhash_by_value, oideq_by_value)

So we would focus on oidhash_by_value and oideq_by_value in particular,
since that's provided by us. It would be very useful to verify that
these functions are doing exactly what we think they should be doing,
and we could do this with _every_ khash function to really explore the
API surface.

In fact... I suspect that they _not_ doing what we think they should,
or at the very least, might be displaying some very unintuitive
behavior. object_id contains both the hash value _and_ the hash
algorithm that generated it. If we have the same hash value, but
generated by different algorithms, are they considered the same equal or
not? To me, the intuitive answer is "no", but oidhash_by_value() only
considers the hash value and ignores the algorithm altogether! I think
that testing collisions between SHA1 and SHA256 values would be very
useful.

> +/*
> +  These tests are for base assumptions; if they fails, library is broken
> +*/
> +int hash_string_succeeds() {
> +  const char *str = "test_string";
> +  khint_t value = __ac_X31_hash_string(str);
> +  return value == 0xf1a6305e;
> +}
> +
> +int hash_string_with_non_alphabets_succeeds() {
> +  const char *str = "test_string #2";
> +  khint_t value = __ac_X31_hash_string(str);
> +  return value == 0xfa970771;
> +}

For a unit test, it is a best practice to test the external interface
that callers are depending on, and to avoid implementation
details/internals. If we do the latter, we end up with brittle tests
that break on non-important changes. E.g. we definitely should not be
testing the exact value of the hash function. The khash library
maintainer could decide to change the hash function, and we should not
be broken by that.

For khash.h, it appears that that anything prepended with "__" is meant
to be internal, and everything else is external. E.g.
__ac_X31_hash_string is certainly not meant to be used directly by end
users., especially since there's a human-friendly macro for it:

  #define kh_str_hash_func(key) __ac_X31_hash_string(key)

However, because something is external, doesn't mean we need to be
testing its implementation - we still shouldn't be testing the
return value of kh_str_hash_func(). We may want to test certain
_properties_ of the external function (e.g. does it provide good
collision resistance?), but in this case, I think we can trust that what
the library author wrote is a good default.

> +KHASH_INIT(str, const char *, int *, 1, kh_str_hash_func, kh_str_hash_equal);

As mentioned earlier, I think testing oid_[set|map|pos] would be much
more useful, and we should try to demonstrate each of the
macros/functions and how they interact, e.g. if I kh_put_*, I should
expect to be able to kh_get_* afterwards.

> +
> +int initialized_hashmap_pointer_not_null() {
> +  kh_str_t *hashmap = kh_init_str();
> +
> +  if(hashmap != NULL){
> +    free(hashmap);
> +    return 1;
> +  }
> +  return 0;
> +}

I don't think this is a useful test. It would be extremely weird for the
return value to be NULL in regular usage.

> +int put_key_into_hashmap_once_succeeds() {
> +  int ret, value;
> +  khint_t pos;
> +  kh_str_t *hashmap = kh_init_str();
> +  pos = kh_put_str(hashmap, "test_key", &ret);
> +  if(!ret)
> +    return 0;
> +  return pos != 0;
> +}

I don't think it makes sense to only check the return value, we should
check that it had the intended effect - that we can read back the value.
There is a later test that does this, so I'd prefer we remove this.

Comment on the test framework, not the test: I think it makes sense to
assert that "ret" is nonzero, but this "if..return" way of asserting
makes it impossible to tell what has broken. It also adds a lot of
cognitive overhead when trying to parse the assertion. I'll echo Phillip
Wood's comments and say that I think this is definitely not a framework
I want to use.

> +int put_same_key_into_hashmap_twice_fails() {
> +  int first_return_value, second_return_value, value;
> +  khint_t pos;
> +  kh_str_t *hashmap = kh_init_str();
> +  kh_put_str(hashmap, "test_key", &first_return_value);
> +  kh_put_str(hashmap, "test_key", &second_return_value);
> +  return !second_return_value;
> +}

This test seems reasonable.

> +int put_value_into_hashmap_once_succeeds() {
> +  int ret, value = 14;
> +  khint_t pos;
> +  kh_str_t *hashmap = kh_init_str();
> +  pos = kh_put_str(hashmap, "test_key", &ret);
> +  if (!ret)
> +    return 0;
> +  kh_key(hashmap, pos) = xstrdup("test_key");
> +  kh_value(hashmap, pos) = &value;
> +  return *kh_value(hashmap, pos) == value;
> +}

Checking the inserted value makes sense to me I don't really understand
khash.h, but is this the recommended way to get values back? kh_value()
looks like just a small macro around an internal array, so we are just
checking that *&value == value. I would have expected that we would call
kh_get to get "pos", then call kh_value on that.

Also, do we have to kh_put_str("some_key") and then immediately set it
again with kh_key(pos)? That seems odd, and I don't see other callers
doing that all the time.

Since this is the second time I see "if (!ret)", I should ask whether
this is meant as an assertion (checking the correctness of the code), or
for flow control (the code might return either 0 or 1, and I need to
handle both). If it is the former, I think it's unnecessary to
assert on it multiple times. The latter does not belong in a unit test -
we should already _know_ what we want to be returned. Btw, it is a huge
flaw in the test framework that both look exactly the same, so I'll say
again that I really don't like it.

> +int put_same_value_into_hashmap_twice_fails() {
> +  int first_return_value, second_return_value, value = 14;
> +  khint_t pos;
> +  kh_str_t *hashmap = kh_init_str();
> +  pos = kh_put_str(hashmap, "test_key", &first_return_value);
> +  if (!first_return_value)
> +    return 0;
> +  kh_key(hashmap, pos) = xstrdup("test_key");
> +  kh_value(hashmap, pos) = &value;
> +  kh_put_str(hashmap, "test_key", &second_return_value);
> +  return !second_return_value;
> +}

I don't see how this is different from the previous test that tested for
collisions.

> +int get_existing_kv_from_hashmap_succeeds() {
> +  int ret, value =14;
> +  int expected = value;
> +  khint_t pos;
> +  kh_str_t *hashmap = kh_init_str();
> +  pos = kh_put_str(hashmap, "test_key", &ret);
> +  if (!ret)
> +    return 0;
> +  kh_key(hashmap, pos) = xstrdup("test_key");
> +  kh_value(hashmap, pos) = &value;
> +  return *kh_value(hashmap, kh_get_str(hashmap, "test_key")) == expected;
> +}

Isn't this identical to put_value_into_hashmap_once_succeeds()?

Also, this assertion is harder to read than necessary because we have
both "expected" and "value" - the different names suggest that they are
different, but they're really the same. Let's just use one of them.

> +int get_nonexisting_kv_from_hashmap_fails() {
> +  int value = 14;
> +  khint_t pos;
> +  kh_str_t *hashmap = kh_init_str();
> +  return !kh_get_str(hashmap, "test_key");
> +}

Looks reasonable.

> +int deletion_from_hashmap_sets_flag() {
> +  int ret, value = 14;
> +  khint_t pos;
> +  kh_str_t *hashmap = kh_init_str();
> +  pos = kh_put_str(hashmap, "test_key", &ret);
> +  if (!ret)
> +    return 0;
> +  if(!kh_exist(hashmap, pos))
> +    return 0;
> +  kh_key(hashmap, pos) = xstrdup("test_key");
> +  kh_value(hashmap, pos) = &value;
> +  kh_del_str(hashmap, pos);
> +  return !kh_exist(hashmap, pos);
> +}

What is the "flag" being set? If it is the khash-internal flag that
marks that an entry is deleted, that's an unimportant implementation
detail - the important thing is that the value is deleted. Let's just
rename this to "deletion_works" or something.

> +
> +int deletion_from_hashmap_updates_size() {
> +  int ret, value = 14, current_size;
> +  khint_t pos;
> +  kh_str_t *hashmap = kh_init_str();
> +  pos = kh_put_str(hashmap, "test_key", &ret);
> +  if (!ret)
> +    return 0;
> +  kh_key(hashmap, pos) = xstrdup("test_key");
> +  kh_value(hashmap, pos) = &value;
> +  current_size = hashmap->size;
> +  kh_del_str(hashmap, pos);
> +  return hashmap->size + 1 == current_size;
> +}

The interface is kh_size(), so let's use that instead of referencing the
.size member correctly.

> +int deletion_from_hashmap_does_not_update_kv() {
> +  int ret, value = 14, current_size;
> +  khint_t pos;
> +  kh_str_t *hashmap = kh_init_str();
> +  pos = kh_put_str(hashmap, "test_key", &ret);
> +  if (!ret)
> +    return 0;
> +  kh_key(hashmap, pos) = xstrdup("test_key");
> +  kh_value(hashmap, pos) = &value;
> +  kh_del_str(hashmap, pos);
> +  return !strcmp(kh_key(hashmap, pos),"test_key");
> +}

Interesting! If we kh_del_(pos) but read it back with kh_key(), we can
still read it back. This has some implications on how khash should be
used - we should be extremely careful if we pass around a returned value
from kh_get_ or kh_put_ because we don't know if it's deleted or not.
Instead, we should prefer kh_get_(). Very useful demonstration.

> +int update_value_after_deletion_succeeds() {
> +  int ret, value1 = 14, value2 = 15;
> +  khint_t pos1, pos2;
> +  kh_str_t *hashmap = kh_init_str();
> +  // adding the kv to the hashmap
> +  pos1 = kh_put_str(hashmap, "test_key", &ret);
> +  if (!ret)
> +    return 0;
> +  kh_key(hashmap, pos1) = xstrdup("test_key");
> +  kh_value(hashmap, pos1) = &value1;
> +  // delete the kv from the hashmap
> +  kh_del_str(hashmap, pos1);
> +  // adding the same key with different value
> +  pos2 = kh_put_str(hashmap, "test_key", &ret);
> +  if (!ret)
> +    return 0;
> +  kh_key(hashmap, pos2) = xstrdup("test_key");
> +  kh_value(hashmap, pos2) = &value2;
> +  // check if the value is different
> +  return *kh_value(hashmap, kh_get_str(hashmap, "test_key")) == value2;
> +}

Looks reasonable.

> +int hashmap_size_matches_number_of_added_elements() {
> +  int ret, value1 = 14, value2 = 15, value3 = 16;
> +  khint_t pos1, pos2, pos3;
> +  kh_str_t *hashmap = kh_init_str();
> +  pos1 = kh_put_str(hashmap, "test_key1", &ret);
> +  if (!ret)
> +    return 0;
> +  kh_key(hashmap, pos1) = xstrdup("test_key1");
> +  kh_value(hashmap, pos1) = &value1;
> +  pos2 = kh_put_str(hashmap, "test_key2", &ret);
> +  if (!ret)
> +    return 0;
> +  kh_key(hashmap, pos2) = xstrdup("test_key2");
> +  kh_value(hashmap, pos2) = &value2;
> +  pos3 = kh_put_str(hashmap, "test_key3", &ret);
> +  if (!ret)
> +    return 0;
> +  kh_key(hashmap, pos3) = xstrdup("test_key3");
> +  kh_value(hashmap, pos3) = &value3;
> +  return kh_size(hashmap) == 3;
> +}

Looks reasonable.

I would also exercise kh_end, kh_begin, kh_clear, kh_foreach and
kh_foreach_value

kh_destroy might be useful to test too, in case there are oddities
there, but it's probably not that interesting

kh_n_buckets doesn't seem useful to test - it looks too specific to the
impkementation. kh_resize looks like an implementation detail that we
shouldn't test either.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH v2] khash_test.c : add unit tests
  2023-06-27 13:20 ` [RFC PATCH v2] khash_test.c : add unit tests Siddharth Singh
  2023-06-30  0:53   ` Glen Choo
@ 2023-06-30  1:05   ` Glen Choo
  1 sibling, 0 replies; 12+ messages in thread
From: Glen Choo @ 2023-06-30  1:05 UTC (permalink / raw)
  To: Siddharth Singh, git

Some comments on mechanical issues...

It seems that you dropped the other participants from your reply.

  https://git-scm.com/docs/MyFirstContribution#v2-git-send-email

Do CC them as it helps them keep track of discussions they were in. I
think they will not mind if you send a new message to this thread, CC
them, and explain that you didn't mean to omit them.

Siddharth Singh <siddhartth@google.com> writes:

> Signed-off-by: Siddharth Singh <siddhartth@google.com>
> ---
> Since v1
> - rewrote the code keeping coding style guidelines in mind.
> - refactored tests to improve their implementation..
> - added a few more tests that cause collisions in the hashmap.
> In the v1 patch, there were concerns that new tests were unnecessary because of upstream tests. However, I believe it would be beneficial to have tests based on the khash implementation in the git codebase because the existing tests[1] for khash do not use the same code for khash as the git codebase. 
> E.g. in khash.h file of “attractivechaos/klib/khash.h” [2]. There's an implementation for `KHASH_MAP_INIT_INT64` which isn’t defined in “git/git/khash.h”[3]. So, there’s a possibility that the khash.h in a different repository might move forward and end up being different from out implementation, so writing tests for “git/git/khash.h” would ensure that our tests are relevant to the actual usage of the library.
> While my colleagues are currently investigating whether C Tap harness is the best way to test libified code, this RFC is intended to discuss the content of unit tests and what other tests would be useful for khash.h. I am unsure of what tests will be valuable in the future, so I would appreciate your thoughts on the matter. Many test cases are easier to construct in C TAP harness than in test-tool. For example, In C TAP harness, non-string values can be used directly in hash maps. However, in test-tool, non-string values must be passed as an argument to a shell executable, parsed into the correct type, and then operated on.
> I'm also very new to writing unit tests in C and git codebase in general, so I'll appreciate constructive feedback and discussion..

Do rewrap the line to 80 characters, otherwise it is quite hard to read
on most text-based clients. You can preview your patches with "git
send-email --annotate" and see if they are a reasonable column width.

> diff --git a/Makefile b/Makefile
> index 660c72d72e..d3ad2fa23c 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -3847,11 +3847,13 @@ $(UNIT_TEST_RUNNER): t/runtests.c
>  
>  UNIT_TEST_PROGS += t/unit-tests/unit-test-t
>  UNIT_TEST_PROGS += t/unit-tests/strbuf-t
> +UNIT_TEST_PROGS += t/unit-tests/khash-t
>  
>  $(UNIT_TEST_PROGS): $(CTAP_OBJS) $(LIBS)
>  	$(QUIET)mkdir -p t/unit-tests
>  	$(QUIET_CC)$(CC) -It -o t/unit-tests/unit-test-t t/unit-test.c $(CTAP_OBJS)
>  	$(QUIET_CC)$(CC) -It -o t/unit-tests/strbuf-t t/strbuf-test.c -DSKIP_COMMON_MAIN common-main.c $(CTAP_OBJS) $(LIBS)

Many reviewers like to apply the patches for review. It's typically
assumed that patches are based on 'master', but this looks like it is
not. That's okay, though you should call it out. If it is based on
something not in Junio's tree (and thus a reviewer can't apply your
patch), it would be appreciated if you share this commit via a Git fork,
e.g. GitHub.

> +/*
> +  These tests are for base assumptions; if they fails, library is broken

Missing *.

> +int hash_string_succeeds() {
> +  const char *str = "test_string";

We use tabs, not spaces. I think "make style" catches this.

> +int initialized_hashmap_pointer_not_null() {
> +  kh_str_t *hashmap = kh_init_str();
> +
> +  if(hashmap != NULL){

"if ", not "if". "make style" will catch this.

> +int update_value_after_deletion_succeeds() {
> +  int ret, value1 = 14, value2 = 15;
> +  khint_t pos1, pos2;
> +  kh_str_t *hashmap = kh_init_str();
> +  // adding the kv to the hashmap

Comments use /* */, not //.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH v2] khash_test.c : add unit tests
  2023-06-30  0:53   ` Glen Choo
@ 2023-07-07 15:04     ` Siddharth Singh
  2023-07-07 21:12       ` Glen Choo
  0 siblings, 1 reply; 12+ messages in thread
From: Siddharth Singh @ 2023-07-07 15:04 UTC (permalink / raw)
  To: Glen Choo; +Cc: git, chooglen, nasamuffin

On Thu, Jun 29, 2023 at 05:53:16PM -0700, Glen Choo wrote:
> Siddharth Singh <siddhartth@google.com> writes:
> 
> > - rewrote the code keeping coding style guidelines in mind.
> 
> I noticed several style issues in the patch. I'll avoid commenting on
> those here so that we can focus on the other, big questions.
> 
> > I'm also very new to writing unit tests in C and git codebase in general, so I'll appreciate constructive feedback and discussion..
> 
> Okay. I think this is a good chance for the ML to decide on what sorts
> of unit test practices we want too.
> 
> > What are the unit test cases to include in khash.h?
> > What are the other types of tests that would be useful for khash.h?
> 
> For a third party library, I mostly agree with upthread commenters that
> testing the correctness of the _library_ code isn't that valuable on its
> own, but unit tests are also useful for demonstrating how the code
> should be used. In a lot of projects, unit tests are a pseudo-official
> demonstration of how an interface should be used, and that's sometimes
> better than seeing the code in production, because unit tests are
> typically a lot simpler and easier to reason about.
> 
> As an aside, this is something that a unit test framework will do _much
> better_ than test-tool, because it makes it easy to read and write these
> idtiomatic demos without worrying about how the test-tool executable
> will be used.
> 
> I think that if we consider these two factors together (correctness and
> developer education), the fact that we don't have great documentation of
> khash.h in-tree, and that the external docs aren't that comprehensive
> either, I think it would be useful to exercise each of the khash
> 'functions' to show how they work.
> 
> As Taylor mentioned upthread, I think the most valuable thing to test is
> the way our code uses khash.h (not the library code itself). IOW:
> 
>   KHASH_INIT(oid_set, struct object_id, int, 0, oidhash_by_value, oideq_by_value)
>   KHASH_INIT(oid_map, struct object_id, void *, 1, oidhash_by_value, oideq_by_value)
>   KHASH_INIT(oid_pos, struct object_id, int, 1, oidhash_by_value, oideq_by_value)
> 
> So we would focus on oidhash_by_value and oideq_by_value in particular,
> since that's provided by us. It would be very useful to verify that
> these functions are doing exactly what we think they should be doing,
> and we could do this with _every_ khash function to really explore the
> API surface.
> 
> In fact... I suspect that they _not_ doing what we think they should,
> or at the very least, might be displaying some very unintuitive
> behavior. object_id contains both the hash value _and_ the hash
> algorithm that generated it. If we have the same hash value, but
> generated by different algorithms, are they considered the same equal or
> not? To me, the intuitive answer is "no", but oidhash_by_value() only
> considers the hash value and ignores the algorithm altogether! I think
> that testing collisions between SHA1 and SHA256 values would be very
> useful.
> 
> > +/*
> > +  These tests are for base assumptions; if they fails, library is broken
> > +*/
> > +int hash_string_succeeds() {
> > +  const char *str = "test_string";
> > +  khint_t value = __ac_X31_hash_string(str);
> > +  return value == 0xf1a6305e;
> > +}
> > +
> > +int hash_string_with_non_alphabets_succeeds() {
> > +  const char *str = "test_string #2";
> > +  khint_t value = __ac_X31_hash_string(str);
> > +  return value == 0xfa970771;
> > +}
> 
> For a unit test, it is a best practice to test the external interface
> that callers are depending on, and to avoid implementation
> details/internals. If we do the latter, we end up with brittle tests
> that break on non-important changes. E.g. we definitely should not be
> testing the exact value of the hash function. The khash library
> maintainer could decide to change the hash function, and we should not
> be broken by that.
> 
> For khash.h, it appears that that anything prepended with "__" is meant
> to be internal, and everything else is external. E.g.
> __ac_X31_hash_string is certainly not meant to be used directly by end
> users., especially since there's a human-friendly macro for it:
> 
>   #define kh_str_hash_func(key) __ac_X31_hash_string(key)
> 
> However, because something is external, doesn't mean we need to be
> testing its implementation - we still shouldn't be testing the
> return value of kh_str_hash_func(). We may want to test certain
> _properties_ of the external function (e.g. does it provide good
> collision resistance?), but in this case, I think we can trust that what
> the library author wrote is a good default.
> 
> > +KHASH_INIT(str, const char *, int *, 1, kh_str_hash_func, kh_str_hash_equal);
> 
> As mentioned earlier, I think testing oid_[set|map|pos] would be much
> more useful, and we should try to demonstrate each of the
> macros/functions and how they interact, e.g. if I kh_put_*, I should
> expect to be able to kh_get_* afterwards.
> 
> > +
> > +int initialized_hashmap_pointer_not_null() {
> > +  kh_str_t *hashmap = kh_init_str();
> > +
> > +  if(hashmap != NULL){
> > +    free(hashmap);
> > +    return 1;
> > +  }
> > +  return 0;
> > +}
> 
> I don't think this is a useful test. It would be extremely weird for the
> return value to be NULL in regular usage.

While it is indeed uncommon for the return value of kh_init_str() to be null in
regular usage, there could be certain cases where the hashmap pointer might be
null. One possible scenario is if there is an error during memory allocation for
the hashmap. This can happen if the system runs out of memory or if there are
other memory-related issues. In such cases, the kh_init_str() function might
return a null pointer to indicate the failure to allocate memory for the hashmap.

>
> > +int put_key_into_hashmap_once_succeeds() {
> > +  int ret, value;
> > +  khint_t pos;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  pos = kh_put_str(hashmap, "test_key", &ret);
> > +  if(!ret)
> > +    return 0;
> > +  return pos != 0;
> > +}
> 
> I don't think it makes sense to only check the return value, we should
> check that it had the intended effect - that we can read back the value.
> There is a later test that does this, so I'd prefer we remove this.
> 
> Comment on the test framework, not the test: I think it makes sense to
> assert that "ret" is nonzero, but this "if..return" way of asserting
> makes it impossible to tell what has broken. It also adds a lot of
> cognitive overhead when trying to parse the assertion. I'll echo Phillip
> Wood's comments and say that I think this is definitely not a framework
> I want to use.
> 
> > +int put_same_key_into_hashmap_twice_fails() {
> > +  int first_return_value, second_return_value, value;
> > +  khint_t pos;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  kh_put_str(hashmap, "test_key", &first_return_value);
> > +  kh_put_str(hashmap, "test_key", &second_return_value);
> > +  return !second_return_value;
> > +}
> 
> This test seems reasonable.
> 
> > +int put_value_into_hashmap_once_succeeds() {
> > +  int ret, value = 14;
> > +  khint_t pos;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  pos = kh_put_str(hashmap, "test_key", &ret);
> > +  if (!ret)
> > +    return 0;
> > +  kh_key(hashmap, pos) = xstrdup("test_key");
> > +  kh_value(hashmap, pos) = &value;
> > +  return *kh_value(hashmap, pos) == value;
> > +}
> 
> Checking the inserted value makes sense to me I don't really understand
> khash.h, but is this the recommended way to get values back? kh_value()
> looks like just a small macro around an internal array, so we are just
> checking that *&value == value. I would have expected that we would call
> kh_get to get "pos", then call kh_value on that.
> 
> Also, do we have to kh_put_str("some_key") and then immediately set it
> again with kh_key(pos)? That seems odd, and I don't see other callers
> doing that all the time.
> 
> Since this is the second time I see "if (!ret)", I should ask whether
> this is meant as an assertion (checking the correctness of the code), or
> for flow control (the code might return either 0 or 1, and I need to
> handle both). If it is the former, I think it's unnecessary to
> assert on it multiple times. The latter does not belong in a unit test -
> we should already _know_ what we want to be returned. Btw, it is a huge
> flaw in the test framework that both look exactly the same, so I'll say
> again that I really don't like it.
> 

Regarding the usage of kh_value(), I saw this part of code[1] This suggests
that it may be the recommended approach in this context.

For kh_put_str, if i understand correctly, we first allocate a bucket then
then set it using kh_key, here's a use case[2] that does it similarly.

Regarding the usage of if (!ret), it seems to serve as an assertion or error
checking mechanism. If the return value ret is 0, it likely indicates an
error condition. It's worth mentioning that the similarity between the
assertion and flow control statements could be a flaw in the test framework,
as you pointed out.

> > +int put_same_value_into_hashmap_twice_fails() {
> > +  int first_return_value, second_return_value, value = 14;
> > +  khint_t pos;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  pos = kh_put_str(hashmap, "test_key", &first_return_value);
> > +  if (!first_return_value)
> > +    return 0;
> > +  kh_key(hashmap, pos) = xstrdup("test_key");
> > +  kh_value(hashmap, pos) = &value;
> > +  kh_put_str(hashmap, "test_key", &second_return_value);
> > +  return !second_return_value;
> > +}
> 
> I don't see how this is different from the previous test that tested for
> collisions.

I also attempt to assign a value to the hashmap here, but this will not be
successful due to a collision. The intent of this test was different, however
 
> > +int get_existing_kv_from_hashmap_succeeds() {
> > +  int ret, value =14;
> > +  int expected = value;
> > +  khint_t pos;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  pos = kh_put_str(hashmap, "test_key", &ret);
> > +  if (!ret)
> > +    return 0;
> > +  kh_key(hashmap, pos) = xstrdup("test_key");
> > +  kh_value(hashmap, pos) = &value;
> > +  return *kh_value(hashmap, kh_get_str(hashmap, "test_key")) == expected;
> > +}
> 
> Isn't this identical to put_value_into_hashmap_once_succeeds()?
> 
> Also, this assertion is harder to read than necessary because we have
> both "expected" and "value" - the different names suggest that they are
> different, but they're really the same. Let's just use one of them.
> 

I agree they both end up same implementation wise.

> > +int get_nonexisting_kv_from_hashmap_fails() {
> > +  int value = 14;
> > +  khint_t pos;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  return !kh_get_str(hashmap, "test_key");
> > +}
> 
> Looks reasonable.
> 
> > +int deletion_from_hashmap_sets_flag() {
> > +  int ret, value = 14;
> > +  khint_t pos;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  pos = kh_put_str(hashmap, "test_key", &ret);
> > +  if (!ret)
> > +    return 0;
> > +  if(!kh_exist(hashmap, pos))
> > +    return 0;
> > +  kh_key(hashmap, pos) = xstrdup("test_key");
> > +  kh_value(hashmap, pos) = &value;
> > +  kh_del_str(hashmap, pos);
> > +  return !kh_exist(hashmap, pos);
> > +}
> 
> What is the "flag" being set? If it is the khash-internal flag that
> marks that an entry is deleted, that's an unimportant implementation
> detail - the important thing is that the value is deleted. Let's just
> rename this to "deletion_works" or something.
> 
Oka, sounds good.
> > +
> > +int deletion_from_hashmap_updates_size() {
> > +  int ret, value = 14, current_size;
> > +  khint_t pos;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  pos = kh_put_str(hashmap, "test_key", &ret);
> > +  if (!ret)
> > +    return 0;
> > +  kh_key(hashmap, pos) = xstrdup("test_key");
> > +  kh_value(hashmap, pos) = &value;
> > +  current_size = hashmap->size;
> > +  kh_del_str(hashmap, pos);
> > +  return hashmap->size + 1 == current_size;
> > +}
> 
> The interface is kh_size(), so let's use that instead of referencing the
> .size member correctly.

Okay.

> > +int deletion_from_hashmap_does_not_update_kv() {
> > +  int ret, value = 14, current_size;
> > +  khint_t pos;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  pos = kh_put_str(hashmap, "test_key", &ret);
> > +  if (!ret)
> > +    return 0;
> > +  kh_key(hashmap, pos) = xstrdup("test_key");
> > +  kh_value(hashmap, pos) = &value;
> > +  kh_del_str(hashmap, pos);
> > +  return !strcmp(kh_key(hashmap, pos),"test_key");
> > +}
> 
> Interesting! If we kh_del_(pos) but read it back with kh_key(), we can
> still read it back. This has some implications on how khash should be
> used - we should be extremely careful if we pass around a returned value
> from kh_get_ or kh_put_ because we don't know if it's deleted or not.
> Instead, we should prefer kh_get_(). Very useful demonstration.
> 
Yes, we are still able to get the value but to truly know if something
was deleted or not, we need to use kh_exist.

> > +int update_value_after_deletion_succeeds() {
> > +  int ret, value1 = 14, value2 = 15;
> > +  khint_t pos1, pos2;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  // adding the kv to the hashmap
> > +  pos1 = kh_put_str(hashmap, "test_key", &ret);
> > +  if (!ret)
> > +    return 0;
> > +  kh_key(hashmap, pos1) = xstrdup("test_key");
> > +  kh_value(hashmap, pos1) = &value1;
> > +  // delete the kv from the hashmap
> > +  kh_del_str(hashmap, pos1);
> > +  // adding the same key with different value
> > +  pos2 = kh_put_str(hashmap, "test_key", &ret);
> > +  if (!ret)
> > +    return 0;
> > +  kh_key(hashmap, pos2) = xstrdup("test_key");
> > +  kh_value(hashmap, pos2) = &value2;
> > +  // check if the value is different
> > +  return *kh_value(hashmap, kh_get_str(hashmap, "test_key")) == value2;
> > +}
> 
> Looks reasonable.
> 
> > +int hashmap_size_matches_number_of_added_elements() {
> > +  int ret, value1 = 14, value2 = 15, value3 = 16;
> > +  khint_t pos1, pos2, pos3;
> > +  kh_str_t *hashmap = kh_init_str();
> > +  pos1 = kh_put_str(hashmap, "test_key1", &ret);
> > +  if (!ret)
> > +    return 0;
> > +  kh_key(hashmap, pos1) = xstrdup("test_key1");
> > +  kh_value(hashmap, pos1) = &value1;
> > +  pos2 = kh_put_str(hashmap, "test_key2", &ret);
> > +  if (!ret)
> > +    return 0;
> > +  kh_key(hashmap, pos2) = xstrdup("test_key2");
> > +  kh_value(hashmap, pos2) = &value2;
> > +  pos3 = kh_put_str(hashmap, "test_key3", &ret);
> > +  if (!ret)
> > +    return 0;
> > +  kh_key(hashmap, pos3) = xstrdup("test_key3");
> > +  kh_value(hashmap, pos3) = &value3;
> > +  return kh_size(hashmap) == 3;
> > +}
> 
> Looks reasonable.
> 
> I would also exercise kh_end, kh_begin, kh_clear, kh_foreach and
> kh_foreach_value
> 
> kh_destroy might be useful to test too, in case there are oddities
> there, but it's probably not that interesting
> 
> kh_n_buckets doesn't seem useful to test - it looks too specific to the
> impkementation. kh_resize looks like an implementation detail that we
> shouldn't test either.
>


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [RFC PATCH v2] khash_test.c : add unit tests
  2023-07-07 15:04     ` Siddharth Singh
@ 2023-07-07 21:12       ` Glen Choo
  0 siblings, 0 replies; 12+ messages in thread
From: Glen Choo @ 2023-07-07 21:12 UTC (permalink / raw)
  To: Siddharth Singh; +Cc: git, nasamuffin

Siddharth Singh <siddhartth@google.com> writes:

>> > +int initialized_hashmap_pointer_not_null() {
>> > +  kh_str_t *hashmap = kh_init_str();
>> > +
>> > +  if(hashmap != NULL){
>> > +    free(hashmap);
>> > +    return 1;
>> > +  }
>> > +  return 0;
>> > +}
>> 
>> I don't think this is a useful test. It would be extremely weird for the
>> return value to be NULL in regular usage.
>
> While it is indeed uncommon for the return value of kh_init_str() to be null in
> regular usage, there could be certain cases where the hashmap pointer might be
> null. One possible scenario is if there is an error during memory allocation for
> the hashmap. This can happen if the system runs out of memory or if there are
> other memory-related issues. In such cases, the kh_init_str() function might
> return a null pointer to indicate the failure to allocate memory for the hashmap.

Ah, yes, that is what I expected. By "regular usage", I meant the case
where the allocation succeeds. Nevertheless, I don't think this test
demonstrates or asserts anything that isn't easily observed in other
tests, though I am open to be proven wrong.

>> > +int put_value_into_hashmap_once_succeeds() {
>> > +  int ret, value = 14;
>> > +  khint_t pos;
>> > +  kh_str_t *hashmap = kh_init_str();
>> > +  pos = kh_put_str(hashmap, "test_key", &ret);
>> > +  if (!ret)
>> > +    return 0;
>> > +  kh_key(hashmap, pos) = xstrdup("test_key");
>> > +  kh_value(hashmap, pos) = &value;
>> > +  return *kh_value(hashmap, pos) == value;
>> > +}
>> Also, do we have to kh_put_str("some_key") and then immediately set it
>> again with kh_key(pos)? That seems odd, and I don't see other callers
>> doing that all the time.
>
> Regarding the usage of kh_value(), I saw this part of code[1] This suggests
> that it may be the recommended approach in this context.

What 'context' were you thinking of when you say "recommended approach
in this context"? (Did you mean kh_put_str() vs kh_put_oid_*()?) I
didn't see such an approach in most kh_put_*() call sites. I couldn't
find the link you're referencing, but I assume it is the following lines
in delta-islands.c:

	int hash_ret;
	khiter_t pos = kh_put_str(remote_islands, island_name, &hash_ret);

	if (hash_ret) {
		kh_key(remote_islands, pos) = xstrdup(island_name);
		kh_value(remote_islands, pos) = xcalloc(1, sizeof(struct remote_island));
	}

In which case, the use case (as I understand it at least) seems quite
different:

- When kh_put_str() 'returns' hash_ret = 0, it means there's already a
  matching entry and we should do nothing.

- Otherwise, kh_put_str() actually creates a new matching entry, and now
  we want to allocate memory to store the entry, so we overwrite the key
  with "kh_key() = xstrdup()". We could have avoided this by doing
  something like:

    char *key = xstrdup(island_name)
    khiter_t pos = kh_put_str(remote_islands, key, &hash_ret);

    if (hash_ret) {
      kh_value(remote_islands, pos) = xcalloc(1, sizeof(struct remote_island));
    } else {
      free(key);
    }

  but allocations are expensive, so we should avoid allocating before we
  know it's needed.

>> > +int put_same_value_into_hashmap_twice_fails() {
>> > +  int first_return_value, second_return_value, value = 14;
>> > +  khint_t pos;
>> > +  kh_str_t *hashmap = kh_init_str();
>> > +  pos = kh_put_str(hashmap, "test_key", &first_return_value);
>> > +  if (!first_return_value)
>> > +    return 0;
>> > +  kh_key(hashmap, pos) = xstrdup("test_key");
>> > +  kh_value(hashmap, pos) = &value;
>> > +  kh_put_str(hashmap, "test_key", &second_return_value);
>> > +  return !second_return_value;
>> > +}
>> 
>> I don't see how this is different from the previous test that tested for
>> collisions.
>
> I also attempt to assign a value to the hashmap here, but this will not be
> successful due to a collision. The intent of this test was different, however

Hm, what was the different intent? Is it to test this slightly different
case where the value is assigned?

My reasoning is:

- Would assigning a value result in different behavior?
- Would a reasonable user expect that it would result in different
  behavior?

If the answer to both of those is 'no' (which I believe it is), then it
is not worth testing.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2023-07-07 21:13 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-31 15:51 [RFC PATCH 0/1] Unit tests for khash.h Siddharth Singh
2023-05-31 15:51 ` [RFC PATCH 1/1] khash_test.c: add unit tests Siddharth Singh
2023-05-31 22:35 ` [RFC PATCH 0/1] Unit tests for khash.h Taylor Blau
2023-06-01  6:26   ` Junio C Hamano
2023-06-01 13:02   ` Phillip Wood
2023-05-31 23:35 ` Eric Wong
2023-06-01 13:23 ` Phillip Wood
2023-06-27 13:20 ` [RFC PATCH v2] khash_test.c : add unit tests Siddharth Singh
2023-06-30  0:53   ` Glen Choo
2023-07-07 15:04     ` Siddharth Singh
2023-07-07 21:12       ` Glen Choo
2023-06-30  1:05   ` Glen Choo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).