From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: 
X-Spam-Checker-Version: SpamAssassin 3.4.1 (2015-04-28) on dcvr.yhbt.net
X-Spam-Level: 
X-Spam-ASN: 
X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00
	shortcircuit=no autolearn=ham autolearn_force=no version=3.4.1
Received: from localhost (dcvr.yhbt.net [127.0.0.1])
	by dcvr.yhbt.net (Postfix) with ESMTP id B07F4208E9
	for ; Mon, 16 Jul 2018 21:19:37 +0000 (UTC)
From: Eric Wong
To: mwrap-public@80x24.org
Subject: [PATCH 01/19] support per-allocation headers for per-alloc tracking
Date: Mon, 16 Jul 2018 21:19:15 +0000
Message-Id: <20180716211933.5835-2-e@80x24.org>
In-Reply-To: <20180716211933.5835-1-e@80x24.org>
References: <20180716211933.5835-1-e@80x24.org>
List-Id: 

This increases costs even more, but will allow leak finding.
It will be made optional in the future.
---
 ext/mwrap/extconf.rb |  15 +++
 ext/mwrap/mwrap.c    | 312 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 292 insertions(+), 35 deletions(-)

diff --git a/ext/mwrap/extconf.rb b/ext/mwrap/extconf.rb
index 4ac8881..e9dbb1e 100644
--- a/ext/mwrap/extconf.rb
+++ b/ext/mwrap/extconf.rb
@@ -10,4 +10,19 @@ have_library 'urcu-bp' or abort 'liburcu-bp not found'
 have_library 'dl'
 have_library 'c'
 have_library 'execinfo' # FreeBSD
+
+if try_link(<<'')
+int main(void) { return __builtin_add_overflow_p(0,0,(int)1); }
+
+	$defs << '-DHAVE_BUILTIN_ADD_OVERFLOW_P'
+end
+
+if try_link(<<'')
+int main(int a) { return __builtin_add_overflow(0,0,&a); }
+
+	$defs << '-DHAVE_BUILTIN_ADD_OVERFLOW_P'
+else
+	abort 'missing __builtin_add_overflow'
+end
+
 create_makefile 'mwrap'
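The two try_link() probes above differ only in which GCC builtin they
exercise: __builtin_add_overflow_p (GCC 7+) is a pure predicate, while
__builtin_add_overflow (GCC 5+/clang) also stores the wrapped result.
A standalone sketch of what each link test compiles (illustrative only,
not part of the patch):

/* sketch only: overflow probes similar to the extconf.rb link tests */
#include <stdio.h>

int main(void)
{
	unsigned long a = (unsigned long)-1, c;

	/* GCC >= 5: computes the sum and reports whether it wrapped */
	if (__builtin_add_overflow(a, 1UL, &c))
		printf("add_overflow: result wrapped to %lu\n", c);
#if defined(__GNUC__) && __GNUC__ >= 7
	/* GCC >= 7: predicate-only form, no result object needed */
	if (__builtin_add_overflow_p(a, 1UL, (unsigned long)0))
		printf("add_overflow_p: would overflow\n");
#endif
	return 0;
}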
diff --git a/ext/mwrap/mwrap.c b/ext/mwrap/mwrap.c
index c160e33..2e75d8f 100644
--- a/ext/mwrap/mwrap.c
+++ b/ext/mwrap/mwrap.c
@@ -16,14 +16,21 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include "jhash.h"
 
 static ID id_uminus;
 const char *rb_source_location_cstr(int *line); /* requires 2.6.0dev */
 extern int __attribute__((weak)) ruby_thread_has_gvl_p(void);
 extern void * __attribute__((weak)) ruby_current_execution_context_ptr;
+extern void * __attribute__((weak)) ruby_current_vm_ptr; /* for rb_gc_count */
+extern size_t __attribute__((weak)) rb_gc_count(void);
+
+/* true for glibc/dlmalloc/ptmalloc, not sure about jemalloc */
+#define ASSUMED_MALLOC_ALIGNMENT (sizeof(void *) * 2)
 
 int __attribute__((weak)) ruby_thread_has_gvl_p(void)
 {
@@ -32,17 +39,17 @@ int __attribute__((weak)) ruby_thread_has_gvl_p(void)
 
 #ifdef __FreeBSD__
 void *__malloc(size_t);
-void *__calloc(size_t, size_t);
-void *__realloc(void *, size_t);
+void *__aligned_alloc(size_t, size_t);
+void __free(void *);
 static void *(*real_malloc)(size_t) = __malloc;
-static void *(*real_calloc)(size_t, size_t) = __calloc;
-static void *(*real_realloc)(void *, size_t) = __realloc;
+static void *(*real_memalign)(size_t, size_t) = __aligned_alloc;
+static void (*real_free)(void *) = __free;
 # define RETURN_IF_NOT_READY() do {} while (0) /* nothing */
 #else
 static int ready;
 static void *(*real_malloc)(size_t);
-static void *(*real_calloc)(size_t, size_t);
-static void *(*real_realloc)(void *, size_t);
+static void *(*real_memalign)(size_t, size_t);
+static void (*real_free)(void *);
 
 /*
  * we need to fake an OOM condition while dlsym is running,
@@ -58,7 +65,26 @@
 #endif /* !FreeBSD */
 
+static size_t generation;
+static size_t page_size;
 static struct cds_lfht *totals;
+union padded_mutex {
+	pthread_mutex_t mtx;
+	char pad[64];
+};
+
+/* a round-robin pool of mutexes */
+#define MUTEX_NR   (1 << 6)
+#define MUTEX_MASK (MUTEX_NR - 1)
+static size_t mutex_i;
+static union padded_mutex mutexes[MUTEX_NR] = {
+	[0 ... (MUTEX_NR-1)].mtx = PTHREAD_MUTEX_INITIALIZER
+};
+
+static pthread_mutex_t *mutex_assign(void)
+{
+	return &mutexes[uatomic_add_return(&mutex_i, 1) & MUTEX_MASK].mtx;
+}
 
 static struct cds_lfht *
 lfht_new(void)
@@ -72,16 +98,16 @@ __attribute__((constructor)) static void resolve_malloc(void)
 #ifndef __FreeBSD__
 	real_malloc = dlsym(RTLD_NEXT, "malloc");
-	real_calloc = dlsym(RTLD_NEXT, "calloc");
-	real_realloc = dlsym(RTLD_NEXT, "realloc");
-	if (!real_calloc || !real_malloc || !real_realloc) {
-		fprintf(stderr, "missing calloc/malloc/realloc %p %p %p\n",
-			real_calloc, real_malloc, real_realloc);
+	real_memalign = dlsym(RTLD_NEXT, "aligned_alloc");
+	real_free = dlsym(RTLD_NEXT, "free");
+	if (!real_malloc || !real_memalign || !real_free) {
+		fprintf(stderr, "missing malloc/aligned_alloc/free\n"
+			"\t%p %p %p\n",
+			real_malloc, real_memalign, real_free);
 		_exit(1);
 	}
 	ready = 1;
 #endif
-
 	totals = lfht_new();
 	if (!totals)
 		fprintf(stderr, "failed to allocate totals table\n");
@@ -91,6 +117,21 @@ __attribute__((constructor)) static void resolve_malloc(void)
 			call_rcu_after_fork_child);
 	if (err)
 		fprintf(stderr, "pthread_atfork failed: %s\n", strerror(err));
+	page_size = sysconf(_SC_PAGESIZE);
+}
+
+static void
+mutex_lock(pthread_mutex_t *m)
+{
+	int err = pthread_mutex_lock(m);
+	assert(err == 0);
+}
+
+static void
+mutex_unlock(pthread_mutex_t *m)
+{
+	int err = pthread_mutex_unlock(m);
+	assert(err == 0);
 }
 
 #ifndef HAVE_MEMPCPY
@@ -142,19 +183,47 @@ static char *int2str(int num, char *dst, size_t * size)
  */
 static int has_ec_p(void)
 {
-	return (ruby_thread_has_gvl_p() && ruby_current_execution_context_ptr);
+	return (ruby_thread_has_gvl_p() && ruby_current_vm_ptr &&
+		ruby_current_execution_context_ptr);
 }
 
+/* allocated via real_malloc/real_free */
 struct src_loc {
 	struct rcu_head rcu_head;
+	pthread_mutex_t *mtx;
 	size_t calls;
 	size_t total;
	struct cds_lfht_node hnode;
+	struct cds_list_head allocs; /* <=> alloc_hdr.node */
 	uint32_t hval;
 	uint32_t capa;
 	char k[];
 };
 
+/* every allocation has this in the header, maintain alignment with malloc */
+struct alloc_hdr {
+	struct cds_list_head anode; /* <=> src_loc.allocs */
+	union {
+		struct {
+			size_t gen; /* rb_gc_count() */
+			struct src_loc *loc;
+		} live;
+		struct rcu_head dead;
+	} as;
+	void *real; /* what to call real_free on */
+	size_t size;
+};
+
+static struct alloc_hdr *ptr2hdr(void *p)
+{
+	return (struct alloc_hdr *)((uintptr_t)p - sizeof(struct alloc_hdr));
+}
+
+static void *hdr2ptr(struct alloc_hdr *h)
+{
+	return (void *)((uintptr_t)h + sizeof(struct alloc_hdr));
+}
+
 static int loc_is_addr(const struct src_loc *l)
 {
 	return l->capa == 0;
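The struct alloc_hdr added above is the heart of the change: every
pointer handed back to the application is preceded by a private header,
so ptr2hdr()/hdr2ptr() are plain pointer arithmetic and no side table is
needed to find an allocation's metadata.  A minimal standalone sketch of
that layout (struct hdr and the 16-byte payload are illustrative, not
part of the patch):

/* sketch only: header-before-pointer layout, as used by alloc_hdr */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct hdr { size_t size; };	/* stands in for struct alloc_hdr */

static void *hdr2ptr(struct hdr *h)
{
	return (void *)((uintptr_t)h + sizeof(*h));
}

static struct hdr *ptr2hdr(void *p)
{
	return (struct hdr *)((uintptr_t)p - sizeof(struct hdr));
}

int main(void)
{
	struct hdr *h = malloc(sizeof(*h) + 16);	/* header + user data */
	void *p;

	if (!h)
		return 1;
	p = hdr2ptr(h);		/* what the caller sees */
	h->size = 16;
	memset(p, 0, 16);
	printf("requested size: %zu\n", ptr2hdr(p)->size);
	free(ptr2hdr(p));	/* recover the original allocation */
	return 0;
}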
@@ -177,14 +246,13 @@ static int loc_eq(struct cds_lfht_node *node, const void *key)
 		memcmp(k->k, existing->k, loc_size(k)) == 0);
 }
 
-static void totals_add(struct src_loc *k)
+static struct src_loc *totals_add(struct src_loc *k)
 {
 	struct cds_lfht_iter iter;
 	struct cds_lfht_node *cur;
-	struct src_loc *l;
+	struct src_loc *l = 0;
 	struct cds_lfht *t;
-
 again:
 	rcu_read_lock();
 	t = rcu_dereference(totals);
@@ -197,25 +265,27 @@ again:
 		uatomic_add(&l->calls, 1);
 	} else {
 		size_t n = loc_size(k);
-		l = malloc(sizeof(*l) + n);
+		l = real_malloc(sizeof(*l) + n);
 		if (!l) goto out_unlock;
-
 		memcpy(l, k, sizeof(*l) + n);
+		l->mtx = mutex_assign();
 		l->calls = 1;
+		CDS_INIT_LIST_HEAD(&l->allocs);
 		cur = cds_lfht_add_unique(t, k->hval, loc_eq, l, &l->hnode);
 		if (cur != &l->hnode) { /* lost race */
 			rcu_read_unlock();
-			free(l);
+			real_free(l);
 			goto again;
 		}
 	}
 out_unlock:
 	rcu_read_unlock();
+	return l;
 }
 
-static void update_stats(size_t size, uintptr_t caller)
+static struct src_loc *update_stats(size_t size, uintptr_t caller)
 {
-	struct src_loc *k;
+	struct src_loc *k, *ret = 0;
 	static const size_t xlen = sizeof(caller);
 	char *dst;
 
@@ -227,6 +297,8 @@ static void update_stats(size_t size, uintptr_t caller)
 		size_t len;
 		size_t int_size = INT2STR_MAX;
 
+		generation = rb_gc_count();
+
 		if (!ptr) goto unknown;
 
 		/* avoid vsnprintf or anything which could call malloc here: */
@@ -240,7 +312,7 @@ static void update_stats(size_t size, uintptr_t caller)
 		*dst = 0;	/* terminate string */
 		k->capa = (uint32_t)(dst - k->k + 1);
 		k->hval = jhash(k->k, k->capa, 0xdeadbeef);
-		totals_add(k);
+		ret = totals_add(k);
 	} else {
 		rb_bug("bad math making key from location %s:%d\n",
 			ptr, line);
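The internal_memalign() added in the hunk below cannot simply put the
header right in front of an arbitrarily aligned pointer, so it
over-allocates by one alignment unit and returns real + alignment; the
header then always fits in the padding just below the returned pointer
(after alignment has been doubled until it is at least
sizeof(struct alloc_hdr)).  A standalone sketch of that padding trick,
with illustrative sizes and plain aligned_alloc() standing in for
real_memalign() (not part of the patch):

/* sketch only: header hidden in the padding below an aligned pointer */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct hdr { void *real; size_t size; };	/* stands in for alloc_hdr */

int main(void)
{
	size_t alignment = 64, size = 64;	/* alignment >= sizeof(struct hdr) */
	void *real = aligned_alloc(alignment, size + alignment); /* "asize" */
	void *p;
	struct hdr *h;

	if (!real)
		return 1;
	p = (void *)((uintptr_t)real + alignment);	/* still 64-byte aligned */
	h = (struct hdr *)((uintptr_t)p - sizeof(*h));	/* lives in the padding */
	h->real = real;
	h->size = size;
	printf("aligned: %d\n", (int)((uintptr_t)p % alignment == 0));
	free(h->real);	/* freeing must use the raw pointer, not p */
	return 0;
}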
@@ -252,36 +324,206 @@ unknown:
 		memcpy(k->k, &caller, xlen);
 		k->capa = 0;
 		k->hval = jhash(k->k, xlen, 0xdeadbeef);
-		totals_add(k);
+		ret = totals_add(k);
 	}
 out:
 	--locating;
+	return ret;
+}
+
+size_t malloc_usable_size(void *p)
+{
+	return ptr2hdr(p)->size;
+}
+
+static void
+free_hdr_rcu(struct rcu_head *dead)
+{
+	struct alloc_hdr *h = caa_container_of(dead, struct alloc_hdr, as.dead);
+	real_free(h->real);
+}
+
+void free(void *p)
+{
+	if (p) {
+		struct alloc_hdr *h = ptr2hdr(p);
+		if (h->as.live.loc) {
+			h->size = 0;
+			mutex_lock(h->as.live.loc->mtx);
+			cds_list_del_rcu(&h->anode);
+			mutex_unlock(h->as.live.loc->mtx);
+			call_rcu(&h->as.dead, free_hdr_rcu);
+		}
+		else {
+			real_free(h->real);
+		}
+	}
+}
+
+static void
+alloc_insert(struct src_loc *l, struct alloc_hdr *h, size_t size, void *real)
+{
+	if (!h) return;
+	h->size = size;
+	h->real = real;
+	h->as.live.loc = l;
+	h->as.live.gen = generation;
+	if (l) {
+		mutex_lock(l->mtx);
+		cds_list_add_rcu(&h->anode, &l->allocs);
+		mutex_unlock(l->mtx);
+	}
+}
+
+static size_t size_align(size_t size, size_t alignment)
+{
+	return ((size + (alignment - 1)) & ~(alignment - 1));
+}
+
+static void *internal_memalign(size_t alignment, size_t size, uintptr_t caller)
+{
+	struct src_loc *l;
+	struct alloc_hdr *h;
+	void *p, *real;
+	size_t asize;
+
+	RETURN_IF_NOT_READY();
+	if (alignment <= ASSUMED_MALLOC_ALIGNMENT)
+		return malloc(size);
+	for (; alignment < sizeof(struct alloc_hdr); alignment *= 2)
+		; /* double alignment until >= sizeof(struct alloc_hdr) */
+	if (__builtin_add_overflow(size, alignment, &asize)) {
+		errno = ENOMEM;
+		return 0;
+	}
+	l = update_stats(size, caller);
+	real = real_memalign(alignment, asize);
+	p = (void *)((uintptr_t)real + alignment);
+	h = (struct alloc_hdr *)((uintptr_t)p - sizeof(struct alloc_hdr));
+	alloc_insert(l, h, size, real);
+
+	return p;
+}
+
+void *memalign(size_t alignment, size_t size)
+{
+	return internal_memalign(alignment, size, RETURN_ADDRESS(0));
+}
+
+static bool is_power_of_two(size_t n) { return (n & (n - 1)) == 0; }
+
+int posix_memalign(void **p, size_t alignment, size_t size)
+{
+	size_t d = alignment / sizeof(void*);
+	size_t r = alignment % sizeof(void*);
+
+	if (r != 0 || d == 0 || !is_power_of_two(d))
+		return EINVAL;
+
+	*p = internal_memalign(alignment, size, RETURN_ADDRESS(0));
+	return *p ? 0 : ENOMEM;
+}
+
+void *aligned_alloc(size_t, size_t) __attribute__((alias("memalign")));
+void cfree(void *) __attribute__((alias("free")));
+
+void *valloc(size_t size)
+{
+	return internal_memalign(page_size, size, RETURN_ADDRESS(0));
+}
+
+#if __GNUC__ < 7
+# define add_overflow_p(a,b) __extension__({ \
+		__typeof__(a) _c; \
+		__builtin_add_overflow(a,b,&_c); \
+	})
+#else
+# define add_overflow_p(a,b) \
+	__builtin_add_overflow_p((a),(b),(__typeof__(a+b))0)
+#endif
+
+void *pvalloc(size_t size)
+{
+	size_t alignment = page_size;
+
+	if (add_overflow_p(size, alignment)) {
+		errno = ENOMEM;
+		return 0;
+	}
+	size = size_align(size, alignment);
+	return internal_memalign(alignment, size, RETURN_ADDRESS(0));
 }
 
-/*
- * Do we care for *memalign?  ruby/gc.c uses it in ways this lib
- * doesn't care about, but maybe some gems use it, too.
- */
 void *malloc(size_t size)
 {
+	struct src_loc *l;
+	struct alloc_hdr *h;
+	size_t asize;
+
+	if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
+		errno = ENOMEM;
+		return 0;
+	}
 	RETURN_IF_NOT_READY();
-	update_stats(size, RETURN_ADDRESS(0));
-	return real_malloc(size);
+	l = update_stats(size, RETURN_ADDRESS(0));
+	h = real_malloc(asize);
+	if (!h) return 0;
+	alloc_insert(l, h, size, h);
+	return hdr2ptr(h);
 }
 
 void *calloc(size_t nmemb, size_t size)
 {
+	void *p;
+	struct src_loc *l;
+	struct alloc_hdr *h;
+	size_t asize;
+
+	if (__builtin_mul_overflow(size, nmemb, &size)) {
+		errno = ENOMEM;
+		return 0;
+	}
+	if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
+		errno = ENOMEM;
+		return 0;
+	}
 	RETURN_IF_NOT_READY();
-	/* ruby_xcalloc already does overflow checking */
-	update_stats(nmemb * size, RETURN_ADDRESS(0));
-	return real_calloc(nmemb, size);
+	l = update_stats(size, RETURN_ADDRESS(0));
+	h = real_malloc(asize);
+	if (!h) return 0;
+	alloc_insert(l, h, size, h);
+	p = hdr2ptr(h);
+	memset(p, 0, size);
+	return p;
 }
 
 void *realloc(void *ptr, size_t size)
 {
+	void *p;
+	struct src_loc *l;
+	struct alloc_hdr *h;
+	size_t asize;
+
+	if (!size) {
+		free(ptr);
+		return 0;
+	}
+	if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
+		errno = ENOMEM;
+		return 0;
+	}
 	RETURN_IF_NOT_READY();
-	update_stats(size, RETURN_ADDRESS(0));
-	return real_realloc(ptr, size);
+	l = update_stats(size, RETURN_ADDRESS(0));
+	h = real_malloc(asize);
+	if (!h) return 0;
+	alloc_insert(l, h, size, h);
+	p = hdr2ptr(h);
+	if (ptr) {
+		struct alloc_hdr *old = ptr2hdr(ptr);
+		memcpy(p, ptr, old->size < size ? old->size : size);
+		free(ptr);
+	}
+	return p;
 }
 
 struct dump_arg {
@@ -360,7 +602,7 @@ static void
 free_src_loc(struct rcu_head *head)
 {
 	struct src_loc *l = caa_container_of(head, struct src_loc, rcu_head);
-	free(l);
+	real_free(l);
 }
 
 static void *totals_clear(void *ign)

-- 
EW
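The locking scheme used throughout the patch is also worth spelling out:
src_loc entries never create or destroy mutexes, they just borrow the
next slot from a small fixed pool, so unrelated call sites rarely share
a lock and nothing needs cleanup at exit.  A standalone sketch of the
same round-robin pool, using C11 atomics in place of liburcu's
uatomic_add_return() (illustrative only, not part of the patch):

/* sketch only: round-robin striped mutex pool */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define MUTEX_NR   (1 << 6)
#define MUTEX_MASK (MUTEX_NR - 1)

static _Atomic size_t mutex_i;
/* same GCC range-designator extension the patch relies on */
static pthread_mutex_t mutexes[MUTEX_NR] = {
	[0 ... (MUTEX_NR - 1)] = PTHREAD_MUTEX_INITIALIZER
};

static pthread_mutex_t *mutex_assign(void)
{
	/* fetch_add returns the old value, uatomic_add_return the new;
	 * the mask hides the difference */
	return &mutexes[atomic_fetch_add(&mutex_i, 1) & MUTEX_MASK];
}

int main(void)
{
	pthread_mutex_t *a = mutex_assign(), *b = mutex_assign();

	printf("distinct locks: %d\n", a != b);
	pthread_mutex_lock(a);
	pthread_mutex_unlock(a);
	return 0;
}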