mwrap (Perl version) user+dev discussion/patches/pulls/bugs/help
 help / color / mirror / code / Atom feed
* [PATCH] port to Perl5 and XS
@ 2019-10-31 20:03 Eric Wong
  0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2019-10-31 20:03 UTC (permalink / raw)
  To: mwrap-perl

I mainly use Perl5 (again :P), and sometimes tracking down where
malloc calls happen is necessary.  I don't know of any malloc
wrapper interface which is aware of Perl source locations.

Valgrind and similar tools can only figure out C source
locations, which isn't very useful when hacking in Perl.
---
 .document                    |    2 -
 .gitignore                   |   12 +-
 .olddoc.yml                  |    8 -
 MANIFEST                     |   18 +-
 Makefile.PL                  |   72 ++
 Mwrap.xs                     |  891 +++++++++++++++++++++
 README                       |   86 +-
 Rakefile                     |   16 -
 bin/mwrap                    |   36 -
 ext/mwrap/extconf.rb         |   28 -
 ext/mwrap/mwrap.c            | 1464 ----------------------------------
 ext/mwrap/jhash.h => jhash.h |    0
 lib/Devel/Mwrap.pm           |   15 +
 lib/mwrap_rack.rb            |  172 ----
 mwrap.gemspec                |   32 -
 script/mwrap-perl            |   34 +
 t/mwrap.t                    |   85 ++
 t/source_location.perl       |    9 +
 test/test_mwrap.rb           |  322 --------
 typemap                      |    4 +
 20 files changed, 1163 insertions(+), 2143 deletions(-)
 delete mode 100644 .document
 delete mode 100644 .olddoc.yml
 create mode 100644 Makefile.PL
 create mode 100644 Mwrap.xs
 delete mode 100644 Rakefile
 delete mode 100755 bin/mwrap
 delete mode 100644 ext/mwrap/extconf.rb
 delete mode 100644 ext/mwrap/mwrap.c
 rename ext/mwrap/jhash.h => jhash.h (100%)
 create mode 100644 lib/Devel/Mwrap.pm
 delete mode 100644 lib/mwrap_rack.rb
 delete mode 100644 mwrap.gemspec
 create mode 100644 script/mwrap-perl
 create mode 100644 t/mwrap.t
 create mode 100644 t/source_location.perl
 delete mode 100644 test/test_mwrap.rb
 create mode 100644 typemap

diff --git a/.document b/.document
deleted file mode 100644
index 4ca33e3..0000000
--- a/.document
+++ /dev/null
@@ -1,2 +0,0 @@
-ext/mwrap/mwrap.c
-lib/mwrap_rack.rb
diff --git a/.gitignore b/.gitignore
index aa3606c..81948b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,10 @@
-/tmp
 *.o
 *.so
-/pkg
-/*.gem
-/doc
+/MYMETA.
+/MYMETA.*
+/MANIFEST.gen
+/Makefile
+/Mwrap.bs
+/Mwrap.c
+/blib
+/pm_to_blib
diff --git a/.olddoc.yml b/.olddoc.yml
deleted file mode 100644
index dac0353..0000000
--- a/.olddoc.yml
+++ /dev/null
@@ -1,8 +0,0 @@
----
-cgit_url: https://80x24.org/mwrap.git
-git_url: https://80x24.org/mwrap.git
-rdoc_url: https://80x24.org/mwrap/
-ml_url: https://80x24.org/mwrap-public/
-public_email: mwrap-public@80x24.org
-nntp_url:
-  - nntp://news.public-inbox.org/inbox.comp.lang.ruby.mwrap
diff --git a/MANIFEST b/MANIFEST
index e6d8964..2fa42b1 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,14 +1,12 @@
-.document
 .gitignore
-.olddoc.yml
 COPYING
 MANIFEST
+Makefile.PL
+Mwrap.xs
 README
-Rakefile
-bin/mwrap
-ext/mwrap/extconf.rb
-ext/mwrap/jhash.h
-ext/mwrap/mwrap.c
-lib/mwrap_rack.rb
-mwrap.gemspec
-test/test_mwrap.rb
+jhash.h
+lib/Devel/Mwrap.pm
+script/mwrap-perl
+t/mwrap.t
+t/source_location.perl
+typemap
diff --git a/Makefile.PL b/Makefile.PL
new file mode 100644
index 0000000..1ae3080
--- /dev/null
+++ b/Makefile.PL
@@ -0,0 +1,72 @@
+use strict;
+use ExtUtils::MakeMaker;
+use Config;
+my $pkg_config = $ENV{PKG_CONFIG} // 'pkg-config';
+my $LIBS = `$pkg_config --libs liburcu-cds liburcu-bp`;
+if ($?) {
+	print STDERR <<END;
+`$pkg_config --libs liburcu-cds liburcu-bp` failed (\$?=$?)
+
+You need to install pkg-config and liburcu <https://liburcu.org/>
+before you can build Devel::Mwrap.
+
+On Debian:
+
+	apt-get install pkg-config liburcu-dev
+END
+	# tell CPAN testing to indicate missing deps
+	exit 0;
+}
+
+if ($Config{usemymalloc} eq 'y') {
+	print STDERR <<END;
+Devel::Mwrap requires `usemymalloc=n'.  malloc and related functions
+must be dynamically-linked.
+END
+	exit 0;
+}
+
+# may be empty
+chomp(my $INC = `$pkg_config --cflags liburcu-cds liburcu-bp`);
+my @writemakefile_args = ();
+# Filter out some gcc options which g++ doesn't support.
+my $CCFLAGS = $Config{ccflags};
+
+if (defined $ENV{CPPFLAGS}) {
+	$CCFLAGS .= ' ' . $ENV{CPPFLAGS};
+}
+
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+push @writemakefile_args, (
+	NAME => 'Devel::Mwrap',
+	VERSION_FROM => 'lib/Devel/Mwrap.pm',
+	PREREQ_PM => {},
+	ABSTRACT_FROM => 'lib/Devel/Mwrap.pm',
+	EXE_FILES => [qw(script/mwrap-perl)],
+	AUTHOR => 'mwrap hackers <mwrap-perl@80x24.org>',
+	LIBS => $LIBS, # e.g. -lurcu-cds
+	LICENSE => 'gpl_2', # GPL-2.0+, CPAN::Meta::Spec limitation
+	MIN_PERL_VERSION => '5.14.0', # for caller_cx
+	BUILD_REQUIRES => {},
+	CCFLAGS => $CCFLAGS, # e.g -I/usr/include/$ARCH
+	INC => $INC,
+	depend => {
+		Makefile => 'lib/Devel/Mwrap.pm',
+	}
+);
+
+WriteMakefile(@writemakefile_args);
+
+sub MY::postamble {
+	<<EOF;
+N = \$\$(( \$\$(nproc 2>/dev/null || gnproc 2>/dev/null || echo 2) + 1 ))
+-include config.mak
+
+check-manifest :: MANIFEST
+	if git ls-files >\$?.gen 2>&1; then diff -u \$? \$?.gen; fi
+
+check:: all check-manifest
+	PERL5LIB=blib/lib:blib/arch prove -vw -j\$(N)
+EOF
+}
diff --git a/Mwrap.xs b/Mwrap.xs
new file mode 100644
index 0000000..f196b1a
--- /dev/null
+++ b/Mwrap.xs
@@ -0,0 +1,891 @@
+/*
+ * Copyright (C) 2018-2019 mwrap hackers <mwrap-perl@80x24.org>
+ * License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+ * Disclaimer: I don't really know my way around XS or Perl internals well
+ */
+#define _LGPL_SOURCE /* allows URCU to inline some stuff */
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+#include "embed.h"
+
+#include <execinfo.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dlfcn.h>
+#include <assert.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <urcu-bp.h>
+#include <urcu/rculfhash.h>
+#include <urcu/rculist.h>
+#include "jhash.h"
+
+static size_t total_bytes_inc, total_bytes_dec;
+
+extern pthread_key_t __attribute__((weak)) PL_thr_key;
+
+/* true for glibc/dlmalloc/ptmalloc, not sure about jemalloc */
+#define ASSUMED_MALLOC_ALIGNMENT (sizeof(void *) * 2)
+
+#ifdef __FreeBSD__
+void *__malloc(size_t);
+void __free(void *);
+#  define real_malloc __malloc
+#  define real_free __free
+#else
+static void *(*real_malloc)(size_t);
+static void (*real_free)(void *);
+static int resolving_malloc;
+#endif /* !FreeBSD */
+
+/*
+ * we need to fake an OOM condition while dlsym is running,
+ * as that calls calloc under glibc, but we don't have the
+ * symbol for the jemalloc calloc, yet
+ */
+#  define RETURN_IF_NOT_READY() do { \
+	if (!real_malloc) { \
+		errno = ENOMEM; \
+		return NULL; \
+	} \
+} while (0)
+
+static __thread size_t locating;
+static size_t page_size;
+static struct cds_lfht *totals;
+union padded_mutex {
+	pthread_mutex_t mtx;
+	char pad[64];
+};
+
+/* a round-robin pool of mutexes */
+#define MUTEX_NR   (1 << 6)
+#define MUTEX_MASK (MUTEX_NR - 1)
+static size_t mutex_i;
+static union padded_mutex mutexes[MUTEX_NR] = {
+	[0 ... (MUTEX_NR-1)].mtx = PTHREAD_MUTEX_INITIALIZER
+};
+
+static pthread_mutex_t *mutex_assign(void)
+{
+	return &mutexes[uatomic_add_return(&mutex_i, 1) & MUTEX_MASK].mtx;
+}
+
+static struct cds_lfht *
+lfht_new(void)
+{
+	return cds_lfht_new(16384, 1, 0, CDS_LFHT_AUTO_RESIZE, 0);
+}
+
+__attribute__((constructor)) static void resolve_malloc(void)
+{
+	int err;
+	++locating;
+
+#ifdef __FreeBSD__
+	/*
+	 * PTHREAD_MUTEX_INITIALIZER on FreeBSD means lazy initialization,
+	 * which happens at pthread_mutex_lock, and that calls calloc
+	 */
+	{
+		size_t i;
+
+		for (i = 0; i < MUTEX_NR; i++) {
+			err = pthread_mutex_init(&mutexes[i].mtx, 0);
+			if (err) {
+				fprintf(stderr, "error: %s\n", strerror(err));
+				_exit(1);
+			}
+		}
+		/* initialize mutexes used by urcu-bp */
+		rcu_read_lock();
+		rcu_read_unlock();
+	}
+#else /* !FreeBSD (tested on GNU/Linux) */
+	if (!real_malloc) {
+		resolving_malloc = 1;
+		real_malloc = dlsym(RTLD_NEXT, "malloc");
+	}
+	real_free = dlsym(RTLD_NEXT, "free");
+	if (!real_malloc || !real_free) {
+		fprintf(stderr, "missing malloc/aligned_alloc/free\n"
+			"\t%p %p\n", real_malloc, real_free);
+		_exit(1);
+	}
+#endif /* !FreeBSD */
+	err = pthread_atfork(call_rcu_before_fork,
+				call_rcu_after_fork_parent,
+				call_rcu_after_fork_child);
+	if (err)
+		fprintf(stderr, "pthread_atfork failed: %s\n", strerror(err));
+	page_size = sysconf(_SC_PAGESIZE);
+	--locating;
+}
+
+static void
+mutex_lock(pthread_mutex_t *m)
+{
+	int err = pthread_mutex_lock(m);
+	assert(err == 0);
+}
+
+static void
+mutex_unlock(pthread_mutex_t *m)
+{
+	int err = pthread_mutex_unlock(m);
+	assert(err == 0);
+}
+
+#ifndef HAVE_MEMPCPY
+static void *
+my_mempcpy(void *dest, const void *src, size_t n)
+{
+	return (char *)memcpy(dest, src, n) + n;
+}
+#define mempcpy(dst,src,n) my_mempcpy(dst,src,n)
+#endif
+
+/* stolen from glibc: */
+#define RETURN_ADDRESS(nr) \
+  (uintptr_t)(__builtin_extract_return_addr(__builtin_return_address(nr)))
+
+#define INT2STR_MAX (sizeof(unsigned) == 4 ? 10 : 19)
+static char *int2str(unsigned num, char *dst, size_t * size)
+{
+	if (num <= 9) {
+		*size -= 1;
+		*dst++ = (char)(num + '0');
+		return dst;
+	} else {
+		char buf[INT2STR_MAX];
+		char *end = buf + sizeof(buf);
+		char *p = end;
+		size_t adj;
+
+		do {
+			*size -= 1;
+			*--p = (char)((num % 10) + '0');
+			num /= 10;
+		} while (num && *size);
+
+		if (!num) {
+			adj = end - p;
+			return mempcpy(dst, p, adj);
+		}
+	}
+	return NULL;
+}
+
+/* allocated via real_malloc/real_free */
+struct src_loc {
+	pthread_mutex_t *mtx;
+	size_t total;
+	size_t allocations;
+	size_t frees;
+	struct cds_lfht_node hnode;
+	struct cds_list_head allocs; /* <=> alloc_hdr.node */
+	uint32_t hval;
+	uint32_t capa;
+	char k[];
+};
+
+/*
+ * I hate typedefs, especially when they're hiding the fact that there's
+ * a pointer, but XS needs this, apparently, and it does s/__/::/g
+ */
+typedef struct src_loc * Devel__Mwrap__SrcLoc;
+
+/* every allocation has this in the header, maintain alignment with malloc  */
+struct alloc_hdr {
+	struct cds_list_head anode; /* <=> src_loc.allocs */
+	union {
+		struct {
+			struct src_loc *loc;
+		} live;
+		struct rcu_head dead;
+	} as;
+	void *real; /* what to call real_free on */
+	size_t size;
+};
+
+static __thread char kbuf[
+	PATH_MAX + INT2STR_MAX + sizeof(struct alloc_hdr) + 2
+];
+
+static struct alloc_hdr *ptr2hdr(void *p)
+{
+	return (struct alloc_hdr *)((uintptr_t)p - sizeof(struct alloc_hdr));
+}
+
+static void *hdr2ptr(struct alloc_hdr *h)
+{
+	return (void *)((uintptr_t)h + sizeof(struct alloc_hdr));
+}
+
+static int loc_is_addr(const struct src_loc *l)
+{
+	return l->capa == 0;
+}
+
+static size_t loc_size(const struct src_loc *l)
+{
+	return loc_is_addr(l) ? sizeof(uintptr_t) : l->capa;
+}
+
+static int loc_eq(struct cds_lfht_node *node, const void *key)
+{
+	const struct src_loc *existing;
+	const struct src_loc *k = key;
+
+	existing = caa_container_of(node, struct src_loc, hnode);
+
+	return (k->hval == existing->hval &&
+		k->capa == existing->capa &&
+		memcmp(k->k, existing->k, loc_size(k)) == 0);
+}
+
+static struct src_loc *totals_add_rcu(struct src_loc *k)
+{
+	struct cds_lfht_iter iter;
+	struct cds_lfht_node *cur;
+	struct src_loc *l = 0;
+	struct cds_lfht *t;
+
+again:
+	t = rcu_dereference(totals);
+	if (!t) goto out_unlock;
+	cds_lfht_lookup(t, k->hval, loc_eq, k, &iter);
+	cur = cds_lfht_iter_get_node(&iter);
+	if (cur) {
+		l = caa_container_of(cur, struct src_loc, hnode);
+		uatomic_add(&l->total, k->total);
+		uatomic_add(&l->allocations, 1);
+	} else {
+		size_t n = loc_size(k);
+		l = real_malloc(sizeof(*l) + n);
+		if (!l) goto out_unlock;
+		memcpy(l, k, sizeof(*l) + n);
+		l->mtx = mutex_assign();
+		l->frees = 0;
+		l->allocations = 1;
+		CDS_INIT_LIST_HEAD(&l->allocs);
+		cur = cds_lfht_add_unique(t, k->hval, loc_eq, l, &l->hnode);
+		if (cur != &l->hnode) { /* lost race */
+			rcu_read_unlock();
+			real_free(l);
+			rcu_read_lock();
+			goto again;
+		}
+	}
+out_unlock:
+	return l;
+}
+
+static void update_stats_rcu_unlock(const struct src_loc *l)
+{
+	if (caa_likely(l)) rcu_read_unlock();
+}
+
+static struct src_loc *update_stats_rcu_lock(size_t size, uintptr_t caller)
+{
+	const PERL_CONTEXT *cx = NULL;
+	static const size_t xlen = sizeof(caller);
+	struct src_loc *k, *ret = 0;
+	char *dst;
+
+	if (caa_unlikely(!totals)) return 0;
+	if (locating++) goto out; /* do not recurse into another *alloc */
+
+	uatomic_add(&total_bytes_inc, size);
+
+	rcu_read_lock();
+	cx = caller_cx(0, NULL);
+	if (cx) {
+		const char *ptr = OutCopFILE(cx->blk_oldcop);
+		const COP *lcop;
+		unsigned line;
+		size_t len;
+		size_t int_size = INT2STR_MAX;
+
+		if (!ptr) goto unknown;
+
+		lcop = Perl_closest_cop(aTHX_ cx->blk_oldcop,
+					OpSIBLING(cx->blk_oldcop),
+					cx->blk_sub.retop, TRUE);
+		if (!lcop)
+			lcop = cx->blk_oldcop;
+		line = CopLINE(lcop);
+
+		/* avoid vsnprintf or anything which could call malloc here: */
+		len = strlen(ptr);
+		if (len > PATH_MAX)
+			len = PATH_MAX;
+		k = (void *)kbuf;
+		k->total = size;
+		dst = mempcpy(k->k, ptr, len);
+		*dst++ = ':';
+
+		if (line == UINT_MAX) /* no line number */
+			*dst++ = '-';
+		else
+			dst = int2str(line, dst, &int_size);
+
+		assert(dst && "bad math");
+		*dst = 0;	/* terminate string */
+		k->capa = (uint32_t)(dst - k->k + 1);
+		k->hval = jhash(k->k, k->capa, 0xdeadbeef);
+		ret = totals_add_rcu(k);
+	} else {
+unknown:
+		k = alloca(sizeof(*k) + xlen);
+		k->total = size;
+		memcpy(k->k, &caller, xlen);
+		k->capa = 0;
+		k->hval = jhash(k->k, xlen, 0xdeadbeef);
+		ret = totals_add_rcu(k);
+	}
+out:
+	--locating;
+	return ret;
+}
+
+size_t malloc_usable_size(void *p)
+{
+	return ptr2hdr(p)->size;
+}
+
+static void
+free_hdr_rcu(struct rcu_head *dead)
+{
+	struct alloc_hdr *h = caa_container_of(dead, struct alloc_hdr, as.dead);
+	real_free(h->real);
+}
+
+void free(void *p)
+{
+	if (p) {
+		struct alloc_hdr *h = ptr2hdr(p);
+		struct src_loc *l = h->as.live.loc;
+
+		if (!real_free) return; /* oh well, leak a little */
+		if (l) {
+			uatomic_add(&total_bytes_dec, h->size);
+			uatomic_set(&h->size, 0);
+			uatomic_add(&l->frees, 1);
+
+			mutex_lock(l->mtx);
+			cds_list_del_rcu(&h->anode);
+			mutex_unlock(l->mtx);
+
+			call_rcu(&h->as.dead, free_hdr_rcu);
+		} else {
+			real_free(h->real);
+		}
+	}
+}
+
+static void
+alloc_insert_rcu(struct src_loc *l, struct alloc_hdr *h, size_t size, void *real)
+{
+	/* we need src_loc to remain alive for the duration of this call */
+	if (!h) return;
+	h->size = size;
+	h->real = real;
+	h->as.live.loc = l;
+	if (l) {
+		mutex_lock(l->mtx);
+		cds_list_add_rcu(&h->anode, &l->allocs);
+		mutex_unlock(l->mtx);
+	}
+}
+
+static size_t size_align(size_t size, size_t alignment)
+{
+	return ((size + (alignment - 1)) & ~(alignment - 1));
+}
+
+static bool ptr_is_aligned(const void *ptr, size_t alignment)
+{
+	return ((uintptr_t)ptr & (alignment - 1)) == 0;
+}
+
+static void *ptr_align(void *ptr, size_t alignment)
+{
+	return (void *)(((uintptr_t)ptr + (alignment - 1)) & ~(alignment - 1));
+}
+
+static bool is_power_of_two(size_t n) { return (n & (n - 1)) == 0; }
+
+static int
+internal_memalign(void **pp, size_t alignment, size_t size, uintptr_t caller)
+{
+	struct src_loc *l;
+	struct alloc_hdr *h;
+	void *real;
+	size_t asize;
+	size_t d = alignment / sizeof(void*);
+	size_t r = alignment % sizeof(void*);
+
+	if (!real_malloc) return ENOMEM;
+
+	if (r != 0 || d == 0 || !is_power_of_two(d))
+		return EINVAL;
+
+	if (alignment <= ASSUMED_MALLOC_ALIGNMENT) {
+		void *p = malloc(size);
+		if (!p) return ENOMEM;
+		*pp = p;
+		return 0;
+	}
+	for (; alignment < sizeof(struct alloc_hdr); alignment *= 2)
+		; /* double alignment until >= sizeof(struct alloc_hdr) */
+	if (__builtin_add_overflow(size, alignment, &asize) ||
+	    __builtin_add_overflow(asize, sizeof(struct alloc_hdr), &asize))
+		return ENOMEM;
+
+	l = update_stats_rcu_lock(size, caller);
+
+	real = real_malloc(asize);
+	if (real) {
+		void *p = hdr2ptr(real);
+		if (!ptr_is_aligned(p, alignment))
+			p = ptr_align(p, alignment);
+		h = ptr2hdr(p);
+		alloc_insert_rcu(l, h, size, real);
+		update_stats_rcu_unlock(l);
+		*pp = p;
+	}
+
+	return real ? 0 : ENOMEM;
+}
+
+static void *
+memalign_result(int err, void *p)
+{
+	if (caa_unlikely(err)) {
+		errno = err;
+		return 0;
+	}
+	return p;
+}
+
+void *memalign(size_t alignment, size_t size)
+{
+	void *p;
+	int err = internal_memalign(&p, alignment, size, RETURN_ADDRESS(0));
+	return memalign_result(err, p);
+}
+
+int posix_memalign(void **p, size_t alignment, size_t size)
+{
+	return internal_memalign(p, alignment, size, RETURN_ADDRESS(0));
+}
+
+void *aligned_alloc(size_t, size_t) __attribute__((alias("memalign")));
+void cfree(void *) __attribute__((alias("free")));
+
+void *valloc(size_t size)
+{
+	void *p;
+	int err = internal_memalign(&p, page_size, size, RETURN_ADDRESS(0));
+	return memalign_result(err, p);
+}
+
+#if __GNUC__ < 7
+#  define add_overflow_p(a,b) __extension__({ \
+		__typeof__(a) _c; \
+		__builtin_add_overflow(a,b,&_c); \
+	})
+#else
+#  define add_overflow_p(a,b) \
+		__builtin_add_overflow_p((a),(b),(__typeof__(a+b))0)
+#endif
+
+void *pvalloc(size_t size)
+{
+	size_t alignment = page_size;
+	void *p;
+	int err;
+
+	if (add_overflow_p(size, alignment)) {
+		errno = ENOMEM;
+		return 0;
+	}
+	size = size_align(size, alignment);
+	err = internal_memalign(&p, alignment, size, RETURN_ADDRESS(0));
+	return memalign_result(err, p);
+}
+
+void *malloc(size_t size)
+{
+	struct src_loc *l;
+	struct alloc_hdr *h;
+	size_t asize;
+	void *p;
+
+	if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize))
+		goto enomem;
+
+	/*
+	 * Needed for C++ global declarations using "new",
+	 * which happens before our constructor
+	 */
+#ifndef __FreeBSD__
+	if (!real_malloc) {
+		if (resolving_malloc) goto enomem;
+		resolving_malloc = 1;
+		real_malloc = dlsym(RTLD_NEXT, "malloc");
+	}
+#endif
+	l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
+	p = h = real_malloc(asize);
+	if (h) {
+		alloc_insert_rcu(l, h, size, h);
+		p = hdr2ptr(h);
+	}
+	update_stats_rcu_unlock(l);
+	if (caa_unlikely(!p)) errno = ENOMEM;
+	return p;
+enomem:
+	errno = ENOMEM;
+	return 0;
+}
+
+void *calloc(size_t nmemb, size_t size)
+{
+	void *p;
+	struct src_loc *l;
+	struct alloc_hdr *h;
+	size_t asize;
+
+	if (__builtin_mul_overflow(size, nmemb, &size)) {
+		errno = ENOMEM;
+		return 0;
+	}
+	if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
+		errno = ENOMEM;
+		return 0;
+	}
+	RETURN_IF_NOT_READY();
+	l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
+	p = h = real_malloc(asize);
+	if (p) {
+		alloc_insert_rcu(l, h, size, h);
+		p = hdr2ptr(h);
+		memset(p, 0, size);
+	}
+	update_stats_rcu_unlock(l);
+	if (caa_unlikely(!p)) errno = ENOMEM;
+	return p;
+}
+
+void *realloc(void *ptr, size_t size)
+{
+	void *p;
+	struct src_loc *l;
+	struct alloc_hdr *h;
+	size_t asize;
+
+	if (!size) {
+		free(ptr);
+		return 0;
+	}
+	if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
+		errno = ENOMEM;
+		return 0;
+	}
+	RETURN_IF_NOT_READY();
+
+	l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
+	p = h = real_malloc(asize);
+	if (p) {
+		alloc_insert_rcu(l, h, size, h);
+		p = hdr2ptr(h);
+	}
+	update_stats_rcu_unlock(l);
+
+	if (ptr && p) {
+		struct alloc_hdr *old = ptr2hdr(ptr);
+		memcpy(p, ptr, old->size < size ? old->size : size);
+		free(ptr);
+	}
+	if (caa_unlikely(!p)) errno = ENOMEM;
+	return p;
+}
+
+struct dump_arg {
+	FILE *fp;
+	size_t min;
+};
+
+static void *dump_to_file(struct dump_arg *a)
+{
+	struct cds_lfht_iter iter;
+	struct src_loc *l;
+	struct cds_lfht *t;
+
+	++locating;
+	rcu_read_lock();
+	t = rcu_dereference(totals);
+	if (!t)
+		goto out_unlock;
+	cds_lfht_for_each_entry(t, &iter, l, hnode) {
+		const void *p = l->k;
+		char **s = 0;
+		if (l->total <= a->min) continue;
+
+		if (loc_is_addr(l)) {
+			s = backtrace_symbols(p, 1);
+			p = s[0];
+		}
+		fprintf(a->fp, "%16zu %12zu %s\n",
+			l->total, l->allocations, (const char *)p);
+		if (s) free(s);
+	}
+out_unlock:
+	rcu_read_unlock();
+	--locating;
+	return 0;
+}
+
+static SV *location_string(struct src_loc *l)
+{
+	SV *ret;
+
+	if (loc_is_addr(l)) {
+		char **s = backtrace_symbols((void *)l->k, 1);
+
+		ret = newSVpvn(s[0], strlen(s[0]));
+	}
+	else {
+		ret = newSVpvn(l->k, l->capa - 1);
+	}
+
+	return ret;
+}
+
+static int
+extract_addr(const char *str, size_t len, void **p)
+{
+	const char *c;
+#if defined(__GLIBC__)
+	return ((c = memrchr(str, '[', len)) && sscanf(c, "[%p]", p));
+#else /* TODO: test FreeBSD */
+	return ((c = strstr(str, "0x")) && sscanf(c, "%p", p));
+#endif
+}
+
+#ifndef O_CLOEXEC
+#  define O_CLOEXEC 0
+#endif
+__attribute__ ((destructor))
+static void dump_destructor(void)
+{
+	const char *opt = getenv("MWRAP");
+	const char *modes[] = { "a", "a+", "w", "w+", "r+" };
+	struct dump_arg a = { .min = 0 };
+	size_t i;
+	int dump_fd;
+	char *dump_path;
+	char *s;
+
+	if (!opt)
+		return;
+
+	++locating;
+	if ((dump_path = strstr(opt, "dump_path:")) &&
+			(dump_path += sizeof("dump_path")) &&
+			*dump_path) {
+		char *end = strchr(dump_path, ',');
+		if (end) {
+			char *tmp = alloca(end - dump_path + 1);
+			end = mempcpy(tmp, dump_path, end - dump_path);
+			*end = 0;
+			dump_path = tmp;
+		}
+		dump_fd = open(dump_path, O_CLOEXEC|O_WRONLY|O_APPEND|O_CREAT,
+				0666);
+		if (dump_fd < 0) {
+			fprintf(stderr, "open %s failed: %s\n", dump_path,
+				strerror(errno));
+			goto out;
+		}
+	}
+	else if (!sscanf(opt, "dump_fd:%d", &dump_fd))
+		goto out;
+
+	if ((s = strstr(opt, "dump_min:")))
+		sscanf(s, "dump_min:%zu", &a.min);
+
+	switch (dump_fd) {
+	case 0: goto out;
+	case 1: a.fp = stdout; break;
+	case 2: a.fp = stderr; break;
+	default:
+		if (dump_fd < 0)
+			goto out;
+		a.fp = 0;
+
+		for (i = 0; !a.fp && i < 5; i++)
+			a.fp = fdopen(dump_fd, modes[i]);
+
+		if (!a.fp) {
+			fprintf(stderr, "failed to open fd=%d: %s\n",
+				dump_fd, strerror(errno));
+			goto out;
+		}
+		/* we'll leak some memory here, but this is a destructor */
+	}
+	dump_to_file(&a);
+out:
+	--locating;
+}
+
+MODULE = Devel::Mwrap	PACKAGE = Devel::Mwrap	PREFIX = mwrap_
+
+BOOT:
+	totals = lfht_new();
+	if (!totals)
+		fprintf(stderr, "failed to allocate totals table\n");
+
+PROTOTYPES: ENABLE
+
+size_t
+mwrap_total_bytes_allocated()
+CODE:
+	RETVAL = total_bytes_inc;
+OUTPUT:
+	RETVAL
+
+size_t
+mwrap_total_bytes_freed()
+CODE:
+	RETVAL = total_bytes_dec;
+OUTPUT:
+	RETVAL
+
+void
+mwrap_reset()
+PREINIT:
+	struct cds_lfht *t;
+	struct cds_lfht_iter iter;
+	struct src_loc *l;
+CODE:
+	uatomic_set(&total_bytes_inc, 0);
+	uatomic_set(&total_bytes_dec, 0);
+
+	rcu_read_lock();
+	t = rcu_dereference(totals);
+	cds_lfht_for_each_entry(t, &iter, l, hnode) {
+		uatomic_set(&l->total, 0);
+		uatomic_set(&l->allocations, 0);
+		uatomic_set(&l->frees, 0);
+	}
+	rcu_read_unlock();
+
+Devel::Mwrap::SrcLoc
+mwrap_get(loc)
+	SV *loc;
+PREINIT:
+	STRLEN len;
+	const char *str;
+	struct src_loc *k = 0;
+	uintptr_t p;
+	struct cds_lfht_iter iter;
+	struct cds_lfht_node *cur;
+	struct cds_lfht *t;
+	struct src_loc *l = NULL;
+	++locating;
+CODE:
+	if (!SvPOK(loc))
+		XSRETURN_UNDEF;
+	str = SvPV(loc, len);
+	if (len > PATH_MAX)
+		XSRETURN_UNDEF;
+	if (extract_addr(str, len, (void **)&p)) {
+		k = (void *)kbuf;
+		memcpy(k->k, &p, sizeof(p));
+		k->capa = 0;
+		k->hval = jhash(k->k, sizeof(p), 0xdeadbeef);
+	} else {
+		k = (void *)kbuf;
+		memcpy(k->k, str, len + 1);
+		k->capa = len + 1;
+		k->hval = jhash(k->k, k->capa, 0xdeadbeef);
+	}
+
+	if (!k)
+		XSRETURN_UNDEF;
+
+	rcu_read_lock();
+	t = rcu_dereference(totals);
+	if (!t) goto out_unlock;
+
+	cds_lfht_lookup(t, k->hval, loc_eq, k, &iter);
+	cur = cds_lfht_iter_get_node(&iter);
+	if (cur)
+		l = caa_container_of(cur, struct src_loc, hnode);
+out_unlock:
+	rcu_read_unlock();
+	RETVAL = l;
+OUTPUT:
+	RETVAL
+CLEANUP:
+	--locating;
+
+MODULE = Devel::Mwrap	PACKAGE = Devel::Mwrap::SrcLoc	PREFIX = src_loc_
+
+PROTOTYPES: ENABLE
+
+size_t
+src_loc_frees(self)
+	Devel::Mwrap::SrcLoc self
+PREINIT:
+	++locating;
+CODE:
+	RETVAL = uatomic_read(&self->frees);
+OUTPUT:
+	RETVAL
+CLEANUP:
+	--locating;
+
+size_t
+src_loc_allocations(self)
+	Devel::Mwrap::SrcLoc self
+PREINIT:
+	++locating;
+CODE:
+	RETVAL = uatomic_read(&self->allocations);
+OUTPUT:
+	RETVAL
+CLEANUP:
+	--locating;
+
+size_t
+src_loc_total(self)
+	Devel::Mwrap::SrcLoc self
+PREINIT:
+	++locating;
+CODE:
+	RETVAL = uatomic_read(&self->total);
+OUTPUT:
+	RETVAL
+CLEANUP:
+	--locating;
+
+SV *
+src_loc_name(self)
+	Devel::Mwrap::SrcLoc self
+PREINIT:
+	++locating;
+CODE:
+	RETVAL = location_string(self);
+OUTPUT:
+	RETVAL
+CLEANUP:
+	--locating;
diff --git a/README b/README
index 3a20258..97ff4ea 100644
--- a/README
+++ b/README
@@ -1,95 +1,83 @@
-= mwrap - LD_PRELOAD malloc wrapper + line stats for Ruby
+Devel::Mwrap - LD_PRELOAD malloc wrapper + line stats for Perl
 
-mwrap is designed to answer the question:
+Devel::Mwrap is designed to answer the question:
 
-   Which lines of Ruby are hitting malloc the most?
+   Which lines of Perl are hitting malloc the most?
 
-mwrap wraps all malloc-family calls to trace the Ruby source
-location of such calls and bytes allocated at each callsite.
-As of mwrap 2.0.0, it can also function as a leak detector
-and show live allocations at every call site.  Depending on
-your application and workload, the overhead is roughly a 50%
-increase memory and runtime.
+Devel::Mwrap wraps all malloc-family calls to trace the Perl source
+location of such calls and bytes allocated at each callsite.  It
+can also function as a leak detector and show live allocations
+at every call site.  Depending on your application and workload,
+the overhead is roughly a 50%-100% increase in memory and runtime.
 
-It works best for allocations under GVL, but tries to track
-numeric caller addresses for allocations made without GVL so you
-can get an idea of how much memory usage certain extensions and
-native libraries use.
+It is thread-safe and requires the concurrent lock-free hash table
+from the Userspace RCU project: https://liburcu.org/
 
-It requires the concurrent lock-free hash table from the
-Userspace RCU project: https://liburcu.org/
+It relies on dynamic linking to a malloc(3) implementation.  If
+you got Perl from your OS distribution, this typically does not
+require rebuilding Perl.
 
-It does not require recompiling or rebuilding Ruby, but only
-supports Ruby trunk (2.6.0dev+) on a few platforms:
+Tested on the perl package distributed with:
 
-* GNU/Linux
-* FreeBSD (tested 11.1)
+* Debian GNU/Linux 9, 10
 
-It may work on NetBSD, OpenBSD and DragonFly BSD.
+It may work on FreeBSD, NetBSD, OpenBSD and DragonFly BSD.
 
 == Install
 
-	# FreeBSD: pkg install liburcu
+	# FreeBSD: pkg install pkg-config liburcu
 
-	# Debian-based systems: apt-get liburcu-dev
-
-	# Install mwrap via RubyGems.org
-	gem install mwrap
+	# Debian-based systems: apt-get install pkg-config liburcu-dev
 
 == Usage
 
-mwrap works as an LD_PRELOAD and supplies a mwrap RubyGem executable to
+Devel::Mwrap works as an LD_PRELOAD and supplies a mwrap-perl script to
 improve ease-of-use.  You can set dump_path: in the MWRAP environment
 variable to append the results to a log file:
 
-	MWRAP=dump_path:/path/to/log mwrap RUBY_COMMAND
+	MWRAP=dump_path:/path/to/log mwrap-perl PERL_COMMAND
 
 	# And to display the locations with the most allocations:
 	sort -k1,1rn </path/to/log | $PAGER
 
-You may also `require "mwrap"' in your Ruby code and use
-Mwrap.dump, Mwrap.reset, Mwrap.each, etc.
+You may also `use Devel::Mwrap' in your Perl code and use
+Devel::Mwrap->dump, Devel::Mwrap->reset, Devel::Mwrap->each, etc.
 
-However, mwrap MUST be loaded via LD_PRELOAD to have any
+However, Devel::Mwrap MUST be loaded via LD_PRELOAD to have any
 effect in tracking malloc use.  However, it is safe to keep
-"require 'mwrap'" in performance-critical deployments,
+"use Devel::Mwrap" in performance-critical deployments,
 as overhead is only incurred when used as an LD_PRELOAD.
 
-The output of the mwrap dump is a text file with 3 columns:
+The output of Devel::Mwrap->dump is a text file with 3 columns:
 
 	total_bytes	call_count	location
 
-Where location is a Ruby source location (if made under GVL)
-or an address retrieved by backtrace_symbols(3).  It is
-recommended to use the sort(1) command on either of the
-first two columns to find the hottest malloc locations.
-
-mwrap 2.0.0+ also supports a Rack application endpoint,
-it is documented at:
-
-	https://80x24.org/mwrap/MwrapRack.html
+Where location is a Perl source location or an address retrieved
+by backtrace_symbols(3).  It is recommended to use the sort(1)
+command on either of the first two columns to find the hottest
+malloc locations.
 
 == Known problems
 
 * 32-bit machines are prone to overflow (WONTFIX)
 
-== Mail archives and list:
+== Mail archives and newsgroup:
 
-	https://80x24.org/mwrap-public/
-	nntp://80x24.org/inbox.comp.lang.ruby.mwrap
+	https://80x24.org/mwrap-perl/
+	nntp://80x24.org/inbox.comp.lang.perl.mwrap
 
 No subscription will ever be required to post, but HTML mail
 will be rejected:
 
-		mwrap-public@80x24.org
+		mwrap-perl@80x24.org
 
 == Hacking
 
-	git clone https://80x24.org/mwrap.git
+	git clone https://80x24.org/mwrap-perl.git
 
-Send all patches and pull requests (use "git request-pull" to format) to
-the mailing list.  We do not use centralized or proprietary messaging
-systems.
+Send all patches and pull requests (use "git request-pull" to format)
+via email to mwrap-perl@80x24.org.  We do not and will not use
+proprietary messaging systems.
 
 == License
 
diff --git a/Rakefile b/Rakefile
deleted file mode 100644
index 50bfa89..0000000
--- a/Rakefile
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org>
-# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
-require 'rake/testtask'
-begin
-  require 'rake/extensiontask'
-  Rake::ExtensionTask.new('mwrap')
-rescue LoadError
-  warn 'rake-compiler not available, cross compiling disabled'
-end
-
-Rake::TestTask.new(:test)
-task :test => :compile
-task :default => :compile
-
-c_files = File.readlines('MANIFEST').grep(%r{ext/.*\.[ch]$}).map!(&:chomp!)
-task 'compile:mwrap' => c_files
diff --git a/bin/mwrap b/bin/mwrap
deleted file mode 100755
index 9f67dab..0000000
--- a/bin/mwrap
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/ruby
-# frozen_string_literal: true
-# Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org>
-# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
-require 'mwrap'
-mwrap_so = $".grep(%r{/mwrap\.so\z})[0] or abort "mwrap.so not loaded"
-cur = ENV['LD_PRELOAD']
-if cur
-  cur = cur.split(/[:\s]+/)
-  if !cur.include?(mwrap_so)
-    # drop old versions
-    cur.delete_if { |path| path.end_with?('/mwrap.so') }
-    cur.unshift(mwrap_so)
-    ENV['LD_PRELOAD'] = cur.join(':')
-  end
-else
-  ENV['LD_PRELOAD'] = mwrap_so
-end
-
-# work around close-on-exec by default behavior in Ruby:
-opts = {}
-if ENV['MWRAP'] =~ /dump_fd:(\d+)/
-  dump_fd = $1.to_i
-  if dump_fd > 2
-    dump_io = IO.new(dump_fd)
-    opts[dump_fd] = dump_io
-  end
-end
-
-# allow inheriting FDs from systemd
-n = ENV['LISTEN_FDS']
-if n && ENV['LISTEN_PID'].to_i == $$
-  n = 3 + n.to_i
-  (3...n).each { |fd| opts[fd] = IO.new(fd) }
-end
-exec *ARGV, opts
diff --git a/ext/mwrap/extconf.rb b/ext/mwrap/extconf.rb
deleted file mode 100644
index e9dbb1e..0000000
--- a/ext/mwrap/extconf.rb
+++ /dev/null
@@ -1,28 +0,0 @@
-# frozen_string_literal: true
-# Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org>
-# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
-require 'mkmf'
-
-have_func 'mempcpy'
-have_library 'urcu-cds' or abort 'userspace RCU not installed'
-have_header 'urcu/rculfhash.h' or abort 'rculfhash.h not found'
-have_library 'urcu-bp' or abort 'liburcu-bp not found'
-have_library 'dl'
-have_library 'c'
-have_library 'execinfo' # FreeBSD
-
-if try_link(<<'')
-int main(void) { return __builtin_add_overflow_p(0,0,(int)1); }
-
-  $defs << '-DHAVE_BUILTIN_ADD_OVERFLOW_P'
-end
-
-if try_link(<<'')
-int main(int a) { return __builtin_add_overflow(0,0,&a); }
-
-  $defs << '-DHAVE_BUILTIN_ADD_OVERFLOW_P'
-else
-  abort 'missing __builtin_add_overflow'
-end
-
-create_makefile 'mwrap'
diff --git a/ext/mwrap/mwrap.c b/ext/mwrap/mwrap.c
deleted file mode 100644
index 5174127..0000000
--- a/ext/mwrap/mwrap.c
+++ /dev/null
@@ -1,1464 +0,0 @@
-/*
- * Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org>
- * License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
- */
-#define _LGPL_SOURCE /* allows URCU to inline some stuff */
-#include <ruby/ruby.h>
-#include <ruby/thread.h>
-#include <ruby/io.h>
-#include <execinfo.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <dlfcn.h>
-#include <assert.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <pthread.h>
-#include <urcu-bp.h>
-#include <urcu/rculfhash.h>
-#include <urcu/rculist.h>
-#include "jhash.h"
-
-static ID id_uminus;
-const char *rb_source_location_cstr(int *line); /* requires 2.6.0dev */
-extern int __attribute__((weak)) ruby_thread_has_gvl_p(void);
-extern void * __attribute__((weak)) ruby_current_execution_context_ptr;
-extern void * __attribute__((weak)) ruby_current_vm_ptr; /* for rb_gc_count */
-extern size_t __attribute__((weak)) rb_gc_count(void);
-extern VALUE __attribute__((weak)) rb_cObject;
-extern VALUE __attribute__((weak)) rb_eTypeError;
-extern VALUE __attribute__((weak)) rb_yield(VALUE);
-
-static size_t total_bytes_inc, total_bytes_dec;
-
-/* true for glibc/dlmalloc/ptmalloc, not sure about jemalloc */
-#define ASSUMED_MALLOC_ALIGNMENT (sizeof(void *) * 2)
-
-/* match values in Ruby gc.c */
-#define HEAP_PAGE_ALIGN_LOG 14
-enum {
-	HEAP_PAGE_ALIGN = (1UL << HEAP_PAGE_ALIGN_LOG),
-	REQUIRED_SIZE_BY_MALLOC = (sizeof(size_t) * 5),
-	HEAP_PAGE_SIZE = (HEAP_PAGE_ALIGN - REQUIRED_SIZE_BY_MALLOC)
-};
-
-#define IS_HEAP_PAGE_BODY ((struct src_loc *)-1)
-
-int __attribute__((weak)) ruby_thread_has_gvl_p(void)
-{
-	return 0;
-}
-
-#ifdef __FreeBSD__
-void *__malloc(size_t);
-void __free(void *);
-#  define real_malloc __malloc
-#  define real_free __free
-#else
-static void *(*real_malloc)(size_t);
-static void (*real_free)(void *);
-static int resolving_malloc;
-#endif /* !FreeBSD */
-
-/*
- * we need to fake an OOM condition while dlsym is running,
- * as that calls calloc under glibc, but we don't have the
- * symbol for the jemalloc calloc, yet
- */
-#  define RETURN_IF_NOT_READY() do { \
-	if (!real_malloc) { \
-		errno = ENOMEM; \
-		return NULL; \
-	} \
-} while (0)
-
-static __thread size_t locating;
-static size_t generation;
-static size_t page_size;
-static struct cds_lfht *totals;
-union padded_mutex {
-	pthread_mutex_t mtx;
-	char pad[64];
-};
-
-/* a round-robin pool of mutexes */
-#define MUTEX_NR   (1 << 6)
-#define MUTEX_MASK (MUTEX_NR - 1)
-static size_t mutex_i;
-static union padded_mutex mutexes[MUTEX_NR] = {
-	[0 ... (MUTEX_NR-1)].mtx = PTHREAD_MUTEX_INITIALIZER
-};
-
-static pthread_mutex_t *mutex_assign(void)
-{
-	return &mutexes[uatomic_add_return(&mutex_i, 1) & MUTEX_MASK].mtx;
-}
-
-static struct cds_lfht *
-lfht_new(void)
-{
-	return cds_lfht_new(16384, 1, 0, CDS_LFHT_AUTO_RESIZE, 0);
-}
-
-__attribute__((constructor)) static void resolve_malloc(void)
-{
-	int err;
-	++locating;
-
-#ifdef __FreeBSD__
-	/*
-	 * PTHREAD_MUTEX_INITIALIZER on FreeBSD means lazy initialization,
-	 * which happens at pthread_mutex_lock, and that calls calloc
-	 */
-	{
-		size_t i;
-
-		for (i = 0; i < MUTEX_NR; i++) {
-			err = pthread_mutex_init(&mutexes[i].mtx, 0);
-			if (err) {
-				fprintf(stderr, "error: %s\n", strerror(err));
-				_exit(1);
-			}
-		}
-		/* initialize mutexes used by urcu-bp */
-		rcu_read_lock();
-		rcu_read_unlock();
-	}
-#else /* !FreeBSD (tested on GNU/Linux) */
-	if (!real_malloc) {
-		resolving_malloc = 1;
-		real_malloc = dlsym(RTLD_NEXT, "malloc");
-	}
-	real_free = dlsym(RTLD_NEXT, "free");
-	if (!real_malloc || !real_free) {
-		fprintf(stderr, "missing malloc/aligned_alloc/free\n"
-			"\t%p %p\n", real_malloc, real_free);
-		_exit(1);
-	}
-#endif /* !FreeBSD */
-	totals = lfht_new();
-	if (!totals)
-		fprintf(stderr, "failed to allocate totals table\n");
-
-	err = pthread_atfork(call_rcu_before_fork,
-				call_rcu_after_fork_parent,
-				call_rcu_after_fork_child);
-	if (err)
-		fprintf(stderr, "pthread_atfork failed: %s\n", strerror(err));
-	page_size = sysconf(_SC_PAGESIZE);
-	--locating;
-}
-
-static void
-mutex_lock(pthread_mutex_t *m)
-{
-	int err = pthread_mutex_lock(m);
-	assert(err == 0);
-}
-
-static void
-mutex_unlock(pthread_mutex_t *m)
-{
-	int err = pthread_mutex_unlock(m);
-	assert(err == 0);
-}
-
-#ifndef HAVE_MEMPCPY
-static void *
-my_mempcpy(void *dest, const void *src, size_t n)
-{
-	return (char *)memcpy(dest, src, n) + n;
-}
-#define mempcpy(dst,src,n) my_mempcpy(dst,src,n)
-#endif
-
-/* stolen from glibc: */
-#define RETURN_ADDRESS(nr) \
-  (uintptr_t)(__builtin_extract_return_addr(__builtin_return_address(nr)))
-
-#define INT2STR_MAX (sizeof(int) == 4 ? 10 : 19)
-static char *int2str(int num, char *dst, size_t * size)
-{
-	if (num <= 9) {
-		*size -= 1;
-		*dst++ = (char)(num + '0');
-		return dst;
-	} else {
-		char buf[INT2STR_MAX];
-		char *end = buf + sizeof(buf);
-		char *p = end;
-		size_t adj;
-
-		do {
-			*size -= 1;
-			*--p = (char)((num % 10) + '0');
-			num /= 10;
-		} while (num && *size);
-
-		if (!num) {
-			adj = end - p;
-			return mempcpy(dst, p, adj);
-		}
-	}
-	return NULL;
-}
-
-/*
- * rb_source_location_cstr relies on GET_EC(), and it's possible
- * to have a native thread but no EC during the early and late
- * (teardown) phases of the Ruby process
- */
-static int has_ec_p(void)
-{
-	return (ruby_thread_has_gvl_p() && ruby_current_vm_ptr &&
-		ruby_current_execution_context_ptr);
-}
-
-struct acc {
-	uint64_t nr;
-	int64_t min;
-	int64_t max;
-	double m2;
-	double mean;
-};
-
-#define ACC_INIT(name) { .nr=0, .min=INT64_MAX, .max=-1, .m2=0, .mean=0 }
-
-/* for tracking 16K-aligned heap page bodies (protected by GVL) */
-struct {
-	pthread_mutex_t lock;
-	struct cds_list_head bodies;
-	struct cds_list_head freed;
-
-	struct acc alive;
-	struct acc reborn;
-} hpb_stats = {
-	.lock = PTHREAD_MUTEX_INITIALIZER,
-	.bodies = CDS_LIST_HEAD_INIT(hpb_stats.bodies),
-	.freed = CDS_LIST_HEAD_INIT(hpb_stats.freed),
-	.alive = ACC_INIT(hpb_stats.alive),
-	.reborn = ACC_INIT(hpb_stats.reborn)
-};
-
-/* allocated via real_malloc/real_free */
-struct src_loc {
-	pthread_mutex_t *mtx;
-	size_t total;
-	size_t allocations;
-	size_t frees;
-	size_t age_total; /* (age_total / frees) => mean age at free */
-	size_t max_lifespan;
-	struct cds_lfht_node hnode;
-	struct cds_list_head allocs; /* <=> alloc_hdr.node */
-	uint32_t hval;
-	uint32_t capa;
-	char k[];
-};
-
-/* every allocation has this in the header, maintain alignment with malloc  */
-struct alloc_hdr {
-	struct cds_list_head anode; /* <=> src_loc.allocs */
-	union {
-		struct {
-			size_t gen; /* rb_gc_count() */
-			struct src_loc *loc;
-		} live;
-		struct rcu_head dead;
-		struct {
-			size_t at; /* rb_gc_count() */
-		} hpb_freed;
-	} as;
-	void *real; /* what to call real_free on */
-	size_t size;
-};
-
-static char kbuf[PATH_MAX + INT2STR_MAX + sizeof(struct alloc_hdr) + 2];
-
-static struct alloc_hdr *ptr2hdr(void *p)
-{
-	return (struct alloc_hdr *)((uintptr_t)p - sizeof(struct alloc_hdr));
-}
-
-static void *hdr2ptr(struct alloc_hdr *h)
-{
-	return (void *)((uintptr_t)h + sizeof(struct alloc_hdr));
-}
-
-static int loc_is_addr(const struct src_loc *l)
-{
-	return l->capa == 0;
-}
-
-static size_t loc_size(const struct src_loc *l)
-{
-	return loc_is_addr(l) ? sizeof(uintptr_t) : l->capa;
-}
-
-static int loc_eq(struct cds_lfht_node *node, const void *key)
-{
-	const struct src_loc *existing;
-	const struct src_loc *k = key;
-
-	existing = caa_container_of(node, struct src_loc, hnode);
-
-	return (k->hval == existing->hval &&
-		k->capa == existing->capa &&
-		memcmp(k->k, existing->k, loc_size(k)) == 0);
-}
-
-/* note: not atomic */
-static void
-acc_add(struct acc *acc, size_t val)
-{
-	double delta = val - acc->mean;
-	uint64_t nr = ++acc->nr;
-
-	/* just don't divide-by-zero if we ever hit this (unlikely :P) */
-	if (nr)
-		acc->mean += delta / nr;
-
-	acc->m2 += delta * (val - acc->mean);
-	if ((int64_t)val < acc->min)
-		acc->min = (int64_t)val;
-	if ((int64_t)val > acc->max)
-		acc->max = (int64_t)val;
-}
-
-#if SIZEOF_LONG == 8
-# define INT64toNUM(x) LONG2NUM((long)x)
-#elif defined(HAVE_LONG_LONG) && SIZEOF_LONG_LONG == 8
-# define INT64toNUM(x) LL2NUM((LONG_LONG)x)
-#endif
-
-static VALUE
-acc_max(const struct acc *acc)
-{
-	return INT64toNUM(acc->max);
-}
-
-static VALUE
-acc_min(const struct acc *acc)
-{
-	return acc->min == INT64_MAX ? INT2FIX(-1) : INT64toNUM(acc->min);
-}
-
-static VALUE
-acc_mean(const struct acc *acc)
-{
-	return DBL2NUM(acc->nr ? acc->mean : HUGE_VAL);
-}
-
-static double
-acc_stddev_dbl(const struct acc *acc)
-{
-	if (acc->nr > 1) {
-		double variance = acc->m2 / (acc->nr - 1);
-		return sqrt(variance);
-	}
-	return 0.0;
-}
-
-static VALUE
-acc_stddev(const struct acc *acc)
-{
-	return DBL2NUM(acc_stddev_dbl(acc));
-}
-
-static struct src_loc *totals_add_rcu(struct src_loc *k)
-{
-	struct cds_lfht_iter iter;
-	struct cds_lfht_node *cur;
-	struct src_loc *l = 0;
-	struct cds_lfht *t;
-
-again:
-	t = rcu_dereference(totals);
-	if (!t) goto out_unlock;
-	cds_lfht_lookup(t, k->hval, loc_eq, k, &iter);
-	cur = cds_lfht_iter_get_node(&iter);
-	if (cur) {
-		l = caa_container_of(cur, struct src_loc, hnode);
-		uatomic_add(&l->total, k->total);
-		uatomic_add(&l->allocations, 1);
-	} else {
-		size_t n = loc_size(k);
-		l = real_malloc(sizeof(*l) + n);
-		if (!l) goto out_unlock;
-		memcpy(l, k, sizeof(*l) + n);
-		l->mtx = mutex_assign();
-		l->age_total = 0;
-		l->max_lifespan = 0;
-		l->frees = 0;
-		l->allocations = 1;
-		CDS_INIT_LIST_HEAD(&l->allocs);
-		cur = cds_lfht_add_unique(t, k->hval, loc_eq, l, &l->hnode);
-		if (cur != &l->hnode) { /* lost race */
-			rcu_read_unlock();
-			real_free(l);
-			rcu_read_lock();
-			goto again;
-		}
-	}
-out_unlock:
-	return l;
-}
-
-static void update_stats_rcu_unlock(const struct src_loc *l)
-{
-	if (caa_likely(l)) rcu_read_unlock();
-}
-
-static struct src_loc *update_stats_rcu_lock(size_t size, uintptr_t caller)
-{
-	struct src_loc *k, *ret = 0;
-	static const size_t xlen = sizeof(caller);
-	char *dst;
-
-	if (caa_unlikely(!totals)) return 0;
-	if (locating++) goto out; /* do not recurse into another *alloc */
-
-	uatomic_add(&total_bytes_inc, size);
-
-	rcu_read_lock();
-	if (has_ec_p()) {
-		int line;
-		const char *ptr = rb_source_location_cstr(&line);
-		size_t len;
-		size_t int_size = INT2STR_MAX;
-
-		generation = rb_gc_count();
-
-		if (!ptr) goto unknown;
-
-		/* avoid vsnprintf or anything which could call malloc here: */
-		len = strlen(ptr);
-		k = (void *)kbuf;
-		k->total = size;
-		dst = mempcpy(k->k, ptr, len);
-		*dst++ = ':';
-		dst = int2str(line, dst, &int_size);
-		if (dst) {
-			*dst = 0;	/* terminate string */
-			k->capa = (uint32_t)(dst - k->k + 1);
-			k->hval = jhash(k->k, k->capa, 0xdeadbeef);
-			ret = totals_add_rcu(k);
-		} else {
-			rb_bug("bad math making key from location %s:%d\n",
-				ptr, line);
-		}
-	} else {
-unknown:
-		k = alloca(sizeof(*k) + xlen);
-		k->total = size;
-		memcpy(k->k, &caller, xlen);
-		k->capa = 0;
-		k->hval = jhash(k->k, xlen, 0xdeadbeef);
-		ret = totals_add_rcu(k);
-	}
-out:
-	--locating;
-	return ret;
-}
-
-size_t malloc_usable_size(void *p)
-{
-	return ptr2hdr(p)->size;
-}
-
-static void
-free_hdr_rcu(struct rcu_head *dead)
-{
-	struct alloc_hdr *h = caa_container_of(dead, struct alloc_hdr, as.dead);
-	real_free(h->real);
-}
-
-void free(void *p)
-{
-	if (p) {
-		struct alloc_hdr *h = ptr2hdr(p);
-		struct src_loc *l = h->as.live.loc;
-
-		if (!real_free) return; /* oh well, leak a little */
-		if (l && l != IS_HEAP_PAGE_BODY) {
-			size_t age = generation - h->as.live.gen;
-
-			uatomic_add(&total_bytes_dec, h->size);
-			uatomic_set(&h->size, 0);
-			uatomic_add(&l->frees, 1);
-			uatomic_add(&l->age_total, age);
-
-			mutex_lock(l->mtx);
-			cds_list_del_rcu(&h->anode);
-			if (age > l->max_lifespan)
-				l->max_lifespan = age;
-			mutex_unlock(l->mtx);
-
-			call_rcu(&h->as.dead, free_hdr_rcu);
-		} else if (l == IS_HEAP_PAGE_BODY) {
-			size_t gen = generation;
-			size_t age = gen - h->as.live.gen;
-
-			h->as.hpb_freed.at = gen;
-
-			mutex_lock(&hpb_stats.lock);
-			acc_add(&hpb_stats.alive, age);
-
-			/* hpb_stats.bodies => hpb_stats.freed */
-			cds_list_move(&h->anode, &hpb_stats.freed);
-
-			mutex_unlock(&hpb_stats.lock);
-		} else {
-			real_free(h->real);
-		}
-	}
-}
-
-static void
-alloc_insert_rcu(struct src_loc *l, struct alloc_hdr *h, size_t size, void *real)
-{
-	/* we need src_loc to remain alive for the duration of this call */
-	if (!h) return;
-	h->size = size;
-	h->real = real;
-	h->as.live.loc = l;
-	h->as.live.gen = generation;
-	if (l) {
-		mutex_lock(l->mtx);
-		cds_list_add_rcu(&h->anode, &l->allocs);
-		mutex_unlock(l->mtx);
-	}
-}
-
-static size_t size_align(size_t size, size_t alignment)
-{
-	return ((size + (alignment - 1)) & ~(alignment - 1));
-}
-
-static bool ptr_is_aligned(const void *ptr, size_t alignment)
-{
-	return ((uintptr_t)ptr & (alignment - 1)) == 0;
-}
-
-static void *ptr_align(void *ptr, size_t alignment)
-{
-	return (void *)(((uintptr_t)ptr + (alignment - 1)) & ~(alignment - 1));
-}
-
-static bool is_power_of_two(size_t n) { return (n & (n - 1)) == 0; }
-
-static int
-internal_memalign(void **pp, size_t alignment, size_t size, uintptr_t caller)
-{
-	struct src_loc *l;
-	struct alloc_hdr *h;
-	void *real;
-	size_t asize;
-	size_t d = alignment / sizeof(void*);
-	size_t r = alignment % sizeof(void*);
-
-	if (!real_malloc) return ENOMEM;
-
-	if (r != 0 || d == 0 || !is_power_of_two(d))
-		return EINVAL;
-
-	if (alignment <= ASSUMED_MALLOC_ALIGNMENT) {
-		void *p = malloc(size);
-		if (!p) return ENOMEM;
-		*pp = p;
-		return 0;
-	}
-	for (; alignment < sizeof(struct alloc_hdr); alignment *= 2)
-		; /* double alignment until >= sizeof(struct alloc_hdr) */
-	if (__builtin_add_overflow(size, alignment, &asize) ||
-	    __builtin_add_overflow(asize, sizeof(struct alloc_hdr), &asize))
-		return ENOMEM;
-
-
-	if (alignment == HEAP_PAGE_ALIGN && size == HEAP_PAGE_SIZE) {
-		if (has_ec_p()) generation = rb_gc_count();
-		l = IS_HEAP_PAGE_BODY;
-	} else {
-		l = update_stats_rcu_lock(size, caller);
-	}
-
-	if (l == IS_HEAP_PAGE_BODY) {
-		void *p;
-		size_t gen = generation;
-
-		mutex_lock(&hpb_stats.lock);
-
-		/* reuse existing entry */
-		if (!cds_list_empty(&hpb_stats.freed)) {
-			size_t deathspan;
-
-			h = cds_list_first_entry(&hpb_stats.freed,
-						 struct alloc_hdr, anode);
-			/* hpb_stats.freed => hpb_stats.bodies */
-			cds_list_move(&h->anode, &hpb_stats.bodies);
-			assert(h->size == size);
-			assert(h->real);
-			real = h->real;
-			p = hdr2ptr(h);
-			assert(ptr_is_aligned(p, alignment));
-
-			deathspan = gen - h->as.hpb_freed.at;
-			acc_add(&hpb_stats.reborn, deathspan);
-		}
-		else {
-			real = real_malloc(asize);
-			if (!real) return ENOMEM;
-
-			p = hdr2ptr(real);
-			if (!ptr_is_aligned(p, alignment))
-				p = ptr_align(p, alignment);
-			h = ptr2hdr(p);
-			h->size = size;
-			h->real = real;
-			cds_list_add(&h->anode, &hpb_stats.bodies);
-		}
-		mutex_unlock(&hpb_stats.lock);
-		h->as.live.loc = l;
-		h->as.live.gen = gen;
-		*pp = p;
-	}
-	else {
-		real = real_malloc(asize);
-		if (real) {
-			void *p = hdr2ptr(real);
-			if (!ptr_is_aligned(p, alignment))
-				p = ptr_align(p, alignment);
-			h = ptr2hdr(p);
-			alloc_insert_rcu(l, h, size, real);
-			update_stats_rcu_unlock(l);
-			*pp = p;
-		}
-	}
-
-	return real ? 0 : ENOMEM;
-}
-
-static void *
-memalign_result(int err, void *p)
-{
-	if (caa_unlikely(err)) {
-		errno = err;
-		return 0;
-	}
-	return p;
-}
-
-void *memalign(size_t alignment, size_t size)
-{
-	void *p;
-	int err = internal_memalign(&p, alignment, size, RETURN_ADDRESS(0));
-	return memalign_result(err, p);
-}
-
-int posix_memalign(void **p, size_t alignment, size_t size)
-{
-	return internal_memalign(p, alignment, size, RETURN_ADDRESS(0));
-}
-
-void *aligned_alloc(size_t, size_t) __attribute__((alias("memalign")));
-void cfree(void *) __attribute__((alias("free")));
-
-void *valloc(size_t size)
-{
-	void *p;
-	int err = internal_memalign(&p, page_size, size, RETURN_ADDRESS(0));
-	return memalign_result(err, p);
-}
-
-#if __GNUC__ < 7
-#  define add_overflow_p(a,b) __extension__({ \
-		__typeof__(a) _c; \
-		__builtin_add_overflow(a,b,&_c); \
-	})
-#else
-#  define add_overflow_p(a,b) \
-		__builtin_add_overflow_p((a),(b),(__typeof__(a+b))0)
-#endif
-
-void *pvalloc(size_t size)
-{
-	size_t alignment = page_size;
-	void *p;
-	int err;
-
-	if (add_overflow_p(size, alignment)) {
-		errno = ENOMEM;
-		return 0;
-	}
-	size = size_align(size, alignment);
-	err = internal_memalign(&p, alignment, size, RETURN_ADDRESS(0));
-	return memalign_result(err, p);
-}
-
-void *malloc(size_t size)
-{
-	struct src_loc *l;
-	struct alloc_hdr *h;
-	size_t asize;
-	void *p;
-
-	if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize))
-		goto enomem;
-
-	/*
-	 * Needed for C++ global declarations using "new",
-	 * which happens before our constructor
-	 */
-#ifndef __FreeBSD__
-	if (!real_malloc) {
-		if (resolving_malloc) goto enomem;
-		resolving_malloc = 1;
-		real_malloc = dlsym(RTLD_NEXT, "malloc");
-	}
-#endif
-	l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
-	p = h = real_malloc(asize);
-	if (h) {
-		alloc_insert_rcu(l, h, size, h);
-		p = hdr2ptr(h);
-	}
-	update_stats_rcu_unlock(l);
-	if (caa_unlikely(!p)) errno = ENOMEM;
-	return p;
-enomem:
-	errno = ENOMEM;
-	return 0;
-}
-
-void *calloc(size_t nmemb, size_t size)
-{
-	void *p;
-	struct src_loc *l;
-	struct alloc_hdr *h;
-	size_t asize;
-
-	if (__builtin_mul_overflow(size, nmemb, &size)) {
-		errno = ENOMEM;
-		return 0;
-	}
-	if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
-		errno = ENOMEM;
-		return 0;
-	}
-	RETURN_IF_NOT_READY();
-	l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
-	p = h = real_malloc(asize);
-	if (p) {
-		alloc_insert_rcu(l, h, size, h);
-		p = hdr2ptr(h);
-		memset(p, 0, size);
-	}
-	update_stats_rcu_unlock(l);
-	if (caa_unlikely(!p)) errno = ENOMEM;
-	return p;
-}
-
-void *realloc(void *ptr, size_t size)
-{
-	void *p;
-	struct src_loc *l;
-	struct alloc_hdr *h;
-	size_t asize;
-
-	if (!size) {
-		free(ptr);
-		return 0;
-	}
-	if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
-		errno = ENOMEM;
-		return 0;
-	}
-	RETURN_IF_NOT_READY();
-
-	l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
-	p = h = real_malloc(asize);
-	if (p) {
-		alloc_insert_rcu(l, h, size, h);
-		p = hdr2ptr(h);
-	}
-	update_stats_rcu_unlock(l);
-
-	if (ptr && p) {
-		struct alloc_hdr *old = ptr2hdr(ptr);
-		memcpy(p, ptr, old->size < size ? old->size : size);
-		free(ptr);
-	}
-	if (caa_unlikely(!p)) errno = ENOMEM;
-	return p;
-}
-
-struct dump_arg {
-	FILE *fp;
-	size_t min;
-};
-
-static void *dump_to_file(void *x)
-{
-	struct dump_arg *a = x;
-	struct cds_lfht_iter iter;
-	struct src_loc *l;
-	struct cds_lfht *t;
-
-	++locating;
-	rcu_read_lock();
-	t = rcu_dereference(totals);
-	if (!t)
-		goto out_unlock;
-	cds_lfht_for_each_entry(t, &iter, l, hnode) {
-		const void *p = l->k;
-		char **s = 0;
-		if (l->total <= a->min) continue;
-
-		if (loc_is_addr(l)) {
-			s = backtrace_symbols(p, 1);
-			p = s[0];
-		}
-		fprintf(a->fp, "%16zu %12zu %s\n",
-			l->total, l->allocations, (const char *)p);
-		if (s) free(s);
-	}
-out_unlock:
-	rcu_read_unlock();
-	--locating;
-	return 0;
-}
-
-/*
- * call-seq:
- *
- *	Mwrap.dump([[io] [, min]] -> nil
- *
- * Dumps the current totals to +io+ which must be an IO object
- * (StringIO and similar are not supported).  Total sizes smaller
- * than or equal to +min+ are skipped.
- *
- * The output is space-delimited by 3 columns:
- *
- * total_size      call_count      location
- */
-static VALUE mwrap_dump(int argc, VALUE * argv, VALUE mod)
-{
-	VALUE io, min;
-	struct dump_arg a;
-	rb_io_t *fptr;
-
-	rb_scan_args(argc, argv, "02", &io, &min);
-
-	if (NIL_P(io))
-		/* library may be linked w/o Ruby */
-		io = *((VALUE *)dlsym(RTLD_DEFAULT, "rb_stderr"));
-
-	a.min = NIL_P(min) ? 0 : NUM2SIZET(min);
-	io = rb_io_get_io(io);
-	io = rb_io_get_write_io(io);
-	GetOpenFile(io, fptr);
-	a.fp = rb_io_stdio_file(fptr);
-
-	rb_thread_call_without_gvl(dump_to_file, &a, 0, 0);
-	RB_GC_GUARD(io);
-	return Qnil;
-}
-
-/* The whole operation is not remotely atomic... */
-static void *totals_reset(void *ign)
-{
-	struct cds_lfht *t;
-	struct cds_lfht_iter iter;
-	struct src_loc *l;
-
-	uatomic_set(&total_bytes_inc, 0);
-	uatomic_set(&total_bytes_dec, 0);
-
-	rcu_read_lock();
-	t = rcu_dereference(totals);
-	cds_lfht_for_each_entry(t, &iter, l, hnode) {
-		uatomic_set(&l->total, 0);
-		uatomic_set(&l->allocations, 0);
-		uatomic_set(&l->frees, 0);
-		uatomic_set(&l->age_total, 0);
-		uatomic_set(&l->max_lifespan, 0);
-	}
-	rcu_read_unlock();
-	return 0;
-}
-
-/*
- * call-seq:
- *
- *	Mwrap.reset -> nil
- *
- * Resets the the total tables by zero-ing all counters.
- * This resets all statistics.  This is not an atomic operation
- * as other threads (outside of GVL) may increment counters.
- */
-static VALUE mwrap_reset(VALUE mod)
-{
-	rb_thread_call_without_gvl(totals_reset, 0, 0, 0);
-	return Qnil;
-}
-
-/* :nodoc: */
-static VALUE mwrap_clear(VALUE mod)
-{
-	return mwrap_reset(mod);
-}
-
-static VALUE rcu_unlock_ensure(VALUE ignored)
-{
-	rcu_read_unlock();
-	--locating;
-	return Qfalse;
-}
-
-static VALUE location_string(struct src_loc *l)
-{
-	VALUE ret, tmp;
-
-	if (loc_is_addr(l)) {
-		char **s = backtrace_symbols((void *)l->k, 1);
-		tmp = rb_str_new_cstr(s[0]);
-		free(s);
-	}
-	else {
-		tmp = rb_str_new(l->k, l->capa - 1);
-	}
-
-	/* deduplicate and try to free up some memory */
-	ret = rb_funcall(tmp, id_uminus, 0);
-	if (!OBJ_FROZEN_RAW(tmp))
-		rb_str_resize(tmp, 0);
-
-	return ret;
-}
-
-static VALUE dump_each_rcu(VALUE x)
-{
-	struct dump_arg *a = (struct dump_arg *)x;
-	struct cds_lfht *t;
-	struct cds_lfht_iter iter;
-	struct src_loc *l;
-
-	t = rcu_dereference(totals);
-	cds_lfht_for_each_entry(t, &iter, l, hnode) {
-		VALUE v[6];
-		if (l->total <= a->min) continue;
-
-		v[0] = location_string(l);
-		v[1] = SIZET2NUM(l->total);
-		v[2] = SIZET2NUM(l->allocations);
-		v[3] = SIZET2NUM(l->frees);
-		v[4] = SIZET2NUM(l->age_total);
-		v[5] = SIZET2NUM(l->max_lifespan);
-
-		rb_yield_values2(6, v);
-		assert(rcu_read_ongoing());
-	}
-	return Qnil;
-}
-
-/*
- * call-seq:
- *
- *	Mwrap.each([min]) do |location,total,allocations,frees,age_total,max_lifespan|
- *	  ...
- *	end
- *
- * Yields each entry of the of the table to a caller-supplied block.
- * +min+ may be specified to filter out lines with +total+ bytes
- * equal-to-or-smaller-than the supplied minimum.
- */
-static VALUE mwrap_each(int argc, VALUE * argv, VALUE mod)
-{
-	VALUE min;
-	struct dump_arg a;
-
-	rb_scan_args(argc, argv, "01", &min);
-	a.min = NIL_P(min) ? 0 : NUM2SIZET(min);
-
-	++locating;
-	rcu_read_lock();
-
-	return rb_ensure(dump_each_rcu, (VALUE)&a, rcu_unlock_ensure, 0);
-}
-
-static size_t
-src_loc_memsize(const void *p)
-{
-	return sizeof(struct src_loc);
-}
-
-static const rb_data_type_t src_loc_type = {
-	"source_location",
-	/* no marking, no freeing */
-	{ 0, 0, src_loc_memsize, /* reserved */ },
-	/* parent, data, [ flags ] */
-};
-
-static VALUE cSrcLoc;
-
-static int
-extract_addr(const char *str, size_t len, void **p)
-{
-	const char *c;
-#if defined(__GLIBC__)
-	return ((c = memrchr(str, '[', len)) && sscanf(c, "[%p]", p));
-#else /* tested FreeBSD */
-	return ((c = strstr(str, "0x")) && sscanf(c, "%p", p));
-#endif
-}
-
-/*
- * call-seq:
- *	Mwrap[location] -> Mwrap::SourceLocation
- *
- * Returns the associated Mwrap::SourceLocation given the +location+
- * String.  +location+ is either a Ruby source location path:line
- * (e.g. "/path/to/foo.rb:5") or a hexadecimal memory address with
- * square-braces part yielded by Mwrap.dump (e.g. "[0xdeadbeef]")
- */
-static VALUE mwrap_aref(VALUE mod, VALUE loc)
-{
-	const char *str = StringValueCStr(loc);
-	int len = RSTRING_LENINT(loc);
-	struct src_loc *k = 0;
-	uintptr_t p;
-	struct cds_lfht_iter iter;
-	struct cds_lfht_node *cur;
-	struct cds_lfht *t;
-	struct src_loc *l;
-	VALUE val = Qnil;
-
-	if (extract_addr(str, len, (void **)&p)) {
-		k = (void *)kbuf;
-		memcpy(k->k, &p, sizeof(p));
-		k->capa = 0;
-		k->hval = jhash(k->k, sizeof(p), 0xdeadbeef);
-	} else {
-		k = (void *)kbuf;
-		memcpy(k->k, str, len + 1);
-		k->capa = len + 1;
-		k->hval = jhash(k->k, k->capa, 0xdeadbeef);
-	}
-
-	if (!k) return val;
-
-	rcu_read_lock();
-	t = rcu_dereference(totals);
-	if (!t) goto out_unlock;
-
-	cds_lfht_lookup(t, k->hval, loc_eq, k, &iter);
-	cur = cds_lfht_iter_get_node(&iter);
-	if (cur) {
-		l = caa_container_of(cur, struct src_loc, hnode);
-		val = TypedData_Wrap_Struct(cSrcLoc, &src_loc_type, l);
-	}
-out_unlock:
-	rcu_read_unlock();
-	return val;
-}
-
-static VALUE src_loc_each_i(VALUE p)
-{
-	struct alloc_hdr *h;
-	struct src_loc *l = (struct src_loc *)p;
-
-	cds_list_for_each_entry_rcu(h, &l->allocs, anode) {
-		size_t gen = uatomic_read(&h->as.live.gen);
-		size_t size = uatomic_read(&h->size);
-
-		if (size) {
-			VALUE v[2];
-			v[0] = SIZET2NUM(size);
-			v[1] = SIZET2NUM(gen);
-
-			rb_yield_values2(2, v);
-		}
-	}
-
-	return Qfalse;
-}
-
-static struct src_loc *src_loc_get(VALUE self)
-{
-	struct src_loc *l;
-	TypedData_Get_Struct(self, struct src_loc, &src_loc_type, l);
-	assert(l);
-	return l;
-}
-
-/*
- * call-seq:
- *	loc = Mwrap[location]
- *	loc.each { |size,generation| ... }
- *
- * Iterates through live allocations for a given Mwrap::SourceLocation,
- * yielding the +size+ (in bytes) and +generation+ of each allocation.
- * The +generation+ is the value of the GC.count method at the time
- * the allocation was made.
- *
- * This functionality is only available in mwrap 2.0.0+
- */
-static VALUE src_loc_each(VALUE self)
-{
-	struct src_loc *l = src_loc_get(self);
-
-	assert(locating == 0 && "forgot to clear locating");
-	++locating;
-	rcu_read_lock();
-	rb_ensure(src_loc_each_i, (VALUE)l, rcu_unlock_ensure, 0);
-	return self;
-}
-
-/*
- * The the mean lifespan (in GC generations) of allocations made from this
- * location.  This does not account for live allocations.
- */
-static VALUE src_loc_mean_lifespan(VALUE self)
-{
-	struct src_loc *l = src_loc_get(self);
-	size_t tot, frees;
-
-	frees = uatomic_read(&l->frees);
-	tot = uatomic_read(&l->age_total);
-	return DBL2NUM(frees ? ((double)tot/(double)frees) : HUGE_VAL);
-}
-
-/* The number of frees made from this location */
-static VALUE src_loc_frees(VALUE self)
-{
-	return SIZET2NUM(uatomic_read(&src_loc_get(self)->frees));
-}
-
-/* The number of allocations made from this location */
-static VALUE src_loc_allocations(VALUE self)
-{
-	return SIZET2NUM(uatomic_read(&src_loc_get(self)->allocations));
-}
-
-/* The total number of bytes allocated from this location */
-static VALUE src_loc_total(VALUE self)
-{
-	return SIZET2NUM(uatomic_read(&src_loc_get(self)->total));
-}
-
-/*
- * The maximum age (in GC generations) of an allocation before it was freed.
- * This does not account for live allocations.
- */
-static VALUE src_loc_max_lifespan(VALUE self)
-{
-	return SIZET2NUM(uatomic_read(&src_loc_get(self)->max_lifespan));
-}
-
-/*
- * Returns a frozen String location of the given SourceLocation object.
- */
-static VALUE src_loc_name(VALUE self)
-{
-	struct src_loc *l = src_loc_get(self);
-	VALUE ret;
-
-	++locating;
-	ret = location_string(l);
-	--locating;
-	return ret;
-}
-
-static VALUE reset_locating(VALUE ign) { --locating; return Qfalse; }
-
-/*
- * call-seq:
- *
- *	Mwrap.quiet do |depth|
- *	  # expensive sort/calculate/emitting results of Mwrap.each
- *	  # affecting statistics of the rest of the app
- *	end
- *
- * Stops allocation tracking inside the block.  This is useful for
- * monitoring code which calls other Mwrap (or ObjectSpace/GC)
- * functions which unavoidably allocate memory.
- *
- * This feature was added in mwrap 2.0.0+
- */
-static VALUE mwrap_quiet(VALUE mod)
-{
-	size_t cur = ++locating;
-	return rb_ensure(rb_yield, SIZET2NUM(cur), reset_locating, 0);
-}
-
-static VALUE total_inc(VALUE mod)
-{
-	return SIZET2NUM(total_bytes_inc);
-}
-
-static VALUE total_dec(VALUE mod)
-{
-	return SIZET2NUM(total_bytes_dec);
-}
-
-static VALUE hpb_each_yield(VALUE ignore)
-{
-	struct alloc_hdr *h, *next;
-
-	cds_list_for_each_entry_safe(h, next, &hpb_stats.bodies, anode) {
-		VALUE v[2]; /* [ generation, address ] */
-		void *addr = hdr2ptr(h);
-		assert(ptr_is_aligned(addr, HEAP_PAGE_ALIGN));
-		v[0] = LONG2NUM((long)addr);
-		v[1] = SIZET2NUM(h->as.live.gen);
-		rb_yield_values2(2, v);
-	}
-	return Qnil;
-}
-
-/*
- * call-seq:
- *
- *     Mwrap::HeapPageBody.each { |gen, addr| } -> Integer
- *
- * Yields the generation (GC.count) the heap page body was created
- * and address of the heap page body as an Integer.  Returns the
- * number of allocated pages as an Integer.  This return value should
- * match the result of GC.stat(:heap_allocated_pages)
- */
-static VALUE hpb_each(VALUE mod)
-{
-	++locating;
-	return rb_ensure(hpb_each_yield, Qfalse, reset_locating, 0);
-}
-
-/*
- * call-seq:
- *
- *	Mwrap::HeapPageBody.stat -> Hash
- *	Mwrap::HeapPageBody.stat(hash) -> hash
- *
- * The maximum lifespan of a heap page body in the Ruby VM.
- * This may be Infinity if no heap page bodies were ever freed.
- */
-static VALUE hpb_stat(int argc, VALUE *argv, VALUE hpb)
-{
-	VALUE h;
-
-	rb_scan_args(argc, argv, "01", &h);
-	if (NIL_P(h))
-		h = rb_hash_new();
-	else if (!RB_TYPE_P(h, T_HASH))
-		rb_raise(rb_eTypeError, "not a hash %+"PRIsVALUE, h);
-
-	++locating;
-#define S(x) ID2SYM(rb_intern(#x))
-	rb_hash_aset(h, S(lifespan_max), acc_max(&hpb_stats.alive));
-	rb_hash_aset(h, S(lifespan_min), acc_min(&hpb_stats.alive));
-	rb_hash_aset(h, S(lifespan_mean), acc_mean(&hpb_stats.alive));
-	rb_hash_aset(h, S(lifespan_stddev), acc_stddev(&hpb_stats.alive));
-	rb_hash_aset(h, S(deathspan_max), acc_max(&hpb_stats.reborn));
-	rb_hash_aset(h, S(deathspan_min), acc_min(&hpb_stats.reborn));
-	rb_hash_aset(h, S(deathspan_mean), acc_mean(&hpb_stats.reborn));
-	rb_hash_aset(h, S(deathspan_stddev), acc_stddev(&hpb_stats.reborn));
-	rb_hash_aset(h, S(resurrects), SIZET2NUM(hpb_stats.reborn.nr));
-#undef S
-	--locating;
-
-	return h;
-}
-
-/*
- * Document-module: Mwrap
- *
- *   require 'mwrap'
- *
- * Mwrap has a dual function as both a Ruby C extension and LD_PRELOAD
- * wrapper.  As a Ruby C extension, it exposes a limited Ruby API.
- * To be effective at gathering status, mwrap must be loaded as a
- * LD_PRELOAD (using the mwrap(1) executable makes it easy)
- *
- * ENVIRONMENT
- *
- * The "MWRAP" environment variable contains a comma-delimited list
- * of key:value options for automatically dumping at program exit.
- *
- * * dump_fd: a writable FD to dump to
- * * dump_path: a path to dump to, the file is opened in O_APPEND mode
- * * dump_min: the minimum allocation size (total) to dump
- * * dump_heap: mask of heap_page_body statistics to dump
- *
- * If both `dump_fd' and `dump_path' are specified, dump_path takes
- * precedence.
- *
- * dump_heap bitmask
- * * 0x01 - summary stats (same info as HeapPageBody.stat)
- * * 0x02 - all live heaps (similar to HeapPageBody.each)
- * * 0x04 - skip non-heap_page_body-related output
- */
-void Init_mwrap(void)
-{
-	VALUE mod, hpb;
-
-	++locating;
-	mod = rb_define_module("Mwrap");
-	id_uminus = rb_intern("-@");
-
-	/*
-	 * Represents a location in source code or library
-	 * address which calls a memory allocation.  It is
-	 * updated automatically as allocations are made, so
-	 * there is no need to reload or reread it from Mwrap#[].
-	 * This class is only available since mwrap 2.0.0+.
-	 */
-	cSrcLoc = rb_define_class_under(mod, "SourceLocation", rb_cObject);
-	rb_define_singleton_method(mod, "dump", mwrap_dump, -1);
-	rb_define_singleton_method(mod, "reset", mwrap_reset, 0);
-	rb_define_singleton_method(mod, "clear", mwrap_clear, 0);
-	rb_define_singleton_method(mod, "each", mwrap_each, -1);
-	rb_define_singleton_method(mod, "[]", mwrap_aref, 1);
-	rb_define_singleton_method(mod, "quiet", mwrap_quiet, 0);
-	rb_define_singleton_method(mod, "total_bytes_allocated", total_inc, 0);
-	rb_define_singleton_method(mod, "total_bytes_freed", total_dec, 0);
-
-
-	rb_define_method(cSrcLoc, "each", src_loc_each, 0);
-	rb_define_method(cSrcLoc, "frees", src_loc_frees, 0);
-	rb_define_method(cSrcLoc, "allocations", src_loc_allocations, 0);
-	rb_define_method(cSrcLoc, "total", src_loc_total, 0);
-	rb_define_method(cSrcLoc, "mean_lifespan", src_loc_mean_lifespan, 0);
-	rb_define_method(cSrcLoc, "max_lifespan", src_loc_max_lifespan, 0);
-	rb_define_method(cSrcLoc, "name", src_loc_name, 0);
-
-	/*
-	 * Information about "struct heap_page_body" allocations from
-	 * Ruby gc.c.  This can be useful for tracking fragmentation
-	 * from posix_memalign(3) use in mainline Ruby:
-	 *
-	 *   https://sourceware.org/bugzilla/show_bug.cgi?id=14581
-	 */
-	hpb = rb_define_class_under(mod, "HeapPageBody", rb_cObject);
-	rb_define_singleton_method(hpb, "stat", hpb_stat, -1);
-	rb_define_singleton_method(hpb, "each", hpb_each, 0);
-
-	--locating;
-}
-
-enum {
-	DUMP_HPB_STATS = 0x1,
-	DUMP_HPB_EACH = 0x2,
-	DUMP_HPB_EXCL = 0x4,
-};
-
-static void dump_hpb(FILE *fp, unsigned flags)
-{
-	if (flags & DUMP_HPB_STATS) {
-		fprintf(fp,
-			"lifespan_max: %zu\n"
-			"lifespan_min:%s%zu\n"
-			"lifespan_mean: %0.3f\n"
-			"lifespan_stddev: %0.3f\n"
-			"deathspan_max: %zu\n"
-			"deathspan_min:%s%zu\n"
-			"deathspan_mean: %0.3f\n"
-			"deathspan_stddev: %0.3f\n"
-			"gc_count: %zu\n",
-			hpb_stats.alive.max,
-			hpb_stats.alive.min == INT64_MAX ? " -" : " ",
-			hpb_stats.alive.min,
-			hpb_stats.alive.mean,
-			acc_stddev_dbl(&hpb_stats.alive),
-			hpb_stats.reborn.max,
-			hpb_stats.reborn.min == INT64_MAX ? " -" : " ",
-			hpb_stats.reborn.min,
-			hpb_stats.reborn.mean,
-			acc_stddev_dbl(&hpb_stats.reborn),
-			/* n.b.: unsafe to call rb_gc_count() in destructor */
-			generation);
-	}
-	if (flags & DUMP_HPB_EACH) {
-		struct alloc_hdr *h;
-
-		cds_list_for_each_entry(h, &hpb_stats.bodies, anode) {
-			void *addr = hdr2ptr(h);
-
-			fprintf(fp, "%p\t%zu\n", addr, h->as.live.gen);
-		}
-	}
-}
-
-/* rb_cloexec_open isn't usable by non-Ruby processes */
-#ifndef O_CLOEXEC
-#  define O_CLOEXEC 0
-#endif
-
-__attribute__ ((destructor))
-static void mwrap_dump_destructor(void)
-{
-	const char *opt = getenv("MWRAP");
-	const char *modes[] = { "a", "a+", "w", "w+", "r+" };
-	struct dump_arg a = { .min = 0 };
-	size_t i;
-	int dump_fd;
-	unsigned dump_heap = 0;
-	char *dump_path;
-	char *s;
-
-	if (!opt)
-		return;
-
-	++locating;
-	if ((dump_path = strstr(opt, "dump_path:")) &&
-			(dump_path += sizeof("dump_path")) &&
-			*dump_path) {
-		char *end = strchr(dump_path, ',');
-		if (end) {
-			char *tmp = alloca(end - dump_path + 1);
-			end = mempcpy(tmp, dump_path, end - dump_path);
-			*end = 0;
-			dump_path = tmp;
-		}
-		dump_fd = open(dump_path, O_CLOEXEC|O_WRONLY|O_APPEND|O_CREAT,
-				0666);
-		if (dump_fd < 0) {
-			fprintf(stderr, "open %s failed: %s\n", dump_path,
-				strerror(errno));
-			goto out;
-		}
-	}
-	else if (!sscanf(opt, "dump_fd:%d", &dump_fd))
-		goto out;
-
-	if ((s = strstr(opt, "dump_min:")))
-		sscanf(s, "dump_min:%zu", &a.min);
-
-	if ((s = strstr(opt, "dump_heap:")))
-		sscanf(s, "dump_heap:%u", &dump_heap);
-
-	switch (dump_fd) {
-	case 0: goto out;
-	case 1: a.fp = stdout; break;
-	case 2: a.fp = stderr; break;
-	default:
-		if (dump_fd < 0)
-			goto out;
-		a.fp = 0;
-
-		for (i = 0; !a.fp && i < 5; i++)
-			a.fp = fdopen(dump_fd, modes[i]);
-
-		if (!a.fp) {
-			fprintf(stderr, "failed to open fd=%d: %s\n",
-				dump_fd, strerror(errno));
-			goto out;
-		}
-		/* we'll leak some memory here, but this is a destructor */
-	}
-	if ((dump_heap & DUMP_HPB_EXCL) == 0)
-		dump_to_file(&a);
-	dump_hpb(a.fp, dump_heap);
-out:
-	--locating;
-}
diff --git a/ext/mwrap/jhash.h b/jhash.h
similarity index 100%
rename from ext/mwrap/jhash.h
rename to jhash.h
diff --git a/lib/Devel/Mwrap.pm b/lib/Devel/Mwrap.pm
new file mode 100644
index 0000000..f74f7d1
--- /dev/null
+++ b/lib/Devel/Mwrap.pm
@@ -0,0 +1,15 @@
+# Copyright (C) 2019 all contributors <mwrap-perl@80x24.org>
+# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+package Devel::Mwrap;
+use strict;
+our $VERSION = '0.0.0';
+use XSLoader;
+XSLoader::load(__PACKAGE__, $VERSION);
+
+1;
+__END__
+=pod
+
+=head1 NAME
+
+Devel::Mwrap - LD_PRELOAD malloc wrapper + line stats for Perl
diff --git a/lib/mwrap_rack.rb b/lib/mwrap_rack.rb
deleted file mode 100644
index e45b26d..0000000
--- a/lib/mwrap_rack.rb
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (C) 2018 all contributors <mwrap@80x24.org>
-# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
-# frozen_string_literal: true
-require 'mwrap'
-require 'rack'
-require 'cgi'
-
-# MwrapRack is a standalone Rack application which can be
-# mounted to run within your application process.
-#
-# Using the Rack::Builder API in config.ru, you can map it to
-# the "/MWRAP/" endpoint.  As with the rest of the Mwrap API,
-# your Rack server needs to be spawned with the mwrap(1)
-# wrapper to enable the LD_PRELOAD.
-#
-#     require 'mwrap_rack'
-#     map('/MWRAP') { run(MwrapRack.new) }
-#     map('/') { run(your_normal_app) }
-#
-# A live demo is available at https://80x24.org/MWRAP/
-# (warning the demo machine is 32-bit, so counters will overflow)
-#
-# This module is only available in mwrap 2.0.0+
-class MwrapRack
-  module HtmlResponse # :nodoc:
-    def response
-      [ 200, {
-          'Expires' => 'Fri, 01 Jan 1980 00:00:00 GMT',
-          'Pragma' => 'no-cache',
-          'Cache-Control' => 'no-cache, max-age=0, must-revalidate',
-          'Content-Type' => 'text/html; charset=UTF-8',
-        }, self ]
-    end
-  end
-
-  class Each < Struct.new(:script_name, :min, :sort) # :nodoc:
-    include HtmlResponse
-    HEADER = '<tr><th>' + %w(total allocations frees mean_life max_life
-                location).join('</th><th>') + '</th></tr>'
-    FIELDS = %w(total allocations frees mean_life max_life location)
-    def each
-      Mwrap.quiet do
-        t = -"Mwrap.each(#{min})"
-        sn = script_name
-        all = []
-        f = FIELDS.dup
-        sc = FIELDS.index(sort || 'total') || 0
-        f[sc] = -"<b>#{f[sc]}</b>"
-        f.map! do |hdr|
-          if hdr.start_with?('<b>')
-            hdr
-          else
-            -%Q(<a\nhref="#{sn}/each/#{min}?sort=#{hdr}">#{hdr}</a>)
-          end
-        end
-        Mwrap.each(min) do |loc, total, allocations, frees, age_sum, max_life|
-          mean_life = frees == 0 ? Float::INFINITY : age_sum/frees.to_f
-          all << [total,allocations,frees,mean_life,max_life,loc]
-        end
-        all.sort_by! { |cols| -cols[sc] }
-
-        yield(-"<html><head><title>#{t}</title></head>" \
-               "<body><h1>#{t}</h1>\n" \
-               "<h2>Current generation: #{GC.count}</h2>\n<table>\n" \
-               "<tr><th>#{f.join('</th><th>')}</th></tr>\n")
-        all.each do |cols|
-          loc = cols.pop
-          cols[3] = sprintf('%0.3f', cols[3]) # mean_life
-          href = -(+"#{sn}/at/#{CGI.escape(loc)}").encode!(xml: :attr)
-          yield(%Q(<tr><td>#{cols.join('</td><td>')}<td><a\nhref=#{
-                  href}>#{-loc.encode(xml: :text)}</a></td></tr>\n))
-          cols.clear
-        end.clear
-        yield "</table></body></html>\n"
-      end
-    end
-  end
-
-  class EachAt < Struct.new(:loc) # :nodoc:
-    include HtmlResponse
-    HEADER = '<tr><th>size</th><th>generation</th></tr>'
-
-    def each
-      t = loc.name.encode(xml: :text)
-      yield(-"<html><head><title>#{t}</title></head>" \
-             "<body><h1>live allocations at #{t}</h1>" \
-             "<h2>Current generation: #{GC.count}</h2>\n<table>#{HEADER}")
-      loc.each do |size, generation|
-        yield("<tr><td>#{size}</td><td>#{generation}</td></tr>\n")
-      end
-      yield "</table></body></html>\n"
-    end
-  end
-
-  class HeapPages # :nodoc:
-    include HtmlResponse
-    HEADER = '<tr><th>address</th><th>generation</th></tr>'
-
-    def hpb_rows
-      Mwrap::HeapPageBody.stat(stat = Thread.current[:mwrap_hpb_stat] ||= {})
-      %i(lifespan_max lifespan_min lifespan_mean lifespan_stddev
-         deathspan_max deathspan_min deathspan_mean deathspan_stddev
-         resurrects
-        ).map! do |k|
-         "<tr><td>#{k}</td><td>#{stat[k]}</td></tr>\n"
-      end.join
-    end
-
-    def gc_stat_rows
-      GC.stat(stat = Thread.current[:mwrap_gc_stat] ||= {})
-      %i(count heap_allocated_pages heap_eden_pages heap_tomb_pages
-          total_allocated_pages total_freed_pages).map do |k|
-         "<tr><td>GC.stat(:#{k})</td><td>#{stat[k]}</td></tr>\n"
-      end.join
-    end
-
-    GC_STAT_URL = 'https://docs.ruby-lang.org/en/trunk/GC.html#method-c-stat'
-    GC_STAT_HELP = <<~""
-      <p>Non-Infinity lifespans can indicate fragmentation.
-      <p>See <a
-      href="#{GC_STAT_URL}">#{GC_STAT_URL}</a> for info on GC.stat values.
-
-    def each
-      Mwrap.quiet do
-        yield("<html><head><title>heap pages</title></head>" \
-              "<body><h1>heap pages</h1>" \
-              "<table><tr><th>stat</th><th>value</th></tr>\n" \
-              "#{hpb_rows}" \
-              "#{gc_stat_rows}" \
-              "</table>\n" \
-              "#{GC_STAT_HELP}" \
-              "<table>#{HEADER}")
-        Mwrap::HeapPageBody.each do |addr, generation|
-          addr = -sprintf('0x%x', addr)
-          yield(-"<tr><td>#{addr}</td><td>#{generation}</td></tr>\n")
-        end
-        yield "</table></body></html>\n"
-      end
-    end
-  end
-
-  def r404 # :nodoc:
-    [404,{'Content-Type'=>'text/plain'},["Not found\n"]]
-  end
-
-  # The standard Rack application endpoint for MwrapRack
-  def call(env)
-    case env['PATH_INFO']
-    when %r{\A/each/(\d+)\z}
-      min = $1.to_i
-      m = env['QUERY_STRING'].match(/\bsort=(\w+)/)
-      Each.new(env['SCRIPT_NAME'], min, m ? m[1] : nil).response
-    when %r{\A/at/(.*)\z}
-      loc = -CGI.unescape($1)
-      loc = Mwrap[loc] or return r404
-      EachAt.new(loc).response
-    when '/heap_pages'
-      HeapPages.new.response
-    when '/'
-      n = 2000
-      u = 'https://80x24.org/mwrap/README.html'
-      b = -('<html><head><title>Mwrap demo</title></head>' \
-          "<body><p><a href=\"each/#{n}\">allocations &gt;#{n} bytes</a>" \
-          "<p><a href=\"#{u}\">#{u}</a>" \
-          "<p><a href=\"heap_pages\">heap pages</a>" \
-          "</body></html>\n")
-      [ 200, {'Content-Type'=>'text/html','Content-Length'=>-b.size.to_s},[b]]
-    else
-      r404
-    end
-  end
-end
diff --git a/mwrap.gemspec b/mwrap.gemspec
deleted file mode 100644
index 2c01a68..0000000
--- a/mwrap.gemspec
+++ /dev/null
@@ -1,32 +0,0 @@
-git_manifest = `git ls-files 2>/dev/null`.split("\n")
-manifest = File.exist?('MANIFEST') ?
-  File.readlines('MANIFEST').map!(&:chomp).delete_if(&:empty?) : git_manifest
-if git_manifest[0] && manifest != git_manifest
-  tmp = "MANIFEST.#$$.tmp"
-  File.open(tmp, 'w') { |fp| fp.puts(git_manifest.join("\n")) }
-  File.rename(tmp, 'MANIFEST')
-  system('git add MANIFEST')
-end
-
-desc = `git describe --abbrev=4 HEAD`.strip.tr('-', '.').delete_prefix('v')
-
-Gem::Specification.new do |s|
-  s.name = 'mwrap'
-  s.version = desc.empty? ? '2.0.0' : desc
-  s.homepage = 'https://80x24.org/mwrap/'
-  s.authors = ["Ruby hackers"]
-  s.summary = 'LD_PRELOAD malloc wrapper for Ruby'
-  s.executables = %w(mwrap)
-  s.files = manifest
-  s.description = <<~EOF
-mwrap wraps all malloc, calloc, and realloc calls to trace the Ruby
-source location of such calls and bytes allocated at each callsite.
-  EOF
-  s.email = %q{e@80x24.org}
-  s.test_files = Dir['test/test_*.rb']
-  s.extensions = %w(ext/mwrap/extconf.rb)
-
-  s.add_development_dependency('test-unit', '~> 3.0')
-  s.add_development_dependency('rake-compiler', '~> 1.0')
-  s.licenses = %w(GPL-2.0+)
-end
diff --git a/script/mwrap-perl b/script/mwrap-perl
new file mode 100644
index 0000000..5e5eec4
--- /dev/null
+++ b/script/mwrap-perl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl -w
+# Copyright (C) 2019 mwrap hackers <mwrap-perl@80x24.org>
+# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+use strict;
+use Devel::Mwrap;
+my $so;
+if ($^O eq 'linux') {
+	my $maps = do {
+		open my $fh, '<', "/proc/$$/maps" or
+				die "/proc/$$/maps not accessible: $!\n";
+		local $/;
+		<$fh>;
+	};
+	if ($maps =~ m![ \t](/[^\n]+?/Mwrap\.so)$!sm) {
+		$so = $1;
+	} else {
+		die "Mwrap.so not found in /proc/$$/maps\n";
+	}
+} else {
+	die "unsupported OS ($^O ne 'linux')";
+}
+my $cur = $ENV{LD_PRELOAD};
+if (defined $cur) {
+	my @cur = split(/[: \t]+/, $cur);
+	my %cur = map { $_ => 1 } @cur;
+	if (!$cur{$so}) {
+		# drop old redundant versions
+		my @keep = grep(!m!/Mwrap\.so\z!, @cur);
+		$ENV{LD_PRELOAD} = join(':', $so, @keep);
+	}
+} else {
+	$ENV{LD_PRELOAD} = $so;
+}
+exec @ARGV;
diff --git a/t/mwrap.t b/t/mwrap.t
new file mode 100644
index 0000000..5bcc285
--- /dev/null
+++ b/t/mwrap.t
@@ -0,0 +1,85 @@
+#!perl -w
+# Copyright (C) 2019 mwrap hackers <mwrap-perl@80x24.org>
+# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+use strict;
+use Test::More;
+use File::Temp qw(tempdir);
+use_ok 'Devel::Mwrap';
+
+my $tmpdir = tempdir('mwrap-perl-XXXXXX', TMPDIR => 1, CLEANUP => 1);
+my $dump = "$tmpdir/dump";
+my $out = "$tmpdir/out";
+my $err = "$tmpdir/err";
+my $src = slurp('blib/script/mwrap-perl');
+
+{
+	my $env = { MWRAP => "dump_path:$dump,dump_min:10000" };
+	my $nr = 1000;
+	mwrap_run('dump test', $env, '-e', '$x = "hello world" x '.$nr);
+	ok(-s $dump, "dump file written to");
+	my $s = slurp($dump);
+	my $re = qr/([0-9]+)[ \t]+([0-9]+)[ \t]+-e:1[ \t]*\n/sm;
+	my ($bytes, $n);
+	if ($s =~ $re) {
+		($bytes, $n) = ($1, $2);
+		ok($bytes >= (length('hello world') * $nr),
+			"counted 'hello world' x $nr");
+		ok($n >= 1, 'allocation counted');
+	} else {
+		fail("$s failed to match $re");
+	}
+}
+
+SKIP: { # C++ program which uses malloc via "new"
+	my $exp = `cmake -h`;
+	skip 'cmake missing', 2 if $?;
+	skip "`cmake -h' gave no output", 2 unless $exp =~ /\S/s;
+	open my $truncate, '>', $dump or die;
+	close $truncate or die;
+	my $env = { MWRAP => "dump_path:$dump" };
+	mwrap_run('cmake (C++ new)', $env, '-e',
+		'system(qw(cmake -h)); exit $?');
+	my $res = slurp($out);
+	is($res, $exp, "`cmake -h' works");
+};
+
+{
+	my $env = { MWRAP => "dump_path:$dump" };
+	mwrap_run('total_bytes*', $env, '-e', <<'E1');
+my $A = Devel::Mwrap::total_bytes_allocated();
+my $f = Devel::Mwrap::total_bytes_freed();
+print("$A - $f\n");
+E1
+	my $o = slurp($out);
+	like($o, qr/^([0-9]+) - ([0-9]+)\n/s, 'got allocated & freed bytes');
+}
+
+{
+	my $env = { MWRAP => "dump_path:$dump" };
+	mwrap_run('source location', $env, 't/source_location.perl');
+}
+
+done_testing();
+
+sub slurp {
+	open my $fh, '<', $_[0] or die "open($_[0]): $!";
+	local $/;
+	<$fh>;
+}
+
+sub mwrap_run {
+	my ($msg, $env, @args) = @_;
+	my $pid = fork;
+	if ($pid == 0) {
+		while (my ($k, $v) = each %$env) {
+			$ENV{$k} = $v;
+		}
+		open STDERR, '>', $err or die "open: $!";
+		open STDOUT, '>', $out or die "open: $!";
+		@ARGV = ($^X, '-MDevel::Mwrap', @args);
+		eval $src;
+		die "fail: $! ($@)";
+	}
+	waitpid($pid, 0);
+	is($?, 0, $msg);
+}
diff --git a/t/source_location.perl b/t/source_location.perl
new file mode 100644
index 0000000..ed81ed8
--- /dev/null
+++ b/t/source_location.perl
@@ -0,0 +1,9 @@
+use Devel::Mwrap;
+my $foo = ('hello world' x 10000);
+my $k = __FILE__ . ":2";
+my $loc = Devel::Mwrap::get($k) or die;
+$loc->name eq $k or die;
+$loc->total >= 10000 or die;
+$loc->allocations >= 1 or die;
+$loc->frees >= 0 or die;
+exit 0;
diff --git a/test/test_mwrap.rb b/test/test_mwrap.rb
deleted file mode 100644
index 48fba23..0000000
--- a/test/test_mwrap.rb
+++ /dev/null
@@ -1,322 +0,0 @@
-# frozen_string_literal: true
-# Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org>
-# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
-require 'test/unit'
-require 'mwrap'
-require 'rbconfig'
-require 'tempfile'
-
-class TestMwrap < Test::Unit::TestCase
-  RB = "#{RbConfig::CONFIG['bindir']}/#{RbConfig::CONFIG['RUBY_INSTALL_NAME']}"
-
-  mwrap_so = $".grep(%r{/mwrap\.so\z})[0]
-  env = ENV.to_hash
-  cur = env['LD_PRELOAD']
-  env['LD_PRELOAD'] = cur ? "#{mwrap_so}:#{cur}".freeze : mwrap_so
-  @@env = env.freeze
-  inc = File.dirname(mwrap_so)
-  @@cmd = %W(#{RB} -w --disable=gems -I#{inc} -rmwrap).freeze
-
-  def test_mwrap_preload
-    cmd = @@cmd + %w(
-      -e ("helloworld"*1000).clear
-      -e Mwrap.dump
-    )
-    Tempfile.create('junk') do |tmp|
-      tmp.sync = true
-      res = system(@@env, *cmd, err: tmp)
-      assert res, $?.inspect
-      tmp.rewind
-      lines = tmp.readlines
-      line_1 = lines.grep(/\s-e:1\b/)[0].strip
-      assert_equal '10001', line_1.split(/\s+/)[0]
-    end
-  end
-
-  def test_dump_via_destructor
-    env = @@env.dup
-    env['MWRAP'] = 'dump_fd:5'
-    cmd = @@cmd + %w(-e ("0"*10000).clear)
-    Tempfile.create('junk') do |tmp|
-      tmp.sync = true
-      res = system(env, *cmd, { 5 => tmp })
-      assert res, $?.inspect
-      tmp.rewind
-      assert_match(/\b10001\s+1\s+-e:1$/, tmp.read)
-
-      env['MWRAP'] = 'dump_fd:1,dump_min:10000'
-      tmp.rewind
-      tmp.truncate(0)
-      res = system(env, *cmd, { 1 => tmp })
-      assert res, $?.inspect
-      tmp.rewind
-      assert_match(/\b10001\s+1\s+-e:1$/, tmp.read)
-
-      tmp.rewind
-      tmp.truncate(0)
-      env['MWRAP'] = "dump_path:#{tmp.path},dump_min:10000"
-      res = system(env, *cmd)
-      assert res, $?.inspect
-      assert_match(/\b10001\s+1\s+-e:1$/, tmp.read)
-
-      tmp.rewind
-      tmp.truncate(0)
-      env['MWRAP'] = "dump_path:#{tmp.path},dump_heap:5"
-      res = system(env, *cmd)
-      assert res, $?.inspect
-      assert_match %r{lifespan_stddev}, tmp.read
-    end
-  end
-
-  def test_cmake
-    begin
-      exp = `cmake -h`
-    rescue Errno::ENOENT
-      warn 'cmake missing'
-      return
-    end
-    assert_not_predicate exp.strip, :empty?
-    env = @@env.merge('MWRAP' => 'dump_fd:1')
-    out = IO.popen(env, %w(cmake -h), &:read)
-    assert out.start_with?(exp), 'original help exists'
-    assert_not_equal exp, out, 'includes dump output'
-    dump = out.delete_prefix(exp)
-    assert_match(/\b0x[a-f0-9]+\b/s, dump, 'dump output has addresses')
-  end
-
-  def test_clear
-    cmd = @@cmd + %w(
-      -e ("0"*10000).clear
-      -e Mwrap.clear
-      -e ("0"*20000).clear
-      -e Mwrap.dump($stdout,9999)
-    )
-    Tempfile.create('junk') do |tmp|
-      tmp.sync = true
-      res = system(@@env, *cmd, { 1 => tmp })
-      assert res, $?.inspect
-      tmp.rewind
-      buf = tmp.read
-      assert_not_match(/\s+-e:1$/, buf)
-      assert_match(/\b20001\s+1\s+-e:3$/, buf)
-    end
-  end
-
-  # make sure we don't break commands spawned by an mwrap-ed Ruby process:
-  def test_non_ruby_exec
-    IO.pipe do |r, w|
-      th = Thread.new { r.read }
-      Tempfile.create('junk') do |tmp|
-        tmp.sync = true
-        env = @@env.merge('MWRAP' => "dump_path:#{tmp.path}")
-        cmd = %w(perl -e print("HELLO_WORLD"))
-        res = system(env, *cmd, out: w)
-        w.close
-        assert res, $?.inspect
-        assert_match(/0x[a-f0-9]+\b/, tmp.read)
-      end
-      assert_equal "HELLO_WORLD", th.value
-    end
-  end
-
-  # some URCU flavors use USR1, ensure the one we choose does not
-  def test_sigusr1_works
-    cmd = @@cmd + %w(
-      -e STDOUT.sync=true
-      -e trap(:USR1){p("HELLO_WORLD")}
-      -e END{Mwrap.dump}
-      -e puts -e STDIN.read)
-    IO.pipe do |r, w|
-      IO.pipe do |r2, w2|
-        pid = spawn(@@env, *cmd, in: r2, out: w, err: '/dev/null')
-        r2.close
-        w.close
-        assert_equal "\n", r.gets
-        buf = +''
-        10.times { Process.kill(:USR1, pid) }
-        while IO.select([r], nil, nil, 0.1)
-          case tmp = r.read_nonblock(1000, exception: false)
-          when String
-            buf << tmp
-          end
-        end
-        w2.close
-        Process.wait(pid)
-        assert_predicate $?, :success?, $?.inspect
-        assert_equal(["\"HELLO_WORLD\"\n"], buf.split(/^/).uniq)
-      end
-    end
-  end
-
-  def test_reset
-    assert_nil Mwrap.reset
-  end
-
-  def test_each
-    cmd = @@cmd + %w(
-      -e ("0"*10000).clear
-      -e h={}
-      -e Mwrap.each(1000){|a,b,c|h[a]=[b,c]}
-      -e puts(Marshal.dump(h))
-    )
-    r = IO.popen(@@env, cmd, 'r')
-    h = Marshal.load(r.read)
-    assert_not_predicate h, :empty?
-    h.each_key { |k| assert_kind_of String, k }
-    h.each_value do |total,calls|
-      assert_operator total, :>, 0
-      assert_operator calls, :>, 0
-      assert_operator total, :>=, calls
-    end
-  end
-
-  def test_aref_each
-    cmd = @@cmd + %w(
-      -e count=GC.count
-      -e GC.disable
-      -e keep=("0"*10000)
-      -e loc=Mwrap["-e:3"]
-      -e loc.each{|size,gen|p([size,gen,count])}
-    )
-    buf = IO.popen(@@env, cmd, &:read)
-    assert_predicate $?, :success?
-    assert_match(/\A\[\s*\d+,\s*\d+,\s*\d+\]\s*\z/s, buf)
-    size, gen, count = eval(buf)
-    assert_operator size, :>=, 10000
-    assert_operator gen, :>=, count
-
-    cmd = @@cmd + %w(
-      -e count=GC.count
-      -e locs=""
-      -e Mwrap.each(1){|loc,tot,calls|locs<<loc}
-      -e m=locs.match(/(\[0x[a-f0-9]+\])/i)
-      -e m||=locs.match(/\b(0x[a-f0-9]+)\b/i)
-      -e p(loc=Mwrap["bobloblaw\t#{m[1]}"])
-      -e loc.each{|size,gen|p([size,gen,count])}
-    )
-    buf = IO.popen(@@env, cmd, &:read)
-    assert_predicate $?, :success?
-    assert_match(/\bMwrap::SourceLocation\b/, buf)
-  end
-
-  def test_benchmark
-    cmd = @@cmd + %w(-rbenchmark
-      -e puts(Benchmark.measure{1000000.times{Time.now}}))
-    r = IO.popen(@@env, cmd, 'r')
-    require 'benchmark'
-    warn Benchmark::Tms::CAPTION
-    warn r.read
-  end if ENV['BENCHMARK']
-
-  def test_mwrap_dump_check
-    assert_raise(TypeError) { Mwrap.dump(:bogus) }
-  end
-
-  def assert_separately(src, *opts)
-    Tempfile.create(%w(mwrap .rb)) do |tmp|
-      tmp.write(src.lstrip!)
-      tmp.flush
-      assert(system(@@env, *@@cmd, tmp.path, *opts))
-    end
-  end
-
-  def test_source_location
-    assert_separately(+"#{<<~"begin;"}\n#{<<~'end;'}")
-    begin;
-      require 'mwrap'
-      foo = '0' * 10000
-      k = -"#{__FILE__}:2"
-      loc = Mwrap[k]
-      loc.name == k or abort 'SourceLocation#name broken'
-      loc.total >= 10000 or abort 'SourceLocation#total broken'
-      loc.frees == 0 or abort 'SourceLocation#frees broken'
-      loc.allocations == 1 or abort 'SourceLocation#allocations broken'
-      seen = false
-      loc.each do |*x| seen = x end
-      seen[1] == loc.total or 'SourceLocation#each broken'
-      foo.clear
-
-      # wait for call_rcu to perform real_free
-      freed = false
-      until freed
-        freed = true
-        loc.each do freed = false end
-      end
-      loc.frees == 1 or abort 'SourceLocation#frees broken (after free)'
-      Float === loc.mean_lifespan or abort 'mean_lifespan broken'
-      Integer === loc.max_lifespan or abort 'max_lifespan broken'
-
-      addr = false
-      Mwrap.each do |a,|
-        if a =~ /0x[a-f0-9]+/
-          addr = a
-          break
-        end
-      end
-      addr && addr.frozen? or abort 'Mwrap.each returned unfrozen address'
-      loc = Mwrap[addr] or abort "Mwrap[#{addr}] broken"
-      addr == loc.name or abort 'SourceLocation#name works on address'
-      loc.name.frozen? or abort 'SourceLocation#name not frozen'
-    end;
-  end
-
-  def test_quiet
-    assert_separately(+"#{<<~"begin;"}\n#{<<~'end;'}")
-    begin;
-      require 'mwrap'
-      before = __LINE__
-      res = Mwrap.quiet do |depth|
-        depth == 1 or abort 'depth is not 1'
-        ('a' * 10000).clear
-        Mwrap.quiet { |d| d == 2 or abort 'depth is not 2' }
-        :foo
-      end
-      after = __LINE__ - 1
-      (before..after).each do |lineno|
-        Mwrap["#{__FILE__}:#{lineno}"] and
-          abort "unexpectedly tracked allocation at line #{lineno}"
-      end
-      res == :foo or abort 'Mwrap.quiet did not return block result'
-    end;
-  end
-
-  def test_total_bytes
-    assert_separately(+"#{<<~"begin;"}\n#{<<~'end;'}")
-    begin;
-      require 'mwrap'
-      Mwrap.total_bytes_allocated > 0 or abort 'nothing allocated'
-      Mwrap.total_bytes_freed > 0 or abort 'nothing freed'
-      Mwrap.total_bytes_allocated > Mwrap.total_bytes_freed or
-        abort 'freed more than allocated'
-    end;
-  end
-
-  def test_heap_page_body
-    assert_separately(+"#{<<~"begin;"}\n#{<<~'end;'}")
-    begin;
-      require 'mwrap'
-      require 'rubygems' # use up some memory
-      ap = GC.stat(:heap_allocated_pages)
-      h = {}
-      nr = 0
-      Mwrap::HeapPageBody.each do |addr, gen|
-        nr += 1
-        gen <= GC.count && gen >= 0 or abort "bad generation: #{gen}"
-        (0 == (addr & 16383)) or abort "addr not aligned: #{'%x' % addr}"
-      end
-      nr == ap or abort 'HeapPageBody.each missed page'
-      10.times { (1..20000).to_a.map(&:to_s) }
-      3.times { GC.start }
-      Mwrap::HeapPageBody.stat(h)
-      Integer === h[:lifespan_max] or abort 'lifespan_max not recorded'
-      Integer === h[:lifespan_min] or abort 'lifespan_min not recorded'
-      Float === h[:lifespan_mean] or abort 'lifespan_mean not recorded'
-      3.times { GC.start }
-      10.times { (1..20000).to_a.map(&:to_s) }
-      Mwrap::HeapPageBody.stat(h)
-      h[:deathspan_min] <= h[:deathspan_max] or
-        abort 'wrong min/max deathtime'
-      Float === h[:deathspan_mean] or abort 'deathspan_mean not recorded'
-    end;
-  end
-end
diff --git a/typemap b/typemap
new file mode 100644
index 0000000..9531289
--- /dev/null
+++ b/typemap
@@ -0,0 +1,4 @@
+TYPEMAP
+size_t	T_UV
+const char *	T_PV
+Devel::Mwrap::SrcLoc	T_PTROBJ

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2019-10-31 20:03 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-31 20:03 [PATCH] port to Perl5 and XS Eric Wong

Code repositories for project(s) associated with this public inbox

	https://80x24.org/mwrap-perl.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).