-rw-r--r--  .document                                 |    2
-rw-r--r--  .gitignore                                |   12
-rw-r--r--  .olddoc.yml                               |    8
-rw-r--r--  MANIFEST                                  |   18
-rw-r--r--  Makefile.PL                               |   72
-rw-r--r--  Mwrap.xs                                  |  891
-rw-r--r--  README                                    |   86
-rw-r--r--  Rakefile                                  |   16
-rwxr-xr-x  bin/mwrap                                 |   36
-rw-r--r--  ext/mwrap/extconf.rb                      |   28
-rw-r--r--  ext/mwrap/mwrap.c                         | 1464
-rw-r--r--  jhash.h (renamed from ext/mwrap/jhash.h)  |    0
-rw-r--r--  lib/Devel/Mwrap.pm                        |   15
-rw-r--r--  lib/mwrap_rack.rb                         |  172
-rw-r--r--  mwrap.gemspec                             |   32
-rw-r--r--  script/mwrap-perl                         |   34
-rw-r--r--  t/mwrap.t                                 |   85
-rw-r--r--  t/source_location.perl                    |    9
-rw-r--r--  test/test_mwrap.rb                        |  322
-rw-r--r--  typemap                                   |    4
20 files changed, 1163 insertions(+), 2143 deletions(-)
diff --git a/.document b/.document
deleted file mode 100644
index 4ca33e3..0000000
--- a/.document
+++ /dev/null
@@ -1,2 +0,0 @@
-ext/mwrap/mwrap.c
-lib/mwrap_rack.rb
diff --git a/.gitignore b/.gitignore
index aa3606c..81948b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,10 @@
-/tmp
 *.o
 *.so
-/pkg
-/*.gem
-/doc
+/MYMETA.
+/MYMETA.*
+/MANIFEST.gen
+/Makefile
+/Mwrap.bs
+/Mwrap.c
+/blib
+/pm_to_blib
diff --git a/.olddoc.yml b/.olddoc.yml
deleted file mode 100644
index dac0353..0000000
--- a/.olddoc.yml
+++ /dev/null
@@ -1,8 +0,0 @@
----
-cgit_url: https://80x24.org/mwrap.git
-git_url: https://80x24.org/mwrap.git
-rdoc_url: https://80x24.org/mwrap/
-ml_url: https://80x24.org/mwrap-public/
-public_email: mwrap-public@80x24.org
-nntp_url:
-  - nntp://news.public-inbox.org/inbox.comp.lang.ruby.mwrap
diff --git a/MANIFEST b/MANIFEST
index e6d8964..2fa42b1 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,14 +1,12 @@
-.document
 .gitignore
-.olddoc.yml
 COPYING
 MANIFEST
+Makefile.PL
+Mwrap.xs
 README
-Rakefile
-bin/mwrap
-ext/mwrap/extconf.rb
-ext/mwrap/jhash.h
-ext/mwrap/mwrap.c
-lib/mwrap_rack.rb
-mwrap.gemspec
-test/test_mwrap.rb
+jhash.h
+lib/Devel/Mwrap.pm
+script/mwrap-perl
+t/mwrap.t
+t/source_location.perl
+typemap
diff --git a/Makefile.PL b/Makefile.PL
new file mode 100644
index 0000000..1ae3080
--- /dev/null
+++ b/Makefile.PL
@@ -0,0 +1,72 @@
+use strict;
+use ExtUtils::MakeMaker;
+use Config;
+my $pkg_config = $ENV{PKG_CONFIG} // 'pkg-config';
+my $LIBS = `$pkg_config --libs liburcu-cds liburcu-bp`;
+if ($?) {
+        print STDERR <<END;
+`$pkg_config --libs liburcu-cds liburcu-bp` failed (\$?=$?)
+
+You need to install pkg-config and liburcu <https://liburcu.org/>
+before you can build Devel::Mwrap.
+
+On Debian:
+
+        apt-get install pkg-config liburcu-dev
+END
+        # tell CPAN testing to indicate missing deps
+        exit 0;
+}
+
+if ($Config{usemymalloc} eq 'y') {
+        print STDERR <<END;
+Devel::Mwrap requires `usemymalloc=n'.  malloc and related functions
+must be dynamically-linked.
+END
+        exit 0;
+}
+
+# may be empty
+chomp(my $INC = `$pkg_config --cflags liburcu-cds liburcu-bp`);
+my @writemakefile_args = ();
+# Filter out some gcc options which g++ doesn't support.
+my $CCFLAGS = $Config{ccflags};
+
+if (defined $ENV{CPPFLAGS}) {
+        $CCFLAGS .= ' ' . $ENV{CPPFLAGS};
+}
+
+# See lib/ExtUtils/MakeMaker.pm for details of how to influence
+# the contents of the Makefile that is written.
+push @writemakefile_args, (
+        NAME => 'Devel::Mwrap',
+        VERSION_FROM => 'lib/Devel/Mwrap.pm',
+        PREREQ_PM => {},
+        ABSTRACT_FROM => 'lib/Devel/Mwrap.pm',
+        EXE_FILES => [qw(script/mwrap-perl)],
+        AUTHOR => 'mwrap hackers <mwrap-perl@80x24.org>',
+        LIBS => $LIBS, # e.g. -lurcu-cds
+        LICENSE => 'gpl_2', # GPL-2.0+, CPAN::Meta::Spec limitation
+        MIN_PERL_VERSION => '5.14.0', # for caller_cx
+        BUILD_REQUIRES => {},
+        CCFLAGS => $CCFLAGS, # e.g -I/usr/include/$ARCH
+        INC => $INC,
+        depend => {
+                Makefile => 'lib/Devel/Mwrap.pm',
+        }
+);
+
+WriteMakefile(@writemakefile_args);
+
+sub MY::postamble {
+        <<EOF;
+N = \$\$(( \$\$(nproc 2>/dev/null || gnproc 2>/dev/null || echo 2) + 1 ))
+-include config.mak
+
+check-manifest :: MANIFEST
+        if git ls-files >\$?.gen 2>&1; then diff -u \$? \$?.gen; fi
+
+check:: all check-manifest
+        PERL5LIB=blib/lib:blib/arch prove -vw -j\$(N)
+EOF
+}
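
For anyone trying the conversion locally, the Makefile.PL above follows the
usual ExtUtils::MakeMaker flow plus the `check' target defined in
MY::postamble; a sketch of the build steps, assuming pkg-config and the
liburcu development headers are already installed:

        perl Makefile.PL
        make
        make check   # postamble target: runs prove on t/ with blib in PERL5LIB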
diff --git a/Mwrap.xs b/Mwrap.xs
new file mode 100644
index 0000000..f196b1a
--- /dev/null
+++ b/Mwrap.xs
@@ -0,0 +1,891 @@
+/*
+ * Copyright (C) 2018-2019 mwrap hackers <mwrap-perl@80x24.org>
+ * License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+ * Disclaimer: I don't really know my way around XS or Perl internals well
+ */
+#define _LGPL_SOURCE /* allows URCU to inline some stuff */
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+#include "embed.h"
+
+#include <execinfo.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dlfcn.h>
+#include <assert.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <urcu-bp.h>
+#include <urcu/rculfhash.h>
+#include <urcu/rculist.h>
+#include "jhash.h"
+
+static size_t total_bytes_inc, total_bytes_dec;
+
+extern pthread_key_t __attribute__((weak)) PL_thr_key;
+
+/* true for glibc/dlmalloc/ptmalloc, not sure about jemalloc */
+#define ASSUMED_MALLOC_ALIGNMENT (sizeof(void *) * 2)
+
+#ifdef __FreeBSD__
+void *__malloc(size_t);
+void __free(void *);
+#  define real_malloc __malloc
+#  define real_free __free
+#else
+static void *(*real_malloc)(size_t);
+static void (*real_free)(void *);
+static int resolving_malloc;
+#endif /* !FreeBSD */
+
+/*
+ * we need to fake an OOM condition while dlsym is running,
+ * as that calls calloc under glibc, but we don't have the
+ * symbol for the jemalloc calloc, yet
+ */
+#  define RETURN_IF_NOT_READY() do { \
+        if (!real_malloc) { \
+                errno = ENOMEM; \
+                return NULL; \
+        } \
+} while (0)
+
+static __thread size_t locating;
+static size_t page_size;
+static struct cds_lfht *totals;
+union padded_mutex {
+        pthread_mutex_t mtx;
+        char pad[64];
+};
+
+/* a round-robin pool of mutexes */
+#define MUTEX_NR   (1 << 6)
+#define MUTEX_MASK (MUTEX_NR - 1)
+static size_t mutex_i;
+static union padded_mutex mutexes[MUTEX_NR] = {
+        [0 ... (MUTEX_NR-1)].mtx = PTHREAD_MUTEX_INITIALIZER
+};
+
+static pthread_mutex_t *mutex_assign(void)
+{
+        return &mutexes[uatomic_add_return(&mutex_i, 1) & MUTEX_MASK].mtx;
+}
+
+static struct cds_lfht *
+lfht_new(void)
+{
+        return cds_lfht_new(16384, 1, 0, CDS_LFHT_AUTO_RESIZE, 0);
+}
+
+__attribute__((constructor)) static void resolve_malloc(void)
+{
+        int err;
+        ++locating;
+
+#ifdef __FreeBSD__
+        /*
+         * PTHREAD_MUTEX_INITIALIZER on FreeBSD means lazy initialization,
+         * which happens at pthread_mutex_lock, and that calls calloc
+         */
+        {
+                size_t i;
+
+                for (i = 0; i < MUTEX_NR; i++) {
+                        err = pthread_mutex_init(&mutexes[i].mtx, 0);
+                        if (err) {
+                                fprintf(stderr, "error: %s\n", strerror(err));
+                                _exit(1);
+                        }
+                }
+                /* initialize mutexes used by urcu-bp */
+                rcu_read_lock();
+                rcu_read_unlock();
+        }
+#else /* !FreeBSD (tested on GNU/Linux) */
+        if (!real_malloc) {
+                resolving_malloc = 1;
+                real_malloc = dlsym(RTLD_NEXT, "malloc");
+        }
+        real_free = dlsym(RTLD_NEXT, "free");
+        if (!real_malloc || !real_free) {
+                fprintf(stderr, "missing malloc/aligned_alloc/free\n"
+                        "\t%p %p\n", real_malloc, real_free);
+                _exit(1);
+        }
+#endif /* !FreeBSD */
+        err = pthread_atfork(call_rcu_before_fork,
+                                call_rcu_after_fork_parent,
+                                call_rcu_after_fork_child);
+        if (err)
+                fprintf(stderr, "pthread_atfork failed: %s\n", strerror(err));
+        page_size = sysconf(_SC_PAGESIZE);
+        --locating;
+}
+
+static void
+mutex_lock(pthread_mutex_t *m)
+{
+        int err = pthread_mutex_lock(m);
+        assert(err == 0);
+}
+
+static void
+mutex_unlock(pthread_mutex_t *m)
+{
+        int err = pthread_mutex_unlock(m);
+        assert(err == 0);
+}
+
+#ifndef HAVE_MEMPCPY
+static void *
+my_mempcpy(void *dest, const void *src, size_t n)
+{
+        return (char *)memcpy(dest, src, n) + n;
+}
+#define mempcpy(dst,src,n) my_mempcpy(dst,src,n)
+#endif
+
+/* stolen from glibc: */
+#define RETURN_ADDRESS(nr) \
+  (uintptr_t)(__builtin_extract_return_addr(__builtin_return_address(nr)))
+
+#define INT2STR_MAX (sizeof(unsigned) == 4 ? 10 : 19)
+static char *int2str(unsigned num, char *dst, size_t * size)
+{
+        if (num <= 9) {
+                *size -= 1;
+                *dst++ = (char)(num + '0');
+                return dst;
+        } else {
+                char buf[INT2STR_MAX];
+                char *end = buf + sizeof(buf);
+                char *p = end;
+                size_t adj;
+
+                do {
+                        *size -= 1;
+                        *--p = (char)((num % 10) + '0');
+                        num /= 10;
+                } while (num && *size);
+
+                if (!num) {
+                        adj = end - p;
+                        return mempcpy(dst, p, adj);
+                }
+        }
+        return NULL;
+}
+
+/* allocated via real_malloc/real_free */
+struct src_loc {
+        pthread_mutex_t *mtx;
+        size_t total;
+        size_t allocations;
+        size_t frees;
+        struct cds_lfht_node hnode;
+        struct cds_list_head allocs; /* <=> alloc_hdr.node */
+        uint32_t hval;
+        uint32_t capa;
+        char k[];
+};
+
+/*
+ * I hate typedefs, especially when they're hiding the fact that there's
+ * a pointer, but XS needs this, apparently, and it does s/__/::/g
+ */
+typedef struct src_loc * Devel__Mwrap__SrcLoc;
+
+/* every allocation has this in the header, maintain alignment with malloc  */
+struct alloc_hdr {
+        struct cds_list_head anode; /* <=> src_loc.allocs */
+        union {
+                struct {
+                        struct src_loc *loc;
+                } live;
+                struct rcu_head dead;
+        } as;
+        void *real; /* what to call real_free on */
+        size_t size;
+};
+
+static __thread char kbuf[
+        PATH_MAX + INT2STR_MAX + sizeof(struct alloc_hdr) + 2
+];
+
+static struct alloc_hdr *ptr2hdr(void *p)
+{
+        return (struct alloc_hdr *)((uintptr_t)p - sizeof(struct alloc_hdr));
+}
+
+static void *hdr2ptr(struct alloc_hdr *h)
+{
+        return (void *)((uintptr_t)h + sizeof(struct alloc_hdr));
+}
+
+static int loc_is_addr(const struct src_loc *l)
+{
+        return l->capa == 0;
+}
+
+static size_t loc_size(const struct src_loc *l)
+{
+        return loc_is_addr(l) ? sizeof(uintptr_t) : l->capa;
+}
+
+static int loc_eq(struct cds_lfht_node *node, const void *key)
+{
+        const struct src_loc *existing;
+        const struct src_loc *k = key;
+
+        existing = caa_container_of(node, struct src_loc, hnode);
+
+        return (k->hval == existing->hval &&
+                k->capa == existing->capa &&
+                memcmp(k->k, existing->k, loc_size(k)) == 0);
+}
+
+static struct src_loc *totals_add_rcu(struct src_loc *k)
+{
+        struct cds_lfht_iter iter;
+        struct cds_lfht_node *cur;
+        struct src_loc *l = 0;
+        struct cds_lfht *t;
+
+again:
+        t = rcu_dereference(totals);
+        if (!t) goto out_unlock;
+        cds_lfht_lookup(t, k->hval, loc_eq, k, &iter);
+        cur = cds_lfht_iter_get_node(&iter);
+        if (cur) {
+                l = caa_container_of(cur, struct src_loc, hnode);
+                uatomic_add(&l->total, k->total);
+                uatomic_add(&l->allocations, 1);
+        } else {
+                size_t n = loc_size(k);
+                l = real_malloc(sizeof(*l) + n);
+                if (!l) goto out_unlock;
+                memcpy(l, k, sizeof(*l) + n);
+                l->mtx = mutex_assign();
+                l->frees = 0;
+                l->allocations = 1;
+                CDS_INIT_LIST_HEAD(&l->allocs);
+                cur = cds_lfht_add_unique(t, k->hval, loc_eq, l, &l->hnode);
+                if (cur != &l->hnode) { /* lost race */
+                        rcu_read_unlock();
+                        real_free(l);
+                        rcu_read_lock();
+                        goto again;
+                }
+        }
+out_unlock:
+        return l;
+}
+
+static void update_stats_rcu_unlock(const struct src_loc *l)
+{
+        if (caa_likely(l)) rcu_read_unlock();
+}
+
+static struct src_loc *update_stats_rcu_lock(size_t size, uintptr_t caller)
+{
+        const PERL_CONTEXT *cx = NULL;
+        static const size_t xlen = sizeof(caller);
+        struct src_loc *k, *ret = 0;
+        char *dst;
+
+        if (caa_unlikely(!totals)) return 0;
+        if (locating++) goto out; /* do not recurse into another *alloc */
+
+        uatomic_add(&total_bytes_inc, size);
+
+        rcu_read_lock();
+        cx = caller_cx(0, NULL);
+        if (cx) {
+                const char *ptr = OutCopFILE(cx->blk_oldcop);
+                const COP *lcop;
+                unsigned line;
+                size_t len;
+                size_t int_size = INT2STR_MAX;
+
+                if (!ptr) goto unknown;
+
+                lcop = Perl_closest_cop(aTHX_ cx->blk_oldcop,
+                                        OpSIBLING(cx->blk_oldcop),
+                                        cx->blk_sub.retop, TRUE);
+                if (!lcop)
+                        lcop = cx->blk_oldcop;
+                line = CopLINE(lcop);
+
+                /* avoid vsnprintf or anything which could call malloc here: */
+                len = strlen(ptr);
+                if (len > PATH_MAX)
+                        len = PATH_MAX;
+                k = (void *)kbuf;
+                k->total = size;
+                dst = mempcpy(k->k, ptr, len);
+                *dst++ = ':';
+
+                if (line == UINT_MAX) /* no line number */
+                        *dst++ = '-';
+                else
+                        dst = int2str(line, dst, &int_size);
+
+                assert(dst && "bad math");
+                *dst = 0;        /* terminate string */
+                k->capa = (uint32_t)(dst - k->k + 1);
+                k->hval = jhash(k->k, k->capa, 0xdeadbeef);
+                ret = totals_add_rcu(k);
+        } else {
+unknown:
+                k = alloca(sizeof(*k) + xlen);
+                k->total = size;
+                memcpy(k->k, &caller, xlen);
+                k->capa = 0;
+                k->hval = jhash(k->k, xlen, 0xdeadbeef);
+                ret = totals_add_rcu(k);
+        }
+out:
+        --locating;
+        return ret;
+}
+
+size_t malloc_usable_size(void *p)
+{
+        return ptr2hdr(p)->size;
+}
+
+static void
+free_hdr_rcu(struct rcu_head *dead)
+{
+        struct alloc_hdr *h = caa_container_of(dead, struct alloc_hdr, as.dead);
+        real_free(h->real);
+}
+
+void free(void *p)
+{
+        if (p) {
+                struct alloc_hdr *h = ptr2hdr(p);
+                struct src_loc *l = h->as.live.loc;
+
+                if (!real_free) return; /* oh well, leak a little */
+                if (l) {
+                        uatomic_add(&total_bytes_dec, h->size);
+                        uatomic_set(&h->size, 0);
+                        uatomic_add(&l->frees, 1);
+
+                        mutex_lock(l->mtx);
+                        cds_list_del_rcu(&h->anode);
+                        mutex_unlock(l->mtx);
+
+                        call_rcu(&h->as.dead, free_hdr_rcu);
+                } else {
+                        real_free(h->real);
+                }
+        }
+}
+
+static void
+alloc_insert_rcu(struct src_loc *l, struct alloc_hdr *h, size_t size, void *real)
+{
+        /* we need src_loc to remain alive for the duration of this call */
+        if (!h) return;
+        h->size = size;
+        h->real = real;
+        h->as.live.loc = l;
+        if (l) {
+                mutex_lock(l->mtx);
+                cds_list_add_rcu(&h->anode, &l->allocs);
+                mutex_unlock(l->mtx);
+        }
+}
+
+static size_t size_align(size_t size, size_t alignment)
+{
+        return ((size + (alignment - 1)) & ~(alignment - 1));
+}
+
+static bool ptr_is_aligned(const void *ptr, size_t alignment)
+{
+        return ((uintptr_t)ptr & (alignment - 1)) == 0;
+}
+
+static void *ptr_align(void *ptr, size_t alignment)
+{
+        return (void *)(((uintptr_t)ptr + (alignment - 1)) & ~(alignment - 1));
+}
+
+static bool is_power_of_two(size_t n) { return (n & (n - 1)) == 0; }
+
+static int
+internal_memalign(void **pp, size_t alignment, size_t size, uintptr_t caller)
+{
+        struct src_loc *l;
+        struct alloc_hdr *h;
+        void *real;
+        size_t asize;
+        size_t d = alignment / sizeof(void*);
+        size_t r = alignment % sizeof(void*);
+
+        if (!real_malloc) return ENOMEM;
+
+        if (r != 0 || d == 0 || !is_power_of_two(d))
+                return EINVAL;
+
+        if (alignment <= ASSUMED_MALLOC_ALIGNMENT) {
+                void *p = malloc(size);
+                if (!p) return ENOMEM;
+                *pp = p;
+                return 0;
+        }
+        for (; alignment < sizeof(struct alloc_hdr); alignment *= 2)
+                ; /* double alignment until >= sizeof(struct alloc_hdr) */
+        if (__builtin_add_overflow(size, alignment, &asize) ||
+            __builtin_add_overflow(asize, sizeof(struct alloc_hdr), &asize))
+                return ENOMEM;
+
+        l = update_stats_rcu_lock(size, caller);
+
+        real = real_malloc(asize);
+        if (real) {
+                void *p = hdr2ptr(real);
+                if (!ptr_is_aligned(p, alignment))
+                        p = ptr_align(p, alignment);
+                h = ptr2hdr(p);
+                alloc_insert_rcu(l, h, size, real);
+                update_stats_rcu_unlock(l);
+                *pp = p;
+        }
+
+        return real ? 0 : ENOMEM;
+}
+
+static void *
+memalign_result(int err, void *p)
+{
+        if (caa_unlikely(err)) {
+                errno = err;
+                return 0;
+        }
+        return p;
+}
+
+void *memalign(size_t alignment, size_t size)
+{
+        void *p;
+        int err = internal_memalign(&p, alignment, size, RETURN_ADDRESS(0));
+        return memalign_result(err, p);
+}
+
+int posix_memalign(void **p, size_t alignment, size_t size)
+{
+        return internal_memalign(p, alignment, size, RETURN_ADDRESS(0));
+}
+
+void *aligned_alloc(size_t, size_t) __attribute__((alias("memalign")));
+void cfree(void *) __attribute__((alias("free")));
+
+void *valloc(size_t size)
+{
+        void *p;
+        int err = internal_memalign(&p, page_size, size, RETURN_ADDRESS(0));
+        return memalign_result(err, p);
+}
+
+#if __GNUC__ < 7
+#  define add_overflow_p(a,b) __extension__({ \
+                __typeof__(a) _c; \
+                __builtin_add_overflow(a,b,&_c); \
+        })
+#else
+#  define add_overflow_p(a,b) \
+                __builtin_add_overflow_p((a),(b),(__typeof__(a+b))0)
+#endif
+
+void *pvalloc(size_t size)
+{
+        size_t alignment = page_size;
+        void *p;
+        int err;
+
+        if (add_overflow_p(size, alignment)) {
+                errno = ENOMEM;
+                return 0;
+        }
+        size = size_align(size, alignment);
+        err = internal_memalign(&p, alignment, size, RETURN_ADDRESS(0));
+        return memalign_result(err, p);
+}
+
+void *malloc(size_t size)
+{
+        struct src_loc *l;
+        struct alloc_hdr *h;
+        size_t asize;
+        void *p;
+
+        if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize))
+                goto enomem;
+
+        /*
+         * Needed for C++ global declarations using "new",
+         * which happens before our constructor
+         */
+#ifndef __FreeBSD__
+        if (!real_malloc) {
+                if (resolving_malloc) goto enomem;
+                resolving_malloc = 1;
+                real_malloc = dlsym(RTLD_NEXT, "malloc");
+        }
+#endif
+        l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
+        p = h = real_malloc(asize);
+        if (h) {
+                alloc_insert_rcu(l, h, size, h);
+                p = hdr2ptr(h);
+        }
+        update_stats_rcu_unlock(l);
+        if (caa_unlikely(!p)) errno = ENOMEM;
+        return p;
+enomem:
+        errno = ENOMEM;
+        return 0;
+}
+
+void *calloc(size_t nmemb, size_t size)
+{
+        void *p;
+        struct src_loc *l;
+        struct alloc_hdr *h;
+        size_t asize;
+
+        if (__builtin_mul_overflow(size, nmemb, &size)) {
+                errno = ENOMEM;
+                return 0;
+        }
+        if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
+                errno = ENOMEM;
+                return 0;
+        }
+        RETURN_IF_NOT_READY();
+        l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
+        p = h = real_malloc(asize);
+        if (p) {
+                alloc_insert_rcu(l, h, size, h);
+                p = hdr2ptr(h);
+                memset(p, 0, size);
+        }
+        update_stats_rcu_unlock(l);
+        if (caa_unlikely(!p)) errno = ENOMEM;
+        return p;
+}
+
+void *realloc(void *ptr, size_t size)
+{
+        void *p;
+        struct src_loc *l;
+        struct alloc_hdr *h;
+        size_t asize;
+
+        if (!size) {
+                free(ptr);
+                return 0;
+        }
+        if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
+                errno = ENOMEM;
+                return 0;
+        }
+        RETURN_IF_NOT_READY();
+
+        l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
+        p = h = real_malloc(asize);
+        if (p) {
+                alloc_insert_rcu(l, h, size, h);
+                p = hdr2ptr(h);
+        }
+        update_stats_rcu_unlock(l);
+
+        if (ptr && p) {
+                struct alloc_hdr *old = ptr2hdr(ptr);
+                memcpy(p, ptr, old->size < size ? old->size : size);
+                free(ptr);
+        }
+        if (caa_unlikely(!p)) errno = ENOMEM;
+        return p;
+}
+
+struct dump_arg {
+        FILE *fp;
+        size_t min;
+};
+
+static void *dump_to_file(struct dump_arg *a)
+{
+        struct cds_lfht_iter iter;
+        struct src_loc *l;
+        struct cds_lfht *t;
+
+        ++locating;
+        rcu_read_lock();
+        t = rcu_dereference(totals);
+        if (!t)
+                goto out_unlock;
+        cds_lfht_for_each_entry(t, &iter, l, hnode) {
+                const void *p = l->k;
+                char **s = 0;
+                if (l->total <= a->min) continue;
+
+                if (loc_is_addr(l)) {
+                        s = backtrace_symbols(p, 1);
+                        p = s[0];
+                }
+                fprintf(a->fp, "%16zu %12zu %s\n",
+                        l->total, l->allocations, (const char *)p);
+                if (s) free(s);
+        }
+out_unlock:
+        rcu_read_unlock();
+        --locating;
+        return 0;
+}
+
+static SV *location_string(struct src_loc *l)
+{
+        SV *ret;
+
+        if (loc_is_addr(l)) {
+                char **s = backtrace_symbols((void *)l->k, 1);
+
+                ret = newSVpvn(s[0], strlen(s[0]));
+        }
+        else {
+                ret = newSVpvn(l->k, l->capa - 1);
+        }
+
+        return ret;
+}
+
+static int
+extract_addr(const char *str, size_t len, void **p)
+{
+        const char *c;
+#if defined(__GLIBC__)
+        return ((c = memrchr(str, '[', len)) && sscanf(c, "[%p]", p));
+#else /* TODO: test FreeBSD */
+        return ((c = strstr(str, "0x")) && sscanf(c, "%p", p));
+#endif
+}
+
+#ifndef O_CLOEXEC
+#  define O_CLOEXEC 0
+#endif
+__attribute__ ((destructor))
+static void dump_destructor(void)
+{
+        const char *opt = getenv("MWRAP");
+        const char *modes[] = { "a", "a+", "w", "w+", "r+" };
+        struct dump_arg a = { .min = 0 };
+        size_t i;
+        int dump_fd;
+        char *dump_path;
+        char *s;
+
+        if (!opt)
+                return;
+
+        ++locating;
+        if ((dump_path = strstr(opt, "dump_path:")) &&
+                        (dump_path += sizeof("dump_path")) &&
+                        *dump_path) {
+                char *end = strchr(dump_path, ',');
+                if (end) {
+                        char *tmp = alloca(end - dump_path + 1);
+                        end = mempcpy(tmp, dump_path, end - dump_path);
+                        *end = 0;
+                        dump_path = tmp;
+                }
+                dump_fd = open(dump_path, O_CLOEXEC|O_WRONLY|O_APPEND|O_CREAT,
+                                0666);
+                if (dump_fd < 0) {
+                        fprintf(stderr, "open %s failed: %s\n", dump_path,
+                                strerror(errno));
+                        goto out;
+                }
+        }
+        else if (!sscanf(opt, "dump_fd:%d", &dump_fd))
+                goto out;
+
+        if ((s = strstr(opt, "dump_min:")))
+                sscanf(s, "dump_min:%zu", &a.min);
+
+        switch (dump_fd) {
+        case 0: goto out;
+        case 1: a.fp = stdout; break;
+        case 2: a.fp = stderr; break;
+        default:
+                if (dump_fd < 0)
+                        goto out;
+                a.fp = 0;
+
+                for (i = 0; !a.fp && i < 5; i++)
+                        a.fp = fdopen(dump_fd, modes[i]);
+
+                if (!a.fp) {
+                        fprintf(stderr, "failed to open fd=%d: %s\n",
+                                dump_fd, strerror(errno));
+                        goto out;
+                }
+                /* we'll leak some memory here, but this is a destructor */
+        }
+        dump_to_file(&a);
+out:
+        --locating;
+}
+
+MODULE = Devel::Mwrap        PACKAGE = Devel::Mwrap        PREFIX = mwrap_
+
+BOOT:
+        totals = lfht_new();
+        if (!totals)
+                fprintf(stderr, "failed to allocate totals table\n");
+
+PROTOTYPES: ENABLE
+
+size_t
+mwrap_total_bytes_allocated()
+CODE:
+        RETVAL = total_bytes_inc;
+OUTPUT:
+        RETVAL
+
+size_t
+mwrap_total_bytes_freed()
+CODE:
+        RETVAL = total_bytes_dec;
+OUTPUT:
+        RETVAL
+
+void
+mwrap_reset()
+PREINIT:
+        struct cds_lfht *t;
+        struct cds_lfht_iter iter;
+        struct src_loc *l;
+CODE:
+        uatomic_set(&total_bytes_inc, 0);
+        uatomic_set(&total_bytes_dec, 0);
+
+        rcu_read_lock();
+        t = rcu_dereference(totals);
+        cds_lfht_for_each_entry(t, &iter, l, hnode) {
+                uatomic_set(&l->total, 0);
+                uatomic_set(&l->allocations, 0);
+                uatomic_set(&l->frees, 0);
+        }
+        rcu_read_unlock();
+
+Devel::Mwrap::SrcLoc
+mwrap_get(loc)
+        SV *loc;
+PREINIT:
+        STRLEN len;
+        const char *str;
+        struct src_loc *k = 0;
+        uintptr_t p;
+        struct cds_lfht_iter iter;
+        struct cds_lfht_node *cur;
+        struct cds_lfht *t;
+        struct src_loc *l = NULL;
+        ++locating;
+CODE:
+        if (!SvPOK(loc))
+                XSRETURN_UNDEF;
+        str = SvPV(loc, len);
+        if (len > PATH_MAX)
+                XSRETURN_UNDEF;
+        if (extract_addr(str, len, (void **)&p)) {
+                k = (void *)kbuf;
+                memcpy(k->k, &p, sizeof(p));
+                k->capa = 0;
+                k->hval = jhash(k->k, sizeof(p), 0xdeadbeef);
+        } else {
+                k = (void *)kbuf;
+                memcpy(k->k, str, len + 1);
+                k->capa = len + 1;
+                k->hval = jhash(k->k, k->capa, 0xdeadbeef);
+        }
+
+        if (!k)
+                XSRETURN_UNDEF;
+
+        rcu_read_lock();
+        t = rcu_dereference(totals);
+        if (!t) goto out_unlock;
+
+        cds_lfht_lookup(t, k->hval, loc_eq, k, &iter);
+        cur = cds_lfht_iter_get_node(&iter);
+        if (cur)
+                l = caa_container_of(cur, struct src_loc, hnode);
+out_unlock:
+        rcu_read_unlock();
+        RETVAL = l;
+OUTPUT:
+        RETVAL
+CLEANUP:
+        --locating;
+
+MODULE = Devel::Mwrap        PACKAGE = Devel::Mwrap::SrcLoc        PREFIX = src_loc_
+
+PROTOTYPES: ENABLE
+
+size_t
+src_loc_frees(self)
+        Devel::Mwrap::SrcLoc self
+PREINIT:
+        ++locating;
+CODE:
+        RETVAL = uatomic_read(&self->frees);
+OUTPUT:
+        RETVAL
+CLEANUP:
+        --locating;
+
+size_t
+src_loc_allocations(self)
+        Devel::Mwrap::SrcLoc self
+PREINIT:
+        ++locating;
+CODE:
+        RETVAL = uatomic_read(&self->allocations);
+OUTPUT:
+        RETVAL
+CLEANUP:
+        --locating;
+
+size_t
+src_loc_total(self)
+        Devel::Mwrap::SrcLoc self
+PREINIT:
+        ++locating;
+CODE:
+        RETVAL = uatomic_read(&self->total);
+OUTPUT:
+        RETVAL
+CLEANUP:
+        --locating;
+
+SV *
+src_loc_name(self)
+        Devel::Mwrap::SrcLoc self
+PREINIT:
+        ++locating;
+CODE:
+        RETVAL = location_string(self);
+OUTPUT:
+        RETVAL
+CLEANUP:
+        --locating;
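
As a usage note (not part of the patch itself): the dump_destructor above
looks for dump_path:, dump_fd: and dump_min: in the MWRAP environment
variable, with dump_path terminated by the first comma, so a dump to a file
with a minimum-size filter looks something like the following (the path is
illustrative):

        MWRAP=dump_path:/tmp/mwrap.out,dump_min:10000 mwrap-perl PERL_COMMAND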
diff --git a/README b/README
index 3a20258..97ff4ea 100644
--- a/README
+++ b/README
@@ -1,95 +1,83 @@
-= mwrap - LD_PRELOAD malloc wrapper + line stats for Ruby
+Devel::Mwrap - LD_PRELOAD malloc wrapper + line stats for Perl
 
-mwrap is designed to answer the question:
+Devel::Mwrap is designed to answer the question:
 
-   Which lines of Ruby are hitting malloc the most?
+   Which lines of Perl are hitting malloc the most?
 
-mwrap wraps all malloc-family calls to trace the Ruby source
-location of such calls and bytes allocated at each callsite.
-As of mwrap 2.0.0, it can also function as a leak detector
-and show live allocations at every call site.  Depending on
-your application and workload, the overhead is roughly a 50%
-increase memory and runtime.
+Devel::Mwrap wraps all malloc-family calls to trace the Perl source
+location of such calls and bytes allocated at each callsite.  It
+can also function as a leak detector and show live allocations
+at every call site.  Depending on your application and workload,
+the overhead is roughly a 50%-100% increase in memory and runtime.
 
-It works best for allocations under GVL, but tries to track
-numeric caller addresses for allocations made without GVL so you
-can get an idea of how much memory usage certain extensions and
-native libraries use.
+It is thread-safe and requires the concurrent lock-free hash table
+from the Userspace RCU project: https://liburcu.org/
 
-It requires the concurrent lock-free hash table from the
-Userspace RCU project: https://liburcu.org/
+It relies on dynamic linking to a malloc(3) implementation.  If
+you got Perl from your OS distribution, this typically does not
+require rebuilding Perl.
 
-It does not require recompiling or rebuilding Ruby, but only
-supports Ruby trunk (2.6.0dev+) on a few platforms:
+Tested on the perl package distributed with:
 
-* GNU/Linux
-* FreeBSD (tested 11.1)
+* Debian GNU/Linux 9, 10
 
-It may work on NetBSD, OpenBSD and DragonFly BSD.
+It may work on FreeBSD, NetBSD, OpenBSD and DragonFly BSD.
 
 == Install
 
-        # FreeBSD: pkg install liburcu
+        # FreeBSD: pkg install pkg-config liburcu
 
-        # Debian-based systems: apt-get liburcu-dev
-
-        # Install mwrap via RubyGems.org
-        gem install mwrap
+        # Debian-based systems: apt-get install pkg-config liburcu-dev
 
 == Usage
 
-mwrap works as an LD_PRELOAD and supplies a mwrap RubyGem executable to
+Devel::Mwrap works as an LD_PRELOAD and supplies a mwrap-perl script to
 improve ease-of-use.  You can set dump_path: in the MWRAP environment
 variable to append the results to a log file:
 
-        MWRAP=dump_path:/path/to/log mwrap RUBY_COMMAND
+        MWRAP=dump_path:/path/to/log mwrap-perl PERL_COMMAND
 
         # And to display the locations with the most allocations:
         sort -k1,1rn </path/to/log | $PAGER
 
-You may also `require "mwrap"' in your Ruby code and use
-Mwrap.dump, Mwrap.reset, Mwrap.each, etc.
+You may also `use Devel::Mwrap' in your Perl code and use
+Devel::Mwrap->dump, Devel::Mwrap->reset, Devel::Mwrap->each, etc.
 
-However, mwrap MUST be loaded via LD_PRELOAD to have any
+However, Devel::Mwrap MUST be loaded via LD_PRELOAD to have any
 effect in tracking malloc use.  However, it is safe to keep
-"require 'mwrap'" in performance-critical deployments,
+"use Devel::Mwrap" in performance-critical deployments,
 as overhead is only incurred when used as an LD_PRELOAD.
 
-The output of the mwrap dump is a text file with 3 columns:
+The output of Devel::Mwrap->dump is a text file with 3 columns:
 
         total_bytes        call_count        location
 
-Where location is a Ruby source location (if made under GVL)
-or an address retrieved by backtrace_symbols(3).  It is
-recommended to use the sort(1) command on either of the
-first two columns to find the hottest malloc locations.
-
-mwrap 2.0.0+ also supports a Rack application endpoint,
-it is documented at:
-
-        https://80x24.org/mwrap/MwrapRack.html
+Where location is a Perl source location or an address retrieved
+by backtrace_symbols(3).  It is recommended to use the sort(1)
+command on either of the first two columns to find the hottest
+malloc locations.
 
 == Known problems
 
 * 32-bit machines are prone to overflow (WONTFIX)
 
-== Mail archives and list:
+== Mail archives and newsgroup:
 
-        https://80x24.org/mwrap-public/
-        nntp://80x24.org/inbox.comp.lang.ruby.mwrap
+        https://80x24.org/mwrap-perl/
+        nntp://80x24.org/inbox.comp.lang.perl.mwrap
 
 No subscription will ever be required to post, but HTML mail
 will be rejected:
 
-                mwrap-public@80x24.org
+                mwrap-perl@80x24.org
 
 == Hacking
 
-        git clone https://80x24.org/mwrap.git
+        git clone https://80x24.org/mwrap-perl.git
 
-Send all patches and pull requests (use "git request-pull" to format) to
-the mailing list.  We do not use centralized or proprietary messaging
-systems.
+Send all patches and pull requests (use "git request-pull" to format)
+via email to mwrap-perl@80x24.org.  We do not and will not use
+proprietary messaging systems.
 
 == License
 
diff --git a/Rakefile b/Rakefile
deleted file mode 100644
index 50bfa89..0000000
--- a/Rakefile
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org>
-# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
-require 'rake/testtask'
-begin
-  require 'rake/extensiontask'
-  Rake::ExtensionTask.new('mwrap')
-rescue LoadError
-  warn 'rake-compiler not available, cross compiling disabled'
-end
-
-Rake::TestTask.new(:test)
-task :test => :compile
-task :default => :compile
-
-c_files = File.readlines('MANIFEST').grep(%r{ext/.*\.[ch]$}).map!(&:chomp!)
-task 'compile:mwrap' => c_files
diff --git a/bin/mwrap b/bin/mwrap
deleted file mode 100755
index 9f67dab..0000000
--- a/bin/mwrap
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/ruby
-# frozen_string_literal: true
-# Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org>
-# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
-require 'mwrap'
-mwrap_so = $".grep(%r{/mwrap\.so\z})[0] or abort "mwrap.so not loaded"
-cur = ENV['LD_PRELOAD']
-if cur
-  cur = cur.split(/[:\s]+/)
-  if !cur.include?(mwrap_so)
-    # drop old versions
-    cur.delete_if { |path| path.end_with?('/mwrap.so') }
-    cur.unshift(mwrap_so)
-    ENV['LD_PRELOAD'] = cur.join(':')
-  end
-else
-  ENV['LD_PRELOAD'] = mwrap_so
-end
-
-# work around close-on-exec by default behavior in Ruby:
-opts = {}
-if ENV['MWRAP'] =~ /dump_fd:(\d+)/
-  dump_fd = $1.to_i
-  if dump_fd > 2
-    dump_io = IO.new(dump_fd)
-    opts[dump_fd] = dump_io
-  end
-end
-
-# allow inheriting FDs from systemd
-n = ENV['LISTEN_FDS']
-if n && ENV['LISTEN_PID'].to_i == $$
-  n = 3 + n.to_i
-  (3...n).each { |fd| opts[fd] = IO.new(fd) }
-end
-exec *ARGV, opts
diff --git a/ext/mwrap/extconf.rb b/ext/mwrap/extconf.rb
deleted file mode 100644
index e9dbb1e..0000000
--- a/ext/mwrap/extconf.rb
+++ /dev/null
@@ -1,28 +0,0 @@
-# frozen_string_literal: true
-# Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org>
-# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
-require 'mkmf'
-
-have_func 'mempcpy'
-have_library 'urcu-cds' or abort 'userspace RCU not installed'
-have_header 'urcu/rculfhash.h' or abort 'rculfhash.h not found'
-have_library 'urcu-bp' or abort 'liburcu-bp not found'
-have_library 'dl'
-have_library 'c'
-have_library 'execinfo' # FreeBSD
-
-if try_link(<<'')
-int main(void) { return __builtin_add_overflow_p(0,0,(int)1); }
-
-  $defs << '-DHAVE_BUILTIN_ADD_OVERFLOW_P'
-end
-
-if try_link(<<'')
-int main(int a) { return __builtin_add_overflow(0,0,&a); }
-
-  $defs << '-DHAVE_BUILTIN_ADD_OVERFLOW_P'
-else
-  abort 'missing __builtin_add_overflow'
-end
-
-create_makefile 'mwrap'
diff --git a/ext/mwrap/mwrap.c b/ext/mwrap/mwrap.c
deleted file mode 100644
index 5174127..0000000
--- a/ext/mwrap/mwrap.c
+++ /dev/null
@@ -1,1464 +0,0 @@
-/*
- * Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org>
- * License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
- */
-#define _LGPL_SOURCE /* allows URCU to inline some stuff */
-#include <ruby/ruby.h>
-#include <ruby/thread.h>
-#include <ruby/io.h>
-#include <execinfo.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <dlfcn.h>
-#include <assert.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <pthread.h>
-#include <urcu-bp.h>
-#include <urcu/rculfhash.h>
-#include <urcu/rculist.h>
-#include "jhash.h"
-
-static ID id_uminus;
-const char *rb_source_location_cstr(int *line); /* requires 2.6.0dev */
-extern int __attribute__((weak)) ruby_thread_has_gvl_p(void);
-extern void * __attribute__((weak)) ruby_current_execution_context_ptr;
-extern void * __attribute__((weak)) ruby_current_vm_ptr; /* for rb_gc_count */
-extern size_t __attribute__((weak)) rb_gc_count(void);
-extern VALUE __attribute__((weak)) rb_cObject;
-extern VALUE __attribute__((weak)) rb_eTypeError;
-extern VALUE __attribute__((weak)) rb_yield(VALUE);
-
-static size_t total_bytes_inc, total_bytes_dec;
-
-/* true for glibc/dlmalloc/ptmalloc, not sure about jemalloc */
-#define ASSUMED_MALLOC_ALIGNMENT (sizeof(void *) * 2)
-
-/* match values in Ruby gc.c */
-#define HEAP_PAGE_ALIGN_LOG 14
-enum {
-        HEAP_PAGE_ALIGN = (1UL << HEAP_PAGE_ALIGN_LOG),
-        REQUIRED_SIZE_BY_MALLOC = (sizeof(size_t) * 5),
-        HEAP_PAGE_SIZE = (HEAP_PAGE_ALIGN - REQUIRED_SIZE_BY_MALLOC)
-};
-
-#define IS_HEAP_PAGE_BODY ((struct src_loc *)-1)
-
-int __attribute__((weak)) ruby_thread_has_gvl_p(void)
-{
-        return 0;
-}
-
-#ifdef __FreeBSD__
-void *__malloc(size_t);
-void __free(void *);
-#  define real_malloc __malloc
-#  define real_free __free
-#else
-static void *(*real_malloc)(size_t);
-static void (*real_free)(void *);
-static int resolving_malloc;
-#endif /* !FreeBSD */
-
-/*
- * we need to fake an OOM condition while dlsym is running,
- * as that calls calloc under glibc, but we don't have the
- * symbol for the jemalloc calloc, yet
- */
-#  define RETURN_IF_NOT_READY() do { \
-        if (!real_malloc) { \
-                errno = ENOMEM; \
-                return NULL; \
-        } \
-} while (0)
-
-static __thread size_t locating;
-static size_t generation;
-static size_t page_size;
-static struct cds_lfht *totals;
-union padded_mutex {
-        pthread_mutex_t mtx;
-        char pad[64];
-};
-
-/* a round-robin pool of mutexes */
-#define MUTEX_NR   (1 << 6)
-#define MUTEX_MASK (MUTEX_NR - 1)
-static size_t mutex_i;
-static union padded_mutex mutexes[MUTEX_NR] = {
-        [0 ... (MUTEX_NR-1)].mtx = PTHREAD_MUTEX_INITIALIZER
-};
-
-static pthread_mutex_t *mutex_assign(void)
-{
-        return &mutexes[uatomic_add_return(&mutex_i, 1) & MUTEX_MASK].mtx;
-}
-
-static struct cds_lfht *
-lfht_new(void)
-{
-        return cds_lfht_new(16384, 1, 0, CDS_LFHT_AUTO_RESIZE, 0);
-}
-
-__attribute__((constructor)) static void resolve_malloc(void)
-{
-        int err;
-        ++locating;
-
-#ifdef __FreeBSD__
-        /*
-         * PTHREAD_MUTEX_INITIALIZER on FreeBSD means lazy initialization,
-         * which happens at pthread_mutex_lock, and that calls calloc
-         */
-        {
-                size_t i;
-
-                for (i = 0; i < MUTEX_NR; i++) {
-                        err = pthread_mutex_init(&mutexes[i].mtx, 0);
-                        if (err) {
-                                fprintf(stderr, "error: %s\n", strerror(err));
-                                _exit(1);
-                        }
-                }
-                /* initialize mutexes used by urcu-bp */
-                rcu_read_lock();
-                rcu_read_unlock();
-        }
-#else /* !FreeBSD (tested on GNU/Linux) */
-        if (!real_malloc) {
-                resolving_malloc = 1;
-                real_malloc = dlsym(RTLD_NEXT, "malloc");
-        }
-        real_free = dlsym(RTLD_NEXT, "free");
-        if (!real_malloc || !real_free) {
-                fprintf(stderr, "missing malloc/aligned_alloc/free\n"
-                        "\t%p %p\n", real_malloc, real_free);
-                _exit(1);
-        }
-#endif /* !FreeBSD */
-        totals = lfht_new();
-        if (!totals)
-                fprintf(stderr, "failed to allocate totals table\n");
-
-        err = pthread_atfork(call_rcu_before_fork,
-                                call_rcu_after_fork_parent,
-                                call_rcu_after_fork_child);
-        if (err)
-                fprintf(stderr, "pthread_atfork failed: %s\n", strerror(err));
-        page_size = sysconf(_SC_PAGESIZE);
-        --locating;
-}
-
-static void
-mutex_lock(pthread_mutex_t *m)
-{
-        int err = pthread_mutex_lock(m);
-        assert(err == 0);
-}
-
-static void
-mutex_unlock(pthread_mutex_t *m)
-{
-        int err = pthread_mutex_unlock(m);
-        assert(err == 0);
-}
-
-#ifndef HAVE_MEMPCPY
-static void *
-my_mempcpy(void *dest, const void *src, size_t n)
-{
-        return (char *)memcpy(dest, src, n) + n;
-}
-#define mempcpy(dst,src,n) my_mempcpy(dst,src,n)
-#endif
-
-/* stolen from glibc: */
-#define RETURN_ADDRESS(nr) \
-  (uintptr_t)(__builtin_extract_return_addr(__builtin_return_address(nr)))
-
-#define INT2STR_MAX (sizeof(int) == 4 ? 10 : 19)
-static char *int2str(int num, char *dst, size_t * size)
-{
-        if (num <= 9) {
-                *size -= 1;
-                *dst++ = (char)(num + '0');
-                return dst;
-        } else {
-                char buf[INT2STR_MAX];
-                char *end = buf + sizeof(buf);
-                char *p = end;
-                size_t adj;
-
-                do {
-                        *size -= 1;
-                        *--p = (char)((num % 10) + '0');
-                        num /= 10;
-                } while (num && *size);
-
-                if (!num) {
-                        adj = end - p;
-                        return mempcpy(dst, p, adj);
-                }
-        }
-        return NULL;
-}
-
-/*
- * rb_source_location_cstr relies on GET_EC(), and it's possible
- * to have a native thread but no EC during the early and late
- * (teardown) phases of the Ruby process
- */
-static int has_ec_p(void)
-{
-        return (ruby_thread_has_gvl_p() && ruby_current_vm_ptr &&
-                ruby_current_execution_context_ptr);
-}
-
-struct acc {
-        uint64_t nr;
-        int64_t min;
-        int64_t max;
-        double m2;
-        double mean;
-};
-
-#define ACC_INIT(name) { .nr=0, .min=INT64_MAX, .max=-1, .m2=0, .mean=0 }
-
-/* for tracking 16K-aligned heap page bodies (protected by GVL) */
-struct {
-        pthread_mutex_t lock;
-        struct cds_list_head bodies;
-        struct cds_list_head freed;
-
-        struct acc alive;
-        struct acc reborn;
-} hpb_stats = {
-        .lock = PTHREAD_MUTEX_INITIALIZER,
-        .bodies = CDS_LIST_HEAD_INIT(hpb_stats.bodies),
-        .freed = CDS_LIST_HEAD_INIT(hpb_stats.freed),
-        .alive = ACC_INIT(hpb_stats.alive),
-        .reborn = ACC_INIT(hpb_stats.reborn)
-};
-
-/* allocated via real_malloc/real_free */
-struct src_loc {
-        pthread_mutex_t *mtx;
-        size_t total;
-        size_t allocations;
-        size_t frees;
-        size_t age_total; /* (age_total / frees) => mean age at free */
-        size_t max_lifespan;
-        struct cds_lfht_node hnode;
-        struct cds_list_head allocs; /* <=> alloc_hdr.node */
-        uint32_t hval;
-        uint32_t capa;
-        char k[];
-};
-
-/* every allocation has this in the header, maintain alignment with malloc  */
-struct alloc_hdr {
-        struct cds_list_head anode; /* <=> src_loc.allocs */
-        union {
-                struct {
-                        size_t gen; /* rb_gc_count() */
-                        struct src_loc *loc;
-                } live;
-                struct rcu_head dead;
-                struct {
-                        size_t at; /* rb_gc_count() */
-                } hpb_freed;
-        } as;
-        void *real; /* what to call real_free on */
-        size_t size;
-};
-
-static char kbuf[PATH_MAX + INT2STR_MAX + sizeof(struct alloc_hdr) + 2];
-
-static struct alloc_hdr *ptr2hdr(void *p)
-{
-        return (struct alloc_hdr *)((uintptr_t)p - sizeof(struct alloc_hdr));
-}
-
-static void *hdr2ptr(struct alloc_hdr *h)
-{
-        return (void *)((uintptr_t)h + sizeof(struct alloc_hdr));
-}
-
-static int loc_is_addr(const struct src_loc *l)
-{
-        return l->capa == 0;
-}
-
-static size_t loc_size(const struct src_loc *l)
-{
-        return loc_is_addr(l) ? sizeof(uintptr_t) : l->capa;
-}
-
-static int loc_eq(struct cds_lfht_node *node, const void *key)
-{
-        const struct src_loc *existing;
-        const struct src_loc *k = key;
-
-        existing = caa_container_of(node, struct src_loc, hnode);
-
-        return (k->hval == existing->hval &&
-                k->capa == existing->capa &&
-                memcmp(k->k, existing->k, loc_size(k)) == 0);
-}
-
-/* note: not atomic */
-static void
-acc_add(struct acc *acc, size_t val)
-{
-        double delta = val - acc->mean;
-        uint64_t nr = ++acc->nr;
-
-        /* just don't divide-by-zero if we ever hit this (unlikely :P) */
-        if (nr)
-                acc->mean += delta / nr;
-
-        acc->m2 += delta * (val - acc->mean);
-        if ((int64_t)val < acc->min)
-                acc->min = (int64_t)val;
-        if ((int64_t)val > acc->max)
-                acc->max = (int64_t)val;
-}
-
-#if SIZEOF_LONG == 8
-# define INT64toNUM(x) LONG2NUM((long)x)
-#elif defined(HAVE_LONG_LONG) && SIZEOF_LONG_LONG == 8
-# define INT64toNUM(x) LL2NUM((LONG_LONG)x)
-#endif
-
-static VALUE
-acc_max(const struct acc *acc)
-{
-        return INT64toNUM(acc->max);
-}
-
-static VALUE
-acc_min(const struct acc *acc)
-{
-        return acc->min == INT64_MAX ? INT2FIX(-1) : INT64toNUM(acc->min);
-}
-
-static VALUE
-acc_mean(const struct acc *acc)
-{
-        return DBL2NUM(acc->nr ? acc->mean : HUGE_VAL);
-}
-
-static double
-acc_stddev_dbl(const struct acc *acc)
-{
-        if (acc->nr > 1) {
-                double variance = acc->m2 / (acc->nr - 1);
-                return sqrt(variance);
-        }
-        return 0.0;
-}
-
-static VALUE
-acc_stddev(const struct acc *acc)
-{
-        return DBL2NUM(acc_stddev_dbl(acc));
-}
-
-static struct src_loc *totals_add_rcu(struct src_loc *k)
-{
-        struct cds_lfht_iter iter;
-        struct cds_lfht_node *cur;
-        struct src_loc *l = 0;
-        struct cds_lfht *t;
-
-again:
-        t = rcu_dereference(totals);
-        if (!t) goto out_unlock;
-        cds_lfht_lookup(t, k->hval, loc_eq, k, &iter);
-        cur = cds_lfht_iter_get_node(&iter);
-        if (cur) {
-                l = caa_container_of(cur, struct src_loc, hnode);
-                uatomic_add(&l->total, k->total);
-                uatomic_add(&l->allocations, 1);
-        } else {
-                size_t n = loc_size(k);
-                l = real_malloc(sizeof(*l) + n);
-                if (!l) goto out_unlock;
-                memcpy(l, k, sizeof(*l) + n);
-                l->mtx = mutex_assign();
-                l->age_total = 0;
-                l->max_lifespan = 0;
-                l->frees = 0;
-                l->allocations = 1;
-                CDS_INIT_LIST_HEAD(&l->allocs);
-                cur = cds_lfht_add_unique(t, k->hval, loc_eq, l, &l->hnode);
-                if (cur != &l->hnode) { /* lost race */
-                        rcu_read_unlock();
-                        real_free(l);
-                        rcu_read_lock();
-                        goto again;
-                }
-        }
-out_unlock:
-        return l;
-}
-
-static void update_stats_rcu_unlock(const struct src_loc *l)
-{
-        if (caa_likely(l)) rcu_read_unlock();
-}
-
-static struct src_loc *update_stats_rcu_lock(size_t size, uintptr_t caller)
-{
-        struct src_loc *k, *ret = 0;
-        static const size_t xlen = sizeof(caller);
-        char *dst;
-
-        if (caa_unlikely(!totals)) return 0;
-        if (locating++) goto out; /* do not recurse into another *alloc */
-
-        uatomic_add(&total_bytes_inc, size);
-
-        rcu_read_lock();
-        if (has_ec_p()) {
-                int line;
-                const char *ptr = rb_source_location_cstr(&line);
-                size_t len;
-                size_t int_size = INT2STR_MAX;
-
-                generation = rb_gc_count();
-
-                if (!ptr) goto unknown;
-
-                /* avoid vsnprintf or anything which could call malloc here: */
-                len = strlen(ptr);
-                k = (void *)kbuf;
-                k->total = size;
-                dst = mempcpy(k->k, ptr, len);
-                *dst++ = ':';
-                dst = int2str(line, dst, &int_size);
-                if (dst) {
-                        *dst = 0;        /* terminate string */
-                        k->capa = (uint32_t)(dst - k->k + 1);
-                        k->hval = jhash(k->k, k->capa, 0xdeadbeef);
-                        ret = totals_add_rcu(k);
-                } else {
-                        rb_bug("bad math making key from location %s:%d\n",
-                                ptr, line);
-                }
-        } else {
-unknown:
-                k = alloca(sizeof(*k) + xlen);
-                k->total = size;
-                memcpy(k->k, &caller, xlen);
-                k->capa = 0;
-                k->hval = jhash(k->k, xlen, 0xdeadbeef);
-                ret = totals_add_rcu(k);
-        }
-out:
-        --locating;
-        return ret;
-}
-
-size_t malloc_usable_size(void *p)
-{
-        return ptr2hdr(p)->size;
-}
-
-static void
-free_hdr_rcu(struct rcu_head *dead)
-{
-        struct alloc_hdr *h = caa_container_of(dead, struct alloc_hdr, as.dead);
-        real_free(h->real);
-}
-
-void free(void *p)
-{
-        if (p) {
-                struct alloc_hdr *h = ptr2hdr(p);
-                struct src_loc *l = h->as.live.loc;
-
-                if (!real_free) return; /* oh well, leak a little */
-                if (l && l != IS_HEAP_PAGE_BODY) {
-                        size_t age = generation - h->as.live.gen;
-
-                        uatomic_add(&total_bytes_dec, h->size);
-                        uatomic_set(&h->size, 0);
-                        uatomic_add(&l->frees, 1);
-                        uatomic_add(&l->age_total, age);
-
-                        mutex_lock(l->mtx);
-                        cds_list_del_rcu(&h->anode);
-                        if (age > l->max_lifespan)
-                                l->max_lifespan = age;
-                        mutex_unlock(l->mtx);
-
-                        call_rcu(&h->as.dead, free_hdr_rcu);
-                } else if (l == IS_HEAP_PAGE_BODY) {
-                        size_t gen = generation;
-                        size_t age = gen - h->as.live.gen;
-
-                        h->as.hpb_freed.at = gen;
-
-                        mutex_lock(&hpb_stats.lock);
-                        acc_add(&hpb_stats.alive, age);
-
-                        /* hpb_stats.bodies => hpb_stats.freed */
-                        cds_list_move(&h->anode, &hpb_stats.freed);
-
-                        mutex_unlock(&hpb_stats.lock);
-                } else {
-                        real_free(h->real);
-                }
-        }
-}
-
-static void
-alloc_insert_rcu(struct src_loc *l, struct alloc_hdr *h, size_t size, void *real)
-{
-        /* we need src_loc to remain alive for the duration of this call */
-        if (!h) return;
-        h->size = size;
-        h->real = real;
-        h->as.live.loc = l;
-        h->as.live.gen = generation;
-        if (l) {
-                mutex_lock(l->mtx);
-                cds_list_add_rcu(&h->anode, &l->allocs);
-                mutex_unlock(l->mtx);
-        }
-}
-
-static size_t size_align(size_t size, size_t alignment)
-{
-        return ((size + (alignment - 1)) & ~(alignment - 1));
-}
-
-static bool ptr_is_aligned(const void *ptr, size_t alignment)
-{
-        return ((uintptr_t)ptr & (alignment - 1)) == 0;
-}
-
-static void *ptr_align(void *ptr, size_t alignment)
-{
-        return (void *)(((uintptr_t)ptr + (alignment - 1)) & ~(alignment - 1));
-}
-
-static bool is_power_of_two(size_t n) { return (n & (n - 1)) == 0; }
-
-static int
-internal_memalign(void **pp, size_t alignment, size_t size, uintptr_t caller)
-{
-        struct src_loc *l;
-        struct alloc_hdr *h;
-        void *real;
-        size_t asize;
-        size_t d = alignment / sizeof(void*);
-        size_t r = alignment % sizeof(void*);
-
-        if (!real_malloc) return ENOMEM;
-
-        if (r != 0 || d == 0 || !is_power_of_two(d))
-                return EINVAL;
-
-        if (alignment <= ASSUMED_MALLOC_ALIGNMENT) {
-                void *p = malloc(size);
-                if (!p) return ENOMEM;
-                *pp = p;
-                return 0;
-        }
-        for (; alignment < sizeof(struct alloc_hdr); alignment *= 2)
-                ; /* double alignment until >= sizeof(struct alloc_hdr) */
-        if (__builtin_add_overflow(size, alignment, &asize) ||
-            __builtin_add_overflow(asize, sizeof(struct alloc_hdr), &asize))
-                return ENOMEM;
-
-
-        if (alignment == HEAP_PAGE_ALIGN && size == HEAP_PAGE_SIZE) {
-                if (has_ec_p()) generation = rb_gc_count();
-                l = IS_HEAP_PAGE_BODY;
-        } else {
-                l = update_stats_rcu_lock(size, caller);
-        }
-
-        if (l == IS_HEAP_PAGE_BODY) {
-                void *p;
-                size_t gen = generation;
-
-                mutex_lock(&hpb_stats.lock);
-
-                /* reuse existing entry */
-                if (!cds_list_empty(&hpb_stats.freed)) {
-                        size_t deathspan;
-
-                        h = cds_list_first_entry(&hpb_stats.freed,
-                                                 struct alloc_hdr, anode);
-                        /* hpb_stats.freed => hpb_stats.bodies */
-                        cds_list_move(&h->anode, &hpb_stats.bodies);
-                        assert(h->size == size);
-                        assert(h->real);
-                        real = h->real;
-                        p = hdr2ptr(h);
-                        assert(ptr_is_aligned(p, alignment));
-
-                        deathspan = gen - h->as.hpb_freed.at;
-                        acc_add(&hpb_stats.reborn, deathspan);
-                }
-                else {
-                        real = real_malloc(asize);
-                        if (!real) return ENOMEM;
-
-                        p = hdr2ptr(real);
-                        if (!ptr_is_aligned(p, alignment))
-                                p = ptr_align(p, alignment);
-                        h = ptr2hdr(p);
-                        h->size = size;
-                        h->real = real;
-                        cds_list_add(&h->anode, &hpb_stats.bodies);
-                }
-                mutex_unlock(&hpb_stats.lock);
-                h->as.live.loc = l;
-                h->as.live.gen = gen;
-                *pp = p;
-        }
-        else {
-                real = real_malloc(asize);
-                if (real) {
-                        void *p = hdr2ptr(real);
-                        if (!ptr_is_aligned(p, alignment))
-                                p = ptr_align(p, alignment);
-                        h = ptr2hdr(p);
-                        alloc_insert_rcu(l, h, size, real);
-                        update_stats_rcu_unlock(l);
-                        *pp = p;
-                }
-        }
-
-        return real ? 0 : ENOMEM;
-}
-
-static void *
-memalign_result(int err, void *p)
-{
-        if (caa_unlikely(err)) {
-                errno = err;
-                return 0;
-        }
-        return p;
-}
-
-void *memalign(size_t alignment, size_t size)
-{
-        void *p;
-        int err = internal_memalign(&p, alignment, size, RETURN_ADDRESS(0));
-        return memalign_result(err, p);
-}
-
-int posix_memalign(void **p, size_t alignment, size_t size)
-{
-        return internal_memalign(p, alignment, size, RETURN_ADDRESS(0));
-}
-
-void *aligned_alloc(size_t, size_t) __attribute__((alias("memalign")));
-void cfree(void *) __attribute__((alias("free")));
-
-void *valloc(size_t size)
-{
-        void *p;
-        int err = internal_memalign(&p, page_size, size, RETURN_ADDRESS(0));
-        return memalign_result(err, p);
-}
-
-#if __GNUC__ < 7
-#  define add_overflow_p(a,b) __extension__({ \
-                __typeof__(a) _c; \
-                __builtin_add_overflow(a,b,&_c); \
-        })
-#else
-#  define add_overflow_p(a,b) \
-                __builtin_add_overflow_p((a),(b),(__typeof__(a+b))0)
-#endif
-
-void *pvalloc(size_t size)
-{
-        size_t alignment = page_size;
-        void *p;
-        int err;
-
-        if (add_overflow_p(size, alignment)) {
-                errno = ENOMEM;
-                return 0;
-        }
-        size = size_align(size, alignment);
-        err = internal_memalign(&p, alignment, size, RETURN_ADDRESS(0));
-        return memalign_result(err, p);
-}
-
-void *malloc(size_t size)
-{
-        struct src_loc *l;
-        struct alloc_hdr *h;
-        size_t asize;
-        void *p;
-
-        if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize))
-                goto enomem;
-
-        /*
-         * Needed for C++ global declarations using "new",
-         * which happens before our constructor
-         */
-#ifndef __FreeBSD__
-        if (!real_malloc) {
-                if (resolving_malloc) goto enomem;
-                resolving_malloc = 1;
-                real_malloc = dlsym(RTLD_NEXT, "malloc");
-        }
-#endif
-        l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
-        p = h = real_malloc(asize);
-        if (h) {
-                alloc_insert_rcu(l, h, size, h);
-                p = hdr2ptr(h);
-        }
-        update_stats_rcu_unlock(l);
-        if (caa_unlikely(!p)) errno = ENOMEM;
-        return p;
-enomem:
-        errno = ENOMEM;
-        return 0;
-}
-
-void *calloc(size_t nmemb, size_t size)
-{
-        void *p;
-        struct src_loc *l;
-        struct alloc_hdr *h;
-        size_t asize;
-
-        if (__builtin_mul_overflow(size, nmemb, &size)) {
-                errno = ENOMEM;
-                return 0;
-        }
-        if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
-                errno = ENOMEM;
-                return 0;
-        }
-        RETURN_IF_NOT_READY();
-        l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
-        p = h = real_malloc(asize);
-        if (p) {
-                alloc_insert_rcu(l, h, size, h);
-                p = hdr2ptr(h);
-                memset(p, 0, size);
-        }
-        update_stats_rcu_unlock(l);
-        if (caa_unlikely(!p)) errno = ENOMEM;
-        return p;
-}
-
-void *realloc(void *ptr, size_t size)
-{
-        void *p;
-        struct src_loc *l;
-        struct alloc_hdr *h;
-        size_t asize;
-
-        if (!size) {
-                free(ptr);
-                return 0;
-        }
-        if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) {
-                errno = ENOMEM;
-                return 0;
-        }
-        RETURN_IF_NOT_READY();
-
-        l = update_stats_rcu_lock(size, RETURN_ADDRESS(0));
-        p = h = real_malloc(asize);
-        if (p) {
-                alloc_insert_rcu(l, h, size, h);
-                p = hdr2ptr(h);
-        }
-        update_stats_rcu_unlock(l);
-
-        if (ptr && p) {
-                struct alloc_hdr *old = ptr2hdr(ptr);
-                memcpy(p, ptr, old->size < size ? old->size : size);
-                free(ptr);
-        }
-        if (caa_unlikely(!p)) errno = ENOMEM;
-        return p;
-}
-
-struct dump_arg {
-        FILE *fp;
-        size_t min;
-};
-
-static void *dump_to_file(void *x)
-{
-        struct dump_arg *a = x;
-        struct cds_lfht_iter iter;
-        struct src_loc *l;
-        struct cds_lfht *t;
-
-        ++locating;
-        rcu_read_lock();
-        t = rcu_dereference(totals);
-        if (!t)
-                goto out_unlock;
-        cds_lfht_for_each_entry(t, &iter, l, hnode) {
-                const void *p = l->k;
-                char **s = 0;
-                if (l->total <= a->min) continue;
-
-                if (loc_is_addr(l)) {
-                        s = backtrace_symbols(p, 1);
-                        p = s[0];
-                }
-                fprintf(a->fp, "%16zu %12zu %s\n",
-                        l->total, l->allocations, (const char *)p);
-                if (s) free(s);
-        }
-out_unlock:
-        rcu_read_unlock();
-        --locating;
-        return 0;
-}
-
-/*
- * call-seq:
- *
- *        Mwrap.dump([[io] [, min]]) -> nil
- *
- * Dumps the current totals to +io+ which must be an IO object
- * (StringIO and similar are not supported).  Total sizes smaller
- * than or equal to +min+ are skipped.
- *
- * The output consists of 3 space-delimited columns:
- *
- * total_size      call_count      location
- */
-static VALUE mwrap_dump(int argc, VALUE * argv, VALUE mod)
-{
-        VALUE io, min;
-        struct dump_arg a;
-        rb_io_t *fptr;
-
-        rb_scan_args(argc, argv, "02", &io, &min);
-
-        if (NIL_P(io))
-                /* library may be linked w/o Ruby */
-                io = *((VALUE *)dlsym(RTLD_DEFAULT, "rb_stderr"));
-
-        a.min = NIL_P(min) ? 0 : NUM2SIZET(min);
-        io = rb_io_get_io(io);
-        io = rb_io_get_write_io(io);
-        GetOpenFile(io, fptr);
-        a.fp = rb_io_stdio_file(fptr);
-
-        rb_thread_call_without_gvl(dump_to_file, &a, 0, 0);
-        RB_GC_GUARD(io);
-        return Qnil;
-}
-
-/* The whole operation is not remotely atomic... */
-static void *totals_reset(void *ign)
-{
-        struct cds_lfht *t;
-        struct cds_lfht_iter iter;
-        struct src_loc *l;
-
-        uatomic_set(&total_bytes_inc, 0);
-        uatomic_set(&total_bytes_dec, 0);
-
-        rcu_read_lock();
-        t = rcu_dereference(totals);
-        cds_lfht_for_each_entry(t, &iter, l, hnode) {
-                uatomic_set(&l->total, 0);
-                uatomic_set(&l->allocations, 0);
-                uatomic_set(&l->frees, 0);
-                uatomic_set(&l->age_total, 0);
-                uatomic_set(&l->max_lifespan, 0);
-        }
-        rcu_read_unlock();
-        return 0;
-}
-
-/*
- * call-seq:
- *
- *        Mwrap.reset -> nil
- *
- * Resets the total tables by zeroing all counters.
- * This resets all statistics, but it is not an atomic operation
- * as other threads (outside of the GVL) may increment counters.
- */
-static VALUE mwrap_reset(VALUE mod)
-{
-        rb_thread_call_without_gvl(totals_reset, 0, 0, 0);
-        return Qnil;
-}
-
-/* :nodoc: */
-static VALUE mwrap_clear(VALUE mod)
-{
-        return mwrap_reset(mod);
-}
-
-static VALUE rcu_unlock_ensure(VALUE ignored)
-{
-        rcu_read_unlock();
-        --locating;
-        return Qfalse;
-}
-
-static VALUE location_string(struct src_loc *l)
-{
-        VALUE ret, tmp;
-
-        if (loc_is_addr(l)) {
-                char **s = backtrace_symbols((void *)l->k, 1);
-                tmp = rb_str_new_cstr(s[0]);
-                free(s);
-        }
-        else {
-                tmp = rb_str_new(l->k, l->capa - 1);
-        }
-
-        /* deduplicate and try to free up some memory */
-        ret = rb_funcall(tmp, id_uminus, 0);
-        if (!OBJ_FROZEN_RAW(tmp))
-                rb_str_resize(tmp, 0);
-
-        return ret;
-}
-
-static VALUE dump_each_rcu(VALUE x)
-{
-        struct dump_arg *a = (struct dump_arg *)x;
-        struct cds_lfht *t;
-        struct cds_lfht_iter iter;
-        struct src_loc *l;
-
-        t = rcu_dereference(totals);
-        cds_lfht_for_each_entry(t, &iter, l, hnode) {
-                VALUE v[6];
-                if (l->total <= a->min) continue;
-
-                v[0] = location_string(l);
-                v[1] = SIZET2NUM(l->total);
-                v[2] = SIZET2NUM(l->allocations);
-                v[3] = SIZET2NUM(l->frees);
-                v[4] = SIZET2NUM(l->age_total);
-                v[5] = SIZET2NUM(l->max_lifespan);
-
-                rb_yield_values2(6, v);
-                assert(rcu_read_ongoing());
-        }
-        return Qnil;
-}
-
-/*
- * call-seq:
- *
- *        Mwrap.each([min]) do |location,total,allocations,frees,age_total,max_lifespan|
- *          ...
- *        end
- *
- * Yields each entry of the table to a caller-supplied block.
- * +min+ may be specified to filter out lines with +total+ bytes
- * equal-to-or-smaller-than the supplied minimum.
- */
-static VALUE mwrap_each(int argc, VALUE * argv, VALUE mod)
-{
-        VALUE min;
-        struct dump_arg a;
-
-        rb_scan_args(argc, argv, "01", &min);
-        a.min = NIL_P(min) ? 0 : NUM2SIZET(min);
-
-        ++locating;
-        rcu_read_lock();
-
-        return rb_ensure(dump_each_rcu, (VALUE)&a, rcu_unlock_ensure, 0);
-}
-
-static size_t
-src_loc_memsize(const void *p)
-{
-        return sizeof(struct src_loc);
-}
-
-static const rb_data_type_t src_loc_type = {
-        "source_location",
-        /* no marking, no freeing */
-        { 0, 0, src_loc_memsize, /* reserved */ },
-        /* parent, data, [ flags ] */
-};
-
-static VALUE cSrcLoc;
-
-static int
-extract_addr(const char *str, size_t len, void **p)
-{
-        const char *c;
-#if defined(__GLIBC__)
-        return ((c = memrchr(str, '[', len)) && sscanf(c, "[%p]", p));
-#else /* tested FreeBSD */
-        return ((c = strstr(str, "0x")) && sscanf(c, "%p", p));
-#endif
-}
-
-/*
- * call-seq:
- *        Mwrap[location] -> Mwrap::SourceLocation
- *
- * Returns the associated Mwrap::SourceLocation given the +location+
- * String.  +location+ is either a Ruby source location path:line
- * (e.g. "/path/to/foo.rb:5") or a hexadecimal memory address with
- * square-braces part yielded by Mwrap.dump (e.g. "[0xdeadbeef]")
- */
-static VALUE mwrap_aref(VALUE mod, VALUE loc)
-{
-        const char *str = StringValueCStr(loc);
-        int len = RSTRING_LENINT(loc);
-        struct src_loc *k = 0;
-        uintptr_t p;
-        struct cds_lfht_iter iter;
-        struct cds_lfht_node *cur;
-        struct cds_lfht *t;
-        struct src_loc *l;
-        VALUE val = Qnil;
-
-        if (extract_addr(str, len, (void **)&p)) {
-                k = (void *)kbuf;
-                memcpy(k->k, &p, sizeof(p));
-                k->capa = 0;
-                k->hval = jhash(k->k, sizeof(p), 0xdeadbeef);
-        } else {
-                k = (void *)kbuf;
-                memcpy(k->k, str, len + 1);
-                k->capa = len + 1;
-                k->hval = jhash(k->k, k->capa, 0xdeadbeef);
-        }
-
-        if (!k) return val;
-
-        rcu_read_lock();
-        t = rcu_dereference(totals);
-        if (!t) goto out_unlock;
-
-        cds_lfht_lookup(t, k->hval, loc_eq, k, &iter);
-        cur = cds_lfht_iter_get_node(&iter);
-        if (cur) {
-                l = caa_container_of(cur, struct src_loc, hnode);
-                val = TypedData_Wrap_Struct(cSrcLoc, &src_loc_type, l);
-        }
-out_unlock:
-        rcu_read_unlock();
-        return val;
-}
-
-static VALUE src_loc_each_i(VALUE p)
-{
-        struct alloc_hdr *h;
-        struct src_loc *l = (struct src_loc *)p;
-
-        cds_list_for_each_entry_rcu(h, &l->allocs, anode) {
-                size_t gen = uatomic_read(&h->as.live.gen);
-                size_t size = uatomic_read(&h->size);
-
-                if (size) {
-                        VALUE v[2];
-                        v[0] = SIZET2NUM(size);
-                        v[1] = SIZET2NUM(gen);
-
-                        rb_yield_values2(2, v);
-                }
-        }
-
-        return Qfalse;
-}
-
-static struct src_loc *src_loc_get(VALUE self)
-{
-        struct src_loc *l;
-        TypedData_Get_Struct(self, struct src_loc, &src_loc_type, l);
-        assert(l);
-        return l;
-}
-
-/*
- * call-seq:
- *        loc = Mwrap[location]
- *        loc.each { |size,generation| ... }
- *
- * Iterates through live allocations for a given Mwrap::SourceLocation,
- * yielding the +size+ (in bytes) and +generation+ of each allocation.
- * The +generation+ is the value of the GC.count method at the time
- * the allocation was made.
- *
- * This functionality is only available in mwrap 2.0.0+
- */
-static VALUE src_loc_each(VALUE self)
-{
-        struct src_loc *l = src_loc_get(self);
-
-        assert(locating == 0 && "forgot to clear locating");
-        ++locating;
-        rcu_read_lock();
-        rb_ensure(src_loc_each_i, (VALUE)l, rcu_unlock_ensure, 0);
-        return self;
-}
-
-/*
- * The mean lifespan (in GC generations) of allocations made from this
- * location.  This does not account for live allocations.
- */
-static VALUE src_loc_mean_lifespan(VALUE self)
-{
-        struct src_loc *l = src_loc_get(self);
-        size_t tot, frees;
-
-        frees = uatomic_read(&l->frees);
-        tot = uatomic_read(&l->age_total);
-        return DBL2NUM(frees ? ((double)tot/(double)frees) : HUGE_VAL);
-}
-
-/* The number of frees made from this location */
-static VALUE src_loc_frees(VALUE self)
-{
-        return SIZET2NUM(uatomic_read(&src_loc_get(self)->frees));
-}
-
-/* The number of allocations made from this location */
-static VALUE src_loc_allocations(VALUE self)
-{
-        return SIZET2NUM(uatomic_read(&src_loc_get(self)->allocations));
-}
-
-/* The total number of bytes allocated from this location */
-static VALUE src_loc_total(VALUE self)
-{
-        return SIZET2NUM(uatomic_read(&src_loc_get(self)->total));
-}
-
-/*
- * The maximum age (in GC generations) of an allocation before it was freed.
- * This does not account for live allocations.
- */
-static VALUE src_loc_max_lifespan(VALUE self)
-{
-        return SIZET2NUM(uatomic_read(&src_loc_get(self)->max_lifespan));
-}
-
-/*
- * Returns a frozen String location of the given SourceLocation object.
- */
-static VALUE src_loc_name(VALUE self)
-{
-        struct src_loc *l = src_loc_get(self);
-        VALUE ret;
-
-        ++locating;
-        ret = location_string(l);
-        --locating;
-        return ret;
-}
-
-static VALUE reset_locating(VALUE ign) { --locating; return Qfalse; }
-
-/*
- * call-seq:
- *
- *        Mwrap.quiet do |depth|
- *          # expensive sorting/calculating/emitting of Mwrap.each results
- *          # without affecting statistics of the rest of the app
- *        end
- *
- * Stops allocation tracking inside the block.  This is useful for
- * monitoring code which calls other Mwrap (or ObjectSpace/GC)
- * functions which unavoidably allocate memory.
- *
- * This feature was added in mwrap 2.0.0+
- */
-static VALUE mwrap_quiet(VALUE mod)
-{
-        size_t cur = ++locating;
-        return rb_ensure(rb_yield, SIZET2NUM(cur), reset_locating, 0);
-}
-
-static VALUE total_inc(VALUE mod)
-{
-        return SIZET2NUM(total_bytes_inc);
-}
-
-static VALUE total_dec(VALUE mod)
-{
-        return SIZET2NUM(total_bytes_dec);
-}
-
-static VALUE hpb_each_yield(VALUE ignore)
-{
-        struct alloc_hdr *h, *next;
-
-        cds_list_for_each_entry_safe(h, next, &hpb_stats.bodies, anode) {
-                VALUE v[2]; /* [ address, generation ] */
-                void *addr = hdr2ptr(h);
-                assert(ptr_is_aligned(addr, HEAP_PAGE_ALIGN));
-                v[0] = LONG2NUM((long)addr);
-                v[1] = SIZET2NUM(h->as.live.gen);
-                rb_yield_values2(2, v);
-        }
-        return Qnil;
-}
-
-/*
- * call-seq:
- *
- *     Mwrap::HeapPageBody.each { |addr, gen| } -> nil
- *
- * Yields the address of each heap page body as an Integer, along with
- * the generation (GC.count) at which it was allocated.  The number of
- * times the block is called should match the result of
- * GC.stat(:heap_allocated_pages)
- */
-static VALUE hpb_each(VALUE mod)
-{
-        ++locating;
-        return rb_ensure(hpb_each_yield, Qfalse, reset_locating, 0);
-}
-
-/*
- * call-seq:
- *
- *        Mwrap::HeapPageBody.stat -> Hash
- *        Mwrap::HeapPageBody.stat(hash) -> hash
- *
- * Returns a Hash of lifespan and deathspan statistics for heap page
- * bodies in the Ruby VM.  Some values may be Infinity if no heap
- * page bodies were ever freed.
- */
-static VALUE hpb_stat(int argc, VALUE *argv, VALUE hpb)
-{
-        VALUE h;
-
-        rb_scan_args(argc, argv, "01", &h);
-        if (NIL_P(h))
-                h = rb_hash_new();
-        else if (!RB_TYPE_P(h, T_HASH))
-                rb_raise(rb_eTypeError, "not a hash %+"PRIsVALUE, h);
-
-        ++locating;
-#define S(x) ID2SYM(rb_intern(#x))
-        rb_hash_aset(h, S(lifespan_max), acc_max(&hpb_stats.alive));
-        rb_hash_aset(h, S(lifespan_min), acc_min(&hpb_stats.alive));
-        rb_hash_aset(h, S(lifespan_mean), acc_mean(&hpb_stats.alive));
-        rb_hash_aset(h, S(lifespan_stddev), acc_stddev(&hpb_stats.alive));
-        rb_hash_aset(h, S(deathspan_max), acc_max(&hpb_stats.reborn));
-        rb_hash_aset(h, S(deathspan_min), acc_min(&hpb_stats.reborn));
-        rb_hash_aset(h, S(deathspan_mean), acc_mean(&hpb_stats.reborn));
-        rb_hash_aset(h, S(deathspan_stddev), acc_stddev(&hpb_stats.reborn));
-        rb_hash_aset(h, S(resurrects), SIZET2NUM(hpb_stats.reborn.nr));
-#undef S
-        --locating;
-
-        return h;
-}
-
-/*
- * Document-module: Mwrap
- *
- *   require 'mwrap'
- *
- * Mwrap has a dual function as both a Ruby C extension and LD_PRELOAD
- * wrapper.  As a Ruby C extension, it exposes a limited Ruby API.
- * To be effective at gathering statistics, mwrap must be loaded via
- * LD_PRELOAD (using the mwrap(1) executable makes it easy).
- *
- * ENVIRONMENT
- *
- * The "MWRAP" environment variable contains a comma-delimited list
- * of key:value options for automatically dumping at program exit.
- *
- * * dump_fd: a writable FD to dump to
- * * dump_path: a path to dump to, the file is opened in O_APPEND mode
- * * dump_min: the minimum allocation size (total) to dump
- * * dump_heap: mask of heap_page_body statistics to dump
- *
- * If both `dump_fd' and `dump_path' are specified, dump_path takes
- * precedence.
- *
- * dump_heap bitmask
- * * 0x01 - summary stats (same info as HeapPageBody.stat)
- * * 0x02 - all live heaps (similar to HeapPageBody.each)
- * * 0x04 - skip non-heap_page_body-related output
- */
-void Init_mwrap(void)
-{
-        VALUE mod, hpb;
-
-        ++locating;
-        mod = rb_define_module("Mwrap");
-        id_uminus = rb_intern("-@");
-
-        /*
-         * Represents a source code location (or a library
-         * address) from which a memory allocation was made.  It is
-         * updated automatically as allocations are made, so
-         * there is no need to reload or reread it from Mwrap#[].
-         * This class is only available since mwrap 2.0.0+.
-         */
-        cSrcLoc = rb_define_class_under(mod, "SourceLocation", rb_cObject);
-        rb_define_singleton_method(mod, "dump", mwrap_dump, -1);
-        rb_define_singleton_method(mod, "reset", mwrap_reset, 0);
-        rb_define_singleton_method(mod, "clear", mwrap_clear, 0);
-        rb_define_singleton_method(mod, "each", mwrap_each, -1);
-        rb_define_singleton_method(mod, "[]", mwrap_aref, 1);
-        rb_define_singleton_method(mod, "quiet", mwrap_quiet, 0);
-        rb_define_singleton_method(mod, "total_bytes_allocated", total_inc, 0);
-        rb_define_singleton_method(mod, "total_bytes_freed", total_dec, 0);
-
-
-        rb_define_method(cSrcLoc, "each", src_loc_each, 0);
-        rb_define_method(cSrcLoc, "frees", src_loc_frees, 0);
-        rb_define_method(cSrcLoc, "allocations", src_loc_allocations, 0);
-        rb_define_method(cSrcLoc, "total", src_loc_total, 0);
-        rb_define_method(cSrcLoc, "mean_lifespan", src_loc_mean_lifespan, 0);
-        rb_define_method(cSrcLoc, "max_lifespan", src_loc_max_lifespan, 0);
-        rb_define_method(cSrcLoc, "name", src_loc_name, 0);
-
-        /*
-         * Information about "struct heap_page_body" allocations from
-         * Ruby gc.c.  This can be useful for tracking fragmentation
-         * from posix_memalign(3) use in mainline Ruby:
-         *
-         *   https://sourceware.org/bugzilla/show_bug.cgi?id=14581
-         */
-        hpb = rb_define_class_under(mod, "HeapPageBody", rb_cObject);
-        rb_define_singleton_method(hpb, "stat", hpb_stat, -1);
-        rb_define_singleton_method(hpb, "each", hpb_each, 0);
-
-        --locating;
-}
-
-enum {
-        DUMP_HPB_STATS = 0x1,
-        DUMP_HPB_EACH = 0x2,
-        DUMP_HPB_EXCL = 0x4,
-};
-
-static void dump_hpb(FILE *fp, unsigned flags)
-{
-        if (flags & DUMP_HPB_STATS) {
-                fprintf(fp,
-                        "lifespan_max: %zu\n"
-                        "lifespan_min:%s%zu\n"
-                        "lifespan_mean: %0.3f\n"
-                        "lifespan_stddev: %0.3f\n"
-                        "deathspan_max: %zu\n"
-                        "deathspan_min:%s%zu\n"
-                        "deathspan_mean: %0.3f\n"
-                        "deathspan_stddev: %0.3f\n"
-                        "gc_count: %zu\n",
-                        hpb_stats.alive.max,
-                        hpb_stats.alive.min == INT64_MAX ? " -" : " ",
-                        hpb_stats.alive.min,
-                        hpb_stats.alive.mean,
-                        acc_stddev_dbl(&hpb_stats.alive),
-                        hpb_stats.reborn.max,
-                        hpb_stats.reborn.min == INT64_MAX ? " -" : " ",
-                        hpb_stats.reborn.min,
-                        hpb_stats.reborn.mean,
-                        acc_stddev_dbl(&hpb_stats.reborn),
-                        /* n.b.: unsafe to call rb_gc_count() in destructor */
-                        generation);
-        }
-        if (flags & DUMP_HPB_EACH) {
-                struct alloc_hdr *h;
-
-                cds_list_for_each_entry(h, &hpb_stats.bodies, anode) {
-                        void *addr = hdr2ptr(h);
-
-                        fprintf(fp, "%p\t%zu\n", addr, h->as.live.gen);
-                }
-        }
-}
-
-/* rb_cloexec_open isn't usable by non-Ruby processes */
-#ifndef O_CLOEXEC
-#  define O_CLOEXEC 0
-#endif
-
-__attribute__ ((destructor))
-static void mwrap_dump_destructor(void)
-{
-        const char *opt = getenv("MWRAP");
-        const char *modes[] = { "a", "a+", "w", "w+", "r+" };
-        struct dump_arg a = { .min = 0 };
-        size_t i;
-        int dump_fd;
-        unsigned dump_heap = 0;
-        char *dump_path;
-        char *s;
-
-        if (!opt)
-                return;
-
-        ++locating;
-        if ((dump_path = strstr(opt, "dump_path:")) &&
-                        (dump_path += sizeof("dump_path")) &&
-                        *dump_path) {
-                char *end = strchr(dump_path, ',');
-                if (end) {
-                        char *tmp = alloca(end - dump_path + 1);
-                        end = mempcpy(tmp, dump_path, end - dump_path);
-                        *end = 0;
-                        dump_path = tmp;
-                }
-                dump_fd = open(dump_path, O_CLOEXEC|O_WRONLY|O_APPEND|O_CREAT,
-                                0666);
-                if (dump_fd < 0) {
-                        fprintf(stderr, "open %s failed: %s\n", dump_path,
-                                strerror(errno));
-                        goto out;
-                }
-        }
-        else if (!sscanf(opt, "dump_fd:%d", &dump_fd))
-                goto out;
-
-        if ((s = strstr(opt, "dump_min:")))
-                sscanf(s, "dump_min:%zu", &a.min);
-
-        if ((s = strstr(opt, "dump_heap:")))
-                sscanf(s, "dump_heap:%u", &dump_heap);
-
-        switch (dump_fd) {
-        case 0: goto out;
-        case 1: a.fp = stdout; break;
-        case 2: a.fp = stderr; break;
-        default:
-                if (dump_fd < 0)
-                        goto out;
-                a.fp = 0;
-
-                for (i = 0; !a.fp && i < 5; i++)
-                        a.fp = fdopen(dump_fd, modes[i]);
-
-                if (!a.fp) {
-                        fprintf(stderr, "failed to open fd=%d: %s\n",
-                                dump_fd, strerror(errno));
-                        goto out;
-                }
-                /* we'll leak some memory here, but this is a destructor */
-        }
-        if ((dump_heap & DUMP_HPB_EXCL) == 0)
-                dump_to_file(&a);
-        dump_hpb(a.fp, dump_heap);
-out:
-        --locating;
-}
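
The destructor above and dump_to_file() emit one row per callsite in three
whitespace-separated columns: total bytes, call count, and location.  A
minimal post-processing sketch in Perl (the dump file name is an example
value; it assumes the dump was produced with plain MWRAP=dump_path:mwrap.dump,
without dump_heap output):

        use strict;
        # re-sort an mwrap dump by total bytes allocated per callsite
        open my $fh, '<', 'mwrap.dump' or die "open(mwrap.dump): $!";
        my @rows;
        while (defined(my $line = <$fh>)) {
                # columns: total_size, call_count, location (see dump_to_file)
                push @rows, [ split ' ', $line, 3 ] if $line =~ /\S/;
        }
        printf "%16s %12s %s", @$_ for sort { $b->[0] <=> $a->[0] } @rows;
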
diff --git a/ext/mwrap/jhash.h b/jhash.h
index 69666f3..69666f3 100644
--- a/ext/mwrap/jhash.h
+++ b/jhash.h
diff --git a/lib/Devel/Mwrap.pm b/lib/Devel/Mwrap.pm
new file mode 100644
index 0000000..f74f7d1
--- /dev/null
+++ b/lib/Devel/Mwrap.pm
@@ -0,0 +1,15 @@
+# Copyright (C) 2019 all contributors <mwrap-perl@80x24.org>
+# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+package Devel::Mwrap;
+use strict;
+our $VERSION = '0.0.0';
+use XSLoader;
+XSLoader::load(__PACKAGE__, $VERSION);
+
+1;
+__END__
+=pod
+
+=head1 NAME
+
+Devel::Mwrap - LD_PRELOAD malloc wrapper + line stats for Perl
diff --git a/lib/mwrap_rack.rb b/lib/mwrap_rack.rb
deleted file mode 100644
index e45b26d..0000000
--- a/lib/mwrap_rack.rb
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (C) 2018 all contributors <mwrap@80x24.org>
-# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
-# frozen_string_literal: true
-require 'mwrap'
-require 'rack'
-require 'cgi'
-
-# MwrapRack is a standalone Rack application which can be
-# mounted to run within your application process.
-#
-# Using the Rack::Builder API in config.ru, you can map it to
-# the "/MWRAP/" endpoint.  As with the rest of the Mwrap API,
-# your Rack server needs to be spawned with the mwrap(1)
-# wrapper to enable the LD_PRELOAD.
-#
-#     require 'mwrap_rack'
-#     map('/MWRAP') { run(MwrapRack.new) }
-#     map('/') { run(your_normal_app) }
-#
-# A live demo is available at https://80x24.org/MWRAP/
-# (warning: the demo machine is 32-bit, so counters will overflow)
-#
-# This module is only available in mwrap 2.0.0+
-class MwrapRack
-  module HtmlResponse # :nodoc:
-    def response
-      [ 200, {
-          'Expires' => 'Fri, 01 Jan 1980 00:00:00 GMT',
-          'Pragma' => 'no-cache',
-          'Cache-Control' => 'no-cache, max-age=0, must-revalidate',
-          'Content-Type' => 'text/html; charset=UTF-8',
-        }, self ]
-    end
-  end
-
-  class Each < Struct.new(:script_name, :min, :sort) # :nodoc:
-    include HtmlResponse
-    HEADER = '<tr><th>' + %w(total allocations frees mean_life max_life
-                location).join('</th><th>') + '</th></tr>'
-    FIELDS = %w(total allocations frees mean_life max_life location)
-    def each
-      Mwrap.quiet do
-        t = -"Mwrap.each(#{min})"
-        sn = script_name
-        all = []
-        f = FIELDS.dup
-        sc = FIELDS.index(sort || 'total') || 0
-        f[sc] = -"<b>#{f[sc]}</b>"
-        f.map! do |hdr|
-          if hdr.start_with?('<b>')
-            hdr
-          else
-            -%Q(<a\nhref="#{sn}/each/#{min}?sort=#{hdr}">#{hdr}</a>)
-          end
-        end
-        Mwrap.each(min) do |loc, total, allocations, frees, age_sum, max_life|
-          mean_life = frees == 0 ? Float::INFINITY : age_sum/frees.to_f
-          all << [total,allocations,frees,mean_life,max_life,loc]
-        end
-        all.sort_by! { |cols| -cols[sc] }
-
-        yield(-"<html><head><title>#{t}</title></head>" \
-               "<body><h1>#{t}</h1>\n" \
-               "<h2>Current generation: #{GC.count}</h2>\n<table>\n" \
-               "<tr><th>#{f.join('</th><th>')}</th></tr>\n")
-        all.each do |cols|
-          loc = cols.pop
-          cols[3] = sprintf('%0.3f', cols[3]) # mean_life
-          href = -(+"#{sn}/at/#{CGI.escape(loc)}").encode!(xml: :attr)
-          yield(%Q(<tr><td>#{cols.join('</td><td>')}<td><a\nhref=#{
-                  href}>#{-loc.encode(xml: :text)}</a></td></tr>\n))
-          cols.clear
-        end.clear
-        yield "</table></body></html>\n"
-      end
-    end
-  end
-
-  class EachAt < Struct.new(:loc) # :nodoc:
-    include HtmlResponse
-    HEADER = '<tr><th>size</th><th>generation</th></tr>'
-
-    def each
-      t = loc.name.encode(xml: :text)
-      yield(-"<html><head><title>#{t}</title></head>" \
-             "<body><h1>live allocations at #{t}</h1>" \
-             "<h2>Current generation: #{GC.count}</h2>\n<table>#{HEADER}")
-      loc.each do |size, generation|
-        yield("<tr><td>#{size}</td><td>#{generation}</td></tr>\n")
-      end
-      yield "</table></body></html>\n"
-    end
-  end
-
-  class HeapPages # :nodoc:
-    include HtmlResponse
-    HEADER = '<tr><th>address</th><th>generation</th></tr>'
-
-    def hpb_rows
-      Mwrap::HeapPageBody.stat(stat = Thread.current[:mwrap_hpb_stat] ||= {})
-      %i(lifespan_max lifespan_min lifespan_mean lifespan_stddev
-         deathspan_max deathspan_min deathspan_mean deathspan_stddev
-         resurrects
-        ).map! do |k|
-         "<tr><td>#{k}</td><td>#{stat[k]}</td></tr>\n"
-      end.join
-    end
-
-    def gc_stat_rows
-      GC.stat(stat = Thread.current[:mwrap_gc_stat] ||= {})
-      %i(count heap_allocated_pages heap_eden_pages heap_tomb_pages
-          total_allocated_pages total_freed_pages).map do |k|
-         "<tr><td>GC.stat(:#{k})</td><td>#{stat[k]}</td></tr>\n"
-      end.join
-    end
-
-    GC_STAT_URL = 'https://docs.ruby-lang.org/en/trunk/GC.html#method-c-stat'
-    GC_STAT_HELP = <<~""
-      <p>Non-Infinity lifespans can indicate fragmentation.
-      <p>See <a
-      href="#{GC_STAT_URL}">#{GC_STAT_URL}</a> for info on GC.stat values.
-
-    def each
-      Mwrap.quiet do
-        yield("<html><head><title>heap pages</title></head>" \
-              "<body><h1>heap pages</h1>" \
-              "<table><tr><th>stat</th><th>value</th></tr>\n" \
-              "#{hpb_rows}" \
-              "#{gc_stat_rows}" \
-              "</table>\n" \
-              "#{GC_STAT_HELP}" \
-              "<table>#{HEADER}")
-        Mwrap::HeapPageBody.each do |addr, generation|
-          addr = -sprintf('0x%x', addr)
-          yield(-"<tr><td>#{addr}</td><td>#{generation}</td></tr>\n")
-        end
-        yield "</table></body></html>\n"
-      end
-    end
-  end
-
-  def r404 # :nodoc:
-    [404,{'Content-Type'=>'text/plain'},["Not found\n"]]
-  end
-
-  # The standard Rack application endpoint for MwrapRack
-  def call(env)
-    case env['PATH_INFO']
-    when %r{\A/each/(\d+)\z}
-      min = $1.to_i
-      m = env['QUERY_STRING'].match(/\bsort=(\w+)/)
-      Each.new(env['SCRIPT_NAME'], min, m ? m[1] : nil).response
-    when %r{\A/at/(.*)\z}
-      loc = -CGI.unescape($1)
-      loc = Mwrap[loc] or return r404
-      EachAt.new(loc).response
-    when '/heap_pages'
-      HeapPages.new.response
-    when '/'
-      n = 2000
-      u = 'https://80x24.org/mwrap/README.html'
-      b = -('<html><head><title>Mwrap demo</title></head>' \
-          "<body><p><a href=\"each/#{n}\">allocations &gt;#{n} bytes</a>" \
-          "<p><a href=\"#{u}\">#{u}</a>" \
-          "<p><a href=\"heap_pages\">heap pages</a>" \
-          "</body></html>\n")
-      [ 200, {'Content-Type'=>'text/html','Content-Length'=>-b.size.to_s},[b]]
-    else
-      r404
-    end
-  end
-end
diff --git a/mwrap.gemspec b/mwrap.gemspec
deleted file mode 100644
index 2c01a68..0000000
--- a/mwrap.gemspec
+++ /dev/null
@@ -1,32 +0,0 @@
-git_manifest = `git ls-files 2>/dev/null`.split("\n")
-manifest = File.exist?('MANIFEST') ?
-  File.readlines('MANIFEST').map!(&:chomp).delete_if(&:empty?) : git_manifest
-if git_manifest[0] && manifest != git_manifest
-  tmp = "MANIFEST.#$$.tmp"
-  File.open(tmp, 'w') { |fp| fp.puts(git_manifest.join("\n")) }
-  File.rename(tmp, 'MANIFEST')
-  system('git add MANIFEST')
-end
-
-desc = `git describe --abbrev=4 HEAD`.strip.tr('-', '.').delete_prefix('v')
-
-Gem::Specification.new do |s|
-  s.name = 'mwrap'
-  s.version = desc.empty? ? '2.0.0' : desc
-  s.homepage = 'https://80x24.org/mwrap/'
-  s.authors = ["Ruby hackers"]
-  s.summary = 'LD_PRELOAD malloc wrapper for Ruby'
-  s.executables = %w(mwrap)
-  s.files = manifest
-  s.description = <<~EOF
-mwrap wraps all malloc, calloc, and realloc calls to trace the Ruby
-source location of such calls and bytes allocated at each callsite.
-  EOF
-  s.email = %q{e@80x24.org}
-  s.test_files = Dir['test/test_*.rb']
-  s.extensions = %w(ext/mwrap/extconf.rb)
-
-  s.add_development_dependency('test-unit', '~> 3.0')
-  s.add_development_dependency('rake-compiler', '~> 1.0')
-  s.licenses = %w(GPL-2.0+)
-end
diff --git a/script/mwrap-perl b/script/mwrap-perl
new file mode 100644
index 0000000..5e5eec4
--- /dev/null
+++ b/script/mwrap-perl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl -w
+# Copyright (C) 2019 mwrap hackers <mwrap-perl@80x24.org>
+# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+use strict;
+use Devel::Mwrap;
+my $so;
+if ($^O eq 'linux') {
+        my $maps = do {
+                open my $fh, '<', "/proc/$$/maps" or
+                                die "/proc/$$/maps not accessible: $!\n";
+                local $/;
+                <$fh>;
+        };
+        if ($maps =~ m![ \t](/[^\n]+?/Mwrap\.so)$!sm) {
+                $so = $1;
+        } else {
+                die "Mwrap.so not found in /proc/$$/maps\n";
+        }
+} else {
+        die "unsupported OS ($^O ne 'linux')";
+}
+my $cur = $ENV{LD_PRELOAD};
+if (defined $cur) {
+        my @cur = split(/[: \t]+/, $cur);
+        my %cur = map { $_ => 1 } @cur;
+        if (!$cur{$so}) {
+                # drop old redundant versions
+                my @keep = grep(!m!/Mwrap\.so$!, @cur);
+                $ENV{LD_PRELOAD} = join(':', $so, @keep);
+        }
+} else {
+        $ENV{LD_PRELOAD} = $so;
+}
+exec @ARGV;
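
The wrapper above only prepends Mwrap.so to LD_PRELOAD before exec'ing, so it
composes with existing preloads and with the MWRAP dump options exercised by
t/mwrap.t below.  A hypothetical invocation from Perl (the extra preload and
script name are placeholders; mwrap-perl is assumed to be installed in PATH):

        use strict;
        $ENV{LD_PRELOAD} = '/opt/trace.so';       # kept by the wrapper
        $ENV{MWRAP} = 'dump_fd:2,dump_min:4096';  # at exit, dump totals > 4096 bytes to stderr
        exec 'mwrap-perl', $^X, 'some_script.pl'
                or die "exec mwrap-perl: $!";
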
diff --git a/t/mwrap.t b/t/mwrap.t
new file mode 100644
index 0000000..5bcc285
--- /dev/null
+++ b/t/mwrap.t
@@ -0,0 +1,85 @@
+#!perl -w
+# Copyright (C) 2019 mwrap hackers <mwrap-perl@80x24.org>
+# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
+use strict;
+use Test::More;
+use File::Temp qw(tempdir);
+use_ok 'Devel::Mwrap';
+
+my $tmpdir = tempdir('mwrap-perl-XXXXXX', TMPDIR => 1, CLEANUP => 1);
+my $dump = "$tmpdir/dump";
+my $out = "$tmpdir/out";
+my $err = "$tmpdir/err";
+my $src = slurp('blib/script/mwrap-perl');
+
+{
+        my $env = { MWRAP => "dump_path:$dump,dump_min:10000" };
+        my $nr = 1000;
+        mwrap_run('dump test', $env, '-e', '$x = "hello world" x '.$nr);
+        ok(-s $dump, "dump file written to");
+        my $s = slurp($dump);
+        my $re = qr/([0-9]+)[ \t]+([0-9]+)[ \t]+-e:1[ \t]*\n/sm;
+        my ($bytes, $n);
+        if ($s =~ $re) {
+                ($bytes, $n) = ($1, $2);
+                ok($bytes >= (length('hello world') * $nr),
+                        "counted 'hello world' x $nr");
+                ok($n >= 1, 'allocation counted');
+        } else {
+                fail("$s failed to match $re");
+        }
+}
+
+SKIP: { # C++ program which uses malloc via "new"
+        my $exp = `cmake -h`;
+        skip 'cmake missing', 2 if $?;
+        skip "`cmake -h' gave no output", 2 unless $exp =~ /\S/s;
+        open my $truncate, '>', $dump or die;
+        close $truncate or die;
+        my $env = { MWRAP => "dump_path:$dump" };
+        mwrap_run('cmake (C++ new)', $env, '-e',
+                'system(qw(cmake -h)); exit $?');
+        my $res = slurp($out);
+        is($res, $exp, "`cmake -h' works");
+};
+
+{
+        my $env = { MWRAP => "dump_path:$dump" };
+        mwrap_run('total_bytes*', $env, '-e', <<'E1');
+my $A = Devel::Mwrap::total_bytes_allocated();
+my $f = Devel::Mwrap::total_bytes_freed();
+print("$A - $f\n");
+E1
+        my $o = slurp($out);
+        like($o, qr/^([0-9]+) - ([0-9]+)\n/s, 'got allocated & freed bytes');
+}
+
+{
+        my $env = { MWRAP => "dump_path:$dump" };
+        mwrap_run('source location', $env, 't/source_location.perl');
+}
+
+done_testing();
+
+sub slurp {
+        open my $fh, '<', $_[0] or die "open($_[0]): $!";
+        local $/;
+        <$fh>;
+}
+
+sub mwrap_run {
+        my ($msg, $env, @args) = @_;
+        my $pid = fork;
+        if ($pid == 0) {
+                while (my ($k, $v) = each %$env) {
+                        $ENV{$k} = $v;
+                }
+                open STDERR, '>', $err or die "open: $!";
+                open STDOUT, '>', $out or die "open: $!";
+                @ARGV = ($^X, '-MDevel::Mwrap', @args);
+                eval $src;
+                die "fail: $! ($@)";
+        }
+        waitpid($pid, 0);
+        is($?, 0, $msg);
+}
diff --git a/t/source_location.perl b/t/source_location.perl
new file mode 100644
index 0000000..ed81ed8
--- /dev/null
+++ b/t/source_location.perl
@@ -0,0 +1,9 @@
+use Devel::Mwrap;
+my $foo = ('hello world' x 10000);
+my $k = __FILE__ . ":2";
+my $loc = Devel::Mwrap::get($k) or die;
+$loc->name eq $k or die;
+$loc->total >= 10000 or die;
+$loc->allocations >= 1 or die;
+$loc->frees >= 0 or die;
+exit 0;
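
The test above is the minimal check of the per-callsite API; a slightly
fuller sketch using the same accessors (the line-number key and sizes are
illustrative, and Mwrap.so must be preloaded, e.g. via mwrap-perl, for
anything to be recorded):

        use strict;
        use Devel::Mwrap;
        my @junk = map { 'x' x 4096 } (1 .. 100);      # allocations on line 3
        if (my $loc = Devel::Mwrap::get(__FILE__ . ':3')) {
                printf "%s: %u bytes, %u allocations, %u frees\n",
                        $loc->name, $loc->total, $loc->allocations, $loc->frees;
        } else {
                warn "line 3 not tracked (is Mwrap.so preloaded?)\n";
        }
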
diff --git a/test/test_mwrap.rb b/test/test_mwrap.rb
deleted file mode 100644
index 48fba23..0000000
--- a/test/test_mwrap.rb
+++ /dev/null
@@ -1,322 +0,0 @@
-# frozen_string_literal: true
-# Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org>
-# License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt>
-require 'test/unit'
-require 'mwrap'
-require 'rbconfig'
-require 'tempfile'
-
-class TestMwrap < Test::Unit::TestCase
-  RB = "#{RbConfig::CONFIG['bindir']}/#{RbConfig::CONFIG['RUBY_INSTALL_NAME']}"
-
-  mwrap_so = $".grep(%r{/mwrap\.so\z})[0]
-  env = ENV.to_hash
-  cur = env['LD_PRELOAD']
-  env['LD_PRELOAD'] = cur ? "#{mwrap_so}:#{cur}".freeze : mwrap_so
-  @@env = env.freeze
-  inc = File.dirname(mwrap_so)
-  @@cmd = %W(#{RB} -w --disable=gems -I#{inc} -rmwrap).freeze
-
-  def test_mwrap_preload
-    cmd = @@cmd + %w(
-      -e ("helloworld"*1000).clear
-      -e Mwrap.dump
-    )
-    Tempfile.create('junk') do |tmp|
-      tmp.sync = true
-      res = system(@@env, *cmd, err: tmp)
-      assert res, $?.inspect
-      tmp.rewind
-      lines = tmp.readlines
-      line_1 = lines.grep(/\s-e:1\b/)[0].strip
-      assert_equal '10001', line_1.split(/\s+/)[0]
-    end
-  end
-
-  def test_dump_via_destructor
-    env = @@env.dup
-    env['MWRAP'] = 'dump_fd:5'
-    cmd = @@cmd + %w(-e ("0"*10000).clear)
-    Tempfile.create('junk') do |tmp|
-      tmp.sync = true
-      res = system(env, *cmd, { 5 => tmp })
-      assert res, $?.inspect
-      tmp.rewind
-      assert_match(/\b10001\s+1\s+-e:1$/, tmp.read)
-
-      env['MWRAP'] = 'dump_fd:1,dump_min:10000'
-      tmp.rewind
-      tmp.truncate(0)
-      res = system(env, *cmd, { 1 => tmp })
-      assert res, $?.inspect
-      tmp.rewind
-      assert_match(/\b10001\s+1\s+-e:1$/, tmp.read)
-
-      tmp.rewind
-      tmp.truncate(0)
-      env['MWRAP'] = "dump_path:#{tmp.path},dump_min:10000"
-      res = system(env, *cmd)
-      assert res, $?.inspect
-      assert_match(/\b10001\s+1\s+-e:1$/, tmp.read)
-
-      tmp.rewind
-      tmp.truncate(0)
-      env['MWRAP'] = "dump_path:#{tmp.path},dump_heap:5"
-      res = system(env, *cmd)
-      assert res, $?.inspect
-      assert_match %r{lifespan_stddev}, tmp.read
-    end
-  end
-
-  def test_cmake
-    begin
-      exp = `cmake -h`
-    rescue Errno::ENOENT
-      warn 'cmake missing'
-      return
-    end
-    assert_not_predicate exp.strip, :empty?
-    env = @@env.merge('MWRAP' => 'dump_fd:1')
-    out = IO.popen(env, %w(cmake -h), &:read)
-    assert out.start_with?(exp), 'original help exists'
-    assert_not_equal exp, out, 'includes dump output'
-    dump = out.delete_prefix(exp)
-    assert_match(/\b0x[a-f0-9]+\b/s, dump, 'dump output has addresses')
-  end
-
-  def test_clear
-    cmd = @@cmd + %w(
-      -e ("0"*10000).clear
-      -e Mwrap.clear
-      -e ("0"*20000).clear
-      -e Mwrap.dump($stdout,9999)
-    )
-    Tempfile.create('junk') do |tmp|
-      tmp.sync = true
-      res = system(@@env, *cmd, { 1 => tmp })
-      assert res, $?.inspect
-      tmp.rewind
-      buf = tmp.read
-      assert_not_match(/\s+-e:1$/, buf)
-      assert_match(/\b20001\s+1\s+-e:3$/, buf)
-    end
-  end
-
-  # make sure we don't break commands spawned by an mwrap-ed Ruby process:
-  def test_non_ruby_exec
-    IO.pipe do |r, w|
-      th = Thread.new { r.read }
-      Tempfile.create('junk') do |tmp|
-        tmp.sync = true
-        env = @@env.merge('MWRAP' => "dump_path:#{tmp.path}")
-        cmd = %w(perl -e print("HELLO_WORLD"))
-        res = system(env, *cmd, out: w)
-        w.close
-        assert res, $?.inspect
-        assert_match(/0x[a-f0-9]+\b/, tmp.read)
-      end
-      assert_equal "HELLO_WORLD", th.value
-    end
-  end
-
-  # some URCU flavors use USR1, ensure the one we choose does not
-  def test_sigusr1_works
-    cmd = @@cmd + %w(
-      -e STDOUT.sync=true
-      -e trap(:USR1){p("HELLO_WORLD")}
-      -e END{Mwrap.dump}
-      -e puts -e STDIN.read)
-    IO.pipe do |r, w|
-      IO.pipe do |r2, w2|
-        pid = spawn(@@env, *cmd, in: r2, out: w, err: '/dev/null')
-        r2.close
-        w.close
-        assert_equal "\n", r.gets
-        buf = +''
-        10.times { Process.kill(:USR1, pid) }
-        while IO.select([r], nil, nil, 0.1)
-          case tmp = r.read_nonblock(1000, exception: false)
-          when String
-            buf << tmp
-          end
-        end
-        w2.close
-        Process.wait(pid)
-        assert_predicate $?, :success?, $?.inspect
-        assert_equal(["\"HELLO_WORLD\"\n"], buf.split(/^/).uniq)
-      end
-    end
-  end
-
-  def test_reset
-    assert_nil Mwrap.reset
-  end
-
-  def test_each
-    cmd = @@cmd + %w(
-      -e ("0"*10000).clear
-      -e h={}
-      -e Mwrap.each(1000){|a,b,c|h[a]=[b,c]}
-      -e puts(Marshal.dump(h))
-    )
-    r = IO.popen(@@env, cmd, 'r')
-    h = Marshal.load(r.read)
-    assert_not_predicate h, :empty?
-    h.each_key { |k| assert_kind_of String, k }
-    h.each_value do |total,calls|
-      assert_operator total, :>, 0
-      assert_operator calls, :>, 0
-      assert_operator total, :>=, calls
-    end
-  end
-
-  def test_aref_each
-    cmd = @@cmd + %w(
-      -e count=GC.count
-      -e GC.disable
-      -e keep=("0"*10000)
-      -e loc=Mwrap["-e:3"]
-      -e loc.each{|size,gen|p([size,gen,count])}
-    )
-    buf = IO.popen(@@env, cmd, &:read)
-    assert_predicate $?, :success?
-    assert_match(/\A\[\s*\d+,\s*\d+,\s*\d+\]\s*\z/s, buf)
-    size, gen, count = eval(buf)
-    assert_operator size, :>=, 10000
-    assert_operator gen, :>=, count
-
-    cmd = @@cmd + %w(
-      -e count=GC.count
-      -e locs=""
-      -e Mwrap.each(1){|loc,tot,calls|locs<<loc}
-      -e m=locs.match(/(\[0x[a-f0-9]+\])/i)
-      -e m||=locs.match(/\b(0x[a-f0-9]+)\b/i)
-      -e p(loc=Mwrap["bobloblaw\t#{m[1]}"])
-      -e loc.each{|size,gen|p([size,gen,count])}
-    )
-    buf = IO.popen(@@env, cmd, &:read)
-    assert_predicate $?, :success?
-    assert_match(/\bMwrap::SourceLocation\b/, buf)
-  end
-
-  def test_benchmark
-    cmd = @@cmd + %w(-rbenchmark
-      -e puts(Benchmark.measure{1000000.times{Time.now}}))
-    r = IO.popen(@@env, cmd, 'r')
-    require 'benchmark'
-    warn Benchmark::Tms::CAPTION
-    warn r.read
-  end if ENV['BENCHMARK']
-
-  def test_mwrap_dump_check
-    assert_raise(TypeError) { Mwrap.dump(:bogus) }
-  end
-
-  def assert_separately(src, *opts)
-    Tempfile.create(%w(mwrap .rb)) do |tmp|
-      tmp.write(src.lstrip!)
-      tmp.flush
-      assert(system(@@env, *@@cmd, tmp.path, *opts))
-    end
-  end
-
-  def test_source_location
-    assert_separately(+"#{<<~"begin;"}\n#{<<~'end;'}")
-    begin;
-      require 'mwrap'
-      foo = '0' * 10000
-      k = -"#{__FILE__}:2"
-      loc = Mwrap[k]
-      loc.name == k or abort 'SourceLocation#name broken'
-      loc.total >= 10000 or abort 'SourceLocation#total broken'
-      loc.frees == 0 or abort 'SourceLocation#frees broken'
-      loc.allocations == 1 or abort 'SourceLocation#allocations broken'
-      seen = false
-      loc.each do |*x| seen = x end
-      seen[0] == loc.total or abort 'SourceLocation#each broken'
-      foo.clear
-
-      # wait for call_rcu to perform real_free
-      freed = false
-      until freed
-        freed = true
-        loc.each do freed = false end
-      end
-      loc.frees == 1 or abort 'SourceLocation#frees broken (after free)'
-      Float === loc.mean_lifespan or abort 'mean_lifespan broken'
-      Integer === loc.max_lifespan or abort 'max_lifespan broken'
-
-      addr = false
-      Mwrap.each do |a,|
-        if a =~ /0x[a-f0-9]+/
-          addr = a
-          break
-        end
-      end
-      addr && addr.frozen? or abort 'Mwrap.each returned unfrozen address'
-      loc = Mwrap[addr] or abort "Mwrap[#{addr}] broken"
-      addr == loc.name or abort 'SourceLocation#name works on address'
-      loc.name.frozen? or abort 'SourceLocation#name not frozen'
-    end;
-  end
-
-  def test_quiet
-    assert_separately(+"#{<<~"begin;"}\n#{<<~'end;'}")
-    begin;
-      require 'mwrap'
-      before = __LINE__
-      res = Mwrap.quiet do |depth|
-        depth == 1 or abort 'depth is not 1'
-        ('a' * 10000).clear
-        Mwrap.quiet { |d| d == 2 or abort 'depth is not 2' }
-        :foo
-      end
-      after = __LINE__ - 1
-      (before..after).each do |lineno|
-        Mwrap["#{__FILE__}:#{lineno}"] and
-          abort "unexpectedly tracked allocation at line #{lineno}"
-      end
-      res == :foo or abort 'Mwrap.quiet did not return block result'
-    end;
-  end
-
-  def test_total_bytes
-    assert_separately(+"#{<<~"begin;"}\n#{<<~'end;'}")
-    begin;
-      require 'mwrap'
-      Mwrap.total_bytes_allocated > 0 or abort 'nothing allocated'
-      Mwrap.total_bytes_freed > 0 or abort 'nothing freed'
-      Mwrap.total_bytes_allocated > Mwrap.total_bytes_freed or
-        abort 'freed more than allocated'
-    end;
-  end
-
-  def test_heap_page_body
-    assert_separately(+"#{<<~"begin;"}\n#{<<~'end;'}")
-    begin;
-      require 'mwrap'
-      require 'rubygems' # use up some memory
-      ap = GC.stat(:heap_allocated_pages)
-      h = {}
-      nr = 0
-      Mwrap::HeapPageBody.each do |addr, gen|
-        nr += 1
-        gen <= GC.count && gen >= 0 or abort "bad generation: #{gen}"
-        (0 == (addr & 16383)) or abort "addr not aligned: #{'%x' % addr}"
-      end
-      nr == ap or abort 'HeapPageBody.each missed page'
-      10.times { (1..20000).to_a.map(&:to_s) }
-      3.times { GC.start }
-      Mwrap::HeapPageBody.stat(h)
-      Integer === h[:lifespan_max] or abort 'lifespan_max not recorded'
-      Integer === h[:lifespan_min] or abort 'lifespan_min not recorded'
-      Float === h[:lifespan_mean] or abort 'lifespan_mean not recorded'
-      3.times { GC.start }
-      10.times { (1..20000).to_a.map(&:to_s) }
-      Mwrap::HeapPageBody.stat(h)
-      h[:deathspan_min] <= h[:deathspan_max] or
-        abort 'wrong min/max deathtime'
-      Float === h[:deathspan_mean] or abort 'deathspan_mean not recorded'
-    end;
-  end
-end
diff --git a/typemap b/typemap
new file mode 100644
index 0000000..9531289
--- /dev/null
+++ b/typemap
@@ -0,0 +1,4 @@
+TYPEMAP
+size_t        T_UV
+const char *        T_PV
+Devel::Mwrap::SrcLoc        T_PTROBJ