about summary refs log tree commit homepage
path: root/ext/mwrap/mwrap_core.h
diff options
context:
space:
mode:
Diffstat (limited to 'ext/mwrap/mwrap_core.h')
-rw-r--r--ext/mwrap/mwrap_core.h1091
1 file changed, 1091 insertions, 0 deletions
diff --git a/ext/mwrap/mwrap_core.h b/ext/mwrap/mwrap_core.h
new file mode 100644
index 0000000..c0eea2f
--- /dev/null
+++ b/ext/mwrap/mwrap_core.h
@@ -0,0 +1,1091 @@
+/*
+ * Copyright (C) mwrap hackers <mwrap-perl@80x24.org>
+ * License: GPL-3.0+ <https://www.gnu.org/licenses/gpl-3.0.txt>
+ * Disclaimer: I don't really know my way around XS or Perl internals well
+ */
+#define _LGPL_SOURCE /* allows URCU to inline some stuff */
+#define _GNU_SOURCE
+#include "mymalloc.h" /* includes dlmalloc_c.h */
+#ifndef MWRAP_PERL
+#        define MWRAP_PERL 0
+#endif
+
+#ifndef MWRAP_RUBY
+#        define MWRAP_RUBY 0
+#endif
+
+/* set a sensible max to avoid stack overflows */
+#ifndef MWRAP_BT_MAX
+#        define        MWRAP_BT_MAX 32
+#endif
+
+#ifndef _GNU_SOURCE
+#        define _GNU_SOURCE
+#endif
+#include <execinfo.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dlfcn.h>
+#include <assert.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <urcu-bp.h>
+#include <urcu/rculfhash.h>
+#include <urcu/rculist.h>
+#include <limits.h>
+
+#if MWRAP_PERL
+#        include "EXTERN.h"
+#        include "perl.h"
+#        include "XSUB.h"
+#        include "embed.h"
+#        include "ppport.h"
+#endif
+
+#if MWRAP_RUBY
+#        undef _GNU_SOURCE /* ruby.h redefines it */
+#        include <ruby.h> /* defines HAVE_RUBY_RACTOR_H on 3.0+ */
+#        include <ruby/thread.h>
+#        include <ruby/io.h>
+#endif
+
+/*
+ * XXH3 (truncated to 32-bits) seems to provide a ~2% speedup.
+ * XXH32 doesn't show improvements over jhash despite rculfhash
+ * only supporting 32-bit hash values.
+ */
+#if defined(HAVE_XXHASH)
+#        define XXH_INLINE_ALL
+#        include <xxhash.h>
+#        if !defined(XXH3_64bits)
+#                warning XXH3_64bits not defined
+#        endif
+#endif
+
+#if !defined(XXH3_64bits)
+#        include "jhash.h"
+#endif
+
+#define U24_MAX (1U << 24)
+
+/*
+ * Perl doesn't have a GC the same way (C) Ruby does, so no GC count.
+ * Instead, the relative age of an object is the number of total bytes
+ * allocated (and we don't care about overflow on 32-bit since
+ * hardly anybody still uses it).
+ */
+static size_t total_bytes_inc, total_bytes_dec, nr_file, nr_src_loc;
+static uint32_t bt_req_depth;
+
+#if MWRAP_PERL
+extern pthread_key_t __attribute__((weak)) PL_thr_key;
+extern const char __attribute__((weak)) PL_memory_wrap[]; /* needed for -O0 */
+#        if !defined(PERL_IMPLICIT_CONTEXT)
+static size_t *root_locating; /* determines if PL_curcop is our thread */
+#        endif
+#endif /* MWRAP_PERL */
+
+#if MWRAP_RUBY
+const char *rb_source_location_cstr(int *line); /* requires 2.6.0dev or later */
+
+#        ifdef HAVE_RUBY_RACTOR_H /* Ruby 3.0+ */
+extern MWRAP_TSD void * __attribute__((weak)) ruby_current_ec;
+#        else /* Ruby 2.6-2.7 */
+extern void * __attribute__((weak)) ruby_current_execution_context_ptr;
+#                define ruby_current_ec ruby_current_execution_context_ptr
+#        endif /* HAVE_RUBY_RACTOR_H */
+
+extern void * __attribute__((weak)) ruby_current_vm_ptr; /* for rb_gc_count */
+extern size_t __attribute__((weak)) rb_gc_count(void);
+int __attribute__((weak)) ruby_thread_has_gvl_p(void);
+
+/*
+ * rb_source_location_cstr relies on GET_EC(), and it's possible
+ * to have a native thread but no EC during the early and late
+ * (teardown) phases of the Ruby process
+ */
static int has_ec_p(void)
{
        /* all of these are weak symbols: NULL unless Ruby is actually loaded */
        return ruby_thread_has_gvl_p && ruby_thread_has_gvl_p() &&
                ruby_current_vm_ptr && ruby_current_ec;
}
+
/*
 * Record the relative "age" of a new allocation.  Under Ruby with a
 * working GC the age is the GC generation; otherwise it is the global
 * cumulative byte counter.  total_bytes_inc is bumped in either case
 * so free() can compute lifespans.
 */
static void set_generation(size_t *gen, size_t size)
{
        if (rb_gc_count) { /* weak symbol, non-NULL once Ruby is loaded */
                uatomic_add_return(&total_bytes_inc, size);
                /* rb_gc_count() needs the GVL and an execution context */
                if (has_ec_p())
                        *gen = rb_gc_count();
                /* else: *gen keeps the caller's initial value (0) */
        } else {
                *gen = uatomic_add_return(&total_bytes_inc, size);
        }
}
+#        define SET_GENERATION(gen, size) set_generation(gen, size)
+#endif /* MWRAP_RUBY */
+
+#ifndef SET_GENERATION
+#        define SET_GENERATION(gen, size) \
+                *gen = uatomic_add_return(&total_bytes_inc, size)
+#endif /* !SET_GENERATION */
+
+/* generic stuff: */
+static MWRAP_TSD size_t locating;
+static struct cds_lfht *files, *totals;
+union padded_mutex {
+        pthread_mutex_t mtx;
+        char pad[64]; /* cache alignment for common CPUs */
+};
+
+/* a pool of mutexes for all "struct src_loc" */
+#define MUTEX_NR   (1 << 6)
+#define MUTEX_MASK (MUTEX_NR - 1)
+static union padded_mutex mutexes[MUTEX_NR] = {
+        [0 ... (MUTEX_NR-1)].mtx = PTHREAD_MUTEX_INITIALIZER
+};
+
+#ifdef static_assert
+/* we only use uint32_t for pathname storage for struct alignment */
+static_assert(UINT32_MAX > PATH_MAX, "UINT32_MAX > PATH_MAX");
+#endif
+
/* create an auto-resizing, lock-free RCU hash table */
static struct cds_lfht *lfht_new(size_t size)
{
        return cds_lfht_new(size, 1, 0, CDS_LFHT_AUTO_RESIZE, 0);
}
+
/*
 * re-initialize the entire src_loc mutex pool.
 * NOTE(review): presumably used by the atfork child handler (see
 * pthread_atfork in mwrap_ctor), where locks held by other threads
 * are unusable in the child — confirm against atfork_child.
 */
static void reset_mutexes(void)
{
        size_t i;

        for (i = 0; i < MUTEX_NR; i++)
                CHECK(int, 0, pthread_mutex_init(&mutexes[i].mtx, 0));
}
+
#ifndef HAVE_MEMPCPY
/* fallback mempcpy(3): copy n bytes, return one past the last written */
static void *my_mempcpy(void *dest, const void *src, size_t n)
{
        memcpy(dest, src, n);
        return (char *)dest + n;
}
#define mempcpy(dst,src,n) my_mempcpy(dst,src,n)
#endif
+
+/* stolen from glibc: */
+#define RETURN_ADDRESS(nr) \
+  __builtin_extract_return_addr(__builtin_return_address(nr))
+
+
+#define SRC_LOC_BT(bt) union stk_bt bt; do { \
+        uint32_t depth = locating ? 1 : CMM_LOAD_SHARED(bt_req_depth); \
+        switch (depth) { \
+        case 0: \
+        case 1: bt.sl.bt_len = 1; bt.sl.bt[0] = RETURN_ADDRESS(0); break; \
+        default: /* skip 1st level of BT since thats our function */ \
+                mwrap_assert(depth <= MWRAP_BT_MAX); \
+                ++locating; \
+                long n = (long)backtrace(bt_dst(&bt), depth); \
+                --locating; \
+                bt.sl.bt_len = n <= 1 ? 0 : (uint32_t)n - 1; \
+                if (n > 1) mwrap_assert(bt.sl.bt[0] == RETURN_ADDRESS(0)); \
+        } \
+} while (0)
+
+/*
+ * only for interpreted sources (Perl/Ruby/etc), not backtrace_symbols* files
+ * Allocated via real_malloc / real_free
+ */
+struct src_file {
+        struct cds_lfht_node nd; /* <=> files table */
+        uint32_t fn_hash;
+        uint32_t fn_len; /* < PATH_MAX */
+        char fn[]; /* NUL-terminated */
+};
+
+/* allocated via real_malloc, immortal for safety reasons */
+struct src_loc {
+        size_t total;
+        size_t freed_bytes;
+        size_t allocations;
+        size_t frees;
+        size_t age_total; /* (age_total / frees) => mean age at free */
+        size_t max_lifespan;
+        struct cds_lfht_node hnode; /* <=> totals table */
+        struct cds_list_head allocs; /* <=> alloc_hdr.node */
+        uint32_t loc_hash;
+        uint8_t bt_len;
+        /* next 3 fields contiguous for hash_src_loc(): */
+        unsigned lineno:24; /* nobody should have >=16.7 LoC in one file */
+        struct src_file *f;
+        void *bt[];
+} __attribute__((packed,aligned(8)));
+
+/* sizeof() doesn't work on bitfields */
+#define SIZEOF_LINENO (size_t)(24 / 8)
+
+/*
+ * Every allocation has this in the header, maintain alignment with malloc
+ * Do not expose this to Perl code because of use-after-free concerns.
+ */
+struct alloc_hdr {
+        struct cds_list_head anode; /* <=> src_loc.allocs */
+        union {
+                struct {
+                        size_t gen; /* global age || rb_gc_count() */
+                        struct src_loc *loc;
+                } live;
+                struct rcu_head dead;
+        } as;
+        void *real; /* what to call real_free on (exists for *memalign) */
+        size_t size;
+};
+
+/* on-stack structures */
+union stk_sf {
+        struct src_file sf;
+        char buf_[sizeof(struct src_file) + PATH_MAX];
+};
+
+union stk_bt {
+        struct src_loc sl;
+        /* we subtract one level from MWRAP_BT_MAX since we discard one
+         * level of backtrace(3) (see below for why) */
+        char buf_[sizeof(struct src_loc) + sizeof(void *) * (MWRAP_BT_MAX-1)];
+};
+
+/*
+ * we discard the 1st-level of the backtrace(3) since it's our *alloc
+ * function (and therefore uninteresting), so we want backtrace(3) to
+ * write to bt->sl.bt[-1] so that bt->sl.bt[0] is the first interesting
+ * thing.
+ */
+#ifdef static_assert
+static_assert(offsetof(struct src_loc, f) + sizeof(void *) ==
+                offsetof(struct src_loc, bt),
+                "bt lineno is is bt[-1]");
+#endif
+static void **bt_dst(union stk_bt *bt)
+{
+        return (void **)&bt->sl.f;
+}
+
/* every tracked user pointer is immediately preceded by its header */
static struct alloc_hdr *ptr2hdr(void *p)
{
        return (struct alloc_hdr *)((uintptr_t)p - sizeof(struct alloc_hdr));
}
+
/* inverse of ptr2hdr: the user-visible pointer following a header */
static void *hdr2ptr(struct alloc_hdr *h)
{
        return (void *)((uintptr_t)h + sizeof(struct alloc_hdr));
}
+
/* f == NULL means a raw C return address, not an interpreted file:line */
static int loc_is_addr(const struct src_loc *l)
{
        return l->f == NULL;
}
+
/* size in bytes of the stored backtrace array */
static size_t bt_bytelen(const struct src_loc *l)
{
        return sizeof(l->bt[0]) * l->bt_len;
}
+
/* length of the identity region: 24-bit lineno + f pointer + bt[] */
static size_t src_loc_hash_len(const struct src_loc *l)
{
        return sizeof(l->f) + SIZEOF_LINENO + bt_bytelen(l);
}
+
/* start of the identity region: the byte right after bt_len */
static void *src_loc_hash_tip(const struct src_loc *l)
{
        return (void *)((uintptr_t)&l->bt_len + sizeof(l->bt_len));
}
+
/* cds_lfht comparator: do two src_locs describe the same call site? */
static int loc_eq(struct cds_lfht_node *node, const void *key)
{
        const struct src_loc *existing;
        const struct src_loc *k = key;

        existing = caa_container_of(node, struct src_loc, hnode);

        /* compare the contiguous (lineno, f, bt[]) identity region */
        return (k->bt_len == existing->bt_len &&
                !memcmp(src_loc_hash_tip(k), src_loc_hash_tip(existing),
                        src_loc_hash_len(k)));
}
+
/* cds_lfht comparator for the files table: match by length + bytes */
static int fn_eq(struct cds_lfht_node *node, const void *key)
{
        const struct src_file *existing;
        const struct src_file *k = key;

        existing = caa_container_of(node, struct src_file, nd);

        return (k->fn_len == existing->fn_len &&
                !memcmp(k->fn, existing->fn, k->fn_len));
}
+
/* find an existing src_loc matching @k in @t; caller holds rcu_read_lock */
static struct src_loc *src_loc_get(struct cds_lfht *t, const struct src_loc *k)
{
        struct cds_lfht_iter iter;
        struct cds_lfht_node *cur;

        mwrap_assert(rcu_read_ongoing());
        cds_lfht_lookup(t, k->loc_hash, loc_eq, k, &iter);
        cur = cds_lfht_iter_get_node(&iter);
        return cur ? caa_container_of(cur, struct src_loc, hnode) : NULL;
}
+
+static struct src_loc *totals_add_rcu(const struct src_loc *k)
+{
+        struct src_loc *l;
+        struct cds_lfht *t = CMM_LOAD_SHARED(totals);
+        if (!t) return NULL;
+
+again:
+        l = src_loc_get(t, k);
+        if (l) {
+                uatomic_add(&l->total, k->total);
+                uatomic_inc(&l->allocations);
+        } else {
+                size_t n = bt_bytelen(k) + sizeof(*k);
+                struct cds_lfht_node *cur;
+
+                l = real_malloc(n);
+                if (!l) return l;
+                memcpy(l, k, n);
+                l->freed_bytes = 0;
+                l->age_total = 0;
+                l->max_lifespan = 0;
+                l->freed_bytes = 0;
+                l->frees = 0;
+                l->allocations = 1;
+                CDS_INIT_LIST_HEAD(&l->allocs);
+                cur = cds_lfht_add_unique(t, l->loc_hash, loc_eq, l, &l->hnode);
+                if (cur == &l->hnode) {
+                        uatomic_inc(&nr_src_loc);
+                } else { /* lost race */
+                        rcu_read_unlock();
+                        real_free(l);
+                        rcu_read_lock();
+                        goto again;
+                }
+        }
+        return l;
+}
+
/*
 * hash @len bytes of @p down to 32 bits; rculfhash only supports
 * 32-bit hash values, so take one half of XXH3's 64-bit result
 * (see comment near the xxhash include for why XXH3 over jhash).
 */
static uint32_t do_hash(const void *p, size_t len)
{
#if defined(XXH3_64bits)
        union {
                XXH64_hash_t u64;
                uint32_t u32[2];
        } u;
        u.u64 = XXH3_64bits(p, len);
        return u.u32[1];
#else
        return jhash(p, len, 0xdeadbeef);
#endif
}
+
/* cache the hash of the (lineno, f, bt[]) identity region in @l */
static void hash_src_loc(struct src_loc *l)
{
        l->loc_hash = do_hash(src_loc_hash_tip(l), src_loc_hash_len(l));
}
+
/*
 * Look up @fn in table @t.  Side effect: @k is fully initialized
 * (length, NUL-terminated copy, hash) so a caller missing the lookup
 * can memcpy @k straight into a heap node (see assign_line).
 * Returns NULL on miss or fn_len >= PATH_MAX; caller holds
 * rcu_read_lock.
 */
static struct src_file *src_file_get(struct cds_lfht *t, struct src_file *k,
                                        const char *fn, size_t fn_len)
{
        struct cds_lfht_iter iter;
        struct cds_lfht_node *cur;

        mwrap_assert(t); /* caller should've bailed if missing */
        if (fn_len >= PATH_MAX)
                return NULL;
        k->fn_len = (uint32_t)fn_len;
        memcpy(k->fn, fn, fn_len);
        k->fn[fn_len] = 0;
        k->fn_hash = do_hash(k->fn, fn_len);
        mwrap_assert(rcu_read_ongoing());
        cds_lfht_lookup(t, k->fn_hash, fn_eq, k, &iter);
        cur = cds_lfht_iter_get_node(&iter);

        return cur ? caa_container_of(cur, struct src_file, nd) : NULL;
}
+
+#if MWRAP_PERL
/* return the current Perl COP (control op) for this thread, if any */
static const COP *mwp_curcop(void)
{
        if (&PL_thr_key) { /* weak; non-NULL only inside a Perl process */
#        ifdef PERL_IMPLICIT_CONTEXT
                if (aTHX) return PL_curcop;
#        else /* !PERL_IMPLICIT_CONTEXT */
                /* single-context Perl: only trust PL_curcop on its thread */
                if (&locating == root_locating) return PL_curcop;
#        endif /* PERL_IMPLICIT_CONTEXT */
        }
        return NULL;
}
+
/* return the current Perl source file (or NULL), storing its line in
 * *lineno; *lineno is untouched when NULL is returned */
static const char *mw_perl_src_file_cstr(unsigned *lineno)
{
        const COP *cop = mwp_curcop();
        if (!cop) return NULL;
        const char *fn = CopFILE(cop);
        if (!fn) return NULL;
        *lineno = CopLINE(cop);
        return fn;
}
+#        define SRC_FILE_CSTR(lineno) mw_perl_src_file_cstr(lineno)
+#endif /* MWRAP_PERL */
+
+#if MWRAP_RUBY
/* return the current Ruby source file (or NULL), storing its line in
 * *lineno; UINT_MAX marks an unknown line (cf. assign_line) */
static const char *mw_ruby_src_file_cstr(unsigned *lineno)
{
        if (!has_ec_p()) return NULL; /* no EC during startup/teardown */
        int line;
        const char *fn = rb_source_location_cstr(&line);
        *lineno = line < 0 ? UINT_MAX : (unsigned)line;
        return fn;
}
+#        define SRC_FILE_CSTR(lineno) mw_ruby_src_file_cstr(lineno)
+#endif /* MWRAP_RUBY */
+
+#ifndef SRC_FILE_CSTR /* for C-only compilation */
+#        define SRC_FILE_CSTR(lineno)        (NULL)
+#endif /* !SRC_FILE_CSTR */
+
/*
 * Resolve (fn, lineno) to an interned src_file and finish populating
 * @sl so it can be merged into the totals table.  Called under
 * rcu_read_lock (briefly dropped on an insertion race).  Returns the
 * canonical src_loc, or NULL on allocation failure.
 */
static struct src_loc *assign_line(size_t size, struct src_loc *sl,
                                const char *fn, unsigned lineno)
{
        struct src_file *f;
        union stk_sf sf;
        struct cds_lfht_node *cur;
        struct cds_lfht *t = CMM_LOAD_SHARED(files);

        mwrap_assert(t);

        size_t len = strlen(fn);
        if (len >= PATH_MAX)
                len = PATH_MAX - 1; /* truncate overlong paths */

        if (lineno == UINT_MAX) { /* NOLINE in Perl is UINT_MAX */
                lineno = U24_MAX;
        } else if (lineno > U24_MAX) { /* only 24 bits of storage */
                fprintf(stderr,
                        "%s:%u line number exceeds limit (%u), capped\n",
                        fn, lineno, U24_MAX);
                lineno = U24_MAX;
        }
again:
        /* src_file_get also initializes sf.sf for the memcpy below */
        f = src_file_get(t, &sf.sf, fn, len);
        if (!f) { /* doesn't exist, add a new one */
                f = real_malloc(sizeof(*f) + len + 1);
                if (!f) return NULL;
                memcpy(f, &sf.sf, sizeof(*f) + len + 1);
                cur = cds_lfht_add_unique(t, f->fn_hash, fn_eq, f, &f->nd);
                if (cur == &f->nd) {
                        uatomic_inc(&nr_file);
                } else { /* lost race */
                        rcu_read_unlock();
                        real_free(f);
                        rcu_read_lock();
                        goto again;
                }
        }

        sl->total = size;
        sl->f = f;
        sl->lineno = lineno;
        /* file:line identifies the site; drop the C backtrace unless
         * the user explicitly requested one (f is always non-NULL here) */
        if (f && !bt_req_depth)
                sl->bt_len = 0;
        hash_src_loc(sl);
        return totals_add_rcu(sl);
}
+
+static struct src_loc *
+update_stats_rcu_lock(size_t *gen, size_t size, struct src_loc *sl)
+{
+        struct cds_lfht *t = CMM_LOAD_SHARED(totals);
+        struct src_loc *ret = NULL;
+
+        if (caa_unlikely(!t)) return 0; /* not initialized */
+        if (locating++) goto out; /* do not recurse into another *alloc */
+
+        SET_GENERATION(gen, size);
+
+        unsigned lineno;
+        const char *fn = SRC_FILE_CSTR(&lineno);
+
+        rcu_read_lock();
+        if (fn)
+                ret = assign_line(size, sl, fn, lineno);
+        if (!ret) { /* no associated Perl|Ruby code, just C/C++ */
+                sl->total = size;
+                sl->f = NULL;
+                sl->lineno = 0;
+                hash_src_loc(sl);
+                ret = totals_add_rcu(sl);
+        }
+out:
+        --locating;
+        return ret;
+}
+
/* report the size the caller originally requested for @p
 * (only valid for pointers obtained from this file's allocators) */
size_t malloc_usable_size(void *p)
{
        return ptr2hdr(p)->size;
}
+
/* call_rcu callback: release the real allocation after a grace period */
static void free_hdr_rcu(struct rcu_head *dead)
{
        struct alloc_hdr *h = caa_container_of(dead, struct alloc_hdr, as.dead);
        real_free(h->real);
}
+
/* lock the mutex-pool shard for @l (sharded by its cached hash) */
static pthread_mutex_t *src_loc_mutex_lock(const struct src_loc *l)
{
        pthread_mutex_t *mtx = &mutexes[l->loc_hash & MUTEX_MASK].mtx;
        CHECK(int, 0, pthread_mutex_lock(mtx));
        return mtx;
}
+
/*
 * free(3) replacement: update per-site statistics, then defer the
 * actual release via RCU so concurrent readers of the allocs lists
 * stay safe.
 */
void free(void *p)
{
        if (p) {
                struct alloc_hdr *h = ptr2hdr(p);
                struct src_loc *l = h->as.live.loc;

                if (l) { /* tracked allocation */
                        size_t current_bytes = uatomic_read(&total_bytes_inc);
                        /* age in bytes allocated since this was created
                         * (or GC counts under Ruby, see SET_GENERATION) */
                        size_t age = current_bytes - h->as.live.gen;
                        uatomic_add(&total_bytes_dec, h->size);
                        uatomic_add(&l->freed_bytes, h->size);
                        /* malloc_usable_size() reports 0 from here on */
                        uatomic_set(&h->size, 0);
                        uatomic_inc(&l->frees);
                        uatomic_add(&l->age_total, age);

                        pthread_mutex_t *mtx = src_loc_mutex_lock(l);
                        cds_list_del_rcu(&h->anode);
                        if (age > l->max_lifespan)
                                l->max_lifespan = age;
                        CHECK(int, 0, pthread_mutex_unlock(mtx));

                        /* the header doubles as the rcu_head */
                        call_rcu(&h->as.dead, free_hdr_rcu);
                } else { /* untracked (stats tables unavailable): free now */
                        real_free(h->real);
                }
        }
}
+
/*
 * Fill in the header of a new allocation and link it into its
 * src_loc's allocs list.  update_stats_rcu_lock returns non-NULL with
 * the RCU read lock held; it is released here after linking.
 * NOTE(review): verify update_stats_rcu_lock releases the read lock
 * on all of its NULL-returning paths.
 */
static void
alloc_insert_rcu(struct src_loc *sl, struct alloc_hdr *h, size_t size,
                void *real)
{
        h->size = size;
        h->real = real; /* may differ from h for the *memalign family */
        size_t gen = 0;
        struct src_loc *l = update_stats_rcu_lock(&gen, size, sl);
        h->as.live.loc = l;
        h->as.live.gen = gen;
        if (l) {
                pthread_mutex_t *mtx = src_loc_mutex_lock(l);
                cds_list_add_rcu(&h->anode, &l->allocs);
                CHECK(int, 0, pthread_mutex_unlock(mtx));
                rcu_read_unlock();
        }
}
+
/* true if @ptr is aligned to @alignment (a power of two) */
static bool ptr_is_aligned(void *ptr, size_t alignment)
{
        const uintptr_t mask = (uintptr_t)(alignment - 1);

        return !((uintptr_t)ptr & mask);
}
+
/* round @ptr up to the next multiple of @alignment (a power of two) */
static void *ptr_align(void *ptr, size_t alignment)
{
        const uintptr_t mask = (uintptr_t)(alignment - 1);
        uintptr_t addr = (uintptr_t)ptr;

        return (void *)((addr + mask) & ~mask);
}
+
/*
 * true iff @n is a power of two.  The original `(n & (n - 1)) == 0'
 * wrongly returned true for 0; 0 is not a power of two.  The only
 * caller (mwrap_memalign) already rejects 0 separately, so this
 * tightening cannot change existing behavior.
 */
static bool is_power_of_two(size_t n)
{
        return n != 0 && (n & (n - 1)) == 0;
}
+
/*
 * Core of memalign/posix_memalign/aligned_alloc/valloc/pvalloc.
 * On success stores the aligned pointer in *pp and returns 0;
 * otherwise returns EINVAL/ENOMEM without touching errno (each entry
 * point decides how to report).
 */
static int
mwrap_memalign(void **pp, size_t alignment, size_t size, struct src_loc *sl)
{
        void *real;
        size_t asize;
        size_t d = alignment / sizeof(void*);
        size_t r = alignment % sizeof(void*);

        /* posix_memalign(3): alignment must be a power-of-two multiple
         * of sizeof(void *) */
        if (r != 0 || d == 0 || !is_power_of_two(d))
                return EINVAL;

        if (alignment <= MALLOC_ALIGNMENT) { /* plain malloc suffices */
                /* NOTE(review): this takes the interposed malloc's own
                 * backtrace rather than @sl — confirm intended */
                void *p = malloc(size);
                if (!p) return ENOMEM;
                *pp = p;
                return 0;
        }
        for (; alignment < sizeof(struct alloc_hdr); alignment *= 2)
                ; /* double alignment until >= sizeof(struct alloc_hdr) */
        /* over-allocate so a header always fits below the aligned ptr */
        if (__builtin_add_overflow(size, alignment, &asize) ||
            __builtin_add_overflow(asize, sizeof(struct alloc_hdr), &asize))
                return ENOMEM;

        real = real_malloc(asize);
        if (real) {
                void *p = hdr2ptr(real);
                if (!ptr_is_aligned(p, alignment))
                        p = ptr_align(p, alignment);
                struct alloc_hdr *h = ptr2hdr(p);
                /* h->real = real lets free() find the true base */
                alloc_insert_rcu(sl, h, size, real);
                *pp = p;
        }

        return real ? 0 : ENOMEM;
}
+
/* shared tail for the errno-setting (non-posix_memalign) entry points */
static void *memalign_result(int err, void *p)
{
        if (caa_unlikely(err))
                errno = err;
        return p;
}
+
/* memalign(3) (also aliased as aligned_alloc): sets errno on failure */
void *memalign(size_t alignment, size_t size)
{
        void *p = NULL;
        SRC_LOC_BT(bt); /* capture caller location/backtrace */
        int err = mwrap_memalign(&p, alignment, size, &bt.sl);
        return memalign_result(err, p);
}
+
/* posix_memalign(3): returns the error code, never touches errno */
int posix_memalign(void **p, size_t alignment, size_t size)
{
        SRC_LOC_BT(bt); /* capture caller location/backtrace */
        return mwrap_memalign(p, alignment, size, &bt.sl);
}
+
+/* these aliases aren't needed for glibc, not sure about other libcs... */
+void *aligned_alloc(size_t, size_t) __attribute__((alias("memalign")));
+void cfree(void *) __attribute__((__nothrow__))
+                __attribute__((__leaf__)) __attribute__((alias("free")));
+
/* valloc(3): page-aligned allocation (obsolete API, kept for compat) */
void *valloc(size_t size)
{
        ensure_initialization(); /* mparams.page_size must be valid */
        SRC_LOC_BT(bt); /* capture caller location/backtrace */
        void *p = NULL;
        int err = mwrap_memalign(&p, mparams.page_size, size, &bt.sl);
        return memalign_result(err, p);
}
+
+#if __GNUC__ < 7
+#  define add_overflow_p(a,b) __extension__({ \
+                __typeof__(a) _c; \
+                __builtin_add_overflow(a,b,&_c); \
+        })
+#else
+#  define add_overflow_p(a,b) \
+                __builtin_add_overflow_p((a),(b),(__typeof__(a+b))0)
+#endif
+
/* round @size up to a multiple of @alignment (a power of two) */
static size_t size_align(size_t size, size_t alignment)
{
        size_t mask = alignment - 1;

        return (size + mask) & ~mask;
}
+
/* pvalloc(3): like valloc, but size is rounded up to a page multiple */
void *pvalloc(size_t size)
{
        void *p = NULL;

        ensure_initialization(); /* mparams.page_size must be valid */

        /* reject sizes whose round-up would overflow */
        if (add_overflow_p(size, mparams.page_size)) {
                errno = ENOMEM;
                return 0;
        }
        size = size_align(size, mparams.page_size);
        SRC_LOC_BT(bt); /* capture caller location/backtrace */
        int err = mwrap_memalign(&p, mparams.page_size, size, &bt.sl);
        return memalign_result(err, p);
}
+
/*
 * malloc(3) replacement: prepend a struct alloc_hdr to every
 * allocation and attribute it to its call site.
 */
void *malloc(size_t size)
{
        size_t asize;

        /* header + payload must not overflow size_t */
        if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize))
                goto enomem;

        void *p = real_malloc(asize);
        if (p) {
                SRC_LOC_BT(bt); /* capture caller location/backtrace */
                struct alloc_hdr *h = p;
                alloc_insert_rcu(&bt.sl, h, size, h);
                return hdr2ptr(h);
        }
enomem:
        errno = ENOMEM;
        return 0;
}
+
/* calloc(3) replacement: overflow-checked nmemb * size, zero-filled */
void *calloc(size_t nmemb, size_t size)
{
        size_t asize;

        if (__builtin_mul_overflow(size, nmemb, &size))
                goto enomem;
        if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize))
                goto enomem;
        void *p = real_malloc(asize);
        if (p) {
                struct alloc_hdr *h = p;
                SRC_LOC_BT(bt); /* capture caller location/backtrace */
                alloc_insert_rcu(&bt.sl, h, size, h);
                return memset(hdr2ptr(h), 0, size);
        }
enomem:
        errno = ENOMEM;
        return 0;
}
+
/*
 * realloc(3) replacement: always allocates fresh space and copies
 * (never resizes in place, so the new call site gets attribution).
 * realloc(ptr, 0) behaves as free(ptr) and returns NULL.
 */
void *realloc(void *ptr, size_t size)
{
        size_t asize;

        if (!size) {
                free(ptr);
                return 0;
        }
        if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize))
                goto enomem;
        void *p = real_malloc(asize);
        if (p) {
                struct alloc_hdr *h = p;
                SRC_LOC_BT(bt); /* capture caller location/backtrace */
                alloc_insert_rcu(&bt.sl, h, size, h);
                p = hdr2ptr(h);
                if (ptr) { /* copy the smaller of the old/new sizes */
                        struct alloc_hdr *old = ptr2hdr(ptr);
                        memcpy(p, ptr, old->size < size ? old->size : size);
                        free(ptr);
                }
                return p;
        }
enomem:
        errno = ENOMEM;
        return 0;
}
+
+struct dump_arg {
+        FILE *fp;
+        size_t min;
+};
+
/*
 * backtrace_symbols(3) wrapper; the FreeBSD variant is told to mimic
 * glibc's "file(sym+off) [addr]" output so extract_addr() can parse
 * either.  Caller frees the returned array.
 */
char **bt_syms(void * const *addrlist, uint32_t size)
{
        mwrap_assert(size < INT_MAX); /* backtrace_symbols takes int */
#if defined(__GLIBC__)
        char **s = backtrace_symbols(addrlist, size);
#else /* make FreeBSD look like glibc output: */
        char **s = backtrace_symbols_fmt(addrlist, size, "%f(%n%D) [%a]");
#endif
        if (!s) fprintf(stderr, "backtrace_symbols: %m\n");
        return s;
}
+
/* supported by modern gcc + clang */
#define AUTO_FREE __attribute__((__cleanup__(cleanup_free)))
/* __cleanup__ handler: receives the address of the dying pointer */
static void cleanup_free(void *any)
{
        free(*(void **)any);
}
+
/*
 * Write one "total allocations location" line to a->fp for every
 * src_loc whose byte total exceeds a->min.  Always returns NULL
 * (pointer return presumably to fit a callback signature — confirm).
 */
static void *dump_to_file(struct dump_arg *a)
{
        struct cds_lfht_iter iter;
        struct src_loc *l;
        struct cds_lfht *t;

        ++locating; /* don't track allocations we make while dumping */
        rcu_read_lock();
        t = CMM_LOAD_SHARED(totals);
        if (!t)
                goto out_unlock;

        cds_lfht_for_each_entry(t, &iter, l, hnode) {
                if (l->total <= a->min) continue;

                if (loc_is_addr(l)) { /* raw C return address */
                        AUTO_FREE char **s = bt_syms(l->bt, 1);

                        if (s)
                                fprintf(a->fp, "%16zu %12zu %s\n",
                                        l->total, l->allocations, s[0]);
                } else { /* interpreted (Perl/Ruby) file:line */
                        fprintf(a->fp, "%16zu %12zu %s:%u\n",
                                l->total, l->allocations, l->f->fn, l->lineno);
                }
        }
out_unlock:
        rcu_read_unlock();
        --locating;
        return 0;
}
+
/*
 * extract the trailing "[0xADDR]" token produced by bt_syms(), e.g.
 * str = "/path/to/foo.so(+0x123) [0xdeadbeefcafe]".
 * Returns 1 and stores the address in *p on success, 0 otherwise.
 */
static int extract_addr(const char *str, size_t len, void **p)
{
        unsigned long x;
        char *e;
        const char *end = str + len;
        const char *c = memrchr(str, '[', len);

        /* expect "[0x<hex>]" as the final token */
        if (c && (c + 2) < end && c[1] == '0' && c[2] == 'x') {
                errno = 0;
                x = strtoul(c + 3, &e, 16);
                if (!errno && *e == ']') {
                        *p = (void *)x;
                        return 1;
                }
        }
        return 0;
}
+
/*
 * look up the src_loc for an interpreted call site.
 * str is $PATHNAME:$LINENO, len is strlen(str).
 */
static struct src_loc *src_loc_lookup(const char *str, size_t len)
{
        char *c = memrchr(str, ':', len);
        const char *end = str + len;
        unsigned lineno;
        struct src_loc *l = NULL;
        struct cds_lfht *t = CMM_LOAD_SHARED(files);
        union stk_sf sf;

        if (!c || c == end || !t)
                return NULL;

        size_t fn_len = c - str;
        c++;
        if (*c == '-') { /* "-" denotes an unknown line (U24_MAX) */
                lineno = U24_MAX;
        } else {
                /* strictly-decimal line number, else no match */
                lineno = 0;
                for (; c < end; c++) {
                        if (*c < '0' || *c > '9')
                                return NULL;
                        lineno *= 10;
                        lineno += (*c - '0');
                }
                if (lineno > U24_MAX) /* only 24 bits are stored */
                        return NULL;
        }
        rcu_read_lock();
        struct src_file *f = src_file_get(t, &sf.sf, str, fn_len);
        t = CMM_LOAD_SHARED(totals);
        if (f && t) {
                struct src_loc k;

                k.f = f;
                k.lineno = lineno;
                k.bt_len = 0; /* file:line entries carry no C backtrace */
                hash_src_loc(&k);
                l = src_loc_get(t, &k);
        }
        rcu_read_unlock();
        return l;
}
+
+#ifndef O_CLOEXEC
+#  define O_CLOEXEC 0
+#endif
static void h1d_atexit(void);
/*
 * destructor: dump statistics at process exit according to the $MWRAP
 * environment variable ("dump_path:FILE", "dump_fd:N", "dump_min:N").
 */
__attribute__ ((destructor)) static void mwrap_dtor(void)
{
        const char *opt = getenv("MWRAP");
        const char *modes[] = { "a", "a+", "w", "w+", "r+" };
        struct dump_arg a = { .min = 0 };
        size_t i;
        int dump_fd;
        char *dump_path;
        char *s;

        /* n.b. unsetenv("MWRAP") may be called, so run this unconditionally */
        h1d_atexit();

        if (!opt)
                return;

        ++locating; /* don't track allocations made while dumping */
        /* sizeof("dump_path") counts the NUL, so it skips the ':' too */
        if ((dump_path = strstr(opt, "dump_path:")) &&
                        (dump_path += sizeof("dump_path")) &&
                        *dump_path) {
                char *end = strchr(dump_path, ','); /* more options after? */
                char buf[PATH_MAX];
                if (end) {
                        mwrap_assert((end - dump_path) < (intptr_t)sizeof(buf));
                        end = mempcpy(buf, dump_path, end - dump_path);
                        *end = 0;
                        dump_path = buf;
                }
                dump_fd = open(dump_path, O_CLOEXEC|O_WRONLY|O_APPEND|O_CREAT,
                                0666);
                if (dump_fd < 0) {
                        fprintf(stderr, "open %s failed: %m\n", dump_path);
                        goto out;
                }
        }
        else if (!sscanf(opt, "dump_fd:%d", &dump_fd))
                goto out; /* neither dump_path nor a leading dump_fd given */

        if ((s = strstr(opt, "dump_min:")))
                sscanf(s, "dump_min:%zu", &a.min);

        switch (dump_fd) {
        case 0: goto out; /* writing to stdin makes no sense */
        case 1: a.fp = stdout; break;
        case 2: a.fp = stderr; break;
        default:
                if (dump_fd < 0)
                        goto out;
                a.fp = 0;

                /* try successively laxer stdio modes until one works */
                for (i = 0; !a.fp && i < 5; i++)
                        a.fp = fdopen(dump_fd, modes[i]);

                if (!a.fp) {
                        fprintf(stderr, "failed to open fd=%d: %m\n", dump_fd);
                        goto out;
                }
                /* we'll leak some memory here, but this is a destructor */
        }
        dump_to_file(&a);
out:
        --locating;
}
+
/* zero the global byte counters and every per-call-site counter */
static void mwrap_reset(void)
{
        struct cds_lfht *t;
        struct cds_lfht_iter iter;
        struct src_loc *l;

        uatomic_set(&total_bytes_inc, 0);
        uatomic_set(&total_bytes_dec, 0);

        rcu_read_lock();
        t = CMM_LOAD_SHARED(totals);
        if (t)
                cds_lfht_for_each_entry(t, &iter, l, hnode) {
                        uatomic_set(&l->total, 0);
                        uatomic_set(&l->allocations, 0);
                        uatomic_set(&l->frees, 0);
                        uatomic_set(&l->freed_bytes, 0);
                        uatomic_set(&l->age_total, 0);
                        uatomic_set(&l->max_lifespan, 0);
                }
        rcu_read_unlock();
}
+
/*
 * resolve a human-readable location string to its src_loc: either a
 * bt_syms-formatted "... [0xADDR]" C frame, or "$PATH:$LINENO" for
 * interpreted code.
 */
static inline struct src_loc *mwrap_get(const char *str, size_t len)
{
        void *p;

        if (!extract_addr(str, len, &p))
                return src_loc_lookup(str, len); /* $PATH:$LINENO form */

        /* build a 1-frame key shaped like what SRC_LOC_BT stores */
        union stk_bt k;
        struct cds_lfht *t = CMM_LOAD_SHARED(totals);

        if (!t) return NULL;
        k.sl.f = NULL;
        k.sl.lineno = 0;
        k.sl.bt[0] = p;
        k.sl.bt_len = 1;
        hash_src_loc(&k.sl);
        rcu_read_lock();
        struct src_loc *l = src_loc_get(t, &k.sl);
        rcu_read_unlock();
        return l;
}
+
/*
 * resolve a binary lookup key — the raw src_loc_hash_tip() region
 * (24-bit lineno + file pointer + backtrace array) — to its src_loc.
 * Returns NULL on implausible lengths or a missed lookup.
 */
static struct src_loc *mwrap_get_bin(const char *buf, size_t len)
{
        static const size_t min_len = sizeof(struct src_file *) + SIZEOF_LINENO;

        if (len >= min_len && ((len - min_len) % sizeof(void *)) == 0) {
                struct cds_lfht *t = CMM_LOAD_SHARED(totals);
                if (!t) return NULL;

                union stk_bt k;
                size_t bt_len = (len - min_len) / sizeof(void *);

                /* bound also keeps the uint8_t bt_len narrowing safe */
                if (bt_len > MWRAP_BT_MAX)
                        return NULL;
                k.sl.bt_len = bt_len;

                memcpy(src_loc_hash_tip(&k.sl), buf, len);
                hash_src_loc(&k.sl);
                rcu_read_lock();
                struct src_loc *l = src_loc_get(t, &k.sl);
                rcu_read_unlock();
                return l;
        }
        return NULL;
}
+
+static const char *mwrap_env;
+#include "httpd.h"
+
/*
 * constructor: initialize the allocator, TSD, lock-free tables and
 * background helpers before any tracked allocation can happen.
 */
__attribute__((constructor)) static void mwrap_ctor(void)
{
        sigset_t set, old;
        struct alloc_hdr *h;
        mwrap_env = getenv("MWRAP");

        ++locating; /* don't track our own startup allocations */

        /* block signals so handlers never see half-initialized state */
        CHECK(int, 0, sigfillset(&set));
        CHECK(int, 0, pthread_sigmask(SIG_SETMASK, &set, &old));
        ensure_initialization();
        CHECK(int, 0, pthread_key_create(&tlskey, mstate_tsd_dtor));

        /* initialize mutexes used by urcu-bp */
        CMM_STORE_SHARED(files, lfht_new(256));
        if (!CMM_LOAD_SHARED(files))
                fprintf(stderr, "failed to allocate files table\n");
        CMM_STORE_SHARED(totals, lfht_new(16384));
        if (!CMM_LOAD_SHARED(totals))
                fprintf(stderr, "failed to allocate totals table\n");
        h = real_malloc(sizeof(struct alloc_hdr));
        if (h) { /* force call_rcu to start background thread */
                h->real = h;
                call_rcu(&h->as.dead, free_hdr_rcu);
        } else
                fprintf(stderr, "malloc: %m\n");

        h1d_start(); /* see httpd.h */
        CHECK(int, 0, pthread_sigmask(SIG_SETMASK, &old, NULL));
        CHECK(int, 0, pthread_atfork(atfork_prepare, atfork_parent,
                                     atfork_child));

        /* parse "bt:DEPTH" (clamped to MWRAP_BT_MAX) from $MWRAP */
        if (mwrap_env) {
                const char *bt = strstr(mwrap_env, "bt:");
                if (bt) {
                        bt += sizeof("bt"); /* sizeof counts NUL: skips ':' */
                        errno = 0;
                        char *end;
                        unsigned long n = strtoul(bt, &end, 10);
                        if (n && !errno && (*end == ',' || *end == 0)) {
                                if (n > MWRAP_BT_MAX)
                                        n = MWRAP_BT_MAX;
                                CMM_STORE_SHARED(bt_req_depth, (uint32_t)n);
                        }
                }
        }
        --locating;
}