diff options
Diffstat (limited to 'mwrap_core.h')
-rw-r--r-- | mwrap_core.h | 751 |
1 file changed, 751 insertions, 0 deletions
diff --git a/mwrap_core.h b/mwrap_core.h new file mode 100644 index 0000000..09b579d --- /dev/null +++ b/mwrap_core.h @@ -0,0 +1,751 @@ +/* + * Copyright (C) mwrap hackers <mwrap-perl@80x24.org> + * License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt> + * Disclaimer: I don't really know my way around XS or Perl internals well + */ +#define _LGPL_SOURCE /* allows URCU to inline some stuff */ +#include "mymalloc.h" /* includes dlmalloc_c.h */ +#ifndef MWRAP_PERL +# define MWRAP_PERL 0 +#endif + +#if MWRAP_PERL +# include "EXTERN.h" +# include "perl.h" +# include "XSUB.h" +# include "embed.h" +# include "ppport.h" +# ifndef MWRAP_EARLY_THREADS +# define MWRAP_EARLY_THREADS 1 +# endif +#endif + +/* + * Start URCU threads early for runtimes (e.g. Perl) which leave + * signals unblocked. This isn't needed for (C)Ruby since it + * currently runs with all signals blocked. + * Needed for URCU prior to commit ea3a28a3f71dd02fb34ed4e3108f93275dbef89a + * ("Disable signals in URCU background threads" 2022-09-23) + */ +#ifndef MWRAP_EARLY_THREADS +# define MWRAP_EARLY_THREADS 0 +#endif + +#include <execinfo.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <dlfcn.h> +#include <assert.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <pthread.h> +#include <signal.h> +#include <urcu-bp.h> +#include <urcu/rculfhash.h> +#include <urcu/rculist.h> +#include "jhash.h" + +/* + * Perl doesn't have a GC the same way (C) Ruby does, so no GC count. + * Instead, the relative age of an object is the number of total bytes + * allocated (and we don't care about overflow on 32-bit since + * hardly anybody still uses it). 
+ */ +static size_t total_bytes_inc, total_bytes_dec; + +#if MWRAP_PERL +extern pthread_key_t __attribute__((weak)) PL_thr_key; +extern const char __attribute__((weak)) PL_memory_wrap[]; /* needed for -O0 */ +#endif + +static MWRAP_TSD size_t locating; +#ifndef PERL_IMPLICIT_CONTEXT +static size_t *root_locating; /* determines if PL_curcop is our thread */ +#endif +static struct cds_lfht *totals; +union padded_mutex { + pthread_mutex_t mtx; + char pad[64]; +}; + +/* a round-robin pool of mutexes */ +#define MUTEX_NR (1 << 6) +#define MUTEX_MASK (MUTEX_NR - 1) +#ifdef __FreeBSD__ +# define STATIC_MTX_INIT_OK (0) +#else /* only tested on Linux + glibc */ +# define STATIC_MTX_INIT_OK (1) +#endif +static size_t mutex_i; +static union padded_mutex mutexes[MUTEX_NR] = { +#if STATIC_MTX_INIT_OK + [0 ... (MUTEX_NR-1)].mtx = PTHREAD_MUTEX_INITIALIZER +#endif +}; + +static pthread_mutex_t *mutex_assign(void) +{ + return &mutexes[uatomic_add_return(&mutex_i, 1) & MUTEX_MASK].mtx; +} + +static struct cds_lfht *lfht_new(void) +{ + unsigned long size = MWRAP_EARLY_THREADS ? 8192 : 16384; + return cds_lfht_new(size, 1, 0, CDS_LFHT_AUTO_RESIZE, 0); +} + +static void reset_mutexes(void) +{ + size_t i; + + for (i = 0; i < MUTEX_NR; i++) + CHECK(int, 0, pthread_mutex_init(&mutexes[i].mtx, 0)); +} + +#ifndef HAVE_MEMPCPY +static void *my_mempcpy(void *dest, const void *src, size_t n) +{ + return (char *)memcpy(dest, src, n) + n; +} +#define mempcpy(dst,src,n) my_mempcpy(dst,src,n) +#endif + +/* stolen from glibc: */ +#define RETURN_ADDRESS(nr) \ + (uintptr_t)(__builtin_extract_return_addr(__builtin_return_address(nr))) + +#define UINT2STR_MAX (sizeof(unsigned) == 4 ? 
10 : 19) +static char *uint2str(unsigned num, char *dst, size_t *size) +{ + if (num <= 9) { + *size -= 1; + *dst++ = (char)(num + '0'); + return dst; + } else { + char buf[UINT2STR_MAX]; + char *end = buf + sizeof(buf); + char *p = end; + size_t adj; + + do { + *size -= 1; + *--p = (char)((num % 10) + '0'); + num /= 10; + } while (num && *size); + + if (!num) { + adj = end - p; + return mempcpy(dst, p, adj); + } + } + return NULL; +} + +/* allocated via real_malloc, immortal for safety reasons */ +struct src_loc { + pthread_mutex_t *mtx; + size_t total; + size_t freed_bytes; + size_t allocations; + size_t frees; + size_t age_total; /* (age_total / frees) => mean age at free */ + size_t max_lifespan; + struct cds_lfht_node hnode; + struct cds_list_head allocs; /* <=> alloc_hdr.node */ + uint32_t hval; + uint32_t capa; + char k[]; +}; + +/* + * Every allocation has this in the header, maintain alignment with malloc + * Do not expose this to Perl code because of use-after-free concerns. + */ +struct alloc_hdr { + struct cds_list_head anode; /* <=> src_loc.allocs */ + union { + struct { + size_t gen; /* global age */ + struct src_loc *loc; + } live; + struct rcu_head dead; + } as; + void *real; /* what to call real_free on (exists for *memalign) */ + size_t size; +}; + +/* $PATHNAME:$LINENO */ +static MWRAP_TSD char kbuf[ + PATH_MAX + sizeof(":") + UINT2STR_MAX + sizeof(struct alloc_hdr) +]; + +static struct alloc_hdr *ptr2hdr(void *p) +{ + return (struct alloc_hdr *)((uintptr_t)p - sizeof(struct alloc_hdr)); +} + +static void *hdr2ptr(struct alloc_hdr *h) +{ + return (void *)((uintptr_t)h + sizeof(struct alloc_hdr)); +} + +static int loc_is_addr(const struct src_loc *l) +{ + return l->capa == 0; +} + +static size_t loc_size(const struct src_loc *l) +{ + return loc_is_addr(l) ? 
sizeof(uintptr_t) : l->capa; +} + +static int loc_eq(struct cds_lfht_node *node, const void *key) +{ + const struct src_loc *existing; + const struct src_loc *k = key; + + existing = caa_container_of(node, struct src_loc, hnode); + + return (k->hval == existing->hval && + k->capa == existing->capa && + memcmp(k->k, existing->k, loc_size(k)) == 0); +} + +static struct src_loc *totals_add_rcu(struct src_loc *k) +{ + struct cds_lfht_iter iter; + struct cds_lfht_node *cur; + struct src_loc *l = 0; + struct cds_lfht *t; + +again: + t = CMM_LOAD_SHARED(totals); + if (!t) goto out_unlock; + cds_lfht_lookup(t, k->hval, loc_eq, k, &iter); + cur = cds_lfht_iter_get_node(&iter); + if (cur) { + l = caa_container_of(cur, struct src_loc, hnode); + uatomic_add(&l->total, k->total); + uatomic_add(&l->allocations, 1); + } else { + size_t n = loc_size(k); + l = real_malloc(sizeof(*l) + n); + if (!l) goto out_unlock; + memcpy(l, k, sizeof(*l) + n); + l->mtx = mutex_assign(); + l->age_total = 0; + l->max_lifespan = 0; + l->frees = 0; + l->allocations = 1; + CDS_INIT_LIST_HEAD(&l->allocs); + cur = cds_lfht_add_unique(t, k->hval, loc_eq, l, &l->hnode); + if (cur != &l->hnode) { /* lost race */ + rcu_read_unlock(); + real_free(l); + rcu_read_lock(); + goto again; + } + } +out_unlock: + return l; +} + +static void update_stats_rcu_unlock(const struct src_loc *l) +{ + if (caa_likely(l)) rcu_read_unlock(); +} + +static const COP *mwp_curcop(void) +{ +#if MWRAP_PERL + if (&PL_thr_key) { /* are we even in a Perl process? 
*/ +# ifdef PERL_IMPLICIT_CONTEXT + if (aTHX) return PL_curcop; +# else /* !PERL_IMPLICIT_CONTEXT */ + if (&locating == root_locating) return PL_curcop; +# endif /* PERL_IMPLICIT_CONTEXT */ + } +#endif /* MWRAP_PERL */ + return NULL; +} + +static struct src_loc *assign_line(size_t size, const char *file, unsigned line) +{ + /* avoid vsnprintf or anything which could call malloc here: */ + size_t len; + struct src_loc *k; + char *dst; + size_t uint_size = UINT2STR_MAX; + + if (!file) + return NULL; + len = strlen(file); + if (len > PATH_MAX) + len = PATH_MAX; + k = (void *)kbuf; + k->total = size; + dst = mempcpy(k->k, file, len); + *dst++ = ':'; + + if (line == UINT_MAX) /* no line number */ + *dst++ = '-'; + else + dst = uint2str(line, dst, &uint_size); + + assert(dst && "bad math"); + *dst = 0; /* terminate string */ + k->capa = (uint32_t)(dst - k->k + 1); + k->hval = jhash(k->k, k->capa, 0xdeadbeef); + return totals_add_rcu(k); +} + +static struct src_loc * +update_stats_rcu_lock(size_t *generation, size_t size, uintptr_t caller) +{ + struct src_loc *k, *ret = 0; + static const size_t xlen = sizeof(caller); + struct cds_lfht *t = CMM_LOAD_SHARED(totals); + const COP *cop = NULL; + + if (caa_unlikely(!t)) return 0; /* not initialized */ + if (locating++) goto out; /* do not recurse into another *alloc */ + + *generation = uatomic_add_return(&total_bytes_inc, size); + cop = mwp_curcop(); + rcu_read_lock(); +#if MWRAP_PERL + if (cop) + ret = assign_line(size, OutCopFILE(cop), CopLINE(cop)); +#endif /* MWRAP_PERL */ + if (!ret) { + k = alloca(sizeof(*k) + xlen); + k->total = size; + memcpy(k->k, &caller, xlen); + k->capa = 0; + k->hval = jhash(k->k, xlen, 0xdeadbeef); + ret = totals_add_rcu(k); + } +out: + --locating; + return ret; +} + +size_t malloc_usable_size(void *p) +{ + return ptr2hdr(p)->size; +} + +static void free_hdr_rcu(struct rcu_head *dead) +{ + struct alloc_hdr *h = caa_container_of(dead, struct alloc_hdr, as.dead); + real_free(h->real); +} + +void 
free(void *p) +{ + if (p) { + struct alloc_hdr *h = ptr2hdr(p); + struct src_loc *l = h->as.live.loc; + + if (l) { + size_t current_bytes = uatomic_read(&total_bytes_inc); + size_t age = current_bytes - h->as.live.gen; + uatomic_add(&total_bytes_dec, h->size); + uatomic_add(&l->freed_bytes, h->size); + uatomic_set(&h->size, 0); + uatomic_add(&l->frees, 1); + uatomic_add(&l->age_total, age); + + CHECK(int, 0, pthread_mutex_lock(l->mtx)); + cds_list_del_rcu(&h->anode); + if (age > l->max_lifespan) + l->max_lifespan = age; + CHECK(int, 0, pthread_mutex_unlock(l->mtx)); + + call_rcu(&h->as.dead, free_hdr_rcu); + } else { + real_free(h->real); + } + } +} + +static void +alloc_insert_rcu(struct src_loc *l, struct alloc_hdr *h, size_t size, + void *real, size_t generation) +{ + /* we need src_loc to remain alive for the duration of this call */ + if (!h) return; + h->size = size; + h->real = real; + h->as.live.loc = l; + h->as.live.gen = generation; + if (l) { + CHECK(int, 0, pthread_mutex_lock(l->mtx)); + cds_list_add_rcu(&h->anode, &l->allocs); + CHECK(int, 0, pthread_mutex_unlock(l->mtx)); + } +} + +static bool ptr_is_aligned(void *ptr, size_t alignment) +{ + return ((uintptr_t) ptr & (alignment - 1)) == 0; +} + +static void *ptr_align(void *ptr, size_t alignment) +{ + return (void *)(((uintptr_t) ptr + (alignment - 1)) & ~(alignment - 1)); +} + +static bool is_power_of_two(size_t n) +{ + return (n & (n - 1)) == 0; +} + +static int +mwrap_memalign(void **pp, size_t alignment, size_t size, uintptr_t caller) +{ + struct src_loc *l; + struct alloc_hdr *h; + void *real; + size_t asize; + size_t generation = 0; + size_t d = alignment / sizeof(void*); + size_t r = alignment % sizeof(void*); + + if (r != 0 || d == 0 || !is_power_of_two(d)) + return EINVAL; + + if (alignment <= MALLOC_ALIGNMENT) { + void *p = malloc(size); + if (!p) return ENOMEM; + *pp = p; + return 0; + } + for (; alignment < sizeof(struct alloc_hdr); alignment *= 2) + ; /* double alignment until >= 
sizeof(struct alloc_hdr) */ + if (__builtin_add_overflow(size, alignment, &asize) || + __builtin_add_overflow(asize, sizeof(struct alloc_hdr), &asize)) + return ENOMEM; + + l = update_stats_rcu_lock(&generation, size, caller); + + real = real_malloc(asize); + if (real) { + void *p = hdr2ptr(real); + if (!ptr_is_aligned(p, alignment)) + p = ptr_align(p, alignment); + h = ptr2hdr(p); + alloc_insert_rcu(l, h, size, real, generation); + *pp = p; + } + update_stats_rcu_unlock(l); + + return real ? 0 : ENOMEM; +} + +static void *memalign_result(int err, void *p) +{ + if (caa_unlikely(err)) + errno = err; + return p; +} + +void *memalign(size_t alignment, size_t size) +{ + void *p = NULL; + int err = mwrap_memalign(&p, alignment, size, RETURN_ADDRESS(0)); + return memalign_result(err, p); +} + +int posix_memalign(void **p, size_t alignment, size_t size) +{ + return mwrap_memalign(p, alignment, size, RETURN_ADDRESS(0)); +} + +/* these aliases aren't needed for glibc, not sure about other libcs... 
*/ +void *aligned_alloc(size_t, size_t) __attribute__((alias("memalign"))); +void cfree(void *) __attribute__((__nothrow__)) + __attribute__((__leaf__)) __attribute__((alias("free"))); + +void *valloc(size_t size) +{ + void *p = NULL; + int err; + + ensure_initialization(); + err = mwrap_memalign(&p, mparams.page_size, + size, RETURN_ADDRESS(0)); + return memalign_result(err, p); +} + +#if __GNUC__ < 7 +# define add_overflow_p(a,b) __extension__({ \ + __typeof__(a) _c; \ + __builtin_add_overflow(a,b,&_c); \ + }) +#else +# define add_overflow_p(a,b) \ + __builtin_add_overflow_p((a),(b),(__typeof__(a+b))0) +#endif + +static size_t size_align(size_t size, size_t alignment) +{ + return ((size + (alignment - 1)) & ~(alignment - 1)); +} + +void *pvalloc(size_t size) +{ + void *p = NULL; + int err; + + ensure_initialization(); + + if (add_overflow_p(size, mparams.page_size)) { + errno = ENOMEM; + return 0; + } + size = size_align(size, mparams.page_size); + err = mwrap_memalign(&p, mparams.page_size, + size, RETURN_ADDRESS(0)); + return memalign_result(err, p); +} + +void *malloc(size_t size) +{ + struct src_loc *l; + struct alloc_hdr *h; + size_t asize; + void *p; + size_t generation = 0; + + if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) + goto enomem; + + l = update_stats_rcu_lock(&generation, size, RETURN_ADDRESS(0)); + p = h = real_malloc(asize); + if (h) { + alloc_insert_rcu(l, h, size, h, generation); + p = hdr2ptr(h); + } + update_stats_rcu_unlock(l); + if (caa_unlikely(!p)) errno = ENOMEM; + return p; +enomem: + errno = ENOMEM; + return 0; +} + +void *calloc(size_t nmemb, size_t size) +{ + void *p; + struct src_loc *l; + struct alloc_hdr *h; + size_t asize; + size_t generation = 0; + + if (__builtin_mul_overflow(size, nmemb, &size)) { + errno = ENOMEM; + return 0; + } + if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) { + errno = ENOMEM; + return 0; + } + l = update_stats_rcu_lock(&generation, size, RETURN_ADDRESS(0)); + p 
= h = real_malloc(asize); + if (p) { + alloc_insert_rcu(l, h, size, h, generation); + p = hdr2ptr(h); + memset(p, 0, size); + } + update_stats_rcu_unlock(l); + if (caa_unlikely(!p)) errno = ENOMEM; + return p; +} + +void *realloc(void *ptr, size_t size) +{ + void *p; + struct src_loc *l; + struct alloc_hdr *h; + size_t asize; + size_t generation = 0; + + if (!size) { + free(ptr); + return 0; + } + if (__builtin_add_overflow(size, sizeof(struct alloc_hdr), &asize)) { + errno = ENOMEM; + return 0; + } + l = update_stats_rcu_lock(&generation, size, RETURN_ADDRESS(0)); + p = h = real_malloc(asize); + if (p) { + alloc_insert_rcu(l, h, size, h, generation); + p = hdr2ptr(h); + } + update_stats_rcu_unlock(l); + + if (ptr && p) { + struct alloc_hdr *old = ptr2hdr(ptr); + memcpy(p, ptr, old->size < size ? old->size : size); + free(ptr); + } + if (caa_unlikely(!p)) errno = ENOMEM; + return p; +} + +struct dump_arg { + FILE *fp; + size_t min; +}; + +static void *dump_to_file(struct dump_arg *a) +{ + struct cds_lfht_iter iter; + struct src_loc *l; + struct cds_lfht *t; + + ++locating; + rcu_read_lock(); + t = CMM_LOAD_SHARED(totals); + if (!t) + goto out_unlock; + cds_lfht_for_each_entry(t, &iter, l, hnode) { + const void *p = l->k; + char **s = 0; + if (l->total <= a->min) continue; + + if (loc_is_addr(l)) { + s = backtrace_symbols(p, 1); + p = s[0]; + } + fprintf(a->fp, "%16zu %12zu %s\n", + l->total, l->allocations, (const char *)p); + if (s) free(s); + } +out_unlock: + rcu_read_unlock(); + --locating; + return 0; +} + +static int extract_addr(const char *str, size_t len, void **p) +{ + const char *c; +#if defined(__GLIBC__) + return ((c = memrchr(str, '[', len)) && sscanf(c, "[%p]", p)); +#else /* TODO: test FreeBSD */ + return ((c = strstr(str, "0x")) && sscanf(c, "%p", p)); +#endif +} + +#ifndef O_CLOEXEC +# define O_CLOEXEC 0 +#endif +__attribute__ ((destructor)) +static void dump_destructor(void) +{ + const char *opt = getenv("MWRAP"); + const char *modes[] = { "a", 
"a+", "w", "w+", "r+" }; + struct dump_arg a = { .min = 0 }; + size_t i; + int dump_fd; + char *dump_path; + char *s; + + if (!opt) + return; + + ++locating; + if ((dump_path = strstr(opt, "dump_path:")) && + (dump_path += sizeof("dump_path")) && + *dump_path) { + char *end = strchr(dump_path, ','); + if (end) { + char *tmp = alloca(end - dump_path + 1); + end = mempcpy(tmp, dump_path, end - dump_path); + *end = 0; + dump_path = tmp; + } + dump_fd = open(dump_path, O_CLOEXEC|O_WRONLY|O_APPEND|O_CREAT, + 0666); + if (dump_fd < 0) { + fprintf(stderr, "open %s failed: %s\n", dump_path, + strerror(errno)); + goto out; + } + } + else if (!sscanf(opt, "dump_fd:%d", &dump_fd)) + goto out; + + if ((s = strstr(opt, "dump_min:"))) + sscanf(s, "dump_min:%zu", &a.min); + + switch (dump_fd) { + case 0: goto out; + case 1: a.fp = stdout; break; + case 2: a.fp = stderr; break; + default: + if (dump_fd < 0) + goto out; + a.fp = 0; + + for (i = 0; !a.fp && i < 5; i++) + a.fp = fdopen(dump_fd, modes[i]); + + if (!a.fp) { + fprintf(stderr, "failed to open fd=%d: %s\n", + dump_fd, strerror(errno)); + goto out; + } + /* we'll leak some memory here, but this is a destructor */ + } + dump_to_file(&a); +out: + --locating; +} + +__attribute__((constructor)) static void mwrap_ctor(void) +{ + sigset_t set, old; + struct alloc_hdr *h; + + ++locating; + + /* block signals */ + CHECK(int, 0, sigfillset(&set)); + CHECK(int, 0, pthread_sigmask(SIG_SETMASK, &set, &old)); + ensure_initialization(); + CHECK(int, 0, pthread_key_create(&tlskey, mstate_tsd_dtor)); + + /* + * PTHREAD_MUTEX_INITIALIZER on FreeBSD means lazy initialization, + * which happens at pthread_mutex_lock, and that calls calloc + */ + if (!STATIC_MTX_INIT_OK) + reset_mutexes(); + /* initialize mutexes used by urcu-bp */ + CMM_STORE_SHARED(totals, lfht_new()); + if (!CMM_LOAD_SHARED(totals)) + fprintf(stderr, "failed to allocate totals table\n"); + h = real_malloc(sizeof(struct alloc_hdr)); + if (h) { /* force call_rcu to start 
background thread */ + h->real = h; + call_rcu(&h->as.dead, free_hdr_rcu); + } else + fprintf(stderr, "malloc failed: %s\n", strerror(errno)); + + /* start background threads before unblocking signals */ + if (MWRAP_EARLY_THREADS) + cds_lfht_resize(CMM_LOAD_SHARED(totals), 16384); + + CHECK(int, 0, pthread_sigmask(SIG_SETMASK, &old, NULL)); + CHECK(int, 0, pthread_atfork(atfork_prepare, atfork_parent, + atfork_child)); + --locating; +} |