about summary refs log tree commit homepage
diff options
context:
space:
mode:
authorEric Wong <e@80x24.org>2024-04-06 21:49:54 +0000
committerEric Wong <e@80x24.org>2024-04-06 21:50:47 +0000
commitd15143e2ce156219ab43dee4e71d44f007df6692 (patch)
tree11fac737d0674f20453d0e5fe39ac6e1d4d36016
parent403b495e590f19e7ead8800edc3b14e3cdf7c5a6 (diff)
downloadmwrap-perl-master.tar.gz
To test and provide reproducible behavior of different mallocs,
provide an architecture-specific tracing mechanism to write
trace files.  Since these traces can grow large, they're compressed by gzip(1)
by default to avoid filling up hard drives of long-lived
daemons.

The compressor can be replaced with zstd or bzip2 via
"trace_compress:zstd" in the comma-delimited MWRAP environment.

The new mwrap-trace-replay command is designed to run with either
jemalloc or glibc malloc to replay trace files.  It can read
uncompressed output via stdin or compressed files via
gzip/zstd/bzip2.

This doesn't work reliably in multi-threaded code, but I have
fragmentation problems in single-threaded code.
-rw-r--r--MANIFEST8
-rw-r--r--Makefile.PL2
-rw-r--r--httpd.h83
-rw-r--r--lib/Devel/Mwrap/TraceReplay.pm80
-rw-r--r--lib/Devel/Mwrap/dlmalloc_c.h (renamed from dlmalloc_c.h)4
-rw-r--r--lib/Devel/Mwrap/khashl.h454
-rw-r--r--lib/Devel/Mwrap/trace-replay.h238
-rw-r--r--lib/Devel/Mwrap/trace_struct.h34
-rw-r--r--mwrap_core.h184
-rw-r--r--mymalloc.h16
-rw-r--r--script/mwrap-trace-replay48
-rw-r--r--t/httpd.t32
-rw-r--r--trace.h69
13 files changed, 1214 insertions, 38 deletions
diff --git a/MANIFEST b/MANIFEST
index cf42979..5af61f4 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -6,7 +6,6 @@ Makefile.PL
 Mwrap.xs
 README
 check.h
-dlmalloc_c.h
 examples/mwrap.psgi
 exe.sh
 gcc.h
@@ -15,6 +14,11 @@ jhash.h
 lib/Devel/Mwrap.pm
 lib/Devel/Mwrap/PSGI.pm
 lib/Devel/Mwrap/Rproxy.pm
+lib/Devel/Mwrap/TraceReplay.pm
+lib/Devel/Mwrap/dlmalloc_c.h
+lib/Devel/Mwrap/khashl.h
+lib/Devel/Mwrap/trace-replay.h
+lib/Devel/Mwrap/trace_struct.h
 mwrap_core.h
 mymalloc.h
 picohttpparser.h
@@ -23,9 +27,11 @@ ppport.h
 script/mwrap-decode-csv
 script/mwrap-perl
 script/mwrap-rproxy
+script/mwrap-trace-replay
 t/httpd-unit.t
 t/httpd.t
 t/mwrap.t
 t/source_location.perl
 t/test_common.perl
+trace.h
 typemap
diff --git a/Makefile.PL b/Makefile.PL
index 41e8f03..c1c21de 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -84,7 +84,7 @@ push @writemakefile_args, (
         PREREQ_PM => {},
         ABSTRACT_FROM => 'lib/Devel/Mwrap.pm',
         EXE_FILES => [qw(script/mwrap-perl script/mwrap-rproxy
-                script/mwrap-decode-csv)],
+                script/mwrap-decode-csv script/mwrap-trace-replay)],
         AUTHOR => 'mwrap hackers <mwrap-perl@80x24.org>',
         LIBS => $LIBS, # e.g. -lurcu-cds
         LICENSE => 'gpl_2', # GPL-3.0+, CPAN::Meta::Spec limitation
diff --git a/httpd.h b/httpd.h
index 8a105aa..a097e0e 100644
--- a/httpd.h
+++ b/httpd.h
@@ -332,6 +332,72 @@ static enum mw_qev h1_do_reset(struct mw_h1 *h1)
         return h1_res_oneshot(h1, r200, sizeof(r200) - 1);
 }
 
+static enum mw_qev h1_trace_too_long(struct mw_h1 *h1)
+{
+        static const char r500[] = "HTTP/1.1 500 Error\r\n"
+                "Content-Type: text/plain\r\n"
+                "Connection: close\r\n"
+                "Content-Length: 9\r\n\r\n" "too long\n";
+        return h1_res_oneshot(h1, r500, sizeof(r500) - 1);
+}
+
+static enum mw_qev h1_trace_on(struct mw_h1 *h1)
+{
+        int e = trace_on(getenv("MWRAP"));
+        if (e == ENAMETOOLONG) {
+                return h1_trace_too_long(h1);
+        } else if (e == EBUSY) {
+                static const char r500[] = "HTTP/1.1 500 Error\r\n"
+                        "Content-Type: text/plain\r\n"
+                        "Connection: close\r\n"
+                        "Content-Length: 12\r\n\r\n" "double open\n";
+                return h1_res_oneshot(h1, r500, sizeof(r500) - 1);
+        } else if (e) {
+                static const char r500[] = "HTTP/1.1 500 Error\r\n"
+                        "Content-Type: text/plain\r\n"
+                        "Connection: close\r\n"
+                        "Content-Length: 11\r\n\r\n" "open error\n";
+                return h1_res_oneshot(h1, r500, sizeof(r500) - 1);
+        }
+        static const char r200[] = "HTTP/1.1 200 OK\r\n"
+                "Content-Type: text/plain\r\n"
+                "Connection: close\r\n"
+                "Content-Length: 8\r\n\r\n" "tracing\n";
+        return h1_res_oneshot(h1, r200, sizeof(r200) - 1);
+}
+
+static enum mw_qev h1_trace_off(struct mw_h1 *h1)
+{
+        int fd = uatomic_xchg(&mwrap_trace_fd, -1);
+        if (fd >= 0) {
+                synchronize_rcu();
+                CHECK(int, 0, pthread_mutex_lock(&global_mtx));
+                mstate ms;
+                cds_list_for_each_entry(ms, &arenas_active, arena_node)
+                        trace_flush_fd(fd, ms);
+                cds_list_for_each_entry(ms, &arenas_unused, arena_node)
+                        trace_flush_fd(fd, ms);
+                CHECK(int, 0, pthread_mutex_unlock(&global_mtx));
+                close(fd);
+
+                static const char r200[] = "HTTP/1.1 200 OK\r\n"
+                        "Content-Type: text/plain\r\n"
+                        "Connection: close\r\n"
+                        "Content-Length: 10\r\n\r\n" "trace off\n";
+                return h1_res_oneshot(h1, r200, sizeof(r200) - 1);
+        }
+        static const char r500[] = "HTTP/1.1 500 Error\r\n"
+                "Content-Type: text/plain\r\n"
+                "Connection: close\r\n"
+                "Content-Length: 9\r\n\r\n" "not open\n";
+        return h1_res_oneshot(h1, r500, sizeof(r500) - 1);
+}
+
+static enum mw_qev h1_toggle_trace(struct mw_h1 *h1)
+{
+        return mwrap_trace_fd < 0 ? h1_trace_on(h1) : h1_trace_off(h1);
+}
+
 static enum mw_qev h1_do_trim(struct mw_h1 *h1)
 {
         static const char r200[] = "HTTP/1.1 200 OK\r\n"
@@ -806,8 +872,12 @@ static enum mw_qev h1_dispatch(struct mw_h1 *h1, struct mw_h1req *h1r)
                         return pid_root(h1, h1r);
                 }
         } else if (h1r->method_len == 4 && !memcmp(h1r->method, "POST", 4)) {
-                if (h1r->path_len == 6 && !memcmp(h1r->path, "/reset", 6))
-                        return h1_do_reset(h1);
+                if (h1r->path_len == 6) {
+                        if (!memcmp(h1r->path, "/reset", 6))
+                                return h1_do_reset(h1);
+                        if (!memcmp(h1r->path, "/trace", 6))
+                                return h1_toggle_trace(h1);
+                }
                 if (h1r->path_len == 5 && !memcmp(h1r->path, "/trim", 5))
                         return h1_do_trim(h1);
                 if (h1r->path_len == 4 && !memcmp(h1r->path, "/ctl", 4))
@@ -1353,6 +1423,8 @@ join_thread:
 
 static void h1d_atfork_prepare(void)
 {
+        if (pthread_equal(g_h1d.tid, pthread_self()))
+                return;
         if (uatomic_cmpxchg(&g_h1d.alive, 1, 0))
                 h1d_stop_join(&g_h1d);
 }
@@ -1373,6 +1445,11 @@ static void h1d_start(void) /* may be called as pthread_atfork child cb */
 /* must be called with global_mtx held */
 static void h1d_atfork_parent(void)
 {
-        if (g_h1d.lfd < 0)
+        if (!pthread_equal(g_h1d.tid, pthread_self()) && g_h1d.lfd < 0)
                 h1d_start();
 }
+
+static void h1d_atfork_child(void)
+{
+        if (!pthread_equal(g_h1d.tid, pthread_self())) h1d_start();
+}
diff --git a/lib/Devel/Mwrap/TraceReplay.pm b/lib/Devel/Mwrap/TraceReplay.pm
new file mode 100644
index 0000000..bb2551b
--- /dev/null
+++ b/lib/Devel/Mwrap/TraceReplay.pm
@@ -0,0 +1,80 @@
+# Copyright (C) mwrap hackers <mwrap-perl@80x24.org>
+# License: GPL-3.0+ <https://www.gnu.org/licenses/gpl-3.0.txt>
+#
+# Just-ahead-of-time builder for lib/Devel/Mwrap/trace-replay.h
+# I never want users to be without source code for repairs, so this
+# aims to replicate the feel of a scripting language using C.
+# The resulting executable is not linked to Perl in any way.
+package Devel::Mwrap::TraceReplay;
+use v5.12;
+use autodie;
+use Config;
+use Fcntl qw(LOCK_EX);
+my $dir = ($ENV{XDG_CACHE_HOME} //
+  (($ENV{HOME} // die('HOME unset')).'/.cache')).'/mwrap/trace-replay';
+my $bin = "$dir/trace-replay-$Config{archname}";
+my ($srcpfx) = (__FILE__ =~ m!\A(.+/)[^/]+\z!);
+my @srcs = map { $srcpfx.$_ } qw(trace-replay.h
+                dlmalloc_c.h khashl.h trace_struct.h);
+my $ldflags = '-Wl,-O1';
+$ldflags .= ' -Wl,--compress-debug-sections=zlib' if $^O ne 'openbsd';
+
+my $xflags = ($ENV{CFLAGS} // '-Wall -ggdb3 -pipe') . ' ' .
+        ($ENV{LDFLAGS} // $ldflags);
+substr($xflags, 0, 0, '-O2 ') if !defined($ENV{CFLAGS}) && !-w __FILE__;
+my $cc = $ENV{CC} // $Config{cc} // 'c99';
+
+sub build () {
+        if (!-d $dir) {
+                require File::Path;
+                File::Path::make_path($dir);
+        }
+        my ($prog) = ($bin =~ m!/([^/]+)\z!);
+        my $fn = "$dir/$prog.c";
+        open my $fh, '>', $fn;
+        until (flock($fh, LOCK_EX)) { die "LOCK_EX: $fn: $!" if !$!{EINTR} }
+        say $fh qq{#include "trace-replay.h"};
+        $fh->flush or die "flush: $!";
+        my $pkg_config = $ENV{PKG_CONFIG} // 'pkg-config';
+        chomp(my $fl = `$pkg_config  liburcu-cds --libs --cflags`);
+        $^O eq 'netbsd' and $fl =~ s/(\A|[ \t])\-L([^ \t]+)([ \t]|\z)/
+                                "$1-L$2 -Wl,-rpath=$2$3"/egsx;
+        my @xflags = split(' ', "$fl $xflags"); # ' ' awk-mode eats leading WS
+        my @cflags = ('-I', $srcpfx, grep(!/\A-(?:Wl|l|L)/, @xflags));
+        my @cmd = ($cc, '-o', "$dir/$prog.o", '-c', $fn, @cflags);
+        system(@cmd) and die "E: @cmd: \$?=$?";
+        @cmd = ($cc, '-o', "$dir/$prog.tmp", "$dir/$prog.o", @xflags);
+        system(@cmd) and die "E: @cmd: \$?=$?";
+        unlink $fn, "$dir/$prog.o";
+        open my $xfh, '>', "$dir/XFLAGS.tmp";
+        say $xfh $xflags;
+        close $xfh;
+        rename("$dir/$_.tmp", "$dir/$_") for ($prog, qw(XFLAGS));
+}
+
+sub needs_rebuild () {
+        open my $fh, '<', "$dir/XFLAGS" or return 1;
+        chomp(my $prev = <$fh>);
+        $prev ne $xflags;
+}
+
+sub check_build () {
+        use Time::HiRes qw(stat);
+        my $ctime = 0;
+        my @bin = stat($bin) or return build();
+        for (@srcs) {
+                my @st = stat($_) or die "stat $_: $!";
+                if ($st[10] > $ctime) {
+                        $ctime = $st[10];
+                        return build() if $ctime > $bin[10];
+                }
+        }
+        needs_rebuild() ? build() : 0;
+}
+
+sub run (@) {
+        check_build();
+        exec $bin, @_;
+}
+
+1;
diff --git a/dlmalloc_c.h b/lib/Devel/Mwrap/dlmalloc_c.h
index 5aa9e94..262c456 100644
--- a/dlmalloc_c.h
+++ b/lib/Devel/Mwrap/dlmalloc_c.h
@@ -590,6 +590,8 @@ MAX_RELEASE_CHECK_RATE   default: 4095 unless not HAVE_MMAP
 #include <sys/types.h>  /* For size_t */
 #endif  /* LACKS_SYS_TYPES_H */
 
+#include <limits.h>
+
 /* The maximum possible size_t value has all bits set */
 #define MAX_SIZE_T           (~(size_t)0)
 
@@ -2607,6 +2609,8 @@ struct malloc_state {
   MLOCK_T    mutex;     /* locate lock among fields that rarely change */
 #endif /* USE_LOCKS */
   msegment   seg;
+  size_t trace_wfill;
+  char trace_wbuf[65536];
   struct cds_list_head arena_node;        /* cold */
   struct cds_wfcq_tail remote_free_tail;
 };
diff --git a/lib/Devel/Mwrap/khashl.h b/lib/Devel/Mwrap/khashl.h
new file mode 100644
index 0000000..474f675
--- /dev/null
+++ b/lib/Devel/Mwrap/khashl.h
@@ -0,0 +1,454 @@
+/* The MIT License
+
+   Copyright (c) 2019-2023 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef __AC_KHASHL_H
+#define __AC_KHASHL_H
+
+#define AC_VERSION_KHASHL_H "0.2"
+
+typedef uint32_t khint32_t;
+typedef uint64_t khint64_t;
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define kh_inline inline /* portably handled elsewhere */
+#define KH_LOCAL static kh_inline
+
+/****************************
+ * Simple private functions *
+ ****************************/
+
+#define __kh_used(flag, i)       (flag[i>>5] >> (i&0x1fU) & 1U)
+#define __kh_set_used(flag, i)   (flag[i>>5] |= 1U<<(i&0x1fU))
+#define __kh_set_unused(flag, i) (flag[i>>5] &= ~(1U<<(i&0x1fU)))
+
+#define __kh_fsize(m) ((m) < 32? 1 : (m)>>5)
+
+static kh_inline khint_t __kh_h2b(khint_t hash, khint_t bits) { return hash * 2654435769U >> (32 - bits); }
+
+/*******************
+ * Hash table base *
+ *******************/
+
+#define __KHASHL_TYPE(HType, khkey_t) \
+        typedef struct HType { \
+                khint_t bits, count; \
+                khint32_t *used; \
+                khkey_t *keys; \
+        } HType;
+
+#define __KHASHL_PROTOTYPES(HType, prefix, khkey_t) \
+        extern HType *prefix##_init(void); \
+        extern void prefix##_destroy(HType *h); \
+        extern void prefix##_clear(HType *h); \
+        extern khint_t prefix##_getp(const HType *h, const khkey_t *key); \
+        extern void prefix##_resize(HType *h, khint_t new_n_buckets); \
+        extern khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent); \
+        extern void prefix##_del(HType *h, khint_t k);
+
+#define __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \
+        SCOPE HType *prefix##_init(void) { \
+                return (HType*)kcalloc(1, sizeof(HType)); \
+        } \
+        SCOPE void prefix##_release(HType *h) { \
+                kfree((void *)h->keys); kfree(h->used); \
+        } \
+        SCOPE void prefix##_destroy(HType *h) { \
+                if (!h) return; \
+                prefix##_release(h); \
+                kfree(h); \
+        } \
+        SCOPE void prefix##_clear(HType *h) { \
+                if (h && h->used) { \
+                        khint_t n_buckets = (khint_t)1U << h->bits; \
+                        memset(h->used, 0, __kh_fsize(n_buckets) * sizeof(khint32_t)); \
+                        h->count = 0; \
+                } \
+        }
+
+#define __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
+        SCOPE khint_t prefix##_getp_core(const HType *h, const khkey_t *key, khint_t hash) { \
+                khint_t i, last, n_buckets, mask; \
+                if (h->keys == 0) return 0; \
+                n_buckets = (khint_t)1U << h->bits; \
+                mask = n_buckets - 1U; \
+                i = last = __kh_h2b(hash, h->bits); \
+                while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \
+                        i = (i + 1U) & mask; \
+                        if (i == last) return n_buckets; \
+                } \
+                return !__kh_used(h->used, i)? n_buckets : i; \
+        } \
+        SCOPE khint_t prefix##_getp(const HType *h, const khkey_t *key) { return prefix##_getp_core(h, key, __hash_fn(*key)); } \
+        SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { return prefix##_getp_core(h, &key, __hash_fn(key)); }
+
+#define __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
+        SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { \
+                khint32_t *new_used = 0; \
+                khint_t j = 0, x = new_n_buckets, n_buckets, new_bits, new_mask; \
+                while ((x >>= 1) != 0) ++j; \
+                if (new_n_buckets & (new_n_buckets - 1)) ++j; \
+                new_bits = j > 2? j : 2; \
+                new_n_buckets = (khint_t)1U << new_bits; \
+                if (h->count > (new_n_buckets>>1) + (new_n_buckets>>2)) return; /* noop, requested size is too small */ \
+                new_used = (khint32_t*)kcalloc(__kh_fsize(new_n_buckets), sizeof(khint32_t)); \
+                n_buckets = h->keys? (khint_t)1U<<h->bits : 0U; \
+                if (n_buckets < new_n_buckets) { /* expand */ \
+                        REALLOC_ARRAY(h->keys, new_n_buckets); \
+                } /* otherwise shrink */ \
+                new_mask = new_n_buckets - 1; \
+                for (j = 0; j != n_buckets; ++j) { \
+                        khkey_t key; \
+                        if (!__kh_used(h->used, j)) continue; \
+                        key = h->keys[j]; \
+                        __kh_set_unused(h->used, j); \
+                        while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+                                khint_t i; \
+                                i = __kh_h2b(__hash_fn(key), new_bits); \
+                                while (__kh_used(new_used, i)) i = (i + 1) & new_mask; \
+                                __kh_set_used(new_used, i); \
+                                if (i < n_buckets && __kh_used(h->used, i)) { /* kick out the existing element */ \
+                                        { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+                                        __kh_set_unused(h->used, i); /* mark it as deleted in the old hash table */ \
+                                } else { /* write the element and jump out of the loop */ \
+                                        h->keys[i] = key; \
+                                        break; \
+                                } \
+                        } \
+                } \
+                if (n_buckets > new_n_buckets) /* shrink the hash table */ \
+                        REALLOC_ARRAY(h->keys, new_n_buckets); \
+                kfree(h->used); /* free the working space */ \
+                h->used = new_used, h->bits = new_bits; \
+        }
+
+#define __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
+        SCOPE khint_t prefix##_putp_core(HType *h, const khkey_t *key, khint_t hash, int *absent) { \
+                khint_t n_buckets, i, last, mask; \
+                n_buckets = h->keys? (khint_t)1U<<h->bits : 0U; \
+                *absent = -1; \
+                if (h->count >= (n_buckets>>1) + (n_buckets>>2)) { /* rehashing */ \
+                        prefix##_resize(h, n_buckets + 1U); \
+                        n_buckets = (khint_t)1U<<h->bits; \
+                } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+                mask = n_buckets - 1; \
+                i = last = __kh_h2b(hash, h->bits); \
+                while (__kh_used(h->used, i) && !__hash_eq(h->keys[i], *key)) { \
+                        i = (i + 1U) & mask; \
+                        if (i == last) break; \
+                } \
+                if (!__kh_used(h->used, i)) { /* not present at all */ \
+                        h->keys[i] = *key; \
+                        __kh_set_used(h->used, i); \
+                        ++h->count; \
+                        *absent = 1; \
+                } else *absent = 0; /* Don't touch h->keys[i] if present */ \
+                return i; \
+        } \
+        SCOPE khint_t prefix##_putp(HType *h, const khkey_t *key, int *absent) { return prefix##_putp_core(h, key, __hash_fn(*key), absent); } \
+        SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { return prefix##_putp_core(h, &key, __hash_fn(key), absent); }
+
+#define __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn) \
+        SCOPE int prefix##_del(HType *h, khint_t i) { \
+                khint_t j = i, k, mask, n_buckets; \
+                if (h->keys == 0) return 0; \
+                n_buckets = (khint_t)1U<<h->bits; \
+                mask = n_buckets - 1U; \
+                while (1) { \
+                        j = (j + 1U) & mask; \
+                        if (j == i || !__kh_used(h->used, j)) break; /* j==i only when the table is completely full */ \
+                        k = __kh_h2b(__hash_fn(h->keys[j]), h->bits); \
+                        if ((j > i && (k <= i || k > j)) || (j < i && (k <= i && k > j))) \
+                                h->keys[i] = h->keys[j], i = j; \
+                } \
+                __kh_set_unused(h->used, i); \
+                --h->count; \
+                return 1; \
+        }
+
+#define KHASHL_DECLARE(HType, prefix, khkey_t) \
+        __KHASHL_TYPE(HType, khkey_t) \
+        __KHASHL_PROTOTYPES(HType, prefix, khkey_t)
+
+/* compatibility wrappers to make khash -> khashl migration easier */
+#define __KHASH_COMPAT(SCOPE, HType, prefix, khkey_t) \
+        typedef HType HType##_t; \
+        SCOPE HType *kh_init_##prefix(void) { return prefix##_init(); } \
+        SCOPE void kh_release_##prefix(HType *h) { prefix##_release(h); } \
+        SCOPE void kh_destroy_##prefix(HType *h) { prefix##_destroy(h); } \
+        SCOPE void kh_clear_##prefix(HType *h) { prefix##_clear(h); } \
+        SCOPE khint_t kh_get_##prefix(const HType *h, khkey_t key) { \
+                return prefix##_get(h, key); \
+        } \
+        SCOPE void kh_resize_##prefix(HType *h, khint_t new_n_buckets) { \
+                prefix##_resize(h, new_n_buckets); \
+        } \
+        SCOPE khint_t kh_put_##prefix(HType *h, khkey_t key, int *absent) { \
+                return prefix##_put(h, key, absent); \
+        } \
+        SCOPE int kh_del_##prefix(HType *h, khint_t i) { \
+                return prefix##_del(h, i); \
+        }
+
+#define KHASHL_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
+        __KHASHL_TYPE(HType, khkey_t) \
+        __KHASHL_IMPL_BASIC(SCOPE, HType, prefix) \
+        __KHASHL_IMPL_GET(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
+        __KHASHL_IMPL_RESIZE(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
+        __KHASHL_IMPL_PUT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
+        __KHASHL_IMPL_DEL(SCOPE, HType, prefix, khkey_t, __hash_fn)
+
+/***************************
+ * Ensemble of hash tables *
+ ***************************/
+
+typedef struct {
+        khint_t sub, pos;
+} kh_ensitr_t;
+
+#define KHASHE_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
+        KHASHL_INIT(KH_LOCAL, HType##_sub, prefix##_sub, khkey_t, __hash_fn, __hash_eq) \
+        typedef struct HType { \
+                khint64_t count:54, bits:8; \
+                HType##_sub *sub; \
+        } HType; \
+        SCOPE HType *prefix##_init(int bits) { \
+                HType *g; \
+                g = (HType*)kcalloc(1, sizeof(*g)); \
+                g->bits = bits; \
+                g->sub = (HType##_sub*)kcalloc(1U<<bits, sizeof(*g->sub)); \
+                return g; \
+        } \
+        SCOPE void prefix##_destroy(HType *g) { \
+                int t; \
+                if (!g) return; \
+                for (t = 0; t < 1<<g->bits; ++t) { kfree((void*)g->sub[t].keys); kfree(g->sub[t].used); } \
+                kfree(g->sub); kfree(g); \
+        } \
+        SCOPE kh_ensitr_t prefix##_getp(const HType *g, const khkey_t *key) { \
+                khint_t hash, low, ret; \
+                kh_ensitr_t r; \
+                HType##_sub *h; \
+                hash = __hash_fn(*key); \
+                low = hash & ((1U<<g->bits) - 1); \
+                h = &g->sub[low]; \
+                ret = prefix##_sub_getp_core(h, key, hash); \
+                if (ret == 1U<<h->bits) r.sub = low, r.pos = (khint_t)-1; \
+                else r.sub = low, r.pos = ret; \
+                return r; \
+        } \
+        SCOPE kh_ensitr_t prefix##_get(const HType *g, const khkey_t key) { return prefix##_getp(g, &key); } \
+        SCOPE kh_ensitr_t prefix##_putp(HType *g, const khkey_t *key, int *absent) { \
+                khint_t hash, low, ret; \
+                kh_ensitr_t r; \
+                HType##_sub *h; \
+                hash = __hash_fn(*key); \
+                low = hash & ((1U<<g->bits) - 1); \
+                h = &g->sub[low]; \
+                ret = prefix##_sub_putp_core(h, key, hash, absent); \
+                if (*absent) ++g->count; \
+                if (ret == 1U<<h->bits) r.sub = low, r.pos = (khint_t)-1; \
+                else r.sub = low, r.pos = ret; \
+                return r; \
+        } \
+        SCOPE kh_ensitr_t prefix##_put(HType *g, const khkey_t key, int *absent) { return prefix##_putp(g, &key, absent); } \
+        SCOPE int prefix##_del(HType *g, kh_ensitr_t itr) { \
+                HType##_sub *h = &g->sub[itr.sub]; \
+                int ret; \
+                ret = prefix##_sub_del(h, itr.pos); \
+                if (ret) --g->count; \
+                return ret; \
+        }
+
+/*****************************
+ * More convenient interface *
+ *****************************/
+
+#define __kh_packed /* noop, we use -Werror=address-of-packed-member */
+#define __kh_cached_hash(x) ((x).hash)
+
+#define KHASHL_SET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
+        typedef struct { khkey_t key; } __kh_packed HType##_s_bucket_t; \
+        static kh_inline khint_t prefix##_s_hash(HType##_s_bucket_t x) { return __hash_fn(x.key); } \
+        static kh_inline int prefix##_s_eq(HType##_s_bucket_t x, HType##_s_bucket_t y) { return __hash_eq(x.key, y.key); } \
+        KHASHL_INIT(KH_LOCAL, HType, prefix##_s, HType##_s_bucket_t, prefix##_s_hash, prefix##_s_eq) \
+        SCOPE HType *prefix##_init(void) { return prefix##_s_init(); } \
+        SCOPE void prefix##_release(HType *h) { prefix##_s_release(h); } \
+        SCOPE void prefix##_destroy(HType *h) { prefix##_s_destroy(h); } \
+        SCOPE void prefix##_clear(HType *h) { prefix##_s_clear(h); } \
+        SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { prefix##_s_resize(h, new_n_buckets); } \
+        SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_s_bucket_t t; t.key = key; return prefix##_s_getp(h, &t); } \
+        SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_s_del(h, k); } \
+        SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_s_bucket_t t; t.key = key; return prefix##_s_putp(h, &t, absent); } \
+        __KHASH_COMPAT(SCOPE, HType, prefix, khkey_t)
+
+#define KHASHL_MAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \
+        typedef struct { khkey_t key; kh_val_t val; } __kh_packed HType##_m_bucket_t; \
+        static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { return __hash_fn(x.key); } \
+        static kh_inline int prefix##_m_eq(HType##_m_bucket_t x, HType##_m_bucket_t y) { return __hash_eq(x.key, y.key); } \
+        KHASHL_INIT(KH_LOCAL, HType, prefix##_m, HType##_m_bucket_t, prefix##_m_hash, prefix##_m_eq) \
+        SCOPE HType *prefix##_init(void) { return prefix##_m_init(); } \
+        SCOPE void prefix##_release(HType *h) { prefix##_m_release(h); } \
+        SCOPE void prefix##_destroy(HType *h) { prefix##_m_destroy(h); } \
+        SCOPE void prefix##_clear(HType *h) { prefix##_m_clear(h); } \
+        SCOPE void prefix##_resize(HType *h, khint_t new_n_buckets) { prefix##_m_resize(h, new_n_buckets); } \
+        SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_m_bucket_t t; t.key = key; return prefix##_m_getp(h, &t); } \
+        SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_m_del(h, k); } \
+        SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_m_bucket_t t; t.key = key; return prefix##_m_putp(h, &t, absent); } \
+        __KHASH_COMPAT(SCOPE, HType, prefix, khkey_t)
+
+#define KHASHL_CSET_INIT(SCOPE, HType, prefix, khkey_t, __hash_fn, __hash_eq) \
+        typedef struct { khkey_t key; khint_t hash; } __kh_packed HType##_cs_bucket_t; \
+        static kh_inline int prefix##_cs_eq(HType##_cs_bucket_t x, HType##_cs_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \
+        KHASHL_INIT(KH_LOCAL, HType, prefix##_cs, HType##_cs_bucket_t, __kh_cached_hash, prefix##_cs_eq) \
+        SCOPE HType *prefix##_init(void) { return prefix##_cs_init(); } \
+        SCOPE void prefix##_destroy(HType *h) { prefix##_cs_destroy(h); } \
+        SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cs_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cs_getp(h, &t); } \
+        SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cs_del(h, k); } \
+        SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cs_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cs_putp(h, &t, absent); }
+
+#define KHASHL_CMAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \
+        typedef struct { khkey_t key; kh_val_t val; khint_t hash; } __kh_packed HType##_cm_bucket_t; \
+        static kh_inline int prefix##_cm_eq(HType##_cm_bucket_t x, HType##_cm_bucket_t y) { return x.hash == y.hash && __hash_eq(x.key, y.key); } \
+        KHASHL_INIT(KH_LOCAL, HType, prefix##_cm, HType##_cm_bucket_t, __kh_cached_hash, prefix##_cm_eq) \
+        SCOPE HType *prefix##_init(void) { return prefix##_cm_init(); } \
+        SCOPE void prefix##_destroy(HType *h) { prefix##_cm_destroy(h); } \
+        SCOPE khint_t prefix##_get(const HType *h, khkey_t key) { HType##_cm_bucket_t t; t.key = key; t.hash = __hash_fn(key); return prefix##_cm_getp(h, &t); } \
+        SCOPE int prefix##_del(HType *h, khint_t k) { return prefix##_cm_del(h, k); } \
+        SCOPE khint_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_cm_bucket_t t; t.key = key, t.hash = __hash_fn(key); return prefix##_cm_putp(h, &t, absent); }
+
+#define KHASHE_MAP_INIT(SCOPE, HType, prefix, khkey_t, kh_val_t, __hash_fn, __hash_eq) \
+        typedef struct { khkey_t key; kh_val_t val; } __kh_packed HType##_m_bucket_t; \
+        static kh_inline khint_t prefix##_m_hash(HType##_m_bucket_t x) { return __hash_fn(x.key); } \
+        static kh_inline int prefix##_m_eq(HType##_m_bucket_t x, HType##_m_bucket_t y) { return __hash_eq(x.key, y.key); } \
+        KHASHE_INIT(KH_LOCAL, HType, prefix##_m, HType##_m_bucket_t, prefix##_m_hash, prefix##_m_eq) \
+        SCOPE HType *prefix##_init(int bits) { return prefix##_m_init(bits); } \
+        SCOPE void prefix##_destroy(HType *h) { prefix##_m_destroy(h); } \
+        SCOPE kh_ensitr_t prefix##_get(const HType *h, khkey_t key) { HType##_m_bucket_t t; t.key = key; return prefix##_m_getp(h, &t); } \
+        SCOPE int prefix##_del(HType *h, kh_ensitr_t k) { return prefix##_m_del(h, k); } \
+        SCOPE kh_ensitr_t prefix##_put(HType *h, khkey_t key, int *absent) { HType##_m_bucket_t t; t.key = key; return prefix##_m_putp(h, &t, absent); }
+
+/**************************
+ * Public macro functions *
+ **************************/
+
+#define kh_bucket(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->count)
+
+#define kh_capacity(h) ((h)->keys? 1U<<(h)->bits : 0U)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) kh_capacity(h)
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x].key)
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->keys[x].val)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) __kh_used((h)->used, (x))
+
+#define kh_ens_key(g, x) kh_key(&(g)->sub[(x).sub], (x).pos)
+#define kh_ens_val(g, x) kh_val(&(g)->sub[(x).sub], (x).pos)
+#define kh_ens_exist(g, x) kh_exist(&(g)->sub[(x).sub], (x).pos)
+#define kh_ens_is_end(x) ((x).pos == (khint_t)-1)
+#define kh_ens_size(g) ((g)->count)
+
+/**************************************
+ * Common hash and equality functions *
+ **************************************/
+
+#define kh_eq_generic(a, b) ((a) == (b))
+#define kh_eq_str(a, b) (strcmp((a), (b)) == 0)
+#define kh_hash_dummy(x) ((khint_t)(x))
+
+static kh_inline khint_t kh_hash_uint32(khint_t key) {
+        key += ~(key << 15);
+        key ^=  (key >> 10);
+        key +=  (key << 3);
+        key ^=  (key >> 6);
+        key += ~(key << 11);
+        key ^=  (key >> 16);
+        return key;
+}
+
+static kh_inline khint_t kh_hash_uint64(khint64_t key) {
+        key = ~key + (key << 21);
+        key = key ^ key >> 24;
+        key = (key + (key << 3)) + (key << 8);
+        key = key ^ key >> 14;
+        key = (key + (key << 2)) + (key << 4);
+        key = key ^ key >> 28;
+        key = key + (key << 31);
+        return (khint_t)key;
+}
+
+#define KH_FNV_SEED 11
+
+static kh_inline khint_t kh_hash_str(const char *s) { /* FNV1a */
+        khint_t h = KH_FNV_SEED ^ 2166136261U;
+        const unsigned char *t = (const unsigned char*)s;
+        for (; *t; ++t)
+                h ^= *t, h *= 16777619;
+        return h;
+}
+
+static kh_inline khint_t kh_hash_bytes(int len, const unsigned char *s) {
+        khint_t h = KH_FNV_SEED ^ 2166136261U;
+        int i;
+        for (i = 0; i < len; ++i)
+                h ^= s[i], h *= 16777619;
+        return h;
+}
+
+#endif /* __AC_KHASHL_H */
diff --git a/lib/Devel/Mwrap/trace-replay.h b/lib/Devel/Mwrap/trace-replay.h
new file mode 100644
index 0000000..c43cc0f
--- /dev/null
+++ b/lib/Devel/Mwrap/trace-replay.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (C) mwrap hackers <mwrap-perl@80x24.org>
+ * License: GPL-3.0+ <https://www.gnu.org/licenses/gpl-3.0.txt>
+ * single-threaded trace replayer, no runtime dependency on Perl
+ * nor the rest of mwrap (aside from the hacked up dlmalloc).
+ */
+#define _LGPL_SOURCE /* allows URCU to inline some stuff */
+#define _GNU_SOURCE
+/* knobs for dlmalloc */
+#define HAVE_MORECORE 0
+#define DEFAULT_GRANULARITY (2U * 1024U * 1024U)
+#define FOOTERS 1 /* required for remote_free_* stuff */
+#define USE_DL_PREFIX
+#define ONLY_MSPACES 1 /* aka per-thread "arenas" */
+#define DLMALLOC_EXPORT static inline
+/* #define NO_MALLOC_STATS 1 */
+#define USE_LOCKS 0 /* we do our own global_mtx + ms_tsd */
+#include <errno.h>
+#include <err.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#ifdef __GLIBC__
+extern void __attribute__((weak)) malloc_stats(void);
+extern void __attribute__((weak)) malloc_info(int, FILE *);
+#        define GLIBC_MALLOC_STATS() do { \
+                if (malloc_info) malloc_info(0, stderr); \
+                if (malloc_stats) malloc_stats(); \
+        } while (0)
+#else
+#        define GLIBC_MALLOC_STATS() do {} while (0)
+#endif
+
+extern void __attribute__((weak)) malloc_stats_print(
+        void (*wcb)(void *, const char *), void *, const char *opts);
+
+#include <urcu/rculist.h>
+#include <urcu/wfcqueue.h>
+#include "dlmalloc_c.h"
+static mstate tr_ms;
+
+static void *my_calloc(size_t nmemb, size_t size)
+{
+        void *p = mspace_calloc(tr_ms, nmemb, size);
+        if (!p) err(1, "calloc");
+        return p;
+}
+
+#define kcalloc(N,Z) my_calloc(N, Z)
+#define kfree(P) mspace_free(tr_ms, P)
+#define REALLOC_ARRAY(x, nmemb) do { \
+        size_t asize; \
+        if (__builtin_mul_overflow(sizeof(*(x)), nmemb, &asize)) \
+                errx(1, "mul_overflow"); \
+        (x) = mspace_realloc(tr_ms, (x), asize); \
+        if (!x) err(1, "realloc"); \
+} while (0)
+#include "khashl.h"
+#include "trace_struct.h"
+
+static inline khint_t hash_uptr(uintptr_t p)
+{
+        return sizeof(uintptr_t) == 4 ? kh_hash_uint32(p) : kh_hash_uint64(p);
+}
+
+KHASHL_MAP_INIT(KH_LOCAL, kh_ptrmap, ptrmap, uintptr_t, uintptr_t,
+                hash_uptr, kh_eq_generic)
+
+static kh_ptrmap *old2cur;
+
+static void store_ptr(uintptr_t old, void *cur)
+{
+        int absent;
+        khint_t k = ptrmap_put(old2cur, old, &absent);
+        if (absent)
+                kh_val(old2cur, k) = (uintptr_t)cur;
+}
+
+int main(int argc, char *argv[])
+{
+        tr_ms = create_mspace(0, 0);
+        tr_ms->seg.sflags = EXTERN_BIT | USE_MMAP_BIT;
+        disable_contiguous(tr_ms);
+        size_t realloc_miss = 0, free_miss = 0, bad_entry = 0;
+        union {
+                struct tr_memalign do_memalign;
+                struct tr_free do_free;
+                struct tr_malloc do_malloc;
+                struct tr_calloc do_calloc;
+                struct tr_realloc do_realloc;
+        } as;
+        int truncated = 0;
+
+        old2cur = ptrmap_init();
+
+        // CONSUME reads each record field directly from stdin via fread:
+        while (!feof(stdin) && !truncated) {
+
+#define CONSUME(dst, required) do { \
+        size_t need = sizeof(dst); \
+        char *buf = (char *)&dst; \
+        int done = 0; \
+        while (need) { \
+                size_t n = fread(buf, 1, need, stdin); \
+                if (n > 0) { \
+                        need -= n; \
+                } else if (n == 0 && !required) { \
+                        done = 1; \
+                        break; \
+                } else { \
+                        warnx("TRUNCATED: %zu != %zu", n, need); \
+                        done = truncated = 1; \
+                        break; \
+                } \
+        } \
+        if (done) break; \
+} while (0)
+                CONSUME(as.do_free.ptr, false);
+                enum tr_fn fn = as.do_free.ptr & TR_MASK;
+                as.do_free.ptr &= ~TR_MASK;
+                khint_t k;
+                void *cur;
+
+                switch (fn) {
+                case TR_FREE:
+                        k = ptrmap_get(old2cur, as.do_free.ptr);
+                        if (k >= kh_end(old2cur)) {
+                                ++free_miss;
+                        } else {
+                                free((void *)kh_val(old2cur, k));
+                                ptrmap_del(old2cur, k);
+                        }
+                        break;
+                case TR_MALLOC:
+                        CONSUME(as.do_malloc.size, true);
+                        cur = malloc(as.do_malloc.size);
+                        if (!cur)
+                                err(1, "malloc(%zu) => %p",
+                                        as.do_malloc.size,
+                                        (void *)as.do_malloc.ret);
+                        store_ptr(as.do_malloc.ret, cur);
+
+                        break;
+                case TR_CALLOC:
+                        CONSUME(as.do_calloc.size, true);
+                        cur = calloc(as.do_calloc.size, 1);
+                        if (!cur)
+                                err(1, "calloc(%zu) => %p",
+                                        as.do_calloc.size,
+                                        (void *)as.do_calloc.ret);
+                        store_ptr(as.do_calloc.ret, cur);
+
+                        break;
+                case TR_REALLOC:
+                        cur = NULL;
+                        CONSUME(as.do_realloc.ptr, true);
+                        CONSUME(as.do_realloc.size, true);
+                        if (as.do_realloc.ptr) {
+                                k = ptrmap_get(old2cur,
+                                                as.do_realloc.ptr);
+                                if (k >= kh_end(old2cur)) {
+                                        realloc_miss++;
+                                } else {
+                                        cur = (void *)kh_val(old2cur, k);
+                                        ptrmap_del(old2cur, k);
+                                }
+                        }
+                        void *rp = realloc(cur, as.do_realloc.size);
+                        if (!rp)
+                                err(1, "realloc(%p => %p, %zu) => %p",
+                                        (void *)as.do_realloc.ptr,
+                                        cur,
+                                        as.do_realloc.size,
+                                        (void *)as.do_realloc.ret);
+                        store_ptr(as.do_realloc.ret, rp);
+                        break;
+                case TR_MEMALIGN:
+                        cur = NULL;
+                        CONSUME(as.do_memalign.alignment, true);
+                        CONSUME(as.do_memalign.size, true);
+                        int rc = posix_memalign(&cur,
+                                        as.do_memalign.alignment,
+                                        as.do_memalign.size);
+                        if (rc) {
+                                errno = rc;
+                                err(1, "posix_memalign(%zu, %zu) => %p",
+                                        as.do_memalign.alignment,
+                                        as.do_memalign.size,
+                                        (void *)as.do_memalign.ret);
+                        }
+                        store_ptr(as.do_memalign.ret, cur);
+                        break;
+                default:
+                        bad_entry++;
+                }
+        }
+
+        if (free_miss || realloc_miss || bad_entry)
+                fprintf(stderr, "W: miss free=%zu realloc=%zu bad=%zu\n",
+                        free_miss, realloc_miss, bad_entry);
+
+        fprintf(stderr, "# ptrmap .size=%zu capa=%zu\n",
+                (size_t)kh_size(old2cur), (size_t)kh_capacity(old2cur));
+
+        if (malloc_stats_print) // jemalloc loaded
+                malloc_stats_print(NULL, NULL, NULL);
+        else
+                GLIBC_MALLOC_STATS();
+
+        int c;
+        char *end;
+        long sec = 0;
+        while ((c = getopt(argc, argv, "s:")) != -1) {
+                switch (c) {
+                case 's':
+                        sec = strtol(optarg, &end, 10);
+                        if (*end != 0)
+                                errx(1, "`-s %s' invalid seconds", optarg);
+                        break;
+                default: warnx("bad switch `-%c'", c);
+                }
+        }
+        if (sec < 0) {
+                fprintf(stderr, "# PID:%d sleeping indefinitely\n",
+                        (int)getpid());
+                pause();
+        }
+        if (sec > 0) {
+                unsigned s = sec > UINT_MAX ? UINT_MAX : sec;
+                fprintf(stderr, "# PID:%d sleeping %u seconds\n",
+                        (int)getpid(), s);
+                sleep(s);
+        }
+
+        return truncated;
+}
diff --git a/lib/Devel/Mwrap/trace_struct.h b/lib/Devel/Mwrap/trace_struct.h
new file mode 100644
index 0000000..e5fe622
--- /dev/null
+++ b/lib/Devel/Mwrap/trace_struct.h
@@ -0,0 +1,34 @@
+enum tr_fn {
+        TR_FREE = 0,
+        TR_MEMALIGN = 1,
+        TR_MALLOC = 2,
+        TR_REALLOC = 3,
+        TR_CALLOC = 4,
+};
+static const uintptr_t TR_MASK = 7;
+
+struct tr_memalign {
+        uintptr_t ret;
+        size_t alignment;
+        size_t size;
+};
+
+struct tr_free {
+        uintptr_t ptr;
+};
+
+struct tr_malloc {
+        uintptr_t ret;
+        size_t size;
+};
+
+struct tr_realloc {
+        uintptr_t ret;
+        uintptr_t ptr;
+        size_t size;
+};
+
+struct tr_calloc {
+        uintptr_t ret;
+        size_t size;
+};
diff --git a/mwrap_core.h b/mwrap_core.h
index 2a50e66..93e88bb 100644
--- a/mwrap_core.h
+++ b/mwrap_core.h
@@ -36,6 +36,8 @@
 #include <urcu/rculfhash.h>
 #include <urcu/rculist.h>
 #include <limits.h>
+#include <err.h>
+#include <sys/wait.h>
 
 #if MWRAP_PERL
 #        include "EXTERN.h"
@@ -64,6 +66,8 @@
 
 #define U24_MAX (1U << 24)
 
+#include "trace.h"
+
 /*
  * Perl doesn't have a GC the same way (C) Ruby does, so no GC count.
  * Instead, the relative age of an object is the number of total bytes
@@ -498,31 +502,37 @@ static pthread_mutex_t *src_loc_mutex_lock(const struct src_loc *l)
         return mtx;
 }
 
+static void free_notrace(void *p)
+{
+        struct alloc_hdr *h = ptr2hdr(p);
+        struct src_loc *l = h->as.live.loc;
+
+        if (l) {
+                size_t current_bytes = uatomic_read(&total_bytes_inc);
+                size_t age = current_bytes - h->as.live.gen;
+                uatomic_add(&total_bytes_dec, h->size);
+                uatomic_add(&l->freed_bytes, h->size);
+                uatomic_set(&h->size, 0);
+                uatomic_inc(&l->frees);
+                uatomic_add(&l->age_total, age);
+
+                pthread_mutex_t *mtx = src_loc_mutex_lock(l);
+                cds_list_del_rcu(&h->anode);
+                if (age > l->max_lifespan)
+                        l->max_lifespan = age;
+                CHECK(int, 0, pthread_mutex_unlock(mtx));
+
+                call_rcu(&h->as.dead, free_hdr_rcu);
+        } else {
+                real_free(h->real);
+        }
+}
+
 void free(void *p)
 {
         if (p) {
-                struct alloc_hdr *h = ptr2hdr(p);
-                struct src_loc *l = h->as.live.loc;
-
-                if (l) {
-                        size_t current_bytes = uatomic_read(&total_bytes_inc);
-                        size_t age = current_bytes - h->as.live.gen;
-                        uatomic_add(&total_bytes_dec, h->size);
-                        uatomic_add(&l->freed_bytes, h->size);
-                        uatomic_set(&h->size, 0);
-                        uatomic_inc(&l->frees);
-                        uatomic_add(&l->age_total, age);
-
-                        pthread_mutex_t *mtx = src_loc_mutex_lock(l);
-                        cds_list_del_rcu(&h->anode);
-                        if (age > l->max_lifespan)
-                                l->max_lifespan = age;
-                        CHECK(int, 0, pthread_mutex_unlock(mtx));
-
-                        call_rcu(&h->as.dead, free_hdr_rcu);
-                } else {
-                        real_free(h->real);
-                }
+                trace_free(p);
+                free_notrace(p);
         }
 }
 
@@ -589,6 +599,7 @@ mwrap_memalign(void **pp, size_t alignment, size_t size, struct src_loc *sl)
                         p = ptr_align(p, alignment);
                 struct alloc_hdr *h = ptr2hdr(p);
                 alloc_insert_rcu(sl, h, size, real);
+                trace_memalign(p, alignment, size);
                 *pp = p;
         }
 
@@ -697,7 +708,9 @@ void *malloc(size_t size)
                 SRC_LOC_BT(bt);
                 struct alloc_hdr *h = p;
                 alloc_insert_rcu(&bt.sl, h, size, h);
-                return hdr2ptr(h);
+                p = hdr2ptr(h);
+                trace_malloc(p, size);
+                return p;
         }
 enomem:
         errno = ENOMEM;
@@ -719,7 +732,9 @@ void *calloc(size_t nmemb, size_t size)
                 struct alloc_hdr *h = p;
                 SRC_LOC_BT(bt);
                 alloc_insert_rcu(&bt.sl, h, size, h);
-                return memset(hdr2ptr(h), 0, size);
+                p = hdr2ptr(h);
+                trace_calloc(p, size);
+                return memset(p, 0, size);
         }
 enomem:
         errno = ENOMEM;
@@ -743,10 +758,11 @@ void *realloc(void *ptr, size_t size)
                 SRC_LOC_BT(bt);
                 alloc_insert_rcu(&bt.sl, h, size, h);
                 p = hdr2ptr(h);
+                trace_realloc(p, ptr, size);
                 if (ptr) {
                         struct alloc_hdr *old = ptr2hdr(ptr);
                         memcpy(p, ptr, old->size < size ? old->size : size);
-                        free(ptr);
+                        free_notrace(ptr);
                 }
                 return p;
         }
@@ -778,7 +794,8 @@ char **bt_syms(void * const *addrlist, uint32_t size)
 static void cleanup_free(void *any)
 {
         void **p = any;
-        free(*p);
+        if (*p)
+                free_notrace(*p);
 }
 
 static void *write_csv(FILE *, size_t min, const char *sort, size_t sort_len);
@@ -1056,6 +1073,115 @@ static struct src_loc *mwrap_get_bin(const char *buf, size_t len)
 }
 
 static const char *mwrap_env;
+
+// n.b. signals are always blocked by the caller(s) when calling this
+static int trace_on(const char *env)
+{
+        char trace_path[PATH_MAX];
+        size_t len = 0;
+        const char *cmpr = NULL;
+        const char *sfx = ".gz";
+        char cmpr_cmd[32];
+
+        if (env) {
+                const char *td = strstr(env, "trace_dir:");
+                if (td) {
+                        td += sizeof("trace_dir");
+                        const char *end = strchrnul(td, ',');
+
+                        len = end - td;
+                        if ((len + 50) >= sizeof(trace_path))
+                                return ENAMETOOLONG;
+                        if (len) memcpy(trace_path, td, len);
+                }
+                cmpr = strstr(env, "trace_compress:");
+                if (cmpr) {
+                        cmpr += sizeof("trace_compress");
+                        const char *end = strchrnul(cmpr, ',');
+
+                        size_t n = end - cmpr;
+                        if (n) {
+                                if (n >= sizeof(cmpr_cmd))
+                                        return ENAMETOOLONG;
+                                memcpy(cmpr_cmd, cmpr, n);
+                                cmpr_cmd[n] = 0;
+                                cmpr = cmpr_cmd;
+                        }
+                }
+        }
+        if (!len) {
+                env = getenv("TMPDIR");
+                if (!env) {
+                        memcpy(trace_path, "/tmp", len = 4);
+                } else {
+                        len = strlen(env);
+                        if ((len + 50) >= sizeof(trace_path))
+                                return ENAMETOOLONG;
+                        if (len) memcpy(trace_path, env, len);
+                }
+        }
+        if (trace_path[len - 1] != '/')
+                trace_path[len++] = '/';
+        if (cmpr) {
+                if (strstr(cmpr, "zstd")) {
+                        sfx = ".zst";
+                } else if (strstr(cmpr, "bzip2")) {
+                        sfx = ".bz2";
+                }
+        } else {
+                cmpr = "gzip";
+        }
+        int rc = snprintf(trace_path + len, 50,
+                        "mwrap.%d.trace%s", (int)getpid(), sfx);
+        if (rc < 0 || rc >= 50)
+                return ENAMETOOLONG;
+        int fd = open(trace_path, O_CLOEXEC|O_CREAT|O_APPEND|O_WRONLY, 0666);
+        if (fd < 0)
+                return errno;
+        int pfds[2];
+        if (pipe2(pfds, O_CLOEXEC) < 0)
+                return errno;
+        pid_t pid_a = fork();
+        if (pid_a < 0) {
+                err(1, "fork");
+        } else if (pid_a == 0) { // child
+                if (setsid() < 0) err(1, "setsid");
+                pid_t pid_b = fork();
+                if (pid_b < 0) {
+                        err(1, "fork");
+                } else if (pid_b == 0) { // grandchild
+                        unsetenv("LD_PRELOAD");
+
+                        close(pfds[1]);
+                        if (dup2(pfds[0], 0) < 0) err(1, "dup2");
+                        close(pfds[0]);
+                        if (dup2(fd, 1) < 1) err(1, "dup2");
+                        close(fd);
+                        if (strchr(cmpr, ' ') || strchr(cmpr, '\t'))
+                                execl("/bin/sh", "sh", "-c", cmpr, NULL);
+                        else
+                                execlp(cmpr, cmpr, "-c", NULL);
+                        err(1, "execl(p) %s", cmpr);
+                } else {
+                        _exit(0);
+                }
+        }
+        close(pfds[0]);
+        close(fd);
+        int st;
+        pid_t wpid = waitpid(pid_a, &st, 0);
+        if (wpid != pid_a) err(1, "waitpid(a)");
+        if (st) errx(1, "gzip parent failed %d", st);
+#ifdef F_SETPIPE_SZ // use Linux /proc/sys/fs/pipe-max-size default
+        fcntl(pfds[1], F_SETPIPE_SZ, 1024 * 1024);
+#endif
+        if (uatomic_cmpxchg(&mwrap_trace_fd, -1, pfds[1]) != -1) {
+                close(pfds[1]);
+                return EBUSY;
+        }
+        return 0;
+}
+
 #include "httpd.h"
 
 __attribute__((constructor)) static void mwrap_ctor(void)
@@ -1085,7 +1211,11 @@ __attribute__((constructor)) static void mwrap_ctor(void)
                 call_rcu(&h->as.dead, free_hdr_rcu);
         } else
                 perror("malloc");
-
+        if (mwrap_env && strstr(mwrap_env, "trace:1")) {
+                int e = trace_on(mwrap_env);
+                if (e)
+                        fprintf(stderr, "trace failed: %s\n", strerror(e));
+        }
         h1d_start();
         CHECK(int, 0, pthread_sigmask(SIG_SETMASK, &old, NULL));
         CHECK(int, 0, pthread_atfork(atfork_prepare, atfork_parent,
diff --git a/mymalloc.h b/mymalloc.h
index 4dd2ee6..bbb0abc 100644
--- a/mymalloc.h
+++ b/mymalloc.h
@@ -50,7 +50,7 @@
 #define DLMALLOC_EXPORT static inline
 /* #define NO_MALLOC_STATS 1 */
 #define USE_LOCKS 0 /* we do our own global_mtx + ms_tsd */
-#include "dlmalloc_c.h"
+#include "lib/Devel/Mwrap/dlmalloc_c.h"
 #undef ABORT /* conflicts with Perl */
 #undef NOINLINE /* conflicts with Ruby, defined by dlmalloc_c.h */
 #undef HAVE_MREMAP /* conflicts with Ruby 3.2 */
@@ -59,14 +59,23 @@ static MWRAP_TSD mstate ms_tsd;
 
 /* global_mtx protects arenas_active, arenas_unused, and tlskey init */
 static pthread_mutex_t global_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t trace_mtx = PTHREAD_MUTEX_INITIALIZER;
 static pthread_key_t tlskey;
 static CDS_LIST_HEAD(arenas_active);
 static CDS_LIST_HEAD(arenas_unused);
 
 /* called on pthread exit */
+static void trace_flush_fd(int, mstate);
+static int mwrap_trace_fd = -1; // httpd.h sets this
+
 ATTR_COLD static void mstate_tsd_dtor(void *p)
 {
         mstate ms = p;
+        if (ms) {
+                int fd = uatomic_read(&mwrap_trace_fd);
+                if (fd >= 0)
+                        trace_flush_fd(fd, ms);
+        }
 
         /*
          * In case another destructor calls free (or any allocation function,
@@ -86,7 +95,7 @@ ATTR_COLD static void mstate_tsd_dtor(void *p)
 /* see httpd.h */
 static void h1d_atfork_prepare(void);
 static void h1d_atfork_parent(void);
-static void h1d_start(void);
+static void h1d_atfork_child(void);
 
 ATTR_COLD static void atfork_prepare(void)
 {
@@ -109,6 +118,7 @@ ATTR_COLD static void reset_mutexes(void); /* mwrap_core.h */
 ATTR_COLD static void atfork_child(void)
 {
         CHECK(int, 0, pthread_mutex_init(&global_mtx, 0));
+        CHECK(int, 0, pthread_mutex_init(&trace_mtx, 0));
 
         /*
          * We should be the only active thread at this point.
@@ -124,7 +134,7 @@ ATTR_COLD static void atfork_child(void)
         }
         reset_mutexes();
         call_rcu_after_fork_child();
-        h1d_start();
+        h1d_atfork_child();
 }
 
 #if defined(__GLIBC__)
diff --git a/script/mwrap-trace-replay b/script/mwrap-trace-replay
new file mode 100644
index 0000000..3763486
--- /dev/null
+++ b/script/mwrap-trace-replay
@@ -0,0 +1,48 @@
+#!perl -w
+# Copyright (C) mwrap hackers <mwrap-perl@80x24.org>
+# License: GPL-3.0+ <https://www.gnu.org/licenses/gpl-3.0.txt>
+use v5.12;
+use autodie;
+use Devel::Mwrap::TraceReplay;
+my (@files, @opt);
+for (@ARGV) {
+        if (-f $_) {
+                push @files, $_;
+        } else {
+                push @opt, $_;
+        }
+}
+
+if (@files) {
+        pipe(my $r, my $w);
+        my $tpid = fork;
+        if ($tpid == 0) {
+                open STDIN, '<&', $r;
+                close $_ for ($r, $w);
+                Devel::Mwrap::TraceReplay::run @opt;
+                die "exec trace-replay: $!";
+        }
+        for my $f (@files) {
+                my $dc = 'gzip';
+                if ($f =~ /\.zst\z/i) {
+                        $dc = 'zstd';
+                } elsif ($f =~ /\.bz2\z/i) {
+                        $dc = 'bzip2';
+                }
+                my $pid = fork;
+                if ($pid == 0) {
+                        open STDOUT, '>&', $w;
+                        open STDIN, '<', $f;
+                        close $_ for ($r, $w);
+                        exec $dc, '-dc';
+                        die "exec: $dc: $!";
+                }
+                waitpid($pid, 0);
+        }
+        close $_ for ($r, $w);
+        waitpid($tpid, 0);
+} else {
+        (-f STDIN || -p STDIN) or
+                die "Usage: $0 </path/to/mwrap.\$PID.trace\n";
+        Devel::Mwrap::TraceReplay::run @opt;
+}
diff --git a/t/httpd.t b/t/httpd.t
index 76fe7d1..4b8ed82 100644
--- a/t/httpd.t
+++ b/t/httpd.t
@@ -6,7 +6,9 @@ use IO::Socket::UNIX;
 use Fcntl qw(F_GETFD F_SETFD FD_CLOEXEC);
 use POSIX qw(dup2 _exit mkfifo);
 BEGIN { require './t/test_common.perl' };
-my $env = { MWRAP => "socket_dir:$mwrap_tmp" };
+use autodie qw(mkdir fork);
+mkdir "$mwrap_tmp/tr";
+my $env = { MWRAP => "socket_dir:$mwrap_tmp,trace_dir:$mwrap_tmp/tr" };
 my $f1 = "$mwrap_tmp/f1";
 my $f2 = "$mwrap_tmp/f2";
 mkfifo($f1, 0600) // plan(skip_all => "mkfifo: $!");
@@ -57,7 +59,7 @@ my $cout = "$mwrap_tmp/cout";
 my @curl = (qw(curl -sf --unix-socket), $sock, '-o', $cout);
 push @curl, '-vS' if $ENV{V};
 my $rc = system(@curl, "http://0/$pid/each/2000");
-my $curl_unix;
+my ($curl_unix, $trace_file);
 SKIP: {
         skip 'curl lacks --unix-socket support', 1 if $rc == 512;
         is($rc, 0, 'curl /each');
@@ -76,6 +78,12 @@ SKIP: {
         is($rc, 0, 'curl / (PID root)');
         like(slurp($cout), qr/trimming/, 'trim started');
         unlink($cout);
+
+        $rc = system(@curl, '-v', '-XPOST', "http://0/$pid/trace");
+        is $rc, 0, 'trace ok';
+        like(slurp($cout), qr/tracing/, 'tracing enabled');
+        $trace_file = "$mwrap_tmp/tr/mwrap.$pid.trace.gz";
+        ok -f $trace_file, 'trace enabled';
 };
 
 {
@@ -181,8 +189,26 @@ SKIP: {
 
         $rc = system(@curl, qw(-HX-Mwrap-BT:10 -d blah http://0/ctl));
         is($rc >> 8, 22, '404 w/o PID prefix');
-};
 
+        $rc = system(@curl, '-v', '-XPOST', "http://0/$pid/trace");
+        is $rc, 0, 'trace disabled';
+        like(slurp($cout), qr/trace off/, 'tracing disabled');
+        ok -s $trace_file, 'trace file data';
+        ok -f $trace_file, 'trace file data';
+
+        my @replay = ($^X, '-w', './blib/script/mwrap-trace-replay');
+        my $trace_out = "$mwrap_tmp/tr.out";
+        my $tr_pid = fork;
+        if ($tr_pid == 0) {
+                open STDOUT, '+>>', $trace_out;
+                open STDERR, '+>>', $trace_out;
+                exec @replay, $trace_file;
+                die "exec: $!";
+        }
+        waitpid($tr_pid, 0);
+        is $?, 0, 'trace replay';
+        diag slurp($trace_out);
+};
 
 diag slurp($cout) if $ENV{V};
 $cleanup->();
diff --git a/trace.h b/trace.h
new file mode 100644
index 0000000..418b200
--- /dev/null
+++ b/trace.h
@@ -0,0 +1,69 @@
+#include "lib/Devel/Mwrap/trace_struct.h"
+
+static void trace_flush_fd(int fd, mstate ms)
+{
+        size_t n = uatomic_xchg(&ms->trace_wfill, 0);
+        if (n) {
+                CHECK(int, 0, pthread_mutex_lock(&trace_mtx));
+                write(fd, &ms->trace_wbuf, n);
+                CHECK(int, 0, pthread_mutex_unlock(&trace_mtx));
+        }
+}
+
+#define TRACE_WRITE(buf) do { \
+        rcu_read_lock(); \
+        int fd = uatomic_read(&mwrap_trace_fd); \
+        if (fd >= 0) { \
+                mstate ms = ms_tsd; \
+                if ((ms->trace_wfill + sizeof(buf)) >= sizeof(ms->trace_wbuf)) \
+                        trace_flush_fd(fd, ms); \
+                size_t n = ms->trace_wfill; \
+                memcpy(ms->trace_wbuf + n, &buf, sizeof(buf)); \
+                uatomic_add(&ms->trace_wfill, sizeof(buf)); \
+        } \
+        rcu_read_unlock(); \
+} while (0)
+
+static void trace_memalign(const void *ret, size_t alignment, size_t size)
+{
+        struct tr_memalign buf = {
+                .ret = (uintptr_t)ret | TR_MEMALIGN,
+                .alignment = alignment,
+                .size = size
+        };
+        TRACE_WRITE(buf);
+}
+
+static void trace_free(const void *ptr)
+{
+        struct tr_free buf = { .ptr = (uintptr_t)ptr | TR_FREE };
+        TRACE_WRITE(buf);
+}
+
+static void trace_malloc(const void *ret, size_t size)
+{
+        struct tr_malloc buf = {
+                .ret = (uintptr_t)ret | TR_MALLOC,
+                .size = size
+        };
+        TRACE_WRITE(buf);
+}
+
+static void trace_realloc(const void *ret, const void *ptr, size_t size)
+{
+        struct tr_realloc buf = {
+                .ret = (uintptr_t)ret | TR_REALLOC,
+                .ptr = (uintptr_t)ptr,
+                .size = size
+        };
+        TRACE_WRITE(buf);
+}
+
+static void trace_calloc(const void *ret, size_t size)
+{
+        struct tr_calloc buf = {
+                .ret = (uintptr_t)ret | TR_CALLOC,
+                .size = size
+        };
+        TRACE_WRITE(buf);
+}