Diffstat (limited to 'ext')
-rw-r--r--   ext/mwrap/extconf.rb  |  10
-rw-r--r--   ext/mwrap/jhash.h     | 256
-rw-r--r--   ext/mwrap/mwrap.c     | 350
3 files changed, 531 insertions, 85 deletions
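The extconf.rb hunk at the top of the diff below drops the old glibc/jemalloc detection and instead hard-requires userspace RCU (urcu-cds, urcu-bp), plus a check for execinfo (needed on FreeBSD). The following build-and-load smoke test is a sketch only, not part of the commit; it assumes the liburcu development files are already installed and the usual ext/mwrap layout:

require 'rbconfig'

# Sketch only: confirm the extension still configures, builds, and loads
# now that extconf.rb aborts when userspace RCU is missing.
Dir.chdir('ext/mwrap') do
  system(RbConfig.ruby, 'extconf.rb') or
    abort 'extconf.rb failed -- userspace RCU (liburcu) not installed?'
  system('make') or abort 'make failed'
end

$LOAD_PATH.unshift(File.expand_path('ext/mwrap', __dir__))
require 'mwrap' # loading this late only proves linkage; real use is via LD_PRELOAD
puts "Mwrap loaded: #{Mwrap.respond_to?(:each)}"
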
diff --git a/ext/mwrap/extconf.rb b/ext/mwrap/extconf.rb index dbffd99..4ac8881 100644 --- a/ext/mwrap/extconf.rb +++ b/ext/mwrap/extconf.rb @@ -4,10 +4,10 @@ require 'mkmf' have_func 'mempcpy' -if RUBY_PLATFORM =~ /linux/ # should detect glibc - if File.read("/proc/#$$/maps") =~ /\blibjemalloc\./ - $defs << '-DRUBY_USES_JEMALLOC' - end -end +have_library 'urcu-cds' or abort 'userspace RCU not installed' +have_header 'urcu/rculfhash.h' or abort 'rculfhash.h not found' +have_library 'urcu-bp' or abort 'liburcu-bp not found' have_library 'dl' +have_library 'c' +have_library 'execinfo' # FreeBSD create_makefile 'mwrap' diff --git a/ext/mwrap/jhash.h b/ext/mwrap/jhash.h new file mode 100644 index 0000000..69666f3 --- /dev/null +++ b/ext/mwrap/jhash.h @@ -0,0 +1,256 @@ +#ifndef _JHASH_H +#define _JHASH_H + +/* + * jhash.h + * + * Example hash function. + * + * Copyright 2009-2012 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + * + * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED + * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. + * + * Permission is hereby granted to use or copy this program for any + * purpose, provided the above notices are retained on all copies. + * Permission to modify the code and to distribute modified code is + * granted, provided the above notices are retained, and a notice that + * the code was modified is included with the above copyright notice. + */ + +/* + * Hash function + * Source: http://burtleburtle.net/bob/c/lookup3.c + * Originally Public Domain + */ + +#define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k)))) + +#define mix(a, b, c) \ +do { \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c, 16); c += b; \ + b -= a; b ^= rot(a, 19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} while (0) + +#define final(a, b, c) \ +{ \ + c ^= b; c -= rot(b, 14); \ + a ^= c; a -= rot(c, 11); \ + b ^= a; b -= rot(a, 25); \ + c ^= b; c -= rot(b, 16); \ + a ^= c; a -= rot(c, 4); \ + b ^= a; b -= rot(a, 14); \ + c ^= b; c -= rot(b, 24); \ +} + +#if (BYTE_ORDER == LITTLE_ENDIAN) +#define HASH_LITTLE_ENDIAN 1 +#else +#define HASH_LITTLE_ENDIAN 0 +#endif + +/* + * + * hashlittle() -- hash a variable-length key into a 32-bit value + * k : the key (the unaligned variable-length array of bytes) + * length : the length of the key, counting by bytes + * initval : can be any 4-byte value + * Returns a 32-bit value. Every bit of the key affects every bit of + * the return value. Two keys differing by one or two bits will have + * totally different hash values. + * + * The best hash table sizes are powers of 2. There is no need to do + * mod a prime (mod is sooo slow!). If you need less than 32 bits, + * use a bitmask. For example, if you need only 10 bits, do + * h = (h & hashmask(10)); + * In which case, the hash table should have hashsize(10) elements. + * + * If you are hashing n strings (uint8_t **)k, do it like this: + * for (i = 0, h = 0; i < n; ++i) h = hashlittle(k[i], len[i], h); + * + * By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this + * code any way you wish, private, educational, or commercial. It's free. + * + * Use for hash table lookup, or anything where one collision in 2^^32 is + * acceptable. Do NOT use for cryptographic purposes. 
+ */ +static +uint32_t hashlittle(const void *key, size_t length, uint32_t initval) +{ + uint32_t a, b, c; /* internal state */ + union { + const void *ptr; + size_t i; + } u; + + /* Set up the internal state */ + a = b = c = 0xdeadbeef + ((uint32_t)length) + initval; + + u.ptr = key; + if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { + const uint32_t *k = (const uint32_t *) key; /* read 32-bit chunks */ + + /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ + while (length > 12) { + a += k[0]; + b += k[1]; + c += k[2]; + mix(a, b, c); + length -= 12; + k += 3; + } + + /*----------------------------- handle the last (probably partial) block */ + /* + * "k[2]&0xffffff" actually reads beyond the end of the string, but + * then masks off the part it's not allowed to read. Because the + * string is aligned, the masked-off tail is in the same word as the + * rest of the string. Every machine with memory protection I've seen + * does it on word boundaries, so is OK with this. But VALGRIND will + * still catch it and complain. The masking trick does make the hash + * noticably faster for short strings (like English words). + */ +#ifndef VALGRIND + + switch (length) { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; + case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; + case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=k[1]&0xffffff; a+=k[0]; break; + case 6 : b+=k[1]&0xffff; a+=k[0]; break; + case 5 : b+=k[1]&0xff; a+=k[0]; break; + case 4 : a+=k[0]; break; + case 3 : a+=k[0]&0xffffff; break; + case 2 : a+=k[0]&0xffff; break; + case 1 : a+=k[0]&0xff; break; + case 0 : return c; /* zero length strings require no mixing */ + } + +#else /* make valgrind happy */ + { + const uint8_t *k8; + + k8 = (const uint8_t *) k; + switch (length) { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=((uint32_t) k8[10])<<16; /* fall through */ + case 10: c+=((uint32_t) k8[9])<<8; /* fall through */ + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=((uint32_t) k8[6])<<16; /* fall through */ + case 6 : b+=((uint32_t) k8[5])<<8; /* fall through */ + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]; break; + case 3 : a+=((uint32_t) k8[2])<<16; /* fall through */ + case 2 : a+=((uint32_t) k8[1])<<8; /* fall through */ + case 1 : a+=k8[0]; break; + case 0 : return c; + } + } +#endif /* !valgrind */ + + } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { + const uint16_t *k = (const uint16_t *) key; /* read 16-bit chunks */ + const uint8_t *k8; + + /*--------------- all but last block: aligned reads and different mixing */ + while (length > 12) + { + a += k[0] + (((uint32_t) k[1])<<16); + b += k[2] + (((uint32_t) k[3])<<16); + c += k[4] + (((uint32_t) k[5])<<16); + mix(a, b, c); + length -= 12; + k += 6; + } + + /*----------------------------- handle the last (probably partial) block */ + k8 = (const uint8_t *) k; + switch(length) + { + case 12: c+=k[4]+(((uint32_t) k[5])<<16); + b+=k[2]+(((uint32_t) k[3])<<16); + a+=k[0]+(((uint32_t) k[1])<<16); + break; + case 11: c+=((uint32_t) k8[10])<<16; /* fall through */ + case 10: c+=k[4]; + b+=k[2]+(((uint32_t) k[3])<<16); + a+=k[0]+(((uint32_t) k[1])<<16); + break; + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[2]+(((uint32_t) k[3])<<16); + a+=k[0]+(((uint32_t) k[1])<<16); + break; + case 7 : b+=((uint32_t) k8[6])<<16; /* fall through */ + case 6 : b+=k[2]; + a+=k[0]+(((uint32_t) k[1])<<16); 
+ break; + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]+(((uint32_t) k[1])<<16); + break; + case 3 : a+=((uint32_t) k8[2])<<16; /* fall through */ + case 2 : a+=k[0]; + break; + case 1 : a+=k8[0]; + break; + case 0 : return c; /* zero length requires no mixing */ + } + + } else { /* need to read the key one byte at a time */ + const uint8_t *k = (const uint8_t *)key; + + /*--------------- all but the last block: affect some 32 bits of (a, b, c) */ + while (length > 12) { + a += k[0]; + a += ((uint32_t) k[1])<<8; + a += ((uint32_t) k[2])<<16; + a += ((uint32_t) k[3])<<24; + b += k[4]; + b += ((uint32_t) k[5])<<8; + b += ((uint32_t) k[6])<<16; + b += ((uint32_t) k[7])<<24; + c += k[8]; + c += ((uint32_t) k[9])<<8; + c += ((uint32_t) k[10])<<16; + c += ((uint32_t) k[11])<<24; + mix(a,b,c); + length -= 12; + k += 12; + } + + /*-------------------------------- last block: affect all 32 bits of (c) */ + switch (length) { /* all the case statements fall through */ + case 12: c+=((uint32_t) k[11])<<24; + case 11: c+=((uint32_t) k[10])<<16; + case 10: c+=((uint32_t) k[9])<<8; + case 9 : c+=k[8]; + case 8 : b+=((uint32_t) k[7])<<24; + case 7 : b+=((uint32_t) k[6])<<16; + case 6 : b+=((uint32_t) k[5])<<8; + case 5 : b+=k[4]; + case 4 : a+=((uint32_t) k[3])<<24; + case 3 : a+=((uint32_t) k[2])<<16; + case 2 : a+=((uint32_t) k[1])<<8; + case 1 : a+=k[0]; + break; + case 0 : return c; + } + } + + final(a, b, c); + return c; +} + +static inline +uint32_t jhash(const void *key, size_t length, uint32_t seed) +{ + return hashlittle(key, length, seed); +} + +#endif /* _JHASH_H */ diff --git a/ext/mwrap/mwrap.c b/ext/mwrap/mwrap.c index a302d8f..22906bf 100644 --- a/ext/mwrap/mwrap.c +++ b/ext/mwrap/mwrap.c @@ -2,11 +2,11 @@ * Copyright (C) 2018 mwrap hackers <mwrap-public@80x24.org> * License: GPL-2.0+ <https://www.gnu.org/licenses/gpl-2.0.txt> */ +#define _LGPL_SOURCE /* allows URCU to inline some stuff */ #include <ruby/ruby.h> #include <ruby/thread.h> -#include <ruby/util.h> -#include <ruby/st.h> #include <ruby/io.h> +#include <execinfo.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -16,38 +16,81 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> +#include <urcu-bp.h> +#include <urcu/rculfhash.h> +#include "jhash.h" +static ID id_uminus; const char *rb_source_location_cstr(int *line); /* requires 2.6.0dev */ static int *(*has_gvl_p)(void); +#ifdef __FreeBSD__ +void *__malloc(size_t); +void *__calloc(size_t, size_t); +void *__realloc(void *, size_t); +static void *(*real_malloc)(size_t) = __malloc; +static void *(*real_calloc)(size_t, size_t) = __calloc; +static void *(*real_realloc)(void *, size_t) = __realloc; +# define RETURN_IF_NOT_READY() do {} while (0) /* nothing */ +#else +static int ready; static void *(*real_malloc)(size_t); static void *(*real_calloc)(size_t, size_t); static void *(*real_realloc)(void *, size_t); /* - * rb_source_location_cstr relies on GET_EC(), and it's possible - * to have a native thread but no EC during the early and late - * (teardown) phases of the Ruby process - */ -static void **ec_loc; - -/* * we need to fake an OOM condition while dlsym is running, * as that calls calloc under glibc, but we don't have the * symbol for the jemalloc calloc, yet */ -# define RETURN_IF_NOT_READY(x) do { \ - if (!x) { \ +# define RETURN_IF_NOT_READY() do { \ + if (!ready) { \ errno = ENOMEM; \ return NULL; \ } \ } while (0) +#endif /* !FreeBSD */ + +/* + * rb_source_location_cstr relies on GET_EC(), and it's possible + * to have a native thread 
but no EC during the early and late + * (teardown) phases of the Ruby process + */ +static void **ec_loc; + +static struct cds_lfht *totals; + +static struct cds_lfht * +lfht_new(void) +{ + return cds_lfht_new(16384, 1, 0, CDS_LFHT_AUTO_RESIZE, 0); +} + __attribute__((constructor)) static void resolve_malloc(void) { - real_calloc = dlsym(RTLD_NEXT, "calloc"); + int err; + +#ifndef __FreeBSD__ real_malloc = dlsym(RTLD_NEXT, "malloc"); + real_calloc = dlsym(RTLD_NEXT, "calloc"); real_realloc = dlsym(RTLD_NEXT, "realloc"); - assert(real_calloc && real_malloc && real_realloc); + if (!real_calloc || !real_malloc || !real_realloc) { + fprintf(stderr, "missing calloc/malloc/realloc %p %p %p\n", + real_calloc, real_malloc, real_realloc); + _exit(1); + } + ready = 1; +#endif + + totals = lfht_new(); + if (!totals) + fprintf(stderr, "failed to allocate totals table\n"); + + err = pthread_atfork(call_rcu_before_fork, + call_rcu_after_fork_parent, + call_rcu_after_fork_child); + if (err) + fprintf(stderr, "pthread_atfork failed: %s\n", strerror(err)); has_gvl_p = dlsym(RTLD_DEFAULT, "ruby_thread_has_gvl_p"); @@ -59,18 +102,19 @@ __attribute__((constructor)) static void resolve_malloc(void) } #ifndef HAVE_MEMPCPY -# define mempcpy(dst,src,n) ((char *)memcpy((dst),(src),(n)) + n) +static void * +my_mempcpy(void *dest, const void *src, size_t n) +{ + return (char *)memcpy(dest, src, n) + n; +} +#define mempcpy(dst,src,n) my_mempcpy(dst,src,n) #endif /* stolen from glibc: */ #define RETURN_ADDRESS(nr) \ - __builtin_extract_return_addr(__builtin_return_address(nr)) + (uintptr_t)(__builtin_extract_return_addr(__builtin_return_address(nr))) static __thread size_t locating; -static st_table *stats; /* rb_source_location => size */ - -/* bytes allocated outside of GVL */ -static size_t unknown_bytes; #define INT2STR_MAX (sizeof(int) == 4 ? 10 : 19) static char *int2str(int num, char *dst, size_t * size) @@ -99,58 +143,110 @@ static char *int2str(int num, char *dst, size_t * size) return NULL; } -static int -update_stat(st_data_t *k, st_data_t *v, st_data_t arg, int existing) +static int has_ec_p(void) { - size_t *total = (size_t *) v; - size_t size = arg; + return (ec_loc && *ec_loc); +} - if (existing) { - *total += size; - } else { - char *key = *(char **)k; - *k = (st_data_t)ruby_strdup(key); - *total = size; - } - return ST_CONTINUE; +struct src_loc { + struct rcu_head rcu_head; + size_t calls; + size_t total; + struct cds_lfht_node hnode; + uint32_t hval; + uint32_t kcapa; + char k[]; +}; + +static inline int loc_eq(struct cds_lfht_node *node, const void *key) +{ + const struct src_loc *existing; + const struct src_loc *k = key; + + existing = caa_container_of(node, struct src_loc, hnode); + + return (k->hval == existing->hval && + k->kcapa == existing->kcapa && + memcmp(k->k, existing->k, k->kcapa == UINT32_MAX ? + sizeof(uintptr_t) : k->kcapa) == 0); } -static int has_ec_p(void) +static void totals_add(struct src_loc *k) { - return (ec_loc && *ec_loc); + struct cds_lfht_iter iter; + struct cds_lfht_node *cur; + struct src_loc *l; + struct cds_lfht *t; + + +again: + rcu_read_lock(); + t = rcu_dereference(totals); + if (!t) goto out_unlock; + cds_lfht_lookup(t, k->hval, loc_eq, k, &iter); + cur = cds_lfht_iter_get_node(&iter); + if (cur) { + l = caa_container_of(cur, struct src_loc, hnode); + uatomic_add(&l->total, k->total); + uatomic_add(&l->calls, 1); + } else { + size_t n = k->kcapa == UINT32_MAX ? 
sizeof(uintptr_t) : k->kcapa; + l = malloc(sizeof(*l) + n); + if (!l) goto out_unlock; + + memcpy(l, k, sizeof(*l) + n); + l->calls = 1; + cur = cds_lfht_add_unique(t, k->hval, loc_eq, l, &l->hnode); + if (cur != &l->hnode) { /* lost race */ + rcu_read_unlock(); + free(l); + goto again; + } + } +out_unlock: + rcu_read_unlock(); } -static void update_stats(size_t size, const void *caller) +static void update_stats(size_t size, uintptr_t caller) { + struct src_loc *k; + static const size_t xlen = sizeof(caller); + char *dst; + if (locating++) goto out; /* do not recurse into another *alloc */ if (has_gvl_p && has_gvl_p() && has_ec_p()) { int line; - size_t len; - char *key, *dst; const char *ptr = rb_source_location_cstr(&line); + size_t len; size_t int_size = INT2STR_MAX; - if (!stats) stats = st_init_strtable_with_size(16384); if (!ptr) goto unknown; /* avoid vsnprintf or anything which could call malloc here: */ len = strlen(ptr); - key = alloca(len + 1 + int_size + 1); - dst = mempcpy(key, ptr, len); + k = alloca(sizeof(*k) + len + 1 + int_size + 1); + k->total = size; + dst = mempcpy(k->k, ptr, len); *dst++ = ':'; dst = int2str(line, dst, &int_size); if (dst) { *dst = 0; /* terminate string */ - st_update(stats, (st_data_t)key, - update_stat, (st_data_t)size); + k->kcapa = (uint32_t)(dst - k->k + 1); + k->hval = jhash(k->k, k->kcapa, 0xdeadbeef); + totals_add(k); } else { rb_bug("bad math making key from location %s:%d\n", ptr, line); } - } else { /* TODO: do something with caller */ + } else { unknown: - __sync_add_and_fetch(&unknown_bytes, size); + k = alloca(sizeof(*k) + xlen); + k->total = size; + memcpy(k->k, &caller, xlen); + k->kcapa = UINT32_MAX; + k->hval = jhash(k->k, xlen, 0xdeadbeef); + totals_add(k); } out: --locating; @@ -162,14 +258,14 @@ out: */ void *malloc(size_t size) { - RETURN_IF_NOT_READY(real_malloc); + RETURN_IF_NOT_READY(); update_stats(size, RETURN_ADDRESS(0)); return real_malloc(size); } void *calloc(size_t nmemb, size_t size) { - RETURN_IF_NOT_READY(real_calloc); + RETURN_IF_NOT_READY(); /* ruby_xcalloc already does overflow checking */ update_stats(nmemb * size, RETURN_ADDRESS(0)); return real_calloc(nmemb, size); @@ -177,7 +273,7 @@ void *calloc(size_t nmemb, size_t size) void *realloc(void *ptr, size_t size) { - RETURN_IF_NOT_READY(real_realloc); + RETURN_IF_NOT_READY(); update_stats(size, RETURN_ADDRESS(0)); return real_realloc(ptr, size); } @@ -187,32 +283,30 @@ struct dump_arg { size_t min; }; -static int dump_i(const char *key, size_t val, struct dump_arg *a) +static void dump_to_file(struct dump_arg *a) { - if (val > a->min) { - fprintf(a->fp, "%20" PRIuSIZE " %s\n", val, key); - } - - return ST_CONTINUE; -} - -static VALUE dump_to_file(VALUE x) -{ - struct dump_arg *a = (struct dump_arg *)x; - - if (stats) st_foreach(stats, dump_i, (st_data_t) a); - if (unknown_bytes > a->min) { - fprintf(a->fp, "%20" PRIuSIZE " (unknown[%d])\n", - unknown_bytes, getpid()); + struct cds_lfht_iter iter; + struct src_loc *l; + struct cds_lfht *t; + + rcu_read_lock(); + t = rcu_dereference(totals); + if (t) { + cds_lfht_for_each_entry(t, &iter, l, hnode) { + const void *p = l->k; + char **s = 0; + if (l->total <= a->min) continue; + + if (l->kcapa == UINT32_MAX) { + s = backtrace_symbols(p, 1); + p = s[0]; + } + fprintf(a->fp, "%16zu %12zu %s\n", + l->total, l->calls, (const char *)p); + if (s) free(s); + } } - - return Qnil; -} - -static VALUE dump_ensure(VALUE ignored) -{ - --locating; - return Qfalse; + rcu_read_unlock(); } static VALUE mwrap_dump(int argc, VALUE * 
argv, VALUE mod) @@ -224,6 +318,7 @@ static VALUE mwrap_dump(int argc, VALUE * argv, VALUE mod) rb_scan_args(argc, argv, "02", &io, &min); if (NIL_P(io)) + /* library may be linked w/o Ruby */ io = *((VALUE *)dlsym(RTLD_DEFAULT, "rb_stderr")); a.min = NIL_P(min) ? 0 : NUM2SIZET(min); @@ -232,30 +327,124 @@ static VALUE mwrap_dump(int argc, VALUE * argv, VALUE mod) a.fp = rb_io_stdio_file(fptr); ++locating; - return rb_ensure(dump_to_file, (VALUE) & a, dump_ensure, Qfalse); + dump_to_file(&a); + --locating; + return Qnil; } -static int clear_i(char *key, size_t val, void *ignored) +static void +free_src_loc(struct rcu_head *head) { - xfree(key); - return ST_DELETE; + struct src_loc *l = caa_container_of(head, struct src_loc, rcu_head); + free(l); } static VALUE mwrap_clear(VALUE mod) { - unknown_bytes = 0; - st_foreach(stats, clear_i, 0); + struct cds_lfht *new, *old; + struct cds_lfht_iter iter; + struct src_loc *l; + + new = lfht_new(); + rcu_read_lock(); + old = rcu_dereference(totals); + rcu_assign_pointer(totals, new); + cds_lfht_for_each_entry(old, &iter, l, hnode) { + cds_lfht_del(old, &l->hnode); + call_rcu(&l->rcu_head, free_src_loc); + } + rcu_read_unlock(); + + synchronize_rcu(); /* ensure totals points to new */ + + cds_lfht_destroy(old, NULL); + return Qnil; } +static VALUE mwrap_reset(VALUE mod) +{ + struct cds_lfht *t; + struct cds_lfht_iter iter; + struct src_loc *l; + + rcu_read_lock(); + t = rcu_dereference(totals); + cds_lfht_for_each_entry(t, &iter, l, hnode) { + uatomic_set(&l->total, 0); + uatomic_set(&l->calls, 0); + } + rcu_read_unlock(); + + return Qnil; +} + +static VALUE dump_ensure(VALUE ignored) +{ + rcu_read_unlock(); + --locating; + return Qfalse; +} + +static VALUE dump_each_rcu(VALUE x) +{ + struct dump_arg *a = (struct dump_arg *)x; + struct cds_lfht *t; + struct cds_lfht_iter iter; + struct src_loc *l; + + t = rcu_dereference(totals); + if (t) { + cds_lfht_for_each_entry(t, &iter, l, hnode) { + VALUE v[3]; + if (l->total <= a->min) continue; + + if (l->kcapa == UINT32_MAX) { + char **s = backtrace_symbols((void *)l->k, 1); + v[1] = rb_str_new_cstr(s[0]); + free(s); + } + else { + v[1] = rb_str_new(l->k, l->kcapa - 1); + } + v[0] = rb_funcall(v[1], id_uminus, 0); + + if (!OBJ_FROZEN_RAW(v[1])) + rb_str_resize(v[1], 0); + + v[1] = SIZET2NUM(l->total); + v[2] = SIZET2NUM(l->calls); + + rb_yield_values2(3, v); + assert(rcu_read_ongoing()); + } + } + return Qnil; +} + +static VALUE mwrap_each(int argc, VALUE * argv, VALUE mod) +{ + VALUE min; + struct dump_arg a; + + rb_scan_args(argc, argv, "01", &min); + a.min = NIL_P(min) ? 
0 : NUM2SIZET(min); + + ++locating; + rcu_read_lock(); + + return rb_ensure(dump_each_rcu, (VALUE)&a, dump_ensure, 0); +} + void Init_mwrap(void) { VALUE mod = rb_define_module("Mwrap"); - - if (!stats) stats = st_init_strtable_with_size(16384); + id_uminus = rb_intern("-@"); rb_define_singleton_method(mod, "dump", mwrap_dump, -1); rb_define_singleton_method(mod, "clear", mwrap_clear, 0); + rb_define_singleton_method(mod, "reset", mwrap_reset, 0); + rb_define_singleton_method(mod, "each", mwrap_each, -1); } /* rb_cloexec_open isn't usable by non-Ruby processes */ @@ -283,7 +472,8 @@ static void mwrap_dump_destructor(void) char *end = strchr(dump_path, ','); if (end) { char *tmp = alloca(end - dump_path + 1); - *((char *)mempcpy(tmp, dump_path, end - dump_path)) = 0; + end = mempcpy(tmp, dump_path, end - dump_path); + *end = 0; dump_path = tmp; } dump_fd = open(dump_path, O_CLOEXEC|O_WRONLY|O_APPEND|O_CREAT, @@ -319,7 +509,7 @@ static void mwrap_dump_destructor(void) } /* we'll leak some memory here, but this is a destructor */ } - dump_to_file((VALUE)&a); + dump_to_file(&a); out: --locating; } |
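Beyond the build changes, mwrap.c now keeps per-call-site counters in an RCU lock-free hash table (cds_lfht) keyed by a jhash of the Ruby source location (or the raw caller address when no GVL/EC is available), and the Ruby API grows: Mwrap.each and Mwrap.reset join the existing Mwrap.dump and Mwrap.clear. The usage sketch below is illustrative rather than part of the commit; the yield order (location, total bytes, call count) follows dump_each_rcu() above, and it assumes the library was loaded early enough (e.g. via LD_PRELOAD) for allocations to be tracked:

require 'mwrap'

10_000.times { "make some garbage #{rand}" }

# Mwrap.each(min = 0) yields location, total bytes, and call count per site.
top = []
Mwrap.each(4096) { |loc, total, calls| top << [total, calls, loc] }
top.sort.reverse.first(10).each do |total, calls, loc|
  printf("%12d bytes over %8d calls at %s\n", total, calls, loc)
end

Mwrap.reset          # new in this commit: zero counters, keep the table
Mwrap.dump($stderr)  # plain-text dump, same format as the destructor output
Mwrap.clear          # swap in a fresh table and RCU-free the old one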