Test with the latest version. --- build-aux/static-deps.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-aux/static-deps.mk b/build-aux/static-deps.mk index d304065..4b33609 100644 --- a/build-aux/static-deps.mk +++ b/build-aux/static-deps.mk @@ -44,7 +44,7 @@ $(urcu_dir)/.git/xtbench-stamp: > $@ urcu_ref := refs/remotes/origin/stable-0.8 -urcu_commit := 60e5f96dc8e2ca1dc07c0ab92385fa5c3bad77d0 +urcu_commit := af507800971afbbbb05f6b5ff67cd2f20e8b338f urcu-uptodate: $(urcu_dir)/.git/xtbench-stamp ( cd $(urcu_dir) && \ test "$(urcu_commit)" = "$$(git rev-parse $(urcu_ref))" || git fetch ) -- EW
Might as well test with the latest version. --- build-aux/static-deps.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-aux/static-deps.mk b/build-aux/static-deps.mk index 6d4d465..da70754 100644 --- a/build-aux/static-deps.mk +++ b/build-aux/static-deps.mk @@ -44,7 +44,7 @@ $(urcu_dir)/.git/femalloc-stamp: > $@ urcu_ref := refs/remotes/origin/stable-0.8 -urcu_commit := 60e5f96dc8e2ca1dc07c0ab92385fa5c3bad77d0 +urcu_commit := af507800971afbbbb05f6b5ff67cd2f20e8b338f urcu-uptodate: $(urcu_dir)/.git/femalloc-stamp ( cd $(urcu_dir) && \ test "$(urcu_commit)" = "$$(git rev-parse $(urcu_ref))" || git fetch ) -- EW
--- README | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README b/README index b783514..9be317d 100644 --- a/README +++ b/README @@ -38,7 +38,8 @@ is recommended. We will fix this. Overheads for small allocations is large, we will fix this. -We should work well on systems VM overcommit disabled, will fix +We should avoid using too much space for for systems with overcommit +disabled, will fix. We will support applications which spawn many more threads than CPUs, as we realize it is often easier to spawn extra threads to deal with @@ -51,7 +52,8 @@ probably needs fixing as the related ptmalloc2 (from glibc) behaves horribly sometimes. Memory is not released back to the OS automatically, only via -calls to `malloc_trim'. +calls to `malloc_trim'. Note: releasing memory back to the OS only to +re-acquire it again is expensive. Linux 2.6-and later only for now, FreeBSD support coming. -- EW
patch sent to lttng-dev ref: <20140818050254.GA13470@dcvr.yhbt.net> --- femalloc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/femalloc.c b/femalloc.c index 5ca49ec..fb0a04c 100644 --- a/femalloc.c +++ b/femalloc.c @@ -1103,7 +1103,7 @@ typedef struct malloc_segment* msegmentptr; struct malloc_state { struct cds_list_head arena_node; - struct cds_wfs_stack remote_free_stack; + struct __cds_wfs_stack remote_free_stack; binmap_t smallmap; binmap_t treemap; size_t dvsize; @@ -2784,12 +2784,6 @@ static inline size_t internal_free(mstate m, void *mem) return 0; } -/* mutex-ignorant version of _cds_wfs_init */ -static inline void -nomtx_cds_wfs_init(struct cds_wfs_stack *s) { - s->head = CDS_WFS_END; -} - /* process remote free requests, returns usable bytes freed */ static size_t remote_free_finish(mstate ms) { size_t usable_bytes_freed = 0; @@ -3035,7 +3029,7 @@ static mstate init_user_mstate(char* tbase, size_t tsize) { mchunkptr msp = align_as_chunk(tbase); mstate m = (mstate)(chunk2mem(msp)); memset(m, 0, msize); - nomtx_cds_wfs_init(&m->remote_free_stack); + ___cds_wfs_init(&m->remote_free_stack); msp->head = (msize|INUSE_BITS); m->seg.base = m->least_addr = tbase; m->seg.size = tsize; -- EW
I'm probably going to drop the LIFO stack of arenas entirely and use thread-local arenas, but for the sake of documentation, I attempted to use a futex to reduce spinning on arena contention. diff --git a/xtlifo.h b/xtlifo.h index 4d6d219..ccb8dc2 100644 --- a/xtlifo.h +++ b/xtlifo.h @@ -11,9 +11,27 @@ * (n.b. ck is 2-clause BSD) */ +#include <assert.h> +#include <unistd.h> +#include <errno.h> #include <ck_stack.h> #include <ck_spinlock.h> #include <urcu/uatomic.h> +#include <urcu/compiler.h> +#include <linux/futex.h> +#include <sys/syscall.h> + +static inline int futex_wait(int *addr, int val) { + int rc = syscall(SYS_futex, addr, FUTEX_WAIT_PRIVATE, val, 0, 0, 0); + if (caa_unlikely(rc != 0)) + assert(errno == EINTR); + return rc; +} + +static inline void futex_wake(int *addr) { + int rc = syscall(SYS_futex, addr, FUTEX_WAKE_PRIVATE, 1, 0, 0, 0); + assert(rc >= 0); +} /* push, pop */ struct xtlifo { @@ -23,7 +41,12 @@ struct xtlifo { struct ck_stack_entry *head; ck_spinlock_t lock; }; + struct { + int padding; /* FIXME: this is broken on 32-bit */ + int ftx; + } f; }; + int contended; }; #ifdef CK_F_STACK_POP_MPMC @@ -40,10 +63,12 @@ static inline void xtlifo_init(struct xtlifo *lifo) { lifo->head = 0; ck_spinlock_init(&lifo->lock); } + lifo->contended = 0; } static inline struct ck_stack_entry *xtlifo_trypop(struct xtlifo *lifo) { struct ck_stack_entry *e; + if (LFLIFO) { if (ck_stack_trypop_mpmc(&lifo->stack, &e)) return e; @@ -59,6 +84,33 @@ static inline struct ck_stack_entry *xtlifo_trypop(struct xtlifo *lifo) { } } +/* + * n.b. due to the ABA problem, we may get spurious wakeups. + * This is probably unavoidable. Of course, too many wakeups is + * preferable to missing wakeups and deadlocking. 
+ */ +static void xtlifo_wait_for_push(struct xtlifo *lifo) { + /* this reaches into the ck_stack generation counter */ + int val = uatomic_read(&lifo->f.ftx); + + /* check the pointer did not just get set */ + if (val) + return; + + /* declare we are busy so the pusher can wake us up */ + cmm_smp_mb__before_uatomic_inc(); + uatomic_inc(&lifo->contended); + cmm_smp_mb__after_uatomic_inc(); + + /* wait for the stack head to change */ + futex_wait(&lifo->f.ftx, val); + + /* prevent unnecessary syscalls from the pusher */ + cmm_smp_mb__before_uatomic_dec(); + uatomic_dec(&lifo->contended); + cmm_smp_mb__after_uatomic_dec(); +} + static inline struct ck_stack_entry *xtlifo_pop(struct xtlifo *lifo) { struct ck_stack_entry *e; @@ -67,7 +119,7 @@ static inline struct ck_stack_entry *xtlifo_pop(struct xtlifo *lifo) { e = ck_stack_pop_mpmc(&lifo->stack); if (e) return e; - caa_cpu_relax(); + xtlifo_wait_for_push(lifo); } } else { @@ -82,8 +134,16 @@ static inline struct ck_stack_entry *xtlifo_pop(struct xtlifo *lifo) { } static inline void xtlifo_push(struct xtlifo *lifo, struct ck_stack_entry *e) { - if (LFLIFO) + if (LFLIFO) { ck_stack_push_mpmc(&lifo->stack, e); + + /* + * Try to only make a syscall if we have waiters. + * Syscall avoidance is best-effort. + */ + if (uatomic_read(&lifo->contended)); + futex_wake(&lifo->f.ftx); + } else { ck_spinlock_lock(&lifo->lock); ck_stack_push_spnc(&lifo->stack, e); -- EW
This should reduce cache line contention. --- xthr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xthr.c b/xthr.c index 6159e82..604a448 100644 --- a/xthr.c +++ b/xthr.c @@ -21,10 +21,10 @@ #include <err.h> #include <stdio.h> +static struct cds_wfcq_head head; static size_t iter = 1024; static size_t mbytes = sizeof(struct cds_wfcq_node); -static struct cds_wfcq_head head; -static struct cds_wfcq_tail tail; +static struct cds_wfcq_tail tail __attribute__((aligned(CAA_CACHE_LINE_SIZE))); static void *do_free(void *arg) { -- EW
xtmalloc was renamed to femalloc to avoid confusion with XtMalloc in the X Toolkit. This project (xtbench) does not need to be renamed. --- Makefile.am | 16 ++++++++-------- configure.ac | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Makefile.am b/Makefile.am index 36f51ea..d0802c8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -83,14 +83,14 @@ xthr_ptmalloc3_CFLAGS = $(PTHREAD_CFLAGS) xthr_ptmalloc3_LDADD = $(t_test1_ptmalloc3_LDADD) endif -if HAVE_XTMALLOC -check_PROGRAMS += t-test1-xtmalloc xthr-xtmalloc -t_test1_xtmalloc_SOURCES = t-test1.c -t_test1_xtmalloc_CFLAGS = $(PTHREAD_CFLAGS) -t_test1_xtmalloc_LDADD = -lxtmalloc -xthr_xtmalloc_SOURCES = xthr.c -xthr_xtmalloc_CFLAGS = $(PTHREAD_CFLAGS) -xthr_xtmalloc_LDADD = -lxtmalloc +if HAVE_FEMALLOC +check_PROGRAMS += t-test1-femalloc xthr-femalloc +t_test1_femalloc_SOURCES = t-test1.c +t_test1_femalloc_CFLAGS = $(PTHREAD_CFLAGS) +t_test1_femalloc_LDADD = -lfemalloc +xthr_femalloc_SOURCES = xthr.c +xthr_femalloc_CFLAGS = $(PTHREAD_CFLAGS) +xthr_femalloc_LDADD = -lfemalloc endif TESTS = $(check_PROGRAMS) diff --git a/configure.ac b/configure.ac index e47aaff..f3020f8 100644 --- a/configure.ac +++ b/configure.ac @@ -1,8 +1,8 @@ AC_INIT([xtbench], m4_esyscmd([build-aux/git-version-gen .tarball-version]), - [xtmalloc-public@80x24.org], - [xtmalloc], - [http://xtmalloc.80x24.net/xtbench/]) + [mm@80x24.org], + [xtbench], + [http://femalloc.80x24.net/xtbench/]) AC_CONFIG_SRCDIR([xthr.c]) AC_CONFIG_AUX_DIR([build-aux]) AM_INIT_AUTOMAKE([foreign silent-rules parallel-tests subdir-objects -Wall]) @@ -42,8 +42,8 @@ AC_CHECK_LIB([ptmalloc3],[malloc],[HAVE_PTMALLOC3=yes],[HAVE_PTMALLOC3=no]) AM_CONDITIONAL([HAVE_PTMALLOC3],[test x"$HAVE_PTMALLOC3" = xyes]) LIBS="$save_LIBS" -AC_CHECK_LIB([xtmalloc],[malloc],[HAVE_XTMALLOC=yes],[HAVE_XTMALLOC=no]) -AM_CONDITIONAL([HAVE_XTMALLOC],[test x"$HAVE_XTMALLOC" = xyes]) +AC_CHECK_LIB([femalloc],[malloc],[HAVE_FEMALLOC=yes],[HAVE_FEMALLOC=no]) 
+AM_CONDITIONAL([HAVE_FEMALLOC],[test x"$HAVE_FEMALLOC" = xyes]) AC_CONFIG_FILES([Makefile]) AC_OUTPUT -- EW
XtMalloc is used by X Toolkit, so that can potentially be confusing to people searching for information. The search for a good malloc feels like a fool's errand, so 'fe' it is. --- HACKING | 12 +++++------ Makefile.am | 12 +++++------ README | 41 ++++++++++++++++++++++--------------- build-aux/static-deps.mk | 20 +++++++++--------- configure.ac | 10 ++++----- xtmalloc.c => femalloc.c | 8 ++++---- xtmalloc.h => femalloc.h | 0 test/.gitignore | 2 +- test/{t-xtmalloc.c => t-femalloc.c} | 0 9 files changed, 57 insertions(+), 48 deletions(-) rename xtmalloc.c => femalloc.c (99%) rename xtmalloc.h => femalloc.h (100%) rename test/{t-xtmalloc.c => t-femalloc.c} (100%) diff --git a/HACKING b/HACKING index 24ab4a5..f0aa0cd 100644 --- a/HACKING +++ b/HACKING @@ -10,8 +10,8 @@ user builds and installation, but highly recommended for hackers. * libtool - https://www.gnu.org/software/libtool/ * git - https://www.git-scm.com/ -$ git clone git://80x24.org/xtmalloc -$ cd xtmalloc && ./bootstrap +$ git clone git://80x24.org/femalloc +$ cd femalloc && ./bootstrap Generally, the versions of these tools bundled with the latest stable release of Debian GNU/Linux will work. @@ -35,14 +35,14 @@ Linux kernel projects. Development happens on the mailing list and be open to anybody with an email client capable of sending plain-text email. No registration or graphical interface will ever be required to -contribute to xtmalloc. +contribute to femalloc. Email patches (git format-patch + git send-email) and pull requests to -our public mailing list at xtmalloc-public@80x24.org +our public memory-management-related mailing list at mm@80x24.org No subscription is necessary to post, you will be Cc-ed on replies. Do not send HTML email. -Subscribe (optional): xtmalloc-public+subscribe@80x24.org -Unsubscribe: xtmalloc-public+unsubscribe@80x24.org +Subscribe (optional): mm+subscribe@80x24.org +Unsubscribe: mm+unsubscribe@80x24.org Hack away! 
diff --git a/Makefile.am b/Makefile.am index 1c0d19d..89a1c71 100644 --- a/Makefile.am +++ b/Makefile.am @@ -8,11 +8,11 @@ AM_CPPFLAGS = \ -DLIBEXECDIR=\""$(libexecdir)"\" \ -I${top_srcdir} AM_CFLAGS = $(WARN_CFLAGS) $(PTHREAD_CFLAGS) $(OPTFLAGS) -lib_LTLIBRARIES = libxtmalloc.la -libxtmalloc_la_SOURCES = xtmalloc.c xtmalloc.h xtlifo.h +lib_LTLIBRARIES = libfemalloc.la +libfemalloc_la_SOURCES = femalloc.c femalloc.h xtlifo.h -TESTS = test/t-xtmalloc +TESTS = test/t-femalloc -check_PROGRAMS = test/t-xtmalloc -test_t_xtmalloc_SOURCES = test/t-xtmalloc.c -test_t_xtmalloc_LDADD = libxtmalloc.la +check_PROGRAMS = test/t-femalloc +test_t_femalloc_SOURCES = test/t-femalloc.c +test_t_femalloc_LDADD = libfemalloc.la diff --git a/README b/README index fd6309b..395f3de 100644 --- a/README +++ b/README @@ -1,31 +1,40 @@ -xtmalloc - cross-thread malloc implementation +femalloc - cross-thread malloc implementation --------------------------------------------- -xtmalloc is malloc implementation designed for applications which -communicate and share allocations across different threads. It is not -intended to be the fastest allocator in synthetic benchmarks. +femalloc is general-purpose malloc implementation focused on +multi-threaded applications which allocate and free across threads. It +is not intended to be the fastest allocator in synthetic benchmarks; but +should provide a good balance between low fragmentation and speed. It is based on the venerable dlmalloc written by Doug Lea available at ftp://gee.cs.oswego.edu/pub/misc/malloc.c -xtmalloc designed and optimized with modern x86-64 and x86 processors in -mind. x86-64 systems require a working cmpxchg16b instruction, thus the -original AMD Opterons from circa 2005 are not supported. Performance on -other platforms is likely to be suboptimal. +femalloc is tuned with 2010s-era x86-64 systems running a modern Linux +kernel in mind. 
x86-64 systems require a working cmpxchg16b +instruction, thus the original AMD Opterons from circa 2005 are not +supported. Performance on other platforms is likely to suboptimal. +Non-Free platforms are not supported, especially those which make +overriding the system malloc implementation difficult. Legal ----- -Copyright (C) 2014, all contributors, see git://80x24.org/xtmalloc +Copyright (C) 2014, all contributors, see git://80x24.org/femalloc License: LGPLv2.1 or later <http://www.gnu.org/licenses/lgpl-2.1.txt> -xtmalloc is based on a version (aka dlmalloc 2.8.6) of malloc/free/realloc -written by Doug Lea and released to the public domain, as explained at -http://creativecommons.org/publicdomain/zero/1.0/ +femalloc is based on a public domain (explained at [CC0]) version +of malloc/free/realloc written by Doug Lea (dlmalloc 2.8.6) The original version of dlmalloc is available at ftp://gee.cs.oswego.edu/pub/misc/ -xtmalloc chooses the LGPL to allow static linking with Userspace RCU -library <http://urcu.so/>. Non-URCU-related improvements are -encouraged to be placed in the public domain so they may be used -in future version of dlmalloc by Doug Lea. +femalloc chooses the LGPL in order to use static inline functions +exported by the Userspace RCU library <http://urcu.so/>. +Non-concurrency-related improvements are encouraged to be placed in the +public domain[CC0] so they may be used in future versions of dlmalloc +by Doug Lea. + +[CC0] - http://creativecommons.org/publicdomain/zero/1.0/ + +Etymology +--------- +fe = fool's errand. That is what working on memory management feels like :P diff --git a/build-aux/static-deps.mk b/build-aux/static-deps.mk index c326f21..a837bd5 100644 --- a/build-aux/static-deps.mk +++ b/build-aux/static-deps.mk @@ -19,7 +19,7 @@ TOP_BUILDDIR = . 
prefix = $(CURDIR)/local # default build target -all:: xtmalloc-conf +all:: femalloc-conf build:: install:: @@ -33,25 +33,25 @@ ifneq ($(SKIP_URCU),yes) build:: urcu-build install:: urcu-install # not needed at the moment, only using headers -# xtmalloc_libs += $(prefix)/lib/liburcu-cds.a +# femalloc_libs += $(prefix)/lib/liburcu-cds.a endif -xtmalloc_conf_opts = $(CONF_OPTS) -xtmalloc_conf_opts += CPPFLAGS=-I$(prefix)/include +femalloc_conf_opts = $(CONF_OPTS) +femalloc_conf_opts += CPPFLAGS=-I$(prefix)/include $(TOP_SRCDIR)/configure: $(TOP_SRCDIR)/configure.ac $(TOP_SRCDIR)/configure: $(TOP_SRCDIR)/Makefile.am cd $(TOP_SRCDIR) && ./bootstrap -xtmalloc-conf:: install $(TOP_SRCDIR)/configure - cd $(TOP_BUILDDIR) && $(TOP_SRCDIR)/configure $(xtmalloc_conf_opts) +femalloc-conf:: install $(TOP_SRCDIR)/configure + cd $(TOP_BUILDDIR) && $(TOP_SRCDIR)/configure $(femalloc_conf_opts) ck_dir := $(prefix)/src/ck -$(ck_dir)/.git/xtmalloc-stamp: +$(ck_dir)/.git/femalloc-stamp: @mkdir -p $(prefix)/src git clone $(CK_GIT_URL) $(ck_dir) > $@ ck_commit := b45f94a61186fccdd2997a4dae13669d2a52d047 -ck-uptodate: $(ck_dir)/.git/xtmalloc-stamp +ck-uptodate: $(ck_dir)/.git/femalloc-stamp ( cd $(ck_dir) && \ test "$(ck_commit)" = "$$(git rev-parse HEAD^0)" || git fetch ) cd $(ck_dir) && git reset --hard $(ck_commit) @@ -66,14 +66,14 @@ ck-install: ck-build $(MAKE) -C $(ck_dir) install urcu_dir := $(prefix)/src/urcu -$(urcu_dir)/.git/xtmalloc-stamp: +$(urcu_dir)/.git/femalloc-stamp: @mkdir -p $(prefix)/src git clone $(URCU_GIT_URL) $(urcu_dir) > $@ urcu_ref := refs/remotes/origin/stable-0.8 urcu_commit := 60e5f96dc8e2ca1dc07c0ab92385fa5c3bad77d0 -urcu-uptodate: $(urcu_dir)/.git/xtmalloc-stamp +urcu-uptodate: $(urcu_dir)/.git/femalloc-stamp ( cd $(urcu_dir) && \ test "$(urcu_commit)" = "$$(git rev-parse $(urcu_ref))" || git fetch ) cd $(urcu_dir) && git reset --hard $(urcu_commit) diff --git a/configure.ac b/configure.ac index 6aa7c53..0f0b284 100644 --- a/configure.ac +++ b/configure.ac 
@@ -1,9 +1,9 @@ -AC_INIT([xtmalloc], +AC_INIT([femalloc], m4_esyscmd([build-aux/git-version-gen .tarball-version]), - [xtmalloc-public@80x24.org], - [xtmalloc], - [http://xtmalloc.80x24.net/]) -AC_CONFIG_SRCDIR([xtmalloc.c]) + [mm@80x24.org], + [femalloc], + [http://femalloc.80x24.net/]) +AC_CONFIG_SRCDIR([femalloc.c]) AC_CONFIG_AUX_DIR([build-aux]) AM_INIT_AUTOMAKE([foreign silent-rules parallel-tests subdir-objects -Wall]) AM_SILENT_RULES([yes]) diff --git a/xtmalloc.c b/femalloc.c similarity index 99% rename from xtmalloc.c rename to femalloc.c index 0fd67be..83ca672 100644 --- a/xtmalloc.c +++ b/femalloc.c @@ -1,8 +1,8 @@ /* - xtmalloc is malloc implementation designed for applications which + femalloc is malloc implementation designed for applications which communicate and share allocations across different threads. - Copyright (C) 2014, all contributors, see git://80x24.org/xtmalloc + Copyright (C) 2014, all contributors, see git://80x24.org/femalloc License: LGPLv2.1 or later <http://www.gnu.org/licenses/lgpl-2.1.txt> This is based on a version (aka dlmalloc 2.8.6) of malloc/free/realloc @@ -11,7 +11,7 @@ The original version of dlmalloc is available at ftp://gee.cs.oswego.edu/pub/misc/ - xtmalloc chooses the LGPL to allow static linking with Userspace RCU + femalloc chooses the LGPL to allow static linking with Userspace RCU library <http://urcu.so/>. Non-URCU-related improvements are encouraged to be placed in the public domain so they may be used in future version of dlmalloc by Doug Lea. 
@@ -444,7 +444,7 @@ DLMALLOC_EXPORT void mspace_free(mspace msp, void* mem); ======================================================================== */ -#include "xtmalloc.h" +#include "femalloc.h" /*------------------------------ internal #includes ---------------------- */ diff --git a/xtmalloc.h b/femalloc.h similarity index 100% rename from xtmalloc.h rename to femalloc.h diff --git a/test/.gitignore b/test/.gitignore index ed38d50..2e955ef 100644 --- a/test/.gitignore +++ b/test/.gitignore @@ -1 +1 @@ -t-xtmalloc +t-femalloc diff --git a/test/t-xtmalloc.c b/test/t-femalloc.c similarity index 100% rename from test/t-xtmalloc.c rename to test/t-femalloc.c -- EW
xtmalloc might be confused with XtMalloc in the X Toolkit, so I'll rename xtmalloc to something else. Will push changes soon (once I have a name). This mailing list is for general memory-management topics and will just be available at mm@80x24.org You may point ssoma[1] at git://80x24.org/mm [1] http://ssoma.public-inbox.org/README
xtmalloc is a malloc implementation designed for applications which communicate and share allocations across different threads. It is not intended to be the fastest allocator in synthetic benchmarks. xtmalloc is based on a version (aka dlmalloc 2.8.6) of malloc/free/realloc written by Doug Lea and released to the public domain, as explained at http://creativecommons.org/publicdomain/zero/1.0/ The original version of dlmalloc is available at ftp://gee.cs.oswego.edu/pub/misc/ xtmalloc chooses the LGPL to allow static linking with the Userspace RCU library <http://urcu.so/>. Non-URCU-related improvements are encouraged to be placed in the public domain so they may be used in future versions of dlmalloc by Doug Lea. git clone git://80x24.org/xtmalloc