mwrap.git
LD_PRELOAD malloc wrapper + line stats for Ruby
blob 196ccc0a80d390c5e72174f2f33fe870e21787c5 8276 bytes (raw)
$ git show HEAD:ext/mwrap/mymalloc.h	# shows this blob on the CLI
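# Illustrative usage (names follow the mwrap README; treat option details as
# approximate): the mwrap(1) wrapper sets LD_PRELOAD for the command it runs,
# and the MWRAP environment variable selects where per-callsite stats go:
$ MWRAP=dump_fd:2 mwrap ruby my_script.rb	# dump allocation stats to stderr at exit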

/*
 * Copyright (C) mwrap hackers <mwrap-perl@80x24.org>
 * License: LGPL-2.1+ <https://www.gnu.org/licenses/lgpl-2.1.txt>
 *
 * Unlike the rest of the project, I'm happy with this being LGPL-2.1+
 * since the remote_free_* stuff is meant for glibc, anyways.
 *
 * This is a small wrapper on top of dlmalloc (dlmalloc_c.h) which
 * adds wait-free free(3) multi-threading support to avoid contention
 * with call_rcu.
 *
 * The wait-free free(3) implementation was proposed for inclusion into
 * glibc in 2018 and may eventually become part of glibc:
 * https://inbox.sourceware.org/libc-alpha/20180731084936.g4yw6wnvt677miti@dcvr/
 *
 * Arenas are thread-local and returned to a global pool upon thread
 * destruction.  This works well for processes with stable thread counts,
 * but wastes memory in processes with unstable thread counts.
 *
 * On Linux with O_TMPFILE support, all allocations are backed by
 * a temporary file (in TMPDIR).  This avoids OOM errors on
 * memory-constrained systems due to the higher-than-normal memory
 * usage of mwrap itself.
 *
 * memalign-family support is ignored (and reimplemented in mwrap_core.h).
 * dlmalloc's attempts to improve memory efficiency are prone to fragmentation
 * if memaligned allocations are repeatedly freed and reallocated while
 * normal mallocs are happening.  The complexity and work needed to
 * avoid that do not seem worthwhile nowadays given:
 * 1) memalign usage isn't common
 * 2) 64-bit systems with virtually unlimited VA space are common
 * see https://sourceware.org/bugzilla/show_bug.cgi?id=14581
 *
 * realloc and calloc are also reimplemented naively in mwrap_core.h since
 * the optimizations in dlmalloc made it harder to deal with the accounting
 * needs of mwrap.  They may be reinstated in the future.
 */
#include "check.h"
#include "gcc.h"
#include <urcu/rculist.h>
#include <urcu/wfcqueue.h>
#include <urcu-bp.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>	/* close, ftruncate */
#include <stdlib.h>	/* getenv */
#include <errno.h>

/* this is fine on most x86-64, especially with file-backed mmap(2) */
#define DEFAULT_GRANULARITY (64U * 1024U * 1024U)
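/*
 * (dlmalloc asks the system for memory in DEFAULT_GRANULARITY-sized mmap
 * requests; since the mapping is faulted in lazily and the file backing
 * below is sparse, the untouched tail of each 64M extent costs no RAM
 * until it is actually dirtied.)
 */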

#if !defined(MWRAP_FILE_BACKED) && defined(__linux__) && defined(O_TMPFILE)
#	define MWRAP_FILE_BACKED 1
#else
#	define MWRAP_FILE_BACKED 0
#endif
#if MWRAP_FILE_BACKED
#	include <sys/mman.h>
static void *my_mmap(size_t size)
{
	int flags = MAP_PRIVATE;
	const char *tmpdir = getenv("TMPDIR");
	int fd;
	void *ret;

	if (!tmpdir)
		tmpdir = "/tmp";

	fd = open(tmpdir, O_TMPFILE|O_RDWR|S_IRUSR|S_IWUSR, 0600);
	if (fd < 0) {
		flags |= MAP_ANONYMOUS;
	} else {
		int t = ftruncate(fd, size); /* sparse file */

		if (t < 0) {
			flags |= MAP_ANONYMOUS;
			close(fd);
			fd = -1;
		}
	}
	ret = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, fd, 0);
	if (fd >= 0) {
		int err = errno;
		close(fd);
		errno = err;
	}
	return ret;
}
#endif /* MWRAP_FILE_BACKED */
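/*
 * Note: O_TMPFILE gives an unnamed file under TMPDIR, so the backing store
 * goes away with the mapping and descriptor; ftruncate only reserves a
 * sparse extent, so disk blocks are allocated as pages are dirtied.
 * Pointing TMPDIR at a roomier filesystem (e.g. TMPDIR=/var/tmp) relocates
 * that backing store, and any open/ftruncate failure falls back to plain
 * anonymous memory above.
 */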

/* knobs for dlmalloc */
#define FOOTERS 1 /* required for remote_free_* stuff */
#define USE_DL_PREFIX
#define ONLY_MSPACES 1 /* aka per-thread "arenas" */
#define DLMALLOC_EXPORT static inline
/* #define NO_MALLOC_STATS 1 */
#define USE_LOCKS 0 /* we do our own global_mtx + ms_tsd */
#if MWRAP_FILE_BACKED
#	define MMAP(size) my_mmap(size)
#	define HAVE_MREMAP 0
#endif
#include "dlmalloc_c.h"
#undef ABORT /* conflicts with Perl */
#undef NOINLINE /* conflicts with Ruby, defined by dlmalloc_c.h */
#undef HAVE_MREMAP /* conflicts with Ruby 3.2 */
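/*
 * FOOTERS=1 makes dlmalloc tag each chunk with (an obfuscated pointer to)
 * the mspace it came from, which is what lets real_free() below use
 * get_mstate_for() to find a foreign thread's arena and queue the chunk
 * there instead of freeing it into the wrong arena.
 */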

static MWRAP_TSD mstate ms_tsd;

/* global_mtx protects arenas_active, arenas_unused, and tlskey init */
static pthread_mutex_t global_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_key_t tlskey;
static CDS_LIST_HEAD(arenas_active);
static CDS_LIST_HEAD(arenas_unused);

/* called on pthread exit */
ATTR_COLD static void mstate_tsd_dtor(void *p)
{
	mstate ms = p;

	/*
	 * In case another destructor calls free (or any allocation function,
	 * in that case we leak the mstate forever)
	 */
	ms_tsd = 0;

	if (!ms)
		return;

	CHECK(int, 0, pthread_mutex_lock(&global_mtx));
	cds_list_del(&ms->arena_node);	/* remove from arenas_active */
	cds_list_add(&ms->arena_node, &arenas_unused);
	CHECK(int, 0, pthread_mutex_unlock(&global_mtx));
}

/* see httpd.h */
static void h1d_atfork_prepare(void);
static void h1d_atfork_parent(void);
static void h1d_start(void);

ATTR_COLD static void atfork_prepare(void)
{
	h1d_atfork_prepare();
	call_rcu_before_fork();
	CHECK(int, 0, pthread_mutex_lock(&global_mtx));
}

ATTR_COLD static void atfork_parent(void)
{
	CHECK(int, 0, pthread_mutex_unlock(&global_mtx));
	call_rcu_after_fork_parent();
	CHECK(int, 0, pthread_mutex_lock(&global_mtx));
	h1d_atfork_parent();
	CHECK(int, 0, pthread_mutex_unlock(&global_mtx));
}

ATTR_COLD static void reset_mutexes(void); /* mwrap_core.h */

ATTR_COLD static void atfork_child(void)
{
	CHECK(int, 0, pthread_mutex_init(&global_mtx, 0));

	/*
	 * We should be the only active thread at this point.
	 * Theoretically the application could register another atfork_child
	 * handler which runs before this handler AND spawns a thread which
	 * calls malloc, not much we can do about that, though.
	 */
	cds_list_splice(&arenas_active, &arenas_unused);
	CDS_INIT_LIST_HEAD(&arenas_active);
	if (ms_tsd) {
		cds_list_del(&ms_tsd->arena_node);	/* remove from unused */
		cds_list_add(&ms_tsd->arena_node, &arenas_active);
	}
	reset_mutexes();
	call_rcu_after_fork_child();
	h1d_start();
}

#if defined(__GLIBC__)
#	define FIRST_TIME 0
#else /* pthread_mutex_lock calls malloc on FreeBSD */
	static int once;
#	define FIRST_TIME (uatomic_cmpxchg(&once, 0, 1) == 0)
#endif
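/*
 * The first caller (presumably still single-threaded, early in process
 * life) must skip global_mtx on FreeBSD: pthread_mutex_lock() may call
 * the interposed malloc() and re-enter mstate_acquire_harder() before
 * ms_tsd is set.  FIRST_TIME is true exactly once to allow that.
 */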

static __attribute__((noinline)) mstate mstate_acquire_harder(void)
{
	bool do_lock = FIRST_TIME ? false : true;
	if (do_lock)
		CHECK(int, 0, pthread_mutex_lock(&global_mtx));
	if (cds_list_empty(&arenas_unused)) {
		ms_tsd = create_mspace(0, 0);
		ms_tsd->seg.sflags = EXTERN_BIT | USE_MMAP_BIT;
	} else { /* reuse existing */
		ms_tsd = cds_list_first_entry(&arenas_unused,
					      struct malloc_state, arena_node);
		cds_list_del(&ms_tsd->arena_node);
	}

	cds_list_add(&ms_tsd->arena_node, &arenas_active);
	if (!tlskey)
		CHECK(int, 0, pthread_key_create(&tlskey, mstate_tsd_dtor));

	if (do_lock)
		CHECK(int, 0, pthread_mutex_unlock(&global_mtx));
	CHECK(int, 0, pthread_setspecific(tlskey, ms_tsd));
	return ms_tsd;
}

/* process remote free requests, returns allocations freed */
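/*
 * Chunks free(3)d by other threads sit on this arena's wfcqueue; splice
 * them onto a private queue and release them into the owning mspace.
 * CDS_WFCQ_RET_DEST_EMPTY means the splice handed us chunks to free;
 * CDS_WFCQ_RET_SRC_EMPTY (or WOULDBLOCK racing a concurrent enqueue)
 * means there is nothing to reclaim right now.
 */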
static size_t remote_free_step(mstate ms)
{
	size_t nfree = 0;
	struct cds_wfcq_node *node, *n;
	struct __cds_wfcq_head tmp_hd;
	struct cds_wfcq_tail tmp_tl;
	enum cds_wfcq_ret ret;

	___cds_wfcq_init(&tmp_hd, &tmp_tl);
	ret = __cds_wfcq_splice_nonblocking(&tmp_hd, &tmp_tl,
					    &ms->remote_free_head,
					    &ms->remote_free_tail);

	if (ret == CDS_WFCQ_RET_DEST_EMPTY) {
		__cds_wfcq_for_each_blocking_safe(&tmp_hd, &tmp_tl, node, n) {
			++nfree;
			mspace_free(ms, node);
		}
	}
	mwrap_assert(ret != CDS_WFCQ_RET_DEST_NON_EMPTY);

	return nfree;
}

static void remote_free_finish(mstate ms)
{
	while (remote_free_step(ms)) ;
}

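/*
 * Same prototype as glibc's malloc_trim(3); since mwrap interposes the
 * allocator, providing it keeps callers which expect it working (Ruby's
 * GC, for instance, calls malloc_trim(0) when it is available), and it
 * also drains pending remote frees before trimming idle arenas.
 */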
int malloc_trim(size_t pad)
{
	mstate m;
	int ret = 0;

	CHECK(int, 0, pthread_mutex_lock(&global_mtx));

	/* be lazy for active sibling threads, readers are not synchronized */
	cds_list_for_each_entry(m, &arenas_active, arena_node)
		uatomic_set(&m->trim_check, 0);

	/* nobody is using idle arenas, clean immediately */
	cds_list_for_each_entry(m, &arenas_unused, arena_node) {
		m->trim_check = 0;
		remote_free_finish(m);
		ret |= sys_trim(m, pad);
	}

	CHECK(int, 0, pthread_mutex_unlock(&global_mtx));

	m = ms_tsd;
	if (m) { /* trim our own arena immediately */
		remote_free_finish(m);
		ret |= sys_trim(m, pad);
	}
	return ret;
}

static void remote_free_enqueue(mstate fm, void *mem)
{
	struct cds_wfcq_node *node = mem;

	cds_wfcq_node_init(node);
	cds_wfcq_enqueue(&fm->remote_free_head, &fm->remote_free_tail, node);
	/* other thread calls remote_free_step */
}

static void *real_malloc(size_t bytes)
{
	mstate ms = ms_tsd;
	if (caa_unlikely(!ms))
		ms = mstate_acquire_harder();

	remote_free_step(ms);
	return mspace_malloc(ms, bytes);
}

static void real_free(void *mem)
{
	mstate ms = ms_tsd;
	if (mem) {
		mchunkptr p = mem2chunk(mem);
		mstate fm = get_mstate_for(p);
		if (fm == ms)
			mspace_free(fm, mem);
		else
			remote_free_enqueue(fm, mem);
	}
	if (ms)
		remote_free_step(ms);
}
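/*
 * Illustrative only (mwrap_core.h is not part of this blob): this header
 * is included into a translation unit where mwrap_core.h defines the
 * public, accounting allocation entry points, which delegate to the
 * static real_malloc()/real_free() above along these lines:
 *
 *	void free(void *p)
 *	{
 *		// ...per-callsite accounting happens here...
 *		real_free(p);
 *	}
 */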

git clone https://80x24.org/mwrap.git